[CPU][ARM] Enable both f16 and f32 kernels for aarch64 and introduce runtime f16 support check (#22992)

Inherited from #22437

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
alvoron and ilya-lavrenov authored Mar 25, 2024
1 parent 4e6bfe8 commit cda5a02
Showing 15 changed files with 122 additions and 65 deletions.
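
For context, a minimal sketch of how the new runtime f16 check surfaces to API users — the "model.xml" path and the surrounding flow are illustrative assumptions, not part of this commit:

#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Hypothetical model path, for illustration only.
    auto model = core.read_model("model.xml");
    // Request f16 inference. With this commit, on ARM the CPU plugin honors
    // the hint only if the CPU reports FP16 support at runtime; otherwise
    // inference precision stays at f32.
    auto compiled = core.compile_model(model, "CPU",
                                       ov::hint::inference_precision(ov::element::f16));
    // Inspect the precision that was actually selected.
    std::cout << compiled.get_property(ov::hint::inference_precision) << std::endl;
    return 0;
}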
1 change: 1 addition & 0 deletions .github/workflows/linux_arm64.yml
@@ -172,6 +172,7 @@ jobs:
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \
-DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \
-DOV_CPU_AARCH64_USE_MULTI_ISA=OFF \
-S ${OPENVINO_REPO} \
-B ${BUILD_DIR}
1 change: 1 addition & 0 deletions .gitignore
@@ -61,4 +61,5 @@ __pycache__
/tools/mo/*.svg
/src/plugins/intel_cpu/tools/commit_slider/*.json
/src/plugins/intel_cpu/tools/commit_slider/slider_cache/*
/src/plugins/intel_cpu/thirdparty/ComputeLibrary/build/*
.github/GITHUB_OUTPUT
26 changes: 22 additions & 4 deletions src/plugins/intel_cpu/CMakeLists.txt
@@ -30,6 +30,16 @@ elseif(OV_COMPILER_IS_CLANG)
endif()
endif()

if (AARCH64 AND NOT APPLE AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2)
# according to https://github.com/ARM-software/ComputeLibrary/issues/1053#issuecomment-1846903707,
# the 'multi_isa=1' below enables FP32, FP16 and SVE / SVE2 kernels.
# But the arm_sve.h header is not available on GCC older than 10.2, so we have to check the compiler version
set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT ON)
else()
set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT OFF)
endif()
set(OV_CPU_AARCH64_USE_MULTI_ISA ${OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT} CACHE BOOL "Build multi-ISA ACL")

set(OV_CPU_ARM_TARGET_GENERIC_ARCHS armv8a
armv8.2-a
armv8.6-a armv8.6-a-sve armv8.6-a-sve2 armv8.6-a-sve2-sme2
@@ -41,17 +51,25 @@ if(ARM)
# requires estate=32
${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
elseif(AARCH64)
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
if(APPLE)
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
else()
if(OV_CPU_AARCH64_USE_MULTI_ISA)
# set v8a even though we want fp16 kernels, because
# we use multi_isa=1 in ACLConfig.cmake to enable both fp16 and fp32 kernels;
# the actual kernel is selected at runtime based on CPU capabilities
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8a)
else()
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
endif()
endif()
set(OV_CPU_ARM_TARGET_ARCHS arm64-v8a
arm64-v8.2-a arm64-v8.2-a-sve arm64-v8.2-a-sve2
# used with estate=64
${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
endif()
set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT} CACHE STRING "Architecture for ARM ComputeLibrary")
set_property(CACHE OV_CPU_ARM_TARGET_ARCH PROPERTY STRINGS ${OV_CPU_ARM_TARGET_ARCHS})
if(OV_CPU_ARM_TARGET_ARCH MATCHES "(armv|arm64-v)[8-9]\\.")
add_definitions(-DOV_CPU_ARM_ENABLE_FP16)
endif()

if(X86 OR X86_64 OR AARCH64)
# disable mlas with webassembly
14 changes: 5 additions & 9 deletions src/plugins/intel_cpu/src/config.cpp
@@ -284,14 +284,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
inferencePrecision = ov::element::bf16;
}
} else if (prec == ov::element::f16) {
#if defined(OPENVINO_ARCH_X86_64)
if (hasHardwareSupport(ov::element::f16)) {
inferencePrecision = ov::element::f16;
}
#elif defined(OV_CPU_ARM_ENABLE_FP16)
// TODO: add runtime FP16 feature support check for ARM
inferencePrecision = ov::element::f16;
#endif
} else if (prec == ov::element::f32) {
inferencePrecision = ov::element::f32;
} else {
@@ -382,12 +377,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
if (!inferencePrecisionSetExplicitly) {
if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) {
inferencePrecision = ov::element::f32;
#if defined(OV_CPU_ARM_ENABLE_FP16)
inferencePrecision = ov::element::f16;
#else
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (hasHardwareSupport(ov::element::f16)) {
inferencePrecision = ov::element::f16;
}
#endif
if (mayiuse(avx512_core_bf16))
inferencePrecision = ov::element::bf16;
#endif
} else {
inferencePrecision = ov::element::f32;
}
11 changes: 6 additions & 5 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -37,11 +37,10 @@
#include "utils/ngraph_utils.hpp"
#include "utils/node_dumper.h"
#include "utils/verbose.h"
#include "utils/precision_support.h"

#include <oneapi/dnnl/dnnl.hpp>
#if defined(OV_CPU_ARM_ENABLE_FP16)
#include "common/primitive_desc_iface.hpp"
#endif

#include "openvino/runtime/memory_solver.hpp"

@@ -425,10 +424,12 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc
dnnl_primitive_desc_t result = nullptr;
auto status = dnnl_reorder_primitive_desc_create(&result, srcMemDesc.get(), eng.get(), dstMemDesc.get(), eng.get(),
attr.get());
#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM
// pretend the reorder is not available to use Convert node instead
if (result && parse_impl_name(result->impl()->name()) == ref_any) {
if (hasHardwareSupport(ov::element::f16) &&
result &&
parse_impl_name(result->impl()->name()) == ref_any) {
dnnl_primitive_desc_destroy(result);
return false;
}
Expand Down Expand Up @@ -1607,7 +1608,7 @@ void Graph::EnforceInferencePrecision() {

if (inferPrec == ov::element::f32)
return; // nothing to do, only precision reduction is currently allowed
#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (inferPrec == ov::element::f16)
return; // precision is already configured by ov::pass::ConvertPrecision
#endif
27 changes: 25 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -4,6 +4,7 @@

#include "acl_eltwise.hpp"
#include "acl_utils.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {
@@ -31,6 +32,17 @@ inline VectorDims reshape_sizes(VectorDims dims) {
return result_dims;
}

inline void log_unsupported_prec(const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const Algorithm eltwiseAlgorithm) {
std::string srcPrec;
for (size_t i = 0; i < srcDescs.size(); i++) {
srcPrec += srcDescs[i]->getPrecision().to_string() + " ";
}
DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec,
"] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported");
}

bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) {
if (one_of(algorithm, Algorithm::EltwiseSqrt,
Algorithm::EltwiseDivide,
@@ -94,6 +106,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
case Algorithm::EltwiseHswish:
if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -103,6 +116,7 @@
if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -113,6 +127,7 @@
checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -123,6 +138,7 @@
checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -134,6 +150,7 @@
checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -149,20 +166,26 @@
checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
default:
DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported");
return false;
}

for (const auto & srcDesc : srcDescs) {
if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN)
if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) {
DEBUG_LOG("src descriptor layout is unsupported by ACL: ", srcDesc->serializeFormat());
return false;
}
}
for (const auto & dstDesc : dstDescs) {
if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN)
if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) {
DEBUG_LOG("dst descriptor layout is unsupported by ACL: ", dstDesc->serializeFormat());
return false;
}
}

return true;
10 changes: 3 additions & 7 deletions src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -24,11 +24,9 @@
#include "nodes/common/reorder_prim.h"
#include "openvino/core/parallel.hpp"
#include "shape_inference/shape_inference_pass_through.hpp"

#if defined(OV_CPU_ARM_ENABLE_FP16)
#include "utils/precision_support.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/transpose_list.hpp"
#endif

namespace ov {
namespace intel_cpu {
@@ -128,7 +126,6 @@ void Reorder::executeDynamicImpl(dnnl::stream strm) {
execute(strm);
}

#if defined(OV_CPU_ARM_ENABLE_FP16)
void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc) {
auto getOrderAndBlockedDims = [](const MemoryDesc& lhs, const MemoryDesc& rhs) -> std::pair<std::vector<size_t>, std::vector<size_t>> {
const auto& in = lhs.as<BlockedMemoryDesc>()->getBlockDims();
@@ -180,7 +177,6 @@ void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr
getSelectedPrimitiveDescriptor()->setImplementationType(transposeExecutor->implType());
return;
}
#endif // OV_CPU_ARM_ENABLE_FP16

void Reorder::prepareParams() {
if (isOptimized)
@@ -211,7 +207,7 @@ void Reorder::prepareParams() {
const auto& parentDesc = srcMemPtr->getDescPtr();
const auto& childDesc = dstMemPtr->getDescPtr();

#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// @todo current oneDNN v3.2 lacks optimized jit implementation for fp16 reorders.
// Use transpose executor as a temporary WA.
if (everyone_is(ov::element::f16, parentDesc->getPrecision(), childDesc->getPrecision()) &&
@@ -405,7 +401,7 @@ void Reorder::optimizedNspc2Ncsp() {
}

void Reorder::execute(dnnl::stream strm) {
#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (transposeExecutor) {
auto dstMemPtr = getDstMemoryAtPort(0);
auto srcMemPtr = getSrcMemoryAtPort(0);
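
An aside on the transpose WA above: a pure layout reorder moves the same element values to a permuted position, which is what lets the plugin substitute a transpose executor for the slow fp16 reorder. A toy NCHW-to-NHWC sketch in plain C++, independent of this commit's types:

#include <cstddef>
#include <vector>

// Toy illustration: an NCHW -> NHWC layout reorder is a transpose with
// axis order {0, 2, 3, 1}; the element values themselves are untouched.
std::vector<float> nchw_to_nhwc(const std::vector<float>& src,
                                std::size_t N, std::size_t C,
                                std::size_t H, std::size_t W) {
    std::vector<float> dst(src.size());
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t c = 0; c < C; ++c)
            for (std::size_t h = 0; h < H; ++h)
                for (std::size_t w = 0; w < W; ++w)
                    dst[((n * H + h) * W + w) * C + c] =
                        src[((n * C + c) * H + h) * W + w];
    return dst;
}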
5 changes: 1 addition & 4 deletions src/plugins/intel_cpu/src/nodes/reorder.h
@@ -6,9 +6,7 @@

#include <node.h>

#if defined(OV_CPU_ARM_ENABLE_FP16)
#include "nodes/executors/transpose.hpp"
#endif

namespace ov {
namespace intel_cpu {
@@ -76,10 +74,9 @@ class Reorder : public Node {
void optimizedNspc2Ncsp();
void optimizedNcsp2Nspc();
void createReorderPrimitive(const dnnl::memory::desc &srcDesc, void* srcPtr, const dnnl::memory::desc &dstDesc, void* dstPtr);
#if defined(OV_CPU_ARM_ENABLE_FP16)

void prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc);
TransposeExecutorPtr transposeExecutor;
#endif
};

} // namespace node
@@ -318,9 +318,10 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
// @todo should we always convert to f32 regardless of hardware support, as it is done for f16?
if (!hasHardwareSupport(ov::element::bf16))
map.insert({ov::element::bf16, ov::element::f32});
#if defined(OV_CPU_ARM_ENABLE_FP16)
if (inferencePrecision != ov::element::f16)
map.insert({ov::element::f16, ov::element::f32});
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (inferencePrecision != ov::element::f16) {
map.insert({ov::element::f16, ov::element::f32});
}
#else
map.insert({ov::element::f16, ov::element::f32});
#endif
@@ -329,11 +330,12 @@

type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};

#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// It cannot be static data, because it may differ for different inferencePrecision values
const auto precisions = get_convert_precisions();
if (inferencePrecision == ov::element::f16) {
precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
// keep FakeQuantize nodes in f32 precision to avoid performance degradation
type_to_fuse_map f16_fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
const bool keep_precision_sensitive_in_fp32 = true;
CPU_REGISTER_PASS_COMMON(manager,
12 changes: 10 additions & 2 deletions src/plugins/intel_cpu/src/utils/precision_support.cpp
@@ -4,10 +4,16 @@

#include "precision_support.h"

#if defined(OPENVINO_ARCH_X86_64)
#include "cpu/x64/cpu_isa_traits.hpp"
#endif
#include "openvino/core/type/element_type.hpp"
#include "openvino/core/visibility.hpp"

#if defined(OV_CPU_WITH_ACL)
#include "arm_compute/core/CPP/CPPTypes.h"
#endif

namespace ov {
namespace intel_cpu {

@@ -17,8 +23,10 @@ static bool hasFP16HardwareSupport(const ov::element::Type& precision) {
dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2))
return true;
return false;
#elif defined(OV_CPU_ARM_ENABLE_FP16)
return true; // @todo add runtime check for arm as well
#elif defined(OPENVINO_ARCH_ARM64) && defined(OV_CPU_WITH_ACL)
// has_fp16() works correctly on aarch64 only
// TODO: remove the else branch as soon as ACL issue #1096 is fixed
return arm_compute::CPUInfo::get().has_fp16();
#else
return false;
#endif
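
For reference, a sketch of how a caller consumes this check — choosePrecision is a hypothetical helper, while hasHardwareSupport is the plugin's real utility from precision_support.h, used the same way in the config.cpp change above:

#include "openvino/core/type/element_type.hpp"
#include "utils/precision_support.h"

// Hypothetical helper mirroring the updated Config::readProperties logic.
ov::element::Type choosePrecision(const ov::element::Type& requested) {
    // Honor an f16 request only when the CPU reports FP16 support at
    // runtime; otherwise fall back to f32.
    if (requested == ov::element::f16 &&
        ov::intel_cpu::hasHardwareSupport(ov::element::f16))
        return ov::element::f16;
    return ov::element::f32;
}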
26 changes: 22 additions & 4 deletions src/plugins/intel_cpu/tests/functional/CMakeLists.txt
@@ -4,14 +4,32 @@

set(TARGET_NAME ov_cpu_func_tests)

add_library(cpuSpecificRtInfo STATIC
if(SUGGEST_OVERRIDE_SUPPORTED)
# xbyak compilation fails with -Wsuggest-override enabled
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-suggest-override")
endif()

add_library(cpuUtils STATIC
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.hpp
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.cpp)
target_link_libraries(cpuSpecificRtInfo PRIVATE openvino::runtime)
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.cpp
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/precision_support.h
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/precision_support.cpp)
set(CPU_UTILS_LINK_LIBRARIES openvino::runtime)
set(CPU_UTILS_INCLUDE_PATHS)
if(OV_CPU_WITH_ACL)
list(APPEND CPU_UTILS_LINK_LIBRARIES arm_compute::arm_compute)
list(APPEND CPU_UTILS_INCLUDE_PATHS $<TARGET_PROPERTY:arm_compute::arm_compute,SOURCE_DIR>)
endif()
if(OV_CPU_WITH_DNNL)
list(APPEND CPU_UTILS_LINK_LIBRARIES dnnl)
list(APPEND CPU_UTILS_INCLUDE_PATHS $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/thirdparty/onednn/src)
endif()
target_link_libraries(cpuUtils PRIVATE ${CPU_UTILS_LINK_LIBRARIES})
target_include_directories(cpuUtils PUBLIC ${CPU_UTILS_INCLUDE_PATHS})

set(INCLUDES ${CMAKE_CURRENT_SOURCE_DIR} $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src)
set(DEPENDENCIES openvino_intel_cpu_plugin openvino_template_extension)
set(LINK_LIBRARIES funcSharedTests cpuSpecificRtInfo openvino::snippets ov_snippets_models)
set(LINK_LIBRARIES funcSharedTests cpuUtils openvino::snippets ov_snippets_models)

if(ENABLE_OV_ONNX_FRONTEND)
list(APPEND DEFINES TEST_MODELS="${TEST_MODEL_ZOO}")
(Diff for the remaining changed files is not expanded.)