From cda5a0239f5f661dc729cc2958fd61884c04082a Mon Sep 17 00:00:00 2001
From: Aleksandr Voron <aleksandr.voron@intel.com>
Date: Mon, 25 Mar 2024 11:22:00 +0100
Subject: [PATCH] [CPU][ARM] Enable both f16 and f32 kernels for aarch64 and
 introduce runtime f16 support check (#22992)

Inherited from https://github.com/openvinotoolkit/openvino/pull/22437

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .github/workflows/linux_arm64.yml             |  1 +
 .gitignore                                    |  1 +
 src/plugins/intel_cpu/CMakeLists.txt          | 26 +++++++++++++++---
 src/plugins/intel_cpu/src/config.cpp          | 14 ++++------
 src/plugins/intel_cpu/src/graph.cpp           | 11 ++++----
 .../src/nodes/executors/acl/acl_eltwise.cpp   | 27 +++++++++++++++++--
 src/plugins/intel_cpu/src/nodes/reorder.cpp   | 10 +++----
 src/plugins/intel_cpu/src/nodes/reorder.h     |  5 +---
 .../transformation_pipeline.cpp               | 10 ++++---
 .../intel_cpu/src/utils/precision_support.cpp | 12 +++++++--
 .../intel_cpu/tests/functional/CMakeLists.txt | 26 +++++++++++++++---
 .../custom/behavior/ov_plugin/properties.cpp  |  5 ++--
 .../shared_tests_instances/core_config.cpp    |  3 +--
 .../skip_tests_config.cpp                     | 19 +++++++------
 .../intel_cpu/thirdparty/ACLConfig.cmake      | 17 +++++-------
 15 files changed, 122 insertions(+), 65 deletions(-)

diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml
index 12bc0f8f84106e..32a2147a2b2433 100644
--- a/.github/workflows/linux_arm64.yml
+++ b/.github/workflows/linux_arm64.yml
@@ -172,6 +172,7 @@ jobs:
             -DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
             -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \
             -DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \
+            -DOV_CPU_AARCH64_USE_MULTI_ISA=OFF \
             -S ${OPENVINO_REPO} \
             -B ${BUILD_DIR}
 
diff --git a/.gitignore b/.gitignore
index 9bc1e79b3e53b1..9dd22697d3780a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,4 +61,5 @@ __pycache__
 /tools/mo/*.svg
 /src/plugins/intel_cpu/tools/commit_slider/*.json
 /src/plugins/intel_cpu/tools/commit_slider/slider_cache/*
+/src/plugins/intel_cpu/thirdparty/ComputeLibrary/build/*
 .github/GITHUB_OUTPUT
diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt
index 70da87819f03e5..c65bceae2a1d0b 100644
--- a/src/plugins/intel_cpu/CMakeLists.txt
+++ b/src/plugins/intel_cpu/CMakeLists.txt
@@ -30,6 +30,16 @@ elseif(OV_COMPILER_IS_CLANG)
     endif()
 endif()
 
+if (AARCH64 AND NOT APPLE AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2)
+    # According to https://github.com/ARM-software/ComputeLibrary/issues/1053#issuecomment-1846903707,
+    # 'multi_isa=1' (passed to ACL in thirdparty/ACLConfig.cmake) enables FP32, FP16 and SVE / SVE2 kernels.
+    # However, the arm_sve.h header is not available with gcc older than 10.2 (to be verified), so the compiler version is checked here
+    set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT ON)
+else()
+    set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT OFF)
+endif()
+set(OV_CPU_AARCH64_USE_MULTI_ISA ${OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT} CACHE BOOL "Build multi-ISA ACL")
+
 set(OV_CPU_ARM_TARGET_GENERIC_ARCHS armv8a
                                     armv8.2-a
                                     armv8.6-a armv8.6-a-sve armv8.6-a-sve2 armv8.6-a-sve2-sme2
@@ -41,7 +51,18 @@ if(ARM)
                                 # requires estate=32
                                 ${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
 elseif(AARCH64)
-    set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
+    if(APPLE)
+        set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
+    else()
+        if(OV_CPU_AARCH64_USE_MULTI_ISA)
+            # set v8a even though we want fp16 kernels, because
+            # we use multi_isa=1 in ACLConfig.cmake to enable both fp16 and fp32 kernels;
+            # the actual kernel is selected at runtime based on runtime capabilities
+            set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8a)
+        else()
+            set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
+        endif()
+    endif()
     set(OV_CPU_ARM_TARGET_ARCHS arm64-v8a
                                 arm64-v8.2-a arm64-v8.2-a-sve arm64-v8.2-a-sve2
                                 # used with estate=64
@@ -49,9 +70,6 @@ elseif(AARCH64)
 endif()
 set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT} CACHE STRING "Architecture for ARM ComputeLibrary")
 set_property(CACHE OV_CPU_ARM_TARGET_ARCH PROPERTY STRINGS ${OV_CPU_ARM_TARGET_ARCHS})
-if(OV_CPU_ARM_TARGET_ARCH MATCHES "(armv|arm64-v)[8-9]\\.")
-    add_definitions(-DOV_CPU_ARM_ENABLE_FP16)
-endif()
 
 if(X86 OR X86_64 OR AARCH64)
     # disable mlas with webassembly
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 8567914415e459..4d94abf72ebfc0 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -284,14 +284,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                         inferencePrecision = ov::element::bf16;
                     }
                 } else if (prec == ov::element::f16) {
-#if defined(OPENVINO_ARCH_X86_64)
                     if (hasHardwareSupport(ov::element::f16)) {
                         inferencePrecision = ov::element::f16;
                     }
-#elif defined(OV_CPU_ARM_ENABLE_FP16)
-                    // TODO: add runtime FP16 feature support check for ARM
-                    inferencePrecision = ov::element::f16;
-#endif
                 } else if (prec == ov::element::f32) {
                     inferencePrecision = ov::element::f32;
                 } else {
@@ -382,12 +377,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
     if (!inferencePrecisionSetExplicitly) {
         if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) {
             inferencePrecision = ov::element::f32;
-#if defined(OV_CPU_ARM_ENABLE_FP16)
-            inferencePrecision = ov::element::f16;
-#else
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+            if (hasHardwareSupport(ov::element::f16)) {
+                inferencePrecision = ov::element::f16;
+            }
+#endif
             if (mayiuse(avx512_core_bf16))
                 inferencePrecision = ov::element::bf16;
-#endif
         } else {
             inferencePrecision = ov::element::f32;
         }
diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index 3ffff01c6da6e7..ec9f4012ce53e0 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -37,11 +37,10 @@
 #include "utils/ngraph_utils.hpp"
 #include "utils/node_dumper.h"
 #include "utils/verbose.h"
+#include "utils/precision_support.h"
 
 #include <oneapi/dnnl/dnnl.hpp>
-#if defined(OV_CPU_ARM_ENABLE_FP16)
 #include "common/primitive_desc_iface.hpp"
-#endif
 
 #include "openvino/runtime/memory_solver.hpp"
 
@@ -425,10 +424,12 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc
     dnnl_primitive_desc_t result = nullptr;
     auto status = dnnl_reorder_primitive_desc_create(&result, srcMemDesc.get(), eng.get(), dstMemDesc.get(), eng.get(),
                                                      attr.get());
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
     // temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM
     // pretend the reorder is not available to use Convert node instead
-    if (result && parse_impl_name(result->impl()->name()) == ref_any) {
+    if (hasHardwareSupport(ov::element::f16) &&
+        result &&
+        parse_impl_name(result->impl()->name()) == ref_any) {
         dnnl_primitive_desc_destroy(result);
         return false;
     }
@@ -1607,7 +1608,7 @@ void Graph::EnforceInferencePrecision() {
 
     if (inferPrec == ov::element::f32)
         return; // nothing to do, only precision reduction is currently allowed
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
     if (inferPrec == ov::element::f16)
         return; // precision of configured by ov::pass::ConvertPrecision
 #endif
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
index cdc038fbf9155d..ae091deed57121 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -4,6 +4,7 @@
 
 #include "acl_eltwise.hpp"
 #include "acl_utils.hpp"
+#include "utils/debug_capabilities.h"
 
 namespace ov {
 namespace intel_cpu {
@@ -31,6 +32,17 @@ inline VectorDims reshape_sizes(VectorDims dims) {
     return result_dims;
 }
 
+inline void log_unsupported_prec(const std::vector<MemoryDescPtr>& srcDescs,
+                                 const std::vector<MemoryDescPtr>& dstDescs,
+                                 const Algorithm eltwiseAlgorithm) {
+    std::string srcPrec;
+    for (size_t i = 0; i < srcDescs.size(); i++) {
+        srcPrec += srcDescs[i]->getPrecision().to_string() + " ";
+    }
+    DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec,
+                          "] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported");
+}
+
 bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) {
     if (one_of(algorithm, Algorithm::EltwiseSqrt,
                           Algorithm::EltwiseDivide,
@@ -94,6 +106,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
         case Algorithm::EltwiseHswish:
             if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
                   checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
+                log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
                 return false;
             }
             break;
@@ -103,6 +116,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
             if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
                   checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
                   checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
+                log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
                 return false;
             }
             break;
@@ -113,6 +127,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
                   checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
                   checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
                   checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
+                log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
                 return false;
             }
             break;
@@ -123,6 +138,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
                   checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
                   checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
                   checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
+                log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
                 return false;
             }
             break;
@@ -134,6 +150,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
                   checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) ||
                   checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
                   checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
+                log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
                 return false;
             }
             break;
@@ -149,20 +166,26 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
                   checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) ||
                   checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) ||
                   checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) {
+                log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
                 return false;
             }
             break;
         default:
+            DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported");
             return false;
     }
 
     for (const auto & srcDesc : srcDescs) {
-        if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN)
+        if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) {
+            DEBUG_LOG("src descriptor layout is unsupported by ACL: ", srcDesc->serializeFormat());
             return false;
+        }
     }
     for (const auto & dstDesc : dstDescs) {
-        if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN)
+        if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) {
+            DEBUG_LOG("dst descriptor layout is unsupported by ACL: ", dstDesc->serializeFormat());
             return false;
+        }
     }
 
     return true;
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp
index a0d3f101f573a3..e1db1540fdd4b5 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -24,11 +24,9 @@
 #include "nodes/common/reorder_prim.h"
 #include "openvino/core/parallel.hpp"
 #include "shape_inference/shape_inference_pass_through.hpp"
-
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#include "utils/precision_support.h"
 #include "nodes/executors/executor.hpp"
 #include "nodes/executors/transpose_list.hpp"
-#endif
 
 namespace ov {
 namespace intel_cpu {
@@ -128,7 +126,6 @@ void Reorder::executeDynamicImpl(dnnl::stream strm) {
     execute(strm);
 }
 
-#if defined(OV_CPU_ARM_ENABLE_FP16)
 void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc) {
     auto getOrderAndBlockedDims = [](const MemoryDesc& lhs, const MemoryDesc& rhs) -> std::pair<std::vector<size_t>, std::vector<size_t>> {
         const auto& in = lhs.as<BlockedMemoryDesc>()->getBlockDims();
@@ -180,7 +177,6 @@ void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr
     getSelectedPrimitiveDescriptor()->setImplementationType(transposeExecutor->implType());
     return;
 }
-#endif // OV_CPU_ARM_ENABLE_FP16
 
 void Reorder::prepareParams() {
     if (isOptimized)
@@ -211,7 +207,7 @@ void Reorder::prepareParams() {
     const auto&  parentDesc = srcMemPtr->getDescPtr();
     const auto&  childDesc = dstMemPtr->getDescPtr();
 
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
     // @todo current oneDNN v3.2 lacks optimized jit implementation for fp16 reorders.
     // Use transpose executor as a temporary WA.
     if (everyone_is(ov::element::f16, parentDesc->getPrecision(), childDesc->getPrecision()) &&
@@ -405,7 +401,7 @@ void Reorder::optimizedNspc2Ncsp() {
 }
 
 void Reorder::execute(dnnl::stream strm) {
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
     if (transposeExecutor) {
         auto dstMemPtr = getDstMemoryAtPort(0);
         auto srcMemPtr = getSrcMemoryAtPort(0);
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h
index 07a7b7b53230be..cb99caa07bdfa6 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.h
+++ b/src/plugins/intel_cpu/src/nodes/reorder.h
@@ -6,9 +6,7 @@
 
 #include <node.h>
 
-#if defined(OV_CPU_ARM_ENABLE_FP16)
 #include "nodes/executors/transpose.hpp"
-#endif
 
 namespace ov {
 namespace intel_cpu {
@@ -76,10 +74,9 @@ class Reorder : public Node {
     void optimizedNspc2Ncsp();
     void optimizedNcsp2Nspc();
     void createReorderPrimitive(const dnnl::memory::desc &srcDesc, void* srcPtr, const dnnl::memory::desc &dstDesc, void* dstPtr);
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+
     void prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc);
     TransposeExecutorPtr transposeExecutor;
-#endif
 };
 
 }   // namespace node
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 8dbdd42cee0726..cdea46e202b1cd 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -318,9 +318,10 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
         // @todo should we always convert to f32 regardless of hardware support, as it is done for f16?
         if (!hasHardwareSupport(ov::element::bf16))
             map.insert({ov::element::bf16, ov::element::f32});
-#if defined(OV_CPU_ARM_ENABLE_FP16)
-        if (inferencePrecision != ov::element::f16)
-            map.insert({ov::element::f16, ov::element::f32});
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+        if (inferencePrecision != ov::element::f16) {
+                map.insert({ov::element::f16, ov::element::f32});
+        }
 #else
         map.insert({ov::element::f16, ov::element::f32});
 #endif
@@ -329,11 +330,12 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
 
     type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};
 
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
     // It cannot be static data, because it may be difference for different inferencePrecision
     const auto precisions = get_convert_precisions();
     if (inferencePrecision == ov::element::f16) {
         precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
+        // keep FakeQuantize nodes in f32 precision to avoid performance degradation
         type_to_fuse_map f16_fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
         const bool keep_precision_sensitive_in_fp32 = true;
         CPU_REGISTER_PASS_COMMON(manager,
diff --git a/src/plugins/intel_cpu/src/utils/precision_support.cpp b/src/plugins/intel_cpu/src/utils/precision_support.cpp
index 4a89002e63da48..e2e55a4d0f6cca 100644
--- a/src/plugins/intel_cpu/src/utils/precision_support.cpp
+++ b/src/plugins/intel_cpu/src/utils/precision_support.cpp
@@ -4,10 +4,16 @@
 
 #include "precision_support.h"
 
+#if defined(OPENVINO_ARCH_X86_64)
 #include "cpu/x64/cpu_isa_traits.hpp"
+#endif
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/core/visibility.hpp"
 
+#if defined(OV_CPU_WITH_ACL)
+#include "arm_compute/core/CPP/CPPTypes.h"
+#endif
+
 namespace ov {
 namespace intel_cpu {
 
@@ -17,8 +23,10 @@ static bool hasFP16HardwareSupport(const ov::element::Type& precision) {
         dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2))
         return true;
     return false;
-#elif defined(OV_CPU_ARM_ENABLE_FP16)
-    return true;  // @todo add runtime check for arm as well
+#elif defined(OPENVINO_ARCH_ARM64) && defined(OV_CPU_WITH_ACL)
+    // arm_compute::CPUInfo::has_fp16() works correctly on aarch64 only
+    // TODO: remove the else branch as soon as ACL issue #1096 is fixed
+    return arm_compute::CPUInfo::get().has_fp16();
 #else
     return false;
 #endif
diff --git a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt
index 3a58130da3463e..8e32bc3ec059b6 100644
--- a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt
+++ b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt
@@ -4,14 +4,32 @@
 
 set(TARGET_NAME ov_cpu_func_tests)
 
-add_library(cpuSpecificRtInfo STATIC
+if(SUGGEST_OVERRIDE_SUPPORTED)
+    # xbyak compilation fails
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-suggest-override")
+endif()
+
+add_library(cpuUtils STATIC
     $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.hpp
-    $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.cpp)
-target_link_libraries(cpuSpecificRtInfo PRIVATE openvino::runtime)
+    $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.cpp
+    $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/precision_support.h
+    $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/precision_support.cpp)
+set(CPU_UTILS_LINK_LIBRARIES openvino::runtime)
+set(CPU_UTILS_INCLUDE_PATHS)
+if(OV_CPU_WITH_ACL)
+    list(APPEND CPU_UTILS_LINK_LIBRARIES arm_compute::arm_compute)
+    list(APPEND CPU_UTILS_INCLUDE_PATHS $<TARGET_PROPERTY:arm_compute::arm_compute,SOURCE_DIR>)
+endif()
+if(OV_CPU_WITH_DNNL)
+    list(APPEND CPU_UTILS_LINK_LIBRARIES dnnl)
+    list(APPEND CPU_UTILS_INCLUDE_PATHS $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/thirdparty/onednn/src)
+endif()
+target_link_libraries(cpuUtils PRIVATE ${CPU_UTILS_LINK_LIBRARIES})
+target_include_directories(cpuUtils PUBLIC ${CPU_UTILS_INCLUDE_PATHS})
 
 set(INCLUDES ${CMAKE_CURRENT_SOURCE_DIR} $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src)
 set(DEPENDENCIES openvino_intel_cpu_plugin openvino_template_extension)
-set(LINK_LIBRARIES funcSharedTests cpuSpecificRtInfo openvino::snippets ov_snippets_models)
+set(LINK_LIBRARIES funcSharedTests cpuUtils openvino::snippets ov_snippets_models)
 
 if(ENABLE_OV_ONNX_FRONTEND)
     list(APPEND DEFINES TEST_MODELS="${TEST_MODEL_ZOO}")
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
index 1b29347d6c0605..27c80bce3fc1a0 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
@@ -5,6 +5,7 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
+#include "utils/precision_support.h"
 #include "utils/properties_test.hpp"
 #include "common_test_utils/test_assertions.hpp"
 #include "openvino/runtime/properties.hpp"
@@ -208,8 +209,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginSetConfigAffinityCore) {
     ASSERT_EQ(false, value);
 }
 
-#if defined(OV_CPU_ARM_ENABLE_FP16)
-    const auto expected_precision_for_performance_mode = ov::element::f16;
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    const auto expected_precision_for_performance_mode = ov::intel_cpu::hasHardwareSupport(ov::element::f16) ? ov::element::f16 : ov::element::f32;
 #else
     const auto expected_precision_for_performance_mode = ov::with_cpu_x86_bfloat16() ? ov::element::bf16 : ov::element::f32;
 #endif
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp
index 9dbdd255263b35..d2edd5a14eceb4 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/core_config.cpp
@@ -8,12 +8,11 @@ namespace ov {
 namespace test {
 
 void core_configuration(ov::test::SubgraphBaseTest* test) {
-    #if defined(OV_CPU_ARM_ENABLE_FP16) || defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
         //force fp32 inference precision if it is not configured specially
         if (!test->configuration.count(ov::hint::inference_precision.name())) {
             test->configuration.insert({ov::hint::inference_precision.name(), ov::element::f32.to_string()});
         }
-    #endif
+
         // todo: issue: 123320
         test->convert_precisions.insert({ov::element::bf16, ov::element::f32});
         test->convert_precisions.insert({ov::element::f16, ov::element::f32});
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
index 6ad35a8105d7b1..485c9cb5bd615a 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@@ -5,6 +5,7 @@
 #include "openvino/core/visibility.hpp"
 #include "functional_test_utils/skip_tests_config.hpp"
 #include "openvino/runtime/system_conf.hpp"
+#include "utils/precision_support.h"
 
 #include <string>
 #include <vector>
@@ -337,7 +338,6 @@ std::vector<std::string> disabledTestPatterns() {
     // int8 specific
     retVector.emplace_back(R"(smoke_Quantized.*)");
 
-#    if defined(OV_CPU_ARM_ENABLE_FP16)
     // Issue: 123019
     retVector.emplace_back(R"(smoke_staticShapes4D.*INFERENCE_PRECISION_HINT=f16.*)");
     retVector.emplace_back(R"(smoke_dynamicShapes4D.*INFERENCE_PRECISION_HINT=f16.*)");
@@ -351,7 +351,6 @@ std::vector<std::string> disabledTestPatterns() {
     // Issue: 124395
     retVector.emplace_back(R"(smoke_VariableStateBasic/InferRequestVariableStateTest.*)");
     retVector.emplace_back(R"(smoke_VariableState/OVInferRequestVariableStateTest.*)");
-#    endif
 
 #endif
 
@@ -416,14 +415,14 @@ std::vector<std::string> disabledTestPatterns() {
         retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
     }
 #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM)
-#    if !defined(OV_CPU_ARM_ENABLE_FP16)
-    // Skip fp16 tests for paltforms that don't support fp16 precision
-    retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
-#    else
-    // Issue 117407
-    retVector.emplace_back(
-        R"(.*EltwiseLayerCPUTest.*IS=\(\[1\.\.10\.2\.5\.6\]_\).*eltwiseOpType=SqDiff.*_configItem=INFERENCE_PRECISION_HINT=f16.*)");
-#    endif  // OV_CPU_ARM_ENABLE_FP16
+    if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) {
+        // Skip fp16 tests for platforms that don't support fp16 precision
+        retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
+    } else {
+        // Issue 117407
+        retVector.emplace_back(
+            R"(.*EltwiseLayerCPUTest.*IS=\(\[1\.\.10\.2\.5\.6\]_\).*eltwiseOpType=SqDiff.*_configItem=INFERENCE_PRECISION_HINT=f16.*)");
+    }
 #endif
     if (!ov::with_cpu_x86_avx512_core_vnni() && !ov::with_cpu_x86_avx512_core_amx_int8()) {
         // MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions
diff --git a/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake b/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
index 3afbae622af835..09774aa4bec493 100644
--- a/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
+++ b/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
@@ -98,7 +98,6 @@ elseif(NOT TARGET arm_compute::arm_compute)
     #
 
     set(ARM_COMPUTE_SOURCE_DIR "${intel_cpu_thirdparty_SOURCE_DIR}/ComputeLibrary")
-    set(ARM_COMPUTE_BINARY_DIR "${intel_cpu_thirdparty_BINARY_DIR}/ComputeLibrary")
 
     message(STATUS "Configure to build ${ARM_COMPUTE_SOURCE_DIR}")
 
@@ -149,17 +148,16 @@ elseif(NOT TARGET arm_compute::arm_compute)
         list(APPEND ARM_COMPUTE_OPTIONS estate=32)
     else()
         list(APPEND ARM_COMPUTE_OPTIONS estate=64)
-        if(NOT APPLE AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2)
-            # arm_sve.h header is not available on gcc older 10.2
-            # TODO: validate it on machines with FP16 / SVE support and enabled back
-            # list(APPEND ARM_COMPUTE_OPTIONS multi_isa=1)
+        if(OV_CPU_AARCH64_USE_MULTI_ISA)
+            list(APPEND ARM_COMPUTE_OPTIONS multi_isa=1)
+            # let's additionally enable SME as well
+            set(extra_cxx_flags "${extra_cxx_flags} -DENABLE_SME -DARM_COMPUTE_ENABLE_SME -DARM_COMPUTE_ENABLE_SME2")
         endif()
     endif()
 
     if(NOT MSVC64)
         list(APPEND ARM_COMPUTE_OPTIONS
-            build_dir=${ARM_COMPUTE_BINARY_DIR}
-            install_dir=${ARM_COMPUTE_BINARY_DIR}/install)
+            install_dir=install)
     endif()
 
     if(ARM_COMPUTE_SCONS_JOBS)
@@ -329,11 +327,10 @@ elseif(NOT TARGET arm_compute::arm_compute)
 
     if(MSVC64)
         set(arm_compute build/arm_compute-static.lib)
-        set(arm_compute_full_path "${ARM_COMPUTE_SOURCE_DIR}/${arm_compute}")
     else()
-        set(arm_compute ${ARM_COMPUTE_BINARY_DIR}/libarm_compute-static.a)
-        set(arm_compute_full_path "${arm_compute}")
+        set(arm_compute build/libarm_compute-static.a)
     endif()
+    set(arm_compute_full_path "${ARM_COMPUTE_SOURCE_DIR}/${arm_compute}")
 
     list(APPEND ARM_COMPUTE_OPTIONS fixed_format_kernels=True)