[CPU][ARM] Enable both f16 and f32 kernels for aarch64 and introduce runtime f16 support check (#22992)

Inherited from #22437

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
alvoron and ilya-lavrenov authored Mar 25, 2024
1 parent 4e6bfe8 commit cda5a02
Showing 15 changed files with 122 additions and 65 deletions.
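
For context, a minimal sketch of how the new runtime f16 check surfaces to API users — the "model.xml" path and the surrounding flow are illustrative assumptions, not part of this commit:

#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Hypothetical model path, for illustration only.
    auto model = core.read_model("model.xml");
    // Request f16 inference. With this commit, on ARM the CPU plugin honors
    // the hint only if the CPU reports FP16 support at runtime; otherwise
    // inference precision stays at f32.
    auto compiled = core.compile_model(model, "CPU",
                                       ov::hint::inference_precision(ov::element::f16));
    // Inspect the precision that was actually selected.
    std::cout << compiled.get_property(ov::hint::inference_precision) << std::endl;
    return 0;
}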
1 change: 1 addition & 0 deletions .github/workflows/linux_arm64.yml
@@ -172,6 +172,7 @@ jobs:
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \
-DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \
-DOV_CPU_AARCH64_USE_MULTI_ISA=OFF \
-S ${OPENVINO_REPO} \
-B ${BUILD_DIR}
1 change: 1 addition & 0 deletions .gitignore
@@ -61,4 +61,5 @@ __pycache__
/tools/mo/*.svg
/src/plugins/intel_cpu/tools/commit_slider/*.json
/src/plugins/intel_cpu/tools/commit_slider/slider_cache/*
/src/plugins/intel_cpu/thirdparty/ComputeLibrary/build/*
.github/GITHUB_OUTPUT
26 changes: 22 additions & 4 deletions src/plugins/intel_cpu/CMakeLists.txt
@@ -30,6 +30,16 @@ elseif(OV_COMPILER_IS_CLANG)
endif()
endif()

if (AARCH64 AND NOT APPLE AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2)
# according to https://github.com/ARM-software/ComputeLibrary/issues/1053#issuecomment-1846903707,
# the 'multi_isa=1' below enables FP32, FP16 and SVE / SVE2 kernels.
# But the arm_sve.h header is not available on GCC older than 10.2, so we have to check the compiler version
set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT ON)
else()
set(OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT OFF)
endif()
set(OV_CPU_AARCH64_USE_MULTI_ISA ${OV_CPU_AARCH64_USE_MULTI_ISA_DEFAULT} CACHE BOOL "Build multi-ISA ACL")

set(OV_CPU_ARM_TARGET_GENERIC_ARCHS armv8a
armv8.2-a
armv8.6-a armv8.6-a-sve armv8.6-a-sve2 armv8.6-a-sve2-sme2
@@ -41,17 +51,25 @@ if(ARM)
# requires estate=32
${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
elseif(AARCH64)
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
if(APPLE)
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
else()
if(OV_CPU_AARCH64_USE_MULTI_ISA)
# set v8a even though we want fp16 kernels, because
# we use multi_isa=1 in ACLConfig.cmake to enable both fp16 and fp32 kernels;
# the actual kernel is selected at runtime based on CPU capabilities
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8a)
else()
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
endif()
endif()
set(OV_CPU_ARM_TARGET_ARCHS arm64-v8a
arm64-v8.2-a arm64-v8.2-a-sve arm64-v8.2-a-sve2
# used with estate=64
${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
endif()
set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT} CACHE STRING "Architecture for ARM ComputeLibrary")
set_property(CACHE OV_CPU_ARM_TARGET_ARCH PROPERTY STRINGS ${OV_CPU_ARM_TARGET_ARCHS})
if(OV_CPU_ARM_TARGET_ARCH MATCHES "(armv|arm64-v)[8-9]\\.")
add_definitions(-DOV_CPU_ARM_ENABLE_FP16)
endif()

if(X86 OR X86_64 OR AARCH64)
# disable mlas with webassembly
14 changes: 5 additions & 9 deletions src/plugins/intel_cpu/src/config.cpp
@@ -284,14 +284,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
inferencePrecision = ov::element::bf16;
}
} else if (prec == ov::element::f16) {
#if defined(OPENVINO_ARCH_X86_64)
if (hasHardwareSupport(ov::element::f16)) {
inferencePrecision = ov::element::f16;
}
#elif defined(OV_CPU_ARM_ENABLE_FP16)
// TODO: add runtime FP16 feature support check for ARM
inferencePrecision = ov::element::f16;
#endif
} else if (prec == ov::element::f32) {
inferencePrecision = ov::element::f32;
} else {
@@ -382,12 +377,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
if (!inferencePrecisionSetExplicitly) {
if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) {
inferencePrecision = ov::element::f32;
#if defined(OV_CPU_ARM_ENABLE_FP16)
inferencePrecision = ov::element::f16;
#else
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (hasHardwareSupport(ov::element::f16)) {
inferencePrecision = ov::element::f16;
}
#endif
if (mayiuse(avx512_core_bf16))
inferencePrecision = ov::element::bf16;
#endif
} else {
inferencePrecision = ov::element::f32;
}
11 changes: 6 additions & 5 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -37,11 +37,10 @@
#include "utils/ngraph_utils.hpp"
#include "utils/node_dumper.h"
#include "utils/verbose.h"
#include "utils/precision_support.h"

#include <oneapi/dnnl/dnnl.hpp>
#if defined(OV_CPU_ARM_ENABLE_FP16)
#include "common/primitive_desc_iface.hpp"
#endif

#include "openvino/runtime/memory_solver.hpp"

@@ -425,10 +424,12 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc
dnnl_primitive_desc_t result = nullptr;
auto status = dnnl_reorder_primitive_desc_create(&result, srcMemDesc.get(), eng.get(), dstMemDesc.get(), eng.get(),
attr.get());
#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM
// pretend the reorder is not available to use Convert node instead
if (result && parse_impl_name(result->impl()->name()) == ref_any) {
if (hasHardwareSupport(ov::element::f16) &&
result &&
parse_impl_name(result->impl()->name()) == ref_any) {
dnnl_primitive_desc_destroy(result);
return false;
}
Expand Down Expand Up @@ -1607,7 +1608,7 @@ void Graph::EnforceInferencePrecision() {

if (inferPrec == ov::element::f32)
return; // nothing to do, only precision reduction is currently allowed
#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (inferPrec == ov::element::f16)
return; // precision is already configured by ov::pass::ConvertPrecision
#endif
27 changes: 25 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -4,6 +4,7 @@

#include "acl_eltwise.hpp"
#include "acl_utils.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {
@@ -31,6 +32,17 @@ inline VectorDims reshape_sizes(VectorDims dims) {
return result_dims;
}

inline void log_unsupported_prec(const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const Algorithm eltwiseAlgorithm) {
std::string srcPrec;
for (size_t i = 0; i < srcDescs.size(); i++) {
srcPrec += srcDescs[i]->getPrecision().to_string() + " ";
}
DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec,
"] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported");
}

bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) {
if (one_of(algorithm, Algorithm::EltwiseSqrt,
Algorithm::EltwiseDivide,
@@ -94,6 +106,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
case Algorithm::EltwiseHswish:
if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -103,6 +116,7 @@
if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -113,6 +127,7 @@
checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -123,6 +138,7 @@
checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -134,6 +150,7 @@
checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
@@ -149,20 +166,26 @@
checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) ||
checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) ||
checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) {
log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm);
return false;
}
break;
default:
DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported");
return false;
}

for (const auto & srcDesc : srcDescs) {
if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN)
if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) {
DEBUG_LOG("src descriptor layout is unsupported by ACL: ", srcDesc->serializeFormat());
return false;
}
}
for (const auto & dstDesc : dstDescs) {
if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN)
if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) {
DEBUG_LOG("dst descriptor layout is unsupported by ACL: ", dstDesc->serializeFormat());
return false;
}
}

return true;
10 changes: 3 additions & 7 deletions src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -24,11 +24,9 @@
#include "nodes/common/reorder_prim.h"
#include "openvino/core/parallel.hpp"
#include "shape_inference/shape_inference_pass_through.hpp"

#if defined(OV_CPU_ARM_ENABLE_FP16)
#include "utils/precision_support.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/transpose_list.hpp"
#endif

namespace ov {
namespace intel_cpu {
@@ -128,7 +126,6 @@ void Reorder::executeDynamicImpl(dnnl::stream strm) {
execute(strm);
}

#if defined(OV_CPU_ARM_ENABLE_FP16)
void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc) {
auto getOrderAndBlockedDims = [](const MemoryDesc& lhs, const MemoryDesc& rhs) -> std::pair<std::vector<size_t>, std::vector<size_t>> {
const auto& in = lhs.as<BlockedMemoryDesc>()->getBlockDims();
@@ -180,7 +177,6 @@ void Reorder::prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr
getSelectedPrimitiveDescriptor()->setImplementationType(transposeExecutor->implType());
return;
}
#endif // OV_CPU_ARM_ENABLE_FP16

void Reorder::prepareParams() {
if (isOptimized)
@@ -211,7 +207,7 @@ void Reorder::prepareParams() {
const auto& parentDesc = srcMemPtr->getDescPtr();
const auto& childDesc = dstMemPtr->getDescPtr();

#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// @todo current oneDNN v3.2 lacks optimized jit implementation for fp16 reorders.
// Use transpose executor as a temporary WA.
if (everyone_is(ov::element::f16, parentDesc->getPrecision(), childDesc->getPrecision()) &&
@@ -405,7 +401,7 @@ void Reorder::optimizedNspc2Ncsp() {
}

void Reorder::execute(dnnl::stream strm) {
#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (transposeExecutor) {
auto dstMemPtr = getDstMemoryAtPort(0);
auto srcMemPtr = getSrcMemoryAtPort(0);
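
An aside on the transpose WA above: a pure layout reorder moves the same element values to a permuted position, which is what lets the plugin substitute a transpose executor for the slow fp16 reorder. A toy NCHW-to-NHWC sketch in plain C++, independent of this commit's types:

#include <cstddef>
#include <vector>

// Toy illustration: an NCHW -> NHWC layout reorder is a transpose with
// axis order {0, 2, 3, 1}; the element values themselves are untouched.
std::vector<float> nchw_to_nhwc(const std::vector<float>& src,
                                std::size_t N, std::size_t C,
                                std::size_t H, std::size_t W) {
    std::vector<float> dst(src.size());
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t c = 0; c < C; ++c)
            for (std::size_t h = 0; h < H; ++h)
                for (std::size_t w = 0; w < W; ++w)
                    dst[((n * H + h) * W + w) * C + c] =
                        src[((n * C + c) * H + h) * W + w];
    return dst;
}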
5 changes: 1 addition & 4 deletions src/plugins/intel_cpu/src/nodes/reorder.h
@@ -6,9 +6,7 @@

#include <node.h>

#if defined(OV_CPU_ARM_ENABLE_FP16)
#include "nodes/executors/transpose.hpp"
#endif

namespace ov {
namespace intel_cpu {
@@ -76,10 +74,9 @@ class Reorder : public Node {
void optimizedNspc2Ncsp();
void optimizedNcsp2Nspc();
void createReorderPrimitive(const dnnl::memory::desc &srcDesc, void* srcPtr, const dnnl::memory::desc &dstDesc, void* dstPtr);
#if defined(OV_CPU_ARM_ENABLE_FP16)

void prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc);
TransposeExecutorPtr transposeExecutor;
#endif
};

} // namespace node
@@ -318,9 +318,10 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
// @todo should we always convert to f32 regardless of hardware support, as it is done for f16?
if (!hasHardwareSupport(ov::element::bf16))
map.insert({ov::element::bf16, ov::element::f32});
#if defined(OV_CPU_ARM_ENABLE_FP16)
if (inferencePrecision != ov::element::f16)
map.insert({ov::element::f16, ov::element::f32});
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
if (inferencePrecision != ov::element::f16) {
map.insert({ov::element::f16, ov::element::f32});
}
#else
map.insert({ov::element::f16, ov::element::f32});
#endif
@@ -329,11 +330,12 @@

type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};

#if defined(OV_CPU_ARM_ENABLE_FP16)
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// It cannot be static data, because it may differ for different inferencePrecision values
const auto precisions = get_convert_precisions();
if (inferencePrecision == ov::element::f16) {
precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
// keep FakeQuantize nodes in f32 precision to avoid performance degradation
type_to_fuse_map f16_fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
const bool keep_precision_sensitive_in_fp32 = true;
CPU_REGISTER_PASS_COMMON(manager,
12 changes: 10 additions & 2 deletions src/plugins/intel_cpu/src/utils/precision_support.cpp
@@ -4,10 +4,16 @@

#include "precision_support.h"

#if defined(OPENVINO_ARCH_X86_64)
#include "cpu/x64/cpu_isa_traits.hpp"
#endif
#include "openvino/core/type/element_type.hpp"
#include "openvino/core/visibility.hpp"

#if defined(OV_CPU_WITH_ACL)
#include "arm_compute/core/CPP/CPPTypes.h"
#endif

namespace ov {
namespace intel_cpu {

@@ -17,8 +23,10 @@ static bool hasFP16HardwareSupport(const ov::element::Type& precision) {
dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2))
return true;
return false;
#elif defined(OV_CPU_ARM_ENABLE_FP16)
return true; // @todo add runtime check for arm as well
#elif defined(OPENVINO_ARCH_ARM64) && defined(OV_CPU_WITH_ACL)
// has_fp16() works correctly on aarch64 only
// TODO: remove the else branch as soon as ACL issue #1096 is fixed
return arm_compute::CPUInfo::get().has_fp16();
#else
return false;
#endif
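
For reference, a sketch of how a caller consumes this check — choosePrecision is a hypothetical helper, while hasHardwareSupport is the plugin's real utility from precision_support.h, used the same way in the config.cpp change above:

#include "openvino/core/type/element_type.hpp"
#include "utils/precision_support.h"

// Hypothetical helper mirroring the updated Config::readProperties logic.
ov::element::Type choosePrecision(const ov::element::Type& requested) {
    // Honor an f16 request only when the CPU reports FP16 support at
    // runtime; otherwise fall back to f32.
    if (requested == ov::element::f16 &&
        ov::intel_cpu::hasHardwareSupport(ov::element::f16))
        return ov::element::f16;
    return ov::element::f32;
}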
26 changes: 22 additions & 4 deletions src/plugins/intel_cpu/tests/functional/CMakeLists.txt
@@ -4,14 +4,32 @@

set(TARGET_NAME ov_cpu_func_tests)

add_library(cpuSpecificRtInfo STATIC
if(SUGGEST_OVERRIDE_SUPPORTED)
# xbyak compilation fails with -Wsuggest-override enabled
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-suggest-override")
endif()

add_library(cpuUtils STATIC
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.hpp
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.cpp)
target_link_libraries(cpuSpecificRtInfo PRIVATE openvino::runtime)
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/rt_info/memory_formats_attribute.cpp
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/precision_support.h
$<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src/utils/precision_support.cpp)
set(CPU_UTILS_LINK_LIBRARIES openvino::runtime)
set(CPU_UTILS_INCLUDE_PATHS)
if(OV_CPU_WITH_ACL)
list(APPEND CPU_UTILS_LINK_LIBRARIES arm_compute::arm_compute)
list(APPEND CPU_UTILS_INCLUDE_PATHS $<TARGET_PROPERTY:arm_compute::arm_compute,SOURCE_DIR>)
endif()
if(OV_CPU_WITH_DNNL)
list(APPEND CPU_UTILS_LINK_LIBRARIES dnnl)
list(APPEND CPU_UTILS_INCLUDE_PATHS $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/thirdparty/onednn/src)
endif()
target_link_libraries(cpuUtils PRIVATE ${CPU_UTILS_LINK_LIBRARIES})
target_include_directories(cpuUtils PUBLIC ${CPU_UTILS_INCLUDE_PATHS})

set(INCLUDES ${CMAKE_CURRENT_SOURCE_DIR} $<TARGET_PROPERTY:openvino_intel_cpu_plugin,SOURCE_DIR>/src)
set(DEPENDENCIES openvino_intel_cpu_plugin openvino_template_extension)
set(LINK_LIBRARIES funcSharedTests cpuSpecificRtInfo openvino::snippets ov_snippets_models)
set(LINK_LIBRARIES funcSharedTests cpuUtils openvino::snippets ov_snippets_models)

if(ENABLE_OV_ONNX_FRONTEND)
list(APPEND DEFINES TEST_MODELS="${TEST_MODEL_ZOO}")
(Diff for the remaining changed files is not expanded.)