Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add optimized routines for aarch64 #5231

Merged
merged 17 commits into from
Jun 29, 2022
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,6 @@
[submodule "contrib/cpu_features"]
path = contrib/cpu_features
url = https://github.com/google/cpu_features
# ARM's optimized-routines sources; compiled via contrib/arm-optimized-routines-cmake
# when targeting AArch64 Linux.
[submodule "contrib/arm-optimized-routines"]
path = contrib/arm-optimized-routines
url = https://github.com/ARM-software/optimized-routines
4 changes: 4 additions & 0 deletions contrib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,7 @@ add_subdirectory(benchmark)

# Force BUILD_TESTING to OFF in the cache before adding cpu_features; per the cache
# docstring the intent is to keep cpu_features' own tests out of the build.
# NOTE(review): FORCE overwrites any BUILD_TESTING value supplied by the user and the
# cache entry is global, so it also affects everything configured afterwards — confirm
# that this is intended.
set (BUILD_TESTING OFF CACHE BOOL "Disable cpu-features testing" FORCE)
add_subdirectory(cpu_features)

# The ARM optimized-routines wrapper is only meaningful for AArch64 Linux builds
# (it relies on ELF indirect functions), so it is skipped everywhere else.
if (ARCH_LINUX AND ARCH_AARCH64)
    add_subdirectory(arm-optimized-routines-cmake)
endif ()
1 change: 1 addition & 0 deletions contrib/arm-optimized-routines
Submodule arm-optimized-routines added at e373f6
40 changes: 40 additions & 0 deletions contrib/arm-optimized-routines-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2022 PingCAP, Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This library overrides performance-critical routines for aarch64 targets.
# The implementations are imported from the official ARM repository.
# To reduce dispatching cost, the indirect function (ifunc) technique is used. Therefore,
# this library should only be enabled for ELF targets.

# Considerations:
# - As of June 2022, most enterprise OSs (CentOS 7, CentOS Stream 8 and RHEL 8) still
# use a relatively old glibc on ARM64, in which ASIMD, MTE, DC ZVA and SVE are not
# fully utilized. However, it is becoming increasingly common to use ARM64 instances
# in cloud-native deployments.
# - The `optimized-routines` repo is actively maintained by ARM. Therefore, its quality
# can be relied upon, and using it also lets us keep in sync with the latest
# acceleration techniques.

# The string library mixes a C dispatcher (src/aor.c) with upstream assembly sources,
# so both languages must be enabled for this directory.
enable_language(C)
enable_language(ASM)

# Location of the arm-optimized-routines submodule. It is anchored to this directory
# so that every command consuming it (file(GLOB), target_include_directories, ...)
# resolves exactly the same location instead of relying on each command's own
# handling of relative paths.
set(TIFLASH_AOR_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../arm-optimized-routines")

# Static library made of every AArch64 assembly (*.S) string routine shipped by the
# submodule plus the local ifunc dispatcher.
file(GLOB TIFLASH_AARCH64_STRING_FILES "${TIFLASH_AOR_DIR}/string/aarch64/*.S")
add_library(tiflash-aarch64-string STATIC ${TIFLASH_AARCH64_STRING_FILES} src/aor.c)

# SVE has to be enabled at compile time because the upstream header only exposes some
# function prototypes when the corresponding ARM feature macro is defined. Which
# implementation actually runs is still decided at load time by the resolvers in src/aor.c.
target_compile_options(tiflash-aarch64-string PRIVATE -march=armv8-a+sve)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

curious about why we need to add -march=armv8-a+sve explicitly? 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it was because in their header file they use _ARM_FEATURE flag to gate the function prototypes.

# Make the upstream string headers visible to the string library's own sources only.
target_include_directories(
    tiflash-aarch64-string
    PRIVATE
        ${TIFLASH_AOR_DIR}/string/include
)

# Math routines: every C source located directly under math/ is compiled into a
# separate static library, with the upstream math headers kept private to it.
file(
    GLOB TIFLASH_AARCH64_MATH_FILES
    ${TIFLASH_AOR_DIR}/math/*.c
)
add_library(
    tiflash-aarch64-math
    STATIC
        ${TIFLASH_AARCH64_MATH_FILES}
)
target_include_directories(
    tiflash-aarch64-math
    PRIVATE
        ${TIFLASH_AOR_DIR}/math/include
)
115 changes: 115 additions & 0 deletions contrib/arm-optimized-routines-cmake/src/aor.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Copyright 2022 PingCAP, Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <stringlib.h>
#include <sys/auxv.h>

// Fallback values for the auxiliary-vector constants used below, supplied only when the
// system headers of the build machine do not define them. This matters because a TiFlash
// binary built against older Linux headers may still be deployed on a newer system.
// The values are expected to be stable on Linux. On an older kernel the worst outcome is
// a false negative (a feature reported as absent), which is acceptable;
// `google/cpu_features` takes the same approach.
#if !defined(AT_HWCAP)
#define AT_HWCAP 16
#endif

#if !defined(AT_HWCAP2)
#define AT_HWCAP2 26
#endif

#if !defined(HWCAP_SVE)
#define HWCAP_SVE (1 << 22)
#endif

#if !defined(HWCAP2_MTE)
#define HWCAP2_MTE (1 << 18)
#endif

/// Report whether MTE is available in the current environment, i.e. whether the
/// HWCAP2_MTE bit is set in the AT_HWCAP2 entry of the auxiliary vector.
static inline bool mte_supported(void)
{
    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
    return (hwcap2 & HWCAP2_MTE) ? true : false;
}

/// Report whether SVE is available in the current environment, i.e. whether the
/// HWCAP_SVE bit is set in the AT_HWCAP entry of the auxiliary vector.
static inline bool sve_supported(void)
{
    const unsigned long hwcap = getauxval(AT_HWCAP);
    return (hwcap & HWCAP_SVE) ? true : false;
}

/// Two-level stringification: STRINGIFY lets its argument be macro-expanded first and
/// STRINGIFY_IMPL then turns the result into a string literal.
#define STRINGIFY_IMPL(X) #X
#define STRINGIFY(X) STRINGIFY_IMPL(X)
/**
 * \brief
 * Declare `__tiflash_NAME` as an indirect function (ifunc) resolved by `__tiflash_NAME_resolver`, and declare
 * `NAME` as a hidden-visibility alias of it. The resolver returns SVE when `sve_supported()` is true, otherwise
 * MTE when `mte_supported()` is true, otherwise ASIMD.
 *
 * Because the `NAME` alias has hidden visibility, the implementations here only override the routines within the
 * TiFlash binary itself. This is deliberate: dependencies like `ld.so`, `libgcc_s.so`, etc. need essential routines
 * such as `memcpy` to finish the early loading procedure, so declaring such symbols as visible indirect functions
 * would create a cyclic dependency. Overriding the symbols within TiFlash should be good enough, as most of the
 * heavy computation happens in the main binary.
 *
 * The resolver is a static function that is referenced only by name through the `ifunc` attribute, so the
 * `-Wunused-function` diagnostic is disabled around its definition.
 * \param NAME: exported symbol name
 * \param SVE: preferred implementation when SVE is available
 * \param MTE: preferred implementation when MTE is available
 * \param ASIMD: preferred implementation for generic aarch64 targets (ASIMD is required by default for Armv8 and above)
 */
#define DISPATCH(NAME, SVE, MTE, ASIMD) \
extern typeof(ASIMD) __tiflash_##NAME __attribute__((ifunc(STRINGIFY(__tiflash_##NAME##_resolver)))); \
extern typeof(ASIMD) NAME __attribute__((visibility("hidden"), alias(STRINGIFY(__tiflash_##NAME)))); \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wunused-function\"") static typeof(ASIMD) * __tiflash_##NAME##_resolver(void) \
{ \
if (sve_supported()) \
{ \
return SVE; \
} \
if (mte_supported()) \
{ \
return MTE; \
} \
return ASIMD; \
} \
_Pragma("GCC diagnostic pop")
// Remove any macro definitions of the routine names before they are passed to DISPATCH,
// so that each name is used as a plain identifier there.
// NOTE(review): presumably this guards against libc headers providing macro versions
// of these functions — confirm.
#undef memcpy
#undef memmove
#undef memset
#undef memchr
#undef memrchr
#undef memcmp
#undef strcpy
#undef stpcpy
#undef strcmp
#undef strchr
#undef strrchr
#undef strchrnul
#undef strlen
#undef strnlen
#undef strncmp

// Dispatch table. Argument order: exported name, implementation used when SVE is available,
// implementation used when MTE (but not SVE) is available, generic implementation.
// A row that repeats the same implementation in several slots selects it for all of those
// cases; memset and memrchr use a single implementation regardless of CPU features.
DISPATCH(memcpy, __memcpy_aarch64_sve, __memcpy_aarch64_simd, __memcpy_aarch64_simd)
DISPATCH(memmove, __memmove_aarch64_sve, __memmove_aarch64_simd, __memmove_aarch64_simd)
DISPATCH(memset, __memset_aarch64, __memset_aarch64, __memset_aarch64)
DISPATCH(memchr, __memchr_aarch64_sve, __memchr_aarch64_mte, __memchr_aarch64)
DISPATCH(memrchr, __memrchr_aarch64, __memrchr_aarch64, __memrchr_aarch64)
DISPATCH(memcmp, __memcmp_aarch64_sve, __memcmp_aarch64, __memcmp_aarch64)
DISPATCH(strcpy, __strcpy_aarch64_sve, __strcpy_aarch64, __strcpy_aarch64)
DISPATCH(stpcpy, __stpcpy_aarch64_sve, __stpcpy_aarch64, __stpcpy_aarch64)
DISPATCH(strcmp, __strcmp_aarch64_sve, __strcmp_aarch64, __strcmp_aarch64)
DISPATCH(strchr, __strchr_aarch64_sve, __strchr_aarch64_mte, __strchr_aarch64)
DISPATCH(strrchr, __strrchr_aarch64_sve, __strrchr_aarch64_mte, __strrchr_aarch64)
DISPATCH(strchrnul, __strchrnul_aarch64_sve, __strchrnul_aarch64_mte, __strchrnul_aarch64)
DISPATCH(strlen, __strlen_aarch64_sve, __strlen_aarch64_mte, __strlen_aarch64)
DISPATCH(strnlen, __strnlen_aarch64_sve, __strnlen_aarch64, __strnlen_aarch64)
DISPATCH(strncmp, __strncmp_aarch64_sve, __strncmp_aarch64, __strncmp_aarch64)
7 changes: 5 additions & 2 deletions dbms/src/Server/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ void loadMiConfig(Logger * log)
}
#undef TRY_LOAD_CONF
#endif

namespace
{
[[maybe_unused]] void tryLoadBoolConfigFromEnv(Poco::Logger * log, bool & target, const char * name)
Expand Down Expand Up @@ -967,7 +968,10 @@ class Server::TcpHttpServersHolder
LOG_DEBUG(log, debug_msg);
}

const std::vector<std::unique_ptr<Poco::Net::TCPServer>> & getServers() const { return servers; }
const std::vector<std::unique_ptr<Poco::Net::TCPServer>> & getServers() const
{
return servers;
}

private:
Server & server;
Expand Down Expand Up @@ -1003,7 +1007,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
#ifdef TIFLASH_ENABLE_SVE_SUPPORT
tryLoadBoolConfigFromEnv(log, simd_option::ENABLE_SVE, "TIFLASH_ENABLE_SVE");
#endif

registerFunctions();
registerAggregateFunctions();
registerWindowFunctions();
Expand Down
4 changes: 4 additions & 0 deletions libs/libcommon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,7 @@ if (ARCH_AMD64)
src/crc64_sse2_asimd.cpp
APPEND COMPILE_FLAGS "-mpclmul")
endif()

# On AArch64 Linux, link the optimized string and math routine libraries into `common`.
# PUBLIC propagates them to every target that links against `common`.
if (ARCH_LINUX AND ARCH_AARCH64)
    target_link_libraries (
        common
        PUBLIC
            tiflash-aarch64-string
            tiflash-aarch64-math
    )
endif()