Skip to content

Commit

Permalink
Merge pull request #2 from seb711/simde-arm-action
Browse files Browse the repository at this point in the history
Simde arm action
  • Loading branch information
seb711 authored Jan 15, 2024
2 parents 5b4de3b + a11012c commit c46afeb
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 23 deletions.
19 changes: 15 additions & 4 deletions .github/workflows/vs17-arm-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,19 @@ jobs:
- {gen: Visual Studio 17 2022, arch: ARM64}
steps:
- name: checkout
uses: actions/checkout@v4
- name: Use cmake
uses: actions/checkout@v2
- name: Configure
run: |
cmake -G "${{matrix.gen}}" -A ${{ matrix.arch }} -DCMAKE_CROSSCOMPILING=1 -B build &&
cmake --build build --verbose
cmake -B build
- name: Build Debug
run: cmake --build build --config Debug --verbose
- name: Build Release
run: cmake --build build --config Release --verbose
- name: Run Release tests
run: |
cd build
ctest -C Release -LE explicitonly --output-on-failure
- name: Run Debug tests
run: |
cd build
ctest -C Debug -LE explicitonly --output-on-failure
7 changes: 6 additions & 1 deletion cmake_modules/environment.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
include(CheckCXXCompilerFlag)

if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(SUPPORT_NEON ON)
endif ()

# Check if the Visual Studio build is targeting ARM
if (CMAKE_GENERATOR_PLATFORM MATCHES "ARM64" OR CMAKE_GENERATOR_PLATFORM MATCHES "ARM")
set(SUPPORT_NEON ON)
endif ()
5 changes: 5 additions & 0 deletions headers/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,12 @@
#ifdef _MSC_VER
#include <iso646.h>
#include <stdint.h>

#if (defined(_M_X64) || defined(_M_AMD64))
#include <intrin.h>
#elif defined(_M_ARM64)
#include <simde/x86/sse4.1.h>
#endif

#define __attribute__(n)
#define __restrict__ __restrict
Expand Down
35 changes: 29 additions & 6 deletions headers/cpubenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,34 @@ static __inline__ unsigned long long stopRDTSCP(void) {
"%rdx");
return (static_cast<unsigned long long>(cycles_high) << 32) | cycles_low;
}
#elif defined(_MSC_VER)
#elif (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))

// Start timestamp for the benchmark timer: returns the value produced by the
// __rdtsc() intrinsic.
static inline unsigned long long startRDTSC(void) { return __rdtsc(); }

// Stop timestamp: despite the RDTSCP in its name, this also calls __rdtsc().
static inline unsigned long long stopRDTSCP(void) { return __rdtsc(); }
#elif defined(_MSC_VER) && defined(_M_ARM64)
// Modeled on the zeromq implementation for MSVC on ARM/ARM64:
// https://github.com/zeromq/libzmq/blob/master/src/clock.cpp
//
// Packs the op0/op1/crn/crm/op2 fields into the system-register id for
// PMCCNTR_EL0 and returns whatever _ReadStatusReg yields for that id.
inline unsigned long long rdtsc() {
  const int64_t op0 = (3 & 1) << 14;
  const int64_t op1 = (3 & 7) << 11;
  const int64_t crn = (9 & 15) << 7;
  const int64_t crm = (13 & 15) << 3;
  const int64_t op2 = (0 & 7) << 0;
  const int64_t pmccntr_el0_id = op0 | op1 | crn | crm | op2;

  return _ReadStatusReg(pmccntr_el0_id);
}

// Start timestamp: forwards to the rdtsc() helper of this branch.
static inline unsigned long long startRDTSC(void) { return rdtsc(); }

// Stop timestamp: reads the same counter through the same rdtsc() helper.
static inline unsigned long long stopRDTSCP(void) { return rdtsc(); }
#elif (defined(_MSC_VER) && (defined(_M_ARM64)))
// NOTE(review): this condition repeats the `defined(_MSC_VER) &&
// defined(_M_ARM64)` test of an earlier #elif in the same chain, so this
// branch looks unreachable as written -- confirm which ARM64 variant should
// be kept.
// Taken from the Microsoft documentation; see
// https://learn.microsoft.com/en-us/cpp/build/overview-of-arm-abi-conventions?view=msvc-170

// Start timestamp: returns the value of the __rdpmccntr64() intrinsic.
static inline unsigned long long startRDTSC(void) { return __rdpmccntr64(); }

// Stop timestamp: reads the same counter through the same intrinsic.
static inline unsigned long long stopRDTSCP(void) { return __rdpmccntr64(); }

#elif defined(__i386__) || defined(__x86_64__)

Expand All @@ -66,7 +89,7 @@ inline unsigned long long rdtsc() {
// Start timestamp: forwards to the rdtsc() helper of this branch.
static __inline__ unsigned long long startRDTSC(void) { return rdtsc(); }

// Stop timestamp: despite the RDTSCP in its name, this also calls rdtsc().
static __inline__ unsigned long long stopRDTSCP(void) { return rdtsc(); }
#elif defined(__aarch64__)
// AArch64 with a GCC-compatible compiler. The predefined macro is
// __aarch64__; the misspelled __arch64__ is never defined, which made this
// branch dead and sent such builds to the "#error Unknown architecture" case.
#elif (defined(__GNUC__) && (defined(__aarch64__)))
inline uint64_t rdtsc() {
uint64_t cycles;
asm volatile("mrs %0, cntvct_el0"
Expand All @@ -77,14 +100,14 @@ static __inline__ unsigned long long stopRDTSCP(void) { return rdtsc(); }
// Start timestamp: forwards to the rdtsc() helper of this branch.
static __inline__ uint64_t startRDTSC(void) { return rdtsc(); }

// Stop timestamp: reads the same counter through the same rdtsc() helper.
static __inline__ uint64_t stopRDTSCP(void) { return rdtsc(); }
#elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__))
#elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__)) || (defined(_MSC_VER) && defined(_M_ARM64))

// for PPC we should be able to use tbl, but I could not find
// an equivalent to rdtsc for ARM.

inline uint64 rdtsc() { return 0; }
static __inline__ ticks startRDTSC(void) { return 0; }
static __inline__ ticks stopRDTSCP(void) { return 0; }
// Fallback path: no counter is read here, so every timing helper reports 0.
inline uint64_t rdtsc() { return 0; }
// Start/stop forward to rdtsc(), the same shape the other branches use.
static __inline__ uint64_t startRDTSC(void) { return rdtsc(); }
static __inline__ uint64_t stopRDTSCP(void) { return rdtsc(); }
#else
#error Unknown architecture
#endif
Expand Down
8 changes: 4 additions & 4 deletions headers/simdgroupsimple.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ namespace FastPForLib {
* efficiency is not that crucial here.
*/

#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || defined(_MSC_VER)
#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))

inline static void comprIncompleteBlock(const uint8_t &n, const __m128i *&in,
__m128i *&out) {
Expand All @@ -166,7 +166,7 @@ namespace FastPForLib {
_mm_storeu_si128(out++, comprBlock);
}

#elif defined(__aarch64__)
// AArch64 build: __aarch64__ (GCC-compatible compilers) or _M_ARM64 (MSVC).
// The former spelling __arch64__ is never defined, so GCC/Clang AArch64
// builds were left without a comprIncompleteBlock definition.
#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64))
inline static void comprIncompleteBlock(const uint8_t &n, const __m128i *&in,
__m128i *&out) {
// Since we have to produce exactly one compressed vector anyway, we can
Expand Down Expand Up @@ -627,7 +627,7 @@ namespace FastPForLib {
* efficiency is not that crucial here.
*/

#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || defined(_MSC_VER)
#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))

inline static void decomprIncompleteBlock(const uint8_t &n,
const __m128i *&in,
Expand All @@ -641,7 +641,7 @@ namespace FastPForLib {
_mm_and_si128(_mm_srli_epi32(comprBlock, k * b), mask));
}

#elif defined(__aarch64__)
// AArch64 build: __aarch64__ (GCC-compatible compilers) or _M_ARM64 (MSVC).
// The former spelling __arch64__ is never defined, so GCC/Clang AArch64
// builds were left without a decomprIncompleteBlock definition.
#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64))
inline static void decomprIncompleteBlock(const uint8_t &n,
const __m128i *&in,
__m128i *&out) {
Expand Down
4 changes: 2 additions & 2 deletions src/simdbitpacking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8930,7 +8930,7 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in,
__m128i InReg2 = InReg1;
__m128i OutReg1, OutReg2, OutReg3, OutReg4;
const __m128i mask = _mm_set1_epi32(1);
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || (defined(_MSC_VER))
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))
unsigned shift = 0;

for (unsigned i = 0; i < 8; ++i) {
Expand All @@ -8943,7 +8943,7 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in,
_mm_storeu_si128(out++, OutReg3);
_mm_storeu_si128(out++, OutReg4);
}
#elif defined(__aarch64__)
// AArch64 build: __aarch64__ (GCC-compatible compilers) or _M_ARM64 (MSVC).
// With the misspelled __arch64__ neither unpack body was compiled on
// GCC/Clang AArch64.
#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64))
OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 0), mask);
OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 1), mask);
OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 2), mask);
Expand Down
4 changes: 2 additions & 2 deletions src/simdunalignedbitpacking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8930,7 +8930,7 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in,
__m128i InReg2 = InReg1;
__m128i OutReg1, OutReg2, OutReg3, OutReg4;
const __m128i mask = _mm_set1_epi32(1);
#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER))
#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))
unsigned shift = 0;

for (unsigned i = 0; i < 8; ++i) {
Expand All @@ -8943,7 +8943,7 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in,
_mm_storeu_si128(out++, OutReg3);
_mm_storeu_si128(out++, OutReg4);
}
#else
// AArch64 build: __aarch64__ (GCC-compatible compilers) or _M_ARM64 (MSVC).
// With the misspelled __arch64__ neither unpack body was compiled on
// GCC/Clang AArch64.
#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64))
OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 0), mask);
OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 1), mask);
OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 2), mask);
Expand Down
10 changes: 7 additions & 3 deletions src/streamvbyte.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@

#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if (defined(_M_X64) || defined(_M_AMD64))
#include <intrin.h>
#elif defined(_M_ARM64)
#include <simde/x86/sse4.1.h>
#endif

#include <iso646.h>
#include <stdint.h>
#define __restrict__ __restrict
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>

#elif defined(__aarch64__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <simde/x86/sse4.1.h>
Expand Down Expand Up @@ -619,7 +623,7 @@ static const int8_t shuffleTable[256][16] = {
// static char HighTo32[16] = {8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14,
// 15, -1, -1};
// Byte Order: {0x0706050403020100, 0x0F0E0D0C0B0A0908}
#if !defined(_MSC_VER) || defined(__clang__)
#if !defined(_MSC_VER) || defined(__clang__) || (defined(_MSC_VER) && defined(_M_ARM64))
static const xmm_t High16To32 = { (long long)0xFFFF0B0AFFFF0908, (long long)0xFFFF0F0EFFFF0D0C};
#else
static const xmm_t High16To32 = {8, 9, -1, -1, 10, 11, -1, -1,
Expand Down
6 changes: 5 additions & 1 deletion src/varintdecode.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@

#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if (defined(_M_X64) || defined(_M_AMD64))
#include <intrin.h>
#elif defined(_M_ARM64)
#include <simde/x86/sse4.1.h>
#endif
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
Expand Down

0 comments on commit c46afeb

Please sign in to comment.