Skip to content

Commit c08bd7b

Browse files
cshungjanvorliam11
authored
Getting vxsort working on Linux amd64 (#98712)
Co-authored-by: Jan Vorlicek <janvorli@microsoft.com> Co-authored-by: Adeel Mujahid <3840695+am11@users.noreply.github.com>
1 parent 71f45aa commit c08bd7b

32 files changed

+142
-141
lines changed

src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt

+6
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ set(CORECLR_LIBRARIES
111111
gc_pal
112112
)
113113

114+
if(CLR_CMAKE_TARGET_ARCH_AMD64)
115+
list(APPEND CORECLR_LIBRARIES
116+
gc_vxsort
117+
)
118+
endif(CLR_CMAKE_TARGET_ARCH_AMD64)
119+
114120
if(CLR_CMAKE_TARGET_WIN32)
115121
list(APPEND CORECLR_LIBRARIES
116122
${STATIC_MT_CRT_LIB}

src/coreclr/gc/CMakeLists.txt

+11-15
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,9 @@ else()
3636
windows/Native.rc)
3737
endif(CLR_CMAKE_HOST_UNIX)
3838

39-
if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
40-
set (GC_SOURCES
41-
${GC_SOURCES}
42-
vxsort/isa_detection.cpp
43-
vxsort/do_vxsort_avx2.cpp
44-
vxsort/do_vxsort_avx512.cpp
45-
vxsort/machine_traits.avx2.cpp
46-
vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
47-
vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
48-
vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
49-
vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
50-
vxsort/smallsort/avx2_load_mask_tables.cpp
51-
)
52-
endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
39+
if (CLR_CMAKE_TARGET_ARCH_AMD64)
40+
add_subdirectory(vxsort)
41+
endif (CLR_CMAKE_TARGET_ARCH_AMD64)
5342

5443
if (CLR_CMAKE_TARGET_WIN32)
5544
set(GC_HEADERS
@@ -87,7 +76,7 @@ if (CLR_CMAKE_TARGET_WIN32)
8776
handletablepriv.h
8877
objecthandle.h
8978
softwarewritewatch.h
90-
vxsort/do_vxsort.h)
79+
)
9180
endif(CLR_CMAKE_TARGET_WIN32)
9281

9382
if(CLR_CMAKE_HOST_WIN32)
@@ -100,6 +89,13 @@ endif(CLR_CMAKE_HOST_WIN32)
10089

10190
set (GC_LINK_LIBRARIES ${GC_LINK_LIBRARIES} gc_pal)
10291

92+
if(CLR_CMAKE_TARGET_ARCH_AMD64)
93+
list(APPEND GC_LINK_LIBRARIES
94+
gc_vxsort
95+
)
96+
endif(CLR_CMAKE_TARGET_ARCH_AMD64)
97+
98+
10399
list(APPEND GC_SOURCES ${GC_HEADERS})
104100

105101
convert_to_absolute_path(GC_SOURCES ${GC_SOURCES})

src/coreclr/gc/gc.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
#include "gcpriv.h"
2020

21-
#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
21+
#ifdef TARGET_AMD64
2222
#define USE_VXSORT
2323
#else
2424
#define USE_INTROSORT
@@ -10305,11 +10305,11 @@ static void do_vxsort (uint8_t** item_array, ptrdiff_t item_count, uint8_t* rang
1030510305
{
1030610306
// above this threshold, using AVX2 for sorting will likely pay off
1030710307
// despite possible downclocking on some devices
10308-
const size_t AVX2_THRESHOLD_SIZE = 8 * 1024;
10308+
const ptrdiff_t AVX2_THRESHOLD_SIZE = 8 * 1024;
1030910309

1031010310
// above this threshold, using AVX512F for sorting will likely pay off
1031110311
// despite possible downclocking on current devices
10312-
const size_t AVX512F_THRESHOLD_SIZE = 128 * 1024;
10312+
const ptrdiff_t AVX512F_THRESHOLD_SIZE = 128 * 1024;
1031310313

1031410314
if (item_count <= 1)
1031510315
return;

src/coreclr/gc/gcsvr.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
#define SERVER_GC 1
2222

23-
#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
23+
#ifdef TARGET_AMD64
2424
#include "vxsort/do_vxsort.h"
2525
#endif
2626

src/coreclr/gc/gcwks.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#undef SERVER_GC
2121
#endif
2222

23-
#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
23+
#ifdef TARGET_AMD64
2424
#include "vxsort/do_vxsort.h"
2525
#endif
2626

src/coreclr/gc/unix/gcenv.unix.cpp

-6
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,6 @@
3535
#define __has_cpp_attribute(x) (0)
3636
#endif
3737

38-
#if __has_cpp_attribute(fallthrough)
39-
#define FALLTHROUGH [[fallthrough]]
40-
#else
41-
#define FALLTHROUGH
42-
#endif
43-
4438
#include <algorithm>
4539

4640
#if HAVE_SYS_TIME_H

src/coreclr/gc/vxsort/CMakeLists.txt

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
set(CMAKE_INCLUDE_CURRENT_DIR ON)
2+
include_directories("../env")
3+
4+
if(CLR_CMAKE_HOST_UNIX)
5+
set_source_files_properties(isa_detection.cpp PROPERTIES COMPILE_FLAGS -mavx2)
6+
set_source_files_properties(do_vxsort_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
7+
set_source_files_properties(do_vxsort_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx2)
8+
set_source_files_properties(machine_traits.avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
9+
set_source_files_properties(smallsort/bitonic_sort.AVX2.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
10+
set_source_files_properties(smallsort/bitonic_sort.AVX2.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
11+
set_source_files_properties(smallsort/bitonic_sort.AVX512.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
12+
set_source_files_properties(smallsort/bitonic_sort.AVX512.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
13+
set_source_files_properties(smallsort/avx2_load_mask_tables.cpp PROPERTIES COMPILE_FLAGS -mavx2)
14+
endif(CLR_CMAKE_HOST_UNIX)
15+
16+
set (VXSORT_SOURCES
17+
isa_detection.cpp
18+
do_vxsort_avx2.cpp
19+
do_vxsort_avx512.cpp
20+
machine_traits.avx2.cpp
21+
smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
22+
smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
23+
smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
24+
smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
25+
smallsort/avx2_load_mask_tables.cpp
26+
do_vxsort.h
27+
)
28+
29+
add_library(gc_vxsort STATIC ${VXSORT_SOURCES})

src/coreclr/gc/vxsort/defs.h

+1-30
Original file line numberDiff line numberDiff line change
@@ -45,36 +45,7 @@
4545
#define NOINLINE __attribute__((noinline))
4646
#endif
4747

48-
namespace std {
49-
template <class _Ty>
50-
class numeric_limits {
51-
public:
52-
static constexpr _Ty Max() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
53-
static constexpr _Ty Min() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
54-
};
55-
56-
template <>
57-
class numeric_limits<int32_t> {
58-
public:
59-
static constexpr int32_t Max() { return 0x7fffffff; }
60-
static constexpr int32_t Min() { return -0x7fffffff - 1; }
61-
};
62-
63-
template <>
64-
class numeric_limits<uint32_t> {
65-
public:
66-
static constexpr uint32_t Max() { return 0xffffffff; }
67-
static constexpr uint32_t Min() { return 0; }
68-
};
69-
70-
template <>
71-
class numeric_limits<int64_t> {
72-
public:
73-
static constexpr int64_t Max() { return 0x7fffffffffffffffi64; }
74-
75-
static constexpr int64_t Min() { return -0x7fffffffffffffffi64 - 1; }
76-
};
77-
} // namespace std
48+
#include <limits>
7849

7950
#ifndef max
8051
template <typename T>

src/coreclr/gc/vxsort/machine_traits.avx2.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <immintrin.h>
1414
#include <assert.h>
1515
#include <inttypes.h>
16+
#include <type_traits>
1617
#include "defs.h"
1718
#include "machine_traits.h"
1819

@@ -123,8 +124,7 @@ class vxsort_machine_traits<int64_t, AVX2> {
123124

124125
template <int Shift>
125126
static constexpr bool can_pack(T span) {
126-
const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
127-
return ((TU) span) < PACK_LIMIT;
127+
return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
128128
}
129129

130130
static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }

src/coreclr/gc/vxsort/machine_traits.avx512.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "vxsort_targets_enable_avx512.h"
1212

1313
#include <immintrin.h>
14+
#include <type_traits>
1415
#include "defs.h"
1516
#include "machine_traits.h"
1617

@@ -92,8 +93,7 @@ class vxsort_machine_traits<int64_t, AVX512> {
9293

9394
template <int Shift>
9495
static constexpr bool can_pack(T span) {
95-
const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
96-
return ((TU) span) < PACK_LIMIT;
96+
return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
9797
}
9898

9999
static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }

src/coreclr/gc/vxsort/packer.h

+6-6
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class packer {
5656
public:
5757

5858
static void pack(TFrom *mem, size_t len, TFrom base) {
59-
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
59+
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
6060
auto baseVec = MT::broadcast(offset);
6161

6262
auto pre_aligned_mem = reinterpret_cast<TFrom *>(reinterpret_cast<size_t>(mem) & ~ALIGN_MASK);
@@ -87,8 +87,8 @@ class packer {
8787

8888
assert(AH::is_aligned(mem_read));
8989

90-
auto memv_read = (TV *) mem_read;
91-
auto memv_write = (TV *) mem_write;
90+
TV * memv_read = (TV *) mem_read;
91+
TV * memv_write = (TV *) mem_write;
9292

9393
auto lenv = len / N;
9494
len -= (lenv * N);
@@ -156,7 +156,7 @@ class packer {
156156

157157

158158
static void unpack(TTo *mem, size_t len, TFrom base) {
159-
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
159+
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
160160
auto baseVec = MT::broadcast(offset);
161161

162162
auto mem_read = mem + len;
@@ -184,8 +184,8 @@ class packer {
184184
assert(AH::is_aligned(mem_read));
185185

186186
auto lenv = len / (N * 2);
187-
auto memv_read = ((TV *) mem_read) - 1;
188-
auto memv_write = ((TV *) mem_write) - 2;
187+
TV * memv_read = ((TV *) mem_read) - 1;
188+
TV * memv_write = ((TV *) mem_write) - 2;
189189
len -= lenv * N * 2;
190190

191191
while (lenv >= Unroll) {

src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];
3939

4040
template<> struct bitonic<int32_t, AVX2> {
4141
static const int N = 8;
42-
static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
42+
static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
4343
public:
4444

4545
static INLINE void sort_01v_ascending(__m256i& d01) {

src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];
3939

4040
template<> struct bitonic<int64_t, AVX2> {
4141
static const int N = 4;
42-
static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
42+
static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
4343
public:
4444

4545
static INLINE void sort_01v_ascending(__m256i& d01) {

src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace vxsort {
3636
namespace smallsort {
3737
template<> struct bitonic<int32_t, AVX512> {
3838
static const int N = 16;
39-
static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
39+
static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
4040
public:
4141

4242
static INLINE void sort_01v_ascending(__m512i& d01) {

src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace vxsort {
3636
namespace smallsort {
3737
template<> struct bitonic<int64_t, AVX512> {
3838
static const int N = 8;
39-
static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
39+
static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
4040
public:
4141

4242
static INLINE void sort_01v_ascending(__m512i& d01) {

src/coreclr/gc/vxsort/smallsort/codegen/avx2.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ def generate_prologue(self, f):
303303
304304
template<> struct bitonic<{t}, AVX2> {{
305305
static const int N = {self.vector_size()};
306-
static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
306+
static constexpr {t} MAX = std::numeric_limits<{t}>::max();
307307
public:
308308
"""
309309
print(s, file=f)

src/coreclr/gc/vxsort/smallsort/codegen/avx512.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ def generate_prologue(self, f):
299299
namespace smallsort {{
300300
template<> struct bitonic<{t}, AVX512> {{
301301
static const int N = {self.vector_size()};
302-
static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
302+
static constexpr {t} MAX = std::numeric_limits<{t}>::max();
303303
public:
304304
"""
305305
print(s, file=f)

0 commit comments

Comments
 (0)