From 39f1764c68c4ee1d38300c43f7ce72bf53493eba Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Wed, 22 Nov 2023 11:26:21 -0300 Subject: [PATCH] add a compile-time option to enable 4k page sizes (#52229) We're suffering from heavy fragmentation in some of our workloads. Add a build-time option to enable 4k pages (instead of 16k) in the GC, since that improves memory utilization considerably for us. Drawback is that this may increase the number of `madvise` system calls in the sweeping phase by a factor of 4, but concurrent page sweeping should help with some of that. --- src/gc.h | 24 ++++++++++++++++++++++- src/julia_internal.h | 45 ++++++++++++++++++++++++++++++++++++-------- src/julia_threads.h | 12 +++--------- src/options.h | 5 +++++ 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/gc.h b/src/gc.h index 6334a1b0a9fdd0..1fbf865e7d2e39 100644 --- a/src/gc.h +++ b/src/gc.h @@ -31,8 +31,12 @@ extern "C" { #endif +#ifdef GC_SMALL_PAGE +#define GC_PAGE_LG2 12 // log2(size of a page) +#else #define GC_PAGE_LG2 14 // log2(size of a page) -#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k +#endif +#define GC_PAGE_SZ (1 << GC_PAGE_LG2) #define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) #define jl_malloc_tag ((void*)0xdeadaa01) @@ -241,6 +245,23 @@ typedef struct { _Atomic(size_t) n_pages_allocd; } gc_fragmentation_stat_t; +#ifdef GC_SMALL_PAGE +#ifdef _P64 +#define REGION0_PG_COUNT (1 << 16) +#define REGION1_PG_COUNT (1 << 18) +#define REGION2_PG_COUNT (1 << 18) +#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2 +#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF) +#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF) +#else +#define REGION0_PG_COUNT (1 << 10) +#define REGION1_PG_COUNT (1 << 10) +#define REGION2_PG_COUNT (1 << 0) +#define REGION0_INDEX(p) 
(((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2 +#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) +#define REGION_INDEX(p) (0) +#endif +#else #ifdef _P64 #define REGION0_PG_COUNT (1 << 16) #define REGION1_PG_COUNT (1 << 16) @@ -256,6 +277,7 @@ typedef struct { #define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) #define REGION_INDEX(p) (0) #endif +#endif // define the representation of the levels of the page-table (0 to 2) typedef struct { diff --git a/src/julia_internal.h b/src/julia_internal.h index 5b9603602e4498..e06a834a807528 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -359,24 +359,48 @@ static const int jl_gc_sizeclasses[] = { 144, 160, 176, 192, 208, 224, 240, 256, // the following tables are computed for maximum packing efficiency via the formula: - // pg = 2^14 + // pg = GC_SMALL_PAGE ? 2^12 : 2^14 // sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)' +#ifdef GC_SMALL_PAGE + // rng = 15:-1:2 (14 pools) + 272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032 +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool +// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost +#else // rng = 60:-4:32 (8 pools) 272, 288, 304, 336, 368, 400, 448, 496, -// 60, 56, 53, 48, 44, 40, 36, 33, /pool -// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost +// 60, 56, 53, 48, 44, 40, 36, 33, /pool +// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost // rng = 30:-2:16 (8 pools) 544, 576, 624, 672, 736, 816, 896, 1008, -// 30, 28, 26, 24, 22, 20, 18, 16, /pool -// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost +// 30, 28, 26, 24, 22, 20, 18, 16, /pool +// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost // rng = 15:-1:8 (8 pools) 1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032 -// 15, 14, 13, 12, 11, 10, 9, 8, /pool -// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost +// 15, 14, 13, 12, 11, 10, 9, 8, /pool +// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost +#endif }; +#ifdef 
GC_SMALL_PAGE +#ifdef _P64 +# define JL_GC_N_POOLS 39 +#elif MAX_ALIGN == 8 +# define JL_GC_N_POOLS 40 +#else +# define JL_GC_N_POOLS 41 +#endif +#else +#ifdef _P64 +# define JL_GC_N_POOLS 49 +#elif MAX_ALIGN == 8 +# define JL_GC_N_POOLS 50 +#else +# define JL_GC_N_POOLS 51 +#endif +#endif static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, ""); STATIC_INLINE int jl_gc_alignment(size_t sz) @@ -403,7 +427,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz); // the following table is computed as: // [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]] -static const uint8_t szclass_table[] = {0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48}; +static const uint8_t szclass_table[] = +#ifdef GC_SMALL_PAGE + {0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,35,35,35,36,36,36,36,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38}; +#else + {0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,38,38,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,46,46,46,46,46,46,46,46,46,47,47,47,47,47,47,47,47,47,47,47,48,48,48,48,48,48,48,48,48,48,48,48,48,48}; 
+#endif static_assert(sizeof(szclass_table) == 128, ""); STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) diff --git a/src/julia_threads.h b/src/julia_threads.h index 11c609a8fe2982..f69f9dd4baacf1 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,8 +4,8 @@ #ifndef JL_THREADS_H #define JL_THREADS_H -#include "work-stealing-queue.h" #include "julia_atomics.h" +#include "work-stealing-queue.h" #ifndef _OS_WINDOWS_ #include "pthread.h" #endif @@ -160,14 +160,8 @@ typedef struct { arraylist_t *last_remset; // variables for allocating objects from pools -#ifdef _P64 -# define JL_GC_N_POOLS 49 -#elif MAX_ALIGN == 8 -# define JL_GC_N_POOLS 50 -#else -# define JL_GC_N_POOLS 51 -#endif - jl_gc_pool_t norm_pools[JL_GC_N_POOLS]; +#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h` + jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; #define JL_N_STACK_POOLS 16 small_arraylist_t free_stacks[JL_N_STACK_POOLS]; diff --git a/src/options.h b/src/options.h index 06af3e33fcbdcf..1ff0f0ce545bdd 100644 --- a/src/options.h +++ b/src/options.h @@ -81,6 +81,11 @@ // Automatic Instrumenting Profiler //#define ENABLE_TIMINGS +// pool allocator configuration options + +// GC_SMALL_PAGE allocates objects in 4k pages +// #define GC_SMALL_PAGE + // method dispatch profiling --------------------------------------------------