diff --git a/src/gc-debug.c b/src/gc-debug.c index 124b7da74dee1..3aa1612572bf6 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1100,13 +1100,14 @@ void gc_stats_big_obj(void) v = v->next; } - mallocarray_t *ma = ptls2->heap.mallocarrays; - while (ma != NULL) { - if (gc_marked(jl_astaggedvalue(ma->a)->bits.gc)) { + void **lst = ptls2->gc_tls.heap.mallocarrays.items; + for (size_t i = 0, l = ptls2->gc_tls.heap.mallocarrays.len; i < l; i++) { + jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[i] & ~(uintptr_t)1); + uint8_t bits = jl_astaggedvalue(m)->bits.gc; + if (gc_marked(bits)) { nused++; - nbytes += jl_genericmemory_nbytes((jl_genericmemory_t*)ma->a); + nbytes += jl_genericmemory_nbytes(m); } - ma = ma->next; } } diff --git a/src/gc.c b/src/gc.c index e89e16ff187c0..c4c83861f5a52 100644 --- a/src/gc.c +++ b/src/gc.c @@ -6,7 +6,11 @@ #include "julia_atomics.h" #include "julia_gcext.h" #include "julia_assert.h" -#ifdef __GLIBC__ +#include + +#if defined(_OS_DARWIN_) +#include +#else #include // for malloc_trim #endif @@ -1121,17 +1125,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ // This is **NOT** a GC safe point. - mallocarray_t *ma; - if (ptls->heap.mafreelist == NULL) { - ma = (mallocarray_t*)malloc_s(sizeof(mallocarray_t)); - } - else { - ma = ptls->heap.mafreelist; - ptls->heap.mafreelist = ma->next; - } - ma->a = (jl_value_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->heap.mallocarrays; - ptls->heap.mallocarrays = ma; + void *a = (void*)((uintptr_t)m | !!isaligned); + small_arraylist_push(&ptls->heap.mallocarrays, a); } @@ -1143,10 +1138,6 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_batch_accum_heap_size(ptls, sz); } -void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT -{ - jl_batch_accum_free_size(jl_current_task->ptls, sz); -} // Only safe to update the heap inside the GC static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT @@ -1222,19 +1213,21 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT } -static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT +static void jl_gc_free_memory(jl_genericmemory_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); jl_genericmemory_t *m = (jl_genericmemory_t*)v; assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); char *d = (char*)m->ptr; + size_t freed_bytes = memory_block_usable_size(d, isaligned); + assert(freed_bytes != 0); if (isaligned) jl_free_aligned(d); else free(d); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, - jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m)); - gc_num.freed += jl_genericmemory_nbytes(m); + jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - freed_bytes); + gc_num.freed += freed_bytes; gc_num.freecall++; } @@ -1245,24 +1238,23 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; - while (ma != NULL) { - mallocarray_t *nxt = ma->next; - jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); - int bits = jl_astaggedvalue(a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; + size_t n = 0; + size_t l = ptls2->heap.mallocarrays.len; + void **lst = ptls2->heap.mallocarrays.items; + // filter without preserving order + while (n < l) { + jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[n] & ~1); + if (gc_marked(jl_astaggedvalue(m)->bits.gc)) { + n++; } else { - *pma = nxt; - int isaligned = (uintptr_t)ma->a & 1; - jl_gc_free_memory(a, isaligned); - free(ma); + int isaligned = (uintptr_t)lst[n] & 1; + jl_gc_free_memory(m, isaligned); + l--; + lst[n] = lst[l]; } - gc_time_count_mallocd_memory(bits); - ma = nxt; } + ptls2->heap.mallocarrays.len = l; } } gc_time_mallocd_memory_end(); @@ -3968,8 +3960,7 @@ void jl_init_thread_heap(jl_ptls_t ptls) small_arraylist_new(&heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) small_arraylist_new(&heap->free_stacks[i], 0); - heap->mallocarrays = NULL; - heap->mafreelist = NULL; + small_arraylist_new(&heap->mallocarrays, 0); heap->big_objects = NULL; heap->remset = &heap->_remset[0]; heap->last_remset = &heap->_remset[1]; @@ -4069,58 +4060,44 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } -// allocation wrappers that track allocation and let collection run +// allocation wrappers that add to gc pressure -JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +JL_DLLEXPORT void *jl_malloc(size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - void *data = malloc(sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_batch_accum_heap_size(ptls, sz); - } - return data; + return jl_gc_counted_malloc(sz); } -JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + return jl_gc_counted_calloc(nmsz, 1); +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - void *data = calloc(nm, sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_batch_accum_heap_size(ptls, sz * nm); - } - return data; + if (nm > SSIZE_MAX/sz) + return NULL; + return _unchecked_calloc(nm, sz); } -JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) +JL_DLLEXPORT void jl_free(void *p) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - free(p); - if (pgcstack != NULL && ct->world_age) { - jl_batch_accum_free_size(ct->ptls, sz); + if (p != NULL) { + size_t sz = memory_block_usable_size(p, 0); + free(p); + jl_task_t *ct = jl_get_current_task(); + if (ct != NULL) + jl_batch_accum_free_size(ct->ptls, sz); } } -JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; + size_t old = p ? memory_block_usable_size(p, 0) : 0; void *data = realloc(p, sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_task_t *ct = jl_get_current_task(); + if (data != NULL && ct != NULL) { + sz = memory_block_usable_size(data, 0); jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (!(sz < old)) @@ -4140,63 +4117,80 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. - -JL_DLLEXPORT void *jl_malloc(size_t sz) +JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 + jl_task_t *ct = jl_current_task; + void *data = malloc(sz); + if (data != NULL && ct != NULL && ct->world_age) { + sz = memory_block_usable_size(data, 0); + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_batch_accum_heap_size(ptls, sz); + } + return data; } -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +{ + jl_task_t *ct = jl_current_task; + void *data = calloc(nm, sz); + if (data != NULL && ct != NULL && ct->world_age) { + sz = memory_block_usable_size(data, 0); + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_batch_accum_heap_size(ptls, sz); + } + return data; } -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); + jl_free(p); } -JL_DLLEXPORT void jl_free(void *p) +JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } + return jl_realloc(p, sz); } -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +// =========================================================================== // +// malloc wrappers, aligned allocation +// =========================================================================== // + +#if defined(_OS_WINDOWS_) +// helper function based partly on wine msvcrt80+ heap.c +// but with several fixes to improve the correctness of the computation and remove unnecessary parameters +#define SAVED_PTR(x) ((void *)((DWORD_PTR)((char *)x - sizeof(void *)) & \ + ~(sizeof(void *) - 1))) +static size_t _aligned_msize(void *p) { - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 + void *alloc_ptr = *(void**)SAVED_PTR(p); + return _msize(alloc_ptr) - ((char*)p - (char*)alloc_ptr); } +#undef SAVED_PTR +#endif +size_t memory_block_usable_size(void *p, int isaligned) JL_NOTSAFEPOINT +{ +#if defined(_OS_WINDOWS_) + if (isaligned) + return _aligned_msize(p); + else + return _msize(p); +#elif defined(_OS_DARWIN_) + return malloc_size(p); +#else + return malloc_usable_size(p); +#endif +} // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -4214,12 +4208,13 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) void *b = malloc_cache_align(allocsz); if (b == NULL) jl_throw(jl_memory_exception); - + size_t allocated_bytes = memory_block_usable_size(b, 1); + assert(allocated_bytes >= allocsz); jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocated_bytes); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_batch_accum_heap_size(ptls, allocsz); + jl_batch_accum_heap_size(ptls, allocated_bytes); #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif diff --git a/src/gc.h b/src/gc.h index 01d8745b2899e..7e4da2bd1900f 100644 --- a/src/gc.h +++ b/src/gc.h @@ -143,11 +143,6 @@ JL_EXTENSION typedef struct _bigval_t { // data structure for tracking malloc'd arrays and genericmemory. -typedef struct _mallocarray_t { - jl_value_t *a; - struct _mallocarray_t *next; -} mallocarray_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list diff --git a/src/genericmemory.c b/src/genericmemory.c index b36852d53f9c8..02293867da4df 100644 --- a/src/genericmemory.c +++ b/src/genericmemory.c @@ -165,7 +165,8 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void if (own_buffer) { int isaligned = 0; // TODO: allow passing memalign'd buffers jl_gc_track_malloced_genericmemory(ct->ptls, m, isaligned); - jl_gc_count_allocd(nel*elsz); + size_t allocated_bytes = memory_block_usable_size(data, isaligned); + jl_gc_count_allocd(allocated_bytes); } return m; } @@ -208,8 +209,6 @@ JL_DLLEXPORT jl_value_t *jl_genericmemory_to_string(jl_genericmemory_t *m, size_ JL_GC_PUSH1(&o); jl_value_t *str = jl_pchar_to_string((const char*)m->ptr, len); JL_GC_POP(); - if (how == 1) // TODO: we might like to early-call jl_gc_free_memory here instead actually, but hopefully `m` will die soon - jl_gc_count_freed(mlength); return str; } // n.b. how == 0 is always pool-allocated, so the freed bytes are computed from the pool not the object diff --git a/src/julia_internal.h b/src/julia_internal.h index 1c2d071d1a6cd..05a2f1e677d60 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -608,6 +608,7 @@ jl_svec_t *jl_perm_symsvec(size_t n, ...); #endif jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz); +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz); JL_DLLEXPORT void JL_NORETURN jl_throw_out_of_memory_error(void); @@ -618,6 +619,7 @@ JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT; void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT; void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT; size_t jl_genericmemory_nbytes(jl_genericmemory_t *a) JL_NOTSAFEPOINT; +size_t memory_block_usable_size(void *mem, int isaligned) JL_NOTSAFEPOINT; void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT; void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT; void jl_gc_run_all_finalizers(jl_task_t *ct); diff --git a/src/julia_threads.h b/src/julia_threads.h index 3a0f7f12bffe5..0ca47cc553c88 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -130,8 +130,7 @@ typedef struct { small_arraylist_t live_tasks; // variables for tracking malloc'd arrays - struct _mallocarray_t *mallocarrays; - struct _mallocarray_t *mafreelist; + small_arraylist_t mallocarrays; // variables for tracking big objects struct _bigval_t *big_objects; diff --git a/src/mtarraylist.c b/src/mtarraylist.c index 8bad44797dab4..1bd6810cda8a6 100644 --- a/src/mtarraylist.c +++ b/src/mtarraylist.c @@ -14,8 +14,8 @@ extern "C" { // but there can be any number of observers typedef struct { - _Atomic(uint32_t) len; - uint32_t max; + _Atomic(size_t) len; + size_t max; _Atomic(_Atomic(void*)*) items; _Atomic(void*) _space[SMALL_AL_N_INLINE]; } small_mtarraylist_t; diff --git a/src/support/arraylist.h b/src/support/arraylist.h index 6ad2f0e2f28c9..edad2880dbed2 100644 --- a/src/support/arraylist.h +++ b/src/support/arraylist.h @@ -5,7 +5,7 @@ #define AL_N_INLINE 29 -#define SMALL_AL_N_INLINE 6 +#define SMALL_AL_N_INLINE 5 #ifdef __cplusplus extern "C" { @@ -13,7 +13,7 @@ extern "C" { #include "analyzer_annotations.h" -typedef struct { +typedef struct { // 32 words size_t len; size_t max; void **items; @@ -27,9 +27,9 @@ void arraylist_push(arraylist_t *a, void *elt) JL_NOTSAFEPOINT; void *arraylist_pop(arraylist_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void arraylist_grow(arraylist_t *a, size_t n) JL_NOTSAFEPOINT; -typedef struct { - uint32_t len; - uint32_t max; +typedef struct { // 8 words + size_t len; + size_t max; void **items; void *_space[SMALL_AL_N_INLINE]; } small_arraylist_t;