From f9b1bb4f100401b54de193bf8f75218db403763f Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 26 Jan 2018 15:24:13 +0100 Subject: [PATCH 01/42] fix invalid range check in allocate large span --- rpmalloc/rpmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 448b6b0f..4617dd8d 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -856,6 +856,8 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { static void* _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the num_spans is guaranteed to be <= LARGE_CLASS_COUNT size += SPAN_HEADER_SIZE; size_t num_spans = size / SPAN_MAX_SIZE; if (size % SPAN_MAX_SIZE) @@ -912,7 +914,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { use_cache: //Step 1: Check if cache for this large size class (or the following, unless first class) has a span - while (!heap->large_cache[idx] && (idx < LARGE_CLASS_COUNT) && (idx < num_spans + 1)) + while (!heap->large_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (num_spans + 1))) ++idx; span_t* span = heap->large_cache[idx]; if (span) { From 5045c7b4ac1ef95e8a8b372ec687001a2ac3146b Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 26 Jan 2018 19:30:35 +0100 Subject: [PATCH 02/42] test for first alloc of each size bucket --- test/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/main.c b/test/main.c index c88248e4..db77c1c0 100644 --- a/test/main.c +++ b/test/main.c @@ -141,6 +141,15 @@ test_alloc(void) { rpmalloc_finalize(); + for (iloop = 16; iloop < (2 * 1024 * 1024); iloop += 16) { + rpmalloc_initialize(); + void* addr = rpmalloc(iloop); + if (!addr) + return -1; + rpfree(addr); + rpmalloc_finalize(); + } + printf("Memory allocation tests passed\n"); return 0; From 11779588fe0279e9585b77370f223c38dc400fe9 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 26 Jan 2018 19:32:56 +0100 Subject: [PATCH 03/42] additional first alloc tests --- test/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/main.c b/test/main.c index db77c1c0..6af58970 100644 --- a/test/main.c +++ b/test/main.c @@ -150,6 +150,15 @@ test_alloc(void) { rpmalloc_finalize(); } + rpmalloc_initialize(); + for (iloop = 16; iloop < (2 * 1024 * 1024); iloop += 16) { + void* addr = rpmalloc(iloop); + if (!addr) + return -1; + rpfree(addr); + } + rpmalloc_finalize(); + printf("Memory allocation tests passed\n"); return 0; From 6357bf593c963a19381e91ac6f2c370543c0d207 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 28 Jan 2018 19:31:15 +0100 Subject: [PATCH 04/42] fix zero size allocations --- rpmalloc/rpmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 4617dd8d..75775fc6 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -719,9 +719,10 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { #endif //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) - const size_t class_idx = _memory_size_class[(size <= SMALL_SIZE_LIMIT) ? - ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) - 1 : - SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT) - 1].class_idx; + const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? 
+ ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : + SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); + const size_t class_idx = _memory_size_class[base_idx ? (base_idx - 1) : 0].class_idx; span_block_t* active_block = heap->active_block + class_idx; size_class_t* size_class = _memory_size_class + class_idx; From b1c938278c3c1f75cce9899f5449efcd2c8537af Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 28 Jan 2018 19:34:15 +0100 Subject: [PATCH 05/42] add tests for zero allocations --- test/main.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/test/main.c b/test/main.c index 6af58970..cd8d8aee 100644 --- a/test/main.c +++ b/test/main.c @@ -141,21 +141,39 @@ test_alloc(void) { rpmalloc_finalize(); - for (iloop = 16; iloop < (2 * 1024 * 1024); iloop += 16) { + for (iloop = 0; iloop < 2048; iloop += 16) { rpmalloc_initialize(); - void* addr = rpmalloc(iloop); - if (!addr) + addr[0] = rpmalloc(iloop); + if (!addr[0]) return -1; - rpfree(addr); + rpfree(addr[0]); + rpmalloc_finalize(); + } + + for (iloop = 2048; iloop < (64 * 1024); iloop += 512) { + rpmalloc_initialize(); + addr[0] = rpmalloc(iloop); + if (!addr[0]) + return -1; + rpfree(addr[0]); + rpmalloc_finalize(); + } + + for (iloop = (64 * 1024); iloop < (2 * 1024 * 1024); iloop += 4096) { + rpmalloc_initialize(); + addr[0] = rpmalloc(iloop); + if (!addr[0]) + return -1; + rpfree(addr[0]); rpmalloc_finalize(); } rpmalloc_initialize(); - for (iloop = 16; iloop < (2 * 1024 * 1024); iloop += 16) { - void* addr = rpmalloc(iloop); - if (!addr) + for (iloop = 0; iloop < (2 * 1024 * 1024); iloop += 16) { + addr[0] = rpmalloc(iloop); + if (!addr[0]) return -1; - rpfree(addr); + rpfree(addr[0]); } rpmalloc_finalize(); From 78c4e26f47b5a75d71fa4761b3b1e8003f933238 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 28 Jan 2018 19:35:10 +0100 Subject: [PATCH 06/42] fix statistics --- rpmalloc/rpmalloc.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 75775fc6..a7f41739 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -507,7 +507,7 @@ static void* _memory_map(size_t page_count, size_t* align_offset) { void* mapped_address; void* aligned_address; - size_t size = page_count * _memory_page_size; + const size_t size = page_count * _memory_page_size; mapped_address = _memory_config.memory_map(size); @@ -530,8 +530,8 @@ _memory_map(size_t page_count, size_t* align_offset) { } #if ENABLE_STATISTICS - atomic_add32(&_mapped_pages, (int32_t)(size >> _memory_page_size_shift)); - atomic_add32(&_mapped_total, (int32_t)(size >> _memory_page_size_shift)); + atomic_add32(&_mapped_pages, (int32_t)page_count); + atomic_add32(&_mapped_total, (int32_t)page_count); #endif return aligned_address; @@ -544,6 +544,9 @@ _memory_unmap(void* address, size_t page_count, size_t align_offset) { if (align_offset) size += SPAN_ADDRESS_GRANULARITY; _memory_config.memory_unmap(mapped_address, size); +#if ENABLE_STATISTICS + atomic_add32(&_mapped_pages, -(int32_t)page_count); +#endif } //! 
Insert the given list of memory page spans in the global cache for small/medium blocks @@ -1696,19 +1699,16 @@ static void* _memory_map_os(size_t size) { void* ptr; -#if ENABLE_STATISTICS - atomic_add32(&_mapped_pages, (int32_t)(size >> _memory_page_size_shift)); - atomic_add32(&_mapped_total, (int32_t)(size >> _memory_page_size_shift)); -#endif - #ifdef PLATFORM_WINDOWS ptr = VirtualAlloc(0, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); #else size_t padding = SPAN_ADDRESS_GRANULARITY; ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); - if (ptr == MAP_FAILED) + if (ptr == MAP_FAILED) { + assert(!"Failed to map virtual memory block"); return 0; + } padding -= (uintptr_t)ptr % SPAN_ADDRESS_GRANULARITY; ptr = pointer_offset(ptr, padding); @@ -1720,11 +1720,6 @@ _memory_map_os(size_t size) { //! Unmap pages from virtual memory static void _memory_unmap_os(void* ptr, size_t size) { -#if ENABLE_STATISTICS - atomic_add32(&_mapped_pages, -(int32_t)(size >> _memory_page_size_shift)); - atomic_add32(&_unmapped_total, (int32_t)(size >> _memory_page_size_shift)); -#endif - #ifdef PLATFORM_WINDOWS VirtualFree(ptr, 0, MEM_RELEASE); (void)sizeof(size); From ea580d1ddbe17f3a26c5a2ff5f8671c8b6e8b74d Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 28 Jan 2018 20:15:07 +0100 Subject: [PATCH 07/42] update changelog --- CHANGELOG | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 0ca80dec..51fa1113 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +1.2.3 + +Fixed an issue where an allocation of zero bytes would cause a segmentation fault from indexing size class array with index -1. + +Fixed an issue where an allocation of maximum large block size (2097120 bytes) would index the heap cache array out of bounds and potentially cause a segmentation fault depending on earlier allocation patterns. + + 1.2.2 Add configurable memory mapper providing map/unmap of memory pages. Default to VirtualAlloc/mmap if none provided. This allows rpmalloc to be used in contexts where memory is provided by internal means. From bb8a749984049de82153312a9892de4d80d2189f Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 30 Jan 2018 12:34:59 +0100 Subject: [PATCH 08/42] Configurable span granularity (#48) --- CHANGELOG | 6 + README.md | 2 +- rpmalloc/rpmalloc.c | 538 +++++++++++++++++++++----------------------- rpmalloc/rpmalloc.h | 27 ++- test/main.c | 16 +- 5 files changed, 295 insertions(+), 294 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 51fa1113..602a810d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,9 +1,15 @@ 1.2.3 +Make span size configurable and all spans equal in size, removing span size classes and streamlining the thread cache. + Fixed an issue where an allocation of zero bytes would cause a segmentation fault from indexing size class array with index -1. Fixed an issue where an allocation of maximum large block size (2097120 bytes) would index the heap cache array out of bounds and potentially cause a segmentation fault depending on earlier allocation patterns. +Fixed an issue where memory pages at start of aligned span run was not completely unmapped on POSIX systems. + +Added function to access the allocator configuration after initialization to find default values. 
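A minimal sketch of how the configuration query added in this release might be used; it relies only on the rpmalloc_initialize, rpmalloc_config and rpmalloc_finalize entry points introduced by this patch series in rpmalloc.h, and the printout itself is purely illustrative:

```c
#include <stdio.h>
#include "rpmalloc.h"

int main(void) {
	rpmalloc_initialize();
	//After initialization the returned structure holds the effective defaults,
	//including the page size and span size the allocator settled on
	const rpmalloc_config_t* config = rpmalloc_config();
	printf("page size: %zu bytes, span size: %zu bytes\n",
	       config->page_size, config->span_size);
	rpmalloc_finalize();
	return 0;
}
```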
+ 1.2.2 diff --git a/README.md b/README.md index 617c131a..d22c7784 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Please consider our Patreon to support our work - https://www.patreon.com/rampan Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) / Rampant Pixels - http://www.rampantpixels.com # Performance -We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~1800 lines of C code. +We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2000 lines of C code. Contained in a parallel repository is a benchmark utility that performs interleaved allocations (both aligned to 8 or 16 bytes, and unaligned) and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments. diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index a7f41739..3533985a 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -21,8 +21,9 @@ // Presets for cache limits #if defined(ENABLE_UNLIMITED_CACHE) // Unlimited caches -#define MIN_SPAN_CACHE_RELEASE 16 +#define MIN_SPAN_CACHE_RELEASE 64 #define MAX_SPAN_CACHE_DIVISOR 1 +#define MIN_SPAN_CACHE_SIZE 0 #elif defined(DISABLE_CACHE) //Disable cache #define MIN_SPAN_CACHE_RELEASE 1 @@ -35,15 +36,15 @@ #define GLOBAL_SPAN_CACHE_MULTIPLIER 1 #else // Default - performance priority cache limits -//! Limit of thread cache in number of spans for each page count class (undefine for unlimited cache - i.e never release spans to global cache unless thread finishes) +//! Limit of thread cache in number of spans (undefine for unlimited cache - i.e never release spans to global cache unless thread finishes) //! Minimum cache size to remain after a release to global cache -#define MIN_SPAN_CACHE_SIZE 8 +#define MIN_SPAN_CACHE_SIZE 64 //! Minimum number of spans to transfer between thread and global cache -#define MIN_SPAN_CACHE_RELEASE 16 +#define MIN_SPAN_CACHE_RELEASE 32 //! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) -#define MAX_SPAN_CACHE_DIVISOR 8 +#define MAX_SPAN_CACHE_DIVISOR 4 //! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) -#define GLOBAL_SPAN_CACHE_MULTIPLIER 4 +#define GLOBAL_SPAN_CACHE_MULTIPLIER 8 #endif //! Size of heap hashmap @@ -205,31 +206,24 @@ thread_yield(void); static size_t _memory_page_size; //! Shift to divide by page size static size_t _memory_page_size_shift; -//! Maximum number of pages in a span (span max size divided by page size) -static size_t _memory_max_page_count; - -//! Granularity of all memory page spans for small & medium block allocations -#define SPAN_ADDRESS_GRANULARITY 65536 - -//! 
Maximum size of a span of memory pages -#define SPAN_MAX_SIZE (SPAN_ADDRESS_GRANULARITY) -//! Mask for getting the start of a span of memory pages -#define SPAN_MASK (~((uintptr_t)SPAN_MAX_SIZE - 1)) -//! Maximum number of memory pages in a span -#define SPAN_MAX_PAGE_COUNT (SPAN_MAX_SIZE >> _memory_page_size_shift) -//! Number of size classes for spans -#define SPAN_CLASS_COUNT 4 -//! Span size class granularity -#define SPAN_CLASS_GRANULARITY ((SPAN_ADDRESS_GRANULARITY >> _memory_page_size_shift) / SPAN_CLASS_COUNT) +//! Granularity at which memor pages are mapped by OS +static size_t _memory_map_granularity; + +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +//! Number of memory pages in a single span (or 1, if span < page) +static size_t _memory_span_pages_single; //! Granularity of a small allocation block -#define SMALL_GRANULARITY 16 +#define SMALL_GRANULARITY 32 //! Small granularity shift count -#define SMALL_GRANULARITY_SHIFT 4 -//! Maximum size of a small block -#define SMALL_SIZE_LIMIT 2032 +#define SMALL_GRANULARITY_SHIFT 5 //! Number of small block size classes -#define SMALL_CLASS_COUNT (SMALL_SIZE_LIMIT / SMALL_GRANULARITY) +#define SMALL_CLASS_COUNT 63 +//! Maximum size of a small block +#define SMALL_SIZE_LIMIT 2016 //! Granularity of a medium allocation block #define MEDIUM_GRANULARITY 512 @@ -246,7 +240,7 @@ static size_t _memory_max_page_count; //! Number of large block size classes #define LARGE_CLASS_COUNT 32 //! Maximum size of a large block -#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * SPAN_MAX_SIZE) - SPAN_HEADER_SIZE) +#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) #define SPAN_LIST_LOCK_TOKEN ((void*)1) @@ -266,7 +260,7 @@ typedef uint32_t count_t; #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow #undef MAX_ALLOC_SIZE -#define MAX_ALLOC_SIZE (((size_t)-1) - SPAN_ADDRESS_GRANULARITY) +#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) #endif // Data types @@ -348,12 +342,12 @@ struct heap_t { span_block_t active_block[SIZE_CLASS_COUNT]; //! Active span for each size class span_t* active_span[SIZE_CLASS_COUNT]; - //! List of demi-used spans with free blocks for each size class (double linked list) + //! List of semi-used spans with free blocks for each size class (double linked list) span_t* size_cache[SIZE_CLASS_COUNT]; //! List of free spans for each page count (single linked list) - span_t* span_cache[SPAN_CLASS_COUNT]; + span_t* span_cache; //! Allocation counters - span_counter_t span_counter[SPAN_CLASS_COUNT]; + span_counter_t span_counter; //! List of free spans for each large class count (single linked list) span_t* large_cache[LARGE_CLASS_COUNT]; //! Allocation counters for large blocks @@ -378,9 +372,7 @@ struct heap_t { struct size_class_t { //! Size of blocks in this class - uint16_t size; - //! Number of pages to allocate for a chunk - uint16_t page_count; + uint32_t size; //! Number of blocks in each chunk uint16_t block_count; //! Class index this class is merged with @@ -398,7 +390,7 @@ static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; static atomic32_t _memory_heap_id; //! Global span cache -static atomicptr_t _memory_span_cache[SPAN_CLASS_COUNT]; +static atomicptr_t _memory_span_cache; //! 
Global large cache static atomicptr_t _memory_large_cache[LARGE_CLASS_COUNT]; @@ -416,7 +408,7 @@ static atomic32_t _memory_orphan_counter; static atomic32_t _memory_active_heaps; //! Adaptive cache max allocation count -static uint32_t _memory_max_allocation[SPAN_CLASS_COUNT]; +static uint32_t _memory_max_allocation; //! Adaptive cache max allocation count static uint32_t _memory_max_allocation_large[LARGE_CLASS_COUNT]; @@ -465,10 +457,10 @@ set_thread_heap(heap_t* heap) { } static void* -_memory_map_os(size_t page_count); +_memory_map_os(size_t size, size_t* offset); static void -_memory_unmap_os(void* ptr, size_t page_count); +_memory_unmap_os(void* address, size_t size, size_t offset); static int _memory_deallocate_deferred(heap_t* heap, size_t size_class); @@ -483,13 +475,6 @@ _memory_heap_lookup(int32_t id) { return heap; } -//! Get the span size class from page count -static size_t -_span_class_from_page_count(size_t page_count) { - assert((page_count > 0) && (page_count <= _memory_max_page_count)); - return ((page_count + SPAN_CLASS_GRANULARITY - 1) / SPAN_CLASS_GRANULARITY) - 1; -} - //! Increase an allocation counter static void _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { @@ -503,71 +488,65 @@ _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { } } -static void* -_memory_map(size_t page_count, size_t* align_offset) { - void* mapped_address; - void* aligned_address; - const size_t size = page_count * _memory_page_size; - - mapped_address = _memory_config.memory_map(size); +static size_t +_memory_span_pages(size_t num_spans) { + if (_memory_span_size >= _memory_page_size) + return _memory_span_pages_single * num_spans; - if (!((uintptr_t)mapped_address & ~(uintptr_t)SPAN_MASK)) { - aligned_address = mapped_address; - *align_offset = 0; - } - else { - //Retry with space for alignment - _memory_config.memory_unmap(mapped_address, size); + size_t num_bytes = (num_spans * _memory_span_size); + size_t num_pages = num_bytes >> _memory_page_size_shift; + if (num_bytes & (_memory_page_size - 1)) + ++num_pages; + return num_pages; +} - size_t padding = SPAN_ADDRESS_GRANULARITY; - mapped_address = _memory_config.memory_map(size + padding); - padding -= (uintptr_t)mapped_address % SPAN_ADDRESS_GRANULARITY; - aligned_address = pointer_offset(mapped_address, padding); - //Offset could be 0x10000 (64KiB) if mapped pages are aligned, divide by 2 to fit in uint16_t - assert(padding <= SPAN_ADDRESS_GRANULARITY); - assert(!((uintptr_t)mapped_address & ~(uintptr_t)SPAN_MASK)); - *align_offset = (size_t)padding / 2; - } +static void* +_memory_map(size_t page_count, size_t* offset) { + const size_t size = page_count * _memory_page_size; #if ENABLE_STATISTICS atomic_add32(&_mapped_pages, (int32_t)page_count); atomic_add32(&_mapped_total, (int32_t)page_count); #endif - - return aligned_address; + + return _memory_config.memory_map ? 
+ _memory_config.memory_map(size, offset) : + _memory_map_os(size, offset); } static void -_memory_unmap(void* address, size_t page_count, size_t align_offset) { +_memory_unmap(void* address, size_t page_count, size_t offset) { size_t size = page_count * _memory_page_size; - void* mapped_address = pointer_offset(address, -(offset_t)(align_offset * 2)); - if (align_offset) - size += SPAN_ADDRESS_GRANULARITY; - _memory_config.memory_unmap(mapped_address, size); + #if ENABLE_STATISTICS atomic_add32(&_mapped_pages, -(int32_t)page_count); + atomic_add32(&_unmapped_total, (int32_t)page_count); #endif + + if (_memory_config.memory_unmap) + _memory_config.memory_unmap(address, size, offset); + else + _memory_unmap_os(address, size, offset); } //! Insert the given list of memory page spans in the global cache for small/medium blocks static void -_memory_global_cache_insert(span_t* first_span, size_t list_size, size_t page_count) { +_memory_global_cache_insert(span_t* first_span, size_t list_size) { assert((list_size == 1) || (first_span->next_span != 0)); #if MAX_SPAN_CACHE_DIVISOR > 0 while (1) { - size_t span_class_idx = _span_class_from_page_count(page_count); - void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[span_class_idx]); + void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); #ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation[span_class_idx] / MAX_SPAN_CACHE_DIVISOR); + size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) break; #endif - //We only have 16 bits for size of list, avoid overflow - if ((global_list_size + list_size) > 0xFFFF) + //We only have limited number of bits for size of list, avoid overflow + if ((global_list_size + list_size) & _memory_span_mask) break; //Use prev_span as skip pointer over this sublist range of spans @@ -577,7 +556,7 @@ _memory_global_cache_insert(span_t* first_span, size_t list_size, size_t page_co //Insert sublist into global cache global_list_size += list_size; void* first_span_ptr = (void*)((uintptr_t)first_span | global_list_size); - if (atomic_cas_ptr(&_memory_span_cache[span_class_idx], first_span_ptr, global_span_ptr)) + if (atomic_cas_ptr(&_memory_span_cache, first_span_ptr, global_span_ptr)) return; } else { @@ -591,36 +570,35 @@ _memory_global_cache_insert(span_t* first_span, size_t list_size, size_t page_co for (size_t ispan = 0; ispan < list_size; ++ispan) { assert(first_span); span_t* next_span = first_span->next_span; - _memory_unmap(first_span, page_count, first_span->data.list.align_offset); + _memory_unmap(first_span, _memory_span_pages(1), first_span->data.list.align_offset); first_span = next_span; } } //! 
Extract a number of memory page spans from the global cache for small/medium blocks static span_t* -_memory_global_cache_extract(size_t page_count) { +_memory_global_cache_extract(void) { span_t* span = 0; - size_t span_class_idx = _span_class_from_page_count(page_count); - atomicptr_t* cache = &_memory_span_cache[span_class_idx]; atomic_thread_fence_acquire(); - void* global_span_ptr = atomic_load_ptr(cache); + void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); while (global_span_ptr) { if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && - atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { + atomic_cas_ptr(&_memory_span_cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { //Grab a number of thread cache spans, using the skip span pointer //stored in prev_span to quickly skip ahead in the list to get the new head - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; + span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); assert((span->data.list.size == 1) || (span->next_span != 0)); span_t* new_global_span = span->prev_span; global_span_count -= span->data.list.size; + assert(!(global_span_count & _memory_span_mask)); //Set new head of global cache list - void* new_cache_head = global_span_count ? + void* new_cache_head = global_span_count && new_global_span ? ((void*)((uintptr_t)new_global_span | global_span_count)) : 0; - atomic_store_ptr(cache, new_cache_head); + atomic_store_ptr(&_memory_span_cache, new_cache_head); atomic_thread_fence_release(); break; } @@ -628,7 +606,7 @@ _memory_global_cache_extract(size_t page_count) { //List busy, yield timeslice and retry thread_yield(); atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(cache); + global_span_ptr = atomic_load_ptr(&_memory_span_cache); } return span; @@ -645,15 +623,15 @@ _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t sp while (1) { void* global_span_ptr = atomic_load_ptr(cache); if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); #ifdef GLOBAL_SPAN_CACHE_MULTIPLIER size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) break; #endif - if ((global_list_size + list_size) > 0xFFFF) + if ((global_list_size + list_size) & _memory_span_mask) break; span_list->data.list.size = (uint32_t)list_size; @@ -674,7 +652,7 @@ _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t sp for (size_t ispan = 0; ispan < list_size; ++ispan) { assert(span_list); span_t* next_span = span_list->next_span; - _memory_unmap(span_list, span_count * SPAN_MAX_PAGE_COUNT, span_list->data.list.align_offset); + _memory_unmap(span_list, _memory_span_pages(span_count), span_list->data.list.align_offset); span_list = next_span; } } @@ -690,15 +668,16 @@ _memory_global_cache_large_extract(size_t span_count) { while (global_span_ptr) { if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, 
global_span_ptr)) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); assert((span->data.list.size == 1) || (span->next_span != 0)); assert(span->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); span_t* new_global_span = span->prev_span; global_list_size -= span->data.list.size; + assert(!(global_list_size & _memory_span_mask)); - void* new_global_span_ptr = global_list_size ? + void* new_global_span_ptr = global_list_size && new_global_span ? ((void*)((uintptr_t)new_global_span | global_list_size)) : 0; atomic_store_ptr(cache, new_global_span_ptr); @@ -795,14 +774,13 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Step 4: No semi-used span available, try grab a span from the thread cache - size_t span_class_idx = _span_class_from_page_count(size_class->page_count); - span_t* span = heap->span_cache[span_class_idx]; + span_t* span = heap->span_cache; if (!span) { //Step 5: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(size_class->page_count); + span = _memory_global_cache_extract(); #if ENABLE_STATISTICS if (span) - heap->global_to_thread += (size_t)span->data.list.size * size_class->page_count * _memory_page_size; + heap->global_to_thread += (size_t)span->data.list.size * _memory_span_size; #endif } if (span) { @@ -811,16 +789,16 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { span_t* next_span = span->next_span; assert(next_span); next_span->data.list.size = span->data.list.size - 1; - heap->span_cache[span_class_idx] = next_span; + heap->span_cache = next_span; } else { - heap->span_cache[span_class_idx] = 0; + heap->span_cache = 0; } } else { //Step 6: All caches empty, map in new memory pages size_t align_offset = 0; - span = _memory_map(size_class->page_count, &align_offset); + span = _memory_map(_memory_span_pages(1), &align_offset); span->data.block.align_offset = (uint16_t)align_offset; } @@ -845,7 +823,7 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Track counters - _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); + _memory_counter_increase(&heap->span_counter, &_memory_max_allocation); #if ENABLE_STATISTICS //Store the requested size for statistics @@ -863,26 +841,25 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Since this function is never called if size > LARGE_SIZE_LIMIT //the num_spans is guaranteed to be <= LARGE_CLASS_COUNT size += SPAN_HEADER_SIZE; - size_t num_spans = size / SPAN_MAX_SIZE; - if (size % SPAN_MAX_SIZE) + size_t num_spans = size / _memory_span_size; + if (size & ~_memory_span_mask) ++num_spans; size_t idx = num_spans - 1; if (!idx) { //Shared with medium/small spans - size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); //Step 1: Check span cache - span_t* span = heap->span_cache[span_class_idx]; + span_t* span = heap->span_cache; if (!span) { _memory_deallocate_deferred(heap, 0); - span = heap->span_cache[span_class_idx]; + span = heap->span_cache; } if (!span) { //Step 2: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(SPAN_MAX_PAGE_COUNT); + span = _memory_global_cache_extract(); #if ENABLE_STATISTICS if (span) - 
heap->global_to_thread += (size_t)span->data.list.size * SPAN_MAX_PAGE_COUNT * _memory_page_size; + heap->global_to_thread += (size_t)span->data.list.size * _memory_span_size; #endif } if (span) { @@ -891,16 +868,16 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span_t* next_span = span->next_span; assert(next_span); next_span->data.list.size = span->data.list.size - 1; - heap->span_cache[span_class_idx] = next_span; + heap->span_cache = next_span; } else { - heap->span_cache[span_class_idx] = 0; + heap->span_cache = 0; } } else { //Step 3: All caches empty, map in new memory pages size_t align_offset = 0; - span = _memory_map(SPAN_MAX_PAGE_COUNT, &align_offset); + span = _memory_map(_memory_span_pages(1), &align_offset); span->data.block.align_offset = (uint16_t)align_offset; } @@ -911,7 +888,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span->size_class = SIZE_CLASS_COUNT; //Track counters - _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); + _memory_counter_increase(&heap->span_counter, &_memory_max_allocation); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -953,7 +930,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span = _memory_global_cache_large_extract(num_spans); if (span) { #if ENABLE_STATISTICS - heap->global_to_thread += (size_t)span->data.list.size * num_spans * SPAN_MAX_SIZE; + heap->global_to_thread += (size_t)span->data.list.size * num_spans * _memory_span_size; #endif //We got a list from global cache, store remainder in thread cache if (span->data.list.size > 1) { @@ -967,7 +944,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { else { //Step 4: Map in more memory pages size_t align_offset = 0; - span = _memory_map(num_spans * SPAN_MAX_PAGE_COUNT, &align_offset); + span = _memory_map(_memory_span_pages(num_spans), &align_offset); span->data.block.align_offset = (uint16_t)align_offset; } //Mark span as owned by this heap @@ -994,12 +971,12 @@ _memory_allocate_heap(void) { atomic_thread_fence_acquire(); do { raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (void*)((uintptr_t)raw_heap & ~(uintptr_t)0xFFFF); + heap = (void*)((uintptr_t)raw_heap & _memory_span_mask); if (!heap) break; next_heap = heap->next_orphan; orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & 0xFFFF)); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_span_mask)); } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); @@ -1061,23 +1038,25 @@ _memory_list_remove(span_t** head, span_t* span) { //! 
Insert span into thread cache, releasing to global cache if overflow static void -_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t page_count) { +_memory_heap_cache_insert(heap_t* heap, span_t* span) { #if MAX_SPAN_CACHE_DIVISOR == 0 (void)sizeof(heap); - _memory_global_cache_insert(span, 1, page_count); + _memory_global_cache_insert(span, 1); #else - size_t span_class_idx = _span_class_from_page_count(page_count); - span_t** cache = &heap->span_cache[span_class_idx]; + span_t** cache = &heap->span_cache; span->next_span = *cache; if (*cache) span->data.list.size = (*cache)->data.list.size + 1; else span->data.list.size = 1; *cache = span; -#if MAX_SPAN_CACHE_DIVISOR > 1 //Check if cache exceeds limit +#if MAX_SPAN_CACHE_DIVISOR > 0 if ((span->data.list.size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && - (span->data.list.size > heap->span_counter[span_class_idx].cache_limit)) { + (span->data.list.size > heap->span_counter.cache_limit)) { +#else + if (span->data.list.size > 65534) { +#endif //Release to global cache count_t list_size = 1; span_t* next = span->next_span; @@ -1090,13 +1069,12 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span, size_t page_count) { next->data.list.size = span->data.list.size - list_size; last->next_span = 0; //Terminate list *cache = next; - _memory_global_cache_insert(span, list_size, page_count); + _memory_global_cache_insert(span, list_size); #if ENABLE_STATISTICS - heap->thread_to_global += list_size * page_count * _memory_page_size; + heap->thread_to_global += list_size * _memory_span_size; #endif } #endif -#endif } //! Deallocate the given small/medium memory block from the given heap @@ -1119,9 +1097,8 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //Check if the span will become completely free if (block_data->free_count == ((count_t)size_class->block_count - 1)) { //Track counters - size_t span_class_idx = _span_class_from_page_count(size_class->page_count); - assert(heap->span_counter[span_class_idx].current_allocations > 0); - --heap->span_counter[span_class_idx].current_allocations; + assert(heap->span_counter.current_allocations > 0); + --heap->span_counter.current_allocations; //If it was active, reset counter. 
Otherwise, if not active, remove from //partial free list if we had a previous free block (guard for classes with only 1 block) @@ -1131,7 +1108,7 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { _memory_list_remove(&heap->size_cache[class_idx], span); //Add to span cache - _memory_heap_cache_insert(heap, span, size_class->page_count); + _memory_heap_cache_insert(heap, span); return; } @@ -1157,10 +1134,10 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { //Check if aliased with 64KiB small/medium spans if (span->size_class == SIZE_CLASS_COUNT) { //Track counters - size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); - --heap->span_counter[span_class_idx].current_allocations; + assert(heap->span_counter.current_allocations > 0); + --heap->span_counter.current_allocations; //Add to span cache - _memory_heap_cache_insert(heap, span, SPAN_MAX_PAGE_COUNT); + _memory_heap_cache_insert(heap, span); return; } @@ -1181,10 +1158,13 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { else span->data.list.size = 1; *cache = span; -#if MAX_SPAN_CACHE_DIVISOR > 1 +#if MAX_SPAN_CACHE_DIVISOR > 0 //Check if cache exceeds limit if ((span->data.list.size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && (span->data.list.size > counter->cache_limit)) { +#else + if (span->data.list.size > 65534) { +#endif //Release to global cache count_t list_size = 1; span_t* next = span->next_span; @@ -1200,11 +1180,10 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { *cache = next; _memory_global_cache_large_insert(span, list_size, idx + 1); #if ENABLE_STATISTICS - heap->thread_to_global += list_size * (idx + 1) * SPAN_MAX_SIZE; + heap->thread_to_global += list_size * (idx + 1) * _memory_span_size; #endif } #endif -#endif } //! 
Process pending deferred cross-thread deallocations @@ -1222,7 +1201,7 @@ _memory_deallocate_deferred(heap_t* heap, size_t size_class) { do { void* next = *(void**)p; //Get span and check which type of block - span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); if (span->size_class < SIZE_CLASS_COUNT) { //Small/medium block got_class |= (span->size_class == size_class); @@ -1264,7 +1243,7 @@ _memory_allocate(size_t size) { //Oversized, allocate pages directly size += SPAN_HEADER_SIZE; size_t num_pages = size >> _memory_page_size_shift; - if (size % _memory_page_size) + if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; span_t* span = _memory_map(num_pages, &align_offset); @@ -1283,7 +1262,7 @@ _memory_deallocate(void* p) { return; //Grab the span (always at start of span, using 64KiB alignment) - span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); int32_t heap_id = atomic_load32(&span->heap_id); heap_t* heap = get_thread_heap(); //Check if block belongs to this heap or if deallocation should be deferred @@ -1307,8 +1286,8 @@ _memory_deallocate(void* p) { static void* _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { if (p) { - //Grab the span (always at start of span, using 64KiB alignment) - span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + //Grab the span using guaranteed span alignment + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); int32_t heap_id = atomic_load32(&span->heap_id); if (heap_id) { if (span->size_class < SIZE_CLASS_COUNT) { @@ -1322,21 +1301,21 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { else { //Large block size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size / SPAN_MAX_SIZE; - if (total_size % SPAN_MAX_SIZE) + size_t num_spans = total_size / _memory_span_size; + if (total_size & ~_memory_span_mask) ++num_spans; size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) return p; //Still fits and less than half of memory would be freed if (!oldsize) - oldsize = (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE; + oldsize = (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; } } else { //Oversized block size_t total_size = size + SPAN_HEADER_SIZE; size_t num_pages = total_size >> _memory_page_size_shift; - if (total_size % _memory_page_size) + if (total_size & (_memory_page_size - 1)) ++num_pages; //Page count is stored in next_span size_t current_pages = (size_t)span->next_span; @@ -1363,8 +1342,8 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { //! 
Get the usable size of the given block static size_t _memory_usable_size(void* p) { - //Grab the span (always at start of span, using 64KiB alignment) - span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + //Grab the span using guaranteed span alignment + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); int32_t heap_id = atomic_load32(&span->heap_id); if (heap_id) { if (span->size_class < SIZE_CLASS_COUNT) { @@ -1375,7 +1354,7 @@ _memory_usable_size(void* p) { //Large block size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; - return (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE; + return (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; } //Oversized block, page count is stored in next_span @@ -1386,27 +1365,18 @@ _memory_usable_size(void* p) { //! Adjust and optimize the size class properties for the given class static void _memory_adjust_size_class(size_t iclass) { - //Calculate how many pages are needed for 255 blocks size_t block_size = _memory_size_class[iclass].size; - size_t page_count = (block_size * 255) >> _memory_page_size_shift; - page_count = (page_count == 0) ? 1 : ((page_count > _memory_max_page_count) ? _memory_max_page_count : page_count); - //Merge page counts to span size class granularity - page_count = ((page_count + (SPAN_CLASS_GRANULARITY - 1)) / SPAN_CLASS_GRANULARITY) * SPAN_CLASS_GRANULARITY; - if (page_count > _memory_max_page_count) - page_count = _memory_max_page_count; - size_t block_count = ((page_count * _memory_page_size) - SPAN_HEADER_SIZE) / block_size; - //Store the final configuration - _memory_size_class[iclass].page_count = (uint16_t)page_count; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + _memory_size_class[iclass].block_count = (uint16_t)block_count; _memory_size_class[iclass].class_idx = (uint16_t)iclass; - + //Check if previous size classes can be merged size_t prevclass = iclass; while (prevclass > 0) { --prevclass; //A class can be merged if number of pages and number of blocks are equal - if ((_memory_size_class[prevclass].page_count == _memory_size_class[iclass].page_count) && - (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)) { + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) { memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); } else { @@ -1437,28 +1407,24 @@ int rpmalloc_initialize_config(const rpmalloc_config_t* config) { if (config) memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); - if (!_memory_config.memory_map) - _memory_config.memory_map = _memory_map_os; - if (!_memory_config.memory_unmap) - _memory_config.memory_unmap = _memory_unmap_os; - + _memory_page_size = _memory_config.page_size; if (!_memory_page_size) { #ifdef PLATFORM_WINDOWS SYSTEM_INFO system_info; memset(&system_info, 0, sizeof(system_info)); GetSystemInfo(&system_info); - if (system_info.dwAllocationGranularity < SPAN_ADDRESS_GRANULARITY) - return -1; _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; #else _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; #endif } if (_memory_page_size < 512) _memory_page_size = 512; - if (_memory_page_size > 16384) - _memory_page_size = 16384; + if (_memory_page_size > (16 * 1024)) + _memory_page_size = (16 * 1024); _memory_page_size_shift = 0; size_t page_size_bit = _memory_page_size; @@ -1468,7 +1434,23 @@ 
rpmalloc_initialize_config(const rpmalloc_config_t* config) { } _memory_page_size = ((size_t)1 << _memory_page_size_shift); - _memory_max_page_count = (SPAN_MAX_SIZE >> _memory_page_size_shift); + + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 512; + while (_memory_span_size < span_size) + _memory_span_size <<= 1; + + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + _memory_span_pages_single = _memory_span_size >> _memory_page_size_shift; + if (!_memory_span_pages_single) + _memory_span_pages_single = 1; + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; #if defined(__APPLE__) && ENABLE_PRELOAD if (pthread_key_create(&_memory_thread_heap, 0)) @@ -1511,24 +1493,21 @@ rpmalloc_finalize(void) { while (heap) { _memory_deallocate_deferred(heap, 0); - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; - span_t* span = heap->span_cache[iclass]; - unsigned int span_count = span ? span->data.list.size : 0; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap(span, page_count, span->data.list.align_offset); - span = next_span; - } + span_t* span = heap->span_cache; + size_t span_count = span ? span->data.list.size : 0; + for (size_t ispan = 0; ispan < span_count; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap(span, _memory_span_pages(1), span->data.list.align_offset); + span = next_span; } //Free large spans for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - const size_t span_count = iclass + 1; - span_t* span = heap->large_cache[iclass]; + span_count = iclass + 1; + span = heap->large_cache[iclass]; while (span) { span_t* next_span = span->next_span; - _memory_unmap(span, span_count * SPAN_MAX_PAGE_COUNT, span->data.list.align_offset); + _memory_unmap(span, _memory_span_pages(span_count), span->data.list.align_offset); span = next_span; } } @@ -1543,34 +1522,32 @@ rpmalloc_finalize(void) { atomic_store_ptr(&_memory_orphan_heaps, 0); //Free global caches - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - void* span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; - span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); - while (cache_count) { - span_t* skip_span = span->prev_span; - unsigned int span_count = span->data.list.size; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap(span, (iclass + 1) * SPAN_CLASS_GRANULARITY, span->data.list.align_offset); - span = next_span; - } - span = skip_span; - cache_count -= span_count; + void* span_ptr = atomic_load_ptr(&_memory_span_cache); + size_t cache_count = (uintptr_t)span_ptr & ~_memory_span_mask; + span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & _memory_span_mask)); + while (cache_count) { + span_t* skip_span = span->prev_span; + unsigned int span_count = span->data.list.size; + for (unsigned int ispan = 0; ispan < span_count; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap(span, _memory_span_pages(1), span->data.list.align_offset); + span = next_span; } - atomic_store_ptr(&_memory_span_cache[iclass], 0); + span = skip_span; + cache_count -= span_count; } + atomic_store_ptr(&_memory_span_cache, 0); for (size_t iclass = 
0; iclass < LARGE_CLASS_COUNT; ++iclass) { - void* span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); - size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; - span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); + span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + cache_count = (uintptr_t)span_ptr & ~_memory_span_mask; + span = (span_t*)((void*)((uintptr_t)span_ptr & _memory_span_mask)); while (cache_count) { span_t* skip_span = span->prev_span; unsigned int span_count = span->data.list.size; for (unsigned int ispan = 0; ispan < span_count; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap(span, (iclass + 1) * SPAN_MAX_PAGE_COUNT, span->data.list.align_offset); + _memory_unmap(span, _memory_span_pages(iclass + 1), span->data.list.align_offset); span = next_span; } span = skip_span; @@ -1612,35 +1589,32 @@ rpmalloc_thread_finalize(void) { _memory_deallocate_deferred(heap, 0); //Release thread cache spans back to global cache - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; - span_t* span = heap->span_cache[iclass]; - while (span) { - if (span->data.list.size > MIN_SPAN_CACHE_RELEASE) { - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - last->next_span = 0; //Terminate list - next->data.list.size = span->data.list.size - list_size; - _memory_global_cache_insert(span, list_size, page_count); - span = next; - } - else { - _memory_global_cache_insert(span, span->data.list.size, page_count); - span = 0; + span_t* span = heap->span_cache; + while (span) { + if (span->data.list.size > MIN_SPAN_CACHE_RELEASE) { + count_t list_size = 1; + span_t* next = span->next_span; + span_t* last = span; + while (list_size < MIN_SPAN_CACHE_RELEASE) { + last = next; + next = next->next_span; + ++list_size; } + last->next_span = 0; //Terminate list + next->data.list.size = span->data.list.size - list_size; + _memory_global_cache_insert(span, list_size); + span = next; + } + else { + _memory_global_cache_insert(span, span->data.list.size); + span = 0; } - heap->span_cache[iclass] = 0; } + heap->span_cache = 0; for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { const size_t span_count = iclass + 1; - span_t* span = heap->large_cache[iclass]; + span = heap->large_cache[iclass]; while (span) { if (span->data.list.size > MIN_SPAN_CACHE_RELEASE) { count_t list_size = 1; @@ -1664,25 +1638,15 @@ rpmalloc_thread_finalize(void) { heap->large_cache[iclass] = 0; } - //Reset allocation counters - memset(heap->span_counter, 0, sizeof(heap->span_counter)); - memset(heap->large_counter, 0, sizeof(heap->large_counter)); -#if ENABLE_STATISTICS - heap->requested = 0; - heap->allocated = 0; - heap->thread_to_global = 0; - heap->global_to_thread = 0; -#endif - //Orphan the heap void* raw_heap; uintptr_t orphan_counter; heap_t* last_heap; do { last_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = (void*)((uintptr_t)last_heap & ~(uintptr_t)0xFFFF); + heap->next_orphan = (void*)((uintptr_t)last_heap & _memory_span_mask); orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & 0xFFFF)); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_span_mask)); } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); @@ -1694,37 +1658,62 @@ 
rpmalloc_is_thread_initialized(void) { return (get_thread_heap() != 0) ? 1 : 0; } +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + //! Map new pages to virtual memory static void* -_memory_map_os(size_t size) { +_memory_map_os(size_t size, size_t* offset) { void* ptr; + size_t padding = 0; + + if (_memory_span_size > _memory_map_granularity) + padding = _memory_span_size; #ifdef PLATFORM_WINDOWS - ptr = VirtualAlloc(0, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert("Failed to map virtual memory block" == 0); + return 0; + } #else - size_t padding = SPAN_ADDRESS_GRANULARITY; - ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); if (ptr == MAP_FAILED) { - assert(!"Failed to map virtual memory block"); + assert("Failed to map virtual memory block" == 0); return 0; } - - padding -= (uintptr_t)ptr % SPAN_ADDRESS_GRANULARITY; - ptr = pointer_offset(ptr, padding); #endif + if (padding) { + padding -= (uintptr_t)ptr & ~_memory_span_mask; + ptr = pointer_offset(ptr, padding); + assert(padding <= _memory_span_size); + assert(!(padding & 3)); + assert(!((uintptr_t)ptr & ~_memory_span_mask)); + *offset = padding >> 2; + } + return ptr; } //! Unmap pages from virtual memory static void -_memory_unmap_os(void* ptr, size_t size) { +_memory_unmap_os(void* address, size_t size, size_t offset) { + if (offset) { + size += _memory_span_size; + address = pointer_offset(address, -(offset_t)(offset << 2)); + } #ifdef PLATFORM_WINDOWS - VirtualFree(ptr, 0, MEM_RELEASE); (void)sizeof(size); + if (!VirtualFree(address, 0, MEM_RELEASE)) { + assert("Failed to unmap virtual memory block" == 0); + } #else - munmap(ptr, size); + if (munmap(address, size)) { + assert("Failed to unmap virtual memory block" == 0); + } #endif } @@ -1772,7 +1761,7 @@ _memory_validate_integrity(void* p) { return; void* block_start; size_t block_size = _memory_usable_size(p); - span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); int32_t heap_id = atomic_load32(&span->heap_id); if (heap_id) { if (span->size_class < SIZE_CLASS_COUNT) { @@ -1818,7 +1807,7 @@ void rpfree(void* ptr) { #if ENABLE_GUARDS _memory_validate_integrity(ptr); -#endif +#endif _memory_deallocate(ptr); } @@ -1983,7 +1972,7 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { void* p = atomic_load_ptr(&heap->defer_deallocate); while (p) { void* next = *(void**)p; - span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); stats->deferred += _memory_size_class[span->size_class].size; p = next; } @@ -1999,10 +1988,8 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { } } - for (size_t isize = 0; isize < SPAN_CLASS_COUNT; ++isize) { - if (heap->span_cache[isize]) - stats->spancache = (size_t)heap->span_cache[isize]->data.list.size * (isize + 1) * SPAN_CLASS_GRANULARITY * _memory_page_size; - } + if (heap->span_cache) + stats->spancache = (size_t)heap->span_cache->data.list.size * _memory_span_size; } void @@ -2013,24 +2000,23 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; #endif - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - void* 
global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { - thread_yield(); - global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - } - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; - size_t list_bytes = global_span_count * (iclass + 1) * SPAN_CLASS_GRANULARITY * _memory_page_size; - stats->cached += list_bytes; + void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); + while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { + thread_yield(); + global_span_ptr = atomic_load_ptr(&_memory_span_cache); } + uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; + size_t list_bytes = global_span_count * _memory_span_size; + stats->cached += list_bytes; + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - void* global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { thread_yield(); global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); } - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; - size_t list_bytes = global_span_count * (iclass + 1) * SPAN_MAX_PAGE_COUNT * _memory_page_size; + global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; + list_bytes = global_span_count * (iclass + 1) * _memory_span_size; stats->cached_large += list_bytes; } } diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 52ab2d3f..2a8f9549 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -68,17 +68,23 @@ typedef struct rpmalloc_thread_statistics_t { typedef struct rpmalloc_config_t { //! Map memory pages for the given number of bytes. The returned address MUST be - // 2 byte aligned, and should ideally be 64KiB aligned. If memory returned is not - // 64KiB aligned rpmalloc will call unmap and then another map request with size - // padded by 64KiB in order to align it internally. - void* (*memory_map)(size_t size); + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset variable + // in case it performs alignment and the returned pointer is offset from the + // actual start of the memory region due to this alignment. The alignment offset + // will be passed to the memory unmap function. + void* (*memory_map)(size_t size, size_t* offset); //! Unmap the memory pages starting at address and spanning the given number of bytes. - // Address will always be an address returned by an earlier call to memory_map function. - void (*memory_unmap)(void* address, size_t size); - //! Size of memory pages. All allocation requests will be made in multiples of this page - // size. If set to 0, rpmalloc will use system calls to determine the page size. The page - // size MUST be a power of two in [512,16384] range (2^9 to 2^14). + // The address, size and offset variables will always be a value triple as used + // in and returned by an earlier call to memory_map + void (*memory_unmap)(void* address, size_t size, size_t offset); + //! Size of memory pages. If set to 0, rpmalloc will use system calls to determine the page size. + // The page size MUST be a power of two in [512,16384] range (2^9 to 2^14). size_t page_size; + //! Size of a span of memory pages. MUST be a multiple of page size, and in [512,262144] range (unless 0). + // Set to 0 to use the default span size. 
All memory mapping requests to memory_map will be made with + // size set to a multiple of the span size. + size_t span_size; //! Debug callback if memory guards are enabled. Called if a memory overwrite is detected void (*memory_overwrite)(void* address); } rpmalloc_config_t; @@ -89,6 +95,9 @@ rpmalloc_initialize(void); extern int rpmalloc_initialize_config(const rpmalloc_config_t* config); +extern const rpmalloc_config_t* +rpmalloc_config(void); + extern void rpmalloc_finalize(void); diff --git a/test/main.c b/test/main.c index cd8d8aee..9aac1107 100644 --- a/test/main.c +++ b/test/main.c @@ -49,7 +49,7 @@ test_alloc(void) { if (addr[ipass] == 0) return -1; - memcpy(addr[ipass], data, 500); + memcpy(addr[ipass], data + ipass, 500); for (icheck = 0; icheck < ipass; ++icheck) { if (addr[icheck] == addr[ipass]) @@ -66,7 +66,7 @@ test_alloc(void) { } for (ipass = 0; ipass < 8142; ++ipass) { - if (memcmp(addr[ipass], data, 500)) + if (memcmp(addr[ipass], data + ipass, 500)) return -1; } @@ -82,7 +82,7 @@ test_alloc(void) { if (addr[ipass] == 0) return -1; - memcpy(addr[ipass], data, cursize); + memcpy(addr[ipass], data + ipass, cursize); for (icheck = 0; icheck < ipass; ++icheck) { if (addr[icheck] == addr[ipass]) @@ -100,7 +100,7 @@ test_alloc(void) { for (ipass = 0; ipass < 1024; ++ipass) { unsigned int cursize = datasize[ipass%7] + ipass; - if (memcmp(addr[ipass], data, cursize)) + if (memcmp(addr[ipass], data + ipass, cursize)) return -1; } @@ -114,7 +114,7 @@ test_alloc(void) { if (addr[ipass] == 0) return -1; - memcpy(addr[ipass], data, 500); + memcpy(addr[ipass], data + ipass, 500); for (icheck = 0; icheck < ipass; ++icheck) { if (addr[icheck] == addr[ipass]) @@ -131,7 +131,7 @@ test_alloc(void) { } for (ipass = 0; ipass < 1024; ++ipass) { - if (memcmp(addr[ipass], data, 500)) + if (memcmp(addr[ipass], data + ipass, 500)) return -1; } @@ -254,7 +254,7 @@ allocator_thread(void* argp) { ret = -1; goto end; } - + rpfree(addr[ipass]); } } @@ -292,9 +292,9 @@ crossallocator_thread(void* argp) { } } +end: rpmalloc_thread_finalize(); -end: thread_exit((uintptr_t)ret); } From 0b83775571bd27ae2834a67ca63a7e634510caa8 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 31 Jan 2018 08:51:04 +0100 Subject: [PATCH 09/42] update docs --- README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d22c7784..a3c2c7c3 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Platforms currently supported: - Linux - Android -The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured in runtime to a custom implementation. +The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured in runtime to a custom implementation and mapping granularity/size. This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license. 
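The custom mapping hook documented above can be illustrated with a minimal POSIX sketch, assuming the default 64KiB span size; `custom_map`, `custom_unmap`, `app_init` and the `SPAN_SIZE` constant are placeholder names for this example, not part of the library:

```c
#include <sys/mman.h>
#include <stdint.h>
#include "rpmalloc.h"

#define SPAN_SIZE 65536  /* assumed span size, must match config.span_size */

/* Map pages, oversizing by one span so the returned address can be aligned
   to a span boundary. The byte padding is reported back through *offset. */
static void*
custom_map(size_t size, size_t* offset) {
	void* ptr = mmap(0, size + SPAN_SIZE, PROT_READ | PROT_WRITE,
	                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ptr == MAP_FAILED)
		return 0;
	size_t misalign = (uintptr_t)ptr & (SPAN_SIZE - 1);
	size_t padding = misalign ? (SPAN_SIZE - misalign) : 0;
	*offset = padding;  /* padding is always < 64KiB here */
	return (char*)ptr + padding;
}

/* Unmap pages, undoing the alignment padding reported by custom_map. The
   address/size/offset triple is the same one produced by the map call. */
static void
custom_unmap(void* address, size_t size, size_t offset) {
	munmap((char*)address - offset, size + SPAN_SIZE);
}

static int
app_init(void) {
	rpmalloc_config_t config = {0};
	config.memory_map = custom_map;
	config.memory_unmap = custom_unmap;
	config.span_size = SPAN_SIZE;
	return rpmalloc_initialize_config(&config);
}
```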
@@ -18,7 +18,7 @@ Please consider our Patreon to support our work - https://www.patreon.com/rampan Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) / Rampant Pixels - http://www.rampantpixels.com # Performance -We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2000 lines of C code. +We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2000 lines of C code. Contained in a parallel repository is a benchmark utility that performs interleaved allocations (both aligned to 8 or 16 bytes, and unaligned) and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments. @@ -37,7 +37,7 @@ The easiest way to use the library is simply adding rpmalloc.[h|c] to your proje __rpmalloc_initialize__ : Call at process start to initialize the allocator -__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend and/or memory page size +__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity. __rpmalloc_finalize__: Call at process exit to finalize the allocator @@ -45,6 +45,8 @@ __rpmalloc_thread_initialize__: Call at each thread start to initialize the thre __rpmalloc_thread_finalize__: Call at each thread exit to finalize and release thread cache back to global cache +__rpmalloc_config__: Get the current runtime configuration of the allocator + Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment, they are simply wrappers to make it easier to replace in existing code. If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, also include the `malloc.c` file in your project. The automatic init/fini is only implemented for Linux and macOS targets. The list of libc entry points replaced may not be complete, use libc replacement only as a convenience for testing the library on an existing code base, not a final solution. @@ -78,13 +80,13 @@ Overwrite and underwrite guards are enabled if __ENABLE_GUARDS__ is defined to 1 The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. 
Allocations for different size classes will be served from different set of memory pages, each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages. # Implementation details -The allocator is based on 64KiB page alignment and 16 byte block alignment, where all runs of memory pages are mapped to 64KiB boundaries. On Windows this is automatically guaranteed by the VirtualAlloc granularity, and on mmap systems it is achieved by atomically incrementing the address where pages are mapped to. By aligning to 64KiB boundaries the free operation can locate the header of the memory block without having to do a table lookup (as tcmalloc does) by simply masking out the low 16 bits of the address. +The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits). -Memory blocks are divided into three categories. Small blocks are [16, 2032] bytes, medium blocks (2032, 32720] bytes, and large blocks (32720, 2097120] bytes. The three categories are further divided in size classes. +Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 2016] bytes, medium blocks (2016, 32720] bytes, and large blocks (32720, 2097120] bytes. The three categories are further divided in size classes. If the span size is changed, the small block classes remain but medium blocks go from (2016, span size] bytes. -Small blocks have a size class granularity of 16 bytes each in 127 buckets. Medium blocks have a granularity of 512 bytes, 60 buckets. Large blocks have a 64KiB granularity, 32 buckets. All allocations are fitted to these size class boundaries (an allocation of 34 bytes will allocate a block of 48 bytes). Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested. +Small blocks have a size class granularity of 32 bytes each in 63 buckets. Medium blocks have a granularity of 512 bytes, 60 buckets (default). Large blocks have a the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 42 bytes will allocate a block of 64 bytes). Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested. -Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. 
The third level is a per thread list of free spans for each number of pages in the span configuration. The fourth level is a global list of free spans for each number of pages in the span configuration. +Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans. Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks that are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread. @@ -93,9 +95,9 @@ Large blocks, or super spans, are cached in two levels. The first level is a per # Memory mapping By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These function should reserve and free the requested number of bytes. -The functions do not need to deal with alignment, this is done by rpmalloc internally. However, ideally the map function should return pages aligned to 64KiB boundaries in order to avoid extra mapping requests (see caveats section below). What will happen is that if the first map call returns an address that is not 64KiB aligned, rpmalloc will immediately unmap that block and call a new mapping request with the size increased by 64KiB, then perform the alignment internally. To avoid this double mapping, always return blocks aligned to 64KiB. +The functions must guarantee alignment to the configured span size. Either provide the span size during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the span size. The span size MUST be a power of two in [512, 262144] range, and be a multiple (or divisor) of the memory page size. -Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two in [512, 16384] range. +Memory mapping requests are always done in multiples of the span size or memory page size, whichever is larger. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two in [512, 16384] range. # Memory guards If you define the __ENABLE_GUARDS__ to 1, all memory allocations will be padded with extra guard areas before and after the memory block (while still honoring the requested alignment). These dead zones will be filled with a pattern and checked when the block is freed. If the patterns are not intact the callback set in initialization config is called, or if not set an assert is fired. 
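As a companion sketch of the guard callback and the configuration query described above (illustrative only; `report_overwrite` is a placeholder name, and a non-zero return from __rpmalloc_initialize_config__ is treated as failure):

```c
#include <stdio.h>
#include "rpmalloc.h"

/* Invoked by the allocator when a guard pattern around a freed block has been
   damaged (only when built with ENABLE_GUARDS=1 and a callback is set). */
static void
report_overwrite(void* address) {
	fprintf(stderr, "memory overwrite detected in block at %p\n", address);
}

int
main(void) {
	rpmalloc_config_t config = {0};
	config.memory_overwrite = report_overwrite;
	if (rpmalloc_initialize_config(&config))
		return -1;

	/* Page size and span size are filled in with the effective defaults during
	   initialization and can be read back to learn the required mapping alignment. */
	const rpmalloc_config_t* effective = rpmalloc_config();
	printf("page size %zu, span size %zu\n",
	       effective->page_size, effective->span_size);

	void* block = rpmalloc(100);
	rpfree(block);

	rpmalloc_finalize();
	return 0;
}
```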
@@ -127,7 +129,7 @@ Threads that perform a lot of allocations and deallocations in a pattern that ha # Caveats Cross-thread deallocations are more costly than in-thread deallocations, since the spans are completely owned by the allocating thread. The free operation will be deferred using an atomic list operation and the actual free operation will be performed when the owner thread requires a new block of the corresponding size class. -VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with 64KiB to be able to always return a memory area with this alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. +VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. The free, realloc and usable size functions all require the passed pointer to be within the first 64KiB page block of the start of the memory block. You cannot pass in any pointer from the memory block address range. From d4237be15757c0365fe5ecd5354745b5ed7be5c9 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 4 Feb 2018 21:03:31 +0100 Subject: [PATCH 10/42] Super spans (#50) allow spans to map/unmap in superspan groups unify cache handling and allow breakup of large superspans --- CHANGELOG | 8 +- README.md | 4 +- rpmalloc/rpmalloc.c | 561 +++++++++++++++++++++++--------------------- rpmalloc/rpmalloc.h | 17 +- 4 files changed, 312 insertions(+), 278 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 602a810d..c9274332 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,15 +1,21 @@ -1.2.3 +1.3.0 Make span size configurable and all spans equal in size, removing span size classes and streamlining the thread cache. +Allow super spans to be reserved in advance and split up in multiple used spans to reduce number of system calls. This will not increase committed physical pages, only reserved virtual memory space. + Fixed an issue where an allocation of zero bytes would cause a segmentation fault from indexing size class array with index -1. Fixed an issue where an allocation of maximum large block size (2097120 bytes) would index the heap cache array out of bounds and potentially cause a segmentation fault depending on earlier allocation patterns. Fixed an issue where memory pages at start of aligned span run was not completely unmapped on POSIX systems. +Fixed an issue where spans were not correctly marked as owned by the heap after traversing the global span cache. + Added function to access the allocator configuration after initialization to find default values. +Removed allocated and reserved statistics to reduce code complexity. + 1.2.2 diff --git a/README.md b/README.md index a3c2c7c3..50bbc7ad 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,9 @@ By default the allocator uses OS APIs to map virtual memory pages as needed, eit The functions must guarantee alignment to the configured span size. 
Either provide the span size during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the span size. The span size MUST be a power of two in [512, 262144] range, and be a multiple (or divisor) of the memory page size. -Memory mapping requests are always done in multiples of the span size or memory page size, whichever is larger. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two in [512, 16384] range. +Memory mapping requests are always done in multiples of the memory page size, whichever is larger. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two in [512, 16384] range. + +To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 8). If the platform can handle partial unmaps (unmapping one or more spans of memory pages mapped in a larger batch) the `unmap_partial` configuration variable should be set to non-zero. If not, spans will be kept until the entire batch can be unmapped. # Memory guards If you define the __ENABLE_GUARDS__ to 1, all memory allocations will be padded with extra guard areas before and after the memory block (while still honoring the requested alignment). These dead zones will be filled with a pattern and checked when the block is freed. If the patterns are not intact the callback set in initialization config is called, or if not set an assert is fired. diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 3533985a..57ca6667 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -24,16 +24,19 @@ #define MIN_SPAN_CACHE_RELEASE 64 #define MAX_SPAN_CACHE_DIVISOR 1 #define MIN_SPAN_CACHE_SIZE 0 +#define DEFAULT_SPAN_MAP_COUNT 16 #elif defined(DISABLE_CACHE) //Disable cache #define MIN_SPAN_CACHE_RELEASE 1 #define MAX_SPAN_CACHE_DIVISOR 0 +#define DEFAULT_SPAN_MAP_COUNT 1 #elif defined(ENABLE_SPACE_PRIORITY_CACHE) // Space priority cache limits #define MIN_SPAN_CACHE_SIZE 8 #define MIN_SPAN_CACHE_RELEASE 8 #define MAX_SPAN_CACHE_DIVISOR 16 #define GLOBAL_SPAN_CACHE_MULTIPLIER 1 +#define DEFAULT_SPAN_MAP_COUNT 4 #else // Default - performance priority cache limits //! Limit of thread cache in number of spans (undefine for unlimited cache - i.e never release spans to global cache unless thread finishes) @@ -45,6 +48,8 @@ #define MAX_SPAN_CACHE_DIVISOR 4 //! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) #define GLOBAL_SPAN_CACHE_MULTIPLIER 8 +//! Default number of spans to map in call to map more virtual memory +#define DEFAULT_SPAN_MAP_COUNT 8 #endif //! Size of heap hashmap @@ -214,7 +219,7 @@ static size_t _memory_span_size; //! Mask to get to start of a memory span static uintptr_t _memory_span_mask; //! Number of memory pages in a single span (or 1, if span < page) -static size_t _memory_span_pages_single; +static size_t _memory_span_pages; //! Granularity of a small allocation block #define SMALL_GRANULARITY 32 @@ -242,8 +247,6 @@ static size_t _memory_span_pages_single; //! 
Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -#define SPAN_LIST_LOCK_TOKEN ((void*)1) - #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) @@ -280,6 +283,9 @@ typedef union span_data_t span_data_t; //! Cache data typedef struct span_counter_t span_counter_t; +#define SPAN_FLAG_MASTER 1 +#define SPAN_FLAG_SUBSPAN 2 + //Alignment offset must match in both structures //to keep the data when transitioning between being //used for blocks and being part of a list @@ -314,7 +320,9 @@ struct span_t { //! Heap ID atomic32_t heap_id; //! Size class - count_t size_class; + uint16_t size_class; + //! Flags and counters + uint16_t flags; //! Span data span_data_t data; //! Next span @@ -344,10 +352,16 @@ struct heap_t { span_t* active_span[SIZE_CLASS_COUNT]; //! List of semi-used spans with free blocks for each size class (double linked list) span_t* size_cache[SIZE_CLASS_COUNT]; - //! List of free spans for each page count (single linked list) + //! List of free spans (single linked list) span_t* span_cache; //! Allocation counters span_counter_t span_counter; + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! Number of mapped but unused spans + size_t spans_reserved; //! List of free spans for each large class count (single linked list) span_t* large_cache[LARGE_CLASS_COUNT]; //! Allocation counters for large blocks @@ -359,10 +373,6 @@ struct heap_t { //! Memory pages alignment offset size_t align_offset; #if ENABLE_STATISTICS - //! Number of bytes currently requested in allocations - size_t requested; - //! Number of bytes current allocated - size_t allocated; //! Number of bytes transitioned thread -> global size_t thread_to_global; //! Number of bytes transitioned global -> thread @@ -482,196 +492,149 @@ _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { counter->max_allocations = counter->current_allocations; #if MAX_SPAN_CACHE_DIVISOR > 0 counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; + if (counter->cache_limit > (_memory_span_size - 2)) + counter->cache_limit = (_memory_span_size - 2); + if (counter->cache_limit < (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) + counter->cache_limit = (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE); +#else + counter->cache_limit = (_memory_span_size - 2); #endif if (counter->max_allocations > *global_counter) *global_counter = counter->max_allocations; } } -static size_t -_memory_span_pages(size_t num_spans) { - if (_memory_span_size >= _memory_page_size) - return _memory_span_pages_single * num_spans; - - size_t num_bytes = (num_spans * _memory_span_size); - size_t num_pages = num_bytes >> _memory_page_size_shift; - if (num_bytes & (_memory_page_size - 1)) - ++num_pages; - return num_pages; -} - static void* -_memory_map(size_t page_count, size_t* offset) { - const size_t size = page_count * _memory_page_size; - +_memory_map(size_t size, size_t* offset) { #if ENABLE_STATISTICS + const size_t page_count = (size >> _memory_page_size_shift); atomic_add32(&_mapped_pages, (int32_t)page_count); atomic_add32(&_mapped_total, (int32_t)page_count); #endif - - return _memory_config.memory_map ? 
- _memory_config.memory_map(size, offset) : - _memory_map_os(size, offset); + assert(!(size % _memory_page_size)); + return _memory_config.memory_map(size, offset); } static void -_memory_unmap(void* address, size_t page_count, size_t offset) { - size_t size = page_count * _memory_page_size; - +_memory_unmap(void* address, size_t size, size_t offset) { #if ENABLE_STATISTICS + const size_t page_count = (size >> _memory_page_size_shift); atomic_add32(&_mapped_pages, -(int32_t)page_count); atomic_add32(&_unmapped_total, (int32_t)page_count); #endif - - if (_memory_config.memory_unmap) - _memory_config.memory_unmap(address, size, offset); - else - _memory_unmap_os(address, size, offset); + assert(!((uintptr_t)address & ~_memory_span_mask)); + assert(!(size % _memory_page_size)); + _memory_config.memory_unmap(address, size, offset); } -//! Insert the given list of memory page spans in the global cache for small/medium blocks -static void -_memory_global_cache_insert(span_t* first_span, size_t list_size) { - assert((list_size == 1) || (first_span->next_span != 0)); -#if MAX_SPAN_CACHE_DIVISOR > 0 - while (1) { - void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); - if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - -#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); - if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) - break; -#endif - //We only have limited number of bits for size of list, avoid overflow - if ((global_list_size + list_size) & _memory_span_mask) - break; +//! Map in memory pages for the given number of spans (or use previously reserved pages) +static span_t* +_memory_map_spans(heap_t* heap, size_t num_spans) { + if (num_spans <= heap->spans_reserved) { + span_t* span = heap->span_reserve; + heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); + heap->spans_reserved -= num_spans; + //set flag in span that it is a subspan with a master span + uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) / _memory_span_size); + span->flags = SPAN_FLAG_SUBSPAN | (distance << 2); + return span; + } + + //We cannot request extra spans if we already have some (but not enough) pending reserved spans + size_t request_spans = (heap->spans_reserved || (num_spans > _memory_config.span_map_count)) ? 
num_spans : _memory_config.span_map_count; + size_t align_offset = 0; + span_t* span = _memory_map(_memory_span_size * request_spans, &align_offset); + span->data.block.align_offset = (uint16_t)align_offset; - //Use prev_span as skip pointer over this sublist range of spans - first_span->data.list.size = (uint32_t)list_size; - first_span->prev_span = global_span; + if (request_spans > num_spans) { + assert(request_spans == _memory_config.span_map_count); + heap->spans_reserved = request_spans - num_spans; + heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); + heap->span_reserve_master = span; - //Insert sublist into global cache - global_list_size += list_size; - void* first_span_ptr = (void*)((uintptr_t)first_span | global_list_size); - if (atomic_cas_ptr(&_memory_span_cache, first_span_ptr, global_span_ptr)) - return; - } - else { - //Atomic operation failed, yield timeslice and retry - thread_yield(); - atomic_thread_fence_acquire(); - } + span->flags = SPAN_FLAG_MASTER | ((uint16_t)request_spans << 2); } -#endif - //Global cache full, release pages - for (size_t ispan = 0; ispan < list_size; ++ispan) { - assert(first_span); - span_t* next_span = first_span->next_span; - _memory_unmap(first_span, _memory_span_pages(1), first_span->data.list.align_offset); - first_span = next_span; + else { + span->flags = 0; } -} - -//! Extract a number of memory page spans from the global cache for small/medium blocks -static span_t* -_memory_global_cache_extract(void) { - span_t* span = 0; - atomic_thread_fence_acquire(); - void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); - while (global_span_ptr) { - if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && - atomic_cas_ptr(&_memory_span_cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { - //Grab a number of thread cache spans, using the skip span pointer - //stored in prev_span to quickly skip ahead in the list to get the new head - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; - span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - assert((span->data.list.size == 1) || (span->next_span != 0)); - span_t* new_global_span = span->prev_span; - global_span_count -= span->data.list.size; - assert(!(global_span_count & _memory_span_mask)); - - //Set new head of global cache list - void* new_cache_head = global_span_count && new_global_span ? - ((void*)((uintptr_t)new_global_span | global_span_count)) : - 0; - atomic_store_ptr(&_memory_span_cache, new_cache_head); - atomic_thread_fence_release(); - break; - } + return span; +} - //List busy, yield timeslice and retry - thread_yield(); - atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(&_memory_span_cache); +//! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { + //Check if span is a subspan with a master span or unmap cannot do partial unmappings + if (_memory_config.unmap_partial || !(span->flags & 3)) { + _memory_unmap(span, _memory_span_size * num_spans, align_offset); + return; } - return span; + uint32_t is_subspan = span->flags & SPAN_FLAG_SUBSPAN; + uint32_t is_master = span->flags & SPAN_FLAG_MASTER; + assert((is_subspan || is_master) && !(is_subspan && is_master)); + uint32_t distance = (is_subspan ? 
(span->flags >> 2) : 0); + span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); + uint32_t remains = master->flags >> 2; + if (remains <= num_spans) { + assert(remains == num_spans); + _memory_unmap(master, _memory_span_size * _memory_config.span_map_count, master->data.list.align_offset); + } + else { + assert(remains > num_spans); + remains -= (uint32_t)num_spans; + master->flags = ((uint16_t)remains << 2) | SPAN_FLAG_MASTER; + } } -/*! Insert the given list of memory page spans in the global cache for large blocks, - similar to _memory_global_cache_insert */ +//! Insert the given list of memory page spans in the global cache static void -_memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { +_memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, size_t span_count, size_t cache_limit) { assert((list_size == 1) || (span_list->next_span != 0)); - assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); #if MAX_SPAN_CACHE_DIVISOR > 0 - atomicptr_t* cache = &_memory_large_cache[span_count - 1]; while (1) { void* global_span_ptr = atomic_load_ptr(cache); - if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); -#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); - if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) - break; -#endif - if ((global_list_size + list_size) & _memory_span_mask) - break; + if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) + break; + if ((global_list_size + list_size) & _memory_span_mask) + break; - span_list->data.list.size = (uint32_t)list_size; - span_list->prev_span = global_span; + span_list->data.list.size = (uint32_t)list_size; + span_list->prev_span = global_span; - global_list_size += list_size; - void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); - if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) - return; - } - else { - thread_yield(); - atomic_thread_fence_acquire(); - } + global_list_size += list_size; + void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); + if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) + return; + + thread_yield(); + atomic_thread_fence_acquire(); } #endif //Global cache full, release spans for (size_t ispan = 0; ispan < list_size; ++ispan) { assert(span_list); span_t* next_span = span_list->next_span; - _memory_unmap(span_list, _memory_span_pages(span_count), span_list->data.list.align_offset); + _memory_unmap_spans(span_list, span_count, span_list->data.list.align_offset); span_list = next_span; } } -/*! Extract a number of memory page spans from the global cache for large blocks, - similar to _memory_global_cache_extract */ +//! 
Extract a number of memory page spans from the global cache static span_t* -_memory_global_cache_large_extract(size_t span_count) { +_memory_cache_extract(atomicptr_t* cache, size_t span_count) { span_t* span = 0; - atomicptr_t* cache = &_memory_large_cache[span_count - 1]; atomic_thread_fence_acquire(); void* global_span_ptr = atomic_load_ptr(cache); while (global_span_ptr) { - if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && - atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { + if (atomic_cas_ptr(cache, 0, global_span_ptr)) { uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); assert((span->data.list.size == 1) || (span->next_span != 0)); - assert(span->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); + assert(span->data.list.size <= global_list_size); span_t* new_global_span = span->prev_span; global_list_size -= span->data.list.size; @@ -680,9 +643,10 @@ _memory_global_cache_large_extract(size_t span_count) { void* new_global_span_ptr = global_list_size && new_global_span ? ((void*)((uintptr_t)new_global_span | global_list_size)) : 0; - atomic_store_ptr(cache, new_global_span_ptr); - atomic_thread_fence_release(); - break; + if (atomic_cas_ptr(cache, new_global_span_ptr, 0)) { + span->prev_span = 0; + break; + } } thread_yield(); @@ -692,29 +656,56 @@ _memory_global_cache_large_extract(size_t span_count) { return span; } +//! Insert the given list of memory page spans in the global cache for small/medium blocks +static void +_memory_global_cache_insert(span_t* span_list, size_t list_size) { +#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER + const size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); +#else + const size_t cache_limit = 0; +#endif + _memory_cache_insert(&_memory_span_cache, span_list, list_size, 1, cache_limit); +} + +//! Extract a number of memory page spans from the global cache for small/medium blocks +static span_t* +_memory_global_cache_extract(void) { + return _memory_cache_extract(&_memory_span_cache, 1); +} + +//! Insert the given list of memory page spans in the global cache for large blocks +static void +_memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { + assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); +#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER + const size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); +#else + const size_t cache_limit = 0; +#endif + _memory_cache_insert(&_memory_large_cache[span_count - 1], span_list, list_size, span_count, cache_limit); +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_large_extract(size_t span_count) { + return _memory_cache_extract(&_memory_large_cache[span_count - 1], span_count); +} + //! Allocate a small/medium sized memory block from the given heap static void* _memory_allocate_from_heap(heap_t* heap, size_t size) { -#if ENABLE_STATISTICS - //For statistics we need to store the requested size in the memory block - size += sizeof(size_t); -#endif //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? 
((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); + assert(!base_idx || ((base_idx - 1) < SIZE_CLASS_COUNT)); const size_t class_idx = _memory_size_class[base_idx ? (base_idx - 1) : 0].class_idx; span_block_t* active_block = heap->active_block + class_idx; size_class_t* size_class = _memory_size_class + class_idx; const count_t class_size = size_class->size; -#if ENABLE_STATISTICS - heap->allocated += class_size; - heap->requested += size; -#endif - //Step 1: Try to get a block from the currently active span. The span block bookkeeping // data for the active span is stored in the heap for faster access use_active: @@ -745,11 +736,6 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { assert(active_block->free_list < size_class->block_count); } -#if ENABLE_STATISTICS - //Store the requested size for statistics - *(size_t*)pointer_offset(block, class_size - sizeof(size_t)) = size; -#endif - return block; } @@ -767,9 +753,13 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { span_t* span = heap->size_cache[class_idx]; *active_block = span->data.block; assert(active_block->free_count > 0); - span_t* next_span = span->next_span; - heap->size_cache[class_idx] = next_span; + heap->size_cache[class_idx] = span->next_span; heap->active_span[class_idx] = span; + + //Mark span as owned by this heap + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + goto use_active; } @@ -797,16 +787,14 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } else { //Step 6: All caches empty, map in new memory pages - size_t align_offset = 0; - span = _memory_map(_memory_span_pages(1), &align_offset); - span->data.block.align_offset = (uint16_t)align_offset; + span = _memory_map_spans(heap, 1); } //Mark span as owned by this heap and set base data atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); - span->size_class = (count_t)class_idx; + span->size_class = (uint16_t)class_idx; //If we only have one block we will grab it, otherwise //set span as new span to use for next allocation @@ -825,11 +813,6 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { //Track counters _memory_counter_increase(&heap->span_counter, &_memory_max_allocation); -#if ENABLE_STATISTICS - //Store the requested size for statistics - *(size_t*)pointer_offset(span, SPAN_HEADER_SIZE + class_size - sizeof(size_t)) = size; -#endif - //Return first block if memory page span return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -876,9 +859,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { } else { //Step 3: All caches empty, map in new memory pages - size_t align_offset = 0; - span = _memory_map(_memory_span_pages(1), &align_offset); - span->data.block.align_offset = (uint16_t)align_offset; + span = _memory_map_spans(heap, 1); } //Mark span as owned by this heap and set base data @@ -894,6 +875,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { } use_cache: + assert((idx > 0) && (idx < LARGE_CLASS_COUNT)); //Step 1: Check if cache for this large size class (or the following, unless first class) has a span while (!heap->large_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (num_spans + 1))) ++idx; @@ -910,7 +892,11 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { heap->large_cache[idx] = 0; } - span->size_class = SIZE_CLASS_COUNT + (count_t)idx; + span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); + + 
//Mark span as owned by this heap + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); //Increase counter _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); @@ -920,6 +906,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Restore index, we're back to smallest fitting span count idx = num_spans - 1; + assert((idx > 0) && (idx < LARGE_CLASS_COUNT)); //Step 2: Process deferred deallocation if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) @@ -943,15 +930,13 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { } else { //Step 4: Map in more memory pages - size_t align_offset = 0; - span = _memory_map(_memory_span_pages(num_spans), &align_offset); - span->data.block.align_offset = (uint16_t)align_offset; + span = _memory_map_spans(heap, num_spans); } //Mark span as owned by this heap atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); - span->size_class = SIZE_CLASS_COUNT + (count_t)idx; + span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); //Increase counter _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); @@ -987,7 +972,7 @@ _memory_allocate_heap(void) { //Map in pages for a new heap size_t align_offset = 0; - heap = _memory_map(1 + (sizeof(heap_t) >> _memory_page_size_shift), &align_offset); + heap = _memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); memset(heap, 0, sizeof(heap_t)); heap->align_offset = align_offset; @@ -1030,9 +1015,11 @@ _memory_list_remove(span_t** head, span_t* span) { *head = span->next_span; } else { - if (span->next_span) - span->next_span->prev_span = span->prev_span; - span->prev_span->next_span = span->next_span; + span_t* next_span = span->next_span; + span_t* prev_span = span->prev_span; + if (next_span) + next_span->prev_span = prev_span; + prev_span->next_span = next_span; } } @@ -1041,7 +1028,7 @@ static void _memory_heap_cache_insert(heap_t* heap, span_t* span) { #if MAX_SPAN_CACHE_DIVISOR == 0 (void)sizeof(heap); - _memory_global_cache_insert(span, 1); + const size_t list_size = 1; #else span_t** cache = &heap->span_cache; span->next_span = *cache; @@ -1050,30 +1037,25 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { else span->data.list.size = 1; *cache = span; - //Check if cache exceeds limit -#if MAX_SPAN_CACHE_DIVISOR > 0 - if ((span->data.list.size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && - (span->data.list.size > heap->span_counter.cache_limit)) { -#else - if (span->data.list.size > 65534) { + if (span->data.list.size <= heap->span_counter.cache_limit) + return; + //Release to global cache if exceeding limit + count_t list_size = 1; + span_t* next = span->next_span; + span_t* last = span; + while (list_size < MIN_SPAN_CACHE_RELEASE) { + last = next; + next = next->next_span; + ++list_size; + } + next->data.list.size = span->data.list.size - list_size; + last->next_span = 0; + *cache = next; #endif - //Release to global cache - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - next->data.list.size = span->data.list.size - list_size; - last->next_span = 0; //Terminate list - *cache = next; - _memory_global_cache_insert(span, list_size); + + _memory_global_cache_insert(span, list_size); #if ENABLE_STATISTICS - heap->thread_to_global += list_size * _memory_span_size; -#endif - } + 
heap->thread_to_global += list_size * _memory_span_size; #endif } @@ -1089,16 +1071,12 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { heap->active_block + class_idx : &span->data.block; -#if ENABLE_STATISTICS - heap->allocated -= size_class->size; - heap->requested -= *(size_t*)pointer_offset(p, size_class->size - sizeof(size_t)); -#endif - //Check if the span will become completely free if (block_data->free_count == ((count_t)size_class->block_count - 1)) { //Track counters assert(heap->span_counter.current_allocations > 0); - --heap->span_counter.current_allocations; + if (heap->span_counter.current_allocations) + --heap->span_counter.current_allocations; //If it was active, reset counter. Otherwise, if not active, remove from //partial free list if we had a previous free block (guard for classes with only 1 block) @@ -1107,7 +1085,7 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { else if (block_data->free_count > 0) _memory_list_remove(&heap->size_cache[class_idx], span); - //Add to span cache + //Add to heap span cache _memory_heap_cache_insert(heap, span); return; } @@ -1131,25 +1109,44 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //! Deallocate the given large memory block from the given heap static void _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { - //Check if aliased with 64KiB small/medium spans + //Check if aliased with small/medium spans if (span->size_class == SIZE_CLASS_COUNT) { //Track counters assert(heap->span_counter.current_allocations > 0); - --heap->span_counter.current_allocations; + if (heap->span_counter.current_allocations) + --heap->span_counter.current_allocations; //Add to span cache _memory_heap_cache_insert(heap, span); return; } //Decrease counter - size_t idx = span->size_class - SIZE_CLASS_COUNT; + assert(span->size_class > SIZE_CLASS_COUNT); + size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; + size_t num_spans = idx + 1; + assert((idx > 0) && (idx < LARGE_CLASS_COUNT)); span_counter_t* counter = heap->large_counter + idx; assert(counter->current_allocations > 0); - --counter->current_allocations; + if (counter->current_allocations) + --counter->current_allocations; #if MAX_SPAN_CACHE_DIVISOR == 0 - _memory_global_cache_large_insert(span, 1, idx + 1); + const size_t list_size = 1; #else + if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit)) { + //Break up as single span cache + span_t* master = span; + master->flags = SPAN_FLAG_MASTER | (num_spans << 2); + for (size_t ispan = 1; ispan < num_spans; ++ispan) { + span->next_span = pointer_offset(span, _memory_span_size); + span = span->next_span; + } + span->next_span = 0; + master->data.list.size = num_spans; + heap->span_cache = master; + return; + } + //Insert into cache list span_t** cache = heap->large_cache + idx; span->next_span = *cache; @@ -1158,31 +1155,27 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { else span->data.list.size = 1; *cache = span; -#if MAX_SPAN_CACHE_DIVISOR > 0 - //Check if cache exceeds limit - if ((span->data.list.size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && - (span->data.list.size > counter->cache_limit)) { -#else - if (span->data.list.size > 65534) { + if (span->data.list.size <= counter->cache_limit) + return; + + //Release to global cache if exceeding limit + count_t list_size = 1; + span_t* next = span->next_span; + span_t* last = span; + while (list_size < MIN_SPAN_CACHE_RELEASE) { + last = next; + next = next->next_span; + 
++list_size; + } + assert(next->next_span); + next->data.list.size = span->data.list.size - list_size; + last->next_span = 0; + *cache = next; #endif - //Release to global cache - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - assert(next->next_span); - next->data.list.size = span->data.list.size - list_size; - last->next_span = 0; //Terminate list - *cache = next; - _memory_global_cache_large_insert(span, list_size, idx + 1); + + _memory_global_cache_large_insert(span, list_size, num_spans); #if ENABLE_STATISTICS - heap->thread_to_global += list_size * (idx + 1) * _memory_span_size; -#endif - } + heap->thread_to_global += list_size * num_spans * _memory_span_size; #endif } @@ -1246,7 +1239,7 @@ _memory_allocate(size_t size) { if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; - span_t* span = _memory_map(num_pages, &align_offset); + span_t* span = _memory_map(num_pages * _memory_page_size, &align_offset); atomic_store32(&span->heap_id, 0); //Store page count in next_span span->next_span = (span_t*)((uintptr_t)num_pages); @@ -1278,7 +1271,7 @@ _memory_deallocate(void* p) { else { //Oversized allocation, page count is stored in next_span size_t num_pages = (size_t)span->next_span; - _memory_unmap(span, num_pages, span->data.list.align_offset); + _memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset); } } @@ -1408,6 +1401,13 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { if (config) memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + int default_mapper = 0; + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + default_mapper = 1; + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + _memory_page_size = _memory_config.page_size; if (!_memory_page_size) { #ifdef PLATFORM_WINDOWS @@ -1416,11 +1416,16 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { GetSystemInfo(&system_info); _memory_page_size = system_info.dwPageSize; _memory_map_granularity = system_info.dwAllocationGranularity; + if (default_mapper) + _memory_config.unmap_partial = 0; #else _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); _memory_map_granularity = _memory_page_size; + if (default_mapper) + _memory_config.unmap_partial = 1; #endif } + if (_memory_page_size < 512) _memory_page_size = 512; if (_memory_page_size > (16 * 1024)) @@ -1445,13 +1450,15 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_span_size <<= 1; _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); - _memory_span_pages_single = _memory_span_size >> _memory_page_size_shift; - if (!_memory_span_pages_single) - _memory_span_pages_single = 1; _memory_config.page_size = _memory_page_size; _memory_config.span_size = _memory_span_size; + if (!_memory_config.span_map_count) + _memory_config.span_map_count = DEFAULT_SPAN_MAP_COUNT; + if (_memory_config.span_size * _memory_config.span_map_count < _memory_config.page_size) + _memory_config.span_map_count = (_memory_config.page_size / _memory_config.span_size); + #if defined(__APPLE__) && ENABLE_PRELOAD if (pthread_key_create(&_memory_thread_heap, 0)) return -1; @@ -1497,7 +1504,7 @@ rpmalloc_finalize(void) { size_t span_count = span ? 
span->data.list.size : 0; for (size_t ispan = 0; ispan < span_count; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap(span, _memory_span_pages(1), span->data.list.align_offset); + _memory_unmap_spans(span, 1, span->data.list.align_offset); span = next_span; } @@ -1507,13 +1514,32 @@ rpmalloc_finalize(void) { span = heap->large_cache[iclass]; while (span) { span_t* next_span = span->next_span; - _memory_unmap(span, _memory_span_pages(span_count), span->data.list.align_offset); + _memory_unmap_spans(span, span_count, span->data.list.align_offset); span = next_span; } } + if (heap->spans_reserved) { + //Special handling if we cannot deal with partial unmaps, since the reserved + //spans are unused and cannot be unmapped through _memory_unmap_spans since + //they lack the data in next_span pointer + if (_memory_config.unmap_partial) { + _memory_unmap_spans(heap->span_reserve, heap->spans_reserved, 0); + } + else { + span_t* master = heap->span_reserve_master; + uint32_t remaining = master->flags >> 2; + assert(remaining >= heap->spans_reserved); + remaining -= (uint32_t)heap->spans_reserved; + if (!remaining) + _memory_unmap(master, _memory_span_size * _memory_config.span_map_count, master->data.list.align_offset); + else + master->flags = ((uint16_t)remaining << 2) | SPAN_FLAG_MASTER; + } + } + heap_t* next_heap = heap->next_heap; - _memory_unmap(heap, 1 + (sizeof(heap_t) >> _memory_page_size_shift), heap->align_offset); + _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset); heap = next_heap; } @@ -1530,7 +1556,7 @@ rpmalloc_finalize(void) { unsigned int span_count = span->data.list.size; for (unsigned int ispan = 0; ispan < span_count; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap(span, _memory_span_pages(1), span->data.list.align_offset); + _memory_unmap_spans(span, 1, span->data.list.align_offset); span = next_span; } span = skip_span; @@ -1547,7 +1573,7 @@ rpmalloc_finalize(void) { unsigned int span_count = span->data.list.size; for (unsigned int ispan = 0; ispan < span_count; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap(span, _memory_span_pages(iclass + 1), span->data.list.align_offset); + _memory_unmap_spans(span, iclass + 1, span->data.list.align_offset); span = next_span; } span = skip_span; @@ -1600,7 +1626,7 @@ rpmalloc_thread_finalize(void) { next = next->next_span; ++list_size; } - last->next_span = 0; //Terminate list + last->next_span = 0; next->data.list.size = span->data.list.size - list_size; _memory_global_cache_insert(span, list_size); span = next; @@ -1625,7 +1651,7 @@ rpmalloc_thread_finalize(void) { next = next->next_span; ++list_size; } - last->next_span = 0; //Terminate list + last->next_span = 0; next->data.list.size = span->data.list.size - list_size; _memory_global_cache_large_insert(span, list_size, span_count); span = next; @@ -1702,8 +1728,9 @@ _memory_map_os(size_t size, size_t* offset) { static void _memory_unmap_os(void* address, size_t size, size_t offset) { if (offset) { - size += _memory_span_size; - address = pointer_offset(address, -(offset_t)(offset << 2)); + offset <<= 2; + size += offset; + address = pointer_offset(address, -(offset_t)offset); } #ifdef PLATFORM_WINDOWS (void)sizeof(size); @@ -1781,8 +1808,10 @@ _memory_validate_integrity(void* p) { uint32_t* deadzone = block_start; //If these asserts fire, you have written to memory before the block start for (int i = 0; i < 4; ++i) { - if (deadzone[i] == MAGIC_GUARD) + if (deadzone[i] == 
MAGIC_GUARD) { + deadzone[i] = 0; continue; + } if (_memory_config.memory_overwrite) _memory_config.memory_overwrite(p); else @@ -1792,8 +1821,10 @@ _memory_validate_integrity(void* p) { deadzone = (uint32_t*)pointer_offset(block_start, block_size - 16); //If these asserts fire, you have written to memory after the block end for (int i = 0; i < 4; ++i) { - if (deadzone[i] == MAGIC_GUARD) + if (deadzone[i] == MAGIC_GUARD) { + deadzone[i] = 0; continue; + } if (_memory_config.memory_overwrite) _memory_config.memory_overwrite(p); else @@ -1880,7 +1911,7 @@ void* rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) { #if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > PAGE_SIZE)) { + if ((size + alignment < size) || (alignment > _memory_page_size)) { errno = EINVAL; return 0; } @@ -1918,7 +1949,7 @@ rpaligned_alloc(size_t alignment, size_t size) { return rpmalloc(size); #if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > PAGE_SIZE)) { + if ((size + alignment < size) || (alignment > _memory_page_size)) { errno = EINVAL; return 0; } @@ -1965,10 +1996,6 @@ void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); heap_t* heap = get_thread_heap(); -#if ENABLE_STATISTICS - stats->allocated = heap->allocated; - stats->requested = heap->requested; -#endif void* p = atomic_load_ptr(&heap->defer_deallocate); while (p) { void* next = *(void**)p; @@ -2001,20 +2028,12 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; #endif void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); - while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { - thread_yield(); - global_span_ptr = atomic_load_ptr(&_memory_span_cache); - } uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; size_t list_bytes = global_span_count * _memory_span_size; stats->cached += list_bytes; for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); - while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { - thread_yield(); - global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); - } global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; list_bytes = global_span_count * (iclass + 1) * _memory_span_size; stats->cached_large += list_bytes; diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 2a8f9549..93723b20 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -48,10 +48,6 @@ typedef struct rpmalloc_global_statistics_t { } rpmalloc_global_statistics_t; typedef struct rpmalloc_thread_statistics_t { - //! Amount of memory currently requested in allocations (only if ENABLE_STATISTICS=1) - size_t requested; - //! Amount of memory actually allocated in memory blocks (only if ENABLE_STATISTICS=1) - size_t allocated; //! Current number of bytes available for allocation from active spans size_t active; //! Current number of bytes available in thread size class caches @@ -72,7 +68,9 @@ typedef struct rpmalloc_config_t { // Optionally the function can store an alignment offset in the offset variable // in case it performs alignment and the returned pointer is offset from the // actual start of the memory region due to this alignment. The alignment offset - // will be passed to the memory unmap function. + // will be passed to the memory unmap function. 
The alignment offset MUST NOT be + // larger than 65535 (storable in an uint16_t), if it is you must use natural + // alignment to shift it into 16 bits. void* (*memory_map)(size_t size, size_t* offset); //! Unmap the memory pages starting at address and spanning the given number of bytes. // The address, size and offset variables will always be a value triple as used @@ -85,6 +83,15 @@ typedef struct rpmalloc_config_t { // Set to 0 to use the default span size. All memory mapping requests to memory_map will be made with // size set to a multiple of the span size. size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. This can + // be used to minimize the system call overhead at the cost of virtual memory address + // space. The extra mapped pages will not be written until actually used, so physical + // committed memory should not be affected in the default implementation. + size_t span_map_count; + //! Set to 1 if partial ranges can be unmapped of a mapped span of memory pages (like munmap + // on POSIX systems). Set to 0 if the entire span needs to be unmapped at the same time (like + // VirtualFree with MEM_RELEASE on Windows). + int unmap_partial; //! Debug callback if memory guards are enabled. Called if a memory overwrite is detected void (*memory_overwrite)(void* address); } rpmalloc_config_t; From 0ce90327fa3e20c0dae990229cc7d7d17d908377 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 4 Feb 2018 21:22:34 +0100 Subject: [PATCH 11/42] fix break of superspans only if not already part of a superspan --- rpmalloc/rpmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 57ca6667..42364e49 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -67,7 +67,7 @@ #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 0 +#define ENABLE_ASSERTS 1 #endif #ifndef ENABLE_PRELOAD @@ -77,7 +77,7 @@ #ifndef ENABLE_GUARDS //! Enable overwrite/underwrite guards -#define ENABLE_GUARDS 0 +#define ENABLE_GUARDS 1 #endif // Platform and arch specifics @@ -1133,13 +1133,14 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { #if MAX_SPAN_CACHE_DIVISOR == 0 const size_t list_size = 1; #else - if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit)) { + if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit) && !span->flags) { //Break up as single span cache span_t* master = span; master->flags = SPAN_FLAG_MASTER | (num_spans << 2); for (size_t ispan = 1; ispan < num_spans; ++ispan) { span->next_span = pointer_offset(span, _memory_span_size); span = span->next_span; + span->flags = SPAN_FLAG_SUBSPAN | (ispan << 2); } span->next_span = 0; master->data.list.size = num_spans; From dfb8fbb2dda537ee1299257cf6bb35ac1abbfdf7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 4 Feb 2018 21:24:04 +0100 Subject: [PATCH 12/42] set correct default defines --- rpmalloc/rpmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 42364e49..84004d36 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -67,7 +67,7 @@ #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 1 +#define ENABLE_ASSERTS 0 #endif #ifndef ENABLE_PRELOAD @@ -77,7 +77,7 @@ #ifndef ENABLE_GUARDS //! 
Enable overwrite/underwrite guards -#define ENABLE_GUARDS 1 +#define ENABLE_GUARDS 0 #endif // Platform and arch specifics From 2e6de2d2cf1c19a41fe33d46206010509b37471c Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 4 Feb 2018 23:02:49 +0100 Subject: [PATCH 13/42] reduce cache size of large superspans based on allocation size --- rpmalloc/rpmalloc.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 84004d36..37913247 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -487,17 +487,19 @@ _memory_heap_lookup(int32_t id) { //! Increase an allocation counter static void -_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, uint32_t span_count) { if (++counter->current_allocations > counter->max_allocations) { counter->max_allocations = counter->current_allocations; + const uint32_t cache_limit_max = _memory_span_size - 2; #if MAX_SPAN_CACHE_DIVISOR > 0 counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; - if (counter->cache_limit > (_memory_span_size - 2)) - counter->cache_limit = (_memory_span_size - 2); - if (counter->cache_limit < (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) - counter->cache_limit = (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE); + const uint32_t cache_limit_min = (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) / span_count; + if (counter->cache_limit > cache_limit_max) + counter->cache_limit = cache_limit_max; + if (counter->cache_limit < cache_limit_min) + counter->cache_limit = cache_limit_min; #else - counter->cache_limit = (_memory_span_size - 2); + counter->cache_limit = cache_limit_max; #endif if (counter->max_allocations > *global_counter) *global_counter = counter->max_allocations; @@ -597,7 +599,7 @@ _memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, si uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) + if ((global_list_size >= cache_limit) && (global_list_size > (MIN_SPAN_CACHE_SIZE / span_count))) break; if ((global_list_size + list_size) & _memory_span_mask) break; @@ -811,7 +813,7 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Track counters - _memory_counter_increase(&heap->span_counter, &_memory_max_allocation); + _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1); //Return first block if memory page span return pointer_offset(span, SPAN_HEADER_SIZE); @@ -869,7 +871,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span->size_class = SIZE_CLASS_COUNT; //Track counters - _memory_counter_increase(&heap->span_counter, &_memory_max_allocation); + _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -899,7 +901,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { atomic_thread_fence_release(); //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); + _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -939,7 +941,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span->size_class = 
(uint16_t)(SIZE_CLASS_COUNT + idx); //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); + _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1163,7 +1165,8 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { count_t list_size = 1; span_t* next = span->next_span; span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { + count_t min_list_size = (MIN_SPAN_CACHE_RELEASE / num_spans); + while (list_size < min_list_size) { last = next; next = next->next_span; ++list_size; @@ -1643,11 +1646,11 @@ rpmalloc_thread_finalize(void) { const size_t span_count = iclass + 1; span = heap->large_cache[iclass]; while (span) { - if (span->data.list.size > MIN_SPAN_CACHE_RELEASE) { + if (span->data.list.size > (MIN_SPAN_CACHE_RELEASE / span_count)) { count_t list_size = 1; span_t* next = span->next_span; span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { + while (list_size < (MIN_SPAN_CACHE_RELEASE / span_count)) { last = next; next = next->next_span; ++list_size; From 21f9ff585fef1510c27963f841a63385d345f6a1 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 4 Feb 2018 23:10:43 +0100 Subject: [PATCH 14/42] type cleanups --- rpmalloc/rpmalloc.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 37913247..ea270891 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -218,8 +218,6 @@ static size_t _memory_map_granularity; static size_t _memory_span_size; //! Mask to get to start of a memory span static uintptr_t _memory_span_mask; -//! Number of memory pages in a single span (or 1, if span < page) -static size_t _memory_span_pages; //! Granularity of a small allocation block #define SMALL_GRANULARITY 32 @@ -487,13 +485,13 @@ _memory_heap_lookup(int32_t id) { //! 
Increase an allocation counter static void -_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, uint32_t span_count) { +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { if (++counter->current_allocations > counter->max_allocations) { counter->max_allocations = counter->current_allocations; - const uint32_t cache_limit_max = _memory_span_size - 2; + const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; #if MAX_SPAN_CACHE_DIVISOR > 0 counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; - const uint32_t cache_limit_min = (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) / span_count; + const uint32_t cache_limit_min = (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) / (uint32_t)span_count; if (counter->cache_limit > cache_limit_max) counter->cache_limit = cache_limit_max; if (counter->cache_limit < cache_limit_min) @@ -538,7 +536,7 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { heap->spans_reserved -= num_spans; //set flag in span that it is a subspan with a master span uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) / _memory_span_size); - span->flags = SPAN_FLAG_SUBSPAN | (distance << 2); + span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | (distance << 2)); return span; } @@ -554,7 +552,7 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); heap->span_reserve_master = span; - span->flags = SPAN_FLAG_MASTER | ((uint16_t)request_spans << 2); + span->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)request_spans << 2)); } else { span->flags = 0; @@ -574,7 +572,7 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { uint32_t is_subspan = span->flags & SPAN_FLAG_SUBSPAN; uint32_t is_master = span->flags & SPAN_FLAG_MASTER; - assert((is_subspan || is_master) && !(is_subspan && is_master)); + assert((is_subspan || is_master) && !(is_subspan && is_master)); (void)sizeof(is_master); uint32_t distance = (is_subspan ? (span->flags >> 2) : 0); span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); uint32_t remains = master->flags >> 2; @@ -585,7 +583,7 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { else { assert(remains > num_spans); remains -= (uint32_t)num_spans; - master->flags = ((uint16_t)remains << 2) | SPAN_FLAG_MASTER; + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); } } @@ -627,7 +625,7 @@ _memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, si //! Extract a number of memory page spans from the global cache static span_t* -_memory_cache_extract(atomicptr_t* cache, size_t span_count) { +_memory_cache_extract(atomicptr_t* cache) { span_t* span = 0; atomic_thread_fence_acquire(); void* global_span_ptr = atomic_load_ptr(cache); @@ -672,7 +670,7 @@ _memory_global_cache_insert(span_t* span_list, size_t list_size) { //! Extract a number of memory page spans from the global cache for small/medium blocks static span_t* _memory_global_cache_extract(void) { - return _memory_cache_extract(&_memory_span_cache, 1); + return _memory_cache_extract(&_memory_span_cache); } //! Insert the given list of memory page spans in the global cache for large blocks @@ -690,7 +688,7 @@ _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t sp //! 
Extract a number of memory page spans from the global cache for large blocks static span_t* _memory_global_cache_large_extract(size_t span_count) { - return _memory_cache_extract(&_memory_large_cache[span_count - 1], span_count); + return _memory_cache_extract(&_memory_large_cache[span_count - 1]); } //! Allocate a small/medium sized memory block from the given heap @@ -1138,14 +1136,14 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit) && !span->flags) { //Break up as single span cache span_t* master = span; - master->flags = SPAN_FLAG_MASTER | (num_spans << 2); + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)num_spans << 2)); for (size_t ispan = 1; ispan < num_spans; ++ispan) { span->next_span = pointer_offset(span, _memory_span_size); span = span->next_span; - span->flags = SPAN_FLAG_SUBSPAN | (ispan << 2); + span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | ((uint16_t)ispan << 2)); } span->next_span = 0; - master->data.list.size = num_spans; + master->data.list.size = (uint32_t)num_spans; heap->span_cache = master; return; } @@ -1538,7 +1536,7 @@ rpmalloc_finalize(void) { if (!remaining) _memory_unmap(master, _memory_span_size * _memory_config.span_map_count, master->data.list.align_offset); else - master->flags = ((uint16_t)remaining << 2) | SPAN_FLAG_MASTER; + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remaining << 2)); } } From d766970568e1b8b5d5a33c08d155805f6a87b5d7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 5 Feb 2018 15:17:48 +0100 Subject: [PATCH 15/42] allow decommits as well as unmaps --- rpmalloc/rpmalloc.c | 197 ++++++++++++++++++++++---------------------- rpmalloc/rpmalloc.h | 12 ++- 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index ea270891..2bdef06c 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -24,30 +24,45 @@ #define MIN_SPAN_CACHE_RELEASE 64 #define MAX_SPAN_CACHE_DIVISOR 1 #define MIN_SPAN_CACHE_SIZE 0 +#define MIN_LARGE_SPAN_CACHE_RELEASE 64 +#define MAX_LARGE_SPAN_CACHE_DIVISOR 1 +#define MIN_LARGE_SPAN_CACHE_SIZE 0 #define DEFAULT_SPAN_MAP_COUNT 16 #elif defined(DISABLE_CACHE) //Disable cache #define MIN_SPAN_CACHE_RELEASE 1 #define MAX_SPAN_CACHE_DIVISOR 0 +#define MIN_SPAN_CACHE_SIZE 0 +#define MIN_LARGE_SPAN_CACHE_RELEASE 1 +#define MAX_LARGE_SPAN_CACHE_DIVISOR 0 +#define MIN_LARGE_SPAN_CACHE_SIZE 0 #define DEFAULT_SPAN_MAP_COUNT 1 #elif defined(ENABLE_SPACE_PRIORITY_CACHE) // Space priority cache limits #define MIN_SPAN_CACHE_SIZE 8 #define MIN_SPAN_CACHE_RELEASE 8 #define MAX_SPAN_CACHE_DIVISOR 16 -#define GLOBAL_SPAN_CACHE_MULTIPLIER 1 +#define MIN_LARGE_SPAN_CACHE_SIZE 2 +#define MIN_LARGE_SPAN_CACHE_RELEASE 2 +#define MAX_LARGE_SPAN_CACHE_DIVISOR 32 +#define GLOBAL_CACHE_MULTIPLIER 1 #define DEFAULT_SPAN_MAP_COUNT 4 #else // Default - performance priority cache limits -//! Limit of thread cache in number of spans (undefine for unlimited cache - i.e never release spans to global cache unless thread finishes) //! Minimum cache size to remain after a release to global cache #define MIN_SPAN_CACHE_SIZE 64 //! Minimum number of spans to transfer between thread and global cache -#define MIN_SPAN_CACHE_RELEASE 32 +#define MIN_SPAN_CACHE_RELEASE 16 //! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) #define MAX_SPAN_CACHE_DIVISOR 4 +//! 
Minimum cache size to remain after a release to global cache, large spans +#define MIN_LARGE_SPAN_CACHE_SIZE 8 +//! Minimum number of spans to transfer between thread and global cache, large spans +#define MIN_LARGE_SPAN_CACHE_RELEASE 4 +//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) +#define MAX_LARGE_SPAN_CACHE_DIVISOR 16 //! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) -#define GLOBAL_SPAN_CACHE_MULTIPLIER 8 +#define GLOBAL_CACHE_MULTIPLIER 8 //! Default number of spans to map in call to map more virtual memory #define DEFAULT_SPAN_MAP_COUNT 8 #endif @@ -62,12 +77,12 @@ #ifndef ENABLE_STATISTICS //! Enable statistics collection -#define ENABLE_STATISTICS 0 +#define ENABLE_STATISTICS 1 #endif #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 0 +#define ENABLE_ASSERTS 1 #endif #ifndef ENABLE_PRELOAD @@ -422,8 +437,10 @@ static uint32_t _memory_max_allocation; static uint32_t _memory_max_allocation_large[LARGE_CLASS_COUNT]; #if ENABLE_STATISTICS -//! Total number of mapped memory pages +//! Total number of currently mapped memory pages static atomic32_t _mapped_pages; +//! Total number of currently lost spans +static atomic32_t _reserved_spans; //! Running counter of total number of mapped memory pages since start static atomic32_t _mapped_total; //! Running counter of total number of unmapped memory pages since start @@ -468,7 +485,7 @@ static void* _memory_map_os(size_t size, size_t* offset); static void -_memory_unmap_os(void* address, size_t size, size_t offset); +_memory_unmap_os(void* address, size_t size, size_t offset, int release); static int _memory_deallocate_deferred(heap_t* heap, size_t size_class); @@ -485,17 +502,16 @@ _memory_heap_lookup(int32_t id) { //! Increase an allocation counter static void -_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count, size_t cache_limit_min) { if (++counter->current_allocations > counter->max_allocations) { counter->max_allocations = counter->current_allocations; const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; #if MAX_SPAN_CACHE_DIVISOR > 0 - counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; - const uint32_t cache_limit_min = (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) / (uint32_t)span_count; - if (counter->cache_limit > cache_limit_max) - counter->cache_limit = cache_limit_max; + counter->cache_limit = counter->max_allocations / ((span_count == 1) ? 
MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); if (counter->cache_limit < cache_limit_min) counter->cache_limit = cache_limit_min; + if (counter->cache_limit > cache_limit_max) + counter->cache_limit = cache_limit_max; #else counter->cache_limit = cache_limit_max; #endif @@ -516,7 +532,7 @@ _memory_map(size_t size, size_t* offset) { } static void -_memory_unmap(void* address, size_t size, size_t offset) { +_memory_unmap(void* address, size_t size, size_t offset, int release) { #if ENABLE_STATISTICS const size_t page_count = (size >> _memory_page_size_shift); atomic_add32(&_mapped_pages, -(int32_t)page_count); @@ -524,7 +540,7 @@ _memory_unmap(void* address, size_t size, size_t offset) { #endif assert(!((uintptr_t)address & ~_memory_span_mask)); assert(!(size % _memory_page_size)); - _memory_config.memory_unmap(address, size, offset); + _memory_config.memory_unmap(address, size, offset, release); } //! Map in memory pages for the given number of spans (or use previously reserved pages) @@ -541,77 +557,76 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { } //We cannot request extra spans if we already have some (but not enough) pending reserved spans - size_t request_spans = (heap->spans_reserved || (num_spans > _memory_config.span_map_count)) ? num_spans : _memory_config.span_map_count; + //Also, if given number of spans is more than one we cannot map extra spans as we lose info on + //how many spans is part of the master span + size_t request_spans = (heap->spans_reserved || (num_spans > 1)) ? num_spans : _memory_config.span_map_count; size_t align_offset = 0; span_t* span = _memory_map(_memory_span_size * request_spans, &align_offset); span->data.block.align_offset = (uint16_t)align_offset; - if (request_spans > num_spans) { assert(request_spans == _memory_config.span_map_count); heap->spans_reserved = request_spans - num_spans; heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); heap->span_reserve_master = span; - span->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)request_spans << 2)); + atomic_add32(&_reserved_spans, (int32_t)request_spans); } else { span->flags = 0; } - return span; } //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { - //Check if span is a subspan with a master span or unmap cannot do partial unmappings - if (_memory_config.unmap_partial || !(span->flags & 3)) { - _memory_unmap(span, _memory_span_size * num_spans, align_offset); - return; - } - - uint32_t is_subspan = span->flags & SPAN_FLAG_SUBSPAN; - uint32_t is_master = span->flags & SPAN_FLAG_MASTER; - assert((is_subspan || is_master) && !(is_subspan && is_master)); (void)sizeof(is_master); - uint32_t distance = (is_subspan ? (span->flags >> 2) : 0); - span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); - uint32_t remains = master->flags >> 2; - if (remains <= num_spans) { - assert(remains == num_spans); - _memory_unmap(master, _memory_span_size * _memory_config.span_map_count, master->data.list.align_offset); + if (span->flags) { + uint32_t is_subspan = span->flags & SPAN_FLAG_SUBSPAN; + uint32_t is_master = span->flags & SPAN_FLAG_MASTER; + assert((is_subspan || is_master) && !(is_subspan && is_master)); (void)sizeof(is_master); + uint32_t distance = (is_subspan ? 
(span->flags >> 2) : 0); + span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); + uint32_t remains = master->flags >> 2; + remains = ((uint32_t)num_spans >= remains) ? 0 : (remains - (uint32_t)num_spans); + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + if (is_subspan) + _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 0); + if (!remains) { + _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide + atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); + } } else { - assert(remains > num_spans); - remains -= (uint32_t)num_spans; - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 1); } } +#define CACHE_IN_PROGRESS ((void*)1) + //! Insert the given list of memory page spans in the global cache static void _memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, size_t span_count, size_t cache_limit) { assert((list_size == 1) || (span_list->next_span != 0)); #if MAX_SPAN_CACHE_DIVISOR > 0 - while (1) { + while (cache_limit) { + atomic_thread_fence_acquire(); void* global_span_ptr = atomic_load_ptr(cache); - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - - if ((global_list_size >= cache_limit) && (global_list_size > (MIN_SPAN_CACHE_SIZE / span_count))) - break; - if ((global_list_size + list_size) & _memory_span_mask) - break; + if (global_span_ptr != CACHE_IN_PROGRESS) { + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - span_list->data.list.size = (uint32_t)list_size; - span_list->prev_span = global_span; + if ((global_list_size >= cache_limit) || ((global_list_size + list_size) & _memory_span_mask)) + break; - global_list_size += list_size; - void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); - if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) - return; + span_list->data.list.size = (uint32_t)list_size; + span_list->prev_span = global_span; + global_list_size += list_size; + void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); + if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) + return; + } thread_yield(); - atomic_thread_fence_acquire(); } #endif //Global cache full, release spans @@ -630,7 +645,7 @@ _memory_cache_extract(atomicptr_t* cache) { atomic_thread_fence_acquire(); void* global_span_ptr = atomic_load_ptr(cache); while (global_span_ptr) { - if (atomic_cas_ptr(cache, 0, global_span_ptr)) { + if ((global_span_ptr != CACHE_IN_PROGRESS) && atomic_cas_ptr(cache, CACHE_IN_PROGRESS, global_span_ptr)) { uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); assert((span->data.list.size == 1) || (span->next_span != 0)); @@ -643,10 +658,9 @@ _memory_cache_extract(atomicptr_t* cache) { void* new_global_span_ptr = global_list_size && new_global_span ? 
((void*)((uintptr_t)new_global_span | global_list_size)) : 0; - if (atomic_cas_ptr(cache, new_global_span_ptr, 0)) { - span->prev_span = 0; - break; - } + atomic_store_ptr(cache, new_global_span_ptr); + atomic_thread_fence_release(); + break; } thread_yield(); @@ -659,10 +673,10 @@ _memory_cache_extract(atomicptr_t* cache) { //! Insert the given list of memory page spans in the global cache for small/medium blocks static void _memory_global_cache_insert(span_t* span_list, size_t list_size) { -#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - const size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); +#ifdef GLOBAL_CACHE_MULTIPLIER + const size_t cache_limit = GLOBAL_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); #else - const size_t cache_limit = 0; + const size_t cache_limit = _memory_span_size - 2; #endif _memory_cache_insert(&_memory_span_cache, span_list, list_size, 1, cache_limit); } @@ -677,8 +691,8 @@ _memory_global_cache_extract(void) { static void _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); -#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - const size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); +#ifdef GLOBAL_CACHE_MULTIPLIER + const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation_large[span_count - 1]) / (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); #else const size_t cache_limit = 0; #endif @@ -811,7 +825,7 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Track counters - _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1); + _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1, MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE); //Return first block if memory page span return pointer_offset(span, SPAN_HEADER_SIZE); @@ -869,7 +883,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span->size_class = SIZE_CLASS_COUNT; //Track counters - _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1); + _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1, MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -899,7 +913,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { atomic_thread_fence_release(); //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans); + _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans, MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -939,7 +953,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans); + _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans, MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1130,7 +1144,7 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { if (counter->current_allocations) --counter->current_allocations; -#if MAX_SPAN_CACHE_DIVISOR == 0 +#if MAX_LARGE_SPAN_CACHE_DIVISOR == 0 const size_t list_size = 1; #else if (!heap->span_cache && 
(num_spans <= heap->span_counter.cache_limit) && !span->flags) { @@ -1163,14 +1177,14 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { count_t list_size = 1; span_t* next = span->next_span; span_t* last = span; - count_t min_list_size = (MIN_SPAN_CACHE_RELEASE / num_spans); + count_t min_list_size = MIN_LARGE_SPAN_CACHE_RELEASE; while (list_size < min_list_size) { last = next; next = next->next_span; ++list_size; } - assert(next->next_span); - next->data.list.size = span->data.list.size - list_size; + if (next) + next->data.list.size = span->data.list.size - list_size; last->next_span = 0; *cache = next; #endif @@ -1273,7 +1287,7 @@ _memory_deallocate(void* p) { else { //Oversized allocation, page count is stored in next_span size_t num_pages = (size_t)span->next_span; - _memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset); + _memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset, 1); } } @@ -1418,13 +1432,9 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { GetSystemInfo(&system_info); _memory_page_size = system_info.dwPageSize; _memory_map_granularity = system_info.dwAllocationGranularity; - if (default_mapper) - _memory_config.unmap_partial = 0; #else _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); _memory_map_granularity = _memory_page_size; - if (default_mapper) - _memory_config.unmap_partial = 1; #endif } @@ -1522,26 +1532,15 @@ rpmalloc_finalize(void) { } if (heap->spans_reserved) { - //Special handling if we cannot deal with partial unmaps, since the reserved - //spans are unused and cannot be unmapped through _memory_unmap_spans since - //they lack the data in next_span pointer - if (_memory_config.unmap_partial) { - _memory_unmap_spans(heap->span_reserve, heap->spans_reserved, 0); - } - else { - span_t* master = heap->span_reserve_master; - uint32_t remaining = master->flags >> 2; - assert(remaining >= heap->spans_reserved); - remaining -= (uint32_t)heap->spans_reserved; - if (!remaining) - _memory_unmap(master, _memory_span_size * _memory_config.span_map_count, master->data.list.align_offset); - else - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remaining << 2)); - } + _memory_unmap(heap->span_reserve, _memory_span_size * heap->spans_reserved, 0, 0); + span_t* master = heap->span_reserve_master; + uint32_t remains = master->flags >> 2; + if (remains <= heap->spans_reserved) + _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); } heap_t* next_heap = heap->next_heap; - _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset); + _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1); heap = next_heap; } @@ -1586,6 +1585,11 @@ rpmalloc_finalize(void) { atomic_thread_fence_release(); +#if ENABLE_STATISTICS + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); +#endif + #if defined(__APPLE__) && ENABLE_PRELOAD pthread_key_delete(_memory_thread_heap); #endif @@ -1644,11 +1648,11 @@ rpmalloc_thread_finalize(void) { const size_t span_count = iclass + 1; span = heap->large_cache[iclass]; while (span) { - if (span->data.list.size > (MIN_SPAN_CACHE_RELEASE / span_count)) { + if (span->data.list.size > (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)) { count_t list_size = 1; span_t* next = span->next_span; span_t* last = span; - while (list_size < (MIN_SPAN_CACHE_RELEASE / span_count)) { + while (list_size < 
(MIN_LARGE_SPAN_CACHE_RELEASE / span_count)) { last = next; next = next->next_span; ++list_size; @@ -1728,15 +1732,14 @@ _memory_map_os(size_t size, size_t* offset) { //! Unmap pages from virtual memory static void -_memory_unmap_os(void* address, size_t size, size_t offset) { +_memory_unmap_os(void* address, size_t size, size_t offset, int master) { if (offset) { offset <<= 2; size += offset; address = pointer_offset(address, -(offset_t)offset); } #ifdef PLATFORM_WINDOWS - (void)sizeof(size); - if (!VirtualFree(address, 0, MEM_RELEASE)) { + if (!VirtualFree(address, master ? 0 : size, master ? MEM_RELEASE : MEM_DECOMMIT )) { assert("Failed to unmap virtual memory block" == 0); } #else diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 93723b20..29e3ef79 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -73,9 +73,11 @@ typedef struct rpmalloc_config_t { // alignment to shift it into 16 bits. void* (*memory_map)(size_t size, size_t* offset); //! Unmap the memory pages starting at address and spanning the given number of bytes. - // The address, size and offset variables will always be a value triple as used - // in and returned by an earlier call to memory_map - void (*memory_unmap)(void* address, size_t size, size_t offset); + // If release is set to 1, the unmap is for an entire span range as returned by + // a previous call to memory_map and that the entire range should be released. + // If release is set to 0, the unmap is a partial decommit of a subset of the mapped + // memory range. + void (*memory_unmap)(void* address, size_t size, size_t offset, int release); //! Size of memory pages. If set to 0, rpmalloc will use system calls to determine the page size. // The page size MUST be a power of two in [512,16384] range (2^9 to 2^14). size_t page_size; @@ -88,10 +90,6 @@ typedef struct rpmalloc_config_t { // space. The extra mapped pages will not be written until actually used, so physical // committed memory should not be affected in the default implementation. size_t span_map_count; - //! Set to 1 if partial ranges can be unmapped of a mapped span of memory pages (like munmap - // on POSIX systems). Set to 0 if the entire span needs to be unmapped at the same time (like - // VirtualFree with MEM_RELEASE on Windows). - int unmap_partial; //! Debug callback if memory guards are enabled. 
Called if a memory overwrite is detected void (*memory_overwrite)(void* address); } rpmalloc_config_t; From b29a8aa41aeba1fa5f384f43551ac73719782868 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 5 Feb 2018 20:24:23 +0100 Subject: [PATCH 16/42] add test project to msvs --- build/msvs/rpmalloc.sln | 15 ++- build/msvs/test.vcxproj | 235 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 build/msvs/test.vcxproj diff --git a/build/msvs/rpmalloc.sln b/build/msvs/rpmalloc.sln index 32d1b29b..b806eda3 100644 --- a/build/msvs/rpmalloc.sln +++ b/build/msvs/rpmalloc.sln @@ -1,10 +1,12 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 -VisualStudioVersion = 15.0.26228.4 +VisualStudioVersion = 15.0.27130.2010 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rpmalloc", "rpmalloc.vcxproj", "{65DC4291-954E-4B91-8889-4F3ADCC9D2D5}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test.vcxproj", "{C31980DD-1241-4EF8-A351-69DAF982A7B9}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -21,8 +23,19 @@ Global {65DC4291-954E-4B91-8889-4F3ADCC9D2D5}.Release|x64.Build.0 = Release|x64 {65DC4291-954E-4B91-8889-4F3ADCC9D2D5}.Release|x86.ActiveCfg = Release|Win32 {65DC4291-954E-4B91-8889-4F3ADCC9D2D5}.Release|x86.Build.0 = Release|Win32 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Debug|x64.ActiveCfg = Debug|x64 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Debug|x64.Build.0 = Debug|x64 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Debug|x86.ActiveCfg = Debug|Win32 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Debug|x86.Build.0 = Debug|Win32 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x64.ActiveCfg = Release|x64 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x64.Build.0 = Release|x64 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x86.ActiveCfg = Release|Win32 + {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {50C54715-12C7-4F8C-B7B6-B65A30D91DFF} + EndGlobalSection EndGlobal diff --git a/build/msvs/test.vcxproj b/build/msvs/test.vcxproj new file mode 100644 index 00000000..512edd98 --- /dev/null +++ b/build/msvs/test.vcxproj @@ -0,0 +1,235 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + {65dc4291-954e-4b91-8889-4f3adcc9d2d5} + + + + 15.0 + {C31980DD-1241-4EF8-A351-69DAF982A7B9} + Win32Proj + test + 10.0.16299.0 + + + + Application + true + v141 + Unicode + + + Application + false + v141 + true + Unicode + + + Application + true + v141 + Unicode + + + Application + false + v141 + true + Unicode + + + + + + + + + + + + + + + + + + + + + false + ..\..\bin\windows\release\x86-64\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + false + ..\..\bin\windows\debug\x86\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + false + ..\..\bin\windows\debug\x86-64\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + false + ..\..\bin\windows\release\x86\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + + NotUsing + Level3 + Full + false + true + + + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + false + false + true + AnySuitable + Size + true + true + MultiThreaded + false + true + 
false + Fast + false + false + ..\..\test;..\..\rpmalloc;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + NotUsing + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ProgramDatabase + false + false + true + + + Default + MultiThreaded + false + false + false + false + Fast + false + false + ..\..\test;..\..\rpmalloc;%(AdditionalIncludeDirectories) + + + Console + true + + + + + NotUsing + Level3 + Disabled + + + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ProgramDatabase + false + false + true + Default + MultiThreaded + false + false + false + false + Fast + false + false + ..\..\test;..\..\rpmalloc;%(AdditionalIncludeDirectories) + + + Console + true + + + + + NotUsing + Level3 + Full + false + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + false + false + + + true + AnySuitable + Size + true + true + MultiThreaded + false + true + false + Fast + false + false + ..\..\test;..\..\rpmalloc;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file From 4a1a5ef3c21793ded7c18532bda370ac8d998fe7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 5 Feb 2018 20:24:43 +0100 Subject: [PATCH 17/42] fix reserved span cleanup --- rpmalloc/rpmalloc.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 2bdef06c..8987067b 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -77,12 +77,12 @@ #ifndef ENABLE_STATISTICS //! Enable statistics collection -#define ENABLE_STATISTICS 1 +#define ENABLE_STATISTICS 0 #endif #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 1 +#define ENABLE_ASSERTS 0 #endif #ifndef ENABLE_PRELOAD @@ -136,6 +136,12 @@ #include #if ENABLE_ASSERTS +# ifdef NDEBUG +# undef NDEBUG +# endif +# ifndef _DEBUG +# define _DEBUG +# endif # include #else # undef assert @@ -509,7 +515,7 @@ _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size #if MAX_SPAN_CACHE_DIVISOR > 0 counter->cache_limit = counter->max_allocations / ((span_count == 1) ? 
MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); if (counter->cache_limit < cache_limit_min) - counter->cache_limit = cache_limit_min; + counter->cache_limit = (uint32_t)cache_limit_min; if (counter->cache_limit > cache_limit_max) counter->cache_limit = cache_limit_max; #else @@ -569,7 +575,9 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); heap->span_reserve_master = span; span->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)request_spans << 2)); +#if ENABLE_STATISTICS atomic_add32(&_reserved_spans, (int32_t)request_spans); +#endif } else { span->flags = 0; @@ -593,7 +601,9 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 0); if (!remains) { _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide +#if ENABLE_STATISTICS atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); +#endif } } else { @@ -675,10 +685,11 @@ static void _memory_global_cache_insert(span_t* span_list, size_t list_size) { #ifdef GLOBAL_CACHE_MULTIPLIER const size_t cache_limit = GLOBAL_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); + const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * MIN_SPAN_CACHE_SIZE; #else const size_t cache_limit = _memory_span_size - 2; #endif - _memory_cache_insert(&_memory_span_cache, span_list, list_size, 1, cache_limit); + _memory_cache_insert(&_memory_span_cache, span_list, list_size, 1, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); } //! Extract a number of memory page spans from the global cache for small/medium blocks @@ -693,10 +704,11 @@ _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t sp assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); #ifdef GLOBAL_CACHE_MULTIPLIER const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation_large[span_count - 1]) / (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); + const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * MIN_LARGE_SPAN_CACHE_SIZE; #else - const size_t cache_limit = 0; + const size_t cache_limit = _memory_span_size - 2; #endif - _memory_cache_insert(&_memory_large_cache[span_count - 1], span_list, list_size, span_count, cache_limit); + _memory_cache_insert(&_memory_large_cache[span_count - 1], span_list, list_size, span_count, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); } //! 
Extract a number of memory page spans from the global cache for large blocks @@ -1535,8 +1547,16 @@ rpmalloc_finalize(void) { _memory_unmap(heap->span_reserve, _memory_span_size * heap->spans_reserved, 0, 0); span_t* master = heap->span_reserve_master; uint32_t remains = master->flags >> 2; - if (remains <= heap->spans_reserved) - _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); + if (remains <= heap->spans_reserved) { + _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); +#endif + } + else { + remains -= (uint32_t)heap->spans_reserved; + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + } } heap_t* next_heap = heap->next_heap; @@ -1658,7 +1678,8 @@ rpmalloc_thread_finalize(void) { ++list_size; } last->next_span = 0; - next->data.list.size = span->data.list.size - list_size; + if (next) + next->data.list.size = span->data.list.size - list_size; _memory_global_cache_large_insert(span, list_size, span_count); span = next; } From 9fc41b8ff2a6620a313a6d4ca290977f9ec0f764 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 5 Feb 2018 23:39:06 +0100 Subject: [PATCH 18/42] unify shifts and masks --- rpmalloc/rpmalloc.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 8987067b..c0f70a15 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -232,11 +232,13 @@ thread_yield(void); static size_t _memory_page_size; //! Shift to divide by page size static size_t _memory_page_size_shift; -//! Granularity at which memor pages are mapped by OS +//! Granularity at which memory pages are mapped by OS static size_t _memory_map_granularity; //! Size of a span of memory pages static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; //! 
Mask to get to start of a memory span static uintptr_t _memory_span_mask; @@ -557,7 +559,7 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); heap->spans_reserved -= num_spans; //set flag in span that it is a subspan with a master span - uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) / _memory_span_size); + uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | (distance << 2)); return span; } @@ -850,8 +852,8 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Since this function is never called if size > LARGE_SIZE_LIMIT //the num_spans is guaranteed to be <= LARGE_CLASS_COUNT size += SPAN_HEADER_SIZE; - size_t num_spans = size / _memory_span_size; - if (size & ~_memory_span_mask) + size_t num_spans = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) ++num_spans; size_t idx = num_spans - 1; @@ -1159,20 +1161,23 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { #if MAX_LARGE_SPAN_CACHE_DIVISOR == 0 const size_t list_size = 1; #else - if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit) && !span->flags) { + /* TODO: Once requirement that master span is one page and keeps track of how + many spans are part of the superspan, reenable this */ + /*if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit) && !span->flags) { //Break up as single span cache span_t* master = span; master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)num_spans << 2)); for (size_t ispan = 1; ispan < num_spans; ++ispan) { span->next_span = pointer_offset(span, _memory_span_size); span = span->next_span; + span->data.list.align_offset = 0; span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | ((uint16_t)ispan << 2)); } span->next_span = 0; master->data.list.size = (uint32_t)num_spans; heap->span_cache = master; return; - } + }*/ //Insert into cache list span_t** cache = heap->large_cache + idx; @@ -1322,8 +1327,8 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { else { //Large block size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size / _memory_span_size; - if (total_size & ~_memory_span_mask) + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) ++num_spans; size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) @@ -1350,7 +1355,7 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { //Size is greater than block size, need to allocate a new block and deallocate the old //Avoid hysteresis by overallocating if increase is small (below 37%) size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); - void* block = _memory_allocate(size > lower_bound ? size : lower_bound); + void* block = _memory_allocate((size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size)); if (p) { if (!(flags & RPMALLOC_NO_PRESERVE)) memcpy(block, p, oldsize < size ? 
oldsize : size); @@ -1461,7 +1466,6 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { ++_memory_page_size_shift; page_size_bit >>= 1; } - _memory_page_size = ((size_t)1 << _memory_page_size_shift); size_t span_size = _memory_config.span_size; @@ -1470,9 +1474,11 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { if (span_size > (256 * 1024)) span_size = (256 * 1024); _memory_span_size = 512; - while (_memory_span_size < span_size) + _memory_span_size_shift = 9; + while (_memory_span_size < span_size) { _memory_span_size <<= 1; - + ++_memory_span_size_shift; + } _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); _memory_config.page_size = _memory_page_size; @@ -2054,12 +2060,22 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; #endif void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); + while (global_span_ptr == CACHE_IN_PROGRESS) { + thread_yield(); + atomic_thread_fence_acquire(); + global_span_ptr = atomic_load_ptr(&_memory_span_cache); + } uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; size_t list_bytes = global_span_count * _memory_span_size; stats->cached += list_bytes; for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + while (global_span_ptr == CACHE_IN_PROGRESS) { + thread_yield(); + atomic_thread_fence_acquire(); + global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + } global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; list_bytes = global_span_count * (iclass + 1) * _memory_span_size; stats->cached_large += list_bytes; From 69175b647298200c3cb234756ca86b1b6dc40478 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 6 Feb 2018 15:28:33 +0100 Subject: [PATCH 19/42] source cleanup and unification --- rpmalloc/rpmalloc.c | 243 ++++++++++++++++++++------------------------ 1 file changed, 111 insertions(+), 132 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index c0f70a15..5092a2d7 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -77,12 +77,12 @@ #ifndef ENABLE_STATISTICS //! Enable statistics collection -#define ENABLE_STATISTICS 0 +#define ENABLE_STATISTICS 1 #endif #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 0 +#define ENABLE_ASSERTS 1 #endif #ifndef ENABLE_PRELOAD @@ -92,7 +92,7 @@ #ifndef ENABLE_GUARDS //! Enable overwrite/underwrite guards -#define ENABLE_GUARDS 0 +#define ENABLE_GUARDS 1 #endif // Platform and arch specifics @@ -561,6 +561,7 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { //set flag in span that it is a subspan with a master span uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | (distance << 2)); + span->data.block.align_offset = 0; return span; } @@ -590,26 +591,30 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { - if (span->flags) { - uint32_t is_subspan = span->flags & SPAN_FLAG_SUBSPAN; - uint32_t is_master = span->flags & SPAN_FLAG_MASTER; - assert((is_subspan || is_master) && !(is_subspan && is_master)); (void)sizeof(is_master); - uint32_t distance = (is_subspan ? 
(span->flags >> 2) : 0); - span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); - uint32_t remains = master->flags >> 2; - remains = ((uint32_t)num_spans >= remains) ? 0 : (remains - (uint32_t)num_spans); - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); - if (is_subspan) - _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 0); - if (!remains) { - _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide + if (!span->flags) { + _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 1); + return; + } + + uint32_t is_master = (span->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (pointer_offset(span, -(int)(span->flags >> 2) * (int)_memory_span_size)); + uint32_t remains = master->flags >> 2; + + assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); + assert((master->flags & SPAN_FLAG_MASTER) && !(master->flags & SPAN_FLAG_SUBSPAN)); + assert(remains >= num_spans); + + remains = ((uint32_t)num_spans >= remains) ? 0 : (remains - (uint32_t)num_spans); + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + if (!is_master) { + assert(span->data.list.align_offset == 0); + _memory_unmap(span, _memory_span_size * num_spans, 0, 0); + } + if (!remains) { + _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); + atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); #endif - } - } - else { - _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 1); } } @@ -690,6 +695,7 @@ _memory_global_cache_insert(span_t* span_list, size_t list_size) { const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * MIN_SPAN_CACHE_SIZE; #else const size_t cache_limit = _memory_span_size - 2; + const size_t cache_limit_min = cache_limit; #endif _memory_cache_insert(&_memory_span_cache, span_list, list_size, 1, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); } @@ -709,6 +715,7 @@ _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t sp const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * MIN_LARGE_SPAN_CACHE_SIZE; #else const size_t cache_limit = _memory_span_size - 2; + const size_t cache_limit_min = cache_limit; #endif _memory_cache_insert(&_memory_large_cache[span_count - 1], span_list, list_size, span_count, cache_limit > cache_limit_min ? 
cache_limit : cache_limit_min); } @@ -1372,11 +1379,9 @@ _memory_usable_size(void* p) { span_t* span = (void*)((uintptr_t)p & _memory_span_mask); int32_t heap_id = atomic_load32(&span->heap_id); if (heap_id) { - if (span->size_class < SIZE_CLASS_COUNT) { - //Small/medium block - size_class_t* size_class = _memory_size_class + span->size_class; - return size_class->size; - } + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) + return _memory_size_class[span->size_class].size; //Large block size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; @@ -1550,17 +1555,23 @@ rpmalloc_finalize(void) { } if (heap->spans_reserved) { - _memory_unmap(heap->span_reserve, _memory_span_size * heap->spans_reserved, 0, 0); + span_t* span = heap->span_reserve; span_t* master = heap->span_reserve_master; uint32_t remains = master->flags >> 2; - if (remains <= heap->spans_reserved) { + + assert(master != span); + assert(remains >= heap->spans_reserved); + + remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved); + assert(span->data.list.align_offset == 0); + _memory_unmap(span, _memory_span_size * heap->spans_reserved, 0, 0); + if (!remains) { _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide #if ENABLE_STATISTICS atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); #endif } else { - remains -= (uint32_t)heap->spans_reserved; master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); } } @@ -1725,20 +1736,16 @@ rpmalloc_config(void) { //! Map new pages to virtual memory static void* _memory_map_os(size_t size, size_t* offset) { - void* ptr; - size_t padding = 0; - - if (_memory_span_size > _memory_map_granularity) - padding = _memory_span_size; + size_t padding = (_memory_span_size > _memory_map_granularity) ? _memory_span_size : 0; #ifdef PLATFORM_WINDOWS - ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + void* ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (!ptr) { assert("Failed to map virtual memory block" == 0); return 0; } #else - ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); if (ptr == MAP_FAILED) { assert("Failed to map virtual memory block" == 0); return 0; @@ -1759,14 +1766,15 @@ _memory_map_os(size_t size, size_t* offset) { //! Unmap pages from virtual memory static void -_memory_unmap_os(void* address, size_t size, size_t offset, int master) { +_memory_unmap_os(void* address, size_t size, size_t offset, int release) { + assert(release || (offset == 0)); if (offset) { offset <<= 2; size += offset; address = pointer_offset(address, -(offset_t)offset); } #ifdef PLATFORM_WINDOWS - if (!VirtualFree(address, master ? 0 : size, master ? MEM_RELEASE : MEM_DECOMMIT )) { + if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { assert("Failed to unmap virtual memory block" == 0); } #else @@ -1786,36 +1794,9 @@ thread_yield(void) { #endif } -// Extern interface - -RPMALLOC_RESTRICT void* -rpmalloc(size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif -#if ENABLE_GUARDS - size += 32; -#endif - void* block = _memory_allocate(size); -#if ENABLE_GUARDS - if (block) { - size_t block_size = _memory_usable_size(block); - uint32_t* deadzone = block; - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - deadzone = (uint32_t*)pointer_offset(block, block_size - 16); - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - block = pointer_offset(block, 16); - } -#endif - return block; -} - #if ENABLE_GUARDS static void -_memory_validate_integrity(void* p) { +_memory_guard_validate(void* p) { if (!p) return; void* block_start; @@ -1828,49 +1809,81 @@ _memory_validate_integrity(void* p) { size_class_t* size_class = _memory_size_class + span->size_class; count_t block_offset = (count_t)pointer_diff(p, span_blocks_start); count_t block_idx = block_offset / (count_t)size_class->size; - block_start = pointer_offset(span_blocks_start, block_idx * size_class->size); - } - else { + block_start = pointer_offset(span_blocks_start, block_idx * size_class->size); + } + else { block_start = pointer_offset(span, SPAN_HEADER_SIZE); - } - } + } + } else { block_start = pointer_offset(span, SPAN_HEADER_SIZE); } uint32_t* deadzone = block_start; //If these asserts fire, you have written to memory before the block start for (int i = 0; i < 4; ++i) { - if (deadzone[i] == MAGIC_GUARD) { - deadzone[i] = 0; - continue; + if (deadzone[i] != MAGIC_GUARD) { + if (_memory_config.memory_overwrite) + _memory_config.memory_overwrite(p); + else + assert("Memory overwrite before block start" == 0); + return; } - if (_memory_config.memory_overwrite) - _memory_config.memory_overwrite(p); - else - assert(deadzone[i] == MAGIC_GUARD && "Memory overwrite before block start"); - return; + deadzone[i] = 0; } deadzone = (uint32_t*)pointer_offset(block_start, block_size - 16); //If these asserts fire, you have written to memory after the block end for (int i = 0; i < 4; ++i) { - if (deadzone[i] == MAGIC_GUARD) { - deadzone[i] = 0; - continue; + if (deadzone[i] != MAGIC_GUARD) { + if (_memory_config.memory_overwrite) + _memory_config.memory_overwrite(p); + else + assert("Memory overwrite after block end" == 0); + return; } - if (_memory_config.memory_overwrite) - _memory_config.memory_overwrite(p); - else - assert(deadzone[i] == MAGIC_GUARD && "Memory overwrite after block end"); - return; + deadzone[i] = 0; } } +#else +#define _memory_guard_validate(block) #endif -void -rpfree(void* ptr) { #if ENABLE_GUARDS - _memory_validate_integrity(ptr); +static void +_memory_guard_block(void* block, size_t size) { + if (block) { + size_t block_size = _memory_usable_size(block); + uint32_t* deadzone = block; + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; + deadzone = (uint32_t*)pointer_offset(block, block_size - 16); + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; + } +} +#define _memory_guard_pre_alloc(size) size += 32 +#define _memory_guard_post_alloc(block, size) size = _memory_guard_block(block, size); block = pointer_offset(block, 16); size -= 32 +#else +#define _memory_guard_pre_alloc(size) +#define _memory_guard_post_alloc(block, size) #endif + +// Extern interface + +RPMALLOC_RESTRICT void* 
+rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + _memory_guard_pre_alloc(size); + void* block = _memory_allocate(size); + _memory_guard_post_alloc(block, size); + return block; +} + +void +rpfree(void* ptr) { + _memory_guard_validate(ptr); _memory_deallocate(ptr); } @@ -1894,21 +1907,9 @@ rpcalloc(size_t num, size_t size) { #else total = num * size; #endif -#if ENABLE_GUARDS - total += 32; -#endif + _memory_guard_pre_alloc(total); void* block = _memory_allocate(total); -#if ENABLE_GUARDS - if (block) { - size_t block_size = _memory_usable_size(block); - uint32_t* deadzone = block; - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - deadzone = (uint32_t*)pointer_offset(block, block_size - 16); - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - block = pointer_offset(block, 16); - total -= 32; - } -#endif + _memory_guard_post_alloc(block, total); memset(block, 0, total); return block; } @@ -1921,21 +1922,10 @@ rprealloc(void* ptr, size_t size) { return ptr; } #endif -#if ENABLE_GUARDS - _memory_validate_integrity(ptr); - size += 32; -#endif + _memory_guard_validate(ptr); + _memory_guard_pre_alloc(size); void* block = _memory_reallocate(ptr, size, 0, 0); -#if ENABLE_GUARDS - if (block) { - size_t block_size = _memory_usable_size(block); - uint32_t* deadzone = block; - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - deadzone = (uint32_t*)pointer_offset(block, block_size - 16); - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - block = pointer_offset(block, 16); - } -#endif + _memory_guard_post_alloc(block, size); return block; } @@ -1956,21 +1946,10 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, rpfree(ptr); } else { -#if ENABLE_GUARDS - _memory_validate_integrity(ptr); - size += 32; -#endif + _memory_guard_validate(ptr); + _memory_guard_pre_alloc(size); block = _memory_reallocate(ptr, size, oldsize, flags); -#if ENABLE_GUARDS - if (block) { - size_t block_size = _memory_usable_size(block); - uint32_t* deadzone = block; - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - deadzone = (uint32_t*)pointer_offset(block, block_size - 16); - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - block = pointer_offset(block, 16); - } -#endif + _memory_guard_post_alloc(block, size); } return block; } From f864e394d45d6e1f5c34cfaf2cbf7620853ce2b3 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 6 Feb 2018 17:49:19 +0100 Subject: [PATCH 20/42] fix guards --- rpmalloc/rpmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 5092a2d7..06a3b1d2 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -77,12 +77,12 @@ #ifndef ENABLE_STATISTICS //! Enable statistics collection -#define ENABLE_STATISTICS 1 +#define ENABLE_STATISTICS 0 #endif #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 1 +#define ENABLE_ASSERTS 0 #endif #ifndef ENABLE_PRELOAD @@ -92,7 +92,7 @@ #ifndef ENABLE_GUARDS //! 
Enable overwrite/underwrite guards -#define ENABLE_GUARDS 1 +#define ENABLE_GUARDS 0 #endif // Platform and arch specifics @@ -1859,7 +1859,7 @@ _memory_guard_block(void* block, size_t size) { } } #define _memory_guard_pre_alloc(size) size += 32 -#define _memory_guard_post_alloc(block, size) size = _memory_guard_block(block, size); block = pointer_offset(block, 16); size -= 32 +#define _memory_guard_post_alloc(block, size) _memory_guard_block(block, size); block = pointer_offset(block, 16); size -= 32 #else #define _memory_guard_pre_alloc(size) #define _memory_guard_post_alloc(block, size) From 0db3a173234d3aa1e66259246b500e2fd3ee7564 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 6 Feb 2018 22:07:36 +0100 Subject: [PATCH 21/42] unify code between span counts --- rpmalloc/rpmalloc.c | 531 +++++++++++++++----------------------------- rpmalloc/rpmalloc.h | 2 - 2 files changed, 178 insertions(+), 355 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 06a3b1d2..17b1f28f 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -374,19 +374,15 @@ struct heap_t { //! List of semi-used spans with free blocks for each size class (double linked list) span_t* size_cache[SIZE_CLASS_COUNT]; //! List of free spans (single linked list) - span_t* span_cache; + span_t* span_cache[LARGE_CLASS_COUNT]; //! Allocation counters - span_counter_t span_counter; + span_counter_t span_counter[LARGE_CLASS_COUNT]; //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans span_t* span_reserve_master; //! Number of mapped but unused spans size_t spans_reserved; - //! List of free spans for each large class count (single linked list) - span_t* large_cache[LARGE_CLASS_COUNT]; - //! Allocation counters for large blocks - span_counter_t large_counter[LARGE_CLASS_COUNT]; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list @@ -421,10 +417,7 @@ static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; static atomic32_t _memory_heap_id; //! Global span cache -static atomicptr_t _memory_span_cache; - -//! Global large cache -static atomicptr_t _memory_large_cache[LARGE_CLASS_COUNT]; +static atomicptr_t _memory_span_cache[LARGE_CLASS_COUNT]; //! All heaps static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; @@ -439,10 +432,7 @@ static atomic32_t _memory_orphan_counter; static atomic32_t _memory_active_heaps; //! Adaptive cache max allocation count -static uint32_t _memory_max_allocation; - -//! Adaptive cache max allocation count -static uint32_t _memory_max_allocation_large[LARGE_CLASS_COUNT]; +static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; #if ENABLE_STATISTICS //! Total number of currently mapped memory pages @@ -510,14 +500,15 @@ _memory_heap_lookup(int32_t id) { //! Increase an allocation counter static void -_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count, size_t cache_limit_min) { +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { if (++counter->current_allocations > counter->max_allocations) { counter->max_allocations = counter->current_allocations; const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; #if MAX_SPAN_CACHE_DIVISOR > 0 counter->cache_limit = counter->max_allocations / ((span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); + const uint32_t cache_limit_min = (span_count == 1) ? 
(MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) : (MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); if (counter->cache_limit < cache_limit_min) - counter->cache_limit = (uint32_t)cache_limit_min; + counter->cache_limit = cache_limit_min; if (counter->cache_limit > cache_limit_max) counter->cache_limit = cache_limit_max; #else @@ -554,7 +545,7 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { //! Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* _memory_map_spans(heap_t* heap, size_t num_spans) { - if (num_spans <= heap->spans_reserved) { + if ((num_spans == 1) && num_spans <= heap->spans_reserved) { span_t* span = heap->span_reserve; heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); heap->spans_reserved -= num_spans; @@ -596,6 +587,7 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { return; } + assert(num_spans == 1); uint32_t is_master = (span->flags & SPAN_FLAG_MASTER); span_t* master = is_master ? span : (pointer_offset(span, -(int)(span->flags >> 2) * (int)_memory_span_size)); uint32_t remains = master->flags >> 2; @@ -605,7 +597,6 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { assert(remains >= num_spans); remains = ((uint32_t)num_spans >= remains) ? 0 : (remains - (uint32_t)num_spans); - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); if (!is_master) { assert(span->data.list.align_offset == 0); _memory_unmap(span, _memory_span_size * num_spans, 0, 0); @@ -616,14 +607,90 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); #endif } + else { + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + } +} + +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(span_t* span, size_t span_count) { + size_t list_size = span ? span->data.list.size : 0; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap_spans(span, span_count, span->data.list.align_offset); + span = next_span; + } +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_add(span_t** head, span_t* span) { + span->next_span = *head; + if (*head) + span->data.list.size = (*head)->data.list.size + 1; + else + span->data.list.size = 1; + *head = span; + return span->data.list.size; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (span->data.list.size > limit) { + count_t list_size = 1; + span_t* last = span; + next = span->next_span; + while (list_size < limit) { + last = next; + next = next->next_span; + ++list_size; + } + last->next_span = 0; + if (next) + next->data.list.size = span->data.list.size - list_size; + span->data.list.size = list_size; + span->prev_span = 0; + } + return next; +} + +//! Add a span to a double linked list +static void +_memory_span_list_doublelink_add(span_t** head, span_t* span) { + if (*head) { + (*head)->prev_span = span; + span->next_span = *head; + } + else { + span->next_span = 0; + } + *head = span; +} + +//! 
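
// A minimal, self-contained sketch of the list-split idea used by
// _memory_span_list_split above: keep the first 'limit' nodes on the original
// list and return the remainder as a new list. demo_node_t and demo_list_split
// are illustrative stand-ins (not rpmalloc types); as in the code above, only
// the head node of a list is assumed to carry the list size, and limit >= 1.
#include <stddef.h>

typedef struct demo_node_t {
	struct demo_node_t* next;
	size_t size; //list size, meaningful on the head node only
} demo_node_t;

static demo_node_t*
demo_list_split(demo_node_t* head, size_t limit) {
	if (!head || (head->size <= limit))
		return 0; //nothing to split off
	size_t kept = 1;
	demo_node_t* last = head;
	while (kept < limit) {
		last = last->next;
		++kept;
	}
	demo_node_t* rest = last->next; //non-null, since head->size > limit
	last->next = 0;
	rest->size = head->size - kept;
	head->size = kept;
	return rest;
}

//!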
Remove a span from a double linked list +static void +_memory_span_list_doublelink_remove(span_t** head, span_t* span) { + if (*head == span) { + *head = span->next_span; + } + else { + span_t* next_span = span->next_span; + span_t* prev_span = span->prev_span; + if (next_span) + next_span->prev_span = prev_span; + prev_span->next_span = next_span; + } } #define CACHE_IN_PROGRESS ((void*)1) //! Insert the given list of memory page spans in the global cache static void -_memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, size_t span_count, size_t cache_limit) { - assert((list_size == 1) || (span_list->next_span != 0)); +_memory_cache_insert(atomicptr_t* cache, span_t* span, size_t span_count, size_t cache_limit) { + assert((span->data.list.size == 1) || (span->next_span != 0)); #if MAX_SPAN_CACHE_DIVISOR > 0 while (cache_limit) { atomic_thread_fence_acquire(); @@ -632,14 +699,12 @@ _memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, si uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - if ((global_list_size >= cache_limit) || ((global_list_size + list_size) & _memory_span_mask)) + global_list_size += span->data.list.size; + if ((global_list_size >= cache_limit) || (global_list_size & _memory_span_mask)) break; - span_list->data.list.size = (uint32_t)list_size; - span_list->prev_span = global_span; - - global_list_size += list_size; - void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); + span->prev_span = global_span; + void* new_global_span_ptr = (void*)((uintptr_t)span | global_list_size); if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) return; } @@ -647,12 +712,7 @@ _memory_cache_insert(atomicptr_t* cache, span_t* span_list, size_t list_size, si } #endif //Global cache full, release spans - for (size_t ispan = 0; ispan < list_size; ++ispan) { - assert(span_list); - span_t* next_span = span_list->next_span; - _memory_unmap_spans(span_list, span_count, span_list->data.list.align_offset); - span_list = next_span; - } + _memory_unmap_span_list(span, span_count); } //! Extract a number of memory page spans from the global cache @@ -663,7 +723,7 @@ _memory_cache_extract(atomicptr_t* cache) { void* global_span_ptr = atomic_load_ptr(cache); while (global_span_ptr) { if ((global_span_ptr != CACHE_IN_PROGRESS) && atomic_cas_ptr(cache, CACHE_IN_PROGRESS, global_span_ptr)) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + /*uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); assert((span->data.list.size == 1) || (span->next_span != 0)); assert(span->data.list.size <= global_list_size); @@ -676,6 +736,8 @@ _memory_cache_extract(atomicptr_t* cache) { ((void*)((uintptr_t)new_global_span | global_list_size)) : 0; atomic_store_ptr(cache, new_global_span_ptr); + atomic_thread_fence_release();*/ + atomic_store_ptr(cache, global_span_ptr); atomic_thread_fence_release(); break; } @@ -687,43 +749,37 @@ _memory_cache_extract(atomicptr_t* cache) { return span; } -//! Insert the given list of memory page spans in the global cache for small/medium blocks +//! 
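
// A small sketch of the pointer-tagging scheme the global cache code above
// relies on: span pointers are aligned to the span size, so the low bits of a
// single atomic word are free to carry the cached list size. DEMO_SPAN_SIZE and
// the demo_cache_* names are illustrative assumptions, not rpmalloc's
// configuration; the real code additionally loops with compare-and-swap.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define DEMO_SPAN_SIZE ((uintptr_t)64 * 1024)
#define DEMO_SPAN_MASK (~(DEMO_SPAN_SIZE - 1))

static void*
demo_cache_pack(void* span, size_t list_size) {
	assert(((uintptr_t)span & ~DEMO_SPAN_MASK) == 0); //span-size aligned
	assert((uintptr_t)list_size < DEMO_SPAN_SIZE);    //size fits in the low bits
	return (void*)((uintptr_t)span | (uintptr_t)list_size);
}

static void*
demo_cache_unpack(void* packed, size_t* list_size) {
	*list_size = (size_t)((uintptr_t)packed & ~DEMO_SPAN_MASK);
	return (void*)((uintptr_t)packed & DEMO_SPAN_MASK);
}

//!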
Finalize a global cache static void -_memory_global_cache_insert(span_t* span_list, size_t list_size) { -#ifdef GLOBAL_CACHE_MULTIPLIER - const size_t cache_limit = GLOBAL_CACHE_MULTIPLIER * (_memory_max_allocation / MAX_SPAN_CACHE_DIVISOR); - const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * MIN_SPAN_CACHE_SIZE; -#else - const size_t cache_limit = _memory_span_size - 2; - const size_t cache_limit_min = cache_limit; -#endif - _memory_cache_insert(&_memory_span_cache, span_list, list_size, 1, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); -} - -//! Extract a number of memory page spans from the global cache for small/medium blocks -static span_t* -_memory_global_cache_extract(void) { - return _memory_cache_extract(&_memory_span_cache); +_memory_cache_finalize(atomicptr_t* cache, size_t span_count) { + void* span_ptr = atomic_load_ptr(cache); + span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & _memory_span_mask)); + while (span) { + span_t* skip_span = span->prev_span; + _memory_unmap_span_list(span, span_count); + span = skip_span; + } + atomic_store_ptr(cache, 0); } -//! Insert the given list of memory page spans in the global cache for large blocks +//! Insert the given list of memory page spans in the global cache static void -_memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { - assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); +_memory_global_cache_insert(span_t* span_list, size_t span_count) { #ifdef GLOBAL_CACHE_MULTIPLIER - const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation_large[span_count - 1]) / (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); - const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * MIN_LARGE_SPAN_CACHE_SIZE; + const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); + const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; + const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); #else const size_t cache_limit = _memory_span_size - 2; const size_t cache_limit_min = cache_limit; #endif - _memory_cache_insert(&_memory_large_cache[span_count - 1], span_list, list_size, span_count, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); + _memory_cache_insert(&_memory_span_cache[span_count - 1], span_list, span_count, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); } //! Extract a number of memory page spans from the global cache for large blocks static span_t* -_memory_global_cache_large_extract(size_t span_count) { - return _memory_cache_extract(&_memory_large_cache[span_count - 1]); +_memory_global_cache_extract(size_t span_count) { + return _memory_cache_extract(&_memory_span_cache[span_count - 1]); } //! 
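
// A minimal sketch of how the adaptive cache limits used above are derived:
// the limit follows the high-water mark of live allocations divided by a
// divisor, clamped to a configured minimum and maximum. The function name and
// the example arguments are illustrative, not the exact values rpmalloc uses.
#include <stdint.h>

static uint32_t
demo_cache_limit(uint32_t max_allocations, uint32_t divisor,
                 uint32_t limit_min, uint32_t limit_max) {
	uint32_t limit = max_allocations / divisor;
	if (limit < limit_min)
		limit = limit_min;
	if (limit > limit_max)
		limit = limit_max;
	return limit;
}
// Example (hypothetical numbers): demo_cache_limit(1000, 4, 16 + 16, 65534)
// yields 250, i.e. at most 250 cached spans of that class are kept.

//!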
Allocate a small/medium sized memory block from the given heap @@ -799,10 +855,10 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Step 4: No semi-used span available, try grab a span from the thread cache - span_t* span = heap->span_cache; + span_t* span = heap->span_cache[0]; if (!span) { //Step 5: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(); + span = _memory_global_cache_extract(1); #if ENABLE_STATISTICS if (span) heap->global_to_thread += (size_t)span->data.list.size * _memory_span_size; @@ -814,10 +870,10 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { span_t* next_span = span->next_span; assert(next_span); next_span->data.list.size = span->data.list.size - 1; - heap->span_cache = next_span; + heap->span_cache[0] = next_span; } else { - heap->span_cache = 0; + heap->span_cache[0] = 0; } } else { @@ -846,7 +902,7 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Track counters - _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1, MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE); + _memory_counter_increase(&heap->span_counter[0], &_memory_max_allocation[0], 1); //Return first block if memory page span return pointer_offset(span, SPAN_HEADER_SIZE); @@ -857,74 +913,28 @@ static void* _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Calculate number of needed max sized spans (including header) //Since this function is never called if size > LARGE_SIZE_LIMIT - //the num_spans is guaranteed to be <= LARGE_CLASS_COUNT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT size += SPAN_HEADER_SIZE; - size_t num_spans = size >> _memory_span_size_shift; + size_t span_count = size >> _memory_span_size_shift; if (size & (_memory_span_size - 1)) - ++num_spans; - size_t idx = num_spans - 1; - - if (!idx) { - //Shared with medium/small spans - //Step 1: Check span cache - span_t* span = heap->span_cache; - if (!span) { - _memory_deallocate_deferred(heap, 0); - span = heap->span_cache; - } - if (!span) { - //Step 2: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(); -#if ENABLE_STATISTICS - if (span) - heap->global_to_thread += (size_t)span->data.list.size * _memory_span_size; -#endif - } - if (span) { - if (span->data.list.size > 1) { - //We got a list of spans, we will use first as active and store remainder in thread cache - span_t* next_span = span->next_span; - assert(next_span); - next_span->data.list.size = span->data.list.size - 1; - heap->span_cache = next_span; - } - else { - heap->span_cache = 0; - } - } - else { - //Step 3: All caches empty, map in new memory pages - span = _memory_map_spans(heap, 1); - } - - //Mark span as owned by this heap and set base data - atomic_store32(&span->heap_id, heap->id); - atomic_thread_fence_release(); - - span->size_class = SIZE_CLASS_COUNT; - - //Track counters - _memory_counter_increase(&heap->span_counter, &_memory_max_allocation, 1, MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE); - - return pointer_offset(span, SPAN_HEADER_SIZE); - } + ++span_count; + size_t idx = span_count - 1; use_cache: - assert((idx > 0) && (idx < LARGE_CLASS_COUNT)); //Step 1: Check if cache for this large size class (or the following, unless first class) has a span - while (!heap->large_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (num_spans + 1))) + while (!heap->span_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (span_count + 
1))) ++idx; - span_t* span = heap->large_cache[idx]; + span_t* span = heap->span_cache[idx]; if (span) { //Happy path, use from cache if (span->data.list.size > 1) { span_t* new_head = span->next_span; assert(new_head); new_head->data.list.size = span->data.list.size - 1; - heap->large_cache[idx] = new_head; + heap->span_cache[idx] = new_head; } else { - heap->large_cache[idx] = 0; + heap->span_cache[idx] = 0; } span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); @@ -934,25 +944,24 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { atomic_thread_fence_release(); //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans, MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); + _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); return pointer_offset(span, SPAN_HEADER_SIZE); } //Restore index, we're back to smallest fitting span count - idx = num_spans - 1; - assert((idx > 0) && (idx < LARGE_CLASS_COUNT)); + idx = span_count - 1; //Step 2: Process deferred deallocation if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) goto use_cache; - assert(!heap->large_cache[idx]); + assert(!heap->span_cache[idx]); //Step 3: Extract a list of spans from global cache - span = _memory_global_cache_large_extract(num_spans); + span = _memory_global_cache_extract(span_count); if (span) { #if ENABLE_STATISTICS - heap->global_to_thread += (size_t)span->data.list.size * num_spans * _memory_span_size; + heap->global_to_thread += (size_t)span->data.list.size * span_count * _memory_span_size; #endif //We got a list from global cache, store remainder in thread cache if (span->data.list.size > 1) { @@ -960,12 +969,12 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { assert(new_head); new_head->prev_span = 0; new_head->data.list.size = span->data.list.size - 1; - heap->large_cache[idx] = new_head; + heap->span_cache[idx] = new_head; } } else { //Step 4: Map in more memory pages - span = _memory_map_spans(heap, num_spans); + span = _memory_map_spans(heap, span_count); } //Mark span as owned by this heap atomic_store32(&span->heap_id, heap->id); @@ -974,7 +983,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx], num_spans, MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); + _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1030,68 +1039,22 @@ _memory_allocate_heap(void) { return heap; } -//! Add a span to a double linked list -static void -_memory_list_add(span_t** head, span_t* span) { - if (*head) { - (*head)->prev_span = span; - span->next_span = *head; - } - else { - span->next_span = 0; - } - *head = span; -} - -//! Remove a span from a double linked list -static void -_memory_list_remove(span_t** head, span_t* span) { - if (*head == span) { - *head = span->next_span; - } - else { - span_t* next_span = span->next_span; - span_t* prev_span = span->prev_span; - if (next_span) - next_span->prev_span = prev_span; - prev_span->next_span = next_span; - } -} - -//! Insert span into thread cache, releasing to global cache if overflow +//! 
Insert span into thread heap cache, releasing to global cache if overflow static void -_memory_heap_cache_insert(heap_t* heap, span_t* span) { +_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { #if MAX_SPAN_CACHE_DIVISOR == 0 (void)sizeof(heap); - const size_t list_size = 1; + span->data.list.size = 1; #else - span_t** cache = &heap->span_cache; - span->next_span = *cache; - if (*cache) - span->data.list.size = (*cache)->data.list.size + 1; - else - span->data.list.size = 1; - *cache = span; - if (span->data.list.size <= heap->span_counter.cache_limit) + size_t idx = span_count - 1; + if (_memory_span_list_add(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) return; - //Release to global cache if exceeding limit - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - next->data.list.size = span->data.list.size - list_size; - last->next_span = 0; - *cache = next; + heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); #endif - - _memory_global_cache_insert(span, list_size); #if ENABLE_STATISTICS - heap->thread_to_global += list_size * _memory_span_size; + heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; #endif + _memory_global_cache_insert(span, span_count); } //! Deallocate the given small/medium memory block from the given heap @@ -1109,26 +1072,26 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //Check if the span will become completely free if (block_data->free_count == ((count_t)size_class->block_count - 1)) { //Track counters - assert(heap->span_counter.current_allocations > 0); - if (heap->span_counter.current_allocations) - --heap->span_counter.current_allocations; + assert(heap->span_counter[0].current_allocations > 0); + if (heap->span_counter[0].current_allocations) + --heap->span_counter[0].current_allocations; //If it was active, reset counter. Otherwise, if not active, remove from //partial free list if we had a previous free block (guard for classes with only 1 block) if (is_active) block_data->free_count = 0; else if (block_data->free_count > 0) - _memory_list_remove(&heap->size_cache[class_idx], span); + _memory_span_list_doublelink_remove(&heap->size_cache[class_idx], span); //Add to heap span cache - _memory_heap_cache_insert(heap, span); + _memory_heap_cache_insert(heap, span, 1); return; } //Check if first free block for this span (previously fully allocated) if (block_data->free_count == 0) { //add to free list and disable autolink - _memory_list_add(&heap->size_cache[class_idx], span); + _memory_span_list_doublelink_add(&heap->size_cache[class_idx], span); block_data->first_autolink = (uint16_t)size_class->block_count; } ++block_data->free_count; @@ -1144,79 +1107,35 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //! 
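
// A simplified sketch of the lazy free-list idea behind free_count and
// first_autolink in the code above: blocks that have never been handed out can
// be served by index with no list links, and only explicitly freed blocks are
// pushed onto the span's free list. demo_span_t and its fixed 16 x 64-byte
// layout are illustrative assumptions, not rpmalloc's block layout.
#include <stddef.h>
#include <stdint.h>

#define DEMO_BLOCK_SIZE  64
#define DEMO_BLOCK_COUNT 16

typedef struct demo_span_t {
	void*    free_list;      //singly linked list of explicitly freed blocks
	uint32_t next_untouched; //first block index never handed out
	char     blocks[DEMO_BLOCK_COUNT * DEMO_BLOCK_SIZE];
} demo_span_t;

static void*
demo_block_alloc(demo_span_t* span) {
	if (span->free_list) {
		void* block = span->free_list;
		span->free_list = *(void**)block; //pop a previously freed block
		return block;
	}
	if (span->next_untouched < DEMO_BLOCK_COUNT)
		return span->blocks + ((size_t)span->next_untouched++ * DEMO_BLOCK_SIZE);
	return 0; //span exhausted
}

static void
demo_block_free(demo_span_t* span, void* block) {
	*(void**)block = span->free_list; //push onto the free list
	span->free_list = block;
}

//!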
Deallocate the given large memory block from the given heap static void _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { - //Check if aliased with small/medium spans - if (span->size_class == SIZE_CLASS_COUNT) { - //Track counters - assert(heap->span_counter.current_allocations > 0); - if (heap->span_counter.current_allocations) - --heap->span_counter.current_allocations; - //Add to span cache - _memory_heap_cache_insert(heap, span); - return; - } - //Decrease counter - assert(span->size_class > SIZE_CLASS_COUNT); size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; - size_t num_spans = idx + 1; - assert((idx > 0) && (idx < LARGE_CLASS_COUNT)); - span_counter_t* counter = heap->large_counter + idx; - assert(counter->current_allocations > 0); - if (counter->current_allocations) - --counter->current_allocations; - -#if MAX_LARGE_SPAN_CACHE_DIVISOR == 0 - const size_t list_size = 1; -#else + size_t span_count = idx + 1; + assert(span->size_class > SIZE_CLASS_COUNT); + assert(idx < LARGE_CLASS_COUNT); + assert(heap->span_counter[idx].current_allocations > 0); + if (heap->span_counter[idx].current_allocations) + --heap->span_counter[idx].current_allocations; + /* TODO: Once requirement that master span is one page and keeps track of how many spans are part of the superspan, reenable this */ - /*if (!heap->span_cache && (num_spans <= heap->span_counter.cache_limit) && !span->flags) { + /*if (!heap->span_cache && (span_count <= heap->span_counter.cache_limit) && !span->flags) { //Break up as single span cache span_t* master = span; - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)num_spans << 2)); - for (size_t ispan = 1; ispan < num_spans; ++ispan) { + master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)span_count << 2)); + for (size_t ispan = 1; ispan < span_count; ++ispan) { span->next_span = pointer_offset(span, _memory_span_size); span = span->next_span; span->data.list.align_offset = 0; span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | ((uint16_t)ispan << 2)); } span->next_span = 0; - master->data.list.size = (uint32_t)num_spans; + master->data.list.size = (uint32_t)span_count; heap->span_cache = master; return; }*/ //Insert into cache list - span_t** cache = heap->large_cache + idx; - span->next_span = *cache; - if (*cache) - span->data.list.size = (*cache)->data.list.size + 1; - else - span->data.list.size = 1; - *cache = span; - if (span->data.list.size <= counter->cache_limit) - return; - - //Release to global cache if exceeding limit - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - count_t min_list_size = MIN_LARGE_SPAN_CACHE_RELEASE; - while (list_size < min_list_size) { - last = next; - next = next->next_span; - ++list_size; - } - if (next) - next->data.list.size = span->data.list.size - list_size; - last->next_span = 0; - *cache = next; -#endif - - _memory_global_cache_large_insert(span, list_size, num_spans); -#if ENABLE_STATISTICS - heap->thread_to_global += list_size * num_spans * _memory_span_size; -#endif + _memory_heap_cache_insert(heap, span, span_count); } //! Process pending deferred cross-thread deallocations @@ -1535,24 +1454,9 @@ rpmalloc_finalize(void) { while (heap) { _memory_deallocate_deferred(heap, 0); - span_t* span = heap->span_cache; - size_t span_count = span ? 
span->data.list.size : 0; - for (size_t ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap_spans(span, 1, span->data.list.align_offset); - span = next_span; - } - - //Free large spans - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_count = iclass + 1; - span = heap->large_cache[iclass]; - while (span) { - span_t* next_span = span->next_span; - _memory_unmap_spans(span, span_count, span->data.list.align_offset); - span = next_span; - } - } + //Free span caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_unmap_span_list(heap->span_cache[iclass], iclass + 1); if (heap->spans_reserved) { span_t* span = heap->span_reserve; @@ -1586,39 +1490,8 @@ rpmalloc_finalize(void) { atomic_store_ptr(&_memory_orphan_heaps, 0); //Free global caches - void* span_ptr = atomic_load_ptr(&_memory_span_cache); - size_t cache_count = (uintptr_t)span_ptr & ~_memory_span_mask; - span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & _memory_span_mask)); - while (cache_count) { - span_t* skip_span = span->prev_span; - unsigned int span_count = span->data.list.size; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap_spans(span, 1, span->data.list.align_offset); - span = next_span; - } - span = skip_span; - cache_count -= span_count; - } - atomic_store_ptr(&_memory_span_cache, 0); - - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); - cache_count = (uintptr_t)span_ptr & ~_memory_span_mask; - span = (span_t*)((void*)((uintptr_t)span_ptr & _memory_span_mask)); - while (cache_count) { - span_t* skip_span = span->prev_span; - unsigned int span_count = span->data.list.size; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap_spans(span, iclass + 1, span->data.list.align_offset); - span = next_span; - } - span = skip_span; - cache_count -= span_count; - } - atomic_store_ptr(&_memory_large_cache[iclass], 0); - } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass], iclass + 1); atomic_thread_fence_release(); @@ -1658,54 +1531,15 @@ rpmalloc_thread_finalize(void) { _memory_deallocate_deferred(heap, 0); //Release thread cache spans back to global cache - span_t* span = heap->span_cache; - while (span) { - if (span->data.list.size > MIN_SPAN_CACHE_RELEASE) { - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - last->next_span = 0; - next->data.list.size = span->data.list.size - list_size; - _memory_global_cache_insert(span, list_size); - span = next; - } - else { - _memory_global_cache_insert(span, span->data.list.size); - span = 0; - } - } - heap->span_cache = 0; - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { const size_t span_count = iclass + 1; - span = heap->large_cache[iclass]; + span_t* span = heap->span_cache[iclass]; while (span) { - if (span->data.list.size > (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)) { - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)) { - last = next; - next = next->next_span; - ++list_size; - } - last->next_span = 0; - if (next) - next->data.list.size = span->data.list.size - list_size; 
- _memory_global_cache_large_insert(span, list_size, span_count); - span = next; - } - else { - _memory_global_cache_large_insert(span, span->data.list.size, span_count); - span = 0; - } + span_t* next = _memory_span_list_split(span, !iclass ? MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); + _memory_global_cache_insert(span, span_count); + span = next; } - heap->large_cache[iclass] = 0; + heap->span_cache[iclass] = 0; } //Orphan the heap @@ -2026,8 +1860,10 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { } } - if (heap->span_cache) - stats->spancache = (size_t)heap->span_cache->data.list.size * _memory_span_size; + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + stats->spancache = (size_t)heap->span_cache[iclass]->data.list.size * (iclass + 1) * _memory_span_size; + } } void @@ -2038,25 +1874,14 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; #endif - void* global_span_ptr = atomic_load_ptr(&_memory_span_cache); - while (global_span_ptr == CACHE_IN_PROGRESS) { - thread_yield(); - atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(&_memory_span_cache); - } - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; - size_t list_bytes = global_span_count * _memory_span_size; - stats->cached += list_bytes; - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); while (global_span_ptr == CACHE_IN_PROGRESS) { thread_yield(); atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); } - global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; - list_bytes = global_span_count * (iclass + 1) * _memory_span_size; - stats->cached_large += list_bytes; + uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; + stats->cached += global_span_count * (iclass + 1) * _memory_span_size; } } diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 29e3ef79..3b1f37dd 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -39,8 +39,6 @@ typedef struct rpmalloc_global_statistics_t { size_t mapped; //! Current amount of memory in global caches for small and medium sizes (<64KiB) size_t cached; - //! Curren amount of memory in global caches for large sizes (>=64KiB) - size_t cached_large; //! Total amount of memory mapped (only if ENABLE_STATISTICS=1) size_t mapped_total; //! 
Total amount of memory unmapped (only if ENABLE_STATISTICS=1) From 162a1e6f6bd76984629edbe3d42d20bcc39159a3 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 6 Feb 2018 23:28:33 +0100 Subject: [PATCH 22/42] allow control over thread and global cache use --- rpmalloc/rpmalloc.c | 142 ++++++++++++++++++++++++++++++-------------- 1 file changed, 98 insertions(+), 44 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 17b1f28f..191c92d4 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -13,31 +13,16 @@ // Build time configurable limits -// Presets, if none is defined it will default to performance priority -//#define ENABLE_UNLIMITED_CACHE -//#define DISABLE_CACHE -//#define ENABLE_SPACE_PRIORITY_CACHE +#ifndef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 0 +#endif + +#ifndef ENABLE_SPACE_PRIORITY_CACHE +#define ENABLE_SPACE_PRIORITY_CACHE 0 +#endif // Presets for cache limits -#if defined(ENABLE_UNLIMITED_CACHE) -// Unlimited caches -#define MIN_SPAN_CACHE_RELEASE 64 -#define MAX_SPAN_CACHE_DIVISOR 1 -#define MIN_SPAN_CACHE_SIZE 0 -#define MIN_LARGE_SPAN_CACHE_RELEASE 64 -#define MAX_LARGE_SPAN_CACHE_DIVISOR 1 -#define MIN_LARGE_SPAN_CACHE_SIZE 0 -#define DEFAULT_SPAN_MAP_COUNT 16 -#elif defined(DISABLE_CACHE) -//Disable cache -#define MIN_SPAN_CACHE_RELEASE 1 -#define MAX_SPAN_CACHE_DIVISOR 0 -#define MIN_SPAN_CACHE_SIZE 0 -#define MIN_LARGE_SPAN_CACHE_RELEASE 1 -#define MAX_LARGE_SPAN_CACHE_DIVISOR 0 -#define MIN_LARGE_SPAN_CACHE_SIZE 0 -#define DEFAULT_SPAN_MAP_COUNT 1 -#elif defined(ENABLE_SPACE_PRIORITY_CACHE) +#if ENABLE_SPACE_PRIORITY_CACHE // Space priority cache limits #define MIN_SPAN_CACHE_SIZE 8 #define MIN_SPAN_CACHE_RELEASE 8 @@ -63,12 +48,32 @@ #define MAX_LARGE_SPAN_CACHE_DIVISOR 16 //! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) #define GLOBAL_CACHE_MULTIPLIER 8 +#endif + +#ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory -#define DEFAULT_SPAN_MAP_COUNT 8 +#define DEFAULT_SPAN_MAP_COUNT 8 #endif +#ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 79 +#endif + +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif + +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads +#define ENABLE_GLOBAL_CACHE 1 +#endif + +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations +#define ENABLE_UNLIMITED_CACHE 1 +#endif #ifndef ENABLE_VALIDATE_ARGS //! Enable validation of args to public entry points @@ -95,6 +100,11 @@ #define ENABLE_GUARDS 0 #endif +#if !ENABLE_THREAD_CACHE +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +#endif + // Platform and arch specifics #ifdef _MSC_VER @@ -373,10 +383,12 @@ struct heap_t { span_t* active_span[SIZE_CLASS_COUNT]; //! List of semi-used spans with free blocks for each size class (double linked list) span_t* size_cache[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE //! List of free spans (single linked list) span_t* span_cache[LARGE_CLASS_COUNT]; //! Allocation counters span_counter_t span_counter[LARGE_CLASS_COUNT]; +#endif //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans @@ -416,8 +428,10 @@ static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; //! Heap ID counter static atomic32_t _memory_heap_id; +#if ENABLE_GLOBAL_CACHE //! 
Global span cache static atomicptr_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif //! All heaps static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; @@ -431,8 +445,10 @@ static atomic32_t _memory_orphan_counter; //! Active heap count static atomic32_t _memory_active_heaps; +#if ENABLE_THREAD_CACHE //! Adaptive cache max allocation count static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; +#endif #if ENABLE_STATISTICS //! Total number of currently mapped memory pages @@ -445,6 +461,8 @@ static atomic32_t _mapped_total; static atomic32_t _unmapped_total; #endif +#define MEMORY_UNUSED(x) (void)sizeof((x)) + //! Current thread heap #if defined(__APPLE__) && ENABLE_PRELOAD static pthread_key_t _memory_thread_heap; @@ -498,13 +516,15 @@ _memory_heap_lookup(int32_t id) { return heap; } +#if ENABLE_THREAD_CACHE + //! Increase an allocation counter static void _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { if (++counter->current_allocations > counter->max_allocations) { counter->max_allocations = counter->current_allocations; const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; -#if MAX_SPAN_CACHE_DIVISOR > 0 +#if !ENABLE_UNLIMITED_CACHE counter->cache_limit = counter->max_allocations / ((span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); const uint32_t cache_limit_min = (span_count == 1) ? (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) : (MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); if (counter->cache_limit < cache_limit_min) @@ -519,6 +539,10 @@ _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size } } +#else +# define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) +#endif + static void* _memory_map(size_t size, size_t* offset) { #if ENABLE_STATISTICS @@ -685,13 +709,13 @@ _memory_span_list_doublelink_remove(span_t** head, span_t* span) { } } +#if ENABLE_GLOBAL_CACHE #define CACHE_IN_PROGRESS ((void*)1) //! Insert the given list of memory page spans in the global cache static void _memory_cache_insert(atomicptr_t* cache, span_t* span, size_t span_count, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); -#if MAX_SPAN_CACHE_DIVISOR > 0 while (cache_limit) { atomic_thread_fence_acquire(); void* global_span_ptr = atomic_load_ptr(cache); @@ -710,7 +734,6 @@ _memory_cache_insert(atomicptr_t* cache, span_t* span, size_t span_count, size_t } thread_yield(); } -#endif //Global cache full, release spans _memory_unmap_span_list(span, span_count); } @@ -723,7 +746,7 @@ _memory_cache_extract(atomicptr_t* cache) { void* global_span_ptr = atomic_load_ptr(cache); while (global_span_ptr) { if ((global_span_ptr != CACHE_IN_PROGRESS) && atomic_cas_ptr(cache, CACHE_IN_PROGRESS, global_span_ptr)) { - /*uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); assert((span->data.list.size == 1) || (span->next_span != 0)); assert(span->data.list.size <= global_list_size); @@ -732,11 +755,9 @@ _memory_cache_extract(atomicptr_t* cache) { global_list_size -= span->data.list.size; assert(!(global_list_size & _memory_span_mask)); - void* new_global_span_ptr = global_list_size && new_global_span ? 
- ((void*)((uintptr_t)new_global_span | global_list_size)) : - 0; - atomic_store_ptr(cache, new_global_span_ptr); - atomic_thread_fence_release();*/ + global_span_ptr = global_list_size && new_global_span ? + ((void*)((uintptr_t)new_global_span | global_list_size)) : + 0; atomic_store_ptr(cache, global_span_ptr); atomic_thread_fence_release(); break; @@ -762,24 +783,29 @@ _memory_cache_finalize(atomicptr_t* cache, size_t span_count) { atomic_store_ptr(cache, 0); } +#endif + //! Insert the given list of memory page spans in the global cache static void -_memory_global_cache_insert(span_t* span_list, size_t span_count) { -#ifdef GLOBAL_CACHE_MULTIPLIER +_memory_global_cache_insert(span_t* span, size_t span_count) { +#if ENABLE_GLOBAL_CACHE const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, span_count, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); #else - const size_t cache_limit = _memory_span_size - 2; - const size_t cache_limit_min = cache_limit; + _memory_unmap_span_list(span, span_count); #endif - _memory_cache_insert(&_memory_span_cache[span_count - 1], span_list, span_count, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); } //! Extract a number of memory page spans from the global cache for large blocks static span_t* _memory_global_cache_extract(size_t span_count) { +#if ENABLE_GLOBAL_CACHE return _memory_cache_extract(&_memory_span_cache[span_count - 1]); +#else + return 0; +#endif } //! 
Allocate a small/medium sized memory block from the given heap @@ -855,6 +881,7 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { } //Step 4: No semi-used span available, try grab a span from the thread cache +#if ENABLE_THREAD_CACHE span_t* span = heap->span_cache[0]; if (!span) { //Step 5: No span available in the thread cache, try grab a list of spans from the global cache @@ -880,6 +907,9 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { //Step 6: All caches empty, map in new memory pages span = _memory_map_spans(heap, 1); } +#else + span_t* span = _memory_map_spans(heap, 1); +#endif //Mark span as owned by this heap and set base data atomic_store32(&span->heap_id, heap->id); @@ -919,12 +949,14 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { if (size & (_memory_span_size - 1)) ++span_count; size_t idx = span_count - 1; + span_t* span; +#if ENABLE_THREAD_CACHE use_cache: //Step 1: Check if cache for this large size class (or the following, unless first class) has a span while (!heap->span_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (span_count + 1))) ++idx; - span_t* span = heap->span_cache[idx]; + span = heap->span_cache[idx]; if (span) { //Happy path, use from cache if (span->data.list.size > 1) { @@ -956,7 +988,11 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) goto use_cache; assert(!heap->span_cache[idx]); +#else + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); +#endif +#if ENABLE_GLOBAL_CACHE //Step 3: Extract a list of spans from global cache span = _memory_global_cache_extract(span_count); if (span) { @@ -976,6 +1012,10 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Step 4: Map in more memory pages span = _memory_map_spans(heap, span_count); } +#else + span = _memory_map_spans(heap, span_count); +#endif + //Mark span as owned by this heap atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); @@ -1042,17 +1082,17 @@ _memory_allocate_heap(void) { //! Insert span into thread heap cache, releasing to global cache if overflow static void _memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { -#if MAX_SPAN_CACHE_DIVISOR == 0 - (void)sizeof(heap); - span->data.list.size = 1; -#else +#if ENABLE_THREAD_CACHE size_t idx = span_count - 1; if (_memory_span_list_add(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) return; heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); -#endif #if ENABLE_STATISTICS heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; +#endif +#else + MEMORY_UNUSED(heap); + span->data.list.size = 1; #endif _memory_global_cache_insert(span, span_count); } @@ -1071,10 +1111,12 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //Check if the span will become completely free if (block_data->free_count == ((count_t)size_class->block_count - 1)) { +#if ENABLE_THREAD_CACHE //Track counters assert(heap->span_counter[0].current_allocations > 0); if (heap->span_counter[0].current_allocations) --heap->span_counter[0].current_allocations; +#endif //If it was active, reset counter. 
Otherwise, if not active, remove from //partial free list if we had a previous free block (guard for classes with only 1 block) @@ -1112,9 +1154,11 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { size_t span_count = idx + 1; assert(span->size_class > SIZE_CLASS_COUNT); assert(idx < LARGE_CLASS_COUNT); +#if ENABLE_THREAD_CACHE assert(heap->span_counter[idx].current_allocations > 0); if (heap->span_counter[idx].current_allocations) --heap->span_counter[idx].current_allocations; +#endif /* TODO: Once requirement that master span is one page and keeps track of how many spans are part of the superspan, reenable this */ @@ -1455,8 +1499,10 @@ rpmalloc_finalize(void) { _memory_deallocate_deferred(heap, 0); //Free span caches +#if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) _memory_unmap_span_list(heap->span_cache[iclass], iclass + 1); +#endif if (heap->spans_reserved) { span_t* span = heap->span_reserve; @@ -1490,8 +1536,10 @@ rpmalloc_finalize(void) { atomic_store_ptr(&_memory_orphan_heaps, 0); //Free global caches +#if ENABLE_GLOBAL_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) _memory_cache_finalize(&_memory_span_cache[iclass], iclass + 1); +#endif atomic_thread_fence_release(); @@ -1531,6 +1579,7 @@ rpmalloc_thread_finalize(void) { _memory_deallocate_deferred(heap, 0); //Release thread cache spans back to global cache +#if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { const size_t span_count = iclass + 1; span_t* span = heap->span_cache[iclass]; @@ -1541,6 +1590,7 @@ rpmalloc_thread_finalize(void) { } heap->span_cache[iclass] = 0; } +#endif //Orphan the heap void* raw_heap; @@ -1860,10 +1910,12 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { } } +#if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { if (heap->span_cache[iclass]) stats->spancache = (size_t)heap->span_cache[iclass]->data.list.size * (iclass + 1) * _memory_span_size; } +#endif } void @@ -1874,6 +1926,7 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; #endif +#if ENABLE_GLOBAL_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); while (global_span_ptr == CACHE_IN_PROGRESS) { @@ -1884,4 +1937,5 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; stats->cached += global_span_count * (iclass + 1) * _memory_span_size; } +#endif } From 67794c120b91eb885dd4d2acfe0c1232736f85a7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 7 Feb 2018 08:41:58 +0100 Subject: [PATCH 23/42] unify source and prefer reserved spans over global cache --- rpmalloc/rpmalloc.c | 144 +++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 83 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 191c92d4..97eee44b 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -569,7 +569,7 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { //! 
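
// A small sketch of the "map once, carve spans on demand" scheme that the
// reserved-span logic below implements: a batch of spans is obtained with a
// single system call and later requests are served by slicing spans off the
// front. The 64KiB span size, the batch of 16 and the malloc() backing are
// illustrative stand-ins; the real code maps with mmap/VirtualAlloc, keeps the
// master/subspan bookkeeping and handles any leftover reservation.
#include <stdlib.h>

#define DEMO_SPAN_SIZE   (64 * 1024)
#define DEMO_RESERVE_NUM 16

typedef struct demo_reserve_t {
	char*  next;      //next unused span in the reserved block
	size_t remaining; //spans still available
} demo_reserve_t;

static void*
demo_take_spans(demo_reserve_t* reserve, size_t count) {
	if (count > reserve->remaining) {
		if (count > DEMO_RESERVE_NUM)
			return 0; //oversized requests would be mapped directly instead
		char* block = (char*)malloc((size_t)DEMO_RESERVE_NUM * DEMO_SPAN_SIZE);
		if (!block)
			return 0;
		reserve->next = block; //refill (handling of the old remainder omitted)
		reserve->remaining = DEMO_RESERVE_NUM;
	}
	void* spans = reserve->next;
	reserve->next += count * DEMO_SPAN_SIZE;
	reserve->remaining -= count;
	return spans;
}

//!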
Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* _memory_map_spans(heap_t* heap, size_t num_spans) { - if ((num_spans == 1) && num_spans <= heap->spans_reserved) { + if (num_spans <= heap->spans_reserved) { span_t* span = heap->span_reserve; heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); heap->spans_reserved -= num_spans; @@ -611,7 +611,6 @@ _memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { return; } - assert(num_spans == 1); uint32_t is_master = (span->flags & SPAN_FLAG_MASTER); span_t* master = is_master ? span : (pointer_offset(span, -(int)(span->flags >> 2) * (int)_memory_span_size)); uint32_t remains = master->flags >> 2; @@ -649,7 +648,7 @@ _memory_unmap_span_list(span_t* span, size_t span_count) { //! Add span to head of single linked span list static size_t -_memory_span_list_add(span_t** head, span_t* span) { +_memory_span_list_push(span_t** head, span_t* span) { span->next_span = *head; if (*head) span->data.list.size = (*head)->data.list.size + 1; @@ -659,6 +658,20 @@ _memory_span_list_add(span_t** head, span_t* span) { return span->data.list.size; } +//! Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->data.list.size > 1) { + next_span = span->next_span; + assert(next_span); + next_span->data.list.size = span->data.list.size - 1; + } + *head = next_span; + return span; +} + //! Split a single linked span list static span_t* _memory_span_list_split(span_t* span, size_t limit) { @@ -811,7 +824,6 @@ _memory_global_cache_extract(size_t span_count) { //! Allocate a small/medium sized memory block from the given heap static void* _memory_allocate_from_heap(heap_t* heap, size_t size) { - //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? 
((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : @@ -880,43 +892,35 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { goto use_active; } - //Step 4: No semi-used span available, try grab a span from the thread cache + span_t* span = 0; #if ENABLE_THREAD_CACHE - span_t* span = heap->span_cache[0]; + //Step 4: Try grab a span from the thread cache + if (heap->span_cache[0]) + span = _memory_span_list_pop(&heap->span_cache[0]); +#endif + //Step 5: Try grab a span from the thread reserved spans + if (!span && heap->spans_reserved) + span = _memory_map_spans(heap, 1); if (!span) { - //Step 5: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(1); + //Step 6: No span available in the thread cache, try grab a list of spans from the global cache + heap->span_cache[0] = _memory_global_cache_extract(1); + if (heap->span_cache[0]) { #if ENABLE_STATISTICS - if (span) - heap->global_to_thread += (size_t)span->data.list.size * _memory_span_size; + heap->global_to_thread += (size_t)heap->span_cache[0]->data.list.size * _memory_span_size; #endif - } - if (span) { - if (span->data.list.size > 1) { - //We got a list of spans, we will use first as active and store remainder in thread cache - span_t* next_span = span->next_span; - assert(next_span); - next_span->data.list.size = span->data.list.size - 1; - heap->span_cache[0] = next_span; - } - else { - heap->span_cache[0] = 0; + span = _memory_span_list_pop(&heap->span_cache[0]); } } - else { - //Step 6: All caches empty, map in new memory pages + if (!span) { + //Step 7: All caches empty, map in new memory pages span = _memory_map_spans(heap, 1); } -#else - span_t* span = _memory_map_spans(heap, 1); -#endif //Mark span as owned by this heap and set base data + span->size_class = (uint16_t)class_idx; atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); - span->size_class = (uint16_t)class_idx; - //If we only have one block we will grab it, otherwise //set span as new span to use for next allocation if (size_class->block_count > 1) { @@ -953,75 +957,50 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { #if ENABLE_THREAD_CACHE use_cache: - //Step 1: Check if cache for this large size class (or the following, unless first class) has a span + //Step 1: Check if cache for this large size class or the following has a span while (!heap->span_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (span_count + 1))) ++idx; - span = heap->span_cache[idx]; - if (span) { - //Happy path, use from cache - if (span->data.list.size > 1) { - span_t* new_head = span->next_span; - assert(new_head); - new_head->data.list.size = span->data.list.size - 1; - heap->span_cache[idx] = new_head; - } - else { - heap->span_cache[idx] = 0; - } - - span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); - - //Mark span as owned by this heap - atomic_store32(&span->heap_id, heap->id); - atomic_thread_fence_release(); - - //Increase counter - _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); + if (heap->span_cache[idx]) + span = _memory_span_list_pop(&heap->span_cache[idx]); + if (!span) { + //Restore index, we're back to smallest fitting span count + idx = span_count - 1; - return pointer_offset(span, SPAN_HEADER_SIZE); + //Step 2: Process deferred deallocation + if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) + goto use_cache; + assert(!heap->span_cache[idx]); } - - //Restore index, we're back to 
smallest fitting span count - idx = span_count - 1; - - //Step 2: Process deferred deallocation - if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) - goto use_cache; - assert(!heap->span_cache[idx]); #else _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); #endif + //Step 3: Try grab a span from the thread reserved spans + if (!span && (heap->spans_reserved >= span_count)) + span = _memory_map_spans(heap, span_count); + #if ENABLE_GLOBAL_CACHE - //Step 3: Extract a list of spans from global cache - span = _memory_global_cache_extract(span_count); - if (span) { + //Step 4: Extract a list of spans from global cache + if (!span) { + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { #if ENABLE_STATISTICS - heap->global_to_thread += (size_t)span->data.list.size * span_count * _memory_span_size; -#endif - //We got a list from global cache, store remainder in thread cache - if (span->data.list.size > 1) { - span_t* new_head = span->next_span; - assert(new_head); - new_head->prev_span = 0; - new_head->data.list.size = span->data.list.size - 1; - heap->span_cache[idx] = new_head; + heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * span_count * _memory_span_size; +#endif + span = _memory_span_list_pop(&heap->span_cache[idx]); } } - else { - //Step 4: Map in more memory pages +#endif + if (!span) { + //Step 5: Map in more memory pages span = _memory_map_spans(heap, span_count); } -#else - span = _memory_map_spans(heap, span_count); -#endif - //Mark span as owned by this heap + //Mark span as owned by this heap and set base data + span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); - span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); - //Increase counter _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); @@ -1079,12 +1058,12 @@ _memory_allocate_heap(void) { return heap; } -//! Insert span into thread heap cache, releasing to global cache if overflow +//! 
Insert a single span into thread heap cache, releasing to global cache if overflow static void _memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { #if ENABLE_THREAD_CACHE size_t idx = span_count - 1; - if (_memory_span_list_add(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) + if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) return; heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); #if ENABLE_STATISTICS @@ -1534,6 +1513,7 @@ rpmalloc_finalize(void) { atomic_store_ptr(&_memory_heaps[list_idx], 0); } atomic_store_ptr(&_memory_orphan_heaps, 0); + atomic_thread_fence_release(); //Free global caches #if ENABLE_GLOBAL_CACHE @@ -1541,8 +1521,6 @@ rpmalloc_finalize(void) { _memory_cache_finalize(&_memory_span_cache[iclass], iclass + 1); #endif - atomic_thread_fence_release(); - #if ENABLE_STATISTICS assert(!atomic_load32(&_mapped_pages)); assert(!atomic_load32(&_reserved_spans)); From abe67544068b335414b80d43b6f4ca7bd7e51a8c Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 7 Feb 2018 15:34:13 +0100 Subject: [PATCH 24/42] refactor global cache and reserved spans --- rpmalloc/rpmalloc.c | 214 ++++++++++++++++++++------------------------ test/main.c | 180 +++++++++++++++++++++++++++---------- 2 files changed, 233 insertions(+), 161 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 97eee44b..63c1a616 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -14,10 +14,12 @@ // Build time configurable limits #ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations #define ENABLE_UNLIMITED_CACHE 0 #endif #ifndef ENABLE_SPACE_PRIORITY_CACHE +//! Minimize overhead #define ENABLE_SPACE_PRIORITY_CACHE 0 #endif @@ -66,15 +68,10 @@ #endif #ifndef ENABLE_GLOBAL_CACHE -//! Enable global cache shared between all threads +//! Enable global cache shared between all threads, requires thread cache #define ENABLE_GLOBAL_CACHE 1 #endif -#ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited cache disables any cache limitations -#define ENABLE_UNLIMITED_CACHE 1 -#endif - #ifndef ENABLE_VALIDATE_ARGS //! Enable validation of args to public entry points #define ENABLE_VALIDATE_ARGS 0 @@ -566,33 +563,38 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { _memory_config.memory_unmap(address, size, offset, release); } +#define SPAN_MAKE_FLAGS(flag, remdist, count) ((uint16_t)(flag | ((uint16_t)(remdist - 1) << 2) | ((uint16_t)(count - 1) << 9))); assert(flag < 4); assert(remdist < 128); assert(count < 128) +#define SPAN_HAS_FLAG(flags, flag) (flags & flag) +#define SPAN_DISTANCE(flags) (1 + ((flags >> 2) & 0x7f)) +#define SPAN_REMAINS(flags) (1 + ((flags >> 2) & 0x7f)) +#define SPAN_COUNT(flags) (1 + ((flags >> 9) & 0x7f)) +#define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)((flags & 0xfe03) | ((uint16_t)(remains - 1) << 2))); assert(remains < 128) + //! 
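For reference while reading the flag macros just defined: the standalone sketch below restates the 16-bit layout they implement (2 bits of flags, then the remains-or-distance value stored biased by one in 7 bits, then the span count stored biased by one in another 7 bits). The function and variable names are illustrative only, not part of the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bits 0-1: flag (master/subspan), bits 2-8: remains-or-distance minus one,
   bits 9-15: span count minus one. All packed values must stay below 128. */
static uint16_t
pack_span_flags(unsigned flag, unsigned remdist, unsigned count) {
	assert((flag < 4) && remdist && (remdist < 128) && count && (count < 128));
	return (uint16_t)(flag | ((remdist - 1) << 2) | ((count - 1) << 9));
}

static unsigned span_remains(uint16_t flags) { return 1 + ((flags >> 2) & 0x7f); }
static unsigned span_count(uint16_t flags)   { return 1 + ((flags >> 9) & 0x7f); }

int
main(void) {
	uint16_t flags = pack_span_flags(1 /* master */, 32 /* remains */, 4 /* count */);
	printf("remains=%u count=%u\n", span_remains(flags), span_count(flags));
	assert((span_remains(flags) == 32) && (span_count(flags) == 4));
	return 0;
}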
Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* -_memory_map_spans(heap_t* heap, size_t num_spans) { - if (num_spans <= heap->spans_reserved) { +_memory_map_spans(heap_t* heap, size_t span_count) { + if (span_count <= heap->spans_reserved) { span_t* span = heap->span_reserve; - heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); - heap->spans_reserved -= num_spans; + heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= span_count; //set flag in span that it is a subspan with a master span uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); - span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | (distance << 2)); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); span->data.block.align_offset = 0; return span; } //We cannot request extra spans if we already have some (but not enough) pending reserved spans - //Also, if given number of spans is more than one we cannot map extra spans as we lose info on - //how many spans is part of the master span - size_t request_spans = (heap->spans_reserved || (num_spans > 1)) ? num_spans : _memory_config.span_map_count; + size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? span_count : _memory_config.span_map_count; size_t align_offset = 0; - span_t* span = _memory_map(_memory_span_size * request_spans, &align_offset); + span_t* span = _memory_map(request_spans * _memory_span_size, &align_offset); span->data.block.align_offset = (uint16_t)align_offset; - if (request_spans > num_spans) { - assert(request_spans == _memory_config.span_map_count); - heap->spans_reserved = request_spans - num_spans; - heap->span_reserve = pointer_offset(span, num_spans * _memory_span_size); + if (request_spans > span_count) { + assert(request_spans < 127); + heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); heap->span_reserve_master = span; - span->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)request_spans << 2)); + heap->spans_reserved = request_spans - span_count; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, request_spans, span_count); #if ENABLE_STATISTICS atomic_add32(&_reserved_spans, (int32_t)request_spans); #endif @@ -605,33 +607,39 @@ _memory_map_spans(heap_t* heap, size_t num_spans) { //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_spans(span_t* span, size_t num_spans, size_t align_offset) { +_memory_unmap_spans(span_t* span, size_t span_count, size_t align_offset) { if (!span->flags) { - _memory_unmap(span, _memory_span_size * num_spans, span->data.list.align_offset, 1); + _memory_unmap(span, _memory_span_size * span_count, span->data.list.align_offset, 1); return; } - uint32_t is_master = (span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : (pointer_offset(span, -(int)(span->flags >> 2) * (int)_memory_span_size)); - uint32_t remains = master->flags >> 2; + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? 
span : (pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + uint32_t remains = SPAN_REMAINS(master->flags); - assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); - assert((master->flags & SPAN_FLAG_MASTER) && !(master->flags & SPAN_FLAG_SUBSPAN)); - assert(remains >= num_spans); + assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && !SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); + assert(remains >= span_count); - remains = ((uint32_t)num_spans >= remains) ? 0 : (remains - (uint32_t)num_spans); + remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); if (!is_master) { assert(span->data.list.align_offset == 0); - _memory_unmap(span, _memory_span_size * num_spans, 0, 0); + assert(span_count == SPAN_COUNT(span->flags)); + _memory_unmap(span, span_count * _memory_span_size, 0, 0); +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, -(int32_t)span_count); +#endif } if (!remains) { - _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide + uint32_t master_span_count = SPAN_COUNT(master->flags); + assert(!is_master || (span_count == master_span_count)); + _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); + atomic_add32(&_reserved_spans, -(int32_t)master_span_count); #endif } else { - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + SPAN_SET_REMAINS(master->flags, remains); } } @@ -723,71 +731,46 @@ _memory_span_list_doublelink_remove(span_t** head, span_t* span) { } #if ENABLE_GLOBAL_CACHE -#define CACHE_IN_PROGRESS ((void*)1) + +static atomic32_t _global_cache_counter; //! Insert the given list of memory page spans in the global cache static void _memory_cache_insert(atomicptr_t* cache, span_t* span, size_t span_count, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); - while (cache_limit) { - atomic_thread_fence_acquire(); - void* global_span_ptr = atomic_load_ptr(cache); - if (global_span_ptr != CACHE_IN_PROGRESS) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - - global_list_size += span->data.list.size; - if ((global_list_size >= cache_limit) || (global_list_size & _memory_span_mask)) - break; - - span->prev_span = global_span; - void* new_global_span_ptr = (void*)((uintptr_t)span | global_list_size); - if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) - return; - } - thread_yield(); - } - //Global cache full, release spans - _memory_unmap_span_list(span, span_count); + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(cache); + span->prev_span = (void*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&_global_cache_counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(cache, new_cache, current_cache)); } //! 
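The rewritten insert above, together with the extract that follows, drops the CACHE_IN_PROGRESS spin in favour of a lock-free list whose head pointer carries a counter in its low, alignment-guaranteed bits to make ABA reuse of a popped span less likely. Below is a minimal, generic sketch of that tagged-pointer technique using C11 atomics instead of the allocator's own wrappers; the node type and function names are illustrative, and the sketch inherits the same caveat as the patch: pop reads a node's next pointer before winning the CAS, so nodes must not be unmapped while another thread may still be traversing them.

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

/* Nodes are assumed to be allocated with at least NODE_ALIGN alignment, so the
   low bits of any node address are zero and can hold a small ABA tag. */
#define NODE_ALIGN ((uintptr_t)64)
#define PTR_MASK   (~(NODE_ALIGN - 1))
#define TAG_MASK   (NODE_ALIGN - 1)

typedef struct node_t {
	struct node_t* next;
} node_t;

static _Atomic uintptr_t list_head;  /* tagged pointer: node address | tag */
static atomic_uint list_tag;         /* monotonically increasing tag source */

static void
tagged_push(node_t* node) {
	uintptr_t old_head, new_head;
	do {
		old_head = atomic_load(&list_head);
		node->next = (node_t*)(old_head & PTR_MASK);
		/* Stamp the head with a fresh tag so an interleaved pop/push of the
		   same node still changes the head value and fails a stale CAS. */
		new_head = (uintptr_t)node | (atomic_fetch_add(&list_tag, 1) & TAG_MASK);
	} while (!atomic_compare_exchange_weak(&list_head, &old_head, new_head));
}

static node_t*
tagged_pop(void) {
	for (;;) {
		uintptr_t old_head = atomic_load(&list_head);
		node_t* node = (node_t*)(old_head & PTR_MASK);
		if (!node)
			return 0;
		/* Reads node->next before winning the CAS; as in the patch, this
		   assumes the node cannot be unmapped while still reachable here. */
		uintptr_t new_head = (uintptr_t)node->next | (atomic_fetch_add(&list_tag, 1) & TAG_MASK);
		if (atomic_compare_exchange_weak(&list_head, &old_head, new_head))
			return node;
	}
}

int
main(void) {
	node_t* node = aligned_alloc(NODE_ALIGN, NODE_ALIGN);
	tagged_push(node);
	assert(tagged_pop() == node);
	assert(tagged_pop() == 0);
	free(node);
	return 0;
}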
Extract a number of memory page spans from the global cache static span_t* _memory_cache_extract(atomicptr_t* cache) { - span_t* span = 0; - atomic_thread_fence_acquire(); - void* global_span_ptr = atomic_load_ptr(cache); - while (global_span_ptr) { - if ((global_span_ptr != CACHE_IN_PROGRESS) && atomic_cas_ptr(cache, CACHE_IN_PROGRESS, global_span_ptr)) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~_memory_span_mask; - span = (span_t*)((void*)((uintptr_t)global_span_ptr & _memory_span_mask)); - assert((span->data.list.size == 1) || (span->next_span != 0)); - assert(span->data.list.size <= global_list_size); - - span_t* new_global_span = span->prev_span; - global_list_size -= span->data.list.size; - assert(!(global_list_size & _memory_span_mask)); - - global_span_ptr = global_list_size && new_global_span ? - ((void*)((uintptr_t)new_global_span | global_list_size)) : - 0; - atomic_store_ptr(cache, global_span_ptr); - atomic_thread_fence_release(); - break; + span_t* span; + do { + span = 0; + void* global_span = atomic_load_ptr(cache); + uintptr_t span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span = (void*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contenting thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&_global_cache_counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(cache, new_cache, global_span)) + break; } - - thread_yield(); - atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(cache); - } + } while (span); return span; } //! Finalize a global cache static void _memory_cache_finalize(atomicptr_t* cache, size_t span_count) { - void* span_ptr = atomic_load_ptr(cache); - span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & _memory_span_mask)); + void* current_cache = atomic_load_ptr(cache); + span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask); while (span) { span_t* skip_span = span->prev_span; _memory_unmap_span_list(span, span_count); @@ -953,24 +936,21 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { if (size & (_memory_span_size - 1)) ++span_count; size_t idx = span_count - 1; - span_t* span; - + + span_t* span = 0; #if ENABLE_THREAD_CACHE -use_cache: //Step 1: Check if cache for this large size class or the following has a span while (!heap->span_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (span_count + 1))) ++idx; - if (heap->span_cache[idx]) - span = _memory_span_list_pop(&heap->span_cache[idx]); - if (!span) { + if (!heap->span_cache[idx]) { //Restore index, we're back to smallest fitting span count idx = span_count - 1; //Step 2: Process deferred deallocation - if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) - goto use_cache; - assert(!heap->span_cache[idx]); + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); } + if (heap->span_cache[idx]) + span = _memory_span_list_pop(&heap->span_cache[idx]); #else _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); #endif @@ -1131,34 +1111,34 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { //Decrease counter size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; size_t span_count = idx + 1; - assert(span->size_class > SIZE_CLASS_COUNT); + assert(span->size_class >= SIZE_CLASS_COUNT); assert(idx < LARGE_CLASS_COUNT); #if ENABLE_THREAD_CACHE assert(heap->span_counter[idx].current_allocations > 0); if 
(heap->span_counter[idx].current_allocations) --heap->span_counter[idx].current_allocations; #endif - - /* TODO: Once requirement that master span is one page and keeps track of how - many spans are part of the superspan, reenable this */ - /*if (!heap->span_cache && (span_count <= heap->span_counter.cache_limit) && !span->flags) { + if (!heap->span_cache[0] && (span_count <= heap->span_counter[0].cache_limit) && !span->flags) { //Break up as single span cache span_t* master = span; - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)span_count << 2)); + master->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, span_count, 1); for (size_t ispan = 1; ispan < span_count; ++ispan) { span->next_span = pointer_offset(span, _memory_span_size); span = span->next_span; span->data.list.align_offset = 0; - span->flags = (uint16_t)(SPAN_FLAG_SUBSPAN | ((uint16_t)ispan << 2)); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, ispan, 1); } span->next_span = 0; master->data.list.size = (uint32_t)span_count; - heap->span_cache = master; - return; - }*/ - - //Insert into cache list - _memory_heap_cache_insert(heap, span, span_count); + heap->span_cache[0] = master; +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, (int32_t)span_count); +#endif + } + else { + //Insert into cache list + _memory_heap_cache_insert(heap, span, span_count); + } } //! Process pending deferred cross-thread deallocations @@ -1435,6 +1415,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_config.span_map_count = DEFAULT_SPAN_MAP_COUNT; if (_memory_config.span_size * _memory_config.span_map_count < _memory_config.page_size) _memory_config.span_map_count = (_memory_config.page_size / _memory_config.span_size); + if (_memory_config.span_map_count > 128) + _memory_config.span_map_count = 128; #if defined(__APPLE__) && ENABLE_PRELOAD if (pthread_key_create(&_memory_thread_heap, 0)) @@ -1470,6 +1452,8 @@ rpmalloc_finalize(void) { atomic_thread_fence_acquire(); rpmalloc_thread_finalize(); + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); //Free all thread caches for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { @@ -1486,22 +1470,24 @@ rpmalloc_finalize(void) { if (heap->spans_reserved) { span_t* span = heap->span_reserve; span_t* master = heap->span_reserve_master; - uint32_t remains = master->flags >> 2; + uint32_t remains = SPAN_REMAINS(master->flags); assert(master != span); assert(remains >= heap->spans_reserved); - + _memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0); +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, -(int32_t)heap->spans_reserved); +#endif remains = ((uint32_t)heap->spans_reserved >= remains) ? 
0 : (remains - (uint32_t)heap->spans_reserved); - assert(span->data.list.align_offset == 0); - _memory_unmap(span, _memory_span_size * heap->spans_reserved, 0, 0); if (!remains) { - _memory_unmap(master, _memory_span_size, master->data.list.align_offset, 1); //Master span is always 1 span wide + uint32_t master_span_count = SPAN_COUNT(master->flags); + _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)_memory_config.span_map_count); + atomic_add32(&_reserved_spans, -(int32_t)master_span_count); #endif } else { - master->flags = (uint16_t)(SPAN_FLAG_MASTER | ((uint16_t)remains << 2)); + SPAN_SET_REMAINS(master->flags, remains); } } @@ -1522,6 +1508,7 @@ rpmalloc_finalize(void) { #endif #if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code assert(!atomic_load32(&_mapped_pages)); assert(!atomic_load32(&_reserved_spans)); #endif @@ -1608,7 +1595,7 @@ _memory_map_os(size_t size, size_t* offset) { } #else void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); - if (ptr == MAP_FAILED) { + if ((ptr == MAP_FAILED) || !ptr) { assert("Failed to map virtual memory block" == 0); return 0; } @@ -1907,11 +1894,6 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { #if ENABLE_GLOBAL_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - while (global_span_ptr == CACHE_IN_PROGRESS) { - thread_yield(); - atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - } uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; stats->cached += global_span_count * (iclass + 1) * _memory_span_size; } diff --git a/test/main.c b/test/main.c index 9aac1107..b976e39f 100644 --- a/test/main.c +++ b/test/main.c @@ -28,6 +28,11 @@ #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) //#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) +static size_t _hardware_threads; + +static void +test_initialize(void); + static int test_alloc(void) { unsigned int iloop = 0; @@ -43,6 +48,10 @@ test_alloc(void) { for (id = 0; id < 20000; ++id) data[id] = (char)(id % 139 + id % 17); + void* testptr = rpmalloc(253000); + testptr = rprealloc(testptr, 154); + rpfree(testptr); + for (iloop = 0; iloop < 64; ++iloop) { for (ipass = 0; ipass < 8142; ++ipass) { addr[ipass] = rpmalloc(500); @@ -197,22 +206,24 @@ allocator_thread(void* argp) { unsigned int ipass = 0; unsigned int icheck = 0; unsigned int id = 0; - void* addr[4096]; - char data[8192]; + void** addr; + uint32_t* data; unsigned int cursize; unsigned int iwait = 0; int ret = 0; rpmalloc_thread_initialize(); - for (id = 0; id < 8192; ++id) - data[id] = (char)id; + addr = rpmalloc(sizeof(void*) * arg.passes); + data = rpmalloc(512 * 1024); + for (id = 0; id < 512 * 1024 / 4; ++id) + data[id] = id; thread_sleep(1); for (iloop = 0; iloop < arg.loops; ++iloop) { for (ipass = 0; ipass < arg.passes; ++ipass) { - cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + (iloop % 1024); + cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024); addr[ipass] = rpmalloc(4 + cursize); if (addr[ipass] == 0) { @@ -259,6 +270,9 @@ allocator_thread(void* argp) { } } + rpfree(data); + 
rpfree(addr); + rpmalloc_thread_finalize(); end: @@ -276,11 +290,11 @@ crossallocator_thread(void* argp) { rpmalloc_thread_initialize(); - thread_sleep(1); + thread_sleep(10); for (iloop = 0; iloop < arg.loops; ++iloop) { for (ipass = 0; ipass < arg.passes; ++ipass) { - cursize = arg.datasize[(iloop + ipass + iwait) % arg.num_datasize ] + (iloop % 1024); + cursize = arg.datasize[(iloop + ipass + iwait) % arg.num_datasize ] + ((iloop + ipass) % 1024); void* addr = rpmalloc(cursize); if (addr == 0) { @@ -320,7 +334,7 @@ initfini_thread(void* argp) { rpmalloc_thread_initialize(); for (ipass = 0; ipass < arg.passes; ++ipass) { - cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + (iloop % 1024); + cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024); addr[ipass] = rpmalloc(4 + cursize); if (addr[ipass] == 0) { @@ -384,18 +398,31 @@ test_threaded(void) { rpmalloc_initialize(); - num_alloc_threads = 3; + num_alloc_threads = _hardware_threads; + if (num_alloc_threads < 2) + num_alloc_threads = 2; + if (num_alloc_threads > 32) + num_alloc_threads = 32; arg.datasize[0] = 19; arg.datasize[1] = 249; arg.datasize[2] = 797; - arg.datasize[3] = 3; - arg.datasize[4] = 79; - arg.datasize[5] = 34; + arg.datasize[3] = 3058; + arg.datasize[4] = 47892; + arg.datasize[5] = 173902; arg.datasize[6] = 389; - arg.num_datasize = 7; - arg.loops = 4096; - arg.passes = 1024; + arg.datasize[7] = 19; + arg.datasize[8] = 2493; + arg.datasize[9] = 7979; + arg.datasize[10] = 3; + arg.datasize[11] = 79374; + arg.datasize[12] = 3432; + arg.datasize[13] = 548; + arg.datasize[14] = 38934; + arg.datasize[15] = 234; + arg.num_datasize = 16; + arg.loops = 100; + arg.passes = 4000; thread_arg targ = { allocator_thread, &arg }; for (i = 0; i < num_alloc_threads; ++i) @@ -420,39 +447,63 @@ test_threaded(void) { static int test_crossthread(void) { - uintptr_t thread; - allocator_thread_arg_t arg; + uintptr_t thread[8]; + allocator_thread_arg_t arg[8]; + thread_arg targ[8]; rpmalloc_initialize(); - arg.loops = 100; - arg.passes = 1024; - arg.pointers = rpmalloc(sizeof(void*) * arg.loops * arg.passes); - arg.datasize[0] = 19; - arg.datasize[1] = 249; - arg.datasize[2] = 797; - arg.datasize[3] = 3; - arg.datasize[4] = 79; - arg.datasize[5] = 34; - arg.datasize[6] = 389; - arg.num_datasize = 7; - - thread_arg targ = { crossallocator_thread, &arg }; - thread = thread_run(&targ); + size_t num_alloc_threads = _hardware_threads; + if (num_alloc_threads < 2) + num_alloc_threads = 2; + if (num_alloc_threads > 4) + num_alloc_threads = 4; + + for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) { + unsigned int iadd = ithread * (16 + ithread); + arg[ithread].loops = 50; + arg[ithread].passes = 1024; + arg[ithread].pointers = rpmalloc(sizeof(void*) * arg[ithread].loops * arg[ithread].passes); + arg[ithread].datasize[0] = 19 + iadd; + arg[ithread].datasize[1] = 249 + iadd; + arg[ithread].datasize[2] = 797 + iadd; + arg[ithread].datasize[3] = 3 + iadd; + arg[ithread].datasize[4] = 7923 + iadd; + arg[ithread].datasize[5] = 344 + iadd; + arg[ithread].datasize[6] = 3892 + iadd; + arg[ithread].datasize[7] = 19 + iadd; + arg[ithread].datasize[8] = 14954 + iadd; + arg[ithread].datasize[9] = 39723 + iadd; + arg[ithread].datasize[10] = 15 + iadd; + arg[ithread].datasize[11] = 493 + iadd; + arg[ithread].datasize[12] = 34 + iadd; + arg[ithread].datasize[13] = 894 + iadd; + arg[ithread].datasize[14] = 6893 + iadd; + arg[ithread].datasize[15] = 2893 + iadd; + 
arg[ithread].num_datasize = 16; + + targ[ithread].fn = crossallocator_thread; + targ[ithread].arg = &arg[ithread]; + } - thread_sleep(1000); + for (int iloop = 0; iloop < 32; ++iloop) { + for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) + thread[ithread] = thread_run(&targ[ithread]); - if (thread_join(thread) != 0) - return -1; + thread_sleep(100); - //Off-thread deallocation - for (size_t iptr = 0; iptr < arg.loops * arg.passes; ++iptr) - rpfree(arg.pointers[iptr]); + for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) { + if (thread_join(thread[ithread]) != 0) + return -1; - rpfree(arg.pointers); + //Off-thread deallocation + for (size_t iptr = 0; iptr < arg[ithread].loops * arg[ithread].passes; ++iptr) + rpfree(arg[ithread].pointers[iptr]); + } + } - //Simulate thread exit - rpmalloc_thread_finalize(); + for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) + rpfree(arg[ithread].pointers); rpmalloc_finalize(); @@ -472,7 +523,11 @@ test_threadspam(void) { rpmalloc_initialize(); num_passes = 100; - num_alloc_threads = 5; + num_alloc_threads = _hardware_threads; + if (num_alloc_threads < 2) + num_alloc_threads = 2; + if (num_alloc_threads > 64) + num_alloc_threads = 64; arg.loops = 500; arg.passes = 10; @@ -584,22 +639,23 @@ int test_run(int argc, char** argv) { (void)sizeof(argc); (void)sizeof(argv); + test_initialize(); if (test_alloc()) return -1; - if (test_threaded()) - return -1; if (test_crossthread()) return -1; if (test_threadspam()) return -1; if (test_overwrite()) return -1; + if (test_threaded()) + return -1; return 0; } -#if ( defined( __APPLE__ ) && __APPLE__ ) +#if (defined(__APPLE__) && __APPLE__) # include -# if defined( __IPHONE__ ) || ( defined( TARGET_OS_IPHONE ) && TARGET_OS_IPHONE ) || ( defined( TARGET_IPHONE_SIMULATOR ) && TARGET_IPHONE_SIMULATOR ) +# if defined(__IPHONE__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) # define NO_MAIN 1 # endif #endif @@ -612,3 +668,37 @@ main(int argc, char** argv) { } #endif + +#ifdef _WIN32 +#include + +static void +test_initialize(void) { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + _hardware_threads = (size_t)system_info.dwNumberOfProcessors; +} + +#elif (defined(__linux__) || defined(__linux)) + +static void +test_initialize(void) { + cpu_set_t prevmask, testmask; + CPU_ZERO(&prevmask); + CPU_ZERO(&testmask); + sched_getaffinity(0, sizeof(prevmask), &prevmask); //Get current mask + sched_setaffinity(0, sizeof(testmask), &testmask); //Set zero mask + sched_getaffinity(0, sizeof(testmask), &testmask); //Get mask for all CPUs + sched_setaffinity(0, sizeof(prevmask), &prevmask); //Reset current mask + int num = CPU_COUNT(&testmask); + _hardware_threads = (size_t)(num > 1 ? 
num : 1); +} + +#else + +static void +test_initialize(void) { + _hardware_threads = 1; +} + +#endif From 18d0773955ce465e5f4d15f5b4da9c5f2cbbccfa Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 7 Feb 2018 18:02:59 +0100 Subject: [PATCH 25/42] cleanups --- rpmalloc/rpmalloc.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 63c1a616..468d9bda 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -143,10 +143,8 @@ #include #if ENABLE_ASSERTS -# ifdef NDEBUG -# undef NDEBUG -# endif -# ifndef _DEBUG +# undef NDEBUG +# if defined(_MSC_VER) && !defined(_DEBUG) # define _DEBUG # endif # include @@ -230,9 +228,6 @@ atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { #endif } -static void -thread_yield(void); - // Preconfigured limits and sizes //! Memory page size @@ -607,7 +602,7 @@ _memory_map_spans(heap_t* heap, size_t span_count) { //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_spans(span_t* span, size_t span_count, size_t align_offset) { +_memory_unmap_spans(span_t* span, size_t span_count) { if (!span->flags) { _memory_unmap(span, _memory_span_size * span_count, span->data.list.align_offset, 1); return; @@ -649,7 +644,7 @@ _memory_unmap_span_list(span_t* span, size_t span_count) { size_t list_size = span ? span->data.list.size : 0; for (size_t ispan = 0; ispan < list_size; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap_spans(span, span_count, span->data.list.align_offset); + _memory_unmap_spans(span, span_count); span = next_span; } } @@ -737,6 +732,8 @@ static atomic32_t _global_cache_counter; //! Insert the given list of memory page spans in the global cache static void _memory_cache_insert(atomicptr_t* cache, span_t* span, size_t span_count, size_t cache_limit) { + MEMORY_UNUSED(span_count); + MEMORY_UNUSED(cache_limit); assert((span->data.list.size == 1) || (span->next_span != 0)); void* current_cache, *new_cache; do { @@ -1627,22 +1624,13 @@ _memory_unmap_os(void* address, size_t size, size_t offset, int release) { assert("Failed to unmap virtual memory block" == 0); } #else + MEMORY_UNUSED(release); if (munmap(address, size)) { assert("Failed to unmap virtual memory block" == 0); } #endif } -//! 
Yield the thread remaining timeslice -static void -thread_yield(void) { -#ifdef PLATFORM_WINDOWS - YieldProcessor(); -#else - sched_yield(); -#endif -} - #if ENABLE_GUARDS static void _memory_guard_validate(void* p) { @@ -1698,7 +1686,7 @@ _memory_guard_validate(void* p) { #if ENABLE_GUARDS static void -_memory_guard_block(void* block, size_t size) { +_memory_guard_block(void* block) { if (block) { size_t block_size = _memory_usable_size(block); uint32_t* deadzone = block; @@ -1708,7 +1696,7 @@ _memory_guard_block(void* block, size_t size) { } } #define _memory_guard_pre_alloc(size) size += 32 -#define _memory_guard_post_alloc(block, size) _memory_guard_block(block, size); block = pointer_offset(block, 16); size -= 32 +#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 16); size -= 32 #else #define _memory_guard_pre_alloc(size) #define _memory_guard_post_alloc(block, size) From cdc39bffc8529bfab5fe5bf96f3cd4cf758a6352 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 7 Feb 2018 23:29:04 +0100 Subject: [PATCH 26/42] work in progress on reusing larger super spans --- rpmalloc/rpmalloc.c | 114 ++++++++++++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 35 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 468d9bda..211a231e 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -565,6 +565,18 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { #define SPAN_COUNT(flags) (1 + ((flags >> 9) & 0x7f)) #define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)((flags & 0xfe03) | ((uint16_t)(remains - 1) << 2))); assert(remains < 128) +static void +_memory_map_span_as_reserved(heap_t* heap, span_t* span, size_t reserved, size_t used) { + assert(reserved < 127); + heap->span_reserve = pointer_offset(span, used * _memory_span_size); + heap->span_reserve_master = span; + heap->spans_reserved = reserved - used; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved, used); +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, (int32_t)reserved); +#endif +} + //! Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* _memory_map_spans(heap_t* heap, size_t span_count) { @@ -584,19 +596,10 @@ _memory_map_spans(heap_t* heap, size_t span_count) { size_t align_offset = 0; span_t* span = _memory_map(request_spans * _memory_span_size, &align_offset); span->data.block.align_offset = (uint16_t)align_offset; - if (request_spans > span_count) { - assert(request_spans < 127); - heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); - heap->span_reserve_master = span; - heap->spans_reserved = request_spans - span_count; - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, request_spans, span_count); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)request_spans); -#endif - } - else { + if (request_spans > span_count) + _memory_map_span_as_reserved(heap, span, request_spans, span_count); + else span->flags = 0; - } return span; } @@ -649,6 +652,24 @@ _memory_unmap_span_list(span_t* span, size_t span_count) { } } +//! 
Make a span list out of a super span +static span_t* +_memory_span_list_make(span_t* master, size_t reserved, size_t used) { + master->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved, used); + span_t* head = pointer_offset(master, _memory_span_size); + span_t* span = head; + span_t* last = 0; + for (size_t ispan = 1; ispan < reserved; ++ispan) { + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, ispan, 1); + span->next_span = pointer_offset(span, _memory_span_size); + span->data.list.align_offset = 0; + last = span; + span = span->next_span; + } + last->next_span = 0; + return head; +} + //! Add span to head of single linked span list static size_t _memory_span_list_push(span_t** head, span_t* span) { @@ -881,6 +902,20 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { //Step 5: Try grab a span from the thread reserved spans if (!span && heap->spans_reserved) span = _memory_map_spans(heap, 1); +#if ENABLE_THREAD_CACHE + if (!span) { + //Step 6: Locate a larger span in thread cache + size_t span_count = 1; + for (; span_count < LARGE_CLASS_COUNT; ++span_count) { + if (heap->span_cache[span_count - 1]) { + span = _memory_span_list_pop(&heap->span_cache[span_count - 1]); + break; + } + } + if (span) + _memory_map_span_as_reserved(heap, span, span_count, 1); + } +#endif if (!span) { //Step 6: No span available in the thread cache, try grab a list of spans from the global cache heap->span_cache[0] = _memory_global_cache_extract(1); @@ -951,13 +986,35 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { #else _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); #endif - //Step 3: Try grab a span from the thread reserved spans if (!span && (heap->spans_reserved >= span_count)) span = _memory_map_spans(heap, span_count); - +#if ENABLE_THREAD_CACHE + if (!span) { + //Step 4: Locate a larger span in thread cache + size_t reserve_count = span_count + 1; + for (; reserve_count < LARGE_CLASS_COUNT; ++reserve_count) { + if (heap->span_cache[reserve_count - 1]) { + span = _memory_span_list_pop(&heap->span_cache[reserve_count - 1]); + break; + } + } + if (span) { + if (!heap->spans_reserved) { + _memory_map_span_as_reserved(heap, span, reserve_count, span_count); + } + else { + size_t remain_count = reserve_count - span_count; + span_t* span_list = _memory_span_list_make(span, reserve_count, span_count); + span->next_span = 0; + assert(!heap->span_cache[remain_count - 1]); + heap->span_cache[remain_count - 1] = span_list; + } + } + } +#endif #if ENABLE_GLOBAL_CACHE - //Step 4: Extract a list of spans from global cache + //Step 5: Extract a list of spans from global cache if (!span) { heap->span_cache[idx] = _memory_global_cache_extract(span_count); if (heap->span_cache[idx]) { @@ -969,7 +1026,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { } #endif if (!span) { - //Step 5: Map in more memory pages + //Step 6: Map in more memory pages span = _memory_map_spans(heap, span_count); } @@ -1115,27 +1172,14 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { if (heap->span_counter[idx].current_allocations) --heap->span_counter[idx].current_allocations; #endif - if (!heap->span_cache[0] && (span_count <= heap->span_counter[0].cache_limit) && !span->flags) { + if (!heap->spans_reserved && !span->flags) { //Break up as single span cache - span_t* master = span; - master->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, span_count, 1); - for (size_t ispan = 1; ispan < span_count; ++ispan) { - span->next_span = pointer_offset(span, _memory_span_size); - span = 
span->next_span; - span->data.list.align_offset = 0; - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, ispan, 1); - } - span->next_span = 0; - master->data.list.size = (uint32_t)span_count; - heap->span_cache[0] = master; -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)span_count); -#endif - } - else { - //Insert into cache list - _memory_heap_cache_insert(heap, span, span_count); + _memory_map_span_as_reserved(heap, span, span_count, 1); + span_count = 1; } + + //Insert into cache list + _memory_heap_cache_insert(heap, span, span_count); } //! Process pending deferred cross-thread deallocations From 4d97b38783158acada71bccb26c010fd2566e440 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 8 Feb 2018 15:22:39 +0100 Subject: [PATCH 27/42] work in progress --- rpmalloc/rpmalloc.c | 372 ++++++++++++++++++++++---------------------- 1 file changed, 185 insertions(+), 187 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 211a231e..77a7b63c 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -79,12 +79,12 @@ #ifndef ENABLE_STATISTICS //! Enable statistics collection -#define ENABLE_STATISTICS 0 +#define ENABLE_STATISTICS 1 #endif #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 0 +#define ENABLE_ASSERTS 1 #endif #ifndef ENABLE_PRELOAD @@ -94,7 +94,7 @@ #ifndef ENABLE_GUARDS //! Enable overwrite/underwrite guards -#define ENABLE_GUARDS 0 +#define ENABLE_GUARDS 1 #endif #if !ENABLE_THREAD_CACHE @@ -305,6 +305,8 @@ typedef struct span_list_t span_list_t; typedef union span_data_t span_data_t; //! Cache data typedef struct span_counter_t span_counter_t; +//! Global cache +typedef struct global_cache_t global_cache_t; #define SPAN_FLAG_MASTER 1 #define SPAN_FLAG_SUBSPAN 2 @@ -411,6 +413,13 @@ struct size_class_t { }; _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! ABA counter + atomic32_t counter; +}; + //! Configuration static rpmalloc_config_t _memory_config; @@ -422,7 +431,7 @@ static atomic32_t _memory_heap_id; #if ENABLE_GLOBAL_CACHE //! Global span cache -static atomicptr_t _memory_span_cache[LARGE_CLASS_COUNT]; +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif //! 
All heaps @@ -566,15 +575,30 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { #define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)((flags & 0xfe03) | ((uint16_t)(remains - 1) << 2))); assert(remains < 128) static void -_memory_map_span_as_reserved(heap_t* heap, span_t* span, size_t reserved, size_t used) { - assert(reserved < 127); - heap->span_reserve = pointer_offset(span, used * _memory_span_size); - heap->span_reserve_master = span; - heap->spans_reserved = reserved - used; - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved, used); +_memory_map_span_as_reserved(heap_t* heap, span_t* span, size_t reserved_count, size_t span_count) { + assert((reserved_count > 1) && (reserved_count < 127)); + assert(!heap->spans_reserved); + heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved = reserved_count - span_count; + if (!span->flags) { + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved_count, span_count); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)reserved); + atomic_add32(&_reserved_spans, (int32_t)reserved_count); #endif + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + uint16_t remains = SPAN_REMAINS(span->flags); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, span_count); + } + else { + uint16_t distance = SPAN_DISTANCE(span->flags); + heap->span_reserve_master = pointer_offset(span, -(int)distance * (int)_memory_span_size); + assert(SPAN_HAS_FLAG(heap->span_reserve_master->flags, SPAN_FLAG_MASTER)); + assert(SPAN_REMAINS(heap->span_reserve_master->flags) >= span_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + } } //! Map in memory pages for the given number of spans (or use previously reserved pages) @@ -595,11 +619,10 @@ _memory_map_spans(heap_t* heap, size_t span_count) { size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? span_count : _memory_config.span_map_count; size_t align_offset = 0; span_t* span = _memory_map(request_spans * _memory_span_size, &align_offset); + span->flags = 0; span->data.block.align_offset = (uint16_t)align_offset; if (request_spans > span_count) _memory_map_span_as_reserved(heap, span, request_spans, span_count); - else - span->flags = 0; return span; } @@ -654,20 +677,29 @@ _memory_unmap_span_list(span_t* span, size_t span_count) { //! 
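To make the master/subspan bookkeeping above easier to follow, here is a toy model of the invariant these patches maintain: the first span of a mapped run acts as master and counts how many spans of the run are still outstanding, each subspan records its distance back to the master, and the run can only be returned to the system once that count reaches zero. Plain structs stand in for real spans; all names are illustrative.

#include <assert.h>
#include <stddef.h>

enum { RUN_SPANS = 8 };

typedef struct toy_span_t {
	int    is_master;
	size_t distance;  /* subspan: spans back to the master         */
	size_t remains;   /* master: spans of the run not yet released */
} toy_span_t;

/* Release `count` spans starting at `index`; sets *run_released when the
   whole run may be unmapped (mirrors the remains bookkeeping above). */
static void
release_spans(toy_span_t* run, size_t index, size_t count, int* run_released) {
	toy_span_t* span = run + index;
	toy_span_t* master = span->is_master ? span : (span - span->distance);
	assert(master->is_master);
	assert(master->remains >= count);
	master->remains -= count;
	*run_released = (master->remains == 0);
}

int
main(void) {
	toy_span_t run[RUN_SPANS] = {{0}};
	run[0].is_master = 1;
	run[0].remains = RUN_SPANS;
	for (size_t ispan = 1; ispan < RUN_SPANS; ++ispan)
		run[ispan].distance = ispan;

	int released = 0;
	release_spans(run, 4, 4, &released);  /* free the 4-span subspan at offset 4 */
	assert(!released);
	release_spans(run, 0, 4, &released);  /* free the master's own 4 spans */
	assert(released);                     /* whole 8-span run can now be unmapped */
	return 0;
}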
Make a span list out of a super span static span_t* -_memory_span_list_make(span_t* master, size_t reserved, size_t used) { - master->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved, used); - span_t* head = pointer_offset(master, _memory_span_size); - span_t* span = head; - span_t* last = 0; - for (size_t ispan = 1; ispan < reserved; ++ispan) { - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, ispan, 1); - span->next_span = pointer_offset(span, _memory_span_size); - span->data.list.align_offset = 0; - last = span; - span = span->next_span; - } - last->next_span = 0; - return head; +_memory_span_split(span_t* span, size_t reserved_count, size_t span_count) { + uint16_t distance = 0; + if (!span->flags) { + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved_count, span_count); +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, (int32_t)reserved_count); +#endif + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= reserved_count); + assert(SPAN_COUNT(span->flags) == reserved_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, span_count); + } + else { + distance = SPAN_DISTANCE(span->flags); + assert(SPAN_COUNT(span->flags) == reserved_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + } + span_t* subspan = pointer_offset(span, span_count * _memory_span_size); + subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + span_count, reserved_count - span_count); + subspan->data.list.align_offset = 0; + return subspan; } //! Add span to head of single linked span list @@ -700,6 +732,8 @@ _memory_span_list_pop(span_t** head) { static span_t* _memory_span_list_split(span_t* span, size_t limit) { span_t* next = 0; + if (limit < 2) + limit = 2; if (span->data.list.size > limit) { count_t list_size = 1; span_t* last = span; @@ -748,53 +782,59 @@ _memory_span_list_doublelink_remove(span_t** head, span_t* span) { #if ENABLE_GLOBAL_CACHE -static atomic32_t _global_cache_counter; - //! Insert the given list of memory page spans in the global cache static void -_memory_cache_insert(atomicptr_t* cache, span_t* span, size_t span_count, size_t cache_limit) { - MEMORY_UNUSED(span_count); - MEMORY_UNUSED(cache_limit); +_memory_cache_insert(global_cache_t* cache, span_t* span, size_t span_count, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); + uintptr_t list_size = span->data.list.size; void* current_cache, *new_cache; do { - current_cache = atomic_load_ptr(cache); - span->prev_span = (void*)((uintptr_t)current_cache & _memory_span_mask); - new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&_global_cache_counter) & ~_memory_span_mask)); - } while (!atomic_cas_ptr(cache, new_cache, current_cache)); + current_cache = atomic_load_ptr(&cache->cache); + span->prev_span = current_cache; + uintptr_t prev_size = ((uintptr_t)current_cache & ~_memory_span_mask); + uintptr_t new_size = prev_size + list_size; + if ((new_size > cache_limit) || (new_size & _memory_span_mask)) { + _memory_unmap_span_list(span, span_count); + return; + } + new_cache = (void*)((uintptr_t)span | new_size); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); } //! 
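The span list helpers shown above cache the list length in the head element only, so push, pop and the overflow split stay O(1) without walking the list. A self-contained sketch of that idiom, with illustrative type names:

#include <assert.h>
#include <stddef.h>

typedef struct item_t {
	struct item_t* next;
	size_t list_size;  /* only meaningful while this item is the list head */
} item_t;

static size_t
list_push(item_t** head, item_t* item) {
	item->next = *head;
	item->list_size = *head ? ((*head)->list_size + 1) : 1;
	*head = item;
	return item->list_size;
}

static item_t*
list_pop(item_t** head) {
	item_t* item = *head;
	item_t* next = 0;
	if (item->list_size > 1) {
		next = item->next;
		next->list_size = item->list_size - 1;
	}
	*head = next;
	return item;
}

int
main(void) {
	item_t first = {0, 0}, second = {0, 0};
	item_t* head = 0;
	assert(list_push(&head, &first) == 1);
	assert(list_push(&head, &second) == 2);
	assert(list_pop(&head) == &second);
	assert(list_pop(&head) == &first);
	assert(head == 0);
	return 0;
}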
Extract a number of memory page spans from the global cache static span_t* -_memory_cache_extract(atomicptr_t* cache) { - span_t* span; +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; do { - span = 0; - void* global_span = atomic_load_ptr(cache); - uintptr_t span_ptr = (uintptr_t)global_span & _memory_span_mask; + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; if (span_ptr) { - span = (void*)span_ptr; - //By accessing the span ptr before it is swapped out of list we assume that a contenting thread + span_t* span = (void*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread //does not manage to traverse the span to being unmapped before we access it - void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&_global_cache_counter) & ~_memory_span_mask)); - if (atomic_cas_ptr(cache, new_cache, global_span)) - break; + //void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, span->prev_span, global_span)) + return span; } - } while (span); - return span; + } while (span_ptr); + return 0; } //! Finalize a global cache static void -_memory_cache_finalize(atomicptr_t* cache, size_t span_count) { - void* current_cache = atomic_load_ptr(cache); +_memory_cache_finalize(global_cache_t* cache, size_t span_count) { + void* current_cache = atomic_load_ptr(&cache->cache); span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask); while (span) { - span_t* skip_span = span->prev_span; + span_t* skip_span = (void*)((uintptr_t)span->prev_span & _memory_span_mask); + uintptr_t next_size = ((uintptr_t)span->prev_span & ~_memory_span_mask); + uintptr_t total_size = (uintptr_t)current_cache & ~_memory_span_mask; + current_cache = span->prev_span; + assert(total_size == ((int32_t)span->data.list.size + next_size)); _memory_unmap_span_list(span, span_count); span = skip_span; } - atomic_store_ptr(cache, 0); + atomic_store_ptr(&cache->cache, 0); } #endif @@ -822,6 +862,67 @@ _memory_global_cache_extract(size_t span_count) { #endif } +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) + return; + heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; +#endif +#else + MEMORY_UNUSED(heap); + span->data.list.size = 1; +#endif + _memory_global_cache_insert(span, span_count); +} + +//! 
Extract the given number of spans from the heap caches +static span_t* +_memory_heap_cache_extract(heap_t* heap, size_t span_count) { + size_t idx = span_count - 1; +#if ENABLE_THREAD_CACHE + if (heap->span_cache[idx]) + return _memory_span_list_pop(&heap->span_cache[idx]); +#endif + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); +#if ENABLE_THREAD_CACHE + span_t* span = 0; + for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { + if (heap->span_cache[idx]) { + span = _memory_span_list_pop(&heap->span_cache[idx]); + break; + } + } + if (span) { + size_t reserve_count = idx + 1; + + if (!heap->spans_reserved) { + _memory_map_span_as_reserved(heap, span, reserve_count, span_count); + } + else { + size_t remain_count = reserve_count - span_count; + span_t* subspan = _memory_span_split(span, reserve_count, span_count); + _memory_heap_cache_insert(heap, subspan, remain_count); + } + return span; + } +#endif + idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * span_count * _memory_span_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } + return 0; +} + //! Allocate a small/medium sized memory block from the given heap static void* _memory_allocate_from_heap(heap_t* heap, size_t size) { @@ -893,43 +994,9 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { goto use_active; } - span_t* span = 0; -#if ENABLE_THREAD_CACHE - //Step 4: Try grab a span from the thread cache - if (heap->span_cache[0]) - span = _memory_span_list_pop(&heap->span_cache[0]); -#endif - //Step 5: Try grab a span from the thread reserved spans - if (!span && heap->spans_reserved) - span = _memory_map_spans(heap, 1); -#if ENABLE_THREAD_CACHE - if (!span) { - //Step 6: Locate a larger span in thread cache - size_t span_count = 1; - for (; span_count < LARGE_CLASS_COUNT; ++span_count) { - if (heap->span_cache[span_count - 1]) { - span = _memory_span_list_pop(&heap->span_cache[span_count - 1]); - break; - } - } - if (span) - _memory_map_span_as_reserved(heap, span, span_count, 1); - } -#endif - if (!span) { - //Step 6: No span available in the thread cache, try grab a list of spans from the global cache - heap->span_cache[0] = _memory_global_cache_extract(1); - if (heap->span_cache[0]) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)heap->span_cache[0]->data.list.size * _memory_span_size; -#endif - span = _memory_span_list_pop(&heap->span_cache[0]); - } - } - if (!span) { - //Step 7: All caches empty, map in new memory pages + span_t* span = _memory_heap_cache_extract(heap, 1); + if (!span) span = _memory_map_spans(heap, 1); - } //Mark span as owned by this heap and set base data span->size_class = (uint16_t)class_idx; @@ -968,67 +1035,17 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { if (size & (_memory_span_size - 1)) ++span_count; size_t idx = span_count - 1; - - span_t* span = 0; + #if ENABLE_THREAD_CACHE - //Step 1: Check if cache for this large size class or the following has a span - while (!heap->span_cache[idx] && (idx < (LARGE_CLASS_COUNT - 1)) && (idx < (span_count + 1))) - ++idx; - if (!heap->span_cache[idx]) { - //Restore index, we're back to smallest fitting span count - idx = span_count - 1; - - //Step 2: Process deferred deallocation + if (!heap->span_cache[idx]) _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); - } - if (heap->span_cache[idx]) - span 
= _memory_span_list_pop(&heap->span_cache[idx]); #else _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); #endif - //Step 3: Try grab a span from the thread reserved spans - if (!span && (heap->spans_reserved >= span_count)) - span = _memory_map_spans(heap, span_count); -#if ENABLE_THREAD_CACHE - if (!span) { - //Step 4: Locate a larger span in thread cache - size_t reserve_count = span_count + 1; - for (; reserve_count < LARGE_CLASS_COUNT; ++reserve_count) { - if (heap->span_cache[reserve_count - 1]) { - span = _memory_span_list_pop(&heap->span_cache[reserve_count - 1]); - break; - } - } - if (span) { - if (!heap->spans_reserved) { - _memory_map_span_as_reserved(heap, span, reserve_count, span_count); - } - else { - size_t remain_count = reserve_count - span_count; - span_t* span_list = _memory_span_list_make(span, reserve_count, span_count); - span->next_span = 0; - assert(!heap->span_cache[remain_count - 1]); - heap->span_cache[remain_count - 1] = span_list; - } - } - } -#endif -#if ENABLE_GLOBAL_CACHE - //Step 5: Extract a list of spans from global cache - if (!span) { - heap->span_cache[idx] = _memory_global_cache_extract(span_count); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * span_count * _memory_span_size; -#endif - span = _memory_span_list_pop(&heap->span_cache[idx]); - } - } -#endif - if (!span) { - //Step 6: Map in more memory pages + + span_t* span = _memory_heap_cache_extract(heap, span_count); + if (!span) span = _memory_map_spans(heap, span_count); - } //Mark span as owned by this heap and set base data span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); @@ -1062,54 +1079,35 @@ _memory_allocate_heap(void) { } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); - if (heap) { - heap->next_orphan = 0; - return heap; - } + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + heap = _memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + memset(heap, 0, sizeof(heap_t)); + heap->align_offset = align_offset; - //Map in pages for a new heap - size_t align_offset = 0; - heap = _memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); - memset(heap, 0, sizeof(heap_t)); - heap->align_offset = align_offset; + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } while (!heap->id); - //Get a new heap ID - do { - heap->id = atomic_incr32(&_memory_heap_id); - if (_memory_heap_lookup(heap->id)) - heap->id = 0; + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); } - while (!heap->id); - //Link in heap in heap ID map - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } - while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + heap->span_counter[0].cache_limit = MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE; + for (size_t idx = 1; idx < LARGE_CLASS_COUNT; ++idx) + heap->span_counter[idx].cache_limit = MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE; return heap; } -//! 
Insert a single span into thread heap cache, releasing to global cache if overflow -static void -_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { -#if ENABLE_THREAD_CACHE - size_t idx = span_count - 1; - if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) - return; - heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; -#endif -#else - MEMORY_UNUSED(heap); - span->data.list.size = 1; -#endif - _memory_global_cache_insert(span, span_count); -} - //! Deallocate the given small/medium memory block from the given heap static void _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { @@ -1172,7 +1170,7 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { if (heap->span_counter[idx].current_allocations) --heap->span_counter[idx].current_allocations; #endif - if (!heap->spans_reserved && !span->flags) { + if (!heap->spans_reserved && (span_count > 1)) { //Break up as single span cache _memory_map_span_as_reserved(heap, span, span_count, 1); span_count = 1; @@ -1925,9 +1923,9 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { #endif #if ENABLE_GLOBAL_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~_memory_span_mask; - stats->cached += global_span_count * (iclass + 1) * _memory_span_size; + uintptr_t global_span_ptr = (uintptr_t)atomic_load_ptr(&_memory_span_cache[iclass].cache); + span_t* cache = (span_t*)(global_span_ptr & _memory_span_mask); + stats->cached += (size_t)(cache ? atomic_load32(&cache->heap_id) : 0) * (iclass + 1) * _memory_span_size; } #endif } From b032f526409cb39078d61ad6b45587176fad8567 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 8 Feb 2018 20:02:51 +0100 Subject: [PATCH 28/42] use external global cache size and aba counter --- rpmalloc/rpmalloc.c | 45 +++++++++++++++++++++++---------------------- test/main.c | 2 ++ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 77a7b63c..1de88cbe 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -159,17 +159,17 @@ // Atomic access abstraction ALIGNED_STRUCT(atomic32_t, 4) { - int32_t nonatomic; + volatile int32_t nonatomic; }; typedef struct atomic32_t atomic32_t; ALIGNED_STRUCT(atomic64_t, 8) { - int64_t nonatomic; + volatile int64_t nonatomic; }; typedef struct atomic64_t atomic64_t; ALIGNED_STRUCT(atomicptr_t, 8) { - void* nonatomic; + volatile void* nonatomic; }; typedef struct atomicptr_t atomicptr_t; @@ -205,7 +205,7 @@ atomic_add32(atomic32_t* val, int32_t add) { static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { - return src->nonatomic; + return (void*)((uintptr_t)src->nonatomic); } static FORCEINLINE void @@ -416,6 +416,8 @@ _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); struct global_cache_t { //! Cache list pointer atomicptr_t cache; + //! Cache size + atomic32_t size; //! ABA counter atomic32_t counter; }; @@ -673,6 +675,7 @@ _memory_unmap_span_list(span_t* span, size_t span_count) { _memory_unmap_spans(span, span_count); span = next_span; } + assert(!span); } //! 
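This patch gives global_cache_t an explicit size counter alongside the ABA counter, and the insert path in the following hunk accounts for a new list optimistically: it bumps the shared size first and rolls the addition back if the cache limit would be exceeded. A minimal sketch of that pattern follows; it uses a C11 atomic in place of the allocator's wrapper, and since atomic_fetch_add returns the previous value it adds list_size before comparing, where the patch compares the updated total directly.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int cache_size;

/* Try to account for list_size additional entries; returns false (after
   rolling the counter back) when the cache limit would be exceeded, in which
   case the caller releases the entries instead of caching them. */
static bool
cache_reserve(int list_size, int cache_limit) {
	if ((atomic_fetch_add(&cache_size, list_size) + list_size) > cache_limit) {
		atomic_fetch_add(&cache_size, -list_size);
		return false;
	}
	return true;
}

int
main(void) {
	assert(cache_reserve(48, 64));   /* fits: size becomes 48 */
	assert(!cache_reserve(32, 64));  /* 48 + 32 > 64: rolled back */
	assert(cache_reserve(16, 64));   /* fits again: size becomes 64 */
	assert(atomic_load(&cache_size) == 64);
	return 0;
}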
Make a span list out of a super span @@ -786,18 +789,17 @@ _memory_span_list_doublelink_remove(span_t** head, span_t* span) { static void _memory_cache_insert(global_cache_t* cache, span_t* span, size_t span_count, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); - uintptr_t list_size = span->data.list.size; + int32_t list_size = (int32_t)span->data.list.size; + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { + _memory_unmap_span_list(span, span_count); + atomic_add32(&cache->size, -list_size); + return; + } void* current_cache, *new_cache; do { current_cache = atomic_load_ptr(&cache->cache); - span->prev_span = current_cache; - uintptr_t prev_size = ((uintptr_t)current_cache & ~_memory_span_mask); - uintptr_t new_size = prev_size + list_size; - if ((new_size > cache_limit) || (new_size & _memory_span_mask)) { - _memory_unmap_span_list(span, span_count); - return; - } - new_cache = (void*)((uintptr_t)span | new_size); + span->prev_span = (void*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); } @@ -812,9 +814,11 @@ _memory_cache_extract(global_cache_t* cache) { span_t* span = (void*)span_ptr; //By accessing the span ptr before it is swapped out of list we assume that a contending thread //does not manage to traverse the span to being unmapped before we access it - //void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - if (atomic_cas_ptr(&cache->cache, span->prev_span, global_span)) + void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->data.list.size); return span; + } } } while (span_ptr); return 0; @@ -827,14 +831,13 @@ _memory_cache_finalize(global_cache_t* cache, size_t span_count) { span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask); while (span) { span_t* skip_span = (void*)((uintptr_t)span->prev_span & _memory_span_mask); - uintptr_t next_size = ((uintptr_t)span->prev_span & ~_memory_span_mask); - uintptr_t total_size = (uintptr_t)current_cache & ~_memory_span_mask; - current_cache = span->prev_span; - assert(total_size == ((int32_t)span->data.list.size + next_size)); + atomic_add32(&cache->size, -(int32_t)span->data.list.size); _memory_unmap_span_list(span, span_count); span = skip_span; } + assert(!atomic_load32(&cache->size)); atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); } #endif @@ -1923,9 +1926,7 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { #endif #if ENABLE_GLOBAL_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - uintptr_t global_span_ptr = (uintptr_t)atomic_load_ptr(&_memory_span_cache[iclass].cache); - span_t* cache = (span_t*)(global_span_ptr & _memory_span_mask); - stats->cached += (size_t)(cache ? 
atomic_load32(&cache->heap_id) : 0) * (iclass + 1) * _memory_span_size; + stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; } #endif } diff --git a/test/main.c b/test/main.c index b976e39f..28532dc7 100644 --- a/test/main.c +++ b/test/main.c @@ -658,6 +658,8 @@ test_run(int argc, char** argv) { # if defined(__IPHONE__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) # define NO_MAIN 1 # endif +#elif (defined(__linux__) || defined(__linux)) +# include #endif #if !defined(NO_MAIN) From 755550523151a030915bd99a81f369e6f686b320 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 9 Feb 2018 15:39:39 +0100 Subject: [PATCH 29/42] set span count in span flags for all spans --- rpmalloc/rpmalloc.c | 154 +++++++++++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 60 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 1de88cbe..13a20bb6 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -569,37 +569,48 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { _memory_config.memory_unmap(address, size, offset, release); } -#define SPAN_MAKE_FLAGS(flag, remdist, count) ((uint16_t)(flag | ((uint16_t)(remdist - 1) << 2) | ((uint16_t)(count - 1) << 9))); assert(flag < 4); assert(remdist < 128); assert(count < 128) -#define SPAN_HAS_FLAG(flags, flag) (flags & flag) -#define SPAN_DISTANCE(flags) (1 + ((flags >> 2) & 0x7f)) -#define SPAN_REMAINS(flags) (1 + ((flags >> 2) & 0x7f)) -#define SPAN_COUNT(flags) (1 + ((flags >> 9) & 0x7f)) -#define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)((flags & 0xfe03) | ((uint16_t)(remains - 1) << 2))); assert(remains < 128) +#define SPAN_MAKE_FLAGS(flags, remdist, count) ((uint16_t)((flags) | ((uint16_t)((remdist) - 1) << 2) | ((uint16_t)((count) - 1) << 9))); assert((flags) < 4); assert((remdist) && (remdist) < 128); assert((count) && (count) < 128) +#define SPAN_HAS_FLAG(flags, flag) ((flags) & (flag)) +#define SPAN_DISTANCE(flags) (1 + (((flags) >> 2) & 0x7f)) +#define SPAN_REMAINS(flags) (1 + (((flags) >> 2) & 0x7f)) +#define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) +#define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) + +static atomic32_t _created_masters; +static atomic32_t _created_subspans; +static atomic32_t _freed_masters; +static atomic32_t _freed_subspans; static void -_memory_map_span_as_reserved(heap_t* heap, span_t* span, size_t reserved_count, size_t span_count) { - assert((reserved_count > 1) && (reserved_count < 127)); +_memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { + size_t current_count = SPAN_COUNT(span->flags); + assert((current_count > 1) && (current_count < 127)); assert(!heap->spans_reserved); - heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved = reserved_count - span_count; - if (!span->flags) { + assert(SPAN_COUNT(span->flags) == current_count); + assert(current_count > use_count); + heap->span_reserve = pointer_offset(span, use_count * _memory_span_size); + heap->spans_reserved = current_count - use_count; + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { heap->span_reserve_master = span; - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved_count, span_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, 
use_count); + atomic_incr32(&_created_masters); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)reserved_count); + atomic_add32(&_reserved_spans, (int32_t)current_count); #endif } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); heap->span_reserve_master = span; - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, span_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); } - else { + else { //SPAN_FLAG_SUBSPAN uint16_t distance = SPAN_DISTANCE(span->flags); - heap->span_reserve_master = pointer_offset(span, -(int)distance * (int)_memory_span_size); - assert(SPAN_HAS_FLAG(heap->span_reserve_master->flags, SPAN_FLAG_MASTER)); - assert(SPAN_REMAINS(heap->span_reserve_master->flags) >= span_count); - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); + heap->span_reserve_master = master; + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + assert(SPAN_REMAINS(master->flags) >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); } } @@ -614,6 +625,7 @@ _memory_map_spans(heap_t* heap, size_t span_count) { uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); span->data.block.align_offset = 0; + atomic_incr32(&_created_subspans); return span; } @@ -621,17 +633,18 @@ _memory_map_spans(heap_t* heap, size_t span_count) { size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? span_count : _memory_config.span_map_count; size_t align_offset = 0; span_t* span = _memory_map(request_spans * _memory_span_size, &align_offset); - span->flags = 0; + span->flags = SPAN_MAKE_FLAGS(0, request_spans, request_spans); span->data.block.align_offset = (uint16_t)align_offset; if (request_spans > span_count) - _memory_map_span_as_reserved(heap, span, request_spans, span_count); + _memory_set_span_remainder_as_reserved(heap, span, span_count); return span; } //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_spans(span_t* span, size_t span_count) { - if (!span->flags) { +_memory_unmap_spans(span_t* span) { + size_t span_count = SPAN_COUNT(span->flags); + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { _memory_unmap(span, _memory_span_size * span_count, span->data.list.align_offset, 1); return; } @@ -652,6 +665,10 @@ _memory_unmap_spans(span_t* span, size_t span_count) { #if ENABLE_STATISTICS atomic_add32(&_reserved_spans, -(int32_t)span_count); #endif + atomic_incr32(&_freed_subspans); + } + else { + atomic_incr32(&_freed_masters); } if (!remains) { uint32_t master_span_count = SPAN_COUNT(master->flags); @@ -668,11 +685,11 @@ _memory_unmap_spans(span_t* span, size_t span_count) { //! Unmap a single linked list of spans static void -_memory_unmap_span_list(span_t* span, size_t span_count) { +_memory_unmap_span_list(span_t* span) { size_t list_size = span ? span->data.list.size : 0; for (size_t ispan = 0; ispan < list_size; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap_spans(span, span_count); + _memory_unmap_spans(span); span = next_span; } assert(!span); @@ -680,27 +697,26 @@ _memory_unmap_span_list(span_t* span, size_t span_count) { //! 
Make a span list out of a super span static span_t* -_memory_span_split(span_t* span, size_t reserved_count, size_t span_count) { +_memory_span_split(span_t* span, size_t use_count) { uint16_t distance = 0; - if (!span->flags) { - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, reserved_count, span_count); + size_t current_count = SPAN_COUNT(span->flags); + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)reserved_count); + atomic_add32(&_reserved_spans, (int32_t)current_count); #endif } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { uint16_t remains = SPAN_REMAINS(span->flags); - assert(remains >= reserved_count); - assert(SPAN_COUNT(span->flags) == reserved_count); - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, span_count); + assert(remains >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); } - else { + else { //SPAN_FLAG_SUBSPAN distance = SPAN_DISTANCE(span->flags); - assert(SPAN_COUNT(span->flags) == reserved_count); - span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); } - span_t* subspan = pointer_offset(span, span_count * _memory_span_size); - subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + span_count, reserved_count - span_count); + span_t* subspan = pointer_offset(span, use_count * _memory_span_size); + subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + use_count, current_count - use_count); subspan->data.list.align_offset = 0; return subspan; } @@ -747,8 +763,8 @@ _memory_span_list_split(span_t* span, size_t limit) { ++list_size; } last->next_span = 0; - if (next) - next->data.list.size = span->data.list.size - list_size; + assert(next); + next->data.list.size = span->data.list.size - list_size; span->data.list.size = list_size; span->prev_span = 0; } @@ -785,13 +801,22 @@ _memory_span_list_doublelink_remove(span_t** head, span_t* span) { #if ENABLE_GLOBAL_CACHE +static atomic32_t _cache_unmaps; +static atomic32_t _cache_unmaps_masters; +static atomic32_t _cache_unmaps_subspans; + //! Insert the given list of memory page spans in the global cache static void -_memory_cache_insert(global_cache_t* cache, span_t* span, size_t span_count, size_t cache_limit) { +_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); int32_t list_size = (int32_t)span->data.list.size; if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { - _memory_unmap_span_list(span, span_count); + atomic_incr32(&_cache_unmaps); + if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) + atomic_incr32(&_cache_unmaps_masters); + if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)) + atomic_incr32(&_cache_unmaps_subspans); + _memory_unmap_span_list(span); atomic_add32(&cache->size, -list_size); return; } @@ -826,13 +851,13 @@ _memory_cache_extract(global_cache_t* cache) { //! 
Finalize a global cache static void -_memory_cache_finalize(global_cache_t* cache, size_t span_count) { +_memory_cache_finalize(global_cache_t* cache) { void* current_cache = atomic_load_ptr(&cache->cache); span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask); while (span) { span_t* skip_span = (void*)((uintptr_t)span->prev_span & _memory_span_mask); atomic_add32(&cache->size, -(int32_t)span->data.list.size); - _memory_unmap_span_list(span, span_count); + _memory_unmap_span_list(span); span = skip_span; } assert(!atomic_load32(&cache->size)); @@ -844,12 +869,13 @@ _memory_cache_finalize(global_cache_t* cache, size_t span_count) { //! Insert the given list of memory page spans in the global cache static void -_memory_global_cache_insert(span_t* span, size_t span_count) { +_memory_global_cache_insert(span_t* span) { #if ENABLE_GLOBAL_CACHE + size_t span_count = SPAN_COUNT(span->flags); const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, span_count, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); #else _memory_unmap_span_list(span, span_count); #endif @@ -859,7 +885,9 @@ _memory_global_cache_insert(span_t* span, size_t span_count) { static span_t* _memory_global_cache_extract(size_t span_count) { #if ENABLE_GLOBAL_CACHE - return _memory_cache_extract(&_memory_span_cache[span_count - 1]); + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || (SPAN_COUNT(span->flags) == span_count)); + return span; #else return 0; #endif @@ -867,12 +895,14 @@ _memory_global_cache_extract(size_t span_count) { //! Insert a single span into thread heap cache, releasing to global cache if overflow static void -_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { +_memory_heap_cache_insert(heap_t* heap, span_t* span) { #if ENABLE_THREAD_CACHE + size_t span_count = SPAN_COUNT(span->flags); size_t idx = span_count - 1; if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) return; heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); + assert(span->data.list.size == heap->span_counter[idx].cache_limit); #if ENABLE_STATISTICS heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; #endif @@ -880,7 +910,7 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span, size_t span_count) { MEMORY_UNUSED(heap); span->data.list.size = 1; #endif - _memory_global_cache_insert(span, span_count); + _memory_global_cache_insert(span); } //! 
Extract the given number of spans from the heap caches @@ -902,16 +932,15 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { } } if (span) { - size_t reserve_count = idx + 1; - + assert(SPAN_COUNT(span->flags) > span_count); if (!heap->spans_reserved) { - _memory_map_span_as_reserved(heap, span, reserve_count, span_count); + _memory_set_span_remainder_as_reserved(heap, span, span_count); } else { - size_t remain_count = reserve_count - span_count; - span_t* subspan = _memory_span_split(span, reserve_count, span_count); - _memory_heap_cache_insert(heap, subspan, remain_count); + span_t* subspan = _memory_span_split(span, span_count); + _memory_heap_cache_insert(heap, subspan); } + assert(SPAN_COUNT(span->flags) == span_count); return span; } #endif @@ -1002,6 +1031,7 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { span = _memory_map_spans(heap, 1); //Mark span as owned by this heap and set base data + assert(SPAN_COUNT(span->flags) == 1); span->size_class = (uint16_t)class_idx; atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); @@ -1051,6 +1081,7 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { span = _memory_map_spans(heap, span_count); //Mark span as owned by this heap and set base data + assert(SPAN_COUNT(span->flags) == span_count); span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); @@ -1116,6 +1147,7 @@ static void _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //Check if span is the currently active span in order to operate //on the correct bookkeeping data + assert(SPAN_COUNT(span->flags) == 1); const count_t class_idx = span->size_class; size_class_t* size_class = _memory_size_class + class_idx; int is_active = (heap->active_span[class_idx] == span); @@ -1140,7 +1172,7 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { _memory_span_list_doublelink_remove(&heap->size_cache[class_idx], span); //Add to heap span cache - _memory_heap_cache_insert(heap, span, 1); + _memory_heap_cache_insert(heap, span); return; } @@ -1166,6 +1198,7 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { //Decrease counter size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; size_t span_count = idx + 1; + assert(SPAN_COUNT(span->flags) == span_count); assert(span->size_class >= SIZE_CLASS_COUNT); assert(idx < LARGE_CLASS_COUNT); #if ENABLE_THREAD_CACHE @@ -1175,12 +1208,12 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { #endif if (!heap->spans_reserved && (span_count > 1)) { //Break up as single span cache - _memory_map_span_as_reserved(heap, span, span_count, 1); + _memory_set_span_remainder_as_reserved(heap, span, 1); span_count = 1; } //Insert into cache list - _memory_heap_cache_insert(heap, span, span_count); + _memory_heap_cache_insert(heap, span); } //! 
Process pending deferred cross-thread deallocations @@ -1506,7 +1539,7 @@ rpmalloc_finalize(void) { //Free span caches #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_unmap_span_list(heap->span_cache[iclass], iclass + 1); + _memory_unmap_span_list(heap->span_cache[iclass]); #endif if (heap->spans_reserved) { @@ -1546,7 +1579,7 @@ rpmalloc_finalize(void) { //Free global caches #if ENABLE_GLOBAL_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_cache_finalize(&_memory_span_cache[iclass], iclass + 1); + _memory_cache_finalize(&_memory_span_cache[iclass]); #endif #if ENABLE_STATISTICS @@ -1591,8 +1624,9 @@ rpmalloc_thread_finalize(void) { const size_t span_count = iclass + 1; span_t* span = heap->span_cache[iclass]; while (span) { + assert(SPAN_COUNT(span->flags) == span_count); span_t* next = _memory_span_list_split(span, !iclass ? MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); - _memory_global_cache_insert(span, span_count); + _memory_global_cache_insert(span); span = next; } heap->span_cache[iclass] = 0; From 51f655ebcd2071ba2fc9c5b3096b58f1b419f465 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 9 Feb 2018 23:17:38 +0100 Subject: [PATCH 30/42] mark unmapped master spans --- rpmalloc/rpmalloc.c | 63 ++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 13a20bb6..dbc66b64 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -94,7 +94,7 @@ #ifndef ENABLE_GUARDS //! Enable overwrite/underwrite guards -#define ENABLE_GUARDS 1 +#define ENABLE_GUARDS 0 #endif #if !ENABLE_THREAD_CACHE @@ -576,14 +576,10 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { #define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) #define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) -static atomic32_t _created_masters; -static atomic32_t _created_subspans; -static atomic32_t _freed_masters; -static atomic32_t _freed_subspans; - static void _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { size_t current_count = SPAN_COUNT(span->flags); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); assert((current_count > 1) && (current_count < 127)); assert(!heap->spans_reserved); assert(SPAN_COUNT(span->flags) == current_count); @@ -593,7 +589,6 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { heap->span_reserve_master = span; span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); - atomic_incr32(&_created_masters); #if ENABLE_STATISTICS atomic_add32(&_reserved_spans, (int32_t)current_count); #endif @@ -612,6 +607,7 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co assert(SPAN_REMAINS(master->flags) >= current_count); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); } + assert((SPAN_COUNT(span->flags) + heap->spans_reserved) == current_count); } //! 
Map in memory pages for the given number of spans (or use previously reserved pages) @@ -625,7 +621,6 @@ _memory_map_spans(heap_t* heap, size_t span_count) { uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); span->data.block.align_offset = 0; - atomic_incr32(&_created_subspans); return span; } @@ -644,6 +639,7 @@ _memory_map_spans(heap_t* heap, size_t span_count) { static void _memory_unmap_spans(span_t* span) { size_t span_count = SPAN_COUNT(span->flags); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { _memory_unmap(span, _memory_span_size * span_count, span->data.list.align_offset, 1); return; @@ -654,29 +650,26 @@ _memory_unmap_spans(span_t* span) { uint32_t remains = SPAN_REMAINS(master->flags); assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); - assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && !SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); assert(remains >= span_count); remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); if (!is_master) { assert(span->data.list.align_offset == 0); - assert(span_count == SPAN_COUNT(span->flags)); _memory_unmap(span, span_count * _memory_span_size, 0, 0); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)span_count); -#endif - atomic_incr32(&_freed_subspans); } else { - atomic_incr32(&_freed_masters); + //Special double flag to denote an unmapped master + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; } - if (!remains) { - uint32_t master_span_count = SPAN_COUNT(master->flags); - assert(!is_master || (span_count == master_span_count)); - _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)master_span_count); + atomic_add32(&_reserved_spans, -(int32_t)span_count); #endif + + if (!remains) { + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); + span_count = SPAN_COUNT(master->flags); + _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); } else { SPAN_SET_REMAINS(master->flags, remains); @@ -700,6 +693,8 @@ static span_t* _memory_span_split(span_t* span, size_t use_count) { uint16_t distance = 0; size_t current_count = SPAN_COUNT(span->flags); + assert(current_count > use_count); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); #if ENABLE_STATISTICS @@ -1500,6 +1495,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { atomic_store32(&_memory_heap_id, 0); atomic_store32(&_memory_orphan_counter, 0); + atomic_store32(&_memory_active_heaps, 0); //Setup all small and medium size classes size_t iclass; @@ -1536,12 +1532,24 @@ rpmalloc_finalize(void) { while (heap) { _memory_deallocate_deferred(heap, 0); - //Free span caches + //Free span caches (other thread might have deferred after the thread using this heap finalized) #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) 
_memory_unmap_span_list(heap->span_cache[iclass]); #endif + heap = heap->next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { if (heap->spans_reserved) { span_t* span = heap->span_reserve; span_t* master = heap->span_reserve_master; @@ -1556,10 +1564,11 @@ rpmalloc_finalize(void) { remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved); if (!remains) { uint32_t master_span_count = SPAN_COUNT(master->flags); - _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); #if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)master_span_count); + if (!SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)) + atomic_add32(&_reserved_spans, -(int32_t)master_span_count); #endif + _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); } else { SPAN_SET_REMAINS(master->flags, remains); @@ -1576,12 +1585,6 @@ rpmalloc_finalize(void) { atomic_store_ptr(&_memory_orphan_heaps, 0); atomic_thread_fence_release(); - //Free global caches -#if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_cache_finalize(&_memory_span_cache[iclass]); -#endif - #if ENABLE_STATISTICS //If you hit these asserts you probably have memory leaks or double frees in your code assert(!atomic_load32(&_mapped_pages)); @@ -1597,7 +1600,7 @@ rpmalloc_finalize(void) { void rpmalloc_thread_initialize(void) { if (!get_thread_heap()) { - heap_t* heap = _memory_allocate_heap(); + heap_t* heap = _memory_allocate_heap(); #if ENABLE_STATISTICS heap->thread_to_global = 0; heap->global_to_thread = 0; From dea2a84dbf483c4b39a2f06b5b67fe726291ac83 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 10 Feb 2018 19:22:19 +0100 Subject: [PATCH 31/42] work in progress --- rpmalloc/rpmalloc.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index dbc66b64..95b0b8cc 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -641,7 +641,7 @@ _memory_unmap_spans(span_t* span) { size_t span_count = SPAN_COUNT(span->flags); assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { - _memory_unmap(span, _memory_span_size * span_count, span->data.list.align_offset, 1); + _memory_unmap(span, span_count * _memory_span_size, span->data.list.align_offset, 1); return; } @@ -657,19 +657,21 @@ _memory_unmap_spans(span_t* span) { if (!is_master) { assert(span->data.list.align_offset == 0); _memory_unmap(span, span_count * _memory_span_size, 0, 0); +#if ENABLE_STATISTICS + atomic_add32(&_reserved_spans, -(int32_t)span_count); +#endif } else { //Special double flag to denote an unmapped master span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; } -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)span_count); -#endif - if (!remains) { assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); span_count = SPAN_COUNT(master->flags); _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); +#if 
ENABLE_STATISTICS + atomic_add32(&_reserved_spans, -(int32_t)span_count); +#endif } else { SPAN_SET_REMAINS(master->flags, remains); @@ -679,7 +681,7 @@ _memory_unmap_spans(span_t* span) { //! Unmap a single linked list of spans static void _memory_unmap_span_list(span_t* span) { - size_t list_size = span ? span->data.list.size : 0; + size_t list_size = span->data.list.size; for (size_t ispan = 0; ispan < list_size; ++ispan) { span_t* next_span = span->next_span; _memory_unmap_spans(span); @@ -927,15 +929,19 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { } } if (span) { - assert(SPAN_COUNT(span->flags) > span_count); + size_t got_count = SPAN_COUNT(span->flags); + assert(got_count > span_count); + span_t* subspan = _memory_span_split(span, span_count); + assert((SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); + assert(SPAN_COUNT(span->flags) == span_count); if (!heap->spans_reserved) { - _memory_set_span_remainder_as_reserved(heap, span, span_count); + heap->spans_reserved = got_count - span_count; + heap->span_reserve = subspan; + heap->span_reserve_master = pointer_offset(subspan, -(int32_t)SPAN_DISTANCE(subspan->flags) * (int32_t)_memory_span_size); } else { - span_t* subspan = _memory_span_split(span, span_count); _memory_heap_cache_insert(heap, subspan); } - assert(SPAN_COUNT(span->flags) == span_count); return span; } #endif @@ -1565,8 +1571,7 @@ rpmalloc_finalize(void) { if (!remains) { uint32_t master_span_count = SPAN_COUNT(master->flags); #if ENABLE_STATISTICS - if (!SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)) - atomic_add32(&_reserved_spans, -(int32_t)master_span_count); + atomic_add32(&_reserved_spans, -(int32_t)master_span_count); #endif _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); } From 5de466ca41b92557ca80686fa32f4e421f93183c Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 10 Feb 2018 21:39:15 +0100 Subject: [PATCH 32/42] fix unmaps cross threads --- build/msvs/test.vcxproj | 4 ++ rpmalloc/rpmalloc.c | 90 +++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 22 deletions(-) diff --git a/build/msvs/test.vcxproj b/build/msvs/test.vcxproj index 512edd98..0a7b33f4 100644 --- a/build/msvs/test.vcxproj +++ b/build/msvs/test.vcxproj @@ -87,21 +87,25 @@ false ..\..\bin\windows\release\x86-64\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + rpmalloc-test false ..\..\bin\windows\debug\x86\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + rpmalloc-test false ..\..\bin\windows\debug\x86-64\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + rpmalloc-test false ..\..\bin\windows\release\x86\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + rpmalloc-test diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 95b0b8cc..3e4e7087 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -79,12 +79,12 @@ #ifndef ENABLE_STATISTICS //! Enable statistics collection -#define ENABLE_STATISTICS 1 +#define ENABLE_STATISTICS 0 #endif #ifndef ENABLE_ASSERTS //! Enable asserts -#define ENABLE_ASSERTS 1 +#define ENABLE_ASSERTS 0 #endif #ifndef ENABLE_PRELOAD @@ -369,8 +369,6 @@ struct span_counter_t { struct heap_t { //! Heap ID int32_t id; - //! Deferred deallocation - atomicptr_t defer_deallocate; //! Free count for each size class active span span_block_t active_block[SIZE_CLASS_COUNT]; //! Active span for each size class @@ -389,6 +387,10 @@ struct heap_t { span_t* span_reserve_master; //! 
Number of mapped but unused spans size_t spans_reserved; + //! Deferred deallocation + atomicptr_t defer_deallocate; + //! Deferred unmaps + atomicptr_t defer_unmap; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list @@ -635,9 +637,23 @@ _memory_map_spans(heap_t* heap, size_t span_count) { return span; } +static int +_memory_unmap_defer(int32_t heap_id, span_t* span) { + //Get the heap and link in pointer in list of deferred operations + heap_t* heap = _memory_heap_lookup(heap_id); + if (!heap) + return 0; + void* last_ptr; + do { + last_ptr = atomic_load_ptr(&heap->defer_unmap); + *(void**)pointer_offset(span, SPAN_HEADER_SIZE) = last_ptr; //Safe to use block, it's being deallocated + } while (!atomic_cas_ptr(&heap->defer_unmap, span, last_ptr)); + return 1; +} + //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_spans(span_t* span) { +_memory_unmap_spans(heap_t* heap, span_t* span) { size_t span_count = SPAN_COUNT(span->flags); assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { @@ -654,6 +670,12 @@ _memory_unmap_spans(span_t* span) { assert(remains >= span_count); remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); + //Check if we own the master span if we need to store remaining spans + int32_t master_heap_id = atomic_load32(&master->heap_id); + if (/*remains &&*/ heap && (master_heap_id != heap->id)) { + if (_memory_unmap_defer(master_heap_id, span)) + return; + } if (!is_master) { assert(span->data.list.align_offset == 0); _memory_unmap(span, span_count * _memory_span_size, 0, 0); @@ -678,13 +700,33 @@ _memory_unmap_spans(span_t* span) { } } +//! Process pending deferred cross-thread unmaps +static span_t* +_memory_unmap_deferred(heap_t* heap, size_t wanted_count) { + //Grab the current list of deferred unmaps + atomic_thread_fence_acquire(); + span_t* span = atomic_load_ptr(&heap->defer_unmap); + if (!span || !atomic_cas_ptr(&heap->defer_unmap, 0, span)) + return 0; + span_t* found_span = 0; + do { + void* next = *(void**)pointer_offset(span, SPAN_HEADER_SIZE); + if (!found_span && SPAN_COUNT(span->flags) == wanted_count) + found_span = span; + else + _memory_unmap_spans(heap, span); + span = next; + } while (span); + return found_span; +} + //! Unmap a single linked list of spans static void -_memory_unmap_span_list(span_t* span) { +_memory_unmap_span_list(heap_t* heap, span_t* span) { size_t list_size = span->data.list.size; for (size_t ispan = 0; ispan < list_size; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap_spans(span); + _memory_unmap_spans(heap, span); span = next_span; } assert(!span); @@ -804,7 +846,7 @@ static atomic32_t _cache_unmaps_subspans; //! 
Insert the given list of memory page spans in the global cache static void -_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { +_memory_cache_insert(heap_t* heap, global_cache_t* cache, span_t* span, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); int32_t list_size = (int32_t)span->data.list.size; if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { @@ -813,7 +855,7 @@ _memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { atomic_incr32(&_cache_unmaps_masters); if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)) atomic_incr32(&_cache_unmaps_subspans); - _memory_unmap_span_list(span); + _memory_unmap_span_list(heap, span); atomic_add32(&cache->size, -list_size); return; } @@ -854,7 +896,7 @@ _memory_cache_finalize(global_cache_t* cache) { while (span) { span_t* skip_span = (void*)((uintptr_t)span->prev_span & _memory_span_mask); atomic_add32(&cache->size, -(int32_t)span->data.list.size); - _memory_unmap_span_list(span); + _memory_unmap_span_list(0, span); span = skip_span; } assert(!atomic_load32(&cache->size)); @@ -866,13 +908,13 @@ _memory_cache_finalize(global_cache_t* cache) { //! Insert the given list of memory page spans in the global cache static void -_memory_global_cache_insert(span_t* span) { +_memory_global_cache_insert(heap_t* heap, span_t* span) { #if ENABLE_GLOBAL_CACHE size_t span_count = SPAN_COUNT(span->flags); const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); + _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); #else _memory_unmap_span_list(span, span_count); #endif @@ -907,7 +949,7 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { MEMORY_UNUSED(heap); span->data.list.size = 1; #endif - _memory_global_cache_insert(span); + _memory_global_cache_insert(heap, span); } //! 
Extract the given number of spans from the heap caches @@ -920,8 +962,10 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { #endif if (heap->spans_reserved >= span_count) return _memory_map_spans(heap, span_count); + span_t* span = _memory_unmap_deferred(heap, span_count); + if (span) + return span; #if ENABLE_THREAD_CACHE - span_t* span = 0; for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { if (heap->span_cache[idx]) { span = _memory_span_list_pop(&heap->span_cache[idx]); @@ -1223,9 +1267,7 @@ _memory_deallocate_deferred(heap_t* heap, size_t size_class) { //Grab the current list of deferred deallocations atomic_thread_fence_acquire(); void* p = atomic_load_ptr(&heap->defer_deallocate); - if (!p) - return 0; - if (!atomic_cas_ptr(&heap->defer_deallocate, 0, p)) + if (!p || !atomic_cas_ptr(&heap->defer_deallocate, 0, p)) return 0; //Keep track if we deallocate in the given size class int got_class = 0; @@ -1540,8 +1582,10 @@ rpmalloc_finalize(void) { //Free span caches (other thread might have deferred after the thread using this heap finalized) #if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_unmap_span_list(heap->span_cache[iclass]); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(0, heap->span_cache[iclass]); + } #endif heap = heap->next_heap; } @@ -1555,7 +1599,10 @@ rpmalloc_finalize(void) { for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + atomic_store_ptr(&_memory_heaps[list_idx], 0); while (heap) { + _memory_unmap_deferred(heap, 0); + if (heap->spans_reserved) { span_t* span = heap->span_reserve; span_t* master = heap->span_reserve_master; @@ -1584,8 +1631,6 @@ rpmalloc_finalize(void) { _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1); heap = next_heap; } - - atomic_store_ptr(&_memory_heaps[list_idx], 0); } atomic_store_ptr(&_memory_orphan_heaps, 0); atomic_thread_fence_release(); @@ -1625,6 +1670,7 @@ rpmalloc_thread_finalize(void) { atomic_add32(&_memory_active_heaps, -1); _memory_deallocate_deferred(heap, 0); + _memory_unmap_deferred(heap, 0); //Release thread cache spans back to global cache #if ENABLE_THREAD_CACHE @@ -1634,7 +1680,7 @@ rpmalloc_thread_finalize(void) { while (span) { assert(SPAN_COUNT(span->flags) == span_count); span_t* next = _memory_span_list_split(span, !iclass ? MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); - _memory_global_cache_insert(span); + _memory_global_cache_insert(0, span); span = next; } heap->span_cache[iclass] = 0; From 957f3ac8a19438e3f4f50f56840c27b93be40c96 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 10 Feb 2018 23:31:04 +0100 Subject: [PATCH 33/42] fix no global cache case --- rpmalloc/rpmalloc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 3e4e7087..d427f259 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -643,6 +643,7 @@ _memory_unmap_defer(int32_t heap_id, span_t* span) { heap_t* heap = _memory_heap_lookup(heap_id); if (!heap) return 0; + atomic_store32(&span->heap_id, heap_id); void* last_ptr; do { last_ptr = atomic_load_ptr(&heap->defer_unmap); @@ -672,7 +673,7 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { remains = ((uint32_t)span_count >= remains) ? 
0 : (remains - (uint32_t)span_count); //Check if we own the master span if we need to store remaining spans int32_t master_heap_id = atomic_load32(&master->heap_id); - if (/*remains &&*/ heap && (master_heap_id != heap->id)) { + if (heap && (master_heap_id != heap->id)) { if (_memory_unmap_defer(master_heap_id, span)) return; } @@ -711,10 +712,13 @@ _memory_unmap_deferred(heap_t* heap, size_t wanted_count) { span_t* found_span = 0; do { void* next = *(void**)pointer_offset(span, SPAN_HEADER_SIZE); - if (!found_span && SPAN_COUNT(span->flags) == wanted_count) + if (!found_span && SPAN_COUNT(span->flags) == wanted_count) { + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); found_span = span; - else + } + else { _memory_unmap_spans(heap, span); + } span = next; } while (span); return found_span; @@ -916,7 +920,7 @@ _memory_global_cache_insert(heap_t* heap, span_t* span) { const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); #else - _memory_unmap_span_list(span, span_count); + _memory_unmap_span_list(heap, span); #endif } @@ -1601,8 +1605,6 @@ rpmalloc_finalize(void) { heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); atomic_store_ptr(&_memory_heaps[list_idx], 0); while (heap) { - _memory_unmap_deferred(heap, 0); - if (heap->spans_reserved) { span_t* span = heap->span_reserve; span_t* master = heap->span_reserve_master; @@ -1627,6 +1629,8 @@ rpmalloc_finalize(void) { } } + _memory_unmap_deferred(heap, 0); + heap_t* next_heap = heap->next_heap; _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1); heap = next_heap; From 48d61762d9e9197d8c9c822f190243cd25040ba2 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 11 Feb 2018 21:33:27 +0100 Subject: [PATCH 34/42] correctly handle remainder in master spans --- rpmalloc/rpmalloc.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index d427f259..fc995b75 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -589,6 +589,8 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co heap->span_reserve = pointer_offset(span, use_count * _memory_span_size); heap->spans_reserved = current_count - use_count; if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); heap->span_reserve_master = span; span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); #if ENABLE_STATISTICS @@ -596,6 +598,7 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co #endif } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + assert(atomic_load32(&span->heap_id) == heap->id); uint16_t remains = SPAN_REMAINS(span->flags); assert(remains >= current_count); heap->span_reserve_master = span; @@ -647,7 +650,7 @@ _memory_unmap_defer(int32_t heap_id, span_t* span) { void* last_ptr; do { last_ptr = atomic_load_ptr(&heap->defer_unmap); - *(void**)pointer_offset(span, SPAN_HEADER_SIZE) = last_ptr; //Safe to use block, it's being deallocated + span->next_span = last_ptr; } while (!atomic_cas_ptr(&heap->defer_unmap, span, last_ptr)); return 1; } @@ -664,13 +667,10 @@ 
_memory_unmap_spans(heap_t* heap, span_t* span) { uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); span_t* master = is_master ? span : (pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); - uint32_t remains = SPAN_REMAINS(master->flags); assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); - assert(remains >= span_count); - remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); //Check if we own the master span if we need to store remaining spans int32_t master_heap_id = atomic_load32(&master->heap_id); if (heap && (master_heap_id != heap->id)) { @@ -688,6 +688,10 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { //Special double flag to denote an unmapped master span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; } + + uint32_t remains = SPAN_REMAINS(master->flags); + assert(remains >= span_count); + remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); if (!remains) { assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); span_count = SPAN_COUNT(master->flags); @@ -711,13 +715,19 @@ _memory_unmap_deferred(heap_t* heap, size_t wanted_count) { return 0; span_t* found_span = 0; do { - void* next = *(void**)pointer_offset(span, SPAN_HEADER_SIZE); - if (!found_span && SPAN_COUNT(span->flags) == wanted_count) { - assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); - found_span = span; - } - else { - _memory_unmap_spans(heap, span); + void* next = span->next_span; + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + int32_t master_heap_id = atomic_load32(&master->heap_id); + if ((atomic_load32(&span->heap_id) == master_heap_id) || + !_memory_unmap_defer(master_heap_id, span)) { + if (!found_span && SPAN_COUNT(span->flags) == wanted_count) { + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + found_span = span; + } + else { + _memory_unmap_spans(heap, span); + } } span = next; } while (span); @@ -738,18 +748,21 @@ _memory_unmap_span_list(heap_t* heap, span_t* span) { //! 
Make a span list out of a super span static span_t* -_memory_span_split(span_t* span, size_t use_count) { +_memory_span_split(heap_t* heap, span_t* span, size_t use_count) { uint16_t distance = 0; size_t current_count = SPAN_COUNT(span->flags); assert(current_count > use_count); assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); #if ENABLE_STATISTICS atomic_add32(&_reserved_spans, (int32_t)current_count); #endif } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + assert(atomic_load32(&span->heap_id) == heap->id); uint16_t remains = SPAN_REMAINS(span->flags); assert(remains >= current_count); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); @@ -979,7 +992,9 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { if (span) { size_t got_count = SPAN_COUNT(span->flags); assert(got_count > span_count); - span_t* subspan = _memory_span_split(span, span_count); + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + span_t* subspan = _memory_span_split(heap, span, span_count); assert((SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); assert(SPAN_COUNT(span->flags) == span_count); if (!heap->spans_reserved) { From 720668a80db8c67c06e2901a7fcb479f244364ba Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 11 Feb 2018 22:32:21 +0100 Subject: [PATCH 35/42] cleanups, documentation and formatting --- rpmalloc/rpmalloc.c | 275 ++++++++++++++++++++++++++++---------------- 1 file changed, 173 insertions(+), 102 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index fc995b75..3763ad20 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -13,50 +13,6 @@ // Build time configurable limits -#ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited cache disables any cache limitations -#define ENABLE_UNLIMITED_CACHE 0 -#endif - -#ifndef ENABLE_SPACE_PRIORITY_CACHE -//! Minimize overhead -#define ENABLE_SPACE_PRIORITY_CACHE 0 -#endif - -// Presets for cache limits -#if ENABLE_SPACE_PRIORITY_CACHE -// Space priority cache limits -#define MIN_SPAN_CACHE_SIZE 8 -#define MIN_SPAN_CACHE_RELEASE 8 -#define MAX_SPAN_CACHE_DIVISOR 16 -#define MIN_LARGE_SPAN_CACHE_SIZE 2 -#define MIN_LARGE_SPAN_CACHE_RELEASE 2 -#define MAX_LARGE_SPAN_CACHE_DIVISOR 32 -#define GLOBAL_CACHE_MULTIPLIER 1 -#define DEFAULT_SPAN_MAP_COUNT 4 -#else -// Default - performance priority cache limits -//! Minimum cache size to remain after a release to global cache -#define MIN_SPAN_CACHE_SIZE 64 -//! Minimum number of spans to transfer between thread and global cache -#define MIN_SPAN_CACHE_RELEASE 16 -//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) -#define MAX_SPAN_CACHE_DIVISOR 4 -//! Minimum cache size to remain after a release to global cache, large spans -#define MIN_LARGE_SPAN_CACHE_SIZE 8 -//! Minimum number of spans to transfer between thread and global cache, large spans -#define MIN_LARGE_SPAN_CACHE_RELEASE 4 -//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) -#define MAX_LARGE_SPAN_CACHE_DIVISOR 16 -//! 
Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) -#define GLOBAL_CACHE_MULTIPLIER 8 -#endif - -#ifndef DEFAULT_SPAN_MAP_COUNT -//! Default number of spans to map in call to map more virtual memory -#define DEFAULT_SPAN_MAP_COUNT 8 -#endif - #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 79 @@ -67,7 +23,10 @@ #define ENABLE_THREAD_CACHE 1 #endif -#ifndef ENABLE_GLOBAL_CACHE +#if !ENABLE_THREAD_CACHE +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +#elif !defined(ENABLE_GLOBAL_CACHE) //! Enable global cache shared between all threads, requires thread cache #define ENABLE_GLOBAL_CACHE 1 #endif @@ -97,9 +56,57 @@ #define ENABLE_GUARDS 0 #endif -#if !ENABLE_THREAD_CACHE -# undef ENABLE_GLOBAL_CACHE -# define ENABLE_GLOBAL_CACHE 0 +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory +#define DEFAULT_SPAN_MAP_COUNT 8 +#endif + +// Presets for cache limits +#if ENABLE_THREAD_CACHE + +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations +#define ENABLE_UNLIMITED_CACHE 0 +#endif + +#ifndef ENABLE_SPACE_PRIORITY_CACHE +//! Minimize overhead +#define ENABLE_SPACE_PRIORITY_CACHE 0 +#endif + +#if ENABLE_SPACE_PRIORITY_CACHE +// Space priority cache limits +#define MIN_SPAN_CACHE_SIZE 8 +#define MIN_SPAN_CACHE_RELEASE 8 +#define MAX_SPAN_CACHE_DIVISOR 16 +#define MIN_LARGE_SPAN_CACHE_SIZE 2 +#define MIN_LARGE_SPAN_CACHE_RELEASE 2 +#define MAX_LARGE_SPAN_CACHE_DIVISOR 32 +#if ENABLE_GLOBAL_CACHE +#define GLOBAL_CACHE_MULTIPLIER 1 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +#define DEFAULT_SPAN_MAP_COUNT 4 +#endif +#else +// Default - performance priority cache limits +//! Minimum cache size to remain after a release to global cache +#define MIN_SPAN_CACHE_SIZE 64 +//! Minimum number of spans to transfer between thread and global cache +#define MIN_SPAN_CACHE_RELEASE 16 +//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) +#define MAX_SPAN_CACHE_DIVISOR 4 +//! Minimum cache size to remain after a release to global cache, large spans +#define MIN_LARGE_SPAN_CACHE_SIZE 8 +//! Minimum number of spans to transfer between thread and global cache, large spans +#define MIN_LARGE_SPAN_CACHE_RELEASE 4 +//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) +#define MAX_LARGE_SPAN_CACHE_DIVISOR 16 +//! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) +#if ENABLE_GLOBAL_CACHE +#define GLOBAL_CACHE_MULTIPLIER 8 +#endif +#endif #endif // Platform and arch specifics @@ -221,7 +228,7 @@ atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; # else return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, - (long)val, (long)ref) == (long)ref) ? 1 : 0; + (long)val, (long)ref) == (long)ref) ? 1 : 0; # endif #else return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); @@ -297,9 +304,9 @@ typedef struct heap_t heap_t; typedef struct span_t span_t; //! Size class definition typedef struct size_class_t size_class_t; -//! Span block bookkeeping +//! Span block bookkeeping typedef struct span_block_t span_block_t; -//! Span list bookkeeping +//! Span list bookkeeping typedef struct span_list_t span_list_t; //! 
Span data union, usage depending on span state typedef union span_data_t span_data_t; @@ -308,12 +315,13 @@ typedef struct span_counter_t span_counter_t; //! Global cache typedef struct global_cache_t global_cache_t; +//! Flag indicating span is the first (master) span of a split superspan #define SPAN_FLAG_MASTER 1 +//! Flag indicating span is a secondary (sub) span of a split superspan #define SPAN_FLAG_SUBSPAN 2 -//Alignment offset must match in both structures -//to keep the data when transitioning between being -//used for blocks and being part of a list +//Alignment offset must match in both structures to keep the data when +//transitioning between being used for blocks and being part of a list struct span_block_t { //! Free list uint16_t free_list; @@ -341,11 +349,22 @@ union span_data_t { span_list_t list; }; +//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +//span or a super span. A super span can further be diviced into multiple spans (or this, super spans), where the first +//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +//in the same call to release the virtual memory range, but individual subranges can be decommitted individually +//to reduce physical memory use). struct span_t { //! Heap ID atomic32_t heap_id; //! Size class uint16_t size_class; + // TODO: If we could store remainder part of flags as an atomic counter, the entire check + // if master is owned by calling heap could be simplified to an atomic dec from any thread + // since remainder of a split super span only ever decreases, never increases //! Flags and counters uint16_t flags; //! Span data @@ -357,6 +376,7 @@ struct span_t { }; _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); +//Adaptive cache counter of a single superspan span count struct span_counter_t { //! Allocation high water mark uint32_t max_allocations; @@ -484,6 +504,7 @@ static pthread_key_t _memory_thread_heap; static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; #endif +//! Get the current thread heap static FORCEINLINE heap_t* get_thread_heap(void) { #if defined(__APPLE__) && ENABLE_PRELOAD @@ -493,6 +514,7 @@ get_thread_heap(void) { #endif } +//! Set the current thread heap static void set_thread_heap(heap_t* heap) { #if defined(__APPLE__) && ENABLE_PRELOAD @@ -502,12 +524,15 @@ set_thread_heap(heap_t* heap) { #endif } +//! Default implementation to map more virtual memory static void* _memory_map_os(size_t size, size_t* offset); +//! Default implementation to unmap virtual memory static void _memory_unmap_os(void* address, size_t size, size_t offset, int release); +//! Deallocate any deferred blocks and check for the given size class static int _memory_deallocate_deferred(heap_t* heap, size_t size_class); @@ -548,6 +573,7 @@ _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size # define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) #endif +//! 
Map more virtual memory static void* _memory_map(size_t size, size_t* offset) { #if ENABLE_STATISTICS @@ -559,6 +585,7 @@ _memory_map(size_t size, size_t* offset) { return _memory_config.memory_map(size, offset); } +//! Unmap virtual memory static void _memory_unmap(void* address, size_t size, size_t offset, int release) { #if ENABLE_STATISTICS @@ -571,24 +598,34 @@ _memory_unmap(void* address, size_t size, size_t offset, int release) { _memory_config.memory_unmap(address, size, offset, release); } +//! Make flags field in a span from flags, remainder/distance and count #define SPAN_MAKE_FLAGS(flags, remdist, count) ((uint16_t)((flags) | ((uint16_t)((remdist) - 1) << 2) | ((uint16_t)((count) - 1) << 9))); assert((flags) < 4); assert((remdist) && (remdist) < 128); assert((count) && (count) < 128) +//! Check if span has any of the given flags #define SPAN_HAS_FLAG(flags, flag) ((flags) & (flag)) +//! Get the distance from flags field #define SPAN_DISTANCE(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the remainder from flags field #define SPAN_REMAINS(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the count from flags field #define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) +//! Set the remainder in the flags field (MUST be done from the owner heap thread) #define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) +//! Resize the given super span to the given count of spans, store the remainder in the heap reserved spans fields static void _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { size_t current_count = SPAN_COUNT(span->flags); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); assert((current_count > 1) && (current_count < 127)); assert(!heap->spans_reserved); assert(SPAN_COUNT(span->flags) == current_count); assert(current_count > use_count); + heap->span_reserve = pointer_offset(span, use_count * _memory_span_size); heap->spans_reserved = current_count - use_count; if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //We must store the heap id before setting as master, to force unmaps to defer to this heap thread atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); heap->span_reserve_master = span; @@ -598,6 +635,7 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co #endif } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only owner heap thread can modify a master span assert(atomic_load32(&span->heap_id) == heap->id); uint16_t remains = SPAN_REMAINS(span->flags); assert(remains >= current_count); @@ -605,6 +643,7 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); } else { //SPAN_FLAG_SUBSPAN + //Resizing a subspan is a safe operation in any thread uint16_t distance = SPAN_DISTANCE(span->flags); span_t* master = pointer_offset(span, -(int)distance * (int)_memory_span_size); heap->span_reserve_master = master; @@ -622,7 +661,7 @@ _memory_map_spans(heap_t* heap, size_t span_count) { span_t* span = heap->span_reserve; heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); heap->spans_reserved -= span_count; - //set flag in span that it is a subspan with a master span + //Declare the span to be a subspan with given distance from master span uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, 
heap->span_reserve_master) >> _memory_span_size_shift); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); span->data.block.align_offset = 0; @@ -635,11 +674,14 @@ _memory_map_spans(heap_t* heap, size_t span_count) { span_t* span = _memory_map(request_spans * _memory_span_size, &align_offset); span->flags = SPAN_MAKE_FLAGS(0, request_spans, request_spans); span->data.block.align_offset = (uint16_t)align_offset; - if (request_spans > span_count) + if (request_spans > span_count) { + //We have extra spans, store them as reserved spans in heap _memory_set_span_remainder_as_reserved(heap, span, span_count); + } return span; } +//! Defer unmapping of the given span to the owner heap static int _memory_unmap_defer(int32_t heap_id, span_t* span) { //Get the heap and link in pointer in list of deferred operations @@ -660,6 +702,7 @@ static void _memory_unmap_spans(heap_t* heap, span_t* span) { size_t span_count = SPAN_COUNT(span->flags); assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + //A plain run of spans can be unmapped directly if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { _memory_unmap(span, span_count * _memory_span_size, span->data.list.align_offset, 1); return; @@ -671,13 +714,14 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); - //Check if we own the master span if we need to store remaining spans + //Check if we own the master span, otherwise defer (only owner of master span can modify remainder field) int32_t master_heap_id = atomic_load32(&master->heap_id); if (heap && (master_heap_id != heap->id)) { if (_memory_unmap_defer(master_heap_id, span)) return; } if (!is_master) { + //Directly unmap subspans assert(span->data.list.align_offset == 0); _memory_unmap(span, span_count * _memory_span_size, 0, 0); #if ENABLE_STATISTICS @@ -686,13 +730,15 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { } else { //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; } - + //We are in owner thread of the master span uint32_t remains = SPAN_REMAINS(master->flags); assert(remains >= span_count); remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); if (!remains) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); span_count = SPAN_COUNT(master->flags); _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); @@ -701,6 +747,7 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { #endif } else { + //Set remaining spans SPAN_SET_REMAINS(master->flags, remains); } } @@ -715,17 +762,19 @@ _memory_unmap_deferred(heap_t* heap, size_t wanted_count) { return 0; span_t* found_span = 0; do { + //Verify that we own the master span, otherwise re-defer to owner void* next = span->next_span; - uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); - span_t* master = is_master ? 
span : (pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); - int32_t master_heap_id = atomic_load32(&master->heap_id); - if ((atomic_load32(&span->heap_id) == master_heap_id) || - !_memory_unmap_defer(master_heap_id, span)) { - if (!found_span && SPAN_COUNT(span->flags) == wanted_count) { - assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); - found_span = span; - } - else { + if (!found_span && SPAN_COUNT(span->flags) == wanted_count) { + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + found_span = span; + } + else { + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + int32_t master_heap_id = atomic_load32(&master->heap_id); + if ((atomic_load32(&span->heap_id) == master_heap_id) || + !_memory_unmap_defer(master_heap_id, span)) { + //We own the master span (or heap merged and abandoned) _memory_unmap_spans(heap, span); } } @@ -746,7 +795,9 @@ _memory_unmap_span_list(heap_t* heap, span_t* span) { assert(!span); } -//! Make a span list out of a super span +#if ENABLE_THREAD_CACHE + +//! Split a super span in two static span_t* _memory_span_split(heap_t* heap, span_t* span, size_t use_count) { uint16_t distance = 0; @@ -754,6 +805,7 @@ _memory_span_split(heap_t* heap, span_t* span, size_t use_count) { assert(current_count > use_count); assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //Must store heap in master span before use, to avoid issues when unmapping subspans atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); @@ -762,6 +814,7 @@ _memory_span_split(heap_t* heap, span_t* span, size_t use_count) { #endif } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only valid to call on master span if we own it assert(atomic_load32(&span->heap_id) == heap->id); uint16_t remains = SPAN_REMAINS(span->flags); assert(remains >= current_count); @@ -771,6 +824,7 @@ _memory_span_split(heap_t* heap, span_t* span, size_t use_count) { distance = SPAN_DISTANCE(span->flags); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); } + //Setup remainder as a subspan span_t* subspan = pointer_offset(span, use_count * _memory_span_size); subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + use_count, current_count - use_count); subspan->data.list.align_offset = 0; @@ -827,6 +881,8 @@ _memory_span_list_split(span_t* span, size_t limit) { return next; } +#endif + //! Add a span to a double linked list static void _memory_span_list_doublelink_add(span_t** head, span_t* span) { @@ -857,21 +913,13 @@ _memory_span_list_doublelink_remove(span_t** head, span_t* span) { #if ENABLE_GLOBAL_CACHE -static atomic32_t _cache_unmaps; -static atomic32_t _cache_unmaps_masters; -static atomic32_t _cache_unmaps_subspans; - //! 
Insert the given list of memory page spans in the global cache static void _memory_cache_insert(heap_t* heap, global_cache_t* cache, span_t* span, size_t cache_limit) { assert((span->data.list.size == 1) || (span->next_span != 0)); int32_t list_size = (int32_t)span->data.list.size; + //Unmap if cache has reached the limit if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { - atomic_incr32(&_cache_unmaps); - if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) - atomic_incr32(&_cache_unmaps_masters); - if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)) - atomic_incr32(&_cache_unmaps_subspans); _memory_unmap_span_list(heap, span); atomic_add32(&cache->size, -list_size); return; @@ -905,7 +953,7 @@ _memory_cache_extract(global_cache_t* cache) { return 0; } -//! Finalize a global cache +//! Finalize a global cache, only valid from allocator finalization (not thread safe) static void _memory_cache_finalize(global_cache_t* cache) { void* current_cache = atomic_load_ptr(&cache->cache); @@ -921,34 +969,27 @@ _memory_cache_finalize(global_cache_t* cache) { atomic_store32(&cache->size, 0); } -#endif - //! Insert the given list of memory page spans in the global cache static void _memory_global_cache_insert(heap_t* heap, span_t* span) { -#if ENABLE_GLOBAL_CACHE + //Calculate adaptive limits size_t span_count = SPAN_COUNT(span->flags); const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); -#else - _memory_unmap_span_list(heap, span); -#endif } //! Extract a number of memory page spans from the global cache for large blocks static span_t* _memory_global_cache_extract(size_t span_count) { -#if ENABLE_GLOBAL_CACHE span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); assert(!span || (SPAN_COUNT(span->flags) == span_count)); return span; -#else - return 0; -#endif } +#endif + //! Insert a single span into thread heap cache, releasing to global cache if overflow static void _memory_heap_cache_insert(heap_t* heap, span_t* span) { @@ -966,23 +1007,31 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { MEMORY_UNUSED(heap); span->data.list.size = 1; #endif +#if ENABLE_GLOBAL_CACHE _memory_global_cache_insert(heap, span); +#else + _memory_unmap_span_list(heap, span); +#endif } -//! Extract the given number of spans from the heap caches +//! 
Extract the given number of spans from the different cache levels static span_t* _memory_heap_cache_extract(heap_t* heap, size_t span_count) { - size_t idx = span_count - 1; #if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + //Step 1: check thread cache if (heap->span_cache[idx]) return _memory_span_list_pop(&heap->span_cache[idx]); #endif + //Step 2: Check reserved spans if (heap->spans_reserved >= span_count) return _memory_map_spans(heap, span_count); + //Step 3: Try processing deferred unmappings span_t* span = _memory_unmap_deferred(heap, span_count); if (span) return span; #if ENABLE_THREAD_CACHE + //Step 4: Check larger super spans and split if we find one for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { if (heap->span_cache[idx]) { span = _memory_span_list_pop(&heap->span_cache[idx]); @@ -990,10 +1039,13 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { } } if (span) { + //Mark the span as owned by this heap before splitting size_t got_count = SPAN_COUNT(span->flags); assert(got_count > span_count); atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); + + //Split the span and store as reserved if no previously reserved spans, or in thread cache otherwise span_t* subspan = _memory_span_split(heap, span, span_count); assert((SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); assert(SPAN_COUNT(span->flags) == span_count); @@ -1007,7 +1059,8 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { } return span; } -#endif +#if ENABLE_GLOBAL_CACHE + //Step 5: Extract from global cache idx = span_count - 1; heap->span_cache[idx] = _memory_global_cache_extract(span_count); if (heap->span_cache[idx]) { @@ -1016,6 +1069,8 @@ _memory_heap_cache_extract(heap_t* heap, size_t span_count) { #endif return _memory_span_list_pop(&heap->span_cache[idx]); } +#endif +#endif return 0; } @@ -1024,8 +1079,8 @@ static void* _memory_allocate_from_heap(heap_t* heap, size_t size) { //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? - ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : - SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); + ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : + SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); assert(!base_idx || ((base_idx - 1) < SIZE_CLASS_COUNT)); const size_t class_idx = _memory_size_class[base_idx ? 
(base_idx - 1) : 0].class_idx; @@ -1090,9 +1145,12 @@ _memory_allocate_from_heap(heap_t* heap, size_t size) { goto use_active; } + //Step 4: Find a span in one of the cache levels span_t* span = _memory_heap_cache_extract(heap, 1); - if (!span) + if (!span) { + //Step 5: Map in more virtual memory span = _memory_map_spans(heap, 1); + } //Mark span as owned by this heap and set base data assert(SPAN_COUNT(span->flags) == 1); @@ -1139,10 +1197,12 @@ _memory_allocate_large_from_heap(heap_t* heap, size_t size) { #else _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); #endif - + //Step 1: Find span in one of the cache levels span_t* span = _memory_heap_cache_extract(heap, span_count); - if (!span) + if (!span) { + //Step 2: Map in more virtual memory span = _memory_map_spans(heap, span_count); + } //Mark span as owned by this heap and set base data assert(SPAN_COUNT(span->flags) == span_count); @@ -1199,9 +1259,15 @@ _memory_allocate_heap(void) { } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); } +#if ENABLE_THREAD_CACHE heap->span_counter[0].cache_limit = MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE; for (size_t idx = 1; idx < LARGE_CLASS_COUNT; ++idx) heap->span_counter[idx].cache_limit = MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE; +#endif + + //Clean up any deferred operations + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(heap, 0); return heap; } @@ -1216,8 +1282,8 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { size_class_t* size_class = _memory_size_class + class_idx; int is_active = (heap->active_span[class_idx] == span); span_block_t* block_data = is_active ? - heap->active_block + class_idx : - &span->data.block; + heap->active_block + class_idx : + &span->data.block; //Check if the span will become completely free if (block_data->free_count == ((count_t)size_class->block_count - 1)) { @@ -1271,7 +1337,8 @@ _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { --heap->span_counter[idx].current_allocations; #endif if (!heap->spans_reserved && (span_count > 1)) { - //Break up as single span cache + //Split the span and store remainder as reserved spans + //Must split to a dummy 1-span master since we cannot have master spans as reserved _memory_set_span_remainder_as_reserved(heap, span, 1); span_count = 1; } @@ -1624,7 +1691,7 @@ rpmalloc_finalize(void) { span_t* span = heap->span_reserve; span_t* master = heap->span_reserve_master; uint32_t remains = SPAN_REMAINS(master->flags); - + assert(master != span); assert(remains >= heap->spans_reserved); _memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0); @@ -1694,14 +1761,18 @@ rpmalloc_thread_finalize(void) { //Release thread cache spans back to global cache #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - const size_t span_count = iclass + 1; span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + const size_t span_count = iclass + 1; while (span) { assert(SPAN_COUNT(span->flags) == span_count); span_t* next = _memory_span_list_split(span, !iclass ? 
MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); _memory_global_cache_insert(0, span); span = next; } +#else + _memory_unmap_span_list(heap, span); +#endif heap->span_cache[iclass] = 0; } #endif From c947f385fe2f0da83d16ffa0d04c47d108ce19a7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 12 Feb 2018 21:48:12 +0000 Subject: [PATCH 36/42] python3 compatibility --- build/ninja/toolchain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/ninja/toolchain.py b/build/ninja/toolchain.py index 1d7c6f74..593349a6 100644 --- a/build/ninja/toolchain.py +++ b/build/ninja/toolchain.py @@ -139,7 +139,7 @@ def initialize_default_archs(self): elif localarch == 'i686': self.archs = ['x86'] else: - self.archs = [localarch] + self.archs = [str(localarch)] elif self.target.is_macos(): self.archs = ['x86-64'] elif self.target.is_ios(): @@ -159,7 +159,7 @@ def initialize_configs(self, configs): self.initialize_default_configs() def initialize_default_configs(self): - self.configs = ['debug', 'release'] #, 'profile', 'deploy'] + self.configs = ['debug', 'release'] def initialize_toolchain(self): if self.android != None: From 9d2392d0ac9b7c29b51771915e7582aaf667a6fa Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 12 Feb 2018 23:36:55 +0100 Subject: [PATCH 37/42] cleanups and updated docs --- README.md | 31 ++++++++------ rpmalloc/rpmalloc.c | 100 ++++++++++++++++---------------------------- rpmalloc/rpmalloc.h | 2 +- 3 files changed, 55 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index 50bbc7ad..b82f9b76 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # rpmalloc - Rampant Pixels Memory Allocator -This library provides a public domain cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/rampantpixels/rpmalloc +This library provides a public domain cross platform lock free thread caching 32-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/rampantpixels/rpmalloc Platforms currently supported: @@ -18,7 +18,7 @@ Please consider our Patreon to support our work - https://www.patreon.com/rampan Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) / Rampant Pixels - http://www.rampantpixels.com # Performance -We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2000 lines of C code. +We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2100 lines of C code. All allocations have a natural 32-byte alignment. Contained in a parallel repository is a benchmark utility that performs interleaved allocations (both aligned to 8 or 16 bytes, and unaligned) and deallocations (both in-thread and cross-thread) in multiple threads. 
It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments. @@ -61,11 +61,11 @@ The latest stable release is available in the master branch. For latest developm # Cache configuration options Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by either defining one of four presets, or by editing the individual defines in the `rpmalloc.c` source file for fine tuned control. If you do not define any of the following three directives, the default preset will be used which is to increase caches and prioritize performance over memory overhead (but not making caches unlimited). -__ENABLE_UNLIMITED_CACHE__: This will make all caches infinite, i.e never release spans to global cache unless thread finishes, and never unmap memory pages back to the OS. Highest performance but largest memory overhead. +__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e never release spans to global cache unless thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead. -__ENABLE_SPACE_PRIORITY_CACHE__: This will reduce caches to minimize memory overhead while still maintaining decent performance. +__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache. -__DISABLE_CACHE__: This will completely disable caches for free pages and instead immediately unmap memory pages back to the OS when no longer in use. Minimizes memory overhead but heavily reduces performance. +__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache). # Other configuration options Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. This will cause a slight overhead in runtime to collect statistics for each memory operation, and will also add 4 bytes overhead per allocation to track sizes. @@ -74,15 +74,15 @@ Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is de Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. -Overwrite and underwrite guards are enabled if __ENABLE_GUARDS__ is defined to 1 (default is 0, or disabled), either on compile command line or by settings the value in `rpmalloc.c`. This will introduce up to 32 byte overhead on each allocation to store magic numbers, which will be verified when freeing the memory block. The actual overhead is dependent on the requested size compared to size class limits. 
+Overwrite and underwrite guards are enabled if __ENABLE_GUARDS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. This will introduce up to 64 byte overhead on each allocation to store magic numbers, which will be verified when freeing the memory block. The actual overhead is dependent on the requested size compared to size class limits.  # Quick overview The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different sets of memory pages, each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages.  # Implementation details The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 32 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits).  -Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 2016] bytes, medium blocks (2016, 32720] bytes, and large blocks (32720, 2097120] bytes. The three categories are further divided in size classes. If the span size is changed, the small block classes remain but medium blocks go from (2016, span size] bytes. +Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [32, 2016] bytes, medium blocks (2016, 32720] bytes, and large blocks (32720, 2097120] bytes. The three categories are further divided in size classes. If the span size is changed, the small block classes remain but medium blocks go from (2016, span size] bytes.  Small blocks have a size class granularity of 32 bytes each in 63 buckets. Medium blocks have a granularity of 512 bytes, 60 buckets (default). Large blocks have the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 42 bytes will allocate a block of 64 bytes).
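To make the rounding concrete, here is a small illustrative sketch (not code from the allocator; the `EXAMPLE_*` constants and the `example_size_class_bound` helper are invented for this example) that computes the block size a request is rounded up to, using the small/medium granularities described above:

```c
/* Hedged sketch: round a request up to its size class boundary using the
 * granularities described above. Names prefixed with example_/EXAMPLE_ are
 * invented for this illustration and are not part of rpmalloc. */
#include <stddef.h>

#define EXAMPLE_SMALL_GRANULARITY  32    /* small classes step in 32 bytes   */
#define EXAMPLE_SMALL_SIZE_LIMIT   2016  /* largest small block              */
#define EXAMPLE_MEDIUM_GRANULARITY 512   /* medium classes step in 512 bytes */

static size_t
example_size_class_bound(size_t size) {
	if (!size)
		size = 1; /* a zero byte request is still served from the smallest class */
	if (size <= EXAMPLE_SMALL_SIZE_LIMIT) {
		/* e.g. a 42 byte request rounds up to the 64 byte small class */
		return (size + (EXAMPLE_SMALL_GRANULARITY - 1)) & ~((size_t)EXAMPLE_SMALL_GRANULARITY - 1);
	}
	/* medium classes step in 512 byte increments above the small limit */
	size_t remain = size - EXAMPLE_SMALL_SIZE_LIMIT;
	remain = (remain + (EXAMPLE_MEDIUM_GRANULARITY - 1)) & ~((size_t)EXAMPLE_MEDIUM_GRANULARITY - 1);
	return EXAMPLE_SMALL_SIZE_LIMIT + remain;
}
```

The allocator itself additionally maps the resulting class index through the `_memory_size_class` table (see the `class_idx` lookup in `rpmalloc.c` above) so that classes with identical block counts can be merged; the sketch only shows the boundary calculation.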
Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested. @@ -95,12 +95,19 @@ Large blocks, or super spans, are cached in two levels. The first level is a per  # Memory mapping By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These functions should reserve and free the requested number of bytes. -The functions must guarantee alignment to the configured span size. Either provide the span size during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the span size. The span size MUST be a power of two in [512, 262144] range, and be a multiple (or divisor) of the memory page size. +The functions must guarantee alignment to the configured span size. Either provide the span size during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the span size. The span size MUST be a power of two in [4096, 262144] range, and be a multiple (or divisor) of the memory page size. Memory mapping requests are always done in multiples of the memory page size, whichever is larger. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two in [512, 16384] range. To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 8). If the platform can handle partial unmaps (unmapping one or more spans of memory pages mapped in a larger batch) the `unmap_partial` configuration variable should be set to non-zero. If not, spans will be kept until the entire batch can be unmapped. +# Span breaking +Super spans (spans a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans can currently not be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually. + +A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The entire original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This allows platforms like Windows that require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still decommitting individual pages in subspans. + +If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting individual pages and 1 for releasing the entire super span memory range.
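As a hedged illustration of what such a callback pair could look like on a POSIX system, the sketch below oversizes the `mmap`, trims the mapping down to a span-aligned region, and lets `munmap` serve both the decommit and release cases, since POSIX allows unmapping subranges. The `example_*` names and the fixed 64KiB span size are assumptions made for this sketch; the callback signatures follow the `memory_map(size, offset)` / `memory_unmap(address, size, offset, release)` usage seen in `rpmalloc.c` in this patch.

```c
/* Hedged sketch of custom mapping callbacks for rpmalloc_initialize_config().
 * Only mmap/munmap are real system calls; everything prefixed example_ is
 * invented for this illustration. MAP_ANONYMOUS may be spelled MAP_ANON on
 * some older systems. */
#include <sys/mman.h>
#include <stdint.h>
#include <string.h>
#include "rpmalloc.h"

#define EXAMPLE_SPAN_SIZE ((size_t)65536)

static void*
example_memory_map(size_t size, size_t* offset) {
	/* Oversize by one span so the returned address can be span aligned */
	size_t map_size = size + EXAMPLE_SPAN_SIZE;
	char* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ptr == MAP_FAILED)
		return 0;
	char* aligned = (char*)(((uintptr_t)ptr + (EXAMPLE_SPAN_SIZE - 1)) & ~(uintptr_t)(EXAMPLE_SPAN_SIZE - 1));
	/* Trim the unaligned head and the unused tail so unmap stays symmetric */
	if (aligned != ptr)
		munmap(ptr, (size_t)(aligned - ptr));
	size_t tail = map_size - (size_t)(aligned - ptr) - size;
	if (tail)
		munmap(aligned + size, tail);
	*offset = 0;
	return aligned;
}

static void
example_memory_unmap(void* address, size_t size, size_t offset, int release) {
	/* offset is always 0 with the mapper above; POSIX can unmap subranges,
	 * so both the decommit case (release == 0) and the final release can
	 * simply return the given range to the OS */
	(void)offset;
	(void)release;
	munmap(address, size);
}

static void
example_init(void) {
	rpmalloc_config_t config;
	memset(&config, 0, sizeof(config));
	config.memory_map = example_memory_map;
	config.memory_unmap = example_memory_unmap;
	config.span_size = EXAMPLE_SPAN_SIZE;
	rpmalloc_initialize_config(&config);
}
```

A Windows-style mapper could not take this shortcut: there the non-release call would have to decommit while keeping the range reserved, and only the final call with `release` set to 1 may free the whole region, which is exactly the distinction the `release` parameter exists for.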
+ # Memory guards If you define the __ENABLE_GUARDS__ to 1, all memory allocations will be padded with extra guard areas before and after the memory block (while still honoring the requested alignment). These dead zones will be filled with a pattern and checked when the block is freed. If the patterns are not intact the callback set in initialization config is called, or if not set an assert is fired. @@ -122,17 +129,17 @@ Threads that keep ownership of allocated memory blocks within the thread and fre Threads that have allocation patterns where the difference in memory usage high and low water marks fit within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini and have optimal performance. Tweaking the cache limits can be done on a per-size-class basis. # Worst case scenarios -Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, 48, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. However, the wasted memory will always be less than 64KiB per size class. +Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (32, 64, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. However, the wasted memory will always be less than 64KiB (or the configured span size) per size class. The cache for free spans will be reused by all size classes. An application that has a producer-consumer scheme between threads, where one thread performs all allocations and another frees all memory, will have sub-optimal performance because blocks crossing thread boundaries will be freed in a two step process - first deferred to the allocating thread, then freed when that thread has need for more memory pages for the requested size. However, depending on the use case the performance overhead might be small. -Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. +Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead). # Caveats Cross-thread deallocations are more costly than in-thread deallocations, since the spans are completely owned by the allocating thread.
The free operation will be deferred using an atomic list operation and the actual free operation will be performed when the owner thread requires a new block of the corresponding size class. VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. -The free, realloc and usable size functions all require the passed pointer to be within the first 64KiB page block of the start of the memory block. You cannot pass in any pointer from the memory block address range. +The free, realloc and usable size functions all require the passed pointer to be within the first 64KiB (or whatever you set the span size to) of the start of the memory block. You cannot pass in any pointer from the memory block address range. All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. The library does not try to guard against errors. diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 3763ad20..30fa7fe6 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -58,7 +58,7 @@ #ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory -#define DEFAULT_SPAN_MAP_COUNT 8 +#define DEFAULT_SPAN_MAP_COUNT 16 #endif // Presets for cache limits @@ -69,27 +69,6 @@ #define ENABLE_UNLIMITED_CACHE 0 #endif -#ifndef ENABLE_SPACE_PRIORITY_CACHE -//! Minimize overhead -#define ENABLE_SPACE_PRIORITY_CACHE 0 -#endif - -#if ENABLE_SPACE_PRIORITY_CACHE -// Space priority cache limits -#define MIN_SPAN_CACHE_SIZE 8 -#define MIN_SPAN_CACHE_RELEASE 8 -#define MAX_SPAN_CACHE_DIVISOR 16 -#define MIN_LARGE_SPAN_CACHE_SIZE 2 -#define MIN_LARGE_SPAN_CACHE_RELEASE 2 -#define MAX_LARGE_SPAN_CACHE_DIVISOR 32 -#if ENABLE_GLOBAL_CACHE -#define GLOBAL_CACHE_MULTIPLIER 1 -#endif -#ifndef DEFAULT_SPAN_MAP_COUNT -#define DEFAULT_SPAN_MAP_COUNT 4 -#endif -#else -// Default - performance priority cache limits //! Minimum cache size to remain after a release to global cache #define MIN_SPAN_CACHE_SIZE 64 //! Minimum number of spans to transfer between thread and global cache @@ -102,12 +81,11 @@ #define MIN_LARGE_SPAN_CACHE_RELEASE 4 //! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) #define MAX_LARGE_SPAN_CACHE_DIVISOR 16 -//! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) #if ENABLE_GLOBAL_CACHE +//! 
Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) #define GLOBAL_CACHE_MULTIPLIER 8 #endif #endif -#endif // Platform and arch specifics @@ -573,28 +551,30 @@ _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size # define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) #endif +#if ENABLE_STATISTICS +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +#else +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) +#endif + //! Map more virtual memory static void* _memory_map(size_t size, size_t* offset) { -#if ENABLE_STATISTICS - const size_t page_count = (size >> _memory_page_size_shift); - atomic_add32(&_mapped_pages, (int32_t)page_count); - atomic_add32(&_mapped_total, (int32_t)page_count); -#endif assert(!(size % _memory_page_size)); + _memory_statistics_add(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); return _memory_config.memory_map(size, offset); } //! Unmap virtual memory static void _memory_unmap(void* address, size_t size, size_t offset, int release) { -#if ENABLE_STATISTICS - const size_t page_count = (size >> _memory_page_size_shift); - atomic_add32(&_mapped_pages, -(int32_t)page_count); - atomic_add32(&_unmapped_total, (int32_t)page_count); -#endif assert(!((uintptr_t)address & ~_memory_span_mask)); assert(!(size % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (size >> _memory_page_size_shift)); _memory_config.memory_unmap(address, size, offset, release); } @@ -630,9 +610,7 @@ _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_co atomic_thread_fence_release(); heap->span_reserve_master = span; span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)current_count); -#endif + _memory_statistics_add(&_reserved_spans, current_count); } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { //Only owner heap thread can modify a master span @@ -724,9 +702,7 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { //Directly unmap subspans assert(span->data.list.align_offset == 0); _memory_unmap(span, span_count * _memory_span_size, 0, 0); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)span_count); -#endif + _memory_statistics_sub(&_reserved_spans, span_count); } else { //Special double flag to denote an unmapped master @@ -742,9 +718,7 @@ _memory_unmap_spans(heap_t* heap, span_t* span) { assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); span_count = SPAN_COUNT(master->flags); _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)span_count); -#endif + _memory_statistics_sub(&_reserved_spans, span_count); } else { //Set remaining spans @@ -809,9 +783,7 @@ _memory_span_split(heap_t* heap, span_t* span, size_t use_count) { atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); 
-#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, (int32_t)current_count); -#endif + _memory_statistics_add(&_reserved_spans, current_count); } else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { //Only valid to call on master span if we own it @@ -1604,8 +1576,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { span_size = (64 * 1024); if (span_size > (256 * 1024)) span_size = (256 * 1024); - _memory_span_size = 512; - _memory_span_size_shift = 9; + _memory_span_size = 4096; + _memory_span_size_shift = 12; while (_memory_span_size < span_size) { _memory_span_size <<= 1; ++_memory_span_size_shift; @@ -1695,15 +1667,11 @@ rpmalloc_finalize(void) { assert(master != span); assert(remains >= heap->spans_reserved); _memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)heap->spans_reserved); -#endif + _memory_statistics_sub(&_reserved_spans, heap->spans_reserved); remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved); if (!remains) { uint32_t master_span_count = SPAN_COUNT(master->flags); -#if ENABLE_STATISTICS - atomic_add32(&_reserved_spans, -(int32_t)master_span_count); -#endif + _memory_statistics_sub(&_reserved_spans, master_span_count); _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); } else { @@ -1880,7 +1848,7 @@ _memory_guard_validate(void* p) { } uint32_t* deadzone = block_start; //If these asserts fire, you have written to memory before the block start - for (int i = 0; i < 4; ++i) { + for (int i = 0; i < 8; ++i) { if (deadzone[i] != MAGIC_GUARD) { if (_memory_config.memory_overwrite) _memory_config.memory_overwrite(p); @@ -1890,9 +1858,9 @@ _memory_guard_validate(void* p) { } deadzone[i] = 0; } - deadzone = (uint32_t*)pointer_offset(block_start, block_size - 16); + deadzone = (uint32_t*)pointer_offset(block_start, block_size - 32); //If these asserts fire, you have written to memory after the block end - for (int i = 0; i < 4; ++i) { + for (int i = 0; i < 8; ++i) { if (deadzone[i] != MAGIC_GUARD) { if (_memory_config.memory_overwrite) _memory_config.memory_overwrite(p); @@ -1913,13 +1881,15 @@ _memory_guard_block(void* block) { if (block) { size_t block_size = _memory_usable_size(block); uint32_t* deadzone = block; - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; - deadzone = (uint32_t*)pointer_offset(block, block_size - 16); - deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = MAGIC_GUARD; + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = + deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD; + deadzone = (uint32_t*)pointer_offset(block, block_size - 32); + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = + deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD; } } -#define _memory_guard_pre_alloc(size) size += 32 -#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 16); size -= 32 +#define _memory_guard_pre_alloc(size) size += 64 +#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 32); size -= 64 #else #define _memory_guard_pre_alloc(size) #define _memory_guard_post_alloc(block, size) @@ -1999,7 +1969,7 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, } #endif void* block; - if (alignment > 16) { + if (alignment > 32) { block = rpaligned_alloc(alignment, size); if (!(flags & 
RPMALLOC_NO_PRESERVE)) memcpy(block, ptr, oldsize < size ? oldsize : size); @@ -2016,7 +1986,7 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, RPMALLOC_RESTRICT void* rpaligned_alloc(size_t alignment, size_t size) { - if (alignment <= 16) + if (alignment <= 32) return rpmalloc(size); #if ENABLE_VALIDATE_ARGS @@ -2052,7 +2022,7 @@ rpmalloc_usable_size(void* ptr) { if (ptr) { size = _memory_usable_size(ptr); #if ENABLE_GUARDS - size -= 32; + size -= 64; #endif } return size; diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 3b1f37dd..b4e22217 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -79,7 +79,7 @@ typedef struct rpmalloc_config_t { //! Size of memory pages. If set to 0, rpmalloc will use system calls to determine the page size. // The page size MUST be a power of two in [512,16384] range (2^9 to 2^14). size_t page_size; - //! Size of a span of memory pages. MUST be a multiple of page size, and in [512,262144] range (unless 0). + //! Size of a span of memory pages. MUST be a multiple of page size, and in [4096,262144] range (unless 0). // Set to 0 to use the default span size. All memory mapping requests to memory_map will be made with // size set to a multiple of the span size. size_t span_size; From 38fe77fa96e0efcd6f2868f2fca7d83cbc6d1254 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 13 Feb 2018 08:46:19 +0100 Subject: [PATCH 38/42] cleanup predefines --- configure.py | 11 ++++++----- rpmalloc/rpmalloc.c | 33 +++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/configure.py b/configure.py index f307b4ad..309879a7 100755 --- a/configure.py +++ b/configure.py @@ -15,15 +15,16 @@ toolchain = generator.toolchain rpmalloc_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c']) -rpmallocguards_lib = generator.lib(module = 'rpmalloc', libname = 'rpmallocguards', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_GUARDS=1']}) +rpmallocguard_lib = generator.lib(module = 'rpmalloc', libname = 'rpmallocguard', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1', 'ENABLE_GUARDS=1']}) -if not target.is_android(): +if not target.is_android() and not target.is_ios(): rpmallocwrap_lib = generator.lib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c', 'malloc.c', 'new.cc'], variables = {'defines': ['ENABLE_PRELOAD=1']}) -if not target.is_windows() and not target.is_android() and not target.is_ios(): +if not target.is_android() and not target.is_ios(): rpmalloc_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c']) + +if not target.is_windows() and not target.is_android() and not target.is_ios(): rpmallocwrap_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c', 'malloc.c', 'new.cc'], variables = {'runtime': 'c++', 'defines': ['ENABLE_PRELOAD=1']}) if not target.is_ios() and not target.is_android(): - generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmalloc_lib], libs = ['rpmalloc'], includepaths = ['rpmalloc', 'test']) - generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test-guards', implicit_deps = [rpmallocguards_lib], libs = ['rpmallocguards'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_GUARDS=1']}) + generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 
'rpmalloc-test', implicit_deps = [rpmallocguard_lib], libs = ['rpmallocguards'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_GUARDS=1']}) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 30fa7fe6..c8749122 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -23,10 +23,7 @@ #define ENABLE_THREAD_CACHE 1 #endif -#if !ENABLE_THREAD_CACHE -# undef ENABLE_GLOBAL_CACHE -# define ENABLE_GLOBAL_CACHE 0 -#elif !defined(ENABLE_GLOBAL_CACHE) +#ifndef ENABLE_GLOBAL_CACHE //! Enable global cache shared between all threads, requires thread cache #define ENABLE_GLOBAL_CACHE 1 #endif @@ -56,19 +53,16 @@ #define ENABLE_GUARDS 0 #endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations +#define ENABLE_UNLIMITED_CACHE 0 +#endif + #ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory #define DEFAULT_SPAN_MAP_COUNT 16 #endif -// Presets for cache limits -#if ENABLE_THREAD_CACHE - -#ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited cache disables any cache limitations -#define ENABLE_UNLIMITED_CACHE 0 -#endif - //! Minimum cache size to remain after a release to global cache #define MIN_SPAN_CACHE_SIZE 64 //! Minimum number of spans to transfer between thread and global cache @@ -81,10 +75,21 @@ #define MIN_LARGE_SPAN_CACHE_RELEASE 4 //! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) #define MAX_LARGE_SPAN_CACHE_DIVISOR 16 -#if ENABLE_GLOBAL_CACHE //! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) #define GLOBAL_CACHE_MULTIPLIER 8 + +#if !ENABLE_THREAD_CACHE +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +# undef MIN_SPAN_CACHE_SIZE +# undef MIN_SPAN_CACHE_RELEASE +# undef MAX_SPAN_CACHE_DIVISOR +# undef MIN_LARGE_SPAN_CACHE_SIZE +# undef MIN_LARGE_SPAN_CACHE_RELEASE +# undef MAX_LARGE_SPAN_CACHE_DIVISOR #endif +#if !ENABLE_GLOBAL_CACHE +# undef GLOBAL_CACHE_MULTIPLIER #endif // Platform and arch specifics @@ -1578,7 +1583,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { span_size = (256 * 1024); _memory_span_size = 4096; _memory_span_size_shift = 12; - while (_memory_span_size < span_size) { + while ((_memory_span_size < span_size) || (_memory_span_size < _memory_page_size)) { _memory_span_size <<= 1; ++_memory_span_size_shift; } From 5a279e43ff273810712e00992216b33dedf2b2a4 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 13 Feb 2018 08:51:03 +0100 Subject: [PATCH 39/42] update docs --- CACHE.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CACHE.md b/CACHE.md index b29907f0..64509302 100644 --- a/CACHE.md +++ b/CACHE.md @@ -1,5 +1,5 @@ # Thread caches -rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to system to map more memory, as well as a global cache shared by all threads to let pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc. 
+rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc. The rpmalloc thread cache is configured to be unlimited, performance oriented as meaning default values, size oriented where both thread cache and global cache is reduced significantly, or disabled where both thread and global caches are disabled and completely free pages are directly unmapped. diff --git a/README.md b/README.md index b82f9b76..c93fc6c2 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ The configure + ninja build also produces two shared object/dynamic libraries. T The latest stable release is available in the master branch. For latest development code, use the develop branch. # Cache configuration options -Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by either defining one of four presets, or by editing the individual defines in the `rpmalloc.c` source file for fine tuned control. If you do not define any of the following three directives, the default preset will be used which is to increase caches and prioritize performance over memory overhead (but not making caches unlimited). +Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine tuned control. __ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e never release spans to global cache unless thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead. 
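To connect the cache options documented in the patch above to an actual build, the snippet below is one hypothetical way to pin them at compile time. The wrapper-file approach and the particular values are illustrative only; since the cache defines in `rpmalloc.c` are guarded by `#ifndef`, the same effect can be had with plain compiler defines (for example `-DENABLE_GLOBAL_CACHE=0`).

```c
/* Hypothetical wrapper translation unit: pre-define the cache controls
 * documented above, then pull in the allocator source. All values shown are
 * example choices, not recommendations from the patch; adjust the include
 * path to wherever rpmalloc.c lives in your tree. */
#define ENABLE_THREAD_CACHE    1  /* keep the per-thread span cache */
#define ENABLE_GLOBAL_CACHE    0  /* unmap spans evicted from the thread cache instead of sharing them */
#define ENABLE_UNLIMITED_CACHE 0  /* keep the adaptive cache limits active */
#define ENABLE_STATISTICS      1  /* track mapped/unmapped page counts */

#include "rpmalloc.c"
```

If a wrapper translation unit like this is used, it should be compiled instead of `rpmalloc.c` rather than in addition to it, to avoid duplicate symbols.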
From d1b13a56f9942279d797df30b50d8ae82d43ca2c Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 13 Feb 2018 08:14:43 +0000 Subject: [PATCH 40/42] fix typo --- configure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.py b/configure.py index 309879a7..f6907704 100755 --- a/configure.py +++ b/configure.py @@ -27,4 +27,4 @@ rpmallocwrap_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c', 'malloc.c', 'new.cc'], variables = {'runtime': 'c++', 'defines': ['ENABLE_PRELOAD=1']}) if not target.is_ios() and not target.is_android(): - generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmallocguard_lib], libs = ['rpmallocguards'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_GUARDS=1']}) + generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmallocguard_lib], libs = ['rpmallocguard'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_GUARDS=1']}) From a0d8c5eed98270dec5b19df9dcfcdf30e73502c9 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 14 Feb 2018 11:55:45 +0100 Subject: [PATCH 41/42] fix non-standard span size support + cleanups --- rpmalloc/rpmalloc.c | 186 ++++++++++++++++++++++---------------------- rpmalloc/rpmalloc.h | 10 +-- 2 files changed, 98 insertions(+), 98 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index c8749122..9fcf1de8 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -11,58 +11,47 @@ #include "rpmalloc.h" -// Build time configurable limits - +/// Build time configurable limits #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 79 #endif - #ifndef ENABLE_THREAD_CACHE //! Enable per-thread cache #define ENABLE_THREAD_CACHE 1 #endif - #ifndef ENABLE_GLOBAL_CACHE //! Enable global cache shared between all threads, requires thread cache #define ENABLE_GLOBAL_CACHE 1 #endif - #ifndef ENABLE_VALIDATE_ARGS //! Enable validation of args to public entry points #define ENABLE_VALIDATE_ARGS 0 #endif - #ifndef ENABLE_STATISTICS //! Enable statistics collection #define ENABLE_STATISTICS 0 #endif - #ifndef ENABLE_ASSERTS //! Enable asserts #define ENABLE_ASSERTS 0 #endif - #ifndef ENABLE_PRELOAD //! Support preloading #define ENABLE_PRELOAD 0 #endif - #ifndef ENABLE_GUARDS //! Enable overwrite/underwrite guards #define ENABLE_GUARDS 0 #endif - #ifndef ENABLE_UNLIMITED_CACHE //! Unlimited cache disables any cache limitations #define ENABLE_UNLIMITED_CACHE 0 #endif - #ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory #define DEFAULT_SPAN_MAP_COUNT 16 #endif - //! Minimum cache size to remain after a release to global cache #define MIN_SPAN_CACHE_SIZE 64 //! Minimum number of spans to transfer between thread and global cache @@ -76,7 +65,7 @@ //! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) #define MAX_LARGE_SPAN_CACHE_DIVISOR 16 //! 
Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) -#define GLOBAL_CACHE_MULTIPLIER 8 +#define MAX_GLOBAL_CACHE_MULTIPLIER 8 #if !ENABLE_THREAD_CACHE # undef ENABLE_GLOBAL_CACHE @@ -89,11 +78,10 @@ # undef MAX_LARGE_SPAN_CACHE_DIVISOR #endif #if !ENABLE_GLOBAL_CACHE -# undef GLOBAL_CACHE_MULTIPLIER +# undef MAX_GLOBAL_CACHE_MULTIPLIER #endif -// Platform and arch specifics - +/// Platform and arch specifics #ifdef _MSC_VER # define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name # define FORCEINLINE __forceinline @@ -127,6 +115,10 @@ #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) # define PLATFORM_WINDOWS 1 +# define PLATFORM_POSIX 0 +#else +# define PLATFORM_WINDOWS 0 +# define PLATFORM_POSIX 1 #endif #include @@ -147,7 +139,7 @@ # define MAGIC_GUARD 0xDEADBAAD #endif -// Atomic access abstraction +/// Atomic access abstraction ALIGNED_STRUCT(atomic32_t, 4) { volatile int32_t nonatomic; }; @@ -218,22 +210,7 @@ atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { #endif } -// Preconfigured limits and sizes - -//! Memory page size -static size_t _memory_page_size; -//! Shift to divide by page size -static size_t _memory_page_size_shift; -//! Granularity at which memory pages are mapped by OS -static size_t _memory_map_granularity; - -//! Size of a span of memory pages -static size_t _memory_span_size; -//! Shift to divide by span size -static size_t _memory_span_size_shift; -//! Mask to get to start of a memory span -static uintptr_t _memory_span_mask; - +/// Preconfigured limits and sizes //! Granularity of a small allocation block #define SMALL_GRANULARITY 32 //! Small granularity shift count @@ -242,30 +219,26 @@ static uintptr_t _memory_span_mask; #define SMALL_CLASS_COUNT 63 //! Maximum size of a small block #define SMALL_SIZE_LIMIT 2016 - //! Granularity of a medium allocation block #define MEDIUM_GRANULARITY 512 //! Medium granularity shift count #define MEDIUM_GRANULARITY_SHIFT 9 //! Number of medium block size classes #define MEDIUM_CLASS_COUNT 60 -//! Maximum size of a medium block -#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) - //! Total number of small + medium size classes #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) - //! Number of large block size classes #define LARGE_CLASS_COUNT 32 +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) //! Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header +#define SPAN_HEADER_SIZE 32 #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) -//! Size of a span header -#define SPAN_HEADER_SIZE 32 - #if ARCH_64BIT typedef int64_t offset_t; #else @@ -279,8 +252,7 @@ typedef uint32_t count_t; #define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) #endif -// Data types - +/// Data types //! A memory heap, per thread typedef struct heap_t heap_t; //! Span of memory pages @@ -427,37 +399,45 @@ struct global_cache_t { atomic32_t counter; }; +/// Global data //! Configuration static rpmalloc_config_t _memory_config; - +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! 
Mask to get to start of a memory page +static size_t _memory_page_mask; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; //! Global size classes static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; - +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; //! Heap ID counter static atomic32_t _memory_heap_id; - +#if ENABLE_THREAD_CACHE +//! Adaptive cache max allocation count +static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; +#endif #if ENABLE_GLOBAL_CACHE //! Global span cache static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif - //! All heaps static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; - //! Orphaned heaps static atomicptr_t _memory_orphan_heaps; - //! Running orphan counter to avoid ABA issues in linked list static atomic32_t _memory_orphan_counter; - //! Active heap count static atomic32_t _memory_active_heaps; - -#if ENABLE_THREAD_CACHE -//! Adaptive cache max allocation count -static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; -#endif - #if ENABLE_STATISTICS //! Total number of currently mapped memory pages static atomic32_t _mapped_pages; @@ -576,7 +556,7 @@ _memory_map(size_t size, size_t* offset) { //! Unmap virtual memory static void _memory_unmap(void* address, size_t size, size_t offset, int release) { - assert(!((uintptr_t)address & ~_memory_span_mask)); + assert((size < _memory_span_size) || !((uintptr_t)address & ~_memory_span_mask)); assert(!(size % _memory_page_size)); _memory_statistics_sub(&_mapped_pages, (size >> _memory_page_size_shift)); _memory_statistics_add(&_unmapped_total, (size >> _memory_page_size_shift)); @@ -682,7 +662,7 @@ _memory_unmap_defer(int32_t heap_id, span_t* span) { //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_spans(heap_t* heap, span_t* span) { +_memory_unmap_span(heap_t* heap, span_t* span) { size_t span_count = SPAN_COUNT(span->flags); assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); //A plain run of spans can be unmapped directly @@ -754,7 +734,7 @@ _memory_unmap_deferred(heap_t* heap, size_t wanted_count) { if ((atomic_load32(&span->heap_id) == master_heap_id) || !_memory_unmap_defer(master_heap_id, span)) { //We own the master span (or heap merged and abandoned) - _memory_unmap_spans(heap, span); + _memory_unmap_span(heap, span); } } span = next; @@ -768,7 +748,7 @@ _memory_unmap_span_list(heap_t* heap, span_t* span) { size_t list_size = span->data.list.size; for (size_t ispan = 0; ispan < list_size; ++ispan) { span_t* next_span = span->next_span; - _memory_unmap_spans(heap, span); + _memory_unmap_span(heap, span); span = next_span; } assert(!span); @@ -952,8 +932,8 @@ _memory_global_cache_insert(heap_t* heap, span_t* span) { //Calculate adaptive limits size_t span_count = SPAN_COUNT(span->flags); const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); - const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; - const size_t cache_limit_min = GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? 
MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); + const size_t cache_limit = (MAX_GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; + const size_t cache_limit_min = MAX_GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); } @@ -980,15 +960,14 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { #if ENABLE_STATISTICS heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; #endif -#else - MEMORY_UNUSED(heap); - span->data.list.size = 1; -#endif #if ENABLE_GLOBAL_CACHE _memory_global_cache_insert(heap, span); #else _memory_unmap_span_list(heap, span); #endif +#else + _memory_unmap_span(heap, span); +#endif } //! Extract the given number of spans from the different cache levels @@ -1205,12 +1184,12 @@ _memory_allocate_heap(void) { atomic_thread_fence_acquire(); do { raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (void*)((uintptr_t)raw_heap & _memory_span_mask); + heap = (void*)((uintptr_t)raw_heap & _memory_page_mask); if (!heap) break; next_heap = heap->next_orphan; orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_span_mask)); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_page_mask)); } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); @@ -1371,7 +1350,7 @@ _memory_deallocate_defer(int32_t heap_id, void* p) { //! Allocate a block of the given size static void* _memory_allocate(size_t size) { - if (size <= MEDIUM_SIZE_LIMIT) + if (size <= _memory_medium_size_limit) return _memory_allocate_from_heap(get_thread_heap(), size); else if (size <= LARGE_SIZE_LIMIT) return _memory_allocate_large_from_heap(get_thread_heap(), size); @@ -1551,7 +1530,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_page_size = _memory_config.page_size; if (!_memory_page_size) { -#ifdef PLATFORM_WINDOWS +#if PLATFORM_WINDOWS SYSTEM_INFO system_info; memset(&system_info, 0, sizeof(system_info)); GetSystemInfo(&system_info); @@ -1575,6 +1554,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { page_size_bit >>= 1; } _memory_page_size = ((size_t)1 << _memory_page_size_shift); + _memory_page_mask = ~(uintptr_t)(_memory_page_size - 1); size_t span_size = _memory_config.span_size; if (!span_size) @@ -1615,10 +1595,14 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_size_class[iclass].size = (uint16_t)size; _memory_adjust_size_class(iclass); } + + _memory_medium_size_limit = _memory_span_size - SPAN_HEADER_SIZE; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); - if (size > MEDIUM_SIZE_LIMIT) - size = MEDIUM_SIZE_LIMIT; + if (size > _memory_medium_size_limit) + size = _memory_medium_size_limit; _memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size; _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); } @@ -1709,13 +1693,13 @@ rpmalloc_finalize(void) { void rpmalloc_thread_initialize(void) { if (!get_thread_heap()) { + atomic_incr32(&_memory_active_heaps); heap_t* heap = _memory_allocate_heap(); #if ENABLE_STATISTICS heap->thread_to_global = 0; heap->global_to_thread = 
0; #endif set_thread_heap(heap); - atomic_incr32(&_memory_active_heaps); } } @@ -1726,8 +1710,6 @@ rpmalloc_thread_finalize(void) { if (!heap) return; - atomic_add32(&_memory_active_heaps, -1); - _memory_deallocate_deferred(heap, 0); _memory_unmap_deferred(heap, 0); @@ -1744,7 +1726,8 @@ rpmalloc_thread_finalize(void) { span = next; } #else - _memory_unmap_span_list(heap, span); + if (span) + _memory_unmap_span_list(heap, span); #endif heap->span_cache[iclass] = 0; } @@ -1756,13 +1739,14 @@ rpmalloc_thread_finalize(void) { heap_t* last_heap; do { last_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = (void*)((uintptr_t)last_heap & _memory_span_mask); + heap->next_orphan = (void*)((uintptr_t)last_heap & _memory_page_mask); orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_span_mask)); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_page_mask)); } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); set_thread_heap(0); + atomic_add32(&_memory_active_heaps, -1); } int @@ -1778,9 +1762,11 @@ rpmalloc_config(void) { //! Map new pages to virtual memory static void* _memory_map_os(size_t size, size_t* offset) { - size_t padding = (_memory_span_size > _memory_map_granularity) ? _memory_span_size : 0; + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; -#ifdef PLATFORM_WINDOWS +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" void* ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (!ptr) { assert("Failed to map virtual memory block" == 0); @@ -1795,12 +1781,19 @@ _memory_map_os(size_t size, size_t* offset) { #endif if (padding) { - padding -= (uintptr_t)ptr & ~_memory_span_mask; - ptr = pointer_offset(ptr, padding); - assert(padding <= _memory_span_size); - assert(!(padding & 3)); + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); +#if PLATFORM_POSIX + //Unmap the last unused pages, for Windows this is done with the final VirtualFree with MEM_RELEASE call + size_t remains = padding - final_padding; + if (remains) + munmap(pointer_offset(ptr, final_padding + size), remains); +#endif + ptr = pointer_offset(ptr, final_padding); + assert(final_padding <= _memory_span_size); + assert(!(final_padding & 5)); assert(!((uintptr_t)ptr & ~_memory_span_mask)); - *offset = padding >> 2; + *offset = final_padding >> 3; + assert(*offset < 65536); } return ptr; @@ -1810,13 +1803,16 @@ _memory_map_os(size_t size, size_t* offset) { static void _memory_unmap_os(void* address, size_t size, size_t offset, int release) { assert(release || (offset == 0)); - if (offset) { - offset <<= 2; + if (release && offset) { + offset <<= 3; +#if PLATFORM_POSIX size += offset; - address = pointer_offset(address, -(offset_t)offset); +#endif + address = pointer_offset(address, -(int32_t)offset); } -#ifdef PLATFORM_WINDOWS +#if PLATFORM_WINDOWS if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { + DWORD err = GetLastError(); assert("Failed to unmap virtual memory block" == 0); } #else @@ -1894,9 +1890,11 @@ _memory_guard_block(void* block) { } } #define _memory_guard_pre_alloc(size) size += 64 +#define _memory_guard_pre_realloc(block, size) block = pointer_offset(block, -32); size += 64 #define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 32); size -= 64 #else #define _memory_guard_pre_alloc(size) +#define _memory_guard_pre_realloc(block, size) #define _memory_guard_post_alloc(block, size) #endif @@ -1926,7 +1924,7 @@ RPMALLOC_RESTRICT void* rpcalloc(size_t num, size_t size) { size_t total; #if ENABLE_VALIDATE_ARGS -#ifdef PLATFORM_WINDOWS +#if PLATFORM_WINDOWS int err = SizeTMult(num, size, &total); if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { errno = EINVAL; @@ -1958,7 +1956,7 @@ rprealloc(void* ptr, size_t size) { } #endif _memory_guard_validate(ptr); - _memory_guard_pre_alloc(size); + _memory_guard_pre_realloc(ptr, size); void* block = _memory_reallocate(ptr, size, 0, 0); _memory_guard_post_alloc(block, size); return block; @@ -1982,7 +1980,7 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, } else { _memory_guard_validate(ptr); - _memory_guard_pre_alloc(size); + _memory_guard_pre_realloc(ptr, size); block = _memory_reallocate(ptr, size, oldsize, flags); _memory_guard_post_alloc(block, size); } @@ -2035,7 +2033,9 @@ rpmalloc_usable_size(void* ptr) { void rpmalloc_thread_collect(void) { - _memory_deallocate_deferred(get_thread_heap(), 0); + heap_t* heap = get_thread_heap(); + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(0, 0); } void diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index b4e22217..f5961011 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -76,12 +76,12 @@ typedef struct rpmalloc_config_t { // If release is set to 0, the unmap is a partial decommit of a subset of the mapped // memory range. void (*memory_unmap)(void* address, size_t size, size_t offset, int release); - //! Size of memory pages. If set to 0, rpmalloc will use system calls to determine the page size. - // The page size MUST be a power of two in [512,16384] range (2^9 to 2^14). + //! Size of memory pages. The page size MUST be a power of two in [512,16384] range + // (2^9 to 2^14) unless 0 - set to 0 to use system page size. All memory mapping + // requests to memory_map will be made with size set to a multiple of the page size. size_t page_size; - //! Size of a span of memory pages. MUST be a multiple of page size, and in [4096,262144] range (unless 0). - // Set to 0 to use the default span size. All memory mapping requests to memory_map will be made with - // size set to a multiple of the span size. + //! Size of a span of memory pages. MUST be a multiple of page size, and in [4096,262144] + // range (unless 0 - set to 0 to use the default span size). size_t span_size; //! Number of spans to map at each request to map new virtual memory blocks. 
This can // be used to minimize the system call overhead at the cost of virtual memory address From f8d13eb66c52c34f4722215394cc53d366a169be Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 14 Feb 2018 17:48:30 +0100 Subject: [PATCH 42/42] update changelog --- CHANGELOG | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index c9274332..f2f25a7b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,8 @@ Make span size configurable and all spans equal in size, removing span size clas Allow super spans to be reserved in advance and split up in multiple used spans to reduce number of system calls. This will not increase committed physical pages, only reserved virtual memory space. +Allow super spans to be reused for allocations of lower size, breaking up the super span and storing remainder in thread cache in order to reduce load on global cache and reduce cache overhead. + Fixed an issue where an allocation of zero bytes would cause a segmentation fault from indexing size class array with index -1. Fixed an issue where an allocation of maximum large block size (2097120 bytes) would index the heap cache array out of bounds and potentially cause a segmentation fault depending on earlier allocation patterns.
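As an illustration of the configurable span size summarized in this changelog, the following sketch initializes the allocator with a non-default span size through rpmalloc_initialize_config. The values are illustrative; only the page_size and span_size fields documented in rpmalloc.h above are set, and fields left zeroed are assumed to fall back to the built-in defaults (system page size, OS mapping callbacks):

#include <string.h>
#include "rpmalloc.h"

int
main(void) {
	rpmalloc_config_t config;
	memset(&config, 0, sizeof(config));  /* zeroed fields use the built-in defaults */
	config.page_size = 0;                /* 0 - query the system page size */
	config.span_size = 128 * 1024;       /* non-standard span size: multiple of page size, within [4096,262144] */
	rpmalloc_initialize_config(&config);

	void* zero_block = rpmalloc(0);            /* zero-byte allocation, the edge case fixed above */
	void* large_block = rpmalloc(512 * 1024);  /* spans several 128 KiB spans, served as a large class */
	rpfree(zero_block);
	rpfree(large_block);

	rpmalloc_finalize();
	return 0;
}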