From 36fe1daaca6322224ce81dca6a671d434e0106de Mon Sep 17 00:00:00 2001 From: Antoine Pitrou <antoine@python.org> Date: Tue, 16 Jul 2024 14:06:47 +0200 Subject: [PATCH] GH-43254: [C++] Always prefer mimalloc to jemalloc (#40875) ### Rationale for this change As discussed [on the mailing-list](https://lists.apache.org/thread/dts9ggvkthczfpmd25wrz449mxod76o2), this PR switches the default memory pool to mimalloc for all platforms. This should have several desirable effects: * less variability between platforms * mimalloc generally has a nicer, more consistent API and is easier to work with (in particular, jemalloc's configuration scheme is slightly abstruse) * potentially better performance, or at least not significantly worse, than the status quo ### Are these changes tested? Yes, by existing CI configurations. ### Are there any user-facing changes? Behavior should not change. Performance characteristics of some user workloads might improve or regress, but this is something we cannot predict in advance. 
* GitHub Issue: #43254 Lead-authored-by: Antoine Pitrou <antoine@python.org> Co-authored-by: Sutou Kouhei <kou@clear-code.com> Signed-off-by: Antoine Pitrou <antoine@python.org> --- cpp/src/arrow/memory_pool.cc | 18 ++++++++---------- dev/archery/archery/benchmark/runner.py | 2 ++ dev/tasks/linux-packages/github.linux.yml | 2 +- docs/source/cpp/memory.rst | 6 +++--- docs/source/python/memory.rst | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 2f8ce3a6fa8c7..1e855311a98ed 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -85,19 +85,17 @@ struct SupportedBackend { const std::vector<SupportedBackend>& SupportedBackends() { static std::vector<SupportedBackend> backends = { - // ARROW-12316: Apple => mimalloc first, then jemalloc - // non-Apple => jemalloc first, then mimalloc -#if defined(ARROW_JEMALLOC) && !defined(__APPLE__) - {"jemalloc", MemoryPoolBackend::Jemalloc}, -#endif + // mimalloc is our preferred allocator for several reasons: + // 1) it has good performance + // 2) it is well-supported on all our main platforms (Linux, macOS, Windows) + // 3) it is easy to configure and has a consistent API. 
#ifdef ARROW_MIMALLOC - {"mimalloc", MemoryPoolBackend::Mimalloc}, + {"mimalloc", MemoryPoolBackend::Mimalloc}, #endif -#if defined(ARROW_JEMALLOC) && defined(__APPLE__) - {"jemalloc", MemoryPoolBackend::Jemalloc}, +#ifdef ARROW_JEMALLOC + {"jemalloc", MemoryPoolBackend::Jemalloc}, #endif - {"system", MemoryPoolBackend::System} - }; + {"system", MemoryPoolBackend::System}}; return backends; } diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index a91989fb95257..9ebb9226e3743 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -123,6 +123,8 @@ def default_configuration(**kwargs): with_csv=True, with_dataset=True, with_json=True, + with_jemalloc=True, + with_mimalloc=True, with_parquet=True, with_python=False, with_brotli=True, diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index 9e24835b8b627..891682c4358d8 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -64,7 +64,7 @@ jobs: run: | set -e pushd arrow/dev/tasks/linux-packages - rake version:update + rake version:update ARROW_RELEASE_TIME="$(date --iso-8601=seconds)" rake docker:pull || : rake --trace {{ task_namespace }}:build BUILD_DIR=build popd diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst index 33907b5580f61..032b7d1ac90f1 100644 --- a/docs/source/cpp/memory.rst +++ b/docs/source/cpp/memory.rst @@ -139,9 +139,9 @@ Default Memory Pool The default memory pool depends on how Arrow C++ was compiled: -- if enabled at compile time, a `jemalloc <http://jemalloc.net/>`_ heap; -- otherwise, if enabled at compile time, a - `mimalloc <https://github.com/microsoft/mimalloc>`_ heap; +- if enabled at compile time, a `mimalloc <https://github.com/microsoft/mimalloc>`_ + heap; +- otherwise, if enabled at compile time, a `jemalloc <http://jemalloc.net/>`_ heap; - otherwise, the C library ``malloc`` heap. 
Overriding the Default Memory Pool diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 7b49d48ab20fa..029d30cc1b693 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -110,12 +110,12 @@ the buffer is garbage-collected, all of the memory is freed: pa.total_allocated_bytes() Besides the default built-in memory pool, there may be additional memory pools -to choose (such as `mimalloc <https://github.com/microsoft/mimalloc>`_) -from depending on how Arrow was built. One can get the backend -name for a memory pool:: +to choose from (such as `jemalloc <http://jemalloc.net/>`_) +depending on how Arrow was built. One can get the backend name for a memory +pool:: >>> pa.default_memory_pool().backend_name - 'jemalloc' + 'mimalloc' .. seealso:: :ref:`API documentation for memory pools <api.memory_pool>`.