From 3f23468b5e899209de1d8fa95009cf11ae38a346 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Wed, 12 Jan 2022 22:08:51 -0800 Subject: [PATCH] Greatly improve performance of sorting dictionaries Dictionary lookup by index is slow, so read all of the keys/values from the dictionary into a vector and sort that instead. Using a dictionary iterator to read the values replaces O(N log N) ClusterTree lookups with O(log N), so this is asymptotically faster. The benchmark is sped up from 3.77 seconds to 21.62 milliseconds, but more reasonably sized Dictionaries will see proportionally smaller benefits. The downside of this is that the array of Mixed requires 24 bytes of scratch space per element in the dictionary. We already require 8 bytes per element to store the results, so this is just a constant factor increase rather than an aysmtotic change in memory use and it's probably not a problem. --- CHANGELOG.md | 1 + src/realm/dictionary.cpp | 74 ++++++++++++++++++---------- src/realm/dictionary.hpp | 3 ++ test/benchmark-common-tasks/main.cpp | 11 ++--- 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 589e5ec9f2e..2893150115e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * Add support for building with Xcode 14 using the CMake project ([PR #5577](https://github.com/realm/realm-core/pull/5577)). * Expose MongoDB client interface in the C API. ([PR #5638](https://github.com/realm/realm-core/pull/5638)). * Add support in the C API for constructing a new `realm_app_t` object via `realm_app_create`. ([PR #5570](https://github.com/realm/realm-core/issues/5570)) +* Greatly improve the performance of sorting or distincting a Dictionary's keys or values. The most expensive operation is now performed O(log N) rather than O(N log N) times, and large Dictionaries can see upwards of 99% reduction in time to sort. ([PR #5166](https://github.com/realm/realm-core/pulls/5166)) ### Fixed * ([#????](https://github.com/realm/realm-core/issues/????), since v?.?.?) diff --git a/src/realm/dictionary.cpp b/src/realm/dictionary.cpp index 458f07d652e..7d04acf2e9c 100644 --- a/src/realm/dictionary.cpp +++ b/src/realm/dictionary.cpp @@ -451,27 +451,49 @@ void Dictionary::align_indices(std::vector& indices) const } } -void Dictionary::sort(std::vector& indices, bool ascending) const +namespace { +std::vector get_keys(const Dictionary& dict) +{ + std::vector values; + values.reserve(dict.size()); + for (auto it = dict.begin(), end = dict.end(); it != end; ++it) + values.push_back(it.key()); + return values; +} + +std::vector get_values(const Dictionary& dict) +{ + std::vector values; + values.reserve(dict.size()); + for (auto it = dict.begin(), end = dict.end(); it != end; ++it) + values.push_back(it.value()); + return values; +} + +void do_sort(std::vector& indices, bool ascending, const std::vector& values) { - align_indices(indices); auto b = indices.begin(); auto e = indices.end(); - std::sort(b, e, [this, ascending](size_t i1, size_t i2) { - auto v1 = get_any(i1); - auto v2 = get_any(i2); - return ascending ? v1 < v2 : v2 < v1; + std::sort(b, e, [ascending, &values](size_t i1, size_t i2) { + return ascending ? values[i1] < values[i2] : values[i2] < values[i1]; }); } +} // anonymous namespace -void Dictionary::distinct(std::vector& indices, util::Optional ascending) const +void Dictionary::sort(std::vector& indices, bool ascending) const { align_indices(indices); + do_sort(indices, ascending, get_values(*this)); +} - bool sort_ascending = ascending ? *ascending : true; - sort(indices, sort_ascending); +void Dictionary::distinct(std::vector& indices, util::Optional ascending) const +{ + align_indices(indices); + auto values = get_values(*this); + do_sort(indices, ascending.value_or(true), values); indices.erase(std::unique(indices.begin(), indices.end(), - [this](size_t i1, size_t i2) { - return get_any(i1) == get_any(i2); + [&values](size_t i1, size_t i2) { + return values[i1] == values[i2]; }), indices.end()); @@ -484,13 +506,7 @@ void Dictionary::distinct(std::vector& indices, util::Optional asc void Dictionary::sort_keys(std::vector& indices, bool ascending) const { align_indices(indices); - auto b = indices.begin(); - auto e = indices.end(); - std::sort(b, e, [this, ascending](size_t i1, size_t i2) { - auto k1 = get_key(i1); - auto k2 = get_key(i2); - return ascending ? k1 < k2 : k2 < k1; - }); + do_sort(indices, ascending, get_keys(*this)); } void Dictionary::distinct_keys(std::vector& indices, util::Optional) const @@ -1072,34 +1088,38 @@ Dictionary::Iterator::Iterator(const Dictionary* dict, size_t pos) { } -auto Dictionary::Iterator::operator*() const -> value_type +Mixed Dictionary::Iterator::key() const { - update(); - Mixed key; switch (m_key_type) { case type_String: { ArrayString keys(m_tree.get_alloc()); ref_type ref = to_ref(Array::get(m_leaf.get_mem().get_addr(), 1)); keys.init_from_ref(ref); - key = Mixed(keys.get(m_state.m_current_index)); - break; + return Mixed(keys.get(m_state.m_current_index)); } case type_Int: { ArrayInteger keys(m_tree.get_alloc()); ref_type ref = to_ref(Array::get(m_leaf.get_mem().get_addr(), 1)); keys.init_from_ref(ref); - key = Mixed(keys.get(m_state.m_current_index)); - break; + return Mixed(keys.get(m_state.m_current_index)); } default: throw std::runtime_error("Not implemented"); - break; } +} + +Mixed Dictionary::Iterator::value() const +{ ArrayMixed values(m_tree.get_alloc()); ref_type ref = to_ref(Array::get(m_leaf.get_mem().get_addr(), 2)); values.init_from_ref(ref); + return values.get(m_state.m_current_index); +} - return std::make_pair(key, values.get(m_state.m_current_index)); +auto Dictionary::Iterator::operator*() const -> value_type +{ + update(); + return std::make_pair(key(), value()); } diff --git a/src/realm/dictionary.hpp b/src/realm/dictionary.hpp index 7ea1f5762ce..e9127834f8a 100644 --- a/src/realm/dictionary.hpp +++ b/src/realm/dictionary.hpp @@ -221,6 +221,9 @@ class Dictionary::Iterator : public ClusterTree::Iterator { using ClusterTree::Iterator::get_position; + Mixed key() const; + Mixed value() const; + private: friend class Dictionary; diff --git a/test/benchmark-common-tasks/main.cpp b/test/benchmark-common-tasks/main.cpp index 1ce031bf9ef..5dcd02cd015 100644 --- a/test/benchmark-common-tasks/main.cpp +++ b/test/benchmark-common-tasks/main.cpp @@ -1224,7 +1224,7 @@ struct BenchmarkSortInt : BenchmarkWithInts { struct BenchmarkSortIntList : Benchmark { const char* name() const { - return "BenchmarkSortIntList"; + return "SortIntList"; } void before_all(DBRef group) @@ -1267,12 +1267,9 @@ struct BenchmarkSortIntList : Benchmark { }; struct BenchmarkSortIntDictionary : Benchmark { - // Dictionary sorting is sufficiently slow that we want to use a smaller size - static const size_t size = BASE_SIZE / 10; - const char* name() const { - return "BenchmarkSortIntDictionary"; + return "SortIntDictionary"; } void before_all(DBRef group) @@ -1285,11 +1282,11 @@ struct BenchmarkSortIntDictionary : Benchmark { Dictionary dict = obj.get_dictionary(m_col); Random r; - for (size_t i = 0; i < size; ++i) { + for (size_t i = 0; i < BASE_SIZE; ++i) { dict.insert(util::to_string(i), r.draw_int()); } tr.commit(); - m_indices.reserve(size); + m_indices.reserve(BASE_SIZE); } void after_all(DBRef db)