From 71a705fa97b7c2982e8d1be732ae8baeb3305cf0 Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Wed, 11 Jun 2025 18:45:42 +0200 Subject: [PATCH] Use top-k sorted list builder instead of full-list sorts in search indexes. --- app/lib/search/mem_index.dart | 48 ++++++++++++++++++--------------- app/lib/search/token_index.dart | 20 ++++++-------- app/lib/shared/utils.dart | 44 ++++++++++++++++++++++++++++++ app/test/shared/utils_test.dart | 33 +++++++++++++++++++++++ 4 files changed, 112 insertions(+), 33 deletions(-) diff --git a/app/lib/search/mem_index.dart b/app/lib/search/mem_index.dart index 53d5e4678..1cec90d31 100644 --- a/app/lib/search/mem_index.dart +++ b/app/lib/search/mem_index.dart @@ -12,6 +12,7 @@ import 'package:meta/meta.dart'; import 'package:pub_dev/service/topics/models.dart'; import 'package:pub_dev/third_party/bit_array/bit_array.dart'; +import '../shared/utils.dart' show TopKSortedListBuilder; import 'models.dart'; import 'search_service.dart'; import 'text_utils.dart'; @@ -263,6 +264,13 @@ class InMemoryPackageIndex { // extra item, that will be addressed after the ranking score is determined. var totalCount = packageScores?.positiveCount() ?? predicateFilterCount; + // Checking if it is worth to calculate the sorted order, estimating the + // total count by overcounting the best name matches. + final maximumTotalCount = totalCount + (bestNameIndex != null ? 1 : 0); + if (maximumTotalCount < query.offset) { + return PackageSearchResult.empty(); + } + Iterable indexedHits; switch (query.effectiveOrder) { case SearchOrder.top: @@ -285,8 +293,8 @@ class InMemoryPackageIndex { } indexedHits = _rankWithValues( packageScores, - requiredLengthThreshold: query.offset, bestNameIndex: bestNameIndex ?? 
-1, + topK: query.offset + query.limit, ); break; case SearchOrder.created: @@ -512,33 +520,31 @@ class InMemoryPackageIndex { return _TextResults(topApiPages); } - List _rankWithValues( + Iterable _rankWithValues( IndexedScore score, { - // if the item count is fewer than this threshold, an empty list will be returned - required int requiredLengthThreshold, - // When no best name match is applied, this parameter will be `-1` + /// When no best name match is applied, this parameter will be `-1` required int bestNameIndex, + + /// Return (and sort) only the top-k results. + required int topK, }) { - final list = []; + final builder = TopKSortedListBuilder(topK, (aIndex, bIndex) { + if (aIndex == bestNameIndex) return -1; + if (bIndex == bestNameIndex) return 1; + final aScore = score.getValue(aIndex); + final bScore = score.getValue(bIndex); + final scoreCompare = -aScore.compareTo(bScore); + if (scoreCompare != 0) return scoreCompare; + // if two packages got the same score, order by last updated + return _compareUpdated(_documents[aIndex], _documents[bIndex]); + }); for (var i = 0; i < score.length; i++) { final value = score.getValue(i); if (value <= 0.0 && i != bestNameIndex) continue; - list.add(IndexedPackageHit( - i, PackageHit(package: score.keys[i], score: value))); - } - if (requiredLengthThreshold > list.length) { - // There is no point to sort or even keep the results, as the search query offset ignores these anyway. 
- return []; + builder.add(i); } - list.sort((a, b) { - if (a.index == bestNameIndex) return -1; - if (b.index == bestNameIndex) return 1; - final scoreCompare = -a.hit.score!.compareTo(b.hit.score!); - if (scoreCompare != 0) return scoreCompare; - // if two packages got the same score, order by last updated - return _compareUpdated(_documents[a.index], _documents[b.index]); - }); - return list; + return builder.getTopK().map((i) => IndexedPackageHit( + i, PackageHit(package: score.keys[i], score: score.getValue(i)))); } List _rankWithComparator( diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart index 038955af0..442349a7b 100644 --- a/app/lib/search/token_index.dart +++ b/app/lib/search/token_index.dart @@ -5,6 +5,7 @@ import 'dart:math' as math; import 'package:meta/meta.dart'; +import 'package:pub_dev/shared/utils.dart'; import 'package:pub_dev/third_party/bit_array/bit_array.dart'; import 'text_utils.dart'; @@ -313,21 +314,16 @@ class IndexedScore { } Map top(int count, {double? minValue}) { - final list = []; - double? 
lastValue; + minValue ??= 0.0; + final builder = TopKSortedListBuilder( + count, (a, b) => -_values[a].compareTo(_values[b])); for (var i = 0; i < length; i++) { final v = _values[i]; - if (minValue != null && v < minValue) continue; - if (list.length == count) { - if (lastValue != null && lastValue >= v) continue; - list[count - 1] = i; - } else { - list.add(i); - } - list.sort((a, b) => -_values[a].compareTo(_values[b])); - lastValue = _values[list.last]; + if (v < minValue) continue; + builder.add(i); } - return Map.fromEntries(list.map((i) => MapEntry(_keys[i], _values[i]))); + return Map.fromEntries( + builder.getTopK().map((i) => MapEntry(_keys[i], _values[i]))); } Map toMap() { diff --git a/app/lib/shared/utils.dart b/app/lib/shared/utils.dart index ee659ff22..1d977c4bb 100644 --- a/app/lib/shared/utils.dart +++ b/app/lib/shared/utils.dart @@ -148,6 +148,50 @@ class DurationTracker extends LastNTracker { }; } +/// Builds a sorted list of the top-k items using the provided comparator. +/// +/// Uses binary-search insertion (O(N * log(K)) comparisons); k <= 0 keeps no items. +class TopKSortedListBuilder { + final int _k; + final Comparator _compare; + final _list = []; + + TopKSortedListBuilder(this._k, this._compare); + + void addAll(Iterable items) { + for (final item in items) { + add(item); + } + } + + void add(T item) { + if (_list.length >= _k && (_k <= 0 || _compare(_list.last, item) <= 0)) { + return; + } + var start = 0, end = _list.length; + while (start < end) { + final mid = (start + end) >> 1; + if (_compare(_list[mid], item) <= 0) { + start = mid + 1; + } else { + end = mid; + } + } + if (_list.length < _k) { + _list.insert(start, item); + return; + } + for (var i = _list.length - 1; i > start; i--) { + _list[i] = _list[i - 1]; + } + _list[start] = item; + } + + Iterable getTopK() { + return _list; + } +} + /// Returns the MIME content type based on the name of the file. 
String contentType(String name) { final ext = p.extension(name).replaceAll('.', ''); diff --git a/app/test/shared/utils_test.dart b/app/test/shared/utils_test.dart index c882064f9..8ad56c10b 100644 --- a/app/test/shared/utils_test.dart +++ b/app/test/shared/utils_test.dart @@ -73,4 +73,37 @@ void main() { expect(tracker.getLatency().inMilliseconds, greaterThan(15000)); }); }); + + group('top-k sorted list', () { + int compare(int a, int b) => -a.compareTo(b); + + test('no items', () { + final builder = TopKSortedListBuilder(5, compare); + expect(builder.getTopK().toList(), []); + }); + + test('single item', () { + final builder = TopKSortedListBuilder(5, compare); + builder.add(1); + expect(builder.getTopK().toList(), [1]); + }); + + test('three items ascending', () { + final builder = TopKSortedListBuilder(5, compare); + builder.addAll([1, 2, 3]); + expect(builder.getTopK().toList(), [3, 2, 1]); + }); + + test('three items descending', () { + final builder = TopKSortedListBuilder(5, compare); + builder.addAll([3, 2, 1]); + expect(builder.getTopK().toList(), [3, 2, 1]); + }); + + test('10 items + repeated', () { + final builder = TopKSortedListBuilder(5, compare); + builder.addAll([1, 10, 2, 9, 3, 8, 4, 7, 6, 5, 9]); + expect(builder.getTopK().toList(), [10, 9, 9, 8, 7]); + }); + }); }