Skip to content

Commit

Permalink
Move the terms index of _id off-heap. (#52518)
Browse files Browse the repository at this point in the history
In #42838 we moved the terms index of all fields off-heap except the
`_id` field because we were worried it might make indexing slower. In
general, the indexing rate is only affected if explicit IDs are used, as
otherwise Elasticsearch almost never performs lookups in the terms
dictionary for the purpose of indexing. So it's quite wasteful to
require the terms index of `_id` to be loaded on-heap for users who have
append-only workloads. Furthermore I've been conducting benchmarks when
indexing with explicit ids on the http_logs dataset that suggest that
the slowdown is low enough that it's probably not worth forcing the terms
index to be kept on-heap. Here are some numbers for the median indexing
rate in docs/s:

| Run | Master  | Patch   |
| --- | ------- | ------- |
| 1   | 45851.2 | 46401.4 |
| 2   | 45192.6 | 44561.0 |
| 3   | 45635.2 | 44137.0 |
| 4   | 46435.0 | 44692.8 |
| 5   | 45829.0 | 44949.0 |

And now heap usage in MB for segments:

| Run | Master  | Patch    |
| --- | ------- | -------- |
| 1   | 41.1720 | 0.352083 |
| 2   | 45.1545 | 0.382534 |
| 3   | 41.7746 | 0.381285 |
| 4   | 45.3673 | 0.412737 |
| 5   | 45.4616 | 0.375063 |

Indexing rate decreased by 1.8% on average, while memory usage decreased
by more than 100x.

The `http_logs` dataset contains small documents and has a simple
indexing chain. More complex indexing chains, e.g. with more fields,
ingest pipelines, etc. would see an even lower decrease of indexing rate.
  • Loading branch information
jpountz authored Feb 24, 2020
1 parent de3d674 commit f993ef8
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexSettings.DEFAULT_PIPELINE,
IndexSettings.FINAL_PIPELINE,
MetaDataIndexStateService.VERIFIED_BEFORE_CLOSE_SETTING,
IndexSettings.ON_HEAP_ID_TERMS_INDEX,

// validate that built-in similarities don't get redefined
Setting.groupSetting("index.similarity.", (s) -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ public final class IndexSettings {
"[true, false, checksum] but was: " + s);
}
}, Property.IndexScope);
// This setting is undocumented as it is considered as an escape hatch.
public static final Setting<Boolean> ON_HEAP_ID_TERMS_INDEX =
Setting.boolSetting("index.force_memory_id_terms_dictionary", false, Property.IndexScope);

/**
* Index setting describing the maximum value of from + size on a query.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2211,22 +2211,24 @@ IndexWriter createWriter(Directory directory, IndexWriterConfig iwc) throws IOEx
}
}

static Map<String, String> getReaderAttributes(Directory directory) {
static Map<String, String> getReaderAttributes(Directory directory, IndexSettings indexSettings) {
Directory unwrap = FilterDirectory.unwrap(directory);
boolean defaultOffHeap = FsDirectoryFactory.isHybridFs(unwrap) || unwrap instanceof MMapDirectory;
HashMap<String, String> map = new HashMap(2);
map.put(BlockTreeTermsReader.FST_MODE_KEY, // if we are using MMAP for term dics we force all off heap unless it's the ID field
defaultOffHeap ? FSTLoadMode.OFF_HEAP.name() : FSTLoadMode.ON_HEAP.name());
map.put(BlockTreeTermsReader.FST_MODE_KEY + "." + IdFieldMapper.NAME, // always force ID field on-heap for fast updates
FSTLoadMode.ON_HEAP.name());
return Collections.unmodifiableMap(map);
Map<String, String> attributes = new HashMap<>();
attributes.put(BlockTreeTermsReader.FST_MODE_KEY, defaultOffHeap ? FSTLoadMode.OFF_HEAP.name() : FSTLoadMode.ON_HEAP.name());
if (IndexSettings.ON_HEAP_ID_TERMS_INDEX.exists(indexSettings.getSettings())) {
final boolean idOffHeap = IndexSettings.ON_HEAP_ID_TERMS_INDEX.get(indexSettings.getSettings()) == false;
attributes.put(BlockTreeTermsReader.FST_MODE_KEY + "." + IdFieldMapper.NAME,
idOffHeap ? FSTLoadMode.OFF_HEAP.name() : FSTLoadMode.ON_HEAP.name());
}
return Collections.unmodifiableMap(attributes);
}

private IndexWriterConfig getIndexWriterConfig() {
final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
iwc.setCommitOnClose(false); // we by default don't commit on close
iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
iwc.setReaderAttributes(getReaderAttributes(store.directory()));
iwc.setReaderAttributes(getReaderAttributes(store.directory(), engineConfig.getIndexSettings()));
iwc.setIndexDeletionPolicy(combinedDeletionPolicy);
// with tests.verbose, lucene sets this up: plumb to align with filesystem stream
boolean verbose = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5886,53 +5886,71 @@ public void testRefreshAndCloseEngineConcurrently() throws Exception {
}

public void testGetReaderAttributes() throws IOException {
Settings.Builder settingsBuilder = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
Settings settings = settingsBuilder.build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("foo", settings);

try(BaseDirectoryWrapper dir = newFSDirectory(createTempDir())) {
Directory unwrap = FilterDirectory.unwrap(dir);
boolean isMMap = unwrap instanceof MMapDirectory;
Map<String, String> readerAttributes = InternalEngine.getReaderAttributes(dir);
assertEquals(2, readerAttributes.size());
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst._id"));
if (isMMap) {
assertEquals("OFF_HEAP", readerAttributes.get("blocktree.terms.fst"));
} else {
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst"));
}
Map<String, String> readerAttributes = InternalEngine.getReaderAttributes(dir, indexSettings);
assertEquals(Collections.singletonMap("blocktree.terms.fst", isMMap ? "OFF_HEAP" : "ON_HEAP"), readerAttributes);
}

try(MMapDirectory dir = new MMapDirectory(createTempDir())) {
Map<String, String> readerAttributes =
InternalEngine.getReaderAttributes(randomBoolean() ? dir :
new MockDirectoryWrapper(random(), dir));
assertEquals(2, readerAttributes.size());
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst._id"));
assertEquals("OFF_HEAP", readerAttributes.get("blocktree.terms.fst"));
new MockDirectoryWrapper(random(), dir), indexSettings);
assertEquals(Collections.singletonMap("blocktree.terms.fst", "OFF_HEAP"), readerAttributes);
}

Settings.Builder settingsBuilder = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
Settings settings = settingsBuilder.build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("foo", settings);
FsDirectoryFactory service = new FsDirectoryFactory();
Path tempDir = createTempDir().resolve(indexSettings.getUUID()).resolve("0");
ShardPath path = new ShardPath(false, tempDir, tempDir, new ShardId(indexSettings.getIndex(), 0));
try (Directory directory = service.newDirectory(indexSettings, path)) {

Map<String, String> readerAttributes =
InternalEngine.getReaderAttributes(randomBoolean() ? directory :
new MockDirectoryWrapper(random(), directory));
assertEquals(2, readerAttributes.size());
new MockDirectoryWrapper(random(), directory), indexSettings);
assertEquals(1, readerAttributes.size());

switch (IndexModule.defaultStoreType(true)) {
case HYBRIDFS:
case MMAPFS:
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst._id"));
assertEquals("OFF_HEAP", readerAttributes.get("blocktree.terms.fst"));
break;
case NIOFS:
case SIMPLEFS:
case FS:
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst"));
break;
default:
fail("unknownw type");
}
}

settingsBuilder.put(IndexSettings.ON_HEAP_ID_TERMS_INDEX.getKey(), true);
settings = settingsBuilder.build();
indexSettings = IndexSettingsModule.newIndexSettings("foo", settings);
try (Directory directory = service.newDirectory(indexSettings, path)) {

Map<String, String> readerAttributes =
InternalEngine.getReaderAttributes(randomBoolean() ? directory :
new MockDirectoryWrapper(random(), directory), indexSettings);
assertEquals(2, readerAttributes.size());

switch (IndexModule.defaultStoreType(true)) {
case HYBRIDFS:
case MMAPFS:
assertEquals("OFF_HEAP", readerAttributes.get("blocktree.terms.fst"));
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst._id"));
break;
case NIOFS:
case SIMPLEFS:
case FS:
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst"));
assertEquals("ON_HEAP", readerAttributes.get("blocktree.terms.fst._id"));
break;
default:
fail("unknownw type");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -457,10 +457,14 @@ protected Settings.Builder setRandomIndexSettings(Random random, Settings.Builde
RandomNumbers.randomIntBetween(random, 1, 15) + "ms");
}

if (randomBoolean()) {
if (random.nextBoolean()) {
builder.put(Store.FORCE_RAM_TERM_DICT.getKey(), true);
}

if (random.nextBoolean()) {
builder.put(IndexSettings.ON_HEAP_ID_TERMS_INDEX.getKey(), random.nextBoolean());
}

return builder;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,7 @@ static String[] extractLeaderShardHistoryUUIDs(Map<String, String> ccrIndexMetaD
nonReplicatedSettings.add(IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING);
nonReplicatedSettings.add(IndexSettings.INDEX_GC_DELETES_SETTING);
nonReplicatedSettings.add(IndexSettings.MAX_REFRESH_LISTENERS_PER_SHARD);
nonReplicatedSettings.add(IndexSettings.ON_HEAP_ID_TERMS_INDEX);

nonReplicatedSettings.add(IndicesRequestCache.INDEX_CACHE_REQUEST_ENABLED_SETTING);
nonReplicatedSettings.add(BitsetFilterCache.INDEX_LOAD_RANDOM_ACCESS_FILTERS_EAGERLY_SETTING);
Expand Down

0 comments on commit f993ef8

Please sign in to comment.