Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move synonym map off-heap for SynonymGraphFilter #13054

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,8 @@ private void parse() throws IOException {
}

// Interleaves all output tokens onto the futureOutputs:
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
throws IOException {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

final int code = bytesReader.readVInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ private boolean parse() throws IOException {
* Expands the output graph into the necessary tokens, adding synonyms as side paths parallel to
* the input tokens, and buffers them in the output token buffer.
*/
private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
private void bufferOutputTokens(BytesRef bytes, int matchInputLength) throws IOException {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

final int code = bytesReader.readVInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Iterator;
Expand Down Expand Up @@ -85,6 +86,7 @@ public class SynonymGraphFilterFactory extends TokenFilterFactory implements Res
private final boolean expand;
private final String analyzerName;
private final Map<String, String> tokArgs = new HashMap<>();
private final Path compiledSynonymsPath;

private SynonymMap map;

Expand Down Expand Up @@ -113,6 +115,9 @@ public SynonymGraphFilterFactory(Map<String, String> args) {
itr.remove();
}
}
String compiledSynonymsPathArg = get(args, "compiledSynonymsPath");
compiledSynonymsPath =
compiledSynonymsPathArg == null ? null : Path.of(compiledSynonymsPathArg);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
Expand Down Expand Up @@ -168,29 +173,36 @@ protected TokenStreamComponents createComponents(String fieldName) {
protected SynonymMap loadSynonyms(
ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
throws IOException, ParseException {
CharsetDecoder decoder =
StandardCharsets.UTF_8
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

SynonymMap.Parser parser;
Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
try {
parser =
clazz
.getConstructor(boolean.class, boolean.class, Analyzer.class)
.newInstance(dedup, expand, analyzer);
} catch (Exception e) {
throw new RuntimeException(e);
SynonymMapDirectory compiledSynonymsDirectory = null;
if (compiledSynonymsPath != null) {
compiledSynonymsDirectory = new SynonymMapDirectory(compiledSynonymsPath);
}
if (compiledSynonymsDirectory == null || compiledSynonymsDirectory.hasSynonyms() == false) {
CharsetDecoder decoder =
StandardCharsets.UTF_8
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

SynonymMap.Parser parser;
Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
try {
parser =
clazz
.getConstructor(boolean.class, boolean.class, Analyzer.class)
.newInstance(dedup, expand, analyzer);
} catch (Exception e) {
throw new RuntimeException(e);
}

List<String> files = splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.parse(new InputStreamReader(loader.openResource(file), decoder));
List<String> files = splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.parse(new InputStreamReader(loader.openResource(file), decoder));
}
return parser.build(compiledSynonymsDirectory);
}
return parser.build();
return compiledSynonymsDirectory.readMap();
}

// (there are no tests for this functionality)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.internal.hppc.IntHashSet;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
Expand All @@ -53,12 +54,24 @@ public class SynonymMap {
public final FST<BytesRef> fst;

/** map&lt;ord, outputword&gt; */
public final BytesRefHash words;
public final SynonymDictionary words;

/** maxHorizontalContext: maximum context we need on the tokenstream */
public final int maxHorizontalContext;

public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
this(
fst,
new SynonymDictionary() {
@Override
public void get(int id, BytesRef scratch) {
words.get(id, scratch);
}
},
maxHorizontalContext);
}

SynonymMap(FST<BytesRef> fst, SynonymDictionary words, int maxHorizontalContext) {
this.fst = fst;
this.words = words;
this.maxHorizontalContext = maxHorizontalContext;
Expand Down Expand Up @@ -218,12 +231,26 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
add(input, countWords(input), output, countWords(output), includeOrig);
}

/** Builds an {@link SynonymMap} and returns it. */
/** Buils a {@link SynonymMap} and returns it. */
public SynonymMap build() throws IOException {
return build(null);
}

/**
* Builds a {@link SynonymMap} and returns it. If directory is non-null, it will write the
* compiled SynonymMap to disk and return an off-heap version.
*/
public SynonymMap build(SynonymMapDirectory directory) throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
FSTCompiler.Builder<BytesRef> fstCompilerBuilder =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
IndexOutput fstOutput = null;
if (directory != null) {
fstOutput = directory.fstOutput();
fstCompilerBuilder.dataOutput(fstOutput);
}
FSTCompiler<BytesRef> fstCompiler = fstCompilerBuilder.build();

BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
Expand Down Expand Up @@ -290,11 +317,28 @@ public SynonymMap build() throws IOException {
fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
}

FST<BytesRef> fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
FST.FSTMetadata<BytesRef> fstMetaData = fstCompiler.compile();
if (directory != null) {
fstOutput.close(); // TODO -- Should fstCompiler.compile take care of this?
try (SynonymMapDirectory.WordsOutput wordsOutput = directory.wordsOutput()) {
BytesRef scratchRef = new BytesRef();
for (int i = 0; i < words.size(); i++) {
words.get(i, scratchRef);
wordsOutput.addWord(scratchRef);
}
}
directory.writeMetadata(words.size(), maxHorizontalContext, fstMetaData);
return directory.readMap();
}
FST<BytesRef> fst = FST.fromFSTReader(fstMetaData, fstCompiler.getFSTReader());
return new SynonymMap(fst, words, maxHorizontalContext);
}
}

abstract static class SynonymDictionary {
public abstract void get(int id, BytesRef scratch) throws IOException;
}

/**
* Abstraction for parsing synonym files.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym;

import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;

/**
* Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the
* FST and output words are kept off-heap.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the user control separately whether FST and output words are off heap or not?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At this point, if you're using SynonymMapDirectory, you get off-heap FST and on-heap words.

If you don't use SynonymMapDirectory (i.e. you're using the existing constructor or the arg-less version of SynonymMap.Builder.build()), then everything is on-heap like before.

The numbers I posted in #13054 (comment) (which, granted, was just a single synthetic benchmark) seemed to suggest (to me, at least) that the "sweet spot" is off-heap FST with on-heap words. The performance hit from moving words off-heap (at least with my implementation) was pretty bad. Lots of seeking involved. Also, the vast majority of heap savings came from moving the FST.

I'm happy to bring back off-heap words as an option if we think someone would be willing to take that perf hit for slightly lower heap utilization.

*
* @lucene.experimental
*/
public class SynonymMapDirectory implements Closeable {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe mark this with @lucene.experimental so we are free to change the API within non-major releases?

Or: could this be package private? Does the user need to create this wrapper themselves for some reason?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking that the user would create a SynonymMapDirectory and pass it to SynonymMap.Builder.build(SynonymMapDirectory) as the way of opting in to off-heap FSTs for their synonyms.

Given the issue you called out below regarding the need to close the IndexInput for the FST, I feel like the user needs to hold onto "something" (other than the SynonymMap) that gives them an obligation to close filesystem resources when they're done.

Alternatively, I'd be happy to make SynonymMap implement Closeable. Then I'd probably just ask the user to specify a Path instead. At that point, we could hide SynonymMapDirectory altogether.

private final SynonymMapFormat synonymMapFormat =
new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less?
private final Directory directory;
private final List<Closeable> resources = new ArrayList<>();

public SynonymMapDirectory(Path path) throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to store several SynonymMaps in one Directory? Or one must make a separate Directory for each? That's maybe fine ... e.g. one could make a FilterDirectory impl that can share a single underlying filesystem directory and distinguish the files by e.g. a unique filename prefix or so.

Copy link
Contributor Author

@msfroh msfroh Aug 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, since I split the synonyms across three files (synonyms.mdt, synonyms.wrd, and synonyms.fst), I assumed that there would be a single synonym map per directory.

That said, I suppose it wouldn't be hard to combine those three into a single file (with a .syn extension, say), where the SynonymMapDirectory could look at a prefix. Specifically, the current implementation reads the metadata and words once (keeping the words on-heap), then spends the rest of its time in the FST.

Then a single filesystem directory could have something like:

first_synonyms.syn
second_synonyms.syn
... etc. ...

What do you think? (I'm also happy to let each serialized SynonymMap live in its own directory.)

directory = FSDirectory.open(path);
}

IndexOutput fstOutput() throws IOException {
return synonymMapFormat.getFSTOutput(directory);
}

WordsOutput wordsOutput() throws IOException {
return synonymMapFormat.getWordsOutput(directory);
}

void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata)
throws IOException {
synonymMapFormat.writeMetadata(
directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata));
}

SynonymMap readMap() throws IOException {
CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory);
resources.add(closeableSynonymMap);
return closeableSynonymMap.map;
}

boolean hasSynonyms() throws IOException {
// TODO should take the path to the synonyms file to compare file hash against file used to
// build the directory
return directory.listAll().length > 0;
}

@Override
public void close() throws IOException {
for (Closeable c : resources) {
c.close();
}
directory.close();
}

/**
* Abstraction to support writing individual output words to the directory. Should be closed after
* the last word is written.
*/
abstract static class WordsOutput implements Closeable {
public abstract void addWord(BytesRef word) throws IOException;
}

private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable {
@Override
public void close() throws IOException {
indexInput.close();
}
}

private record SynonymMetadata(
int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {}

private static class SynonymMapFormat {
private static final String FST_FILE = "synonyms.fst";
private static final String WORDS_FILE = "synonyms.wrd";
private static final String METADATA_FILE = "synonyms.mdt";

private IndexOutput getFSTOutput(Directory directory) throws IOException {
return directory.createOutput(FST_FILE, IOContext.DEFAULT);
}

private WordsOutput getWordsOutput(Directory directory) throws IOException {
IndexOutput wordsOutput = directory.createOutput(WORDS_FILE, IOContext.DEFAULT);
return new WordsOutput() {
@Override
public void close() throws IOException {
wordsOutput.close();
}

@Override
public void addWord(BytesRef word) throws IOException {
wordsOutput.writeVInt(word.length);
wordsOutput.writeBytes(word.bytes, word.offset, word.length);
}
};
}
;

private void writeMetadata(Directory directory, SynonymMetadata synonymMetadata)
throws IOException {
try (IndexOutput metadataOutput = directory.createOutput(METADATA_FILE, IOContext.DEFAULT)) {
metadataOutput.writeVInt(synonymMetadata.wordCount);
metadataOutput.writeVInt(synonymMetadata.maxHorizontalContext);
synonymMetadata.fstMetadata.save(metadataOutput);
}
directory.sync(List.of(FST_FILE, WORDS_FILE, METADATA_FILE));
}

private SynonymMetadata readMetadata(Directory directory) throws IOException {
try (IndexInput metadataInput = directory.openInput(METADATA_FILE, IOContext.READONCE)) {
int wordCount = metadataInput.readVInt();
int maxHorizontalContext = metadataInput.readVInt();
FST.FSTMetadata<BytesRef> fstMetadata =
FST.readMetadata(metadataInput, ByteSequenceOutputs.getSingleton());
return new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata);
}
}

private CloseableSynonymMap readSynonymMap(Directory directory) throws IOException {
SynonymMetadata synonymMetadata = readMetadata(directory);
IndexInput in = directory.openInput(FST_FILE, IOContext.DEFAULT);
FST<BytesRef> fst =
FST.fromFSTReader(
synonymMetadata.fstMetadata, new OffHeapFSTStore(in, 0, synonymMetadata.fstMetadata));
OnHeapSynonymDictionary words;
try (IndexInput wordsInput = directory.openInput(WORDS_FILE, IOContext.DEFAULT)) {
words = new OnHeapSynonymDictionary(synonymMetadata.wordCount, wordsInput);
}
SynonymMap map = new SynonymMap(fst, words, synonymMetadata.maxHorizontalContext);
return new CloseableSynonymMap(map, in);
}

private static class OnHeapSynonymDictionary extends SynonymMap.SynonymDictionary {
private final int[] bytesStartArray;
private final byte[] wordBytes;

private OnHeapSynonymDictionary(int wordCount, IndexInput wordsFile) throws IOException {
bytesStartArray = new int[wordCount + 1];
int pos = 0;
for (int i = 0; i < wordCount; i++) {
bytesStartArray[i] = pos;
int size = wordsFile.readVInt();
pos += size;
wordsFile.seek(wordsFile.getFilePointer() + size);
}
bytesStartArray[wordCount] = pos;
wordsFile.seek(0);
wordBytes = new byte[pos];
for (int i = 0; i < wordCount; i++) {
int size = wordsFile.readVInt();
wordsFile.readBytes(wordBytes, bytesStartArray[i], size);
}
}

@Override
public void get(int id, BytesRef scratch) {
scratch.bytes = wordBytes;
scratch.offset = bytesStartArray[id];
scratch.length = bytesStartArray[id + 1] - bytesStartArray[id];
}
}
}
}
Loading