Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hunspell: speed up "compress"; minimize the number of the generated entries; don't even consider "forbidden" entries anymore #13429

Merged
merged 2 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,8 @@ Improvements

* GITHUB#13276: UnifiedHighlighter: new 'passageSortComparator' option to allow sorting other than offset order. (Seunghan Jung)

* GITHUB#13429: Hunspell: speed up "compress"; minimize the number of the generated entries; don't even consider "forbidden" entries anymore (Peter Gromov)

Optimizations
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,88 +384,115 @@ private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind

private class WordCompressor {
private final Comparator<State> solutionFitness =
Comparator.comparingInt((State s) -> s.forbidden)
.thenComparingInt(s -> s.underGenerated)
Comparator.comparingInt((State s) -> -s.potentialCoverage)
.thenComparingInt(s -> s.stemToFlags.size())
.thenComparingInt(s -> s.underGenerated)
.thenComparingInt(s -> s.overGenerated);
private final Set<String> forbidden;
private final Runnable checkCanceled;
private final Set<String> wordSet;
private final Set<String> existingStems;
private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
private final Map<String, Set<String>> stemsToForms = new LinkedHashMap<>();

WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
this.forbidden = forbidden;
this.checkCanceled = checkCanceled;
wordSet = new HashSet<>(words);

Stemmer.StemCandidateProcessor processor =
new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
@Override
boolean processStemCandidate(
char[] word,
int offset,
int length,
int lastAffix,
int outerPrefix,
int innerPrefix,
int outerSuffix,
int innerSuffix) {
String candidate = new String(word, offset, length);
stemCounts.merge(candidate, 1, Integer::sum);
CharHashSet flags = new CharHashSet();
if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
stemToPossibleFlags
.computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
.add(new FlagSet(flags, dictionary));
return true;
}
};

for (String word : words) {
checkCanceled.run();
stemCounts.merge(word, 1, Integer::sum);
stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
var processor =
new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
@Override
boolean processStemCandidate(
char[] chars,
int offset,
int length,
int lastAffix,
int outerPrefix,
int innerPrefix,
int outerSuffix,
int innerSuffix) {
String candidate = new String(chars, offset, length);
CharHashSet flags = new CharHashSet();
if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
FlagSet flagSet = new FlagSet(flags, dictionary);
StemWithFlags swf = new StemWithFlags(candidate, Set.of(flagSet));
if (forbidden.isEmpty()
|| allGenerated(swf).stream().noneMatch(forbidden::contains)) {
registerStem(candidate);
stemToPossibleFlags
.computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
.add(flagSet);
}
return true;
}

void registerStem(String stem) {
stemsToForms.computeIfAbsent(stem, __ -> new LinkedHashSet<>()).add(word);
}
};
processor.registerStem(word);
stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor);
}

existingStems =
stemCounts.keySet().stream()
stemsToForms.keySet().stream()
.filter(stem -> dictionary.lookupEntries(stem) != null)
.collect(Collectors.toSet());
}

EntrySuggestion compress() {
Comparator<String> stemSorter =
Comparator.comparing((String s) -> existingStems.contains(s))
.thenComparing(stemCounts::get)
.thenComparing(s -> stemsToForms.get(s).size())
.reversed();
List<String> sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList();
List<String> sortedStems = stemsToForms.keySet().stream().sorted(stemSorter).toList();
PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
Set<Map<String, Set<FlagSet>>> visited = new HashSet<>();
queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
State result = null;
while (!queue.isEmpty()) {
State state = queue.poll();
if (state.underGenerated == 0) {
if (result == null || solutionFitness.compare(state, result) < 0) result = state;
if (state.forbidden == 0) break;
continue;
result = state;
break;
}

for (String stem : sortedStems) {
if (!state.stemToFlags.containsKey(stem)) {
queue.offer(addStem(state, stem));
var withStem = addStem(state, stem);
if (visited.add(withStem)) {
var next = newState(withStem);
if (next != null
&& (state.underGenerated > next.underGenerated
|| next.potentialCoverage > state.potentialCoverage)) {
queue.offer(next);
}
}
}
}

if (state.potentialCoverage < wordSet.size()) {
// don't add flags until the suggested entries can potentially cover all requested forms
continue;
}

for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
if (!entry.getValue().contains(flags)) {
queue.offer(addFlags(state, entry.getKey(), flags));
var withFlags = addFlags(state, entry.getKey(), flags);
if (visited.add(withFlags)) {
var next = newState(withFlags);
if (next != null && state.underGenerated > next.underGenerated) {
queue.offer(next);
}
}
}
}
}
Expand All @@ -482,7 +509,7 @@ EntrySuggestion toSuggestion(State state) {

List<String> extraGenerated = new ArrayList<>();
for (String extra : allGenerated(state.stemToFlags).distinct().sorted().toList()) {
if (wordSet.contains(extra)) continue;
if (wordSet.contains(extra) || existingStems.contains(extra)) continue;

if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
addEntry(toEdit, toAdd, extra, CharHashSet.from(dictionary.forbiddenword));
Expand All @@ -500,39 +527,55 @@ private void addEntry(
(existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
}

private State addStem(State state, String stem) {
LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
private Map<String, Set<FlagSet>> addStem(State state, String stem) {
Map<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
stemToFlags.put(stem, Set.of());
return newState(stemToFlags);
return stemToFlags;
}

private State addFlags(State state, String stem, FlagSet flags) {
LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
private Map<String, Set<FlagSet>> addFlags(State state, String stem, FlagSet flags) {
Map<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
Set<FlagSet> flagSets = new LinkedHashSet<>(stemToFlags.get(stem));
flagSets.add(flags);
stemToFlags.put(stem, flagSets);
return newState(stemToFlags);
return stemToFlags;
}

private State newState(Map<String, Set<FlagSet>> stemToFlags) {
Set<String> allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet());
int overGenerated = 0;
for (String s : allGenerated) {
if (forbidden.contains(s)) return null;
if (!wordSet.contains(s)) overGenerated++;
}

int potentialCoverage =
(int)
stemToFlags.keySet().stream()
.flatMap(s -> stemsToForms.get(s).stream())
.distinct()
.count();
return new State(
stemToFlags,
(int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(),
(int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(),
(int) allGenerated.stream().filter(s -> forbidden.contains(s)).count());
overGenerated,
potentialCoverage);
}

private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();

private record StemWithFlags(String stem, Set<FlagSet> flags) {}

private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
private List<String> allGenerated(StemWithFlags swc) {
Function<StemWithFlags, List<String>> expandToWords =
e -> expand(e.stem, FlagSet.flatten(e.flags)).stream().map(w -> w.getWord()).toList();
return expansionCache.computeIfAbsent(swc, expandToWords);
}

private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
return stemToFlags.entrySet().stream()
.map(e -> new StemWithFlags(e.getKey(), e.getValue()))
.flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
.flatMap(
entry -> allGenerated(new StemWithFlags(entry.getKey(), entry.getValue())).stream());
}

private List<AffixedWord> expand(String stem, CharHashSet flagSet) {
Expand Down Expand Up @@ -561,5 +604,7 @@ private record State(
Map<String, Set<FlagSet>> stemToFlags,
int underGenerated,
int overGenerated,
int forbidden) {}

// The maximum number of requested forms possibly generated by adding only flags to this state
int potentialCoverage) {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ public void testCompressingApi() throws Exception {
Hunspell h = loadNoTimeout("base");
String[] createQuery = {"create", "created", "creates", "creating", "creation"};
checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery);
checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
checkCompression(h, "toEdit=[create/DS], toAdd=[], extra=[]", "creates", "created");
checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");
Expand All @@ -227,6 +227,20 @@ public void testCompressingIsMinimal() throws Exception {
Hunspell h = loadNoTimeout("compress");
checkCompression(
h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms");

checkCompression(h, "toEdit=[], toAdd=[f/def], extra=[]", "f", "fd", "fe", "ff");

WordFormGenerator gen = new WordFormGenerator(h.dictionary);
EntrySuggestion fAbc =
gen.compress(List.of("f", "fa", "fb", "fc"), Set.of("fyy", "fxx"), () -> {});
assertEquals("toEdit=[], toAdd=[f/abc], extra=[]", fAbc.internalsToString());
}

@Test
public void testCompressingIsFastOnLargeUnrelatedWordSets() throws Exception {
Hunspell h = loadNoTimeout("compress");
String[] letters = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"};
checkCompression(h, "toEdit=[], toAdd=[a, b, c, d, e, f, g, h, i, j, k, l], extra=[]", letters);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
FORBIDDENWORD *

SFX G Y 1
SFX G 0 ing/S .

Expand All @@ -12,3 +10,47 @@ SFX S 0 s .
SFX X Y 2
SFX X 0 s .
SFX X 0 x .

# Flags for f,fa,fb,fc

SFX A Y 3
SFX A 0 a .
SFX A 0 b .
SFX A 0 yy .

SFX B Y 3
SFX B 0 c .
SFX B 0 b .
SFX B 0 xx .

SFX a Y 1
SFX a 0 a .

SFX b Y 1
SFX b 0 b .

SFX c Y 1
SFX c 0 c .

# Flags for f,fd,fe,ff with red herring -+* flags that bias the greedy heuristics to prefer the "fd" stem initially

SFX d Y 1
SFX d 0 d .

SFX e Y 1
SFX e 0 e .

SFX f Y 1
SFX f 0 f .

SFX - Y 2
SFX - d 0 d
SFX - d e d

SFX + Y 2
SFX + d 0 d
SFX + d e d

SFX * Y 2
SFX * d 0 d
SFX * d e d