Hunspell: speed up "compress"; minimize the number of the generated entries; don't even consider "forbidden" entries anymore
donnerpeter committed May 28, 2024
1 parent 67d80db commit 194c353
Showing 4 changed files with 159 additions and 56 deletions.
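Before the diffs, a minimal sketch of the API this commit changes, pieced together from the test code further down (sketch only — "dictionary" stands for an already-loaded Hunspell Dictionary instance):

WordFormGenerator gen = new WordFormGenerator(dictionary);

// Ask for a minimal set of dictionary entries that generate exactly the given
// forms; the Set carries forbidden forms that must never be generated, and the
// Runnable is a periodic cancellation check invoked during the search.
EntrySuggestion suggestion = gen.compress(List.of("creates", "created"), Set.of(), () -> {});

// Per the updated test expectation below, the existing "create" stem is now
// reused: "toEdit=[create/DS], toAdd=[], extra=[]"
System.out.println(suggestion.internalsToString());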
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
@@ -79,6 +79,8 @@ Improvements
 
 * GITHUB#13276: UnifiedHighlighter: new 'passageSortComparator' option to allow sorting other than offset order. (Seunghan Jung)
 
+* GITHUB#13429: Hunspell: speed up "compress"; minimize the number of the generated entries; don't even consider "forbidden" entries anymore (Peter Gromov)
+
 Optimizations
 ---------------------
@@ -418,89 +418,116 @@ private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind

   private class WordCompressor {
     private final Comparator<State> solutionFitness =
-        Comparator.comparingInt((State s) -> s.forbidden)
-            .thenComparingInt(s -> s.underGenerated)
+        Comparator.comparingInt((State s) -> -s.potentialCoverage)
+            .thenComparingInt(s -> s.stemToFlags.size())
+            .thenComparingInt(s -> s.underGenerated)
             .thenComparingInt(s -> s.overGenerated);
     private final Set<String> forbidden;
     private final Runnable checkCanceled;
     private final Set<String> wordSet;
     private final Set<String> existingStems;
     private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
-    private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
+    private final Map<String, Set<String>> stemsToForms = new LinkedHashMap<>();
 
     WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
       this.forbidden = forbidden;
       this.checkCanceled = checkCanceled;
       wordSet = new HashSet<>(words);
 
-      Stemmer.StemCandidateProcessor processor =
-          new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
-            @Override
-            boolean processStemCandidate(
-                char[] word,
-                int offset,
-                int length,
-                int lastAffix,
-                int outerPrefix,
-                int innerPrefix,
-                int outerSuffix,
-                int innerSuffix) {
-              String candidate = new String(word, offset, length);
-              stemCounts.merge(candidate, 1, Integer::sum);
-              CharHashSet flags = new CharHashSet();
-              if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
-              if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
-              if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
-              if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
-              stemToPossibleFlags
-                  .computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
-                  .add(new FlagSet(flags, dictionary));
-              return true;
-            }
-          };
-
       for (String word : words) {
         checkCanceled.run();
-        stemCounts.merge(word, 1, Integer::sum);
         stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
+        var processor =
+            new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
+              @Override
+              boolean processStemCandidate(
+                  char[] chars,
+                  int offset,
+                  int length,
+                  int lastAffix,
+                  int outerPrefix,
+                  int innerPrefix,
+                  int outerSuffix,
+                  int innerSuffix) {
+                String candidate = new String(chars, offset, length);
+                CharHashSet flags = new CharHashSet();
+                if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
+                if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
+                if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
+                if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
+                FlagSet flagSet = new FlagSet(flags, dictionary);
+                StemWithFlags swf = new StemWithFlags(candidate, Set.of(flagSet));
+                if (forbidden.isEmpty()
+                    || allGenerated(swf).stream().noneMatch(forbidden::contains)) {
+                  registerStem(candidate);
+                  stemToPossibleFlags
+                      .computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
+                      .add(flagSet);
+                }
+                return true;
+              }
+
+              void registerStem(String stem) {
+                stemsToForms.computeIfAbsent(stem, __ -> new LinkedHashSet<>()).add(word);
+              }
+            };
+        processor.registerStem(word);
         stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor);
       }
 
       existingStems =
-          stemCounts.keySet().stream()
+          stemsToForms.keySet().stream()
              .filter(stem -> dictionary.lookupEntries(stem) != null)
              .collect(Collectors.toSet());
     }
 
     EntrySuggestion compress() {
       Comparator<String> stemSorter =
           Comparator.comparing((String s) -> existingStems.contains(s))
-              .thenComparing(stemCounts::get)
+              .thenComparing(s -> stemsToForms.get(s).size())
               .reversed();
       List<String> sortedStems =
-          stemCounts.keySet().stream().sorted(stemSorter).collect(Collectors.toList());
+          stemsToForms.keySet().stream().sorted(stemSorter).collect(Collectors.toList());
       PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
+      Set<Map<String, Set<FlagSet>>> visited = new HashSet<>();
       queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
       State result = null;
       while (!queue.isEmpty()) {
         State state = queue.poll();
         if (state.underGenerated == 0) {
-          if (result == null || solutionFitness.compare(state, result) < 0) result = state;
-          if (state.forbidden == 0) break;
-          continue;
+          result = state;
+          break;
         }
 
         for (String stem : sortedStems) {
           if (!state.stemToFlags.containsKey(stem)) {
-            queue.offer(addStem(state, stem));
+            var withStem = addStem(state, stem);
+            if (visited.add(withStem)) {
+              var next = newState(withStem);
+              if (next != null
+                  && (state.underGenerated > next.underGenerated
+                      || next.potentialCoverage > state.potentialCoverage)) {
+                queue.offer(next);
+              }
+            }
           }
         }
 
+        if (state.potentialCoverage < wordSet.size()) {
+          // don't add flags until the suggested entries can potentially cover all requested forms
+          continue;
+        }
+
         for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
           for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
             if (!entry.getValue().contains(flags)) {
-              queue.offer(addFlags(state, entry.getKey(), flags));
+              var withFlags = addFlags(state, entry.getKey(), flags);
+              if (visited.add(withFlags)) {
+                var next = newState(withFlags);
+                if (next != null && state.underGenerated > next.underGenerated) {
+                  queue.offer(next);
+                }
+              }
             }
           }
         }
@@ -518,7 +545,7 @@ EntrySuggestion toSuggestion(State state) {
       List<String> extraGenerated = new ArrayList<>();
       for (String extra :
           allGenerated(state.stemToFlags).distinct().sorted().collect(Collectors.toList())) {
-        if (wordSet.contains(extra)) continue;
+        if (wordSet.contains(extra) || existingStems.contains(extra)) continue;
 
         if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
           addEntry(toEdit, toAdd, extra, CharHashSet.from(dictionary.forbiddenword));
@@ -536,27 +563,39 @@ private void addEntry(
       (existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
     }
 
-    private State addStem(State state, String stem) {
-      LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
+    private Map<String, Set<FlagSet>> addStem(State state, String stem) {
+      Map<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
       stemToFlags.put(stem, Set.of());
-      return newState(stemToFlags);
+      return stemToFlags;
     }
 
-    private State addFlags(State state, String stem, FlagSet flags) {
-      LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
+    private Map<String, Set<FlagSet>> addFlags(State state, String stem, FlagSet flags) {
+      Map<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
       Set<FlagSet> flagSets = new LinkedHashSet<>(stemToFlags.get(stem));
       flagSets.add(flags);
       stemToFlags.put(stem, flagSets);
-      return newState(stemToFlags);
+      return stemToFlags;
     }
 
     private State newState(Map<String, Set<FlagSet>> stemToFlags) {
       Set<String> allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet());
+      int overGenerated = 0;
+      for (String s : allGenerated) {
+        if (forbidden.contains(s)) return null;
+        if (!wordSet.contains(s)) overGenerated++;
+      }
+
+      int potentialCoverage =
+          (int)
+              stemToFlags.keySet().stream()
+                  .flatMap(s -> stemsToForms.get(s).stream())
+                  .distinct()
+                  .count();
       return new State(
           stemToFlags,
           (int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(),
-          (int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(),
-          (int) allGenerated.stream().filter(s -> forbidden.contains(s)).count());
+          overGenerated,
+          potentialCoverage);
     }
 
     private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();
@@ -584,15 +623,19 @@ public int hashCode() {
       }
     }
 
-    private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
+    private List<String> allGenerated(StemWithFlags swc) {
       Function<StemWithFlags, List<String>> expandToWords =
           e ->
               expand(e.stem, FlagSet.flatten(e.flags)).stream()
                   .map(w -> w.getWord())
                   .collect(Collectors.toList());
+      return expansionCache.computeIfAbsent(swc, expandToWords);
+    }
+
+    private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
       return stemToFlags.entrySet().stream()
-          .map(e -> new StemWithFlags(e.getKey(), e.getValue()))
-          .flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
+          .flatMap(
+              entry -> allGenerated(new StemWithFlags(entry.getKey(), entry.getValue())).stream());
     }
 
     private List<AffixedWord> expand(String stem, CharHashSet flagSet) {
@@ -642,17 +685,19 @@ private static class State {
     final Map<String, Set<FlagSet>> stemToFlags;
     final int underGenerated;
     final int overGenerated;
-    final int forbidden;
+
+    // The maximum number of requested forms possibly generated by adding only flags to this state
+    final int potentialCoverage;
 
     State(
         Map<String, Set<FlagSet>> stemToFlags,
         int underGenerated,
         int overGenerated,
-        int forbidden) {
+        int potentialCoverage) {
       this.stemToFlags = stemToFlags;
       this.underGenerated = underGenerated;
       this.overGenerated = overGenerated;
-      this.forbidden = forbidden;
+      this.potentialCoverage = potentialCoverage;
     }
   }
 }
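The compress() rewrite above replaces the old enqueue-everything loop with a best-first search over stem-to-flags states: the queue prefers states that could still cover more requested forms (potentialCoverage), a visited set collapses the different orders in which the same stems can be added, and newState() now returns null to cut off any state that would generate a forbidden word. A self-contained toy in the same spirit (my illustration under those assumptions, not Lucene code; all names and data are made up):

// Best-first search with a visited set and coverage-first priority,
// applied to a tiny invented set-cover problem.
import java.util.*;

public class BestFirstCoverDemo {
  record State(Set<String> chosen, Set<String> covered) {}

  public static void main(String[] args) {
    Set<String> wanted = Set.of("f", "fd", "fe", "ff");
    // candidate "entries" and the forms each of them would generate
    Map<String, Set<String>> candidates =
        Map.of(
            "f/d", Set.of("f", "fd"),
            "f/e", Set.of("f", "fe"),
            "f/f", Set.of("f", "ff"),
            "f/def", Set.of("f", "fd", "fe", "ff"));

    // like solutionFitness: prefer broader coverage, then fewer entries
    Comparator<State> fitness =
        Comparator.comparingInt((State s) -> -s.covered().size())
            .thenComparingInt(s -> s.chosen().size());

    PriorityQueue<State> queue = new PriorityQueue<>(fitness);
    Set<Set<String>> visited = new HashSet<>(); // dedupe by chosen-entry set
    queue.offer(new State(Set.of(), Set.of()));

    while (!queue.isEmpty()) {
      State state = queue.poll();
      if (state.covered().equals(wanted)) {
        // like the patch, stop at the first fully covering state dequeued
        System.out.println("solution: " + state.chosen());
        return;
      }
      for (Map.Entry<String, Set<String>> e : candidates.entrySet()) {
        if (state.chosen().contains(e.getKey())) continue;
        Set<String> chosen = new HashSet<>(state.chosen());
        chosen.add(e.getKey());
        if (!visited.add(Set.copyOf(chosen))) continue; // seen this combination
        Set<String> covered = new HashSet<>(state.covered());
        covered.addAll(e.getValue());
        if (covered.size() > state.covered().size()) { // prune useless branches
          queue.offer(new State(Set.copyOf(chosen), Set.copyOf(covered)));
        }
      }
    }
  }
}

Running the toy prints "solution: [f/def]": the single entry covering everything wins over any combination of the narrower ones, which mirrors the minimality the tests below assert.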
@@ -217,7 +217,7 @@ public void testCompressingApi() throws Exception {
     Hunspell h = loadNoTimeout("base");
     String[] createQuery = {"create", "created", "creates", "creating", "creation"};
     checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery);
-    checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
+    checkCompression(h, "toEdit=[create/DS], toAdd=[], extra=[]", "creates", "created");
     checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
     checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
     checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");
@@ -231,6 +231,20 @@ public void testCompressingIsMinimal() throws Exception {
     Hunspell h = loadNoTimeout("compress");
     checkCompression(
         h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms");
+
+    checkCompression(h, "toEdit=[], toAdd=[f/def], extra=[]", "f", "fd", "fe", "ff");
+
+    WordFormGenerator gen = new WordFormGenerator(h.dictionary);
+    EntrySuggestion fAbc =
+        gen.compress(List.of("f", "fa", "fb", "fc"), Set.of("fyy", "fxx"), () -> {});
+    assertEquals("toEdit=[], toAdd=[f/abc], extra=[]", fAbc.internalsToString());
+  }
+
+  @Test
+  public void testCompressingIsFastOnLargeUnrelatedWordSets() throws Exception {
+    Hunspell h = loadNoTimeout("compress");
+    String[] letters = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"};
+    checkCompression(h, "toEdit=[], toAdd=[a, b, c, d, e, f, g, h, i, j, k, l], extra=[]", letters);
   }
 
   @Test
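A rough sense of why the new test above stays fast (my estimate, not a claim from the commit): with twelve unrelated words each word is its own only stem candidate, so a search without deduplication can re-enqueue the same stem combination once per insertion order — on the order of 12! ≈ 479,001,600 states — whereas the visited set introduced above keys states by their stem-to-flags map, bounding the search by the 2^12 = 4,096 distinct stem subsets, and the coverage-first queue ordering reaches the full twelve-stem solution within a handful of polls.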
@@ -1,5 +1,3 @@
-FORBIDDENWORD *
-
 SFX G Y 1
 SFX G 0 ing/S .
 
@@ -12,3 +10,47 @@ SFX S 0 s .
 SFX X Y 2
 SFX X 0 s .
 SFX X 0 x .
+
+# Flags for f,fa,fb,fc
+
+SFX A Y 3
+SFX A 0 a .
+SFX A 0 b .
+SFX A 0 yy .
+
+SFX B Y 3
+SFX B 0 c .
+SFX B 0 b .
+SFX B 0 xx .
+
+SFX a Y 1
+SFX a 0 a .
+
+SFX b Y 1
+SFX b 0 b .
+
+SFX c Y 1
+SFX c 0 c .
+
+# Flags for f,fd,fe,ff with red herring -+* flags that bias the greedy heuristics to prefer the "fd" stem initially
+
+SFX d Y 1
+SFX d 0 d .
+
+SFX e Y 1
+SFX e 0 e .
+
+SFX f Y 1
+SFX f 0 f .
+
+SFX - Y 2
+SFX - d 0 d
+SFX - d e d
+
+SFX + Y 2
+SFX + d 0 d
+SFX + d e d
+
+SFX * Y 2
+SFX * d 0 d
+SFX * d e d
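Tying the fixture to the assertions (my reading of the rules, not text from the commit): the entry "f/def" expands as "f" itself plus f+d → "fd", f+e → "fe", f+f → "ff", covering the four-word query exactly, while the decoy -, + and * suffixes apply to words ending in "d", so the longer stem "fd" could also generate "f" (strip the d) and "fe" (d → e) — the bait that previously lured the greedy heuristics:

// Walk-through of the assertion from the minimality test above: only "f/def"
// covers {f, fd, fe, ff} with a single added entry, so the coverage-first
// search should pick it over any solution rooted at the decoy "fd" stem.
checkCompression(h, "toEdit=[], toAdd=[f/def], extra=[]", "f", "fd", "fe", "ff");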
