From cc9581e062c2906ff826588c12bbbec737b50fc9 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 27 May 2024 19:46:12 +0200 Subject: [PATCH 1/2] hunspell: speed up "compress"; minimize the number of the generated entries; don't even consider "forbidden" entries anymore --- .../analysis/hunspell/WordFormGenerator.java | 147 ++++++++++++------ .../analysis/hunspell/TestHunspell.java | 16 +- .../lucene/analysis/hunspell/compress.aff | 46 +++++- 3 files changed, 155 insertions(+), 54 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java index bfcccf0640af..f6fc9335eeeb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java @@ -384,57 +384,65 @@ private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind private class WordCompressor { private final Comparator solutionFitness = - Comparator.comparingInt((State s) -> s.forbidden) - .thenComparingInt(s -> s.underGenerated) + Comparator.comparingInt((State s) -> -s.potentialCoverage) .thenComparingInt(s -> s.stemToFlags.size()) + .thenComparingInt(s -> s.underGenerated) .thenComparingInt(s -> s.overGenerated); private final Set forbidden; private final Runnable checkCanceled; private final Set wordSet; private final Set existingStems; private final Map> stemToPossibleFlags = new HashMap<>(); - private final Map stemCounts = new LinkedHashMap<>(); + private final Map> stemsToForms = new LinkedHashMap<>(); WordCompressor(List words, Set forbidden, Runnable checkCanceled) { this.forbidden = forbidden; this.checkCanceled = checkCanceled; wordSet = new HashSet<>(words); - Stemmer.StemCandidateProcessor processor = - new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) { - @Override - boolean processStemCandidate( - char[] word, - int offset, - int length, - int lastAffix, - int outerPrefix, - int innerPrefix, - int outerSuffix, - int innerSuffix) { - String candidate = new String(word, offset, length); - stemCounts.merge(candidate, 1, Integer::sum); - CharHashSet flags = new CharHashSet(); - if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG)); - if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG)); - if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG)); - if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG)); - stemToPossibleFlags - .computeIfAbsent(candidate, __ -> new LinkedHashSet<>()) - .add(new FlagSet(flags, dictionary)); - return true; - } - }; - for (String word : words) { checkCanceled.run(); - stemCounts.merge(word, 1, Integer::sum); stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>()); + var processor = + new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) { + @Override + boolean processStemCandidate( + char[] chars, + int offset, + int length, + int lastAffix, + int outerPrefix, + int innerPrefix, + int outerSuffix, + int innerSuffix) { + String candidate = new String(chars, offset, length); + CharHashSet flags = new CharHashSet(); + if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG)); + if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG)); + if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG)); + if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG)); + FlagSet flagSet = new FlagSet(flags, dictionary); + StemWithFlags swf = new StemWithFlags(candidate, Set.of(flagSet)); + if (forbidden.isEmpty() + || allGenerated(swf).stream().noneMatch(forbidden::contains)) { + registerStem(candidate); + stemToPossibleFlags + .computeIfAbsent(candidate, __ -> new LinkedHashSet<>()) + .add(flagSet); + } + return true; + } + + void registerStem(String stem) { + stemsToForms.computeIfAbsent(stem, __ -> new LinkedHashSet<>()).add(word); + } + }; + processor.registerStem(word); stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor); } existingStems = - stemCounts.keySet().stream() + stemsToForms.keySet().stream() .filter(stem -> dictionary.lookupEntries(stem) != null) .collect(Collectors.toSet()); } @@ -442,30 +450,49 @@ boolean processStemCandidate( EntrySuggestion compress() { Comparator stemSorter = Comparator.comparing((String s) -> existingStems.contains(s)) - .thenComparing(stemCounts::get) + .thenComparing(s -> stemsToForms.get(s).size()) .reversed(); - List sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList(); + List sortedStems = stemsToForms.keySet().stream().sorted(stemSorter).toList(); PriorityQueue queue = new PriorityQueue<>(solutionFitness); + Set>> visited = new HashSet<>(); queue.offer(new State(Map.of(), wordSet.size(), 0, 0)); State result = null; while (!queue.isEmpty()) { State state = queue.poll(); if (state.underGenerated == 0) { - if (result == null || solutionFitness.compare(state, result) < 0) result = state; - if (state.forbidden == 0) break; - continue; + result = state; + break; } for (String stem : sortedStems) { if (!state.stemToFlags.containsKey(stem)) { - queue.offer(addStem(state, stem)); + var withStem = addStem(state, stem); + if (visited.add(withStem)) { + var next = newState(withStem); + if (next != null + && (state.underGenerated > next.underGenerated + || next.potentialCoverage > state.potentialCoverage)) { + queue.offer(next); + } + } } } + if (state.potentialCoverage < wordSet.size()) { + // don't add flags until the suggested entries can potentially cover all requested forms + continue; + } + for (Map.Entry> entry : state.stemToFlags.entrySet()) { for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) { if (!entry.getValue().contains(flags)) { - queue.offer(addFlags(state, entry.getKey(), flags)); + var withFlags = addFlags(state, entry.getKey(), flags); + if (visited.add(withFlags)) { + var next = newState(withFlags); + if (next != null && state.underGenerated > next.underGenerated) { + queue.offer(next); + } + } } } } @@ -482,7 +509,7 @@ EntrySuggestion toSuggestion(State state) { List extraGenerated = new ArrayList<>(); for (String extra : allGenerated(state.stemToFlags).distinct().sorted().toList()) { - if (wordSet.contains(extra)) continue; + if (wordSet.contains(extra) || existingStems.contains(extra)) continue; if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) { addEntry(toEdit, toAdd, extra, CharHashSet.from(dictionary.forbiddenword)); @@ -500,39 +527,55 @@ private void addEntry( (existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString)); } - private State addStem(State state, String stem) { - LinkedHashMap> stemToFlags = new LinkedHashMap<>(state.stemToFlags); + private Map> addStem(State state, String stem) { + Map> stemToFlags = new LinkedHashMap<>(state.stemToFlags); stemToFlags.put(stem, Set.of()); - return newState(stemToFlags); + return stemToFlags; } - private State addFlags(State state, String stem, FlagSet flags) { - LinkedHashMap> stemToFlags = new LinkedHashMap<>(state.stemToFlags); + private Map> addFlags(State state, String stem, FlagSet flags) { + Map> stemToFlags = new LinkedHashMap<>(state.stemToFlags); Set flagSets = new LinkedHashSet<>(stemToFlags.get(stem)); flagSets.add(flags); stemToFlags.put(stem, flagSets); - return newState(stemToFlags); + return stemToFlags; } private State newState(Map> stemToFlags) { Set allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet()); + int overGenerated = 0; + for (String s : allGenerated) { + if (forbidden.contains(s)) return null; + if (!wordSet.contains(s)) overGenerated++; + } + + int potentialCoverage = + (int) + stemToFlags.keySet().stream() + .flatMap(s -> stemsToForms.get(s).stream()) + .distinct() + .count(); return new State( stemToFlags, (int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(), - (int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(), - (int) allGenerated.stream().filter(s -> forbidden.contains(s)).count()); + overGenerated, + potentialCoverage); } private final Map> expansionCache = new HashMap<>(); private record StemWithFlags(String stem, Set flags) {} - private Stream allGenerated(Map> stemToFlags) { + private List allGenerated(StemWithFlags swc) { Function> expandToWords = e -> expand(e.stem, FlagSet.flatten(e.flags)).stream().map(w -> w.getWord()).toList(); + return expansionCache.computeIfAbsent(swc, expandToWords); + } + + private Stream allGenerated(Map> stemToFlags) { return stemToFlags.entrySet().stream() - .map(e -> new StemWithFlags(e.getKey(), e.getValue())) - .flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream()); + .flatMap( + entry -> allGenerated(new StemWithFlags(entry.getKey(), entry.getValue())).stream()); } private List expand(String stem, CharHashSet flagSet) { @@ -561,5 +604,7 @@ private record State( Map> stemToFlags, int underGenerated, int overGenerated, - int forbidden) {} + + // The maximum number of requested forms possibly generated by adding only flags to this state + int potentialCoverage) {} } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java index ec1af2fd96ae..cbb876262ebe 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java @@ -213,7 +213,7 @@ public void testCompressingApi() throws Exception { Hunspell h = loadNoTimeout("base"); String[] createQuery = {"create", "created", "creates", "creating", "creation"}; checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery); - checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created"); + checkCompression(h, "toEdit=[create/DS], toAdd=[], extra=[]", "creates", "created"); checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations"); checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def"); checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms"); @@ -227,6 +227,20 @@ public void testCompressingIsMinimal() throws Exception { Hunspell h = loadNoTimeout("compress"); checkCompression( h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms"); + + checkCompression(h, "toEdit=[], toAdd=[f/def], extra=[]", "f", "fd", "fe", "ff"); + + WordFormGenerator gen = new WordFormGenerator(h.dictionary); + EntrySuggestion fAbc = + gen.compress(List.of("f", "fa", "fb", "fc"), Set.of("fyy", "fxx"), () -> {}); + assertEquals("toEdit=[], toAdd=[f/abc], extra=[]", fAbc.internalsToString()); + } + + @Test + public void testCompressingIsFastOnLargeUnrelatedWordSets() throws Exception { + Hunspell h = loadNoTimeout("compress"); + String[] letters = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}; + checkCompression(h, "toEdit=[], toAdd=[a, b, c, d, e, f, g, h, i, j, k, l], extra=[]", letters); } @Test diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff index 70642d81ebf0..d484d8abf4cd 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff @@ -1,5 +1,3 @@ -FORBIDDENWORD * - SFX G Y 1 SFX G 0 ing/S . @@ -12,3 +10,47 @@ SFX S 0 s . SFX X Y 2 SFX X 0 s . SFX X 0 x . + +# Flags for f,fa,fb,fc + +SFX A Y 3 +SFX A 0 a . +SFX A 0 b . +SFX A 0 yy . + +SFX B Y 3 +SFX B 0 c . +SFX B 0 b . +SFX B 0 xx . + +SFX a Y 1 +SFX a 0 a . + +SFX b Y 1 +SFX b 0 b . + +SFX c Y 1 +SFX c 0 c . + +# Flags for f,fd,fe,ff with red herring -+* flags that bias the greedy heuristics to prefer the "fd" stem initially + +SFX d Y 1 +SFX d 0 d . + +SFX e Y 1 +SFX e 0 e . + +SFX f Y 1 +SFX f 0 f . + +SFX - Y 2 +SFX - d 0 d +SFX - d e d + +SFX + Y 2 +SFX + d 0 d +SFX + d e d + +SFX * Y 2 +SFX * d 0 d +SFX * d e d From a88b0d22f4d5f85e29e3998a0a4e030b3337d63a Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 27 May 2024 19:55:51 +0200 Subject: [PATCH 2/2] update CHANGES.txt --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ec4c96663655..29c2028eea32 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -309,6 +309,8 @@ Improvements * GITHUB#13276: UnifiedHighlighter: new 'passageSortComparator' option to allow sorting other than offset order. (Seunghan Jung) +* GITHUB#13429: Hunspell: speed up "compress"; minimize the number of the generated entries; don't even consider "forbidden" entries anymore (Peter Gromov) + Optimizations ---------------------