From cdff0accaa63461f7db893a2fb9893899f763bc0 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 15 Mar 2021 10:02:45 +0100 Subject: [PATCH] Hunspell suggestions: speed up for some non-Latin scripts (#19) --- .../analysis/hunspell/GeneratingSuggester.java | 10 ++++++---- .../lucene/analysis/hunspell/TrigramAutomaton.java | 12 ++++++++++-- .../lucene/analysis/hunspell/TestPerformance.java | 14 +++++++++++++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index f9bcfd557d07..cc72027a331f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -94,10 +94,12 @@ char transformChar(char c) { return; } - int sc = - automaton.ngramScore(rootChars) - - longerWorsePenalty(word.length(), rootChars.length) - + commonPrefix(word, rootChars); + int sc = automaton.ngramScore(rootChars); + if (sc == 0) { + return; // no common characters at all, don't suggest this root + } + + sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length); if (roots.size() == MAX_ROOTS && sc < roots.peek().score) { return; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java index a83505e6540a..effd59685588 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java @@ -33,6 +33,7 @@ class TrigramAutomaton { private final CharacterRunAutomaton automaton; private final int[] state2Score; private final FixedBitSet countedSubstrings; + private final char minChar; TrigramAutomaton(String s1) { Map substringCounts = new HashMap<>(); @@ -40,6 +41,8 @@ class TrigramAutomaton { Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N); int initialState = builder.createState(); + minChar = (char) s1.chars().min().orElseThrow(); + for (int start = 0; start < s1.length(); start++) { int limit = Math.min(s1.length(), start + N); for (int end = start + 1; end <= limit; end++) { @@ -49,7 +52,7 @@ class TrigramAutomaton { int state = initialState; for (int i = start; i < limit; i++) { int next = builder.createState(); - builder.addTransition(state, next, s1.charAt(i)); + builder.addTransition(state, next, s1.charAt(i) - minChar); state = next; } } @@ -70,7 +73,7 @@ class TrigramAutomaton { private int runAutomatonOnStringChars(String s) { int state = 0; for (int i = 0; i < s.length(); i++) { - state = automaton.step(state, s.charAt(i)); + state = automaton.step(state, s.charAt(i) - minChar); } return state; } @@ -86,6 +89,11 @@ int ngramScore(CharsRef s2) { int limit = s2.length + s2.offset; for (int i = s2.offset; i < limit; i++) { char c = transformChar(s2.chars[i]); + if (c < minChar) { + state1 = state2 = -1; + continue; + } + c -= minChar; int state3 = state2 <= 0 ? 0 : automaton.step(state2, c); if (state3 > 0) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index bc69f6c5b812..ffe3ae9de311 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -65,6 +65,16 @@ public void en_suggest() throws Exception { checkSuggestionPerformance("en", 3_000); } + @Test + public void ru() throws Exception { + checkAnalysisPerformance("ru", 400_000); + } + + @Test + public void ru_suggest() throws Exception { + checkSuggestionPerformance("ru", 1000); + } + @Test public void de() throws Exception { checkAnalysisPerformance("de", 300_000); @@ -121,6 +131,7 @@ private void checkSuggestionPerformance(String code, int wordCount) throws Excep Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {}); List words = loadWords(code, wordCount, dictionary).stream() + .distinct() .filter(w -> hasQuickSuggestions(speller, w)) .collect(Collectors.toList()); System.out.println("Checking " + words.size() + " misspelled words"); @@ -181,7 +192,8 @@ private List loadWords(String code, int wordCount, Dictionary dictionary String line = reader.readLine(); if (line == null) break; - for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) { + for (String token : + line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) { String word = stripPunctuation(token); if (word != null) { words.add(word);