Hunspell suggestions: speed up for some non-Latin scripts (apache#19)

msokolov · Mar 15, 2021 · cdff0ac
1 parent 8913a98
commit cdff0ac
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 7 deletions.
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -94,10 +94,12 @@ char transformChar(char c) {
  return;
  }
 
- int sc =
- automaton.ngramScore(rootChars)
- - longerWorsePenalty(word.length(), rootChars.length)
- + commonPrefix(word, rootChars);
+ int sc = automaton.ngramScore(rootChars);
+ if (sc == 0) {
+ return; // no common characters at all, don't suggest this root
+ }
+
+ sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
 
  if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
  return;

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
@@ -33,13 +33,16 @@ class TrigramAutomaton {
  private final CharacterRunAutomaton automaton;
  private final int[] state2Score;
  private final FixedBitSet countedSubstrings;
+ private final char minChar;
 
  TrigramAutomaton(String s1) {
  Map<String, Integer> substringCounts = new HashMap<>();
 
  Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
  int initialState = builder.createState();
 
+ minChar = (char) s1.chars().min().orElseThrow();
+
  for (int start = 0; start < s1.length(); start++) {
  int limit = Math.min(s1.length(), start + N);
  for (int end = start + 1; end <= limit; end++) {
@@ -49,7 +52,7 @@ class TrigramAutomaton {
  int state = initialState;
  for (int i = start; i < limit; i++) {
  int next = builder.createState();
- builder.addTransition(state, next, s1.charAt(i));
+ builder.addTransition(state, next, s1.charAt(i) - minChar);
  state = next;
  }
  }
@@ -70,7 +73,7 @@ class TrigramAutomaton {
  private int runAutomatonOnStringChars(String s) {
  int state = 0;
  for (int i = 0; i < s.length(); i++) {
- state = automaton.step(state, s.charAt(i));
+ state = automaton.step(state, s.charAt(i) - minChar);
  }
  return state;
  }
@@ -86,6 +89,11 @@ int ngramScore(CharsRef s2) {
  int limit = s2.length + s2.offset;
  for (int i = s2.offset; i < limit; i++) {
  char c = transformChar(s2.chars[i]);
+ if (c < minChar) {
+ state1 = state2 = -1;
+ continue;
+ }
+ c -= minChar;
 
  int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
  if (state3 > 0) {

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -65,6 +65,16 @@ public void en_suggest() throws Exception {
  checkSuggestionPerformance("en", 3_000);
  }
 
+ @Test
+ public void ru() throws Exception {
+ checkAnalysisPerformance("ru", 400_000);
+ }
+
+ @Test
+ public void ru_suggest() throws Exception {
+ checkSuggestionPerformance("ru", 1000);
+ }
+
  @Test
  public void de() throws Exception {
  checkAnalysisPerformance("de", 300_000);
@@ -121,6 +131,7 @@ private void checkSuggestionPerformance(String code, int wordCount) throws Excep
  Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
  List<String> words =
  loadWords(code, wordCount, dictionary).stream()
+ .distinct()
  .filter(w -> hasQuickSuggestions(speller, w))
  .collect(Collectors.toList());
  System.out.println("Checking " + words.size() + " misspelled words");
@@ -181,7 +192,8 @@ private List<String> loadWords(String code, int wordCount, Dictionary dictionary
  String line = reader.readLine();
  if (line == null) break;
 
- for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
+ for (String token :
+ line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) {
  String word = stripPunctuation(token);
  if (word != null) {
  words.add(word);