From cdff0accaa63461f7db893a2fb9893899f763bc0 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 15 Mar 2021 10:02:45 +0100
Subject: [PATCH] Hunspell suggestions: speed up for some non-Latin scripts
 (#19)

---
 .../analysis/hunspell/GeneratingSuggester.java     | 10 ++++++----
 .../lucene/analysis/hunspell/TrigramAutomaton.java | 12 ++++++++++--
 .../lucene/analysis/hunspell/TestPerformance.java  | 14 +++++++++++++-
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index f9bcfd557d07..cc72027a331f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -94,10 +94,12 @@ char transformChar(char c) {
             return;
           }
 
-          int sc =
-              automaton.ngramScore(rootChars)
-                  - longerWorsePenalty(word.length(), rootChars.length)
-                  + commonPrefix(word, rootChars);
+          int sc = automaton.ngramScore(rootChars);
+          if (sc == 0) {
+            return; // no common characters at all, don't suggest this root
+          }
+
+          sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
 
           if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
             return;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
index a83505e6540a..effd59685588 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
@@ -33,6 +33,7 @@ class TrigramAutomaton {
   private final CharacterRunAutomaton automaton;
   private final int[] state2Score;
   private final FixedBitSet countedSubstrings;
+  private final char minChar;
 
   TrigramAutomaton(String s1) {
     Map<String, Integer> substringCounts = new HashMap<>();
@@ -40,6 +41,8 @@ class TrigramAutomaton {
     Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
     int initialState = builder.createState();
 
+    minChar = (char) s1.chars().min().orElseThrow();
+
     for (int start = 0; start < s1.length(); start++) {
       int limit = Math.min(s1.length(), start + N);
       for (int end = start + 1; end <= limit; end++) {
@@ -49,7 +52,7 @@ class TrigramAutomaton {
       int state = initialState;
       for (int i = start; i < limit; i++) {
         int next = builder.createState();
-        builder.addTransition(state, next, s1.charAt(i));
+        builder.addTransition(state, next, s1.charAt(i) - minChar);
         state = next;
       }
     }
@@ -70,7 +73,7 @@ class TrigramAutomaton {
   private int runAutomatonOnStringChars(String s) {
     int state = 0;
     for (int i = 0; i < s.length(); i++) {
-      state = automaton.step(state, s.charAt(i));
+      state = automaton.step(state, s.charAt(i) - minChar); // same offset as in addTransition
     }
     return state;
   }
@@ -86,6 +89,11 @@ int ngramScore(CharsRef s2) {
     int limit = s2.length + s2.offset;
     for (int i = s2.offset; i < limit; i++) {
       char c = transformChar(s2.chars[i]);
+      if (c < minChar) {
+        state1 = state2 = -1;
+        continue;
+      }
+      c -= minChar;
 
       int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
       if (state3 > 0) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index bc69f6c5b812..ffe3ae9de311 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -65,6 +65,16 @@ public void en_suggest() throws Exception {
     checkSuggestionPerformance("en", 3_000);
   }
 
+  @Test
+  public void ru() throws Exception {
+    checkAnalysisPerformance("ru", 400_000); // NOTE(review): presumably Russian; confirm
+  }
+
+  @Test
+  public void ru_suggest() throws Exception {
+    checkSuggestionPerformance("ru", 1000); // second argument is wordCount
+  }
+
   @Test
   public void de() throws Exception {
     checkAnalysisPerformance("de", 300_000);
@@ -121,6 +131,7 @@ private void checkSuggestionPerformance(String code, int wordCount) throws Excep
     Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
     List<String> words =
         loadWords(code, wordCount, dictionary).stream()
+            .distinct()
             .filter(w -> hasQuickSuggestions(speller, w))
             .collect(Collectors.toList());
     System.out.println("Checking " + words.size() + " misspelled words");
@@ -181,7 +192,8 @@ private List<String> loadWords(String code, int wordCount, Dictionary dictionary
         String line = reader.readLine();
         if (line == null) break;
 
-        for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
+        for (String token :
+            line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) {
           String word = stripPunctuation(token);
           if (word != null) {
             words.add(word);