Skip to content

Commit

Permalink
Hunspell suggestions: speed up for some non-Latin scripts (apache#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
donnerpeter authored Mar 15, 2021
1 parent 8913a98 commit cdff0ac
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,12 @@ char transformChar(char c) {
return;
}

int sc =
automaton.ngramScore(rootChars)
- longerWorsePenalty(word.length(), rootChars.length)
+ commonPrefix(word, rootChars);
int sc = automaton.ngramScore(rootChars);
if (sc == 0) {
return; // no common characters at all, don't suggest this root
}

sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);

if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,16 @@ class TrigramAutomaton {
private final CharacterRunAutomaton automaton;
private final int[] state2Score;
private final FixedBitSet countedSubstrings;
private final char minChar;

TrigramAutomaton(String s1) {
Map<String, Integer> substringCounts = new HashMap<>();

Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
int initialState = builder.createState();

minChar = (char) s1.chars().min().orElseThrow();

for (int start = 0; start < s1.length(); start++) {
int limit = Math.min(s1.length(), start + N);
for (int end = start + 1; end <= limit; end++) {
Expand All @@ -49,7 +52,7 @@ class TrigramAutomaton {
int state = initialState;
for (int i = start; i < limit; i++) {
int next = builder.createState();
builder.addTransition(state, next, s1.charAt(i));
builder.addTransition(state, next, s1.charAt(i) - minChar);
state = next;
}
}
Expand All @@ -70,7 +73,7 @@ class TrigramAutomaton {
/**
 * Walks {@code automaton} over the characters of {@code s}, starting from state 0 and advancing
 * with {@code automaton.step}, and returns the state held after the last character.
 *
 * @param s the string whose characters are fed to the automaton
 * @return the resulting automaton state; 0 when {@code s} is empty
 */
private int runAutomatonOnStringChars(String s) {
int state = 0;
for (int i = 0; i < s.length(); i++) {
// NOTE(review): the two statements below look like the pre-change line followed by its
// replacement (this view carries no +/- diff markers) — confirm that only the step using
// the minChar-shifted character belongs in the final code.
state = automaton.step(state, s.charAt(i));
state = automaton.step(state, s.charAt(i) - minChar);
}
return state;
}
Expand All @@ -86,6 +89,11 @@ int ngramScore(CharsRef s2) {
int limit = s2.length + s2.offset;
for (int i = s2.offset; i < limit; i++) {
char c = transformChar(s2.chars[i]);
if (c < minChar) {
state1 = state2 = -1;
continue;
}
c -= minChar;

int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
if (state3 > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,16 @@ public void en_suggest() throws Exception {
checkSuggestionPerformance("en", 3_000);
}

/**
 * Invokes {@code checkAnalysisPerformance} with the code {@code "ru"} and the numeric argument
 * 400,000 (presumably the number of words to analyze — confirm against the helper's signature).
 *
 * @throws Exception propagated from {@code checkAnalysisPerformance}
 */
@Test
public void ru() throws Exception {
checkAnalysisPerformance("ru", 400_000);
}

/**
 * Invokes {@code checkSuggestionPerformance} with the code {@code "ru"} and a word count of 1000.
 *
 * @throws Exception propagated from {@code checkSuggestionPerformance}
 */
@Test
public void ru_suggest() throws Exception {
checkSuggestionPerformance("ru", 1000);
}

@Test
public void de() throws Exception {
checkAnalysisPerformance("de", 300_000);
Expand Down Expand Up @@ -121,6 +131,7 @@ private void checkSuggestionPerformance(String code, int wordCount) throws Excep
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
List<String> words =
loadWords(code, wordCount, dictionary).stream()
.distinct()
.filter(w -> hasQuickSuggestions(speller, w))
.collect(Collectors.toList());
System.out.println("Checking " + words.size() + " misspelled words");
Expand Down Expand Up @@ -181,7 +192,8 @@ private List<String> loadWords(String code, int wordCount, Dictionary dictionary
String line = reader.readLine();
if (line == null) break;

for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
for (String token :
line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) {
String word = stripPunctuation(token);
if (word != null) {
words.add(word);
Expand Down

0 comments on commit cdff0ac

Please sign in to comment.