Skip to content

Commit

Permalink
Fix duplicate values in org.apache.lucene.analysis.ko.dict.UserDictionary (#13427)
Browse files Browse the repository at this point in the history

Remove the incorrect assertion in org.apache.lucene.analysis.ko.dict.UserDictionary and replace it with an array copy that trims rightIds to the number of entries actually added when duplicate values are passed.
  • Loading branch information
ChrisHegarty committed May 27, 2024
1 parent 8d7bf86 commit 4be6531
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
Expand Down Expand Up @@ -138,10 +139,12 @@ private UserDictionary(List<String> entries) throws IOException {
lastToken = token;
ord++;
}
if (entryIndex < rightIds.length) {
rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex);
}
this.fst =
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
assert entryIndex == rightIds.length;
this.rightIds = rightIds;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
Expand Down Expand Up @@ -593,6 +594,22 @@ public void testCombining() throws IOException {
new POS.Tag[] {POS.Tag.SL});
}

/**
 * Checks that a user dictionary whose rules contain repeated entries can still be opened, that the
 * right id at index 3 is non-zero, and that asking for index 4 is out of bounds.
 */
public void testDuplicate() throws IOException {
  // The first rule set repeats one entry; the second also repeats the last entry.
  String[] ruleSets = {
    "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시",
    "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시\n세종시 세종 시"
  };

  for (String rules : ruleSets) {
    try (Reader reader = new StringReader(rules)) {
      var userDict = UserDictionary.open(reader);
      assertTrue(userDict.getRightId(3) != 0);
      assertThrows(ArrayIndexOutOfBoundsException.class, () -> userDict.getRightId(4));
    }
  }
}

private void assertReadings(Analyzer analyzer, String input, String... readings)
throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
Expand Down

0 comments on commit 4be6531

Please sign in to comment.