OPENNLP-1655 Add constructors in SentenceDetectorME and TokenizerME to customize Abbreviation Dict at runtime
apache · Nov 24, 2024 · 58cdb42 · 58cdb42
1 parent ec09b7e
commit 58cdb42
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 10 deletions.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -99,11 +99,21 @@ public SentenceDetectorME(String language) throws IOException {
    * @param model the {@link SentenceModel}
    */
   public SentenceDetectorME(SentenceModel model) {
-    SentenceDetectorFactory sdFactory = model.getFactory();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}.
+   *
+   * @param model The {@link SentenceModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
     this.model = model.getMaxentModel();
+    this.abbDict = abbDict;
+    SentenceDetectorFactory sdFactory = model.getFactory();
     cgen = sdFactory.getSDContextGenerator();
     scanner = sdFactory.getEndOfSentenceScanner();
-    abbDict = model.getAbbreviations();
     useTokenEnd = sdFactory.isUseTokenEnd();
   }
 

diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
@@ -18,6 +18,7 @@
 package opennlp.tools.sentdetect;
 
 import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.Span;
 
 /**
@@ -37,20 +38,31 @@
 public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {
 
   private final SentenceModel model;
+  private final Dictionary abbDict;
 
   private final ThreadLocal<SentenceDetectorME> threadLocal =
       new ThreadLocal<>();
 
   public ThreadSafeSentenceDetectorME(SentenceModel model) {
-    super();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link ThreadSafeSentenceDetectorME} with an existing {@link SentenceModel}.
+   *
+   * @param model The {@link SentenceModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) {
     this.model = model;
+    this.abbDict = abbDict;
   }
 
   // If a thread-local version exists, return it. Otherwise, create, then return.
   private SentenceDetectorME getSD() {
     SentenceDetectorME sd = threadLocal.get();
     if (sd == null) {
-      sd = new SentenceDetectorME(model);
+      sd = new SentenceDetectorME(model, abbDict);
       threadLocal.set(sd);
     }
     return sd;

diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
@@ -18,6 +18,7 @@
 package opennlp.tools.tokenize;
 
 import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.Span;
 
 /**
@@ -37,18 +38,29 @@
 public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {
 
   private final TokenizerModel model;
-
+  private final Dictionary abbDict;
+
   private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>();
 
   public ThreadSafeTokenizerME(TokenizerModel model) {
-    super();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link ThreadSafeTokenizerME} with an existing {@link TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) {
     this.model = model;
+    this.abbDict = abbDict;
   }
 
   private TokenizerME getTokenizer() {
     TokenizerME tokenizer = threadLocal.get();
     if (tokenizer == null) {
-      tokenizer = new TokenizerME(model);
+      tokenizer = new TokenizerME(model, abbDict);
       threadLocal.set(tokenizer);
     }
     return tokenizer;

diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -129,13 +129,23 @@ public TokenizerME(String language) throws IOException {
    * @param model The {@link TokenizerModel} to be used.
    */
   public TokenizerME(TokenizerModel model) {
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public TokenizerME(TokenizerModel model, Dictionary abbDict) {
+    this.model = model.getMaxentModel();
+    this.abbDict = abbDict;
     TokenizerFactory factory = model.getFactory();
-    this.alphanumeric = factory.getAlphaNumericPattern();
     this.cg = factory.getContextGenerator();
-    this.model = model.getMaxentModel();
+    this.alphanumeric = factory.getAlphaNumericPattern();
     this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();
 
-    abbDict = model.getAbbreviations();
     newTokens = new ArrayList<>();
     tokProbs = new ArrayList<>(50);
   }