Skip to content

Commit

Permalink
OPENNLP-1655 Add constructors in SentenceDetectorME and TokenizerME to customize Abbreviation Dict at runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
mawiesne committed Nov 24, 2024
1 parent ec09b7e commit 58cdb42
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,21 @@ public SentenceDetectorME(String language) throws IOException {
* @param model the {@link SentenceModel}
*/
public SentenceDetectorME(SentenceModel model) {
  // Delegate to the two-argument constructor, using the abbreviation
  // dictionary returned by the model itself.
  this(model, model.getAbbreviations());
}

/**
 * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}
 * and an explicitly supplied abbreviation {@link Dictionary}.
 *
 * @param model The {@link SentenceModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
  this.model = model.getMaxentModel();
  // Keep the caller-supplied dictionary; it is not re-read from the model here.
  this.abbDict = abbDict;
  SentenceDetectorFactory sdFactory = model.getFactory();
  cgen = sdFactory.getSDContextGenerator();
  scanner = sdFactory.getEndOfSentenceScanner();
  useTokenEnd = sdFactory.isUseTokenEnd();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package opennlp.tools.sentdetect;

import opennlp.tools.commons.ThreadSafe;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.Span;

/**
Expand All @@ -37,20 +38,31 @@
public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {

private final SentenceModel model;
private final Dictionary abbDict;

private final ThreadLocal<SentenceDetectorME> threadLocal =
new ThreadLocal<>();

/**
 * Instantiates a {@link ThreadSafeSentenceDetectorME} with an existing {@link SentenceModel},
 * using the abbreviation dictionary returned by {@code model.getAbbreviations()}.
 *
 * @param model The {@link SentenceModel} to be used.
 */
public ThreadSafeSentenceDetectorME(SentenceModel model) {
  // Only one explicit constructor invocation is allowed, so delegate directly.
  this(model, model.getAbbreviations());
}

/**
 * Creates a {@link ThreadSafeSentenceDetectorME} from the given {@link SentenceModel}
 * and abbreviation {@link Dictionary}. Both references are stored as-is and handed
 * to each {@link SentenceDetectorME} this wrapper creates.
 *
 * @param model The {@link SentenceModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) {
  // Plain field capture; no detector is built at construction time.
  this.abbDict = abbDict;
  this.model = model;
}

// If a thread-local version exists, return it. Otherwise, create, then return.
private SentenceDetectorME getSD() {
SentenceDetectorME sd = threadLocal.get();
if (sd == null) {
sd = new SentenceDetectorME(model);
sd = new SentenceDetectorME(model, abbDict);
threadLocal.set(sd);
}
return sd;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package opennlp.tools.tokenize;

import opennlp.tools.commons.ThreadSafe;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.Span;

/**
Expand All @@ -37,18 +38,29 @@
public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {

private final TokenizerModel model;

private final Dictionary abbDict;

private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>();

/**
 * Instantiates a {@link ThreadSafeTokenizerME} with an existing {@link TokenizerModel},
 * using the abbreviation dictionary returned by {@code model.getAbbreviations()}.
 *
 * @param model The {@link TokenizerModel} to be used.
 */
public ThreadSafeTokenizerME(TokenizerModel model) {
  // Only one explicit constructor invocation is allowed, so delegate directly.
  this(model, model.getAbbreviations());
}

/**
 * Creates a {@link ThreadSafeTokenizerME} from the given {@link TokenizerModel}
 * and abbreviation {@link Dictionary}. Both references are stored as-is and handed
 * to each {@link TokenizerME} this wrapper creates.
 *
 * @param model The {@link TokenizerModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) {
  // Plain field capture; no tokenizer is built at construction time.
  this.abbDict = abbDict;
  this.model = model;
}

private TokenizerME getTokenizer() {
TokenizerME tokenizer = threadLocal.get();
if (tokenizer == null) {
tokenizer = new TokenizerME(model);
tokenizer = new TokenizerME(model, abbDict);
threadLocal.set(tokenizer);
}
return tokenizer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,23 @@ public TokenizerME(String language) throws IOException {
* @param model The {@link TokenizerModel} to be used.
*/
public TokenizerME(TokenizerModel model) {
// Delegate to the two-argument constructor, passing the model's own abbreviation dictionary.
this(model, model.getAbbreviations());
}

/**
 * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}
 * and an explicitly supplied abbreviation {@link Dictionary}.
 *
 * @param model The {@link TokenizerModel} to be used.
 * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
 */
public TokenizerME(TokenizerModel model, Dictionary abbDict) {
  this.model = model.getMaxentModel();
  // Keep the caller-supplied dictionary; it is not re-read from the model here.
  this.abbDict = abbDict;

  // Remaining configuration comes from the model's factory.
  TokenizerFactory factory = model.getFactory();
  this.cg = factory.getContextGenerator();
  this.alphanumeric = factory.getAlphaNumericPattern();
  this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();

  newTokens = new ArrayList<>();
  tokProbs = new ArrayList<>(50);
}
Expand Down

0 comments on commit 58cdb42

Please sign in to comment.