OPENNLP-1655: Add constructors in SentenceDetectorME and TokenizerME to inject custom abbreviation dictionary (#694)

mawiesne · web-flow · commit 374bee9324df · 2024-11-25T10:30:04.000+01:00
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -99,11 +99,21 @@ public SentenceDetectorME(String language) throws IOException {
    * @param model the {@link SentenceModel}
    */
   public SentenceDetectorME(SentenceModel model) {
-    SentenceDetectorFactory sdFactory = model.getFactory();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}.
+   *
+   * @param model The {@link SentenceModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
     this.model = model.getMaxentModel();
+    this.abbDict = abbDict;
+    SentenceDetectorFactory sdFactory = model.getFactory();
     cgen = sdFactory.getSDContextGenerator();
     scanner = sdFactory.getEndOfSentenceScanner();
-    abbDict = model.getAbbreviations();
     useTokenEnd = sdFactory.isUseTokenEnd();
   }
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
@@ -20,6 +20,7 @@
 import java.io.IOException;
 
 import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.DownloadUtil;
 import opennlp.tools.util.Span;
 
@@ -43,6 +44,7 @@
 public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {
 
   private final SentenceModel model;
+  private final Dictionary abbDict;
 
   private final ThreadLocal<SentenceDetectorME> threadLocal = new ThreadLocal<>();
 
@@ -63,15 +65,25 @@ public ThreadSafeSentenceDetectorME(String language) throws IOException {
    * @param model A valid {@link SentenceModel}.
    */
   public ThreadSafeSentenceDetectorME(SentenceModel model) {
-    super();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link ThreadSafeSentenceDetectorME} with an existing {@link SentenceModel}.
+   *
+   * @param model The {@link SentenceModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) {
     this.model = model;
+    this.abbDict = abbDict;
   }
 
   // If a thread-local version exists, return it. Otherwise, create, then return.
   private SentenceDetectorME getSD() {
     SentenceDetectorME sd = threadLocal.get();
     if (sd == null) {
-      sd = new SentenceDetectorME(model);
+      sd = new SentenceDetectorME(model, abbDict);
       threadLocal.set(sd);
     }
     return sd;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
@@ -20,6 +20,7 @@
 import java.io.IOException;
 
 import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.DownloadUtil;
 import opennlp.tools.util.Span;
 
@@ -43,6 +44,7 @@
 public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {
 
   private final TokenizerModel model;
+  private final Dictionary abbDict;
 
   private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>();
 
@@ -63,14 +65,24 @@ public ThreadSafeTokenizerME(String language) throws IOException {
    * @param model A valid {@link TokenizerModel}.
    */
   public ThreadSafeTokenizerME(TokenizerModel model) {
-    super();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link ThreadSafeTokenizerME} with an existing {@link TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) {
     this.model = model;
+    this.abbDict = abbDict;
   }
 
   private TokenizerME getTokenizer() {
     TokenizerME tokenizer = threadLocal.get();
     if (tokenizer == null) {
-      tokenizer = new TokenizerME(model);
+      tokenizer = new TokenizerME(model, abbDict);
       threadLocal.set(tokenizer);
     }
     return tokenizer;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -129,13 +129,23 @@ public TokenizerME(String language) throws IOException {
    * @param model The {@link TokenizerModel} to be used.
    */
   public TokenizerME(TokenizerModel model) {
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public TokenizerME(TokenizerModel model, Dictionary abbDict) {
+    this.model = model.getMaxentModel();
+    this.abbDict = abbDict;
     TokenizerFactory factory = model.getFactory();
-    this.alphanumeric = factory.getAlphaNumericPattern();
     this.cg = factory.getContextGenerator();
-    this.model = model.getMaxentModel();
+    this.alphanumeric = factory.getAlphaNumericPattern();
     this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();
 
-    abbDict = model.getAbbreviations();
     newTokens = new ArrayList<>();
     tokProbs = new ArrayList<>(50);
   }