Skip to content

Commit 374bee9

Browse files
authored
OPENNLP-1655: Add constructors in SentenceDetectorME and TokenizerME to inject custom abbreviation dictionary (#694)
1 parent a238e18 commit 374bee9

File tree

4 files changed

+53
-9
lines changed

4 files changed

+53
-9
lines changed

opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java

+12-2
Original file line number | Diff line number | Diff line change
@@ -99,11 +99,21 @@ public SentenceDetectorME(String language) throws IOException {
9999
* @param model the {@link SentenceModel}
100100
*/
101101
public SentenceDetectorME(SentenceModel model) {
102-
SentenceDetectorFactory sdFactory = model.getFactory();
102+
this(model, model.getAbbreviations());
103+
}
104+
105+
/**
106+
* Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}.
107+
*
108+
* @param model The {@link SentenceModel} to be used.
109+
* @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
110+
*/
111+
public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
103112
this.model = model.getMaxentModel();
113+
this.abbDict = abbDict;
114+
SentenceDetectorFactory sdFactory = model.getFactory();
104115
cgen = sdFactory.getSDContextGenerator();
105116
scanner = sdFactory.getEndOfSentenceScanner();
106-
abbDict = model.getAbbreviations();
107117
useTokenEnd = sdFactory.isUseTokenEnd();
108118
}
109119

opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java

+14-2
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import java.io.IOException;
2121

2222
import opennlp.tools.commons.ThreadSafe;
23+
import opennlp.tools.dictionary.Dictionary;
2324
import opennlp.tools.util.DownloadUtil;
2425
import opennlp.tools.util.Span;
2526

@@ -43,6 +44,7 @@
4344
public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {
4445

4546
private final SentenceModel model;
47+
private final Dictionary abbDict;
4648

4749
private final ThreadLocal<SentenceDetectorME> threadLocal = new ThreadLocal<>();
4850

@@ -63,15 +65,25 @@ public ThreadSafeSentenceDetectorME(String language) throws IOException {
6365
* @param model A valid {@link SentenceModel}.
6466
*/
6567
public ThreadSafeSentenceDetectorME(SentenceModel model) {
66-
super();
68+
this(model, model.getAbbreviations());
69+
}
70+
71+
/**
72+
* Instantiates a {@link ThreadSafeSentenceDetectorME} with an existing {@link SentenceModel}.
73+
*
74+
* @param model The {@link SentenceModel} to be used.
75+
* @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
76+
*/
77+
public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) {
6778
this.model = model;
79+
this.abbDict = abbDict;
6880
}
6981

7082
// If a thread-local version exists, return it. Otherwise, create, then return.
7183
private SentenceDetectorME getSD() {
7284
SentenceDetectorME sd = threadLocal.get();
7385
if (sd == null) {
74-
sd = new SentenceDetectorME(model);
86+
sd = new SentenceDetectorME(model, abbDict);
7587
threadLocal.set(sd);
7688
}
7789
return sd;

opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java

+14-2
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import java.io.IOException;
2121

2222
import opennlp.tools.commons.ThreadSafe;
23+
import opennlp.tools.dictionary.Dictionary;
2324
import opennlp.tools.util.DownloadUtil;
2425
import opennlp.tools.util.Span;
2526

@@ -43,6 +44,7 @@
4344
public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {
4445

4546
private final TokenizerModel model;
47+
private final Dictionary abbDict;
4648

4749
private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>();
4850

@@ -63,14 +65,24 @@ public ThreadSafeTokenizerME(String language) throws IOException {
6365
* @param model A valid {@link TokenizerModel}.
6466
*/
6567
public ThreadSafeTokenizerME(TokenizerModel model) {
66-
super();
68+
this(model, model.getAbbreviations());
69+
}
70+
71+
/**
72+
* Instantiates a {@link ThreadSafeTokenizerME} with an existing {@link TokenizerModel}.
73+
*
74+
* @param model The {@link TokenizerModel} to be used.
75+
* @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
76+
*/
77+
public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) {
6778
this.model = model;
79+
this.abbDict = abbDict;
6880
}
6981

7082
private TokenizerME getTokenizer() {
7183
TokenizerME tokenizer = threadLocal.get();
7284
if (tokenizer == null) {
73-
tokenizer = new TokenizerME(model);
85+
tokenizer = new TokenizerME(model, abbDict);
7486
threadLocal.set(tokenizer);
7587
}
7688
return tokenizer;

opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java

+13-3
Original file line number | Diff line number | Diff line change
@@ -129,13 +129,23 @@ public TokenizerME(String language) throws IOException {
129129
* @param model The {@link TokenizerModel} to be used.
130130
*/
131131
public TokenizerME(TokenizerModel model) {
132+
this(model, model.getAbbreviations());
133+
}
134+
135+
/**
136+
* Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}.
137+
*
138+
* @param model The {@link TokenizerModel} to be used.
139+
* @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
140+
*/
141+
public TokenizerME(TokenizerModel model, Dictionary abbDict) {
142+
this.model = model.getMaxentModel();
143+
this.abbDict = abbDict;
132144
TokenizerFactory factory = model.getFactory();
133-
this.alphanumeric = factory.getAlphaNumericPattern();
134145
this.cg = factory.getContextGenerator();
135-
this.model = model.getMaxentModel();
146+
this.alphanumeric = factory.getAlphaNumericPattern();
136147
this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();
137148

138-
abbDict = model.getAbbreviations();
139149
newTokens = new ArrayList<>();
140150
tokProbs = new ArrayList<>(50);
141151
}

0 commit comments

Comments
 (0)