20
20
import java .io .ByteArrayInputStream ;
21
21
import java .io .ByteArrayOutputStream ;
22
22
import java .io .IOException ;
23
- import java .io .InputStream ;
24
23
import java .nio .charset .StandardCharsets ;
24
+ import java .util .Locale ;
25
25
import java .util .regex .Pattern ;
26
26
27
27
import org .junit .jupiter .api .Assertions ;
@@ -54,24 +54,28 @@ private static TokenizerModel train(TokenizerFactory factory)
54
54
return TokenizerME .train (createSampleStream (), factory , TrainingParameters .defaultParams ());
55
55
}
56
56
57
- private static Dictionary loadAbbDictionary () throws IOException {
58
- InputStream in = TokenizerFactoryTest .class .getClassLoader ()
59
- .getResourceAsStream ("opennlp/tools/sentdetect/abb.xml" );
60
-
61
- return new Dictionary (in );
57
+ private static Dictionary loadAbbDictionary (Locale loc ) throws IOException {
58
+ final String abbrevDict ;
59
+ if (loc .equals (Locale .GERMAN )) {
60
+ abbrevDict = "opennlp/tools/sentdetect/abb_DE.xml" ;
61
+ } else {
62
+ abbrevDict = "opennlp/tools/sentdetect/abb.xml" ;
63
+ }
64
+ return new Dictionary (TokenizerFactoryTest .class .getClassLoader ()
65
+ .getResourceAsStream (abbrevDict ));
62
66
}
63
67
64
68
@ Test
65
69
void testDefault () throws IOException {
66
70
67
- Dictionary dic = loadAbbDictionary ();
71
+ Dictionary dic = loadAbbDictionary (Locale . ENGLISH );
68
72
final String lang = "eng" ;
69
73
70
74
TokenizerModel model = train (new TokenizerFactory (lang , dic , false , null ));
71
75
72
76
TokenizerFactory factory = model .getFactory ();
73
77
Assertions .assertNotNull (factory .getAbbreviationDictionary ());
74
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DefaultTokenContextGenerator );
78
+ Assertions .assertInstanceOf ( DefaultTokenContextGenerator . class , factory .getContextGenerator ());
75
79
76
80
String defaultPattern = Factory .DEFAULT_ALPHANUMERIC .pattern ();
77
81
Assertions .assertEquals (defaultPattern , factory .getAlphaNumericPattern ().pattern ());
@@ -87,7 +91,7 @@ void testDefault() throws IOException {
87
91
88
92
factory = fromSerialized .getFactory ();
89
93
Assertions .assertNotNull (factory .getAbbreviationDictionary ());
90
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DefaultTokenContextGenerator );
94
+ Assertions .assertInstanceOf ( DefaultTokenContextGenerator . class , factory .getContextGenerator ());
91
95
92
96
Assertions .assertEquals (defaultPattern , factory .getAlphaNumericPattern ().pattern ());
93
97
Assertions .assertEquals (lang , factory .getLanguageCode ());
@@ -105,7 +109,7 @@ void testNullDict() throws IOException {
105
109
106
110
TokenizerFactory factory = model .getFactory ();
107
111
Assertions .assertNull (factory .getAbbreviationDictionary ());
108
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DefaultTokenContextGenerator );
112
+ Assertions .assertInstanceOf ( DefaultTokenContextGenerator . class , factory .getContextGenerator ());
109
113
110
114
String defaultPattern = Factory .DEFAULT_ALPHANUMERIC .pattern ();
111
115
Assertions .assertEquals (defaultPattern , factory .getAlphaNumericPattern ().pattern ());
@@ -121,7 +125,7 @@ void testNullDict() throws IOException {
121
125
122
126
factory = fromSerialized .getFactory ();
123
127
Assertions .assertNull (factory .getAbbreviationDictionary ());
124
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DefaultTokenContextGenerator );
128
+ Assertions .assertInstanceOf ( DefaultTokenContextGenerator . class , factory .getContextGenerator ());
125
129
126
130
Assertions .assertEquals (defaultPattern , factory .getAlphaNumericPattern ().pattern ());
127
131
Assertions .assertEquals (lang , factory .getLanguageCode ());
@@ -141,7 +145,7 @@ void testCustomPatternAndAlphaOpt() throws IOException {
141
145
142
146
TokenizerFactory factory = model .getFactory ();
143
147
Assertions .assertNull (factory .getAbbreviationDictionary ());
144
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DefaultTokenContextGenerator );
148
+ Assertions .assertInstanceOf ( DefaultTokenContextGenerator . class , factory .getContextGenerator ());
145
149
146
150
Assertions .assertEquals (pattern , factory .getAlphaNumericPattern ().pattern ());
147
151
Assertions .assertEquals (lang , factory .getLanguageCode ());
@@ -156,7 +160,7 @@ void testCustomPatternAndAlphaOpt() throws IOException {
156
160
157
161
factory = fromSerialized .getFactory ();
158
162
Assertions .assertNull (factory .getAbbreviationDictionary ());
159
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DefaultTokenContextGenerator );
163
+ Assertions .assertInstanceOf ( DefaultTokenContextGenerator . class , factory .getContextGenerator ());
160
164
Assertions .assertEquals (pattern , factory .getAlphaNumericPattern ().pattern ());
161
165
Assertions .assertEquals (lang , factory .getLanguageCode ());
162
166
Assertions .assertEquals (lang , model .getLanguage ());
@@ -165,18 +169,24 @@ void testCustomPatternAndAlphaOpt() throws IOException {
165
169
166
170
void checkCustomPatternForTokenizerME (String lang , String pattern , String sentence ,
167
171
int expectedNumTokens ) throws IOException {
168
-
169
- TokenizerModel model = train (new TokenizerFactory (lang , null , true ,
172
+ Locale loc = Locale .ENGLISH ;
173
+ if ("deu" .equals (lang )) {
174
+ loc = Locale .GERMAN ;
175
+ }
176
+ TokenizerModel model = train (new TokenizerFactory (lang , loadAbbDictionary (loc ), true ,
170
177
Pattern .compile (pattern )));
171
178
172
179
TokenizerME tokenizer = new TokenizerME (model );
173
180
String [] tokens = tokenizer .tokenize (sentence );
174
181
175
182
Assertions .assertEquals (expectedNumTokens , tokens .length );
176
- String [] sentSplit = sentence .replaceAll ("\\ ." , " ." )
177
- .replaceAll ("'" , " '" ).split (" " );
183
+ String [] sentSplit = sentence .replaceAll ("'" , " '" ).split (" " );
178
184
for (int i = 0 ; i < sentSplit .length ; i ++) {
179
- Assertions .assertEquals (sentSplit [i ], tokens [i ]);
185
+ String sElement = sentSplit [i ];
186
+ if (i == sentSplit .length - 1 ) {
187
+ sElement = sElement .replace ("." , "" ); // compensate for sentence ending
188
+ }
189
+ Assertions .assertEquals (sElement , tokens [i ]);
180
190
}
181
191
}
182
192
@@ -185,7 +195,7 @@ void testCustomPatternForTokenizerMEDeu() throws IOException {
185
195
String lang = "deu" ;
186
196
String pattern = "^[A-Za-z0-9äéöüÄÉÖÜß]+$" ;
187
197
String sentence = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie." ;
188
- checkCustomPatternForTokenizerME (lang , pattern , sentence , 16 );
198
+ checkCustomPatternForTokenizerME (lang , pattern , sentence , 14 );
189
199
}
190
200
191
201
@ Test
@@ -267,16 +277,16 @@ void testContractionsEng() throws IOException {
267
277
@ Test
268
278
void testDummyFactory () throws IOException {
269
279
270
- Dictionary dic = loadAbbDictionary ();
280
+ Dictionary dic = loadAbbDictionary (Locale . ENGLISH );
271
281
final String lang = "eng" ;
272
282
String pattern = "^[0-9A-Za-z]+$" ;
273
283
274
284
TokenizerModel model = train (new DummyTokenizerFactory (lang , dic , true ,
275
285
Pattern .compile (pattern )));
276
286
277
287
TokenizerFactory factory = model .getFactory ();
278
- Assertions .assertTrue ( factory .getAbbreviationDictionary () instanceof DummyDictionary );
279
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DummyContextGenerator );
288
+ Assertions .assertInstanceOf ( DummyDictionary . class , factory .getAbbreviationDictionary ());
289
+ Assertions .assertInstanceOf ( DummyContextGenerator . class , factory .getContextGenerator ());
280
290
Assertions .assertEquals (pattern , factory .getAlphaNumericPattern ().pattern ());
281
291
Assertions .assertEquals (lang , factory .getLanguageCode ());
282
292
Assertions .assertEquals (lang , model .getLanguage ());
@@ -289,8 +299,8 @@ void testDummyFactory() throws IOException {
289
299
TokenizerModel fromSerialized = new TokenizerModel (in );
290
300
291
301
factory = fromSerialized .getFactory ();
292
- Assertions .assertTrue ( factory .getAbbreviationDictionary () instanceof DummyDictionary );
293
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DummyContextGenerator );
302
+ Assertions .assertInstanceOf ( DummyDictionary . class , factory .getAbbreviationDictionary ());
303
+ Assertions .assertInstanceOf ( DummyContextGenerator . class , factory .getContextGenerator ());
294
304
Assertions .assertEquals (pattern , factory .getAlphaNumericPattern ().pattern ());
295
305
Assertions .assertEquals (lang , factory .getLanguageCode ());
296
306
Assertions .assertEquals (lang , model .getLanguage ());
@@ -299,16 +309,16 @@ void testDummyFactory() throws IOException {
299
309
300
310
@ Test
301
311
void testCreateDummyFactory () throws IOException {
302
- Dictionary dic = loadAbbDictionary ();
312
+ Dictionary dic = loadAbbDictionary (Locale . ENGLISH );
303
313
final String lang = "eng" ;
304
314
String pattern = "^[0-9A-Za-z]+$" ;
305
315
306
316
TokenizerFactory factory = TokenizerFactory .create (
307
317
DummyTokenizerFactory .class .getCanonicalName (), lang , dic , true ,
308
318
Pattern .compile (pattern ));
309
319
310
- Assertions .assertTrue ( factory .getAbbreviationDictionary () instanceof DummyDictionary );
311
- Assertions .assertTrue ( factory .getContextGenerator () instanceof DummyContextGenerator );
320
+ Assertions .assertInstanceOf ( DummyDictionary . class , factory .getAbbreviationDictionary ());
321
+ Assertions .assertInstanceOf ( DummyContextGenerator . class , factory .getContextGenerator ());
312
322
Assertions .assertEquals (pattern , factory .getAlphaNumericPattern ().pattern ());
313
323
Assertions .assertEquals (lang , factory .getLanguageCode ());
314
324
Assertions .assertTrue (factory .isUseAlphaNumericOptimization ());
0 commit comments