Skip to content

Commit 530f3c7

Browse files
committed
OPENNLP-1369 - Fixes NPE when serializing TokenNameFinder model trained with 1.5 Pos Models
Adds a test-case to reproduce the NPE reported in OPENNLP-1369
1 parent f6adbff commit 530f3c7

File tree

6 files changed

+148
-16
lines changed

6 files changed

+148
-16
lines changed

opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java

+9-3
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,12 @@ protected void validateArtifactMap() throws InvalidFormatException {
171171
}
172172
}
173173

174+
@Override
175+
protected boolean skipEntryForSerialization(Map.Entry<String, Object> entry) {
176+
// An old model format was detected, skipping the process for this entry, see: OPENNLP-1369
177+
return GENERATOR_DESCRIPTOR_ENTRY_NAME.equals(entry.getKey()) && entry.getValue() == null;
178+
}
179+
174180
/**
175181
* @deprecated use {@link POSModel#getPosSequenceModel} instead. This method will be removed soon.
176182
* Only required for Parser 1.5.x backward compatibility. Newer models don't need this anymore.
@@ -232,7 +238,7 @@ public Class<POSModelSerializer> getArtifactSerializerClass() {
232238
@Override
233239
public int hashCode() {
234240
return Objects.hash(artifactMap.get("manifest.properties"), artifactMap.get("pos.model"),
235-
Arrays.hashCode((byte[]) artifactMap.get("generator.featuregen"))
241+
Arrays.hashCode((byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME))
236242
);
237243
}
238244

@@ -248,8 +254,8 @@ public boolean equals(Object obj) {
248254

249255
return artifactMap.get("manifest.properties").equals(artifactMapToCheck.get("manifest.properties")) &&
250256
artifactMap.get("pos.model").equals(abstractModel) &&
251-
Arrays.equals((byte[]) artifactMap.get("generator.featuregen"),
252-
(byte[]) artifactMapToCheck.get("generator.featuregen"));
257+
Arrays.equals((byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME),
258+
(byte[]) artifactMapToCheck.get(GENERATOR_DESCRIPTOR_ENTRY_NAME));
253259
}
254260
return false;
255261
}

opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java

+10-3
Original file line numberDiff line numberDiff line change
@@ -596,9 +596,8 @@ public final void serialize(OutputStream out) throws IOException {
596596
zip.putNextEntry(new ZipEntry(name));
597597

598598
Object artifact = entry.getValue();
599-
// TODO Discuss if this is the correct location to have this workaround in place
600-
if ("generator.featuregen".equals(name) && artifact == null) {
601-
// An old model format was detected, skipping the process for this entry, see: OPENNLP-1369
599+
600+
if (skipEntryForSerialization(entry)) {
602601
continue;
603602
}
604603

@@ -689,4 +688,12 @@ private void readObject(final ObjectInputStream in) throws IOException {
689688

690689
this.loadModel(in);
691690
}
691+
692+
/**
693+
* @param entry the entry to check
694+
* @return {@code true}, if the given entry should be skipped for serialization.
695+
*/
696+
protected boolean skipEntryForSerialization(Entry<String, Object> entry) {
697+
return false;
698+
}
692699
}

opennlp-tools/src/test/java/opennlp/tools/EnabledWhenCDNAvailable.java

-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
*/
3737
@Retention(RetentionPolicy.RUNTIME)
3838
@ExtendWith(EnabledWhenCDNAvailable.CDNAvailableCondition.class)
39-
@ParameterizedTest
4039
public @interface EnabledWhenCDNAvailable {
4140

4241
String hostname();

opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderModelTest.java

+85-7
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,21 @@
2323
import java.io.IOException;
2424
import java.io.InputStream;
2525
import java.io.InputStreamReader;
26+
import java.net.URISyntaxException;
27+
import java.net.URL;
2628
import java.nio.charset.StandardCharsets;
29+
import java.nio.file.CopyOption;
2730
import java.nio.file.Files;
2831
import java.nio.file.Path;
32+
import java.nio.file.StandardCopyOption;
2933
import java.util.Map;
3034
import java.util.stream.Collectors;
3135

3236
import org.junit.jupiter.api.Assertions;
3337
import org.junit.jupiter.api.Test;
3438

39+
import opennlp.tools.EnabledWhenCDNAvailable;
40+
import opennlp.tools.cmdline.AbstractModelLoaderTest;
3541
import opennlp.tools.cmdline.TerminateToolException;
3642
import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
3743
import opennlp.tools.postag.POSModel;
@@ -43,25 +49,21 @@
4349
import opennlp.tools.util.TrainingParameters;
4450
import opennlp.tools.util.model.ModelType;
4551

46-
public class TokenNameFinderModelTest {
52+
public class TokenNameFinderModelTest extends AbstractModelLoaderTest {
4753

4854
@Test
4955
void testNERWithPOSModel() throws IOException {
5056

5157
// create a resources folder
5258
Path resourcesFolder = Files.createTempDirectory("resources").toAbsolutePath();
5359

54-
// TODO Restore old test and provide a separate one which does it with the pt-pos-perceptron.bin
5560
// save a POS model there
5661
POSModel posModel = POSTaggerMETest.trainPOSModel(ModelType.MAXENT);
57-
File posModelFile = new File("pt-pos-perceptron.bin");
58-
Files.copy(posModelFile.toPath(), resourcesFolder.resolve("pt-pos-perceptron.bin"));
62+
File posModelFile = new File(resourcesFolder.toFile(), "pos-model.bin");
5963

60-
// posModel.serialize(posModelFile);
64+
posModel.serialize(posModelFile);
6165

6266
Assertions.assertTrue(posModelFile.exists());
63-
Assertions.assertTrue(resourcesFolder.resolve("pt-pos-perceptron.bin").toFile().exists());
64-
// end TODO
6567

6668
// load feature generator xml bytes
6769
InputStream fgInputStream = this.getClass().getResourceAsStream("ner-pos-features.xml");
@@ -108,4 +110,80 @@ void testNERWithPOSModel() throws IOException {
108110
FileUtil.deleteDirectory(resourcesFolder.toFile());
109111
}
110112
}
113+
114+
/*
115+
* OPENNLP-1369
116+
*/
117+
@EnabledWhenCDNAvailable(hostname = "opennlp.sourceforge.net")
118+
@Test
119+
void testNERWithPOSModelV15() throws IOException, URISyntaxException {
120+
121+
// 0. Download model from sourceforge and place at the right location
122+
final String modelName = "pt-pos-perceptron.bin";
123+
124+
downloadVersion15Model(modelName);
125+
126+
final Path model = OPENNLP_DIR.resolve(modelName);
127+
final Path resourcesFolder = Files.createTempDirectory("resources").toAbsolutePath();
128+
129+
Assertions.assertNotNull(model);
130+
Assertions.assertNotNull(resourcesFolder);
131+
132+
// 1. Copy the downloaded model to the temporary resource folder, so it can be referenced from
133+
// the feature gen xml file.
134+
135+
final Path copy = resourcesFolder.resolve(modelName);
136+
137+
Files.copy(OPENNLP_DIR.resolve(modelName), copy, StandardCopyOption.REPLACE_EXISTING);
138+
139+
Assertions.assertTrue(copy.toFile().exists());
140+
141+
// 2. Load feature generator xml bytes
142+
final URL featureGeneratorXmlUrl = this.getClass().getResource("ner-pos-features-v15.xml");
143+
Assertions.assertNotNull(featureGeneratorXmlUrl);
144+
145+
final Path featureGeneratorXmlPath = Path.of(featureGeneratorXmlUrl.toURI());
146+
Assertions.assertNotNull(featureGeneratorXmlPath);
147+
148+
final Path featureGenerator = Files.createTempFile("ner-featuregen-v15", ".xml");
149+
Assertions.assertNotNull(featureGenerator);
150+
151+
Files.copy(featureGeneratorXmlPath, featureGenerator, StandardCopyOption.REPLACE_EXISTING);
152+
Assertions.assertTrue(featureGenerator.toFile().exists());
153+
154+
Map<String, Object> resources;
155+
try {
156+
resources = TokenNameFinderTrainerTool.loadResources(resourcesFolder.toFile(),
157+
featureGenerator.toAbsolutePath().toFile());
158+
} catch (IOException e) {
159+
throw new TerminateToolException(-1, e.getMessage(), e);
160+
} finally {
161+
Files.delete(featureGenerator);
162+
}
163+
164+
165+
// train a name finder
166+
ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
167+
new PlainTextByLineStream(new MockInputStreamFactory(
168+
new File("opennlp/tools/namefind/voa1.train")), StandardCharsets.UTF_8));
169+
170+
TrainingParameters params = new TrainingParameters();
171+
params.put(TrainingParameters.ITERATIONS_PARAM, 70);
172+
params.put(TrainingParameters.CUTOFF_PARAM, 1);
173+
174+
TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
175+
params, TokenNameFinderFactory.create(null,
176+
Files.readString(featureGeneratorXmlPath, StandardCharsets.UTF_8)
177+
.getBytes(StandardCharsets.UTF_8), resources, new BioCodec()));
178+
179+
180+
File nerModel = Files.createTempFile("nermodel", ".bin").toFile();
181+
try (FileOutputStream modelOut = new FileOutputStream(nerModel)) {
182+
nameFinderModel.serialize(modelOut);
183+
Assertions.assertTrue(nerModel.exists());
184+
} finally {
185+
Assertions.assertTrue(nerModel.delete());
186+
FileUtil.deleteDirectory(resourcesFolder.toFile());
187+
}
188+
}
111189
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
<!--
2+
~ Licensed to the Apache Software Foundation (ASF) under one or more
3+
~ contributor license agreements. See the NOTICE file distributed with
4+
~ this work for additional information regarding copyright ownership.
5+
~ The ASF licenses this file to You under the Apache License, Version 2.0
6+
~ (the "License"); you may not use this file except in compliance with
7+
~ the License. You may obtain a copy of the License at
8+
~
9+
~ http://www.apache.org/licenses/LICENSE-2.0
10+
~
11+
~ Unless required by applicable law or agreed to in writing, software
12+
~ distributed under the License is distributed on an "AS IS" BASIS,
13+
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
~ See the License for the specific language governing permissions and
15+
~ limitations under the License.
16+
-->
17+
18+
<featureGenerators cache="true" name="nameFinder">
19+
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
20+
<int name="prevLength">2</int>
21+
<int name="nextLength">2</int>
22+
<generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/>
23+
</generator>
24+
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
25+
<int name="prevLength">2</int>
26+
<int name="nextLength">2</int>
27+
<generator class="opennlp.tools.util.featuregen.TokenFeatureGeneratorFactory"/>
28+
</generator>
29+
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
30+
<int name="prevLength">2</int>
31+
<int name="nextLength">2</int>
32+
<generator class="opennlp.tools.util.featuregen.POSTaggerNameFeatureGeneratorFactory">
33+
<str name="model">pt-pos-perceptron.bin</str>
34+
</generator>
35+
</generator>
36+
<generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>
37+
<generator class="opennlp.tools.util.featuregen.DefinitionFeatureGeneratorFactory"/>
38+
<generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/>
39+
<generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory">
40+
<bool name="begin">true</bool>
41+
<bool name="end">false</bool>
42+
</generator>
43+
</featureGenerators>

opennlp-tools/src/test/resources/opennlp/tools/namefind/ner-pos-features.xml

+1-2
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@
3030
<int name="prevLength">2</int>
3131
<int name="nextLength">2</int>
3232
<generator class="opennlp.tools.util.featuregen.POSTaggerNameFeatureGeneratorFactory">
33-
<!-- FIXME Restore old state for TokenNameFinderModelTest -->
34-
<str name="model">pt-pos-perceptron.bin</str>
33+
<str name="model">pos-model.bin</str>
3534
</generator>
3635
</generator>
3736
<generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>

0 commit comments

Comments
 (0)