Skip to content

Commit 186ecf9

Browse files
rzo1 and mawiesne authored
OPENNLP-1369 - NPE when serializing a TokenNameFinder model trained with POSTaggerNameFeatureGeneratorFactory (#571)
* OPENNLP-1369 - NPE when serializing a TokenNameFinder model trained with POSTaggerNameFeatureGeneratorFactory
  - Adds a first reproducer to check whether OPENNLP-1369 is broken => yes, it is.
  - Confirms that a basic workaround cures the issue; however, this is not pretty.
* OPENNLP-1369 - Fixes NPE when serializing a TokenNameFinder model trained with 1.5 POS models
  - Adds a test case to reproduce the NPE reported in OPENNLP-1369.
* OPENNLP-1369 - Checkstyle fixes

---------

Co-authored-by: Martin Wiesner <[email protected]>
1 parent b74c6df commit 186ecf9

File tree

5 files changed: +146 -6 lines changed

opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java

+9-3
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,12 @@ protected void validateArtifactMap() throws InvalidFormatException {
171171
}
172172
}
173173

174+
@Override
175+
protected boolean skipEntryForSerialization(Map.Entry<String, Object> entry) {
176+
// An old model format was detected, skipping the process for this entry, see: OPENNLP-1369
177+
return GENERATOR_DESCRIPTOR_ENTRY_NAME.equals(entry.getKey()) && entry.getValue() == null;
178+
}
179+
174180
/**
175181
* @deprecated use {@link POSModel#getPosSequenceModel} instead. This method will be removed soon.
176182
* Only required for Parser 1.5.x backward compatibility. Newer models don't need this anymore.
@@ -232,7 +238,7 @@ public Class<POSModelSerializer> getArtifactSerializerClass() {
232238
@Override
233239
public int hashCode() {
234240
return Objects.hash(artifactMap.get("manifest.properties"), artifactMap.get("pos.model"),
235-
Arrays.hashCode((byte[]) artifactMap.get("generator.featuregen"))
241+
Arrays.hashCode((byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME))
236242
);
237243
}
238244

@@ -248,8 +254,8 @@ public boolean equals(Object obj) {
248254

249255
return artifactMap.get("manifest.properties").equals(artifactMapToCheck.get("manifest.properties")) &&
250256
artifactMap.get("pos.model").equals(abstractModel) &&
251-
Arrays.equals((byte[]) artifactMap.get("generator.featuregen"),
252-
(byte[]) artifactMapToCheck.get("generator.featuregen"));
257+
Arrays.equals((byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME),
258+
(byte[]) artifactMapToCheck.get(GENERATOR_DESCRIPTOR_ENTRY_NAME));
253259
}
254260
return false;
255261
}

opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java

+12
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,10 @@ public final void serialize(OutputStream out) throws IOException {
597597

598598
Object artifact = entry.getValue();
599599

600+
if (skipEntryForSerialization(entry)) {
601+
continue;
602+
}
603+
600604
ArtifactSerializer serializer = getArtifactSerializer(name);
601605

602606
// If model is serialize-able always use the provided serializer
@@ -684,4 +688,12 @@ private void readObject(final ObjectInputStream in) throws IOException {
684688

685689
this.loadModel(in);
686690
}
691+
692+
/**
693+
* @param entry the entry to check
694+
* @return {@code true}, if the given entry should be skipped for serialization.
695+
*/
696+
protected boolean skipEntryForSerialization(Entry<String, Object> entry) {
697+
return false;
698+
}
687699
}

opennlp-tools/src/test/java/opennlp/tools/EnabledWhenCDNAvailable.java

-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
import org.junit.jupiter.api.extension.ExecutionCondition;
2828
import org.junit.jupiter.api.extension.ExtendWith;
2929
import org.junit.jupiter.api.extension.ExtensionContext;
30-
import org.junit.jupiter.params.ParameterizedTest;
3130

3231
import static org.junit.platform.commons.util.AnnotationUtils.findAnnotation;
3332

@@ -36,7 +35,6 @@
3635
*/
3736
@Retention(RetentionPolicy.RUNTIME)
3837
@ExtendWith(EnabledWhenCDNAvailable.CDNAvailableCondition.class)
39-
@ParameterizedTest
4038
public @interface EnabledWhenCDNAvailable {
4139

4240
String hostname();

opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderModelTest.java

+82-1
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,20 @@
2323
import java.io.IOException;
2424
import java.io.InputStream;
2525
import java.io.InputStreamReader;
26+
import java.net.URISyntaxException;
27+
import java.net.URL;
2628
import java.nio.charset.StandardCharsets;
2729
import java.nio.file.Files;
2830
import java.nio.file.Path;
31+
import java.nio.file.StandardCopyOption;
2932
import java.util.Map;
3033
import java.util.stream.Collectors;
3134

3235
import org.junit.jupiter.api.Assertions;
3336
import org.junit.jupiter.api.Test;
3437

38+
import opennlp.tools.EnabledWhenCDNAvailable;
39+
import opennlp.tools.cmdline.AbstractModelLoaderTest;
3540
import opennlp.tools.cmdline.TerminateToolException;
3641
import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
3742
import opennlp.tools.postag.POSModel;
@@ -43,7 +48,7 @@
4348
import opennlp.tools.util.TrainingParameters;
4449
import opennlp.tools.util.model.ModelType;
4550

46-
public class TokenNameFinderModelTest {
51+
public class TokenNameFinderModelTest extends AbstractModelLoaderTest {
4752

4853
@Test
4954
void testNERWithPOSModel() throws IOException {
@@ -104,4 +109,80 @@ void testNERWithPOSModel() throws IOException {
104109
FileUtil.deleteDirectory(resourcesFolder.toFile());
105110
}
106111
}
112+
113+
/*
114+
* OPENNLP-1369
115+
*/
116+
@EnabledWhenCDNAvailable(hostname = "opennlp.sourceforge.net")
117+
@Test
118+
void testNERWithPOSModelV15() throws IOException, URISyntaxException {
119+
120+
// 0. Download model from sourceforge and place at the right location
121+
final String modelName = "pt-pos-perceptron.bin";
122+
123+
downloadVersion15Model(modelName);
124+
125+
final Path model = OPENNLP_DIR.resolve(modelName);
126+
final Path resourcesFolder = Files.createTempDirectory("resources").toAbsolutePath();
127+
128+
Assertions.assertNotNull(model);
129+
Assertions.assertNotNull(resourcesFolder);
130+
131+
// 1. Copy the downloaded model to the temporary resource folder, so it can be referenced from
132+
// the feature gen xml file.
133+
134+
final Path copy = resourcesFolder.resolve(modelName);
135+
136+
Files.copy(OPENNLP_DIR.resolve(modelName), copy, StandardCopyOption.REPLACE_EXISTING);
137+
138+
Assertions.assertTrue(copy.toFile().exists());
139+
140+
// 2. Load feature generator xml bytes
141+
final URL featureGeneratorXmlUrl = this.getClass().getResource("ner-pos-features-v15.xml");
142+
Assertions.assertNotNull(featureGeneratorXmlUrl);
143+
144+
final Path featureGeneratorXmlPath = Path.of(featureGeneratorXmlUrl.toURI());
145+
Assertions.assertNotNull(featureGeneratorXmlPath);
146+
147+
final Path featureGenerator = Files.createTempFile("ner-featuregen-v15", ".xml");
148+
Assertions.assertNotNull(featureGenerator);
149+
150+
Files.copy(featureGeneratorXmlPath, featureGenerator, StandardCopyOption.REPLACE_EXISTING);
151+
Assertions.assertTrue(featureGenerator.toFile().exists());
152+
153+
Map<String, Object> resources;
154+
try {
155+
resources = TokenNameFinderTrainerTool.loadResources(resourcesFolder.toFile(),
156+
featureGenerator.toAbsolutePath().toFile());
157+
} catch (IOException e) {
158+
throw new TerminateToolException(-1, e.getMessage(), e);
159+
} finally {
160+
Files.delete(featureGenerator);
161+
}
162+
163+
164+
// train a name finder
165+
ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
166+
new PlainTextByLineStream(new MockInputStreamFactory(
167+
new File("opennlp/tools/namefind/voa1.train")), StandardCharsets.UTF_8));
168+
169+
TrainingParameters params = new TrainingParameters();
170+
params.put(TrainingParameters.ITERATIONS_PARAM, 70);
171+
params.put(TrainingParameters.CUTOFF_PARAM, 1);
172+
173+
TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
174+
params, TokenNameFinderFactory.create(null,
175+
Files.readString(featureGeneratorXmlPath, StandardCharsets.UTF_8)
176+
.getBytes(StandardCharsets.UTF_8), resources, new BioCodec()));
177+
178+
179+
File nerModel = Files.createTempFile("nermodel", ".bin").toFile();
180+
try (FileOutputStream modelOut = new FileOutputStream(nerModel)) {
181+
nameFinderModel.serialize(modelOut);
182+
Assertions.assertTrue(nerModel.exists());
183+
} finally {
184+
Assertions.assertTrue(nerModel.delete());
185+
FileUtil.deleteDirectory(resourcesFolder.toFile());
186+
}
187+
}
107188
}
opennlp-tools/src/test/resources/opennlp/tools/namefind/ner-pos-features-v15.xml

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
<!--
2+
~ Licensed to the Apache Software Foundation (ASF) under one or more
3+
~ contributor license agreements. See the NOTICE file distributed with
4+
~ this work for additional information regarding copyright ownership.
5+
~ The ASF licenses this file to You under the Apache License, Version 2.0
6+
~ (the "License"); you may not use this file except in compliance with
7+
~ the License. You may obtain a copy of the License at
8+
~
9+
~ http://www.apache.org/licenses/LICENSE-2.0
10+
~
11+
~ Unless required by applicable law or agreed to in writing, software
12+
~ distributed under the License is distributed on an "AS IS" BASIS,
13+
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
~ See the License for the specific language governing permissions and
15+
~ limitations under the License.
16+
-->
17+
18+
<featureGenerators cache="true" name="nameFinder">
19+
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
20+
<int name="prevLength">2</int>
21+
<int name="nextLength">2</int>
22+
<generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/>
23+
</generator>
24+
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
25+
<int name="prevLength">2</int>
26+
<int name="nextLength">2</int>
27+
<generator class="opennlp.tools.util.featuregen.TokenFeatureGeneratorFactory"/>
28+
</generator>
29+
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
30+
<int name="prevLength">2</int>
31+
<int name="nextLength">2</int>
32+
<generator class="opennlp.tools.util.featuregen.POSTaggerNameFeatureGeneratorFactory">
33+
<str name="model">pt-pos-perceptron.bin</str>
34+
</generator>
35+
</generator>
36+
<generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>
37+
<generator class="opennlp.tools.util.featuregen.DefinitionFeatureGeneratorFactory"/>
38+
<generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/>
39+
<generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory">
40+
<bool name="begin">true</bool>
41+
<bool name="end">false</bool>
42+
</generator>
43+
</featureGenerators>

0 commit comments

Comments
 (0)