Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
OPENNLP-1650 Update DownloadUtil to use Models release 1.2
Browse files Browse the repository at this point in the history
- adapts DownloadUtil, related classes and tests towards Models 1.2
- updates index.html in opennlp/tools/util to the latest Models 1.2 data for DownloadParserTest
- introduces DownloadUtil.ModelType#LEMMATIZER as those are now available
- adds LemmatizerModelLoaderIT
- extracts some copy-and-pasted strings to constants
- fixes broken JavaDoc in PerceptronTrainer along the path
mawiesne committed Nov 23, 2024
1 parent cff36bc commit 978010a
Showing 12 changed files with 676 additions and 400 deletions.
Original file line number Diff line number Diff line change
@@ -108,7 +108,7 @@ public PerceptronTrainer(TrainingParameters parameters) {
* {@inheritDoc}
*
* @throws IllegalArgumentException Thrown if the algorithm name is not equal to
* {{@link #PERCEPTRON_VALUE}}.
* {@link #PERCEPTRON_VALUE}.
*/
@Override
public void validate() {
@@ -215,7 +215,7 @@ public void setSkippedAveraging(boolean averaging) {
*
* @param iterations The number of iterations to use for training.
* @param di The {@link DataIndexer} used as data input.
* @param cutoff The {{@link #CUTOFF_PARAM}} value to use for training.
* @param cutoff The {@link TrainingParameters#CUTOFF_PARAM} value to use for training.
*
* @return A valid, trained {@link AbstractModel perceptron model}.
*/
@@ -228,9 +228,9 @@ public AbstractModel trainModel(int iterations, DataIndexer di, int cutoff) {
*
* @param iterations The number of iterations to use for training.
* @param di The {@link DataIndexer} used as data input.
* @param cutoff The {{@link #CUTOFF_PARAM}} value to use for training.
* @param cutoff The {@link TrainingParameters#CUTOFF_PARAM} value to use for training.
* @param useAverage Whether to use 'averaging', or not.
* See {{@link #setSkippedAveraging(boolean)}} for details.
* See {@link #setSkippedAveraging(boolean)} for details.
*
* @return A valid, trained {@link AbstractModel perceptron model}.
*/
25 changes: 23 additions & 2 deletions opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
Original file line number Diff line number Diff line change
@@ -57,6 +57,7 @@ public class DownloadUtil {
* The type of model.
*/
public enum ModelType {
LEMMATIZER("lemma"),
TOKENIZER("token"),
SENTENCE_DETECTOR("sent"),
POS("pos-perceptron"),
@@ -72,13 +73,13 @@ public enum ModelType {
}

private static final String BASE_URL = "https://dlcdn.apache.org/opennlp/";
private static final String MODELS_UD_MODELS_1_1 = "models/ud-models-1.1/";
private static final String MODELS_UD_MODELS_1_2 = "models/ud-models-1.2/";

public static final Map<String, Map<ModelType, String>> available_models;

static {
try {
available_models = new DownloadParser(new URL(BASE_URL + MODELS_UD_MODELS_1_1)).getAvailableModels();
available_models = new DownloadParser(new URL(BASE_URL + MODELS_UD_MODELS_1_2)).getAvailableModels();
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
@@ -240,18 +241,34 @@ private Map<String, Map<ModelType, String>> toMap(List<String> links) {
addModel("fr", link, result);
} else if (link.contains("bg-ud")) { // Bulgarian
addModel("bg", link, result);
} else if (link.contains("ca-ud")) { // Catalan
addModel("ca", link, result);
} else if (link.contains("cs-ud")) { // Czech
addModel("cs", link, result);
} else if (link.contains("hr-ud")) { // Croatian
addModel("hr", link, result);
} else if (link.contains("da-ud")) { // Danish
addModel("da", link, result);
} else if (link.contains("el-ud")) { // Greek
addModel("el", link, result);
} else if (link.contains("es-ud")) { // Spanish
addModel("es", link, result);
} else if (link.contains("et-ud")) { // Estonian
addModel("et", link, result);
} else if (link.contains("eu-ud")) { // Basque
addModel("eu", link, result);
} else if (link.contains("fi-ud")) { // Finnish
addModel("fi", link, result);
} else if (link.contains("hy-ud")) { // Armenian
addModel("hy", link, result);
} else if (link.contains("is-ud")) { // Icelandic
addModel("is", link, result);
} else if (link.contains("ka-ud")) { // Georgian
addModel("ka", link, result);
} else if (link.contains("kk-ud")) { // Kazakh
addModel("kk", link, result);
} else if (link.contains("ko-ud")) { // Korean
addModel("ko", link, result);
} else if (link.contains("lv-ud")) { // Latvian
addModel("lv", link, result);
} else if (link.contains("no-ud")) { // Norwegian
@@ -272,6 +289,8 @@ private Map<String, Map<ModelType, String>> toMap(List<String> links) {
addModel("sl", link, result);
} else if (link.contains("sv-ud")) { // Swedish
addModel("sv", link, result);
} else if (link.contains("tr-ud")) { // Turkish
addModel("tr", link, result);
} else if (link.contains("uk-ud")) { // Ukrainian
addModel("uk", link, result);
}
@@ -288,6 +307,8 @@ private void addModel(String locale, String link, Map<String, Map<ModelType, Str
models.put(ModelType.SENTENCE_DETECTOR, url);
} else if (link.contains("tokens")) {
models.put(ModelType.TOKENIZER, url);
} else if (link.contains("lemma")) {
models.put(ModelType.LEMMATIZER, url);
} else if (link.contains("pos")) {
models.put(ModelType.POS, url);
}
Original file line number Diff line number Diff line change
@@ -25,6 +25,7 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,6 +37,13 @@ public abstract class AbstractModelLoaderTest {
private static final String BASE_URL_MODELS_V15 = "https://opennlp.sourceforge.net/models-1.5/";
private static final String BASE_URL_MODELS_V183 = "https://dlcdn.apache.org/opennlp/models/langdetect/1.8.3/";
protected static final Path OPENNLP_DIR = Paths.get(System.getProperty("user.home") + "/.opennlp/");
protected static final String VER = "1.2-2.5.0";
protected static final String BIN = ".bin";
// Language codes for which the model-loader integration tests request a model download.
// Declared final so that no subclass can accidentally reassign the shared list;
// List.of(...) already makes the contents unmodifiable.
protected static final List<String> SUPPORTED_LANG_CODES = List.of(
    "en", "fr", "de", "it", "nl", "bg", "ca", "cs", "da", "el",
    "es", "et", "eu", "fi", "hr", "hy", "is", "ka", "kk", "ko",
    "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv",
    "tr", "uk");

protected static void downloadVersion15Model(String modelName) throws IOException {
downloadModel(new URL(BASE_URL_MODELS_V15 + modelName));
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.cmdline.lemmatizer;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import opennlp.tools.AbstractModelLoaderTest;
import opennlp.tools.EnabledWhenCDNAvailable;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.DownloadUtil;

/**
 * Integration test for {@link LemmatizerModelLoader}.
 * <p>
 * Before any test runs, a lemmatizer model is requested via {@link DownloadUtil} for every
 * entry of {@code SUPPORTED_LANG_CODES}. Each parameterized test case then opens the
 * corresponding model file below {@code OPENNLP_DIR} and verifies that the loader can
 * deserialize it.
 */
@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
public class LemmatizerModelLoaderIT extends AbstractModelLoaderTest {

  // SUT
  private LemmatizerModelLoader loader;

  /**
   * Requests the lemmatizer model for every supported language code.
   *
   * @throws RuntimeException Thrown if a download fails; the causing {@link IOException}
   *                          is preserved as the cause.
   */
  @BeforeAll
  public static void initResources() {
    SUPPORTED_LANG_CODES.forEach(lang -> {
      try {
        DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.LEMMATIZER, LemmatizerModel.class);
      } catch (IOException e) {
        // forEach's lambda cannot throw checked exceptions, so wrap and keep the cause.
        throw new RuntimeException(e);
      }
    });
  }

  /**
   * Creates a fresh loader instance for every test case.
   */
  @BeforeEach
  public void setup() {
    loader = new LemmatizerModelLoader();
  }

  /**
   * Verifies that the lemmatizer model for the given language/treebank can be loaded
   * from its serialized form.
   *
   * @param langModel The language and treebank part of the model file name,
   *                  for instance {@code en-ud-ewt}.
   * @throws IOException Thrown if the model file could not be opened or read.
   */
  @ParameterizedTest(name = "Verify \"{0}\" lemmatizer model loading")
  @ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
      "eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
      "ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
      "sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
  public void testLoadModelByLanguage(String langModel) throws IOException {
    String modelName = "opennlp-" + langModel + "-lemmas-" + VER + BIN;
    // Close the stream deterministically; one file handle would otherwise leak per test case.
    try (InputStream modelIn = Files.newInputStream(OPENNLP_DIR.resolve(modelName))) {
      LemmatizerModel model = loader.loadModel(modelIn);
      Assertions.assertNotNull(model);
      Assertions.assertTrue(model.isLoadedFromSerialized());
    }
  }
}
Original file line number Diff line number Diff line change
@@ -19,7 +19,6 @@

import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@@ -40,12 +39,9 @@ public class POSModelLoaderIT extends AbstractModelLoaderTest {

@BeforeAll
public static void initResources() {
List<String> langs = List.of("en", "fr", "de", "it", "nl", "bg", "cs", "da",
"es", "et", "fi", "hr", "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "uk");
langs.forEach(lang -> {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang,
DownloadUtil.ModelType.POS, POSModel.class);
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.POS, POSModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -59,11 +55,12 @@ public void setup() {

@ParameterizedTest(name = "Verify \"{0}\" POS model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "cs-ud-pdt", "da-ud-ddt", "es-ud-gsd", "et-ud-edt", "fi-ud-tdt", "hr-ud-set",
"lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "uk-ud-iu"})
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-pos-1.1-2.4.0.bin";
String modelName = "opennlp-" + langModel + "-pos-" + VER + BIN;
POSModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
Original file line number Diff line number Diff line change
@@ -19,7 +19,6 @@

import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@@ -40,12 +39,9 @@ public class SentenceModelLoaderIT extends AbstractModelLoaderTest {

@BeforeAll
public static void initResources() {
List<String> langs = List.of("en", "fr", "de", "it", "nl", "bg", "cs", "da",
"es", "et", "fi", "hr", "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "uk");
langs.forEach(lang -> {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang,
DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -59,11 +55,12 @@ public void setup() {

@ParameterizedTest(name = "Verify \"{0}\" sentence model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "cs-ud-pdt", "da-ud-ddt", "es-ud-gsd", "et-ud-edt", "fi-ud-tdt", "hr-ud-set",
"lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "uk-ud-iu"})
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-sentence-1.1-2.4.0.bin";
String modelName = "opennlp-" + langModel + "-sentence-" + VER + BIN;
SentenceModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
Original file line number Diff line number Diff line change
@@ -19,7 +19,6 @@

import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@@ -40,12 +39,9 @@ public class TokenizerModelLoaderIT extends AbstractModelLoaderTest {

@BeforeAll
public static void initResources() {
List<String> langs = List.of("en", "fr", "de", "it", "nl", "bg", "cs", "da",
"es", "et", "fi", "hr", "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "uk");
langs.forEach(lang -> {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang,
DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class);
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -59,11 +55,12 @@ public void setup() {

@ParameterizedTest(name = "Verify \"{0}\" tokenizer model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "cs-ud-pdt", "da-ud-ddt", "es-ud-gsd", "et-ud-edt", "fi-ud-tdt", "hr-ud-set",
"lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "uk-ud-iu"})
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-tokens-1.1-2.4.0.bin";
String modelName = "opennlp-" + langModel + "-tokens-" + VER + BIN;
TokenizerModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
Loading

0 comments on commit 978010a

Please sign in to comment.