OPENNLP-1526 Add Spanish abbreviation dictionary

- moves abbreviation dictionaries to a common location: "tools/lang", independent of 'sentdetect' use cases, test scope accordingly
- adds abb_ES.xml to opennlp-tools/lang
- adds new test cases for the ES localization
- adjusts and enhances existing test cases for new dictionary locations
apache · Dec 13, 2023 · 76d4be9 · 76d4be9
1 parent 7d2722e
commit 76d4be9
Show file tree

Hide file tree

Showing 13 changed files with 527 additions and 16 deletions.
diff --git a/opennlp-tools/lang/de/sentdetect/abb_DE.xml → opennlp-tools/lang/de/abb_DE.xml b/opennlp-tools/lang/de/sentdetect/abb_DE.xml → opennlp-tools/lang/de/abb_DE.xml
diff --git a/opennlp-tools/lang/es/abb_ES.xml b/opennlp-tools/lang/es/abb_ES.xml
@@ -0,0 +1,236 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+
+<dictionary case_sensitive="false">
+  <entry>
+    <token>a.C.</token>
+  </entry>
+  <entry>
+    <token>a. de C.</token>
+  </entry>
+  <entry>
+    <token>a.J.C.</token>
+  </entry>
+  <entry>
+    <token>a. de J.C.</token>
+  </entry>
+  <entry>
+    <token>a. m.</token>
+  </entry>
+  <entry>
+    <token>apdo.</token>
+  </entry>
+  <entry>
+    <token>apdo.</token>
+  </entry>
+  <entry>
+    <token>aprox.</token>
+  </entry>
+  <entry>
+    <token>Av.</token>
+  </entry>
+  <entry>
+    <token>Avda.</token>
+  </entry>
+  <entry>
+    <token>Bs. As.</token>
+  </entry>
+  <entry>
+    <token>c.c.</token>
+  </entry>
+  <entry>
+    <token>cap.</token>
+  </entry>
+  <entry>
+    <token>D.</token>
+  </entry>
+  <entry>
+    <token>Da.</token>
+  </entry>
+  <entry>
+    <token>Dña.</token>
+  </entry>
+  <entry>
+    <token>d.C.</token>
+  </entry>
+  <entry>
+    <token>d. de C.</token>
+  </entry>
+  <entry>
+    <token>d.J.C.</token>
+  </entry>
+  <entry>
+    <token>d. de J.C.</token>
+  </entry>
+  <entry>
+    <token>dna.</token>
+  </entry>
+  <entry>
+    <token>EE. UU.</token>
+  </entry>
+  <entry>
+    <token>etc.</token>
+  </entry>
+  <entry>
+    <token>f.c.</token>
+  </entry>
+  <entry>
+    <token>F.C.</token>
+  </entry>
+  <entry>
+    <token>FF. AA.</token>
+  </entry>
+  <entry>
+    <token>Dr.</token>
+  </entry>
+  <entry>
+    <token>Dra.</token>
+  </entry>
+  <entry>
+    <token>Gob.</token>
+  </entry>
+  <entry>
+    <token>Lic.</token>
+  </entry>
+  <entry>
+    <token>Ing.</token>
+  </entry>
+  <entry>
+    <token>Pdte.</token>
+  </entry>
+  <entry>
+    <token>Pdta.</token>
+  </entry>
+  <entry>
+    <token>pág.</token>
+  </entry>
+  <entry>
+    <token>no.</token>
+  </entry>
+  <entry>
+    <token>núm.</token>
+  </entry>
+  <entry>
+    <token>p.ej.</token>
+  </entry>
+  <entry>
+    <token>p. m.</token>
+  </entry>
+  <entry>
+    <token>Prof.</token>
+  </entry>
+  <entry>
+    <token>Profa.</token>
+  </entry>
+  <entry>
+    <token>q.e.p.d.</token>
+  </entry>
+  <entry>
+    <token>S.A.</token>
+  </entry>
+  <entry>
+    <token>S.L.</token>
+  </entry>
+  <entry>
+    <token>Sr.</token>
+  </entry>
+  <entry>
+    <token>Sra.</token>
+  </entry>
+  <entry>
+    <token>Srta.</token>
+  </entry>
+  <entry>
+    <token>Ud.</token>
+  </entry>
+  <entry>
+    <token>Vd.</token>
+  </entry>
+  <entry>
+    <token>Uds.</token>
+  </entry>
+  <entry>
+    <token>Vds.</token>
+  </entry>
+  <entry>
+    <token>vol.</token>
+  </entry>
+  <entry>
+    <token>v.</token>
+  </entry>
+  <entry>
+    <token>lu.</token>
+  </entry>
+  <entry>
+    <token>ma.</token>
+  </entry>
+  <entry>
+    <token>mi.</token>
+  </entry>
+  <entry>
+    <token>ju.</token>
+  </entry>
+  <entry>
+    <token>vi.</token>
+  </entry>
+  <entry>
+    <token>sá.</token>
+  </entry>
+  <entry>
+    <token>do.</token>
+  </entry>
+  <entry>
+    <token>en.</token>
+  </entry>
+  <entry>
+    <token>feb.</token>
+  </entry>
+  <entry>
+    <token>mzo.</token>
+  </entry>
+  <entry>
+    <token>abr.</token>
+  </entry>
+  <entry>
+    <token>my.</token>
+  </entry>
+  <entry>
+    <token>jun.</token>
+  </entry>
+  <entry>
+    <token>jul.</token>
+  </entry>
+  <entry>
+    <token>ag.</token>
+  </entry>
+  <entry>
+    <token>set.</token>
+  </entry>
+  <entry>
+    <token>oct.</token>
+  </entry>
+  <entry>
+    <token>nov.</token>
+  </entry>
+  <entry>
+    <token>dic.</token>
+  </entry>
+</dictionary>
diff --git a/opennlp-tools/lang/ga/sentdetect/abb.xml → opennlp-tools/lang/ga/abb_GA.xml b/opennlp-tools/lang/ga/sentdetect/abb.xml → opennlp-tools/lang/ga/abb_GA.xml
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java
@@ -63,7 +63,7 @@ public void testGetShortDescription() {
 
   @Test
   public void testLoadDictHappyCase() throws IOException {
-    File dictFile = new File("lang/ga/sentdetect/abb.xml");
+    File dictFile = new File("lang/ga/abb_GA.xml");
     Dictionary dict = TokenizerTrainerTool.loadDict(dictFile);
     Assertions.assertNotNull(dict);
   }

diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/AbstractSentenceDetectorTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/AbstractSentenceDetectorTest.java
@@ -29,11 +29,15 @@
 import opennlp.tools.util.TrainingParameters;
 
 public abstract class AbstractSentenceDetectorTest {
-
+
+  protected static final Locale LOCALE_SPANISH = new Locale("es");
+
   static ObjectStream<SentenceSample> createSampleStream(Locale loc) throws IOException {
     final String trainingResource;
     if (loc.equals(Locale.GERMAN)) {
       trainingResource = "/opennlp/tools/sentdetect/Sentences_DE.txt";
+    } else if (loc.equals(LOCALE_SPANISH)) {
+      trainingResource = "/opennlp/tools/sentdetect/Sentences_ES.txt";
     } else {
       trainingResource = "/opennlp/tools/sentdetect/Sentences.txt";
     }
@@ -43,22 +47,26 @@ static ObjectStream<SentenceSample> createSampleStream(Locale loc) throws IOExce
   }
 
   static SentenceModel train(SentenceDetectorFactory factory, Locale loc) throws IOException {
-    final String languageCode;
+    final String lang;
     if (loc.equals(Locale.GERMAN)) {
-      languageCode = "deu";
+      lang = "deu";
+    } else if (loc.equals(LOCALE_SPANISH)) {
+      lang = "spa";
     } else {
-      languageCode = "eng";
+      lang = "eng";
     }
-    return SentenceDetectorME.train(languageCode, createSampleStream(loc), factory,
+    return SentenceDetectorME.train(lang, createSampleStream(loc), factory,
             TrainingParameters.defaultParams());
   }
 
   static Dictionary loadAbbDictionary(Locale loc) throws IOException {
     final String abbrevDict;
     if (loc.equals(Locale.GERMAN)) {
-      abbrevDict = "opennlp/tools/sentdetect/abb_DE.xml";
+      abbrevDict = "opennlp/tools/lang/abb_DE.xml";
+    } else if (loc.equals(LOCALE_SPANISH)) {
+      abbrevDict = "opennlp/tools/lang/abb_ES.xml";
     } else {
-      abbrevDict = "opennlp/tools/sentdetect/abb.xml";
+      abbrevDict = "opennlp/tools/lang/abb_EN.xml";
     }
     return new Dictionary(AbstractSentenceDetectorTest.class.getClassLoader()
             .getResourceAsStream(abbrevDict));

diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -17,7 +17,6 @@
 
 package opennlp.tools.sentdetect;
 
-
 import java.io.IOException;
 import java.util.Locale;
 
@@ -33,8 +32,8 @@
  * Verifies OPENNLP-793 in combination with OPENNLP-570.
  * <p>
  * In this context, well-known German (de_DE) abbreviations must be respected,
- * so that non-sentence breaks (words abbreviated with one or more '.' characters)
- * result in incorrect sentence boundaries .
+ * so that words abbreviated with one or more '.' characters do not
+ * result in incorrect sentence boundaries.
  * <p>
  * See:
  * <a href="https://issues.apache.org/jira/projects/OPENNLP/issues/OPENNLP-793">OPENNLP-793</a>
@@ -65,8 +64,8 @@ void testSentDetectWithInlineAbbreviationsEx1() {
     final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
-    String sampleSentences1 = sent1 + " " + sent2;
-    String[] sents = sentDetect.sentDetect(sampleSentences1);
+    String sampleSentences = sent1 + " " + sent2;
+    String[] sents = sentDetect.sentDetect(sampleSentences);
     Assertions.assertEquals(2, sents.length);
     Assertions.assertEquals(sent1, sents[0]);
     Assertions.assertEquals(sent2, sents[1]);