Skip to content

Commit

Permalink
OPENNLP-1526 Add Spanish abbreviation dictionary
Browse files Browse the repository at this point in the history
- moves abbreviation dictionaries to a common location: "tools/lang", independent of 'sentdetect' use cases, test scope accordingly
- adds abb_ES.xml to opennlp-tools/lang
- adds new test cases for the ES localization
- adjusts and enhances existing test cases for new dictionary locations
  • Loading branch information
mawiesne committed Dec 13, 2023
1 parent 7d2722e commit 76d4be9
Show file tree
Hide file tree
Showing 13 changed files with 527 additions and 16 deletions.
File renamed without changes.
236 changes: 236 additions & 0 deletions opennlp-tools/lang/es/abb_ES.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

<dictionary case_sensitive="false">
<entry>
<token>a.C.</token>
</entry>
<entry>
<token>a. de C.</token>
</entry>
<entry>
<token>a.J.C.</token>
</entry>
<entry>
<token>a. de J.C.</token>
</entry>
<entry>
<token>a. m.</token>
</entry>
<entry>
<token>apdo.</token>
</entry>
<entry>
<token>apdo.</token>
</entry>
<entry>
<token>aprox.</token>
</entry>
<entry>
<token>Av.</token>
</entry>
<entry>
<token>Avda.</token>
</entry>
<entry>
<token>Bs. As.</token>
</entry>
<entry>
<token>c.c.</token>
</entry>
<entry>
<token>cap.</token>
</entry>
<entry>
<token>D.</token>
</entry>
<entry>
<token>Da.</token>
</entry>
<entry>
<token>Dña.</token>
</entry>
<entry>
<token>d.C.</token>
</entry>
<entry>
<token>d. de C.</token>
</entry>
<entry>
<token>d.J.C.</token>
</entry>
<entry>
<token>d. de J.C.</token>
</entry>
<entry>
<token>dna.</token>
</entry>
<entry>
<token>EE. UU.</token>
</entry>
<entry>
<token>etc.</token>
</entry>
<entry>
<token>f.c.</token>
</entry>
<entry>
<token>F.C.</token>
</entry>
<entry>
<token>FF. AA.</token>
</entry>
<entry>
<token>Dr.</token>
</entry>
<entry>
<token>Dra.</token>
</entry>
<entry>
<token>Gob.</token>
</entry>
<entry>
<token>Lic.</token>
</entry>
<entry>
<token>Ing.</token>
</entry>
<entry>
<token>Pdte.</token>
</entry>
<entry>
<token>Pdta.</token>
</entry>
<entry>
<token>pág.</token>
</entry>
<entry>
<token>no.</token>
</entry>
<entry>
<token>núm.</token>
</entry>
<entry>
<token>p.ej.</token>
</entry>
<entry>
<token>p. m.</token>
</entry>
<entry>
<token>Prof.</token>
</entry>
<entry>
<token>Profa.</token>
</entry>
<entry>
<token>q.e.p.d.</token>
</entry>
<entry>
<token>S.A.</token>
</entry>
<entry>
<token>S.L.</token>
</entry>
<entry>
<token>Sr.</token>
</entry>
<entry>
<token>Sra.</token>
</entry>
<entry>
<token>Srta.</token>
</entry>
<entry>
<token>Ud.</token>
</entry>
<entry>
<token>Vd.</token>
</entry>
<entry>
<token>Uds.</token>
</entry>
<entry>
<token>Vds.</token>
</entry>
<entry>
<token>vol.</token>
</entry>
<entry>
<token>v.</token>
</entry>
<entry>
<token>lu.</token>
</entry>
<entry>
<token>ma.</token>
</entry>
<entry>
<token>mi.</token>
</entry>
<entry>
<token>ju.</token>
</entry>
<entry>
<token>vi.</token>
</entry>
<entry>
<token>sá.</token>
</entry>
<entry>
<token>do.</token>
</entry>
<entry>
<token>en.</token>
</entry>
<entry>
<token>feb.</token>
</entry>
<entry>
<token>mzo.</token>
</entry>
<entry>
<token>abr.</token>
</entry>
<entry>
<token>my.</token>
</entry>
<entry>
<token>jun.</token>
</entry>
<entry>
<token>jul.</token>
</entry>
<entry>
<token>ag.</token>
</entry>
<entry>
<token>set.</token>
</entry>
<entry>
<token>oct.</token>
</entry>
<entry>
<token>nov.</token>
</entry>
<entry>
<token>dic.</token>
</entry>
</dictionary>
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public void testGetShortDescription() {

@Test
public void testLoadDictHappyCase() throws IOException {
File dictFile = new File("lang/ga/sentdetect/abb.xml");
File dictFile = new File("lang/ga/abb_GA.xml");
Dictionary dict = TokenizerTrainerTool.loadDict(dictFile);
Assertions.assertNotNull(dict);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@
import opennlp.tools.util.TrainingParameters;

public abstract class AbstractSentenceDetectorTest {


protected static final Locale LOCALE_SPANISH = new Locale("es");

static ObjectStream<SentenceSample> createSampleStream(Locale loc) throws IOException {
final String trainingResource;
if (loc.equals(Locale.GERMAN)) {
trainingResource = "/opennlp/tools/sentdetect/Sentences_DE.txt";
} else if (loc.equals(LOCALE_SPANISH)) {
trainingResource = "/opennlp/tools/sentdetect/Sentences_ES.txt";
} else {
trainingResource = "/opennlp/tools/sentdetect/Sentences.txt";
}
Expand All @@ -43,22 +47,26 @@ static ObjectStream<SentenceSample> createSampleStream(Locale loc) throws IOExce
}

static SentenceModel train(SentenceDetectorFactory factory, Locale loc) throws IOException {
final String languageCode;
final String lang;
if (loc.equals(Locale.GERMAN)) {
languageCode = "deu";
lang = "deu";
} else if (loc.equals(LOCALE_SPANISH)) {
lang = "spa";
} else {
languageCode = "eng";
lang = "eng";
}
return SentenceDetectorME.train(languageCode, createSampleStream(loc), factory,
return SentenceDetectorME.train(lang, createSampleStream(loc), factory,
TrainingParameters.defaultParams());
}

static Dictionary loadAbbDictionary(Locale loc) throws IOException {
final String abbrevDict;
if (loc.equals(Locale.GERMAN)) {
abbrevDict = "opennlp/tools/sentdetect/abb_DE.xml";
abbrevDict = "opennlp/tools/lang/abb_DE.xml";
} else if (loc.equals(LOCALE_SPANISH)) {
abbrevDict = "opennlp/tools/lang/abb_ES.xml";
} else {
abbrevDict = "opennlp/tools/sentdetect/abb.xml";
abbrevDict = "opennlp/tools/lang/abb_EN.xml";
}
return new Dictionary(AbstractSentenceDetectorTest.class.getClassLoader()
.getResourceAsStream(abbrevDict));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

package opennlp.tools.sentdetect;


import java.io.IOException;
import java.util.Locale;

Expand All @@ -33,8 +32,8 @@
* Verifies OPENNLP-793 in combination with OPENNLP-570.
* <p>
* In this context, well-known German (de_DE) abbreviations must be respected,
* so that non-sentence breaks (words abbreviated with one or more '.' characters)
* result in incorrect sentence boundaries .
* so that words abbreviated with one or more '.' characters do not
* result in incorrect sentence boundaries.
* <p>
* See:
* <a href="https://issues.apache.org/jira/projects/OPENNLP/issues/OPENNLP-793">OPENNLP-793</a>
Expand Down Expand Up @@ -65,8 +64,8 @@ void testSentDetectWithInlineAbbreviationsEx1() {
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";

SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String sampleSentences1 = sent1 + " " + sent2;
String[] sents = sentDetect.sentDetect(sampleSentences1);
String sampleSentences = sent1 + " " + sent2;
String[] sents = sentDetect.sentDetect(sampleSentences);
Assertions.assertEquals(2, sents.length);
Assertions.assertEquals(sent1, sents[0]);
Assertions.assertEquals(sent2, sents[1]);
Expand Down
Loading

0 comments on commit 76d4be9

Please sign in to comment.