|
18 | 18 | package opennlp.tools.sentdetect;
|
19 | 19 |
|
20 | 20 | import java.io.IOException;
|
| 21 | +import java.nio.charset.StandardCharsets; |
| 22 | +import java.util.ArrayList; |
| 23 | +import java.util.Arrays; |
| 24 | +import java.util.List; |
21 | 25 | import java.util.Locale;
|
22 | 26 |
|
23 | 27 | import org.junit.jupiter.api.Assertions;
|
|
27 | 31 | import org.junit.jupiter.params.provider.ValueSource;
|
28 | 32 |
|
29 | 33 | import opennlp.tools.dictionary.Dictionary;
|
| 34 | +import opennlp.tools.formats.ResourceAsStreamFactory; |
| 35 | +import opennlp.tools.util.InputStreamFactory; |
| 36 | +import opennlp.tools.util.PlainTextByLineStream; |
30 | 37 |
|
31 | 38 | /**
|
32 | 39 | * Tests for the {@link SentenceDetectorME} class.
|
@@ -90,4 +97,46 @@ void testSentDetectWithInlineAbbreviationsResultsInTwoSentences() {
|
90 | 97 | Assertions.assertEquals(2, probs.length);
|
91 | 98 | }
|
92 | 99 |
|
| 100 | + /* |
| 101 | + * Verifies OPENNLP-1163, |
| 102 | + * see: https://issues.apache.org/jira/browse/OPENNLP-1163 |
| 103 | + * |
| 104 | + * Original problem: |
| 105 | + * "Even though the abbreviation "art." was included in the XML file, |
| 106 | + * the sentence detector breaks the sentence on instances of this |
| 107 | + * abbreviation preceded by article and apostrophe |
| 108 | + * (e.g. nell'art., dall'art., dell'art.)" |
| 109 | + * |
| 110 | + * This test demonstrates it is working, with "art." in the abbreviations xml file. |
| 111 | + */ |
| 112 | + @Test |
| 113 | + void testSentDetectOpenNLP1163() throws IOException { |
| 114 | + final SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); |
| 115 | + |
| 116 | + final String testResource = "/opennlp/tools/sentdetect/Test-Sample_OPENNLP-1163.txt"; |
| 117 | + InputStreamFactory in = new ResourceAsStreamFactory( |
| 118 | + AbstractSentenceDetectorTest.class, testResource); |
| 119 | + List<String> detectedSentences = new ArrayList<>(); |
| 120 | + try (PlainTextByLineStream stream = new PlainTextByLineStream(in, StandardCharsets.UTF_8)) { |
| 121 | + StringBuilder text = new StringBuilder(); |
| 122 | + String line; |
| 123 | + do { |
| 124 | + line = stream.read(); |
| 125 | + text.append(line); |
| 126 | + } while (line != null); |
| 127 | + |
| 128 | + String[] sents = sentDetect.sentDetect(text.toString()); |
| 129 | + detectedSentences.addAll(Arrays.asList(sents)); |
| 130 | + } |
| 131 | + |
| 132 | + // Test |
| 133 | + Assertions.assertEquals(11, detectedSentences.size()); |
| 134 | + for (String sent : detectedSentences) { |
| 135 | + Assertions.assertFalse(hasMisplacedAbbreviationAtEnd(sent)); |
| 136 | + } |
| 137 | + } |
| 138 | + |
| 139 | + private boolean hasMisplacedAbbreviationAtEnd(String sent) { |
| 140 | + return sent.endsWith("dell'art.") || sent.endsWith("dall'art.") || sent.endsWith("nell'art."); |
| 141 | + } |
93 | 142 | }
|
0 commit comments