|
8 | 8 |
|
9 | 9 | import pytest |
10 | 10 |
|
11 | | -from ..core.types import CaseHandling, LanguageFamily, TokenizerConfig, TokenType |
| 11 | +from ..core.types import CaseHandling, TokenizerConfig |
12 | 12 | from .tokenizer import BasicTokenizer |
13 | 13 |
|
14 | 14 |
|
@@ -1181,6 +1181,82 @@ def test_international_social_media(self): |
1181 | 1181 |
|
1182 | 1182 |
|
1183 | 1183 | # Fixtures for reusable test data |
| 1184 | + |
| 1185 | + |
class TestAbbreviationsAndPunctuation:
    """Test abbreviation handling and punctuation edge cases.

    Covers abbreviation preservation, ellipsis filtering, and regression
    checks for Chinese character splitting and English contractions.
    """

    def _assert_tokens(self, text, expected):
        """Tokenize *text* with a default BasicTokenizer and compare against *expected*."""
        result = BasicTokenizer().tokenize(text)
        assert result == expected, f"Expected {expected}, got {result}"

    def test_abbreviations_basic(self):
        """Abbreviations should stay intact as single tokens."""
        self._assert_tokens(
            "The c.e.o.s met yesterday",
            ["the", "c.e.o.s", "met", "yesterday"],
        )

    def test_abbreviations_with_trailing_period(self):
        """A trailing sentence period stays part of the abbreviation token."""
        self._assert_tokens(
            "I live in U.S. now",
            ["i", "live", "in", "u.s.", "now"],
        )

    def test_multiple_abbreviations(self):
        """Several abbreviations in one sentence are each preserved."""
        self._assert_tokens(
            "U.S. and U.K. relations",
            ["u.s.", "and", "u.k.", "relations"],
        )

    def test_ellipses_without_punctuation(self):
        """Ellipses are filtered out under the default config (include_punctuation=False)."""
        self._assert_tokens(
            "Wait for it...",
            ["wait", "for", "it"],
        )

    def test_chinese_tokenization_regression(self):
        """Regression check: Chinese text is still split character by character."""
        self._assert_tokens(
            "你好世界",
            ["你", "好", "世", "界"],
        )

    def test_contractions_regression(self):
        """Regression check: contractions remain single tokens."""
        self._assert_tokens(
            "I don't think it's ready",
            ["i", "don't", "think", "it's", "ready"],
        )

    def test_abbreviations_and_contractions_together(self):
        """Abbreviations and contractions are both preserved in one sentence."""
        self._assert_tokens(
            "U.S. citizens don't always agree",
            ["u.s.", "citizens", "don't", "always", "agree"],
        )
| 1259 | + |
1184 | 1260 | @pytest.fixture |
1185 | 1261 | def basic_config(): |
1186 | 1262 | """Basic tokenizer configuration for tests.""" |
|
0 commit comments