
Commit b7ac227

[RELEASE] v0.9.0 (#219)
1 parent f3c7fce commit b7ac227

File tree: 6 files changed (+116 −19 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -9,3 +9,4 @@ VERSION
 *.DS_Store
 .env*
 .serena/cache
+.specify/
```

.serena/.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+/cache
```

README.md

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,28 @@
1-
<h2 align="center">mango-tango-cli</h2>
2-
<h3 align="center">A Python command-line tool for detecting coordinated inauthentic behavior</h3>
1+
<h2 align="center">CIB Mango Tree</h2>
2+
<h3 align="center">An Interactive Command Line and Dashboard Tool for Detecting Coordinated Inauthentic Behavior Datasets of Online Activity</h3>
33

44
<p align="center">
55
<img src="https://raw.githubusercontent.com/CIB-Mango-Tree/CIB-Mango-Tree-Website/main/assets/images/mango-text.PNG" alt="Mango logo" style="width:200px;"/>
66
</p>
77

88
<p align="center">
9-
<a href="https://www.python.org/"><img alt="code" src="https://img.shields.io/badge/code-Python%203.12-blue?logo=Python"></a>
9+
<a href="https://www.python.org/"><img alt="code" src="https://img.shields.io/badge/Python-3.12-blue?logo=Python"></a>
10+
<a href="https://docs.astral.sh/ruff/"><img alt="style: black" src="https://img.shields.io/badge/Polars-1.9-skyblue?logo=Polars"></a>
11+
<a href="https://plotly.com/python/"><img alt="style: black" src="https://img.shields.io/badge/Plotly-5.24.1-purple?logo=Plotly"></a>
12+
<a href="https://github.com/Textualize/rich"><img alt="style: black" src="https://img.shields.io/badge/Rich-14.0.0-gold?logo=Rich"></a>
13+
<a href="https://civictechdc.github.io/mango-tango-cli/"><img alt="style: black" src="https://img.shields.io/badge/docs-website-blue"></a>
1014
<a href="https://black.readthedocs.io/en/stable/"><img alt="style: black" src="https://img.shields.io/badge/style-Black-black?logo=Black"></a>
11-
<a href="https://docs.astral.sh/ruff/"><img alt="style: black" src="https://img.shields.io/badge/tool-Polars-skyblue?logo=Polars"></a>
1215
</p>
1316

1417
---
1518

1619
## Technical Documentation
1720

18-
For in-depth technical docs related to this repository please visit [https://civictechdc.github.io/mango-tango-cli](https://civictechdc.github.io/mango-tango-cli)
21+
For in-depth technical docs related to this repository please visit: [https://civictechdc.github.io/mango-tango-cli](https://civictechdc.github.io/mango-tango-cli)
1922

2023
## Requirements
2124

22-
Python 3.12
25+
Python 3.12 (see [requirements.txt](https://github.com/civictechdc/mango-tango-cli/blob/main/requirements.txt))
2326

2427
## Setting up
2528

@@ -41,7 +44,7 @@ python -m venv venv
4144
## Starting the application
4245

4346
```shell
44-
python -m mangotango
47+
python -m cibmangotree
4548
```
4649

4750
## Development Guide and Documentation

services/tokenizer/basic/patterns.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -98,7 +98,9 @@
 )
 
 # Word patterns for different script types
-LATIN_WORD_PATTERN = r"[a-zA-Z]+(?:\'[a-zA-Z]+)*"  # Handle contractions
+
+LATIN_WORD_PATTERN = r"[a-zA-Z]+(?:\.[a-zA-Z]+)+\.?|[a-zA-Z]+(?:\'[a-zA-Z]+)*"  # Handle abbreviations and contractions
+
 WORD_PATTERN = f"(?:{LATIN_WORD_PATTERN}|{CJK_PATTERN}+|{ARABIC_PATTERN}+|{THAI_PATTERN}+|{SEA_PATTERN}+)"
 
 # Punctuation (preserve some, group others)
```
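For a quick check of what the widened pattern accepts, here is a minimal standalone sketch: it copies only `LATIN_WORD_PATTERN` out of the module and probes it with `re.fullmatch` (the sample tokens are illustrative):

```python
import re

# New Latin word pattern from the diff: an abbreviation branch (letter runs
# separated by periods, optional trailing period) tried before the
# contraction branch (letter runs joined by apostrophes).
LATIN_WORD_PATTERN = r"[a-zA-Z]+(?:\.[a-zA-Z]+)+\.?|[a-zA-Z]+(?:\'[a-zA-Z]+)*"

for sample in ["U.S.", "c.e.o.s", "don't", "hello"]:
    match = re.fullmatch(LATIN_WORD_PATTERN, sample)
    print(sample, "->", match.group(0) if match else None)

# U.S.    -> U.S.     (abbreviation branch, trailing period kept)
# c.e.o.s -> c.e.o.s  (abbreviation branch)
# don't   -> don't    (contraction branch)
# hello   -> hello    (contraction branch, zero apostrophe groups)
```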

services/tokenizer/basic/test_basic_tokenizer.py

Lines changed: 77 additions & 1 deletion
```diff
@@ -8,7 +8,7 @@
 
 import pytest
 
-from ..core.types import CaseHandling, LanguageFamily, TokenizerConfig, TokenType
+from ..core.types import CaseHandling, TokenizerConfig
 from .tokenizer import BasicTokenizer
 
 
@@ -1181,6 +1181,82 @@ def test_international_social_media(self):
 
 
 # Fixtures for reusable test data
+
+
+class TestAbbreviationsAndPunctuation:
+    """Test abbreviation handling and punctuation edge cases."""
+
+    def test_abbreviations_basic(self):
+        """Test basic abbreviation tokenization - abbreviations should stay intact."""
+        tokenizer = BasicTokenizer()
+        text = "The c.e.o.s met yesterday"
+        result = tokenizer.tokenize(text)
+
+        # Abbreviations should be preserved as single tokens
+        expected = ["the", "c.e.o.s", "met", "yesterday"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+    def test_abbreviations_with_trailing_period(self):
+        """Test abbreviation with trailing sentence period."""
+        tokenizer = BasicTokenizer()
+        text = "I live in U.S. now"
+        result = tokenizer.tokenize(text)
+
+        # Abbreviation should be preserved, period is part of the abbreviation
+        expected = ["i", "live", "in", "u.s.", "now"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+    def test_multiple_abbreviations(self):
+        """Test multiple abbreviations in the same sentence."""
+        tokenizer = BasicTokenizer()
+        text = "U.S. and U.K. relations"
+        result = tokenizer.tokenize(text)
+
+        # Both abbreviations should be preserved
+        expected = ["u.s.", "and", "u.k.", "relations"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+    def test_ellipses_without_punctuation(self):
+        """Test ellipses handling - ellipses should be filtered out by default."""
+        tokenizer = BasicTokenizer()
+        text = "Wait for it..."
+        result = tokenizer.tokenize(text)
+
+        # Ellipses should be removed with default config (include_punctuation=False)
+        expected = ["wait", "for", "it"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+    def test_chinese_tokenization_regression(self):
+        """Test that Chinese character tokenization still works correctly (regression check)."""
+        tokenizer = BasicTokenizer()
+        text = "你好世界"
+        result = tokenizer.tokenize(text)
+
+        # Chinese should still be tokenized character by character
+        expected = ["你", "好", "世", "界"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+    def test_contractions_regression(self):
+        """Test that contractions are still handled correctly (regression check)."""
+        tokenizer = BasicTokenizer()
+        text = "I don't think it's ready"
+        result = tokenizer.tokenize(text)
+
+        # Contractions should be preserved as single tokens
+        expected = ["i", "don't", "think", "it's", "ready"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+    def test_abbreviations_and_contractions_together(self):
+        """Test complex sentence with both abbreviations and contractions."""
+        tokenizer = BasicTokenizer()
+        text = "U.S. citizens don't always agree"
+        result = tokenizer.tokenize(text)
+
+        # Both abbreviations and contractions should be preserved
+        expected = ["u.s.", "citizens", "don't", "always", "agree"]
+        assert result == expected, f"Expected {expected}, got {result}"
+
+
 @pytest.fixture
 def basic_config():
     """Basic tokenizer configuration for tests."""
```

services/tokenizer/basic/tokenizer.py

Lines changed: 24 additions & 10 deletions
```diff
@@ -9,7 +9,7 @@
 from typing import Optional
 
 from ..core.base import AbstractTokenizer
-from ..core.types import LanguageFamily, TokenizerConfig, TokenList, TokenType
+from ..core.types import LanguageFamily, TokenizerConfig, TokenList
 from .patterns import get_patterns
 
 
@@ -219,15 +219,29 @@ def _is_url_like(self, token: str) -> bool:
         if self._is_email_like(token):
             return False
 
-        return (
-            token.startswith(("http://", "https://", "www."))
-            or "://" in token
-            or (
-                token.count(".") >= 1
-                and any(c.isalpha() for c in token)
-                and "@" not in token
-            )
-        )
+        # Explicit URL indicators (http://, https://, www., or protocol markers)
+        if token.startswith(("http://", "https://", "www.")) or "://" in token:
+            return True
+
+        # Domain-like patterns (e.g., "example.com")
+        # But NOT abbreviations (e.g., "U.S.", "c.e.o.s")
+        # Heuristic: URLs have at least one period NOT followed by a single uppercase/lowercase letter
+        # This allows "example.com" but excludes "U.S." and "c.e.o.s"
+        if (
+            token.count(".") >= 1
+            and any(c.isalpha() for c in token)
+            and "@" not in token
+        ):
+            # Check if this looks like an abbreviation (single letters between periods)
+            # Pattern: letter(s).letter(s).letter(s) where segments are 1-3 chars
+            abbreviation_pattern = r"^[a-z]{1,3}(?:\.[a-z]{1,3})+\.?$"
+
+            if re.match(abbreviation_pattern, token, re.IGNORECASE):
+                return False  # This is an abbreviation, not a URL
+            # If it has a period and looks like a domain, it's URL-like
+            return True
+
+        return False
 
     def _is_email_like(self, token: str) -> bool:
         """Check if token looks like an email address."""
```
