|
8 | 8 |
|
9 | 9 | import pytest |
10 | 10 |
|
11 | | -from ..core.types import CaseHandling, LanguageFamily, TokenizerConfig, TokenType |
| 11 | +from ..core.types import CaseHandling, TokenizerConfig |
12 | 12 | from .tokenizer import BasicTokenizer |
13 | 13 |
|
14 | 14 |
|
@@ -1181,6 +1181,82 @@ def test_international_social_media(self): |
1181 | 1181 |
|
1182 | 1182 |
|
1183 | 1183 | # Fixtures for reusable test data |
| 1184 | + |
| 1185 | + |
class TestAbbreviationsAndPunctuation:
    """Test abbreviation handling and punctuation edge cases.

    Covers abbreviation preservation, ellipsis filtering, and regression
    checks for Chinese character splitting and English contractions.
    """

    def _assert_tokens(self, text, expected):
        """Tokenize *text* with a default BasicTokenizer and compare against *expected*."""
        result = BasicTokenizer().tokenize(text)
        assert result == expected, f"Expected {expected}, got {result}"

    def test_abbreviations_basic(self):
        """Abbreviations should stay intact as single tokens."""
        self._assert_tokens(
            "The c.e.o.s met yesterday",
            ["the", "c.e.o.s", "met", "yesterday"],
        )

    def test_abbreviations_with_trailing_period(self):
        """A trailing sentence period stays part of the abbreviation token."""
        self._assert_tokens(
            "I live in U.S. now",
            ["i", "live", "in", "u.s.", "now"],
        )

    def test_multiple_abbreviations(self):
        """Several abbreviations in one sentence are each preserved."""
        self._assert_tokens(
            "U.S. and U.K. relations",
            ["u.s.", "and", "u.k.", "relations"],
        )

    def test_ellipses_without_punctuation(self):
        """Ellipses are filtered out under the default config (include_punctuation=False)."""
        self._assert_tokens(
            "Wait for it...",
            ["wait", "for", "it"],
        )

    def test_chinese_tokenization_regression(self):
        """Regression check: Chinese text is still split character by character."""
        self._assert_tokens(
            "你好世界",
            ["你", "好", "世", "界"],
        )

    def test_contractions_regression(self):
        """Regression check: contractions remain single tokens."""
        self._assert_tokens(
            "I don't think it's ready",
            ["i", "don't", "think", "it's", "ready"],
        )

    def test_abbreviations_and_contractions_together(self):
        """Abbreviations and contractions are both preserved in one sentence."""
        self._assert_tokens(
            "U.S. citizens don't always agree",
            ["u.s.", "citizens", "don't", "always", "agree"],
        )
| 1259 | + |
1184 | 1260 | @pytest.fixture |
1185 | 1261 | def basic_config(): |
1186 | 1262 | """Basic tokenizer configuration for tests.""" |
|
0 commit comments