From 19d663a2cf6409086fba73077e7bdb5d96edce60 Mon Sep 17 00:00:00 2001 From: Sudhendra Date: Wed, 11 Feb 2026 18:40:54 -0600 Subject: [PATCH 1/3] fix: resolve 3 remaining issues from PR #16 review - Fix NameError on malformed JSON lines by initializing stats/unsanitized_data before the JSONL loading loop instead of after it - Move dagshub/mlflow from core dependencies to optional [mlflow] extra - Remove stale RULE C FIX docstring, duplicate import re, and unused Tuple import; use builtin tuple[] instead of typing.Tuple - Add test for malformed JSON line handling in sanitize_and_extract --- pyproject.toml | 6 +- scripts/data_sanitization.py | 145 +++++++++++++++-------------- tests/test_sanitization_manager.py | 26 ++++++ 3 files changed, 103 insertions(+), 74 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1fa1020..e3bae24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,11 +22,13 @@ dependencies = [ "datasets>=3.0.0", "tinker", "matplotlib>=3.8.0", - "dagshub", - "mlflow", ] [project.optional-dependencies] +mlflow = [ + "dagshub", + "mlflow", +] dev = [ "pytest>=8.0.0", "pytest-asyncio>=0.24.0", diff --git a/scripts/data_sanitization.py b/scripts/data_sanitization.py index bfad34c..080c160 100644 --- a/scripts/data_sanitization.py +++ b/scripts/data_sanitization.py @@ -356,26 +356,9 @@ def rule_b_orphaned_symbols(compressed: str, is_code: bool) -> tuple[bool, str]: return True, "" -""" -RULE C FIX: Drop-in replacement for rule_c_negation_preservation - -Replace the existing rule_c_negation_preservation function in your -sanitization script with these functions. - -FIXES: -1. Unicode apostrophes (can't vs can't) - CRITICAL -2. Flexible whitespace in multi-word phrases -3. Reduced false positives from symbols (~, !) -4. Proper contraction handling as suffix, not standalone word -""" - -import re -from typing import Tuple - - def _normalize_apostrophes(text: str) -> str: """Normalize Unicode apostrophes to ASCII.""" - unicode_apostrophes = ['\u2019', '\u2018', '\u02BC', '\u0060'] + unicode_apostrophes = ["\u2019", "\u2018", "\u02bc", "\u0060"] normalized = text for unicode_apos in unicode_apostrophes: normalized = normalized.replace(unicode_apos, "'") @@ -384,8 +367,8 @@ def _normalize_apostrophes(text: str) -> str: def _normalize_whitespace(text: str) -> str: """Normalize whitespace for consistent matching.""" - normalized = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ') - normalized = re.sub(r'\s+', ' ', normalized) + normalized = text.replace("\n", " ").replace("\t", " ").replace("\r", " ") + normalized = re.sub(r"\s+", " ", normalized) return normalized.strip() @@ -398,11 +381,21 @@ def _has_contractions(text: str) -> bool: def _has_standalone_negation(text: str) -> bool: """Detect standalone negation words.""" normalized = _normalize_whitespace(text.lower()) - words = ["not", "no", "never", "neither", "nor", - "without", "none", "nothing", "nobody", "nowhere"] - + words = [ + "not", + "no", + "never", + "neither", + "nor", + "without", + "none", + "nothing", + "nobody", + "nowhere", + ] + for word in words: - if re.search(r'\b' + re.escape(word) + r'\b', normalized): + if re.search(r"\b" + re.escape(word) + r"\b", normalized): return True return False @@ -410,67 +403,67 @@ def _has_standalone_negation(text: str) -> bool: def _has_multiword_negation(text: str) -> bool: """Detect multi-word negation phrases with flexible whitespace.""" normalized = _normalize_whitespace(text.lower()) - patterns = [r'\bno\s+longer\b', r'\bno\s+more\b', r'\bnot\s+anymore\b'] - + patterns = [r"\bno\s+longer\b", r"\bno\s+more\b", r"\bnot\s+anymore\b"] + return any(re.search(pattern, normalized) for pattern in patterns) def _has_negation_symbols(text: str, strict: bool = True) -> bool: """ Detect negation symbols with reduced false positives. - + Args: strict: If True, only check ¬ (unambiguous) If False, also check ~ and ! (more permissive) """ - if '¬' in text: + if "¬" in text: return True - + if not strict: # Check for ! but avoid != and !! - if re.search(r'![^=!]', text): + if re.search(r"![^=!]", text): return True # Check for ~ but avoid ~/ and ~digits - if re.search(r'~(?![/\d])', text): + if re.search(r"~(?![/\d])", text): return True - + return False -def rule_c_negation_preservation(verbose: str, compressed: str) -> Tuple[bool, str]: +def rule_c_negation_preservation(verbose: str, compressed: str) -> tuple[bool, str]: """ Rule C: Remove samples that lost negation (NL only). - + FIXED VERSION addressing: - Unicode apostrophes in contractions (can't vs can't) - Flexible whitespace in multi-word phrases - Reduced false positives from symbols - Proper contraction detection - + Returns: (passed, reason) - passed=False if negation was lost """ # Check if input has any form of negation has_input_negation = ( - _has_contractions(verbose) or - _has_standalone_negation(verbose) or - _has_multiword_negation(verbose) + _has_contractions(verbose) + or _has_standalone_negation(verbose) + or _has_multiword_negation(verbose) ) - + if not has_input_negation: return True, "" # No negation to preserve - + # Input has negation - check if output preserved it has_output_negation = ( - _has_contractions(compressed) or - _has_standalone_negation(compressed) or - _has_multiword_negation(compressed) or - _has_negation_symbols(compressed, strict=False) # Allow symbols in compressed + _has_contractions(compressed) + or _has_standalone_negation(compressed) + or _has_multiword_negation(compressed) + or _has_negation_symbols(compressed, strict=False) # Allow symbols in compressed ) - + if not has_output_negation: return False, "Negation lost" - + return True, "" @@ -514,6 +507,27 @@ def sanitize_and_extract(input_path: Path, sanitized_path: Path, unsanitized_pat data = [] + stats = { + "total_input": 0, + "code_samples": 0, + "nl_samples": 0, + "code_passed": 0, + "nl_passed": 0, + "rule_a_failed": 0, + "rule_b_failed": 0, + "rule_c_failed": 0, + "rule_d_failed": 0, + "parse_errors": 0, + "passed_all": 0, + "failed_all": 0, + "failed_samples": [], + "passed_samples": [], + "parse_error_samples": [], + } + + sanitized_data = [] + unsanitized_data = [] + with open(input_path, encoding="utf-8") as f: for idx, line in enumerate(f): if not line.strip(): @@ -540,29 +554,9 @@ def sanitize_and_extract(input_path: Path, sanitized_path: Path, unsanitized_pat print(f"⚠ {error_msg}") continue - print(f"✓ Loaded {len(data)} samples\n") - stats = { - "total_input": len(data), - "code_samples": 0, - "nl_samples": 0, - "code_passed": 0, - "nl_passed": 0, - "rule_a_failed": 0, - "rule_b_failed": 0, - "rule_c_failed": 0, - "rule_d_failed": 0, - "parse_errors": 0, - "passed_all": 0, - "failed_all": 0, - "failed_samples": [], - "passed_samples": [], - "parse_error_samples": [], - } - - sanitized_data = [] - unsanitized_data = [] + stats["total_input"] = len(data) print("Processing samples...\n") @@ -695,12 +689,20 @@ def pct(n: int, d: int) -> float: print(f" NL samples: {nl:5d} ({pct(nl, total):5.1f}%)") print() - print(f"✓ SANITIZED (passed): {stats['passed_all']:5d} ({pct(stats['passed_all'], total):5.1f}%)") - print(f" Code: {stats['code_passed']:5d} ({pct(stats['code_passed'], code):5.1f}%)") - print(f" NL: {stats['nl_passed']:5d} ({pct(stats['nl_passed'], nl):5.1f}%)") + print( + f"✓ SANITIZED (passed): {stats['passed_all']:5d} ({pct(stats['passed_all'], total):5.1f}%)" + ) + print( + f" Code: {stats['code_passed']:5d} ({pct(stats['code_passed'], code):5.1f}%)" + ) + print( + f" NL: {stats['nl_passed']:5d} ({pct(stats['nl_passed'], nl):5.1f}%)" + ) print() - print(f"✗ UNSANITIZED (failed): {stats['failed_all']:5d} ({pct(stats['failed_all'], total):5.1f}%)") + print( + f"✗ UNSANITIZED (failed): {stats['failed_all']:5d} ({pct(stats['failed_all'], total):5.1f}%)" + ) print() print("Failed by rule:") @@ -733,7 +735,6 @@ def pct(n: int, d: int) -> float: print() - # ============================================================================ # CLI # ============================================================================ diff --git a/tests/test_sanitization_manager.py b/tests/test_sanitization_manager.py index 74c80e0..42e7f3f 100644 --- a/tests/test_sanitization_manager.py +++ b/tests/test_sanitization_manager.py @@ -267,6 +267,32 @@ def test_sanitize_dataset_splits_correctly(self, temp_dir): assert sanitized_file.exists() assert unsanitized_file.exists() + def test_malformed_json_line_does_not_crash(self, temp_dir): + """Test that malformed JSON lines are tracked as parse errors, not NameError.""" + + input_file = temp_dir / "train.jsonl" + sanitized_file = temp_dir / "sanitized.jsonl" + unsanitized_file = temp_dir / "unsanitized.jsonl" + + good_sample = { + "messages": [ + {"role": "user", "content": "Compress: one two three four"}, + {"role": "assistant", "content": "1 2"}, + ] + } + + with open(input_file, "w", encoding="utf-8") as f: + f.write(json.dumps(good_sample, ensure_ascii=False) + "\n") + f.write("{this is not valid json}\n") # malformed line + f.write(json.dumps(good_sample, ensure_ascii=False) + "\n") + + # Should NOT raise NameError + stats = sanitize_and_extract(input_file, sanitized_file, unsanitized_file) + + assert stats["parse_errors"] == 1 + assert len(stats["parse_error_samples"]) == 1 + assert stats["total_input"] == 2 # only 2 valid samples loaded + def test_unicode_symbols_preserved(self, temp_dir): """Test that → ∵ @ symbols are NOT escaped in sanitized output.""" From 2b2f9a0fdf17159396f04ce1a9a722a5cec02b87 Mon Sep 17 00:00:00 2001 From: Sudhendra Date: Wed, 11 Feb 2026 18:43:45 -0600 Subject: [PATCH 2/3] style: apply ruff SIM110 simplification for _has_standalone_negation --- scripts/data_sanitization.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/data_sanitization.py b/scripts/data_sanitization.py index 080c160..394583e 100644 --- a/scripts/data_sanitization.py +++ b/scripts/data_sanitization.py @@ -394,10 +394,7 @@ def _has_standalone_negation(text: str) -> bool: "nowhere", ] - for word in words: - if re.search(r"\b" + re.escape(word) + r"\b", normalized): - return True - return False + return any(re.search(r"\b" + re.escape(word) + r"\b", normalized) for word in words) def _has_multiword_negation(text: str) -> bool: From 48083a94964d8799819e2165c51e0bec38c311b3 Mon Sep 17 00:00:00 2001 From: Sudhendra Date: Wed, 11 Feb 2026 18:52:33 -0600 Subject: [PATCH 3/3] fix: skip mlflow_logger tests when dagshub/mlflow not installed CI installs only the [dev] extra, not [mlflow]. Add pytest.importorskip for dagshub and mlflow so the test module is skipped gracefully when these optional dependencies are absent. --- tests/test_mlflow_logger.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_mlflow_logger.py b/tests/test_mlflow_logger.py index 26e58da..0c8bfa1 100644 --- a/tests/test_mlflow_logger.py +++ b/tests/test_mlflow_logger.py @@ -20,6 +20,10 @@ import pytest +# Skip entire module if dagshub/mlflow not installed (optional dependencies) +pytest.importorskip("dagshub", reason="dagshub not installed (optional [mlflow] extra)") +pytest.importorskip("mlflow", reason="mlflow not installed (optional [mlflow] extra)") + class MlflowRecorder: """Mock MLflow client that records all operations."""