Add tests for dataset extraction script

Kaushik-Kumar-CEG · Kaushik-Kumar-CEG · commit 2541b321c89e · 2026-06-13T19:57:36.000Z
diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py
@@ -186,7 +186,6 @@ def main(rules_dir, output_dir):
     click.echo(f'  train: {len(splits["train"])}  val: {len(splits["val"])}  test: {len(splits["test"])}')
     click.echo(f'  output: {out_dir}')
 
-# stuff to do(follow up commits):
-# tests to be added in script
+
 if __name__ == '__main__':
     main()
diff --git a/etc/scripts/dataset_pipeline/test_build_dataset.py b/etc/scripts/dataset_pipeline/test_build_dataset.py
@@ -0,0 +1,101 @@
+# Unit tests for build_dataset.py: normalize_phrase, tag_tokens and assign_splits.
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from build_dataset import normalize_phrase, tag_tokens, assign_splits
+
+
+class TestNormalizePhrase:  # pins the cleaned-up string normalize_phrase returns for raw phrases
+
+    def test_html_entities(self):  # entity-encoded tags vanish; &quot; and &amp; come back decoded
+        assert normalize_phrase('&lt;b&gt;MIT&lt;/b&gt;') == 'MIT'
+        assert normalize_phrase('the &quot;License&quot;') == 'the "License"'
+        assert normalize_phrase('foo &amp; bar') == 'foo & bar'
+
+    def test_preserves_urls_in_angle_brackets(self):  # only the brackets go, the URL text stays
+        result = normalize_phrase('<http://example.com/LICENSE>')
+        assert result == 'http://example.com/LICENSE'
+
+    def test_strips_xml_tags(self):  # opening/closing tags are removed, inner text is kept
+        assert normalize_phrase('<name>Apache 2.0</name>') == 'Apache 2.0'
+        assert normalize_phrase('<license>MIT</license>') == 'MIT'
+
+    def test_strips_backticks(self):  # surrounding backticks are dropped
+        assert normalize_phrase('`MIT License`') == 'MIT License'
+
+    def test_collapses_whitespace(self):  # runs of spaces/newlines become one single space
+        assert normalize_phrase('GNU  General\n Public   License') == 'GNU General Public License'
+
+    def test_strips_trailing_punct(self):  # inner '.' in '2.0' survives; only edge punctuation goes
+        assert normalize_phrase('Apache 2.0.') == 'Apache 2.0'
+        assert normalize_phrase(',MIT,') == 'MIT'  # NOTE: this case also covers a *leading* comma
+
+    def test_empty_after_strip(self):  # input that is only a tag or only dots yields ''
+        assert normalize_phrase('<foo>') == ''
+        assert normalize_phrase('...') == ''
+
+
+class TestTagTokens:  # tag_tokens returns (tokens, labels); {{...}} spans get *-REQ labels, rest 'O'
+
+    def test_single_phrase(self):  # two-word span: first word B-REQ, last word E-REQ
+        tokens, labels = tag_tokens('under the {{Apache License}} terms')
+        assert tokens == ['under', 'the', 'Apache', 'License', 'terms']
+        assert labels == ['O', 'O', 'B-REQ', 'E-REQ', 'O']
+
+    def test_single_word_phrase(self):  # one-word span is labelled S-REQ
+        tokens, labels = tag_tokens('use {{MIT}} license')
+        assert tokens == ['use', 'MIT', 'license']
+        assert labels == ['O', 'S-REQ', 'O']
+
+    def test_multiple_phrases(self):  # each marked span is labelled independently
+        tokens, labels = tag_tokens('{{Apache}} and {{MIT}} stuff')
+        assert tokens == ['Apache', 'and', 'MIT', 'stuff']
+        assert labels == ['S-REQ', 'O', 'S-REQ', 'O']
+
+    def test_long_phrase(self):  # words between the first and last of a span are I-REQ
+        tokens, labels = tag_tokens('{{GNU General Public License}}')
+        assert tokens == ['GNU', 'General', 'Public', 'License']
+        assert labels == ['B-REQ', 'I-REQ', 'I-REQ', 'E-REQ']
+
+    def test_no_markers(self):  # text without markers is all 'O'
+        tokens, labels = tag_tokens('released under the license')
+        assert tokens == ['released', 'under', 'the', 'license']
+        assert labels == ['O', 'O', 'O', 'O']
+
+    def test_alignment(self):  # only checks one label per token, not the label values themselves
+        tokens, labels = tag_tokens('licensed under {{Apache License}} or {{MIT}}')
+        assert len(tokens) == len(labels)
+
+    def test_empty_input(self):  # empty string gives two empty lists
+        tokens, labels = tag_tokens('')
+        assert tokens == []
+        assert labels == []
+
+    def test_empty_markers_ignored(self):  # '{{}}' contributes no token and no REQ label
+        tokens, labels = tag_tokens('licensed under {{}} the GPL')
+        assert tokens == ['licensed', 'under', 'the', 'GPL']
+        assert labels == ['O', 'O', 'O', 'O']
+
+
+class TestAssignSplits:  # assign_splits(results) returns (heavy, assignment) as used below
+
+    def test_light_expressions_no_leakage(self):  # 5 expressions x 10 rules each, none heavy
+        results = []
+        for i in range(5):
+            for j in range(10):
+                results.append({'license_expression': f'license-{i}', 'identifier': f'rule_{i}_{j}.RULE'})
+
+        heavy, assignment = assign_splits(results)
+        assert len(heavy) == 0
+        assert len(assignment) == 5  # one entry per expression, not one per rule
+        assert all(s in ('train', 'val', 'test') for s in assignment.values())
+
+    def test_heavy_expressions_detected(self):  # 100 'mit' rules vs. a single 'rare-1.0' rule
+        results = [{'license_expression': 'mit', 'identifier': f'mit_{i}.RULE'} for i in range(100)]
+        results += [{'license_expression': 'rare-1.0', 'identifier': 'rare_1.RULE'}]
+
+        heavy, assignment = assign_splits(results)
+        assert 'mit' in heavy  # NOTE(review): heavy threshold is not visible here -- confirm 100 exceeds it
+        assert 'rare-1.0' not in heavy
+        assert 'rare-1.0' in assignment