Add file extension validation (#30)
* Add file extension validation

* Bump patch version
annaCPR authored Nov 11, 2024
1 parent f65de82 commit 992c989
Showing 3 changed files with 153 additions and 1 deletion.
11 changes: 11 additions & 0 deletions gcf_data_mapper/parsers/document.py
@@ -1,3 +1,4 @@
import os
from typing import Any, Optional, cast
from urllib.parse import urlparse

@@ -17,6 +18,8 @@
verify_required_fields_present,
)

SUPPORTED_FILE_EXTENSIONS = [".pdf", ".html"]


def contains_duplicate_urls(urls: list[str]) -> bool:
"""Check a list of urls for any duplicate entries.
@@ -195,6 +198,14 @@ def process_row(row: pd.Series, debug: bool) -> Optional[list[dict[str, Any]]]:
click.echo(f"🛑 Skipping row with missing required document columns: {doc_id}")
return None

source_url = row.at[RequiredDocumentColumns.SOURCE_URL.value]
_, ext = os.path.splitext(source_url)
if ext.lower() not in SUPPORTED_FILE_EXTENSIONS:
click.echo(
f"🛑 Skipping row as [{ext}] is not a valid file ext. Project ID: {doc_id}"
)
return None

mapped_docs = [map_document_metadata(row, DocumentVariantNames.ORIGINAL.value)]
if has_translated_files(row):
translated_docs = map_translated_files(row)
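For reference, the following is a minimal standalone sketch (not part of this commit) of the behaviour the hunk above introduces. The helper name check_extension is a hypothetical illustration, but the os.path.splitext / lower() logic and the SUPPORTED_FILE_EXTENSIONS list mirror the added code. Lower-casing the extension is what lets source URLs ending in .PDF or .HTML through, which the new parametrised tests below exercise.

# Illustrative sketch only: check_extension is a hypothetical helper, not part
# of gcf_data_mapper; it mirrors the splitext/lower check added in this commit.
import os

SUPPORTED_FILE_EXTENSIONS = [".pdf", ".html"]


def check_extension(source_url: str) -> bool:
    """Return True when the URL ends in a supported extension (case-insensitive)."""
    _, ext = os.path.splitext(source_url)
    return ext.lower() in SUPPORTED_FILE_EXTENSIONS


if __name__ == "__main__":
    # Mirrors the parametrised test cases: upper-case variants pass, .xlsx is rejected.
    for url in ["link123.pdf", "link123.PDF", "link123.HTML", "link123.xlsx"]:
        print(url, "->", "accepted" if check_extension(url) else "skipped")
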
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gcf-data-mapper"
version = "0.1.14"
version = "0.1.15"
description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team <[email protected]>"]
license = "Apache-2.0"
141 changes: 141 additions & 0 deletions tests/unit_tests/parsers/document/test_process_row.py
@@ -89,3 +89,144 @@ def test_handles_data_with_leading_and_trailing_whitespace(
]

assert expected_mapped_doc == process_row(mock_valid_row_with_whitespace, False)


@pytest.mark.parametrize(
("test_ds,expected_return,error_message"),
[
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.pdf",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.pdf",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.PDF",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.PDF",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.html",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.html",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.HTML",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.HTML",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.xlsx",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
None,
"🛑 Skipping row as [.xlsx] is not a valid file ext. Project ID: doc123",
),
],
)
def test_validates_url_has_a_supported_extension(
test_ds: pd.Series,
expected_return,
error_message: str,
capsys,
):
document_data = process_row(test_ds, debug=False)

assert expected_return == document_data

if error_message:
captured = capsys.readouterr()
assert error_message == captured.out.strip()
