Add file extension validation (#30)
* Add file extension validation

* Bump patch version
annaCPR authored Nov 11, 2024
1 parent f65de82 commit 992c989
Showing 3 changed files with 153 additions and 1 deletion.
11 changes: 11 additions & 0 deletions gcf_data_mapper/parsers/document.py
@@ -1,3 +1,4 @@
import os
from typing import Any, Optional, cast
from urllib.parse import urlparse

@@ -17,6 +18,8 @@
verify_required_fields_present,
)

SUPPORTED_FILE_EXTENSIONS = [".pdf", ".html"]


def contains_duplicate_urls(urls: list[str]) -> bool:
"""Check a list of urls for any duplicate entries.
@@ -195,6 +198,14 @@ def process_row(row: pd.Series, debug: bool) -> Optional[list[dict[str, Any]]]:
click.echo(f"🛑 Skipping row with missing required document columns: {doc_id}")
return None

source_url = row.at[RequiredDocumentColumns.SOURCE_URL.value]
_, ext = os.path.splitext(source_url)
if ext.lower() not in SUPPORTED_FILE_EXTENSIONS:
click.echo(
f"🛑 Skipping row as [{ext}] is not a valid file ext. Project ID: {doc_id}"
)
return None

mapped_docs = [map_document_metadata(row, DocumentVariantNames.ORIGINAL.value)]
if has_translated_files(row):
translated_docs = map_translated_files(row)
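For reference, the following is a minimal standalone sketch (not part of this commit) of the behaviour the hunk above introduces. The helper name check_extension is a hypothetical illustration, but the os.path.splitext / lower() logic and the SUPPORTED_FILE_EXTENSIONS list mirror the added code. Lower-casing the extension is what lets source URLs ending in .PDF or .HTML through, which the new parametrised tests below exercise.

# Illustrative sketch only: check_extension is a hypothetical helper, not part
# of gcf_data_mapper; it mirrors the splitext/lower check added in this commit.
import os

SUPPORTED_FILE_EXTENSIONS = [".pdf", ".html"]


def check_extension(source_url: str) -> bool:
    """Return True when the URL ends in a supported extension (case-insensitive)."""
    _, ext = os.path.splitext(source_url)
    return ext.lower() in SUPPORTED_FILE_EXTENSIONS


if __name__ == "__main__":
    # Mirrors the parametrised test cases: upper-case variants pass, .xlsx is rejected.
    for url in ["link123.pdf", "link123.PDF", "link123.HTML", "link123.xlsx"]:
        print(url, "->", "accepted" if check_extension(url) else "skipped")
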
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gcf-data-mapper"
version = "0.1.14"
version = "0.1.15"
description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team <[email protected]>"]
license = "Apache-2.0"
141 changes: 141 additions & 0 deletions tests/unit_tests/parsers/document/test_process_row.py
@@ -89,3 +89,144 @@ def test_handles_data_with_leading_and_trailing_whitespace(
]

assert expected_mapped_doc == process_row(mock_valid_row_with_whitespace, False)


@pytest.mark.parametrize(
("test_ds,expected_return,error_message"),
[
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.pdf",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.pdf",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.PDF",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.PDF",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.html",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.html",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.HTML",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
[
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["Test type"]},
"title": "Test title",
"source_url": "link123.HTML",
"variant_name": "Original Language",
}
],
None,
),
(
pd.Series(
{
"ApprovedRef": "ref123",
"ProjectsID": "proj123",
"ID (Unique ID from our CMS for the document)": "doc123",
"Type": "Test type",
"Title": "Test title",
"Main file (English)": "link123.xlsx",
"Document page permalink": "link123",
"Translated files": pd.NA,
"Translated titles": pd.NA,
}
),
None,
"🛑 Skipping row as [.xlsx] is not a valid file ext. Project ID: doc123",
),
],
)
def test_validates_url_has_a_supported_extension(
test_ds: pd.Series,
expected_return,
error_message: str,
capsys,
):
document_data = process_row(test_ds, debug=False)

assert expected_return == document_data

if error_message:
captured = capsys.readouterr()
assert error_message == captured.out.strip()
