
Commit f65de82

Strip whitespace from values (#28)
* Strip whitespace from row values when mapping family data

* Strip whitespace from row values when mapping docs

* Strip whitespace from row values when mapping events

* Bump patch version
annaCPR authored Nov 11, 2024
1 parent 933b4fa commit f65de82
Showing 11 changed files with 179 additions and 6 deletions.
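
The common thread across the parsers below is that each row is passed through a small recursive helper, strip_nested, before it is mapped, so leading and trailing whitespace is removed from plain strings as well as from strings nested inside lists and dicts. A minimal sketch of the idea, using an illustrative row rather than real GCF data:

import pandas as pd

from gcf_data_mapper.parsers.helpers import strip_nested

# Illustrative row: a top-level string plus a nested structure, both padded with whitespace.
row = pd.Series({"ProjectsID": " AAABBB ", "Countries": [{"ISO3": " BGD "}]})
cleaned = row.apply(strip_nested)

assert cleaned["ProjectsID"] == "AAABBB"
assert cleaned["Countries"][0]["ISO3"] == "BGD"
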
3 changes: 2 additions & 1 deletion .trunk/configs/cspell.json
@@ -41,7 +41,8 @@
"isin",
"pydantic",
"getfixturevalue",
"isna"
"isna",
"AAABBB"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
3 changes: 3 additions & 0 deletions gcf_data_mapper/parsers/document.py
@@ -13,6 +13,7 @@
)
from gcf_data_mapper.parsers.helpers import (
check_required_column_value_not_na,
strip_nested,
verify_required_fields_present,
)

@@ -177,6 +178,8 @@ def process_row(row: pd.Series, debug: bool) -> Optional[list[dict[str, Any]]]:
the 'destination' format described in the GCF Data Mapper Google
Sheet.
"""
row = cast(pd.Series, row.apply(strip_nested))

doc_id = (
row.at[RequiredDocumentColumns.ID.value]
if RequiredDocumentColumns.ID.value in row.index
5 changes: 3 additions & 2 deletions gcf_data_mapper/parsers/event.py
@@ -1,10 +1,10 @@
from typing import Any, Optional
from typing import Any, Optional, cast

import click
import pandas as pd

from gcf_data_mapper.enums.event import Event, EventColumnNames, Events
from gcf_data_mapper.parsers.helpers import verify_required_fields_present
from gcf_data_mapper.parsers.helpers import strip_nested, verify_required_fields_present


def append_event(
@@ -134,6 +134,7 @@ def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
event_counter = {}

for _, row in projects_data.iterrows():
row = cast(pd.Series, row.apply(strip_nested))
approved_ref = row.at[EventColumnNames.APPROVED_REF.value]
projects_id = row.at[EventColumnNames.PROJECTS_ID.value]
process_event(row, gcf_events, event_counter, approved_ref, projects_id)
6 changes: 4 additions & 2 deletions gcf_data_mapper/parsers/family.py
@@ -1,4 +1,4 @@
from typing import Any, Iterable, Optional
from typing import Any, Iterable, Optional, cast

import click
import pandas as pd
@@ -12,6 +12,7 @@
from gcf_data_mapper.parsers.helpers import (
arrays_contain_empty_values,
row_contains_columns_with_empty_values,
strip_nested,
verify_required_fields_present,
)

@@ -222,6 +223,7 @@ def process_row(
)
return None

row = cast(pd.Series, row.apply(strip_nested))
return map_family_data(row)


@@ -255,7 +257,7 @@ def family(
)

for _, row in gcf_projects_data.iterrows():
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
projects_id = str(row.at[FamilyColumnsNames.PROJECTS_ID.value]).strip()
mapped_families.append(process_row(row, projects_id, list(required_fields)))

return mapped_families
13 changes: 13 additions & 0 deletions gcf_data_mapper/parsers/helpers.py
@@ -1,3 +1,5 @@
from typing import Any

import click
import pandas as pd

@@ -69,3 +71,14 @@ def arrays_contain_empty_values(list_values: list[tuple], id: str) -> bool:
return True

return False


def strip_nested(value: Any) -> Any:
"""Recursively strip strings in nested structures."""
if isinstance(value, str):
return value.strip()
elif isinstance(value, list):
return [strip_nested(item) for item in value]
elif isinstance(value, dict):
return {key: strip_nested(val) for key, val in value.items()}
return value
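
A quick behavioural sketch with illustrative values (not taken from the repo): strings are stripped wherever they sit in the structure, while non-string scalars such as numbers and None are returned unchanged.

nested = {
    "Source": " GCF ",
    "Budget": 9200000,  # non-string values pass through untouched
    "Tags": ["  adaptation ", None],
}

assert strip_nested(nested) == {
    "Source": "GCF",
    "Budget": 9200000,
    "Tags": ["adaptation", None],
}

The cast(pd.Series, row.apply(strip_nested)) wrapper used in the parsers is presumably there to satisfy static type checkers, since pandas stubs type Series.apply more loosely than a plain Series return.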
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gcf-data-mapper"
version = "0.1.13"
version = "0.1.14"
description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team <[email protected]>"]
license = "Apache-2.0"
16 changes: 16 additions & 0 deletions tests/unit_tests/parsers/document/conftest.py
@@ -97,6 +97,22 @@ def mock_valid_row():
)


@pytest.fixture
def mock_valid_row_with_whitespace():
return pd.Series(
{
"ApprovedRef": " ref123 ",
"ProjectsID": " proj123 ",
"ID (Unique ID from our CMS for the document)": " doc123 ",
"Type": " type123 ",
"Title": " title123 ",
"Main file (English)": " link123.pdf ",
"Document page permalink": " link123 ",
"Translated titles": None,
}
)


@pytest.fixture
def mock_gcf_docs():
return pd.DataFrame(
18 changes: 18 additions & 0 deletions tests/unit_tests/parsers/document/test_process_row.py
@@ -71,3 +71,21 @@ def test_process_row_returns_none_with_na_in_required_columns(
assert process_row(row, debug=False) is None
captured = capsys.readouterr()
assert expected_error_msg in captured.out


def test_handles_data_with_leading_and_trailing_whitespace(
mock_valid_row_with_whitespace,
):

expected_mapped_doc = [
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["type123"]},
"title": "title123",
"source_url": "link123.pdf",
"variant_name": "Original Language",
}
]

assert expected_mapped_doc == process_row(mock_valid_row_with_whitespace, False)
38 changes: 38 additions & 0 deletions tests/unit_tests/parsers/event/test_event.py
@@ -61,3 +61,41 @@ def test_event_handles_partial_valid_dates():
)
result = event(projects_data, debug=False)
assert len(result) == 3


def test_handles_data_with_leading_and_trailing_whitespace():
mock_projects_data = pd.DataFrame(
{
"ApprovalDate": [" 2023-01-01 ", None],
"StartDate": [None, " 2023-06-01"],
"DateCompletion": ["2023-12-31 ", None],
"ApprovedRef": [" FP123 ", " FP124 "],
"ProjectsID": [" PID456 ", " PID457 "],
}
)

expected_mapped_events = [
{
"date": "2023-01-01",
"event_title": "Project Approved",
"event_type_value": "Project Approved",
"import_id": "GCF.event.FP123_PID456.n0000",
"family_import_id": "GCF.family.FP123.PID456",
},
{
"date": "2023-12-31",
"event_title": "Project Completed",
"event_type_value": "Project Completed",
"family_import_id": "GCF.family.FP123.PID456",
"import_id": "GCF.event.FP123_PID456.n0001",
},
{
"date": "2023-06-01",
"event_title": "Under Implementation",
"event_type_value": "Under Implementation",
"family_import_id": "GCF.family.FP124.PID457",
"import_id": "GCF.event.FP124_PID457.n0000",
},
]

assert expected_mapped_events == event(mock_projects_data, False)
49 changes: 49 additions & 0 deletions tests/unit_tests/parsers/family/conftest.py
@@ -231,6 +231,55 @@ def mock_family_row_with_non_int_non_float_budget_values():
)


@pytest.fixture()
def mock_family_doc_with_whitespace():
yield pd.Series(
{
"ProjectsID": " AAABBB ",
"ApprovedRef": " FP003 ",
"ProjectName": " Enhancing resilience of coastal ecosystems and communities",
"Theme": " Adaptation ",
"Sector": " Environment ",
"ProjectURL": " https://www.climateaction.fund/project/FP003 ",
"Summary": " The Summary of the Project ",
"Countries": [
{
"CountryName": " Bangladesh ",
"ISO3": " BGD ",
"Region": " Asia ",
},
],
"Entities": [
{
"Name": " Green Innovations ",
}
],
"Funding": [
{
"Source": " GCF ",
"Budget": 9200000,
"BudgetUSDeq": 9200000,
},
{
"ProjectBudgetID": 412,
"Source": " Co-Financing ",
"Budget": 620000,
"BudgetUSDeq": 620000,
},
],
"ResultAreas": [
{
"Area": " Coastal protection and restoration ",
"Type": " Adaptation ",
},
],
"ApprovalDate": " 2016-06-30T00:00:00.000Z ",
"StartDate": " 2024-06-28T00:00:00.000Z ",
"DateCompletion": None,
}
)


@pytest.fixture()
def required_family_columns():
required_columns = [column.value for column in FamilyColumnsNames]
32 changes: 32 additions & 0 deletions tests/unit_tests/parsers/family/test_map_family.py
@@ -165,3 +165,35 @@ def test_skips_processing_row_if_family_metadata_has_missing_data(
f"🛑 Skipping row as family metadata has missing information, ProjectsID : {projects_id}"
== map_family_data_output[1]
)


def test_handles_data_with_leading_and_trailing_whitespace(
mock_family_doc_with_whitespace,
):

expected_mapped_family = {
"category": "MCF",
"collections": [],
"summary": "The Summary of the Project",
"geographies": ["BGD"],
"import_id": "GCF.family.FP003.AAABBB",
"metadata": {
"approved_ref": ["FP003"],
"implementing_agency": ["Green Innovations"],
"project_id": ["AAABBB"],
"project_url": ["https://www.climateaction.fund/project/FP003"],
"project_value_fund_spend": ["9200000"],
"project_value_co_financing": ["620000"],
"region": ["Asia"],
"result_area": ["Coastal protection and restoration"],
"result_type": ["Adaptation"],
"sector": ["Environment"],
"status": ["Under Implementation"],
"theme": ["Adaptation"],
},
"title": "Enhancing resilience of coastal ecosystems and communities",
}

assert expected_mapped_family == process_row(
mock_family_doc_with_whitespace, " AAABBB ", []
)
