
Commit f65de82

Strip whitespace from values (#28)
* Strip whitespace from row values when mapping family data

* Strip whitespace from row values when mapping docs

* Strip whitespace from row values when mapping events

* Bump patch version
annaCPR authored Nov 11, 2024
1 parent 933b4fa commit f65de82
Showing 11 changed files with 179 additions and 6 deletions.
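
The common thread across the parsers below is that each row is passed through a small recursive helper, strip_nested, before it is mapped, so leading and trailing whitespace is removed from plain strings as well as from strings nested inside lists and dicts. A minimal sketch of the idea, using an illustrative row rather than real GCF data:

import pandas as pd

from gcf_data_mapper.parsers.helpers import strip_nested

# Illustrative row: a top-level string plus a nested structure, both padded with whitespace.
row = pd.Series({"ProjectsID": " AAABBB ", "Countries": [{"ISO3": " BGD "}]})
cleaned = row.apply(strip_nested)

assert cleaned["ProjectsID"] == "AAABBB"
assert cleaned["Countries"][0]["ISO3"] == "BGD"
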
3 changes: 2 additions & 1 deletion .trunk/configs/cspell.json
@@ -41,7 +41,8 @@
"isin",
"pydantic",
"getfixturevalue",
"isna"
"isna",
"AAABBB"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
3 changes: 3 additions & 0 deletions gcf_data_mapper/parsers/document.py
@@ -13,6 +13,7 @@
)
from gcf_data_mapper.parsers.helpers import (
check_required_column_value_not_na,
strip_nested,
verify_required_fields_present,
)

@@ -177,6 +178,8 @@ def process_row(row: pd.Series, debug: bool) -> Optional[list[dict[str, Any]]]:
the 'destination' format described in the GCF Data Mapper Google
Sheet.
"""
row = cast(pd.Series, row.apply(strip_nested))

doc_id = (
row.at[RequiredDocumentColumns.ID.value]
if RequiredDocumentColumns.ID.value in row.index
5 changes: 3 additions & 2 deletions gcf_data_mapper/parsers/event.py
@@ -1,10 +1,10 @@
from typing import Any, Optional
from typing import Any, Optional, cast

import click
import pandas as pd

from gcf_data_mapper.enums.event import Event, EventColumnNames, Events
from gcf_data_mapper.parsers.helpers import verify_required_fields_present
from gcf_data_mapper.parsers.helpers import strip_nested, verify_required_fields_present


def append_event(
@@ -134,6 +134,7 @@ def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
event_counter = {}

for _, row in projects_data.iterrows():
row = cast(pd.Series, row.apply(strip_nested))
approved_ref = row.at[EventColumnNames.APPROVED_REF.value]
projects_id = row.at[EventColumnNames.PROJECTS_ID.value]
process_event(row, gcf_events, event_counter, approved_ref, projects_id)
6 changes: 4 additions & 2 deletions gcf_data_mapper/parsers/family.py
@@ -1,4 +1,4 @@
from typing import Any, Iterable, Optional
from typing import Any, Iterable, Optional, cast

import click
import pandas as pd
@@ -12,6 +12,7 @@
from gcf_data_mapper.parsers.helpers import (
arrays_contain_empty_values,
row_contains_columns_with_empty_values,
strip_nested,
verify_required_fields_present,
)

@@ -222,6 +223,7 @@ def process_row(
)
return None

row = cast(pd.Series, row.apply(strip_nested))
return map_family_data(row)


@@ -255,7 +257,7 @@ def family(
)

for _, row in gcf_projects_data.iterrows():
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
projects_id = str(row.at[FamilyColumnsNames.PROJECTS_ID.value]).strip()
mapped_families.append(process_row(row, projects_id, list(required_fields)))

return mapped_families
13 changes: 13 additions & 0 deletions gcf_data_mapper/parsers/helpers.py
@@ -1,3 +1,5 @@
from typing import Any

import click
import pandas as pd

@@ -69,3 +71,14 @@ def arrays_contain_empty_values(list_values: list[tuple], id: str) -> bool:
return True

return False


def strip_nested(value: Any) -> Any:
"""Recursively strip strings in nested structures."""
if isinstance(value, str):
return value.strip()
elif isinstance(value, list):
return [strip_nested(item) for item in value]
elif isinstance(value, dict):
return {key: strip_nested(val) for key, val in value.items()}
return value
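
A quick behavioural sketch with illustrative values (not taken from the repo): strings are stripped wherever they sit in the structure, while non-string scalars such as numbers and None are returned unchanged.

nested = {
    "Source": " GCF ",
    "Budget": 9200000,  # non-string values pass through untouched
    "Tags": ["  adaptation ", None],
}

assert strip_nested(nested) == {
    "Source": "GCF",
    "Budget": 9200000,
    "Tags": ["adaptation", None],
}

The cast(pd.Series, row.apply(strip_nested)) wrapper used in the parsers is presumably there to satisfy static type checkers, since pandas stubs type Series.apply more loosely than a plain Series return.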
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gcf-data-mapper"
version = "0.1.13"
version = "0.1.14"
description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team <[email protected]>"]
license = "Apache-2.0"
16 changes: 16 additions & 0 deletions tests/unit_tests/parsers/document/conftest.py
@@ -97,6 +97,22 @@ def mock_valid_row():
)


@pytest.fixture
def mock_valid_row_with_whitespace():
return pd.Series(
{
"ApprovedRef": " ref123 ",
"ProjectsID": " proj123 ",
"ID (Unique ID from our CMS for the document)": " doc123 ",
"Type": " type123 ",
"Title": " title123 ",
"Main file (English)": " link123.pdf ",
"Document page permalink": " link123 ",
"Translated titles": None,
}
)


@pytest.fixture
def mock_gcf_docs():
return pd.DataFrame(
18 changes: 18 additions & 0 deletions tests/unit_tests/parsers/document/test_process_row.py
@@ -71,3 +71,21 @@ def test_process_row_returns_none_with_na_in_required_columns(
assert process_row(row, debug=False) is None
captured = capsys.readouterr()
assert expected_error_msg in captured.out


def test_handles_data_with_leading_and_trailing_whitespace(
mock_valid_row_with_whitespace,
):

expected_mapped_doc = [
{
"import_id": "GCF.document.ref123_proj123.doc123",
"family_import_id": "GCF.family.ref123.proj123",
"metadata": {"type": ["type123"]},
"title": "title123",
"source_url": "link123.pdf",
"variant_name": "Original Language",
}
]

assert expected_mapped_doc == process_row(mock_valid_row_with_whitespace, False)
38 changes: 38 additions & 0 deletions tests/unit_tests/parsers/event/test_event.py
@@ -61,3 +61,41 @@ def test_event_handles_partial_valid_dates():
)
result = event(projects_data, debug=False)
assert len(result) == 3


def test_handles_data_with_leading_and_trailing_whitespace():
mock_projects_data = pd.DataFrame(
{
"ApprovalDate": [" 2023-01-01 ", None],
"StartDate": [None, " 2023-06-01"],
"DateCompletion": ["2023-12-31 ", None],
"ApprovedRef": [" FP123 ", " FP124 "],
"ProjectsID": [" PID456 ", " PID457 "],
}
)

expected_mapped_events = [
{
"date": "2023-01-01",
"event_title": "Project Approved",
"event_type_value": "Project Approved",
"import_id": "GCF.event.FP123_PID456.n0000",
"family_import_id": "GCF.family.FP123.PID456",
},
{
"date": "2023-12-31",
"event_title": "Project Completed",
"event_type_value": "Project Completed",
"family_import_id": "GCF.family.FP123.PID456",
"import_id": "GCF.event.FP123_PID456.n0001",
},
{
"date": "2023-06-01",
"event_title": "Under Implementation",
"event_type_value": "Under Implementation",
"family_import_id": "GCF.family.FP124.PID457",
"import_id": "GCF.event.FP124_PID457.n0000",
},
]

assert expected_mapped_events == event(mock_projects_data, False)
49 changes: 49 additions & 0 deletions tests/unit_tests/parsers/family/conftest.py
@@ -231,6 +231,55 @@ def mock_family_row_with_non_int_non_float_budget_values():
)


@pytest.fixture()
def mock_family_doc_with_whitespace():
yield pd.Series(
{
"ProjectsID": " AAABBB ",
"ApprovedRef": " FP003 ",
"ProjectName": " Enhancing resilience of coastal ecosystems and communities",
"Theme": " Adaptation ",
"Sector": " Environment ",
"ProjectURL": " https://www.climateaction.fund/project/FP003 ",
"Summary": " The Summary of the Project ",
"Countries": [
{
"CountryName": " Bangladesh ",
"ISO3": " BGD ",
"Region": " Asia ",
},
],
"Entities": [
{
"Name": " Green Innovations ",
}
],
"Funding": [
{
"Source": " GCF ",
"Budget": 9200000,
"BudgetUSDeq": 9200000,
},
{
"ProjectBudgetID": 412,
"Source": " Co-Financing ",
"Budget": 620000,
"BudgetUSDeq": 620000,
},
],
"ResultAreas": [
{
"Area": " Coastal protection and restoration ",
"Type": " Adaptation ",
},
],
"ApprovalDate": " 2016-06-30T00:00:00.000Z ",
"StartDate": " 2024-06-28T00:00:00.000Z ",
"DateCompletion": None,
}
)


@pytest.fixture()
def required_family_columns():
required_columns = [column.value for column in FamilyColumnsNames]
32 changes: 32 additions & 0 deletions tests/unit_tests/parsers/family/test_map_family.py
@@ -165,3 +165,35 @@ def test_skips_processing_row_if_family_metadata_has_missing_data(
f"🛑 Skipping row as family metadata has missing information, ProjectsID : {projects_id}"
== map_family_data_output[1]
)


def test_handles_data_with_leading_and_trailing_whitespace(
mock_family_doc_with_whitespace,
):

expected_mapped_family = {
"category": "MCF",
"collections": [],
"summary": "The Summary of the Project",
"geographies": ["BGD"],
"import_id": "GCF.family.FP003.AAABBB",
"metadata": {
"approved_ref": ["FP003"],
"implementing_agency": ["Green Innovations"],
"project_id": ["AAABBB"],
"project_url": ["https://www.climateaction.fund/project/FP003"],
"project_value_fund_spend": ["9200000"],
"project_value_co_financing": ["620000"],
"region": ["Asia"],
"result_area": ["Coastal protection and restoration"],
"result_type": ["Adaptation"],
"sector": ["Environment"],
"status": ["Under Implementation"],
"theme": ["Adaptation"],
},
"title": "Enhancing resilience of coastal ecosystems and communities",
}

assert expected_mapped_family == process_row(
mock_family_doc_with_whitespace, " AAABBB ", []
)
