diff --git a/gcf_data_mapper/parsers/family.py b/gcf_data_mapper/parsers/family.py index f8aa787..df4a48d 100644 --- a/gcf_data_mapper/parsers/family.py +++ b/gcf_data_mapper/parsers/family.py @@ -1,8 +1,10 @@ +from typing import Any, Iterable, Optional from typing import Any, Iterable, Optional, Union import click import pandas as pd +from gcf_data_mapper.enums.event import EventColumnNames, Events from gcf_data_mapper.enums.event import EventColumnNames, Events from gcf_data_mapper.enums.family import ( FamilyColumnsNames, @@ -25,6 +27,48 @@ def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool return any(date is pd.NaT for date in list_of_dates) +def calculate_status(row: pd.Series) -> Optional[str]: + """Calculate status of project based on the event types and dates + The status is calculated per the below: + Completed : (NOW is passed date-completion) + Under implementation : (NOW is passed start-date) + Approved : (NOW is passed approved-date) + + :param pd.Series row: The row containing the event information + :return Optional[str]: The status of the project, if there are no valid values return None + """ + completed_date = pd.to_datetime(row.at[Events.COMPLETED.column_name]) + start_date = pd.to_datetime(row.at[Events.UNDER_IMPLEMENTATION.column_name]) + approved_date = pd.to_datetime(row.at[Events.APPROVED.column_name]) + + if contains_invalid_date_entries([completed_date, start_date, approved_date]): + click.echo("🛑 Row contains invalid date entries") + return None + + now = pd.Timestamp.now(tz="UTC") + + # This block is arranged to reflect the project lifecycle in reverse order, from the final stage to the initial stage. + if pd.notna(completed_date) and now >= completed_date: + return Events.COMPLETED.type + if pd.notna(start_date) and now >= start_date: + return Events.UNDER_IMPLEMENTATION.type + if pd.notna(approved_date) and now >= approved_date: + return Events.APPROVED.type + + click.echo("🛑 Row missing event date information to calculate status") + return None + + +def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]: +def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool: + """Check if any of the values in the list of dates are NaT (Not a Time). + + :param Iterable[pd.Timestamp] list_of_dates: A list of pd.TimeStamps, may also include NoneTypes + :return bool: True if any of the values are not a valid timestamp. This helps distinguish between NaT and NaN/None Type values which are valid date entries. + """ + return any(date is pd.NaT for date in list_of_dates) + + def calculate_status(row: pd.Series) -> Optional[str]: """Calculate status of project based on the event types and dates The status is calculated per the below: @@ -97,6 +141,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]: status = calculate_status(row) + if status is None: + return None + + status = calculate_status(row) + if status is None: return None @@ -150,6 +199,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]: "result_types": list(set(types)), "sector": [row.at[FamilyColumnsNames.SECTOR.value]], "status": status, + "status": status, "theme": [row.at[FamilyColumnsNames.THEME.value]], } @@ -248,6 +298,8 @@ def family( mapped_families = [] + family_columns = set(str(e.value) for e in FamilyColumnsNames) + required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames)) family_columns = set(str(e.value) for e in FamilyColumnsNames) required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames)) @@ -258,6 +310,12 @@ def family( required_fields -= set( str(e.value) for e in EventColumnNames if str(e.value) not in family_columns ) + # Whilst we expect the event columns to be present, some of the events in the data may have empty values. + # We therefore want to exclude these from the `row_contains_columns_with_empty_values` function, + # and handle any empty event values in the `calculate_status` function. + required_fields -= set( + str(e.value) for e in EventColumnNames if str(e.value) not in family_columns + ) for _, row in gcf_projects_data.iterrows(): projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value] diff --git a/tests/unit_tests/parsers/family/conftest.py b/tests/unit_tests/parsers/family/conftest.py index 4f34a79..7db040a 100644 --- a/tests/unit_tests/parsers/family/conftest.py +++ b/tests/unit_tests/parsers/family/conftest.py @@ -50,6 +50,9 @@ def mock_family_doc_df(): "ApprovalDate": "2016-06-30T00:00:00.000Z", "StartDate": "2024-06-28T00:00:00.000Z", "DateCompletion": None, + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ] ) @@ -100,6 +103,9 @@ def mock_family_row_ds(): "ApprovalDate": "2016-06-30T00:00:00.000Z", "StartDate": "2024-06-28T00:00:00.000Z", "DateCompletion": None, + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ) @@ -146,6 +152,9 @@ def mock_family_row_no_result_areas(): "ApprovalDate": "2016-06-30T00:00:00.000Z", "StartDate": "2024-06-28T00:00:00.000Z", "DateCompletion": None, + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ) diff --git a/tests/unit_tests/parsers/family/test_map_family_metadata.py b/tests/unit_tests/parsers/family/test_map_family_metadata.py index a9a0459..62f0cd9 100644 --- a/tests/unit_tests/parsers/family/test_map_family_metadata.py +++ b/tests/unit_tests/parsers/family/test_map_family_metadata.py @@ -1,5 +1,7 @@ from typing import Optional +from typing import Optional + import pandas as pd import pytest @@ -10,6 +12,13 @@ get_budgets, map_family_metadata, ) +from gcf_data_mapper.enums.event import Events +from gcf_data_mapper.parsers.family import ( + calculate_status, + contains_invalid_date_entries, + get_budgets, + map_family_metadata, +) @pytest.fixture()