From b3348a2074571216d0a6c905aa9065d25578f528 Mon Sep 17 00:00:00 2001
From: odrakes-cpr
Date: Wed, 11 Sep 2024 10:09:37 +0100
Subject: [PATCH] feat: add function to calculate status of a project based on
 events (#15)

* feat: add function to calculate status of a project based on events

* refactor: move nonetype budget check further up the chain in the block

* refactor: update budget type checker to expect ints only

---------

Co-authored-by: Osneil Drakes
Co-authored-by: Osneil Drakes
---
 gcf_data_mapper/parsers/family.py             |  72 +++++++-
 tests/unit_tests/parsers/family/conftest.py   |  52 ++++++
 .../parsers/family/test_map_family.py         |   6 +-
 .../family/test_map_family_metadata.py        | 162 +++++++++++++++++-
 4 files changed, 285 insertions(+), 7 deletions(-)

diff --git a/gcf_data_mapper/parsers/family.py b/gcf_data_mapper/parsers/family.py
index 937718e..2b662fa 100644
--- a/gcf_data_mapper/parsers/family.py
+++ b/gcf_data_mapper/parsers/family.py
@@ -1,8 +1,9 @@
-from typing import Any, Optional
+from typing import Any, Iterable, Optional
 
 import click
 import pandas as pd
 
+from gcf_data_mapper.enums.event import EventColumnNames, Events
 from gcf_data_mapper.enums.family import (
     FamilyColumnsNames,
     FamilyNestedColumnNames,
@@ -15,13 +16,54 @@
 )
 
 
-def get_budgets(funding_list: list[dict], source: str) -> list[int]:
+def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool:
+    """Check if any of the values in the list of dates are NaT (Not a Time).
+
+    :param Iterable[pd.Timestamp] list_of_dates: A list of pd.Timestamps, which may also include None values
+    :return bool: True if any of the values are not a valid timestamp (NaT). This distinguishes unparseable date entries (NaT) from NaN/None values, which are treated as valid, missing date entries.
+    """
+    return any(date is pd.NaT for date in list_of_dates)
+
+
+def calculate_status(row: pd.Series) -> Optional[str]:
+    """Calculate the status of a project based on the event types and dates.
+
+    The status is calculated per the below:
+    Completed : (NOW is past date-completion)
+    Under implementation : (NOW is past start-date)
+    Approved : (NOW is past approved-date)
+
+    :param pd.Series row: The row containing the event information
+    :return Optional[str]: The status of the project, or None if there are no valid values
+    """
+    completed_date = pd.to_datetime(row.at[Events.COMPLETED.column_name])
+    start_date = pd.to_datetime(row.at[Events.UNDER_IMPLEMENTATION.column_name])
+    approved_date = pd.to_datetime(row.at[Events.APPROVED.column_name])
+
+    if contains_invalid_date_entries([completed_date, start_date, approved_date]):
+        click.echo("🛑 Row contains invalid date entries")
+        return None
+
+    now = pd.Timestamp.now(tz="UTC")
+
+    # This block is arranged to reflect the project lifecycle in reverse order, from the final stage to the initial stage.
+    if pd.notna(completed_date) and now >= completed_date:
+        return Events.COMPLETED.type
+    if pd.notna(start_date) and now >= start_date:
+        return Events.UNDER_IMPLEMENTATION.type
+    if pd.notna(approved_date) and now >= approved_date:
+        return Events.APPROVED.type
+
+    click.echo("🛑 Row missing event date information to calculate status")
+    return None
+
+
+def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
     """Get the budget amount from the row based on the funding source.
 
     :param list[dict] row: A list of all the funding information, represented in dictionaries
     :param str source: The funding source to retrieve the budget from.
-    :return list[int]: A list of budget amounts corresponding to the source,
+    :return Optional[list[int]]: A list of budget amounts corresponding to the source,
         or [0] if the source is not found.
     """
 
@@ -32,6 +74,11 @@ def get_budgets(funding_list: list[dict], source: str) -> list[int]:
         funding[budget_key] for funding in funding_list if funding[source_key] == source
     ]
 
+    # Check for any invalid values
+    if any(not isinstance(budget, int) for budget in budgets):
+        click.echo("🛑 Funding entries do not have valid int budget values")
+        return None
+
     # Where we have projects which have been solely funded by the fund (GCF), or solely co-financed
     # - so in instances where there will be no funding that match either the GCF or co-financing
     # source value, we will map the `project_value_fund spend` or the `project_value_co_financing`
@@ -46,6 +93,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
     :return Optional[dict]: A dictionary containing mapped metadata for the family.
     """
 
+    status = calculate_status(row)
+
+    if status is None:
+        return None
+
     countries = row.at[FamilyColumnsNames.COUNTRIES.value]
     entities = row.at[FamilyColumnsNames.ENTITIES.value]
     funding_sources = row.at[FamilyColumnsNames.FUNDING.value]
@@ -61,6 +113,9 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
     )
     gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value)
 
+    if gcf_budgets is None or co_financing_budgets is None:
+        return None
+
     implementing_agencies = [entity[name_key] for entity in entities]
     regions = [country[region_key] for country in countries]
     areas = [result[area_key] for result in result_areas]
@@ -92,6 +147,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
         "result_areas": list(set(areas)),
         "result_types": list(set(types)),
         "sector": [row.at[FamilyColumnsNames.SECTOR.value]],
+        "status": status,
         "theme": [row.at[FamilyColumnsNames.THEME.value]],
     }
 
@@ -188,10 +244,16 @@ def family(
 
     mapped_families = []
 
-    required_fields = set(str(e.value) for e in FamilyColumnsNames)
+    family_columns = set(str(e.value) for e in FamilyColumnsNames)
+    required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))
     verify_required_fields_present(gcf_projects_data, required_fields)
 
-    # Do a check that the projects data has the field you need
+    # Whilst we expect the event columns to be present, some of the events in the data may have empty values.
+    # We therefore want to exclude these from the `row_contains_columns_with_empty_values` function,
+    # and handle any empty event values in the `calculate_status` function.
+ required_fields -= set( + str(e.value) for e in EventColumnNames if str(e.value) not in family_columns + ) for _, row in gcf_projects_data.iterrows(): projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value] diff --git a/tests/unit_tests/parsers/family/conftest.py b/tests/unit_tests/parsers/family/conftest.py index 40171f8..f9aba38 100644 --- a/tests/unit_tests/parsers/family/conftest.py +++ b/tests/unit_tests/parsers/family/conftest.py @@ -47,6 +47,9 @@ def mock_family_doc_df(): "Type": "Adaptation", }, ], + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ] ) @@ -94,6 +97,9 @@ def mock_family_row_ds(): "Type": "The Type for the Result Area", }, ], + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ) @@ -137,6 +143,9 @@ def mock_family_row_no_result_areas(): "ResultAreas": [ {"Area": "", "Type": ""}, ], + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ) @@ -175,6 +184,49 @@ def mock_family_row_no_entities_no_regions(): "Type": "The Type for the Result Area", }, ], + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, + } + ) + + +@pytest.fixture() +def mock_family_row_with_non_int_budget_values(): + yield pd.Series( + { + "ProjectsID": 3, + "ApprovedRef": "FP004", + "ProjectName": "Enhancing resilience of marine ecosystems", + "Theme": "Adaptation", + "Sector": "Private", + "ProjectURL": "https://www.climateaction.fund/project/FP004", + "Summary": "The Summary of the Project", + "Countries": [ + {"Region": ""}, + ], + "Entities": [{"Name": ""}], + "Funding": [ + { + "Source": "GCF", + "Budget": "82000", + "BudgetUSDeq": "82000", + }, + { + "Source": "Co-Financing", + "Budget": 620000.20, + "BudgetUSDeq": 620000.50, + }, + ], + "ResultAreas": [ + { + "Area": "The Area for the Result Area", + "Type": "The Type for the Result Area", + }, + ], + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ) diff --git a/tests/unit_tests/parsers/family/test_map_family.py b/tests/unit_tests/parsers/family/test_map_family.py index 43297f7..6ca122c 100644 --- a/tests/unit_tests/parsers/family/test_map_family.py +++ b/tests/unit_tests/parsers/family/test_map_family.py @@ -24,6 +24,7 @@ def parsed_family_data(): "result_areas": ["Coastal protection and restoration"], "result_types": ["Adaptation"], "sector": ["Environment"], + "status": "Under Implementation", "theme": ["Adaptation"], }, "title": "Enhancing resilience of coastal ecosystems and communities", @@ -63,11 +64,14 @@ def test_raises_error_on_validating_row_for_missing_columns(): "ResultAreas": [{"Area": "Coastal"}], "Summary": "Fake Summary", "ProjectName": "Fake Project Name", + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, } ] ) - expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovedRef', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'Summary']" + expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovalDate', 'ApprovedRef', 'DateCompletion', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'StartDate', 'Summary']" with pytest.raises(AttributeError) as e: family(test_data_frame, 
debug=True) assert expected_error_message == str(e.value) diff --git a/tests/unit_tests/parsers/family/test_map_family_metadata.py b/tests/unit_tests/parsers/family/test_map_family_metadata.py index 999ae14..75798e7 100644 --- a/tests/unit_tests/parsers/family/test_map_family_metadata.py +++ b/tests/unit_tests/parsers/family/test_map_family_metadata.py @@ -1,7 +1,15 @@ +from typing import Optional + import pandas as pd import pytest -from gcf_data_mapper.parsers.family import get_budgets, map_family_metadata +from gcf_data_mapper.enums.event import Events +from gcf_data_mapper.parsers.family import ( + calculate_status, + contains_invalid_date_entries, + get_budgets, + map_family_metadata, +) @pytest.fixture() @@ -17,6 +25,7 @@ def parsed_family_metadata(): "result_areas": ["The Area for the Result Area"], "result_types": ["The Type for the Result Area"], "sector": ["Private"], + "status": "Under Implementation", "theme": ["Adaptation"], } @@ -117,3 +126,154 @@ def test_returns_expected_value_when_parsing_budget_data( ): budgets = get_budgets(funding_list, source) assert budgets == expected_value + + +def test_map_family_metadata_returns_none_if_budget_does_not_contain_valid_int_types( + mock_family_row_with_non_int_budget_values: pd.Series, +): + result = map_family_metadata(mock_family_row_with_non_int_budget_values) + assert result is None + + +@pytest.mark.parametrize( + ("mock_family_row, expected_status"), + [ + ( + pd.Series( + { + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": None, + "DateCompletion": None, + } + ), + Events.APPROVED.type, + ), + ( + pd.Series( + { + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2024-06-28T00:00:00.000Z", + "DateCompletion": None, + } + ), + Events.UNDER_IMPLEMENTATION.type, + ), + ( + pd.Series( + { + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2018-06-30T00:00:00.000Z", + "DateCompletion": "2022-06-30T00:00:00.000Z", + } + ), + Events.COMPLETED.type, + ), + ( + pd.Series( + { + "ApprovalDate": None, + "StartDate": None, + "DateCompletion": None, + } + ), + None, + ), + ( + pd.Series( + { + "ApprovalDate": pd.NA, + "StartDate": pd.NA, + "DateCompletion": pd.NA, + } + ), + None, + ), + ( + pd.Series( + { + "ApprovalDate": "", # invalid date entry + "StartDate": "2018-06-30T00:00:00.000Z", + "DateCompletion": "2022-06-30T00:00:00.000Z", + } + ), + None, + ), + ], +) +def test_returns_status(mock_family_row: pd.Series, expected_status: Optional[str]): + status = calculate_status(mock_family_row) + assert status == expected_status + + +@pytest.mark.parametrize( + ("list_of_dates, return_value"), + [ + ( + [ + pd.to_datetime("2016-06-30T00:00:00.000Z"), + pd.to_datetime("2018-06-30T00:00:00.000Z"), + pd.to_datetime("2022-06-30T00:00:00.000Z"), + ], + False, + ), + ( + [None, None, pd.to_datetime("2016-06-30T00:00:00.000Z")], + False, + ), + ( + [ + pd.to_datetime("2018-06-30T00:00:00.000Z"), + pd.to_datetime(""), + pd.to_datetime("2022-06-30T00:00:00.000Z"), + ], + True, + ), + ], +) +def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value): + result = contains_invalid_date_entries(list_of_dates) + assert result == return_value + + +@pytest.mark.parametrize( + ("mock_row, output_message"), + [ + ( + pd.Series( + { + "ApprovalDate": pd.NA, + "StartDate": pd.NA, + "DateCompletion": pd.NA, + } + ), + "🛑 Row contains invalid date entries", + ), + ( + pd.Series( + { + "ApprovalDate": "2016-06-30T00:00:00.000Z", + "StartDate": "2018-06-30T00:00:00.000Z", + "DateCompletion": "", + } + ), + "🛑 
Row contains invalid date entries", + ), + ( + pd.Series( + { + "ApprovalDate": None, + "StartDate": None, + "DateCompletion": None, + } + ), + "🛑 Row missing event date information to calculate status", + ), + ], +) +def test_skips_processing_row_if_calculate_status_returns_none( + mock_row: pd.Series, output_message: str, capsys +): + return_value = map_family_metadata(mock_row) + assert return_value is None + captured = capsys.readouterr() + assert output_message == captured.out.strip()
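
Illustrative sketch (not part of the patch): the snippet below mirrors the precedence that calculate_status applies when resolving a project's status from its event dates, written with plain pandas so it runs standalone. The resolve_status and _parse helpers, the hard-coded column names (taken from the fixtures above), and the literal status strings standing in for Events.*.type are assumptions made for this example only.

from typing import Optional

import pandas as pd


def _parse(value) -> Optional[pd.Timestamp]:
    # Missing values (None/NaN/pd.NA) mean "no event recorded"; anything else must
    # parse as a timestamp, otherwise we mark the row by returning NaT.
    if value is None or pd.isna(value):
        return None
    return pd.to_datetime(value, errors="coerce", utc=True)


def resolve_status(row: pd.Series) -> Optional[str]:
    completed = _parse(row.get("DateCompletion"))
    started = _parse(row.get("StartDate"))
    approved = _parse(row.get("ApprovalDate"))

    # An unparseable date invalidates the whole row, as in contains_invalid_date_entries.
    if any(date is pd.NaT for date in (completed, started, approved)):
        return None

    now = pd.Timestamp.now(tz="UTC")
    # Walk the lifecycle in reverse order so the latest stage reached wins.
    if completed is not None and now >= completed:
        return "Completed"
    if started is not None and now >= started:
        return "Under Implementation"
    if approved is not None and now >= approved:
        return "Approved"
    return None


example = pd.Series(
    {
        "ApprovalDate": "2016-06-30T00:00:00.000Z",
        "StartDate": "2024-06-28T00:00:00.000Z",
        "DateCompletion": None,
    }
)
print(resolve_status(example))  # expected to print "Under Implementation"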