diff --git a/life_expectancy/cleaning.py b/life_expectancy/cleaning.py index 723a42a..eab794e 100644 --- a/life_expectancy/cleaning.py +++ b/life_expectancy/cleaning.py @@ -1,20 +1,33 @@ -"""_summary_ +"""Script for cleaning life expectancy data """ from pathlib import Path import argparse import pandas as pd +root_path = Path.cwd() +data_path = root_path / 'life_expectancy' / 'data' -def clean_data(region: str='PT') -> None: - """_summary_ + +def load_data() -> pd.DataFrame: + """Loads raw data from file + + Returns: + pd.DataFrame: Raw data loaded from the TSV file """ - root_path = Path.cwd() - data_path = root_path / 'life_expectancy' / 'data' raw_file = data_path / 'eu_life_expectancy_raw.tsv' - cleaned_file = data_path / 'pt_life_expectancy.csv' - data = pd.read_csv(raw_file, sep='\t') + return pd.read_csv(raw_file, sep='\t') + +def clean_data(data: pd.DataFrame, region: str='PT') -> pd.DataFrame: + """Cleans the data, filters it by the supplied region and returns the result + + Args: + data (pd.DataFrame): Raw data as returned by load_data + region (str, optional): Region to filter. Defaults to 'PT'. 
+ + """ + + cleaned_data = ( data['unit,sex,age,geo\\time'].str.split(pat=',', expand=True) .join(data) @@ -26,11 +39,27 @@ def clean_data(region: str='PT') -> None: .assign(year=lambda row: row.year.astype('int')) .dropna(subset=['value']) ) + return cleaned_data + +def save_data(cleaned_data: pd.DataFrame) -> None: + """Saves the cleaned data into a file + + Args: + cleaned_data (pd.DataFrame): Cleaned data to write to the CSV file + """ + cleaned_file = data_path / 'pt_life_expectancy.csv' cleaned_data.to_csv(cleaned_file, index=False) +def main(region: str = 'PT') -> None: + """Runs the pipeline: load the raw data, clean it for the region, save it + """ + raw_data = load_data() + cleaned_data = clean_data(raw_data, region) + save_data(cleaned_data) + if __name__ == "__main__": #pragma: no cover parser = argparse.ArgumentParser(description='App to process life expectancy data') parser.add_argument('--region', type=str, help='Region to filter', default='PT') args = parser.parse_args() - clean_data(args.region) + main(args.region) diff --git a/life_expectancy/tests/test_cleaning.py b/life_expectancy/tests/test_cleaning.py index 214c6d8..c2790bd 100644 --- a/life_expectancy/tests/test_cleaning.py +++ b/life_expectancy/tests/test_cleaning.py @@ -1,13 +1,13 @@ """Tests for the cleaning module""" import pandas as pd -from life_expectancy.cleaning import clean_data +from life_expectancy.cleaning import main from . import OUTPUT_DIR def test_clean_data(pt_life_expectancy_expected): """Run the `clean_data` function and compare the output to the expected output""" - clean_data('PT') + main('PT') pt_life_expectancy_actual = pd.read_csv( OUTPUT_DIR / "pt_life_expectancy.csv" )