Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 37 additions & 8 deletions life_expectancy/cleaning.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
"""_summary_
"""Script for cleaning life expectancy data
"""

from pathlib import Path
import argparse
import pandas as pd

# Data folder, resolved against the directory the process is started from.
root_path = Path.cwd()
data_path = root_path.joinpath('life_expectancy', 'data')

def clean_data(region: str='PT') -> None:
"""_summary_

def load_data() -> pd.DataFrame:
    """Load the raw life expectancy data from the TSV file.

    The file is looked up relative to the current working directory, at
    ``life_expectancy/data/eu_life_expectancy_raw.tsv``.

    Returns:
        pd.DataFrame: Raw data loaded from the TSV file.
    """
    root_path = Path.cwd()
    data_path = root_path / 'life_expectancy' / 'data'
    raw_file = data_path / 'eu_life_expectancy_raw.tsv'

    # Read the file exactly once and hand the frame straight back.
    return pd.read_csv(raw_file, sep='\t')

def clean_data(data: pd.DataFrame, region: str='PT') -> None:
"""Cleans the data and filter it according to region supplied

Args:
data (pd.DataFrame): Raw life expectancy data to clean.
region (str, optional): Region used to filter the data. Defaults to 'PT'.

"""

cleaned_data = (
data['unit,sex,age,geo\\time'].str.split(pat=',', expand=True)
.join(data)
Expand All @@ -26,11 +39,27 @@ def clean_data(region: str='PT') -> None:
.assign(year=lambda row: row.year.astype('int'))
.dropna(subset=['value'])
)
return cleaned_data

def save_data(cleaned_data: pd.DataFrame) -> None:
    """Write the cleaned data to ``pt_life_expectancy.csv`` in the data folder.

    Args:
        cleaned_data (pd.DataFrame): Frame to write; its index is not written.
    """
    cleaned_data.to_csv(data_path / 'pt_life_expectancy.csv', index=False)

def main(region: str = 'PT') -> None:
    """Run the full pipeline: load the raw data, clean it, save the result.

    Args:
        region (str, optional): Region passed on to ``clean_data``.
            Defaults to 'PT'.
    """
    save_data(clean_data(load_data(), region))


if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser(description='App to process life expectancy data')
    parser.add_argument('--region', type=str, help='Region to filter', default='PT')
    args = parser.parse_args()
    # Go through main() only: clean_data takes the loaded frame as its first
    # argument, so it cannot be called with just the region string.
    main(args.region)
4 changes: 2 additions & 2 deletions life_expectancy/tests/test_cleaning.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""Tests for the cleaning module"""
import pandas as pd

from life_expectancy.cleaning import clean_data
from life_expectancy.cleaning import main
from . import OUTPUT_DIR


def test_clean_data(pt_life_expectancy_expected):
"""Run the `clean_data` function and compare the output to the expected output"""
clean_data('PT')
main('PT')
pt_life_expectancy_actual = pd.read_csv(
OUTPUT_DIR / "pt_life_expectancy.csv"
)
Expand Down