From 05ba7fce1901b3e8d3da880d8f564fbb46be5ea3 Mon Sep 17 00:00:00 2001 From: QuanMPhm Date: Thu, 18 Apr 2024 14:22:33 -0400 Subject: [PATCH] Added pre-commit linting check --- .github/workflows/pre-commit.yaml | 16 ++ .pre-commit-config.yaml | 16 ++ README.md | 2 +- process_report/institute_map.json | 2 +- process_report/process_report.py | 144 +++++++----- process_report/tests/unit_tests.py | 366 ++++++++++++++++++----------- requirements.txt | 2 +- 7 files changed, 349 insertions(+), 199 deletions(-) create mode 100644 .github/workflows/pre-commit.yaml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000..b16bd03 --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,16 @@ + +name: pre-commit + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..907cd60 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: check-merge-conflict + - id: end-of-file-fixer + - id: check-added-large-files + - id: check-case-conflict + - id: detect-private-key + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.1 + hooks: + - id: ruff + - id: ruff-format diff --git a/README.md b/README.md index 8e1ce38..c4ebc66 100644 --- a/README.md +++ b/README.md @@ -69,5 +69,5 @@ In this example, `project foo` will not be billed for September 2023 and August ## Combine CSVs -This script also combines the 3 separate Invoice data CSVs into 1 Invoice CSV. It combines +This script also combines the 3 separate Invoice data CSVs into 1 Invoice CSV. It combines OpenShift SU, OpenStack SU, and Storage SU data. diff --git a/process_report/institute_map.json b/process_report/institute_map.json index cd2d60d..be959af 100644 --- a/process_report/institute_map.json +++ b/process_report/institute_map.json @@ -12,7 +12,7 @@ "bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center", "fas.harvard.edu" : "Harvard University", "cga.harvard.edu" : "Harvard University", - "iq.harvard.edu" : "Harvard University", + "iq.harvard.edu" : "Harvard University", "hks.harvard.edu" : "Harvard University", "hsph.harvard.edu" : "Harvard University", "seas.harvard.edu" : "Harvard University", diff --git a/process_report/process_report.py b/process_report/process_report.py index 4e3d55b..b2d2dd9 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -7,35 +7,35 @@ ### Invoice field names -INVOICE_DATE_FIELD = 'Invoice Month' -PROJECT_FIELD = 'Project - Allocation' -PROJECT_ID_FIELD = 'Project - Allocation ID' -PI_FIELD = 'Manager (PI)' -INVOICE_EMAIL_FIELD = 'Invoice Email' -INVOICE_ADDRESS_FIELD = 'Invoice Address' -INSTITUTION_FIELD = 'Institution' -INSTITUTION_ID_FIELD = 'Institution - Specific Code' -SU_HOURS_FIELD = 'SU Hours (GBhr or SUhr)' -SU_TYPE_FIELD = 'SU Type' -COST_FIELD = 'Cost' -CREDIT_FIELD = 'Credit' -CREDIT_CODE_FIELD = 'Credit Code' -BALANCE_FIELD = 'Balance' +INVOICE_DATE_FIELD = "Invoice Month" +PROJECT_FIELD = "Project - Allocation" +PROJECT_ID_FIELD = "Project - Allocation ID" +PI_FIELD = "Manager (PI)" +INVOICE_EMAIL_FIELD = "Invoice Email" +INVOICE_ADDRESS_FIELD = "Invoice Address" +INSTITUTION_FIELD = "Institution" +INSTITUTION_ID_FIELD = "Institution - Specific Code" +SU_HOURS_FIELD = "SU Hours (GBhr or SUhr)" +SU_TYPE_FIELD = "SU Type" +COST_FIELD = "Cost" +CREDIT_FIELD = "Credit" +CREDIT_CODE_FIELD = "Credit Code" +BALANCE_FIELD = "Balance" ### def get_institution_from_pi(institute_map, pi_uname): - institution_key = pi_uname.split('@')[-1] - institution_name = institute_map.get(institution_key, '') + institution_key = pi_uname.split("@")[-1] + institution_name = institute_map.get(institution_key, "") - if institution_name == '': + if institution_name == "": print(f"Warning: PI name {pi_uname} does not match any institution!") - + return institution_name def load_institute_map() -> dict: - with open('institute_map.json', 'r') as f: + with open("institute_map.json", "r") as f: institute_map = json.load(f) return institute_map @@ -46,18 +46,18 @@ def load_old_pis(old_pi_file): try: with open(old_pi_file) as f: - for pi_info in f: - pi, first_month = pi_info.strip().split(',') + for pi_info in f: + pi, first_month = pi_info.strip().split(",") old_pi_dict[pi] = first_month except FileNotFoundError: - print('Applying credit 0002 failed. Old PI file does not exist') + print("Applying credit 0002 failed. Old PI file does not exist") sys.exit(1) - + return old_pi_dict def is_old_pi(old_pi_dict, pi, invoice_month): - if pi in old_pi_dict and old_pi_dict[pi] != invoice_month: + if pi in old_pi_dict and old_pi_dict[pi] != invoice_month: return True return False @@ -97,24 +97,24 @@ def main(): "--output-folder", required=False, default="pi_invoices", - help="Name of output folder containing pi-specific invoice csvs" + help="Name of output folder containing pi-specific invoice csvs", ) parser.add_argument( "--HU-invoice-file", required=False, default="HU_only.csv", - help="Name of output csv for HU invoices" + help="Name of output csv for HU invoices", ) parser.add_argument( "--HU-BU-invoice-file", required=False, default="HU_BU.csv", - help="Name of output csv for HU and BU invoices" + help="Name of output csv for HU and BU invoices", ) parser.add_argument( "--old-pi-file", required=False, - help="Name of csv file listing previously billed PIs" + help="Name of csv file listing previously billed PIs", ) args = parser.parse_args() merged_dataframe = merge_csv(args.csv_files) @@ -167,7 +167,7 @@ def get_invoice_date(dataframe): be the same for every row. """ invoice_date_str = dataframe[INVOICE_DATE_FIELD][0] - invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m') + invoice_date = pandas.to_datetime(invoice_date_str, format="%Y-%m") return invoice_date @@ -176,16 +176,22 @@ def timed_projects(timed_projects_file, invoice_date): dataframe = pandas.read_csv(timed_projects_file) # convert to pandas timestamp objects - dataframe['Start Date'] = pandas.to_datetime(dataframe['Start Date'], format="%Y-%m") - dataframe['End Date'] = pandas.to_datetime(dataframe['End Date'], format="%Y-%m") + dataframe["Start Date"] = pandas.to_datetime( + dataframe["Start Date"], format="%Y-%m" + ) + dataframe["End Date"] = pandas.to_datetime(dataframe["End Date"], format="%Y-%m") - mask = (dataframe['Start Date'] <= invoice_date) & (invoice_date <= dataframe['End Date']) - return dataframe[mask]['Project'].to_list() + mask = (dataframe["Start Date"] <= invoice_date) & ( + invoice_date <= dataframe["End Date"] + ) + return dataframe[mask]["Project"].to_list() def remove_non_billables(dataframe, pi, projects): """Removes projects and PIs that should not be billed from the dataframe""" - filtered_dataframe = dataframe[~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects)] + filtered_dataframe = dataframe[ + ~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects) + ] return filtered_dataframe @@ -194,14 +200,16 @@ def remove_billables(dataframe, pi, projects, output_file): So this *keeps* the projects/pis that should not be billed. """ - filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)] + filtered_dataframe = dataframe[ + dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects) + ] filtered_dataframe.to_csv(output_file, index=False) def validate_pi_names(dataframe): invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])] for i, row in invalid_pi_projects.iterrows(): - print(f'Warning: Project {row[PROJECT_FIELD]} has empty PI field') + print(f"Warning: Project {row[PROJECT_FIELD]} has empty PI field") dataframe = dataframe[~pandas.isna(dataframe[PI_FIELD])] return dataframe @@ -219,11 +227,13 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder): pi_list = dataframe[PI_FIELD].unique() for pi in pi_list: - if pandas.isna(pi): + if pandas.isna(pi): continue pi_projects = dataframe[dataframe[PI_FIELD] == pi] pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0] - pi_projects.to_csv(output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv") + pi_projects.to_csv( + output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv" + ) def apply_credits_new_pi(dataframe, old_pi_file): @@ -251,24 +261,24 @@ def apply_credits_new_pi(dataframe, old_pi_file): project_cost = row[COST_FIELD] applied_credit = min(project_cost, remaining_credit) - dataframe.at[i, CREDIT_FIELD] = applied_credit + dataframe.at[i, CREDIT_FIELD] = applied_credit dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - applied_credit remaining_credit -= applied_credit if remaining_credit == 0: break - + return dataframe def add_institution(dataframe: pandas.DataFrame): """Determine every PI's institution name, logging any PI whose institution cannot be determined - This is performed by `get_institution_from_pi()`, which tries to match the PI's username to + This is performed by `get_institution_from_pi()`, which tries to match the PI's username to a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if the username is not an email address. - - Exact matches are then mapped to the corresponding institution name. + + Exact matches are then mapped to the corresponding institution name. I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University" @@ -277,42 +287,50 @@ def add_institution(dataframe: pandas.DataFrame): institute_map = load_institute_map() for i, row in dataframe.iterrows(): pi_name = row[PI_FIELD] - if pandas.isna(pi_name): + if pandas.isna(pi_name): print(f"Project {row[PROJECT_FIELD]} has no PI") - else: - dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(institute_map, pi_name) + else: + dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi( + institute_map, pi_name + ) return dataframe def export_HU_only(dataframe, output_file): - HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == 'Harvard University'] + HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"] HU_projects.to_csv(output_file) def export_HU_BU(dataframe, output_file): - HU_BU_projects = dataframe[(dataframe[INSTITUTION_FIELD] == 'Harvard University') | - (dataframe[INSTITUTION_FIELD] == 'Boston University')] - HU_BU_projects.to_csv(output_file) + HU_BU_projects = dataframe[ + (dataframe[INSTITUTION_FIELD] == "Harvard University") + | (dataframe[INSTITUTION_FIELD] == "Boston University") + ] + HU_BU_projects.to_csv(output_file) def export_lenovo(dataframe: pandas.DataFrame, output_file=None): + lenovo_file_name = ( + output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv" + ) - lenovo_file_name = output_file or f'Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv' - - LENOVO_SU_TYPES = ['OpenShift GPUA100SXM4', 'OpenStack GPUA100SXM4'] + LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] SU_CHARGE_MULTIPLIER = 1 - lenovo_df = dataframe[dataframe[SU_TYPE_FIELD].isin(LENOVO_SU_TYPES)][[ - INVOICE_DATE_FIELD, - PROJECT_FIELD, - INSTITUTION_FIELD, - SU_HOURS_FIELD, - SU_TYPE_FIELD]] - - lenovo_df.rename(columns={SU_HOURS_FIELD: 'SU Hours'}, inplace=True) - lenovo_df.insert(len(lenovo_df.columns), 'SU Charge', SU_CHARGE_MULTIPLIER) - lenovo_df['Charge'] = lenovo_df['SU Hours'] * lenovo_df['SU Charge'] + lenovo_df = dataframe[dataframe[SU_TYPE_FIELD].isin(LENOVO_SU_TYPES)][ + [ + INVOICE_DATE_FIELD, + PROJECT_FIELD, + INSTITUTION_FIELD, + SU_HOURS_FIELD, + SU_TYPE_FIELD, + ] + ] + + lenovo_df.rename(columns={SU_HOURS_FIELD: "SU Hours"}, inplace=True) + lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER) + lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"] lenovo_df.to_csv(lenovo_file_name) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 35676f1..8f0c381 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -1,5 +1,4 @@ from unittest import TestCase -from unittest import skipIf import tempfile import pandas import os @@ -8,10 +7,11 @@ from process_report import process_report + class TestGetInvoiceDate(TestCase): def test_get_invoice_date(self): # The month in sample data is not the same - data = {'Invoice Month': ['2023-01', '2023-02', '2023-03']} + data = {"Invoice Month": ["2023-01", "2023-02", "2023-03"]} dataframe = pandas.DataFrame(data) invoice_date = process_report.get_invoice_date(dataframe) @@ -19,26 +19,27 @@ def test_get_invoice_date(self): self.assertIsInstance(invoice_date, pandas.Timestamp) # Assert that the invoice_date is the first item - expected_date = pandas.Timestamp('2023-01') + expected_date = pandas.Timestamp("2023-01") self.assertEqual(invoice_date, expected_date) class TestTimedProjects(TestCase): def setUp(self): - # Without the dedent method, our data will have leading spaces which # messes up the first key. Also the '\' is imporant to ignore the first # new line we added so it's more readable in code. - self.csv_data = dedent("""\ + self.csv_data = dedent( + """\ Project,Start Date,End Date ProjectA,2022-09,2023-08 ProjectB,2022-09,2023-09 ProjectC,2023-09,2024-08 ProjectD,2022-09,2024-08 - """) - self.invoice_date = pandas.Timestamp('2023-09') + """ + ) + self.invoice_date = pandas.Timestamp("2023-09") - self.csv_file = tempfile.NamedTemporaryFile(delete=False, mode='w') + self.csv_file = tempfile.NamedTemporaryFile(delete=False, mode="w") self.csv_file.write(self.csv_data) self.csv_file.close() @@ -46,24 +47,31 @@ def tearDown(self): os.remove(self.csv_file.name) def test_timed_projects(self): - excluded_projects = process_report.timed_projects(self.csv_file.name, self.invoice_date) + excluded_projects = process_report.timed_projects( + self.csv_file.name, self.invoice_date + ) - expected_projects = ['ProjectB', 'ProjectC', 'ProjectD'] + expected_projects = ["ProjectB", "ProjectC", "ProjectD"] self.assertEqual(excluded_projects, expected_projects) class TestRemoveNonBillables(TestCase): def setUp(self): - data = { - 'Manager (PI)': ['PI1', 'PI2', 'PI3', 'PI4', 'PI5'], - 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'], - 'Untouch Data Column': ['DataA', 'DataB', 'DataC', 'DataD', 'DataE'] + "Manager (PI)": ["PI1", "PI2", "PI3", "PI4", "PI5"], + "Project - Allocation": [ + "ProjectA", + "ProjectB", + "ProjectC", + "ProjectD", + "ProjectE", + ], + "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) - self.pi_to_exclude = ['PI2', 'PI3'] - self.projects_to_exclude = ['ProjectB', 'ProjectD'] + self.pi_to_exclude = ["PI2", "PI3"] + self.projects_to_exclude = ["ProjectB", "ProjectD"] self.output_file = tempfile.NamedTemporaryFile(delete=False) self.output_file2 = tempfile.NamedTemporaryFile(delete=False) @@ -73,54 +81,67 @@ def tearDown(self): os.remove(self.output_file2.name) def test_remove_non_billables(self): - billables_df = process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude) + billables_df = process_report.remove_non_billables( + self.dataframe, self.pi_to_exclude, self.projects_to_exclude + ) process_report.export_billables(billables_df, self.output_file.name) result_df = pandas.read_csv(self.output_file.name) - self.assertNotIn('PI2', result_df['Manager (PI)'].tolist()) - self.assertNotIn('PI3', result_df['Manager (PI)'].tolist()) - self.assertNotIn('PI4', result_df['Manager (PI)'].tolist()) # indirect because ProjectD was removed - self.assertNotIn('ProjectB', result_df['Project - Allocation'].tolist()) - self.assertNotIn('ProjectC', result_df['Project - Allocation'].tolist()) # indirect because PI3 was removed - self.assertNotIn('ProjectD', result_df['Project - Allocation'].tolist()) - - self.assertIn('PI1', result_df['Manager (PI)'].tolist()) - self.assertIn('PI5', result_df['Manager (PI)'].tolist()) - self.assertIn('ProjectA', result_df['Project - Allocation'].tolist()) - self.assertIn('ProjectE', result_df['Project - Allocation'].tolist()) + self.assertNotIn("PI2", result_df["Manager (PI)"].tolist()) + self.assertNotIn("PI3", result_df["Manager (PI)"].tolist()) + self.assertNotIn( + "PI4", result_df["Manager (PI)"].tolist() + ) # indirect because ProjectD was removed + self.assertNotIn("ProjectB", result_df["Project - Allocation"].tolist()) + self.assertNotIn( + "ProjectC", result_df["Project - Allocation"].tolist() + ) # indirect because PI3 was removed + self.assertNotIn("ProjectD", result_df["Project - Allocation"].tolist()) + + self.assertIn("PI1", result_df["Manager (PI)"].tolist()) + self.assertIn("PI5", result_df["Manager (PI)"].tolist()) + self.assertIn("ProjectA", result_df["Project - Allocation"].tolist()) + self.assertIn("ProjectE", result_df["Project - Allocation"].tolist()) def test_remove_billables(self): - process_report.remove_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude, self.output_file2.name) + process_report.remove_billables( + self.dataframe, + self.pi_to_exclude, + self.projects_to_exclude, + self.output_file2.name, + ) result_df = pandas.read_csv(self.output_file2.name) - self.assertIn('PI2', result_df['Manager (PI)'].tolist()) - self.assertIn('PI3', result_df['Manager (PI)'].tolist()) - self.assertIn('PI4', result_df['Manager (PI)'].tolist()) - self.assertIn('ProjectB', result_df['Project - Allocation'].tolist()) - self.assertIn('ProjectC', result_df['Project - Allocation'].tolist()) - self.assertIn('ProjectD', result_df['Project - Allocation'].tolist()) + self.assertIn("PI2", result_df["Manager (PI)"].tolist()) + self.assertIn("PI3", result_df["Manager (PI)"].tolist()) + self.assertIn("PI4", result_df["Manager (PI)"].tolist()) + self.assertIn("ProjectB", result_df["Project - Allocation"].tolist()) + self.assertIn("ProjectC", result_df["Project - Allocation"].tolist()) + self.assertIn("ProjectD", result_df["Project - Allocation"].tolist()) - self.assertNotIn('PI1', result_df['Manager (PI)'].tolist()) - self.assertNotIn('PI5', result_df['Manager (PI)'].tolist()) - self.assertNotIn('ProjectA', result_df['Project - Allocation'].tolist()) - self.assertNotIn('ProjectE', result_df['Project - Allocation'].tolist()) + self.assertNotIn("PI1", result_df["Manager (PI)"].tolist()) + self.assertNotIn("PI5", result_df["Manager (PI)"].tolist()) + self.assertNotIn("ProjectA", result_df["Project - Allocation"].tolist()) + self.assertNotIn("ProjectE", result_df["Project - Allocation"].tolist()) class TestMergeCSV(TestCase): def setUp(self): - self.header = ['ID', 'Name', 'Age'] + self.header = ["ID", "Name", "Age"] self.data = [ - [1, 'Alice', 25], - [2, 'Bob', 30], - [3, 'Charlie', 28], + [1, "Alice", 25], + [2, "Bob", 30], + [3, "Charlie", 28], ] self.csv_files = [] for _ in range(3): - csv_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') + csv_file = tempfile.NamedTemporaryFile( + delete=False, mode="w", suffix=".csv" + ) self.csv_files.append(csv_file) dataframe = pandas.DataFrame(self.data, columns=self.header) dataframe.to_csv(csv_file, index=False) @@ -131,10 +152,14 @@ def tearDown(self): os.remove(csv_file.name) def test_merge_csv(self): - merged_dataframe = process_report.merge_csv([csv_file.name for csv_file in self.csv_files]) + merged_dataframe = process_report.merge_csv( + [csv_file.name for csv_file in self.csv_files] + ) expected_rows = len(self.data) * 3 - self.assertEqual(len(merged_dataframe), expected_rows) # `len` for a pandas dataframe excludes the header row + self.assertEqual( + len(merged_dataframe), expected_rows + ) # `len` for a pandas dataframe excludes the header row # Assert that the headers in the merged DataFrame match the expected headers self.assertListEqual(merged_dataframe.columns.tolist(), self.header) @@ -142,13 +167,18 @@ def test_merge_csv(self): class TestExportPICSV(TestCase): def setUp(self): - data = { - 'Invoice Month': ['2023-01','2023-01','2023-01','2023-01','2023-01'], - 'Manager (PI)': ['PI1', 'PI1', 'PI1', 'PI2', 'PI2'], - 'Institution': ['BU', 'BU', 'BU', 'HU', 'HU'], - 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'], - 'Untouch Data Column': ['DataA', 'DataB', 'DataC', 'DataD', 'DataE'] + "Invoice Month": ["2023-01", "2023-01", "2023-01", "2023-01", "2023-01"], + "Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"], + "Institution": ["BU", "BU", "BU", "HU", "HU"], + "Project - Allocation": [ + "ProjectA", + "ProjectB", + "ProjectC", + "ProjectD", + "ProjectE", + ], + "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) @@ -160,70 +190,101 @@ def test_export_pi(self): pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv' self.assertIn(pi_csv_1, os.listdir(output_dir.name)) self.assertIn(pi_csv_2, os.listdir(output_dir.name)) - self.assertEqual(len(os.listdir(output_dir.name)), len(self.dataframe['Manager (PI)'].unique())) + self.assertEqual( + len(os.listdir(output_dir.name)), + len(self.dataframe["Manager (PI)"].unique()), + ) - pi_df = pandas.read_csv(output_dir.name + '/' + pi_csv_1) - self.assertEqual(len(pi_df['Manager (PI)'].unique()), 1) - self.assertEqual(pi_df['Manager (PI)'].unique()[0], self.dataframe['Manager (PI)'][0]) + pi_df = pandas.read_csv(output_dir.name + "/" + pi_csv_1) + self.assertEqual(len(pi_df["Manager (PI)"].unique()), 1) + self.assertEqual( + pi_df["Manager (PI)"].unique()[0], self.dataframe["Manager (PI)"][0] + ) - self.assertIn('ProjectA', pi_df['Project - Allocation'].tolist()) - self.assertIn('ProjectB', pi_df['Project - Allocation'].tolist()) - self.assertIn('ProjectC', pi_df['Project - Allocation'].tolist()) + self.assertIn("ProjectA", pi_df["Project - Allocation"].tolist()) + self.assertIn("ProjectB", pi_df["Project - Allocation"].tolist()) + self.assertIn("ProjectC", pi_df["Project - Allocation"].tolist()) - pi_df = pandas.read_csv(output_dir.name + '/' + pi_csv_2) - self.assertEqual(len(pi_df['Manager (PI)'].unique()), 1) - self.assertEqual(pi_df['Manager (PI)'].unique()[0], self.dataframe['Manager (PI)'][3]) + pi_df = pandas.read_csv(output_dir.name + "/" + pi_csv_2) + self.assertEqual(len(pi_df["Manager (PI)"].unique()), 1) + self.assertEqual( + pi_df["Manager (PI)"].unique()[0], self.dataframe["Manager (PI)"][3] + ) - self.assertIn('ProjectD', pi_df['Project - Allocation'].tolist()) - self.assertIn('ProjectE', pi_df['Project - Allocation'].tolist()) - self.assertNotIn('ProjectA', pi_df['Project - Allocation'].tolist()) - self.assertNotIn('ProjectB', pi_df['Project - Allocation'].tolist()) - self.assertNotIn('ProjectC', pi_df['Project - Allocation'].tolist()) + self.assertIn("ProjectD", pi_df["Project - Allocation"].tolist()) + self.assertIn("ProjectE", pi_df["Project - Allocation"].tolist()) + self.assertNotIn("ProjectA", pi_df["Project - Allocation"].tolist()) + self.assertNotIn("ProjectB", pi_df["Project - Allocation"].tolist()) + self.assertNotIn("ProjectC", pi_df["Project - Allocation"].tolist()) class TestGetInstitute(TestCase): def test_get_pi_institution(self): - institute_map = { - "harvard.edu" : "Harvard University", - "bu.edu" : "Boston University", - "bentley.edu" : "Bentley", - "mclean.harvard.edu" : "McLean Hospital", - "meei.harvard.edu" : "Massachusetts Eye & Ear", - "dfci.harvard.edu" : "Dana-Farber Cancer Institute", - "northeastern.edu" : "Northeastern University", + "harvard.edu": "Harvard University", + "bu.edu": "Boston University", + "bentley.edu": "Bentley", + "mclean.harvard.edu": "McLean Hospital", + "meei.harvard.edu": "Massachusetts Eye & Ear", + "dfci.harvard.edu": "Dana-Farber Cancer Institute", + "northeastern.edu": "Northeastern University", } - + self.assertEqual( - process_report.get_institution_from_pi(institute_map, "quanmp@bu.edu"), "Boston University" + process_report.get_institution_from_pi(institute_map, "quanmp@bu.edu"), + "Boston University", ) self.assertEqual( - process_report.get_institution_from_pi(institute_map, "c@mclean.harvard.edu"), "McLean Hospital" + process_report.get_institution_from_pi( + institute_map, "c@mclean.harvard.edu" + ), + "McLean Hospital", ) self.assertEqual( - process_report.get_institution_from_pi(institute_map, "b@harvard.edu"), "Harvard University" + process_report.get_institution_from_pi(institute_map, "b@harvard.edu"), + "Harvard University", ) self.assertEqual( process_report.get_institution_from_pi(institute_map, "fake"), "" ) self.assertEqual( - process_report.get_institution_from_pi(institute_map, "pi@northeastern.edu"), "Northeastern University" + process_report.get_institution_from_pi( + institute_map, "pi@northeastern.edu" + ), + "Northeastern University", ) class TestCredit0002(TestCase): def setUp(self): - data = { - 'Invoice Month': ['2024-03','2024-03','2024-03','2024-03','2024-03','2024-03'], - 'Manager (PI)': ['PI1', 'PI1', 'PI2', 'PI3', 'PI4', 'PI4'], - 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE', 'ProjectF'], - 'Cost': [10, 100, 10000, 5000, 800, 1000] + "Invoice Month": [ + "2024-03", + "2024-03", + "2024-03", + "2024-03", + "2024-03", + "2024-03", + ], + "Manager (PI)": ["PI1", "PI1", "PI2", "PI3", "PI4", "PI4"], + "Project - Allocation": [ + "ProjectA", + "ProjectB", + "ProjectC", + "ProjectD", + "ProjectE", + "ProjectF", + ], + "Cost": [10, 100, 10000, 5000, 800, 1000], } self.dataframe = pandas.DataFrame(data) - old_pi = ['PI2,2023-09', 'PI3,2024-02', 'PI4,2024-03'] # Case with old and new pi in pi file - old_pi_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') - for pi in old_pi: + old_pi = [ + "PI2,2023-09", + "PI3,2024-02", + "PI4,2024-03", + ] # Case with old and new pi in pi file + old_pi_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv") + for pi in old_pi: old_pi_file.write(pi + "\n") self.old_pi_file = old_pi_file.name @@ -231,84 +292,123 @@ def tearDown(self): os.remove(self.old_pi_file) def test_apply_credit_0002(self): - dataframe = process_report.apply_credits_new_pi(self.dataframe, self.old_pi_file) + dataframe = process_report.apply_credits_new_pi( + self.dataframe, self.old_pi_file + ) - self.assertTrue('Credit' in dataframe) - self.assertTrue('Credit Code' in dataframe) - self.assertTrue('Balance' in dataframe) + self.assertTrue("Credit" in dataframe) + self.assertTrue("Credit Code" in dataframe) + self.assertTrue("Balance" in dataframe) - non_credited_project = dataframe[pandas.isna(dataframe['Credit Code'])] - credited_projects = dataframe[dataframe['Credit Code'] == '0002'] + non_credited_project = dataframe[pandas.isna(dataframe["Credit Code"])] + credited_projects = dataframe[dataframe["Credit Code"] == "0002"] self.assertEqual(2, len(non_credited_project)) - self.assertEqual(non_credited_project.loc[2, 'Cost'], non_credited_project.loc[2, 'Balance']) - self.assertEqual(non_credited_project.loc[3, 'Cost'], non_credited_project.loc[3, 'Balance']) - + self.assertEqual( + non_credited_project.loc[2, "Cost"], non_credited_project.loc[2, "Balance"] + ) + self.assertEqual( + non_credited_project.loc[3, "Cost"], non_credited_project.loc[3, "Balance"] + ) self.assertEqual(4, len(credited_projects.index)) - self.assertTrue('PI2' not in credited_projects['Manager (PI)'].unique()) - self.assertTrue('PI3' not in credited_projects['Manager (PI)'].unique()) + self.assertTrue("PI2" not in credited_projects["Manager (PI)"].unique()) + self.assertTrue("PI3" not in credited_projects["Manager (PI)"].unique()) - self.assertEqual(10, credited_projects.loc[0, 'Credit']) - self.assertEqual(100, credited_projects.loc[1, 'Credit']) - self.assertEqual(800, credited_projects.loc[4, 'Credit']) - self.assertEqual(200, credited_projects.loc[5, 'Credit']) + self.assertEqual(10, credited_projects.loc[0, "Credit"]) + self.assertEqual(100, credited_projects.loc[1, "Credit"]) + self.assertEqual(800, credited_projects.loc[4, "Credit"]) + self.assertEqual(200, credited_projects.loc[5, "Credit"]) - self.assertEqual(0, credited_projects.loc[0, 'Balance']) - self.assertEqual(0, credited_projects.loc[1, 'Balance']) - self.assertEqual(0, credited_projects.loc[4, 'Balance']) - self.assertEqual(800, credited_projects.loc[5, 'Balance']) + self.assertEqual(0, credited_projects.loc[0, "Balance"]) + self.assertEqual(0, credited_projects.loc[1, "Balance"]) + self.assertEqual(0, credited_projects.loc[4, "Balance"]) + self.assertEqual(800, credited_projects.loc[5, "Balance"]) class TestValidateBillables(TestCase): - def setUp(self): - data = { - 'Manager (PI)': ['PI1', math.nan, 'PI1', 'PI2', 'PI2'], - 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'], + "Manager (PI)": ["PI1", math.nan, "PI1", "PI2", "PI2"], + "Project - Allocation": [ + "ProjectA", + "ProjectB", + "ProjectC", + "ProjectD", + "ProjectE", + ], } self.dataframe = pandas.DataFrame(data) def test_validate_billables(self): - self.assertEqual(1, len(self.dataframe[pandas.isna(self.dataframe['Manager (PI)'])])) + self.assertEqual( + 1, len(self.dataframe[pandas.isna(self.dataframe["Manager (PI)"])]) + ) validated_df = process_report.validate_pi_names(self.dataframe) - self.assertEqual(0, len(validated_df[pandas.isna(validated_df['Manager (PI)'])])) + self.assertEqual( + 0, len(validated_df[pandas.isna(validated_df["Manager (PI)"])]) + ) class TestExportLenovo(TestCase): def setUp(self): - data = { - 'Invoice Month': ['2023-01','2023-01','2023-01','2023-01','2023-01', '2023-01'], - 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE', 'ProjectF'], - 'Institution': ['A', 'B', 'C', 'D', 'E', 'F'], - 'SU Hours (GBhr or SUhr)': [1, 10, 100, 4, 432, 10], - 'SU Type': ['OpenShift GPUA100SXM4', 'OpenShift GPUA100', 'OpenShift GPUA100SXM4', 'OpenStack GPUA100SXM4', 'OpenStack CPU', 'OpenStack GPUK80'] + "Invoice Month": [ + "2023-01", + "2023-01", + "2023-01", + "2023-01", + "2023-01", + "2023-01", + ], + "Project - Allocation": [ + "ProjectA", + "ProjectB", + "ProjectC", + "ProjectD", + "ProjectE", + "ProjectF", + ], + "Institution": ["A", "B", "C", "D", "E", "F"], + "SU Hours (GBhr or SUhr)": [1, 10, 100, 4, 432, 10], + "SU Type": [ + "OpenShift GPUA100SXM4", + "OpenShift GPUA100", + "OpenShift GPUA100SXM4", + "OpenStack GPUA100SXM4", + "OpenStack CPU", + "OpenStack GPUK80", + ], } self.dataframe = pandas.DataFrame(data) - output_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') + output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv") self.output_file = output_file.name def tearDown(self): os.remove(self.output_file) - def test_apply_credit_0002(self): process_report.export_lenovo(self.dataframe, self.output_file) output_df = pandas.read_csv(self.output_file) - self.assertTrue(set([ - process_report.INVOICE_DATE_FIELD, - process_report.PROJECT_FIELD, - process_report.INSTITUTION_FIELD, - process_report.SU_TYPE_FIELD, - 'SU Hours', - 'SU Charge', - 'Charge', - ]).issubset(output_df)) - + self.assertTrue( + set( + [ + process_report.INVOICE_DATE_FIELD, + process_report.PROJECT_FIELD, + process_report.INSTITUTION_FIELD, + process_report.SU_TYPE_FIELD, + "SU Hours", + "SU Charge", + "Charge", + ] + ).issubset(output_df) + ) + for i, row in output_df.iterrows(): - self.assertIn(row[process_report.SU_TYPE_FIELD], ['OpenShift GPUA100SXM4', 'OpenStack GPUA100SXM4']) - self.assertEqual(row['Charge'], row['SU Charge'] * row['SU Hours']) + self.assertIn( + row[process_report.SU_TYPE_FIELD], + ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"], + ) + self.assertEqual(row["Charge"], row["SU Charge"] * row["SU Hours"]) diff --git a/requirements.txt b/requirements.txt index 1411a4a..fb6c7ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pandas \ No newline at end of file +pandas