-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added processing to apply project credits, determine the institution name for each PI, and export HU and BU invoices
- Loading branch information
Showing
3 changed files
with
303 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
{ | ||
"northeastern.edu" : "Northeastern University", | ||
"bu.edu" : "Boston University", | ||
"bentley.edu" : "Bentley", | ||
"uri.edu" : "University of Rhode Island", | ||
"redhat.com" : "Red Hat", | ||
"childrens.harvard.edu" : "Boston Childrens Hospital", | ||
"mclean.harvard.edu" : "McLean Hospital", | ||
"meei.harvard.edu" : "Massachusetts Eye & Ear", | ||
"dfci.harvard.edu" : "Dana-Farber Cancer Institute", | ||
"bwh.harvard.edu" : "Brigham and Women's Hospital", | ||
"bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center", | ||
"fas.harvard.edu" : "Harvard University", | ||
"cga.harvard.edu" : "Harvard University", | ||
"iq.harvard.edu" : "Harvard University", | ||
"hks.harvard.edu" : "Harvard University", | ||
"hsph.harvard.edu" : "Harvard University", | ||
"seas.harvard.edu" : "Harvard University", | ||
"gse.harvard.edu" : "Harvard University", | ||
"gov.harvard.edu" : "Harvard University", | ||
"oeb.harvard.edu" : "Harvard University", | ||
"harvard.edu" : "Harvard University", | ||
"wpi.edu" : "Worcester Polytechnic Institute", | ||
"mit.edu" : "Massachusetts Institute of Technology", | ||
"umass.edu" : "University of Massachusetts Amherst", | ||
"uml.edu" : "University of Massachusetts Lowell", | ||
"codeforboston.org" : "Code For Boston", | ||
"yale.edu" : "Yale University", | ||
"mmsh" : "Harvard University", | ||
"gstuart" : "University of Massachusetts Amherst", | ||
"rudolph" : "Boston Childrens Hospital", | ||
"robbaron" : "Boston University", | ||
"kmdalton" : "Harvard University", | ||
"mzink" : "University of Massachusetts Amherst", | ||
"francesco.pontiggia" : "Harvard University" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,67 @@ | ||
import argparse | ||
import os | ||
import sys | ||
|
||
import json | ||
import pandas | ||
|
||
|
||
### Invoice field names
# Column headers of the invoice CSV files, kept in one place so every
# function refers to a column by constant rather than by string literal.

# Invoice / project identification
INVOICE_DATE_FIELD = "Invoice Month"
PROJECT_FIELD = "Project - Allocation"
PROJECT_ID_FIELD = "Project - Allocation ID"

# PI and billing contact
PI_FIELD = "Manager (PI)"
INVOICE_EMAIL_FIELD = "Invoice Email"
INVOICE_ADDRESS_FIELD = "Invoice Address"
INSTITUTION_FIELD = "Institution"
INSTITUTION_ID_FIELD = "Institution - Specific Code"

# Usage and charges
SU_HOURS_FIELD = "SU Hours (GBhr or SUhr)"
SU_TYPE_FIELD = "SU Type"
COST_FIELD = "Cost"
CREDIT_FIELD = "Credit"
CREDIT_CODE_FIELD = "Credit Code"
BALANCE_FIELD = "Balance"
###
|
||
|
||
def get_institution_from_pi(institute_map, pi_uname):
    """Return the institution name mapped to a PI username.

    The lookup key is the text after the last ``@`` in ``pi_uname`` (the
    e-mail domain), or the whole username when it contains no ``@``. The key
    is first matched exactly against ``institute_map``. If that fails, a
    second lookup is made with surrounding whitespace removed and the key
    lower-cased, so ``"User@BU.EDU "`` resolves like ``"user@bu.edu"``.

    Only whole keys are matched; an unknown sub-domain is deliberately NOT
    folded into its parent domain, because the caller's map can assign a
    sub-domain to a different institution than its parent.

    Args:
        institute_map: Mapping of lookup key (domain or bare username) to
            institution name.
        pi_uname: PI username, usually an e-mail address.

    Returns:
        The institution name, or ``''`` (after printing a warning) when no
        entry matches.
    """
    institution_key = pi_uname.split('@')[-1]
    institution_name = institute_map.get(institution_key, '')

    if not institution_name:
        # E-mail domains are case-insensitive and CSV cells often carry stray
        # whitespace; retry with a normalized key before giving up.
        normalized_key = institution_key.strip().lower()
        institution_name = institute_map.get(normalized_key, '')

    if not institution_name:
        print(f"Warning: PI name {pi_uname} does not match any institution!")
        return ''

    return institution_name
|
||
|
||
def load_institute_map() -> dict:
    """Load the institution lookup table from ``institute_map.json``.

    The file name is relative, so it is resolved against the current working
    directory.

    Returns:
        The parsed JSON content of the file.
    """
    with open('institute_map.json', 'r') as map_file:
        return json.load(map_file)
|
||
|
||
def load_old_pis(old_pi_file):
    """Read the list of previously billed PIs.

    Each non-blank line of the file must have the form ``<pi>,<first_month>``.
    Whitespace around either field is ignored and blank lines are skipped.

    Args:
        old_pi_file: Path of the old-PI file, or ``None`` when no path was
            supplied.

    Returns:
        A dict mapping each PI to the first-month string on that PI's line.

    Raises:
        SystemExit: With exit status 1 when ``old_pi_file`` is ``None`` or the
            file does not exist.
        ValueError: When a non-blank line does not contain exactly two
            comma-separated fields.
    """
    if old_pi_file is None:
        # The --old-pi-file argument is optional and has no default, so a
        # missing path arrives here as None; open(None) would raise an
        # uncaught TypeError instead of the intended clean exit.
        print('Applying credit 0002 failed. Old PI file was not provided')
        sys.exit(1)

    old_pi_dict = dict()

    try:
        with open(old_pi_file) as f:
            for pi_info in f:
                pi_info = pi_info.strip()
                if not pi_info:
                    # Tolerate blank lines, e.g. a trailing newline at EOF.
                    continue
                pi, first_month = (field.strip() for field in pi_info.split(','))
                old_pi_dict[pi] = first_month
    except FileNotFoundError:
        print('Applying credit 0002 failed. Old PI file does not exist')
        sys.exit(1)

    return old_pi_dict
|
||
|
||
def is_old_pi(old_pi_dict, pi, invoice_month):
    """Tell whether ``pi`` counts as a previously billed PI.

    A PI is "old" when it appears in ``old_pi_dict`` and the month recorded
    there differs from ``invoice_month``. A PI whose recorded month equals the
    invoice month is not old.
    """
    if pi not in old_pi_dict:
        return False
    return old_pi_dict[pi] != invoice_month
|
||
|
||
def main(): | ||
"""Remove non-billable PIs and projects""" | ||
|
||
|
@@ -41,6 +99,23 @@ def main(): | |
default="pi_invoices", | ||
help="Name of output folder containing pi-specific invoice csvs" | ||
) | ||
parser.add_argument( | ||
"--HU-invoice-file", | ||
required=False, | ||
default="HU_only.csv", | ||
help="Name of output csv for HU invoices" | ||
) | ||
parser.add_argument( | ||
"--HU-BU-invoice-file", | ||
required=False, | ||
default="HU_BU.csv", | ||
help="Name of output csv for HU and BU invoices" | ||
) | ||
parser.add_argument( | ||
"--old-pi-file", | ||
required=False, | ||
help="Name of csv file listing previously billed PIs" | ||
) | ||
args = parser.parse_args() | ||
merged_dataframe = merge_csv(args.csv_files) | ||
|
||
|
@@ -60,9 +135,16 @@ def main(): | |
|
||
projects = list(set(projects + timed_projects_list)) | ||
|
||
billable_projects = remove_non_billables(merged_dataframe, pi, projects, args.output_file) | ||
merged_dataframe = add_institution(merged_dataframe) | ||
remove_billables(merged_dataframe, pi, projects, "non_billable.csv") | ||
|
||
billable_projects = remove_non_billables(merged_dataframe, pi, projects) | ||
billable_projects = validate_pi_names(billable_projects) | ||
credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file) | ||
export_billables(credited_projects, args.output_file) | ||
export_pi_billables(billable_projects, args.output_folder) | ||
export_HU_only(billable_projects, args.HU_invoice_file) | ||
export_HU_BU(billable_projects, args.HU_BU_invoice_file) | ||
|
||
|
||
def merge_csv(files): | ||
|
@@ -83,7 +165,7 @@ def get_invoice_date(dataframe): | |
Note that it only checks the first entry because it should | ||
be the same for every row. | ||
""" | ||
invoice_date_str = dataframe['Invoice Month'][0] | ||
invoice_date_str = dataframe[INVOICE_DATE_FIELD][0] | ||
invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m') | ||
return invoice_date | ||
|
||
|
@@ -100,10 +182,9 @@ def timed_projects(timed_projects_file, invoice_date): | |
return dataframe[mask]['Project'].to_list() | ||
|
||
|
||
def remove_non_billables(dataframe, pi, projects):
    """Removes projects and PIs that should not be billed from the dataframe

    A row is kept only when its PI is not in ``pi`` AND its project is not in
    ``projects``. The result is returned rather than written to disk.

    Args:
        dataframe: Invoice dataframe containing the PI and project columns.
        pi: Collection of PI names to exclude.
        projects: Collection of project names to exclude.

    Returns:
        The filtered dataframe.
    """
    filtered_dataframe = dataframe[~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects)]
    return filtered_dataframe
|
||
|
||
|
@@ -112,21 +193,107 @@ def remove_billables(dataframe, pi, projects, output_file): | |
So this *keeps* the projects/pis that should not be billed. | ||
""" | ||
filtered_dataframe = dataframe[dataframe['Manager (PI)'].isin(pi) | dataframe['Project - Allocation'].isin(projects)] | ||
filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)] | ||
filtered_dataframe.to_csv(output_file, index=False) | ||
|
||
|
||
def validate_pi_names(dataframe):
    """Drop rows whose PI field is empty, printing a warning for each one.

    Args:
        dataframe: Invoice dataframe containing the PI and project columns.

    Returns:
        The rows of ``dataframe`` whose PI field is not NA.
    """
    missing_pi = pandas.isna(dataframe[PI_FIELD])

    for _, project_row in dataframe[missing_pi].iterrows():
        print(f'Warning: Project {project_row[PROJECT_FIELD]} has empty PI field')

    return dataframe[~missing_pi]
|
||
|
||
def export_billables(dataframe, output_file):
    """Write ``dataframe`` to ``output_file`` as CSV, without the index column."""
    dataframe.to_csv(path_or_buf=output_file, index=False)
|
||
|
||
def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
    """Write one CSV per PI into ``output_folder``.

    Each file is named ``<institution>_<pi>_<invoice month>.csv``. The
    institution and invoice month are read from the first row of the PI's
    projects and of the dataframe respectively. Rows with an empty PI field
    are skipped.

    Args:
        dataframe: Billable projects; must contain the invoice month, PI and
            institution columns and at least one row.
        output_folder: Directory to write into; created (including missing
            parents) when it does not exist.
    """
    # exist_ok avoids the check-then-create race and supports nested paths.
    os.makedirs(output_folder, exist_ok=True)

    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
    pi_list = dataframe[PI_FIELD].unique()

    for pi in pi_list:
        if pandas.isna(pi):
            continue
        pi_projects = dataframe[dataframe[PI_FIELD] == pi]
        pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0]
        file_name = f"{pi_instituition}_{pi}_{invoice_month}.csv"
        pi_projects.to_csv(os.path.join(output_folder, file_name))
|
||
|
||
|
||
def apply_credits_new_pi(dataframe, old_pi_file):
    """Apply the new-PI credit (code 0002) and compute each project's balance.

    Adds the Credit, Credit Code and Balance columns to ``dataframe`` in
    place. Every PI that ``is_old_pi`` does not classify as old receives up to
    1000 of credit, consumed across that PI's projects in row order. Projects
    that receive no credit keep an empty Credit / Credit Code and a Balance
    equal to their Cost.

    Args:
        dataframe: Billable projects with invoice month, PI and cost columns.
        old_pi_file: Path of the file listing previously billed PIs.

    Returns:
        The same ``dataframe`` object, with the three columns filled in.
    """
    new_pi_credit_code = "0002"
    new_pi_credit_amount = 1000

    dataframe[CREDIT_FIELD] = None
    dataframe[CREDIT_CODE_FIELD] = None
    # Start from "nothing credited": balance equals cost. Rows are only
    # rewritten below when a credit is actually applied, so projects reached
    # after a PI's credit is exhausted no longer end up with a balance of 0.
    dataframe[BALANCE_FIELD] = dataframe[COST_FIELD]

    old_pi_dict = load_old_pis(old_pi_file)

    if dataframe.empty:
        # Nothing to credit; .iat[0] below would raise IndexError.
        return dataframe

    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]

    for pi in dataframe[PI_FIELD].dropna().unique():
        if is_old_pi(old_pi_dict, pi, invoice_month):
            continue

        remaining_credit = new_pi_credit_amount
        pi_projects = dataframe[dataframe[PI_FIELD] == pi]

        for i, row in pi_projects.iterrows():
            if remaining_credit <= 0:
                break

            project_cost = row[COST_FIELD]
            applied_credit = min(project_cost, remaining_credit)

            dataframe.at[i, CREDIT_FIELD] = applied_credit
            dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code
            dataframe.at[i, BALANCE_FIELD] = project_cost - applied_credit
            remaining_credit -= applied_credit

    return dataframe
|
||
|
||
def add_institution(dataframe: pandas.DataFrame):
    """Fill in the institution name for every row that has a PI.

    The mapping is loaded with `load_institute_map()` and each PI username is
    resolved by `get_institution_from_pi()`, which looks up the part of the
    username after the last "@" (e.g. "bu.edu" for "user@bu.edu", mapped to
    "Boston University"), or the whole username when it is not an e-mail
    address (e.g. "rudolph"). Only exact key matches are resolved.
    Rows without a PI are reported on stdout and left untouched.

    Returns the same dataframe, modified in place.
    """
    institute_map = load_institute_map()

    rows = zip(dataframe.index, dataframe[PI_FIELD], dataframe[PROJECT_FIELD])
    for row_label, pi_name, project_name in rows:
        if pandas.isna(pi_name):
            print(f"Project {project_name} has no PI")
            continue
        dataframe.at[row_label, INSTITUTION_FIELD] = get_institution_from_pi(institute_map, pi_name)

    return dataframe
|
||
|
||
def export_HU_only(dataframe, output_file):
    """Write the rows whose institution is 'Harvard University' to ``output_file`` as CSV."""
    is_harvard = dataframe[INSTITUTION_FIELD] == 'Harvard University'
    dataframe[is_harvard].to_csv(output_file)
|
||
|
||
def export_HU_BU(dataframe, output_file):
    """Write the Harvard University and Boston University rows to ``output_file`` as CSV."""
    wanted_institutions = ['Harvard University', 'Boston University']
    selected = dataframe[INSTITUTION_FIELD].isin(wanted_institutions)
    dataframe[selected].to_csv(output_file)
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,11 @@ | ||
from unittest import TestCase | ||
from unittest import skipIf | ||
import tempfile | ||
import pandas | ||
import os | ||
import math | ||
from textwrap import dedent | ||
|
||
from process_report import process_report | ||
|
||
class TestGetInvoiceDate(TestCase): | ||
|
@@ -70,7 +73,8 @@ def tearDown(self): | |
os.remove(self.output_file2.name) | ||
|
||
def test_remove_non_billables(self): | ||
process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude, self.output_file.name) | ||
billables_df = process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude) | ||
process_report.export_billables(billables_df, self.output_file.name) | ||
|
||
result_df = pandas.read_csv(self.output_file.name) | ||
|
||
|
@@ -175,3 +179,87 @@ def test_export_pi(self): | |
self.assertNotIn('ProjectA', pi_df['Project - Allocation'].tolist()) | ||
self.assertNotIn('ProjectB', pi_df['Project - Allocation'].tolist()) | ||
self.assertNotIn('ProjectC', pi_df['Project - Allocation'].tolist()) | ||
|
||
|
||
class TestGetInstitute(TestCase):
    """Checks the username-to-institution lookup."""

    def test_get_pi_institution(self):
        # get_institution_from_pi() takes the mapping as its first argument;
        # an inline map keeps the test independent of institute_map.json and
        # of the working directory.
        institute_map = {
            "harvard.edu": "Harvard University",
            "bu.edu": "Boston University",
            "mclean.harvard.edu": "McLean Hospital",
            "northeastern.edu": "Northeastern University",
        }

        # NOTE(review): the addresses below are illustrative placeholders;
        # only the domain part matters to the lookup.
        self.assertEqual(
            process_report.get_institution_from_pi(institute_map, "user@bu.edu"),
            "Boston University",
        )
        self.assertEqual(
            process_report.get_institution_from_pi(institute_map, "user@mclean.harvard.edu"),
            "McLean Hospital",
        )
        self.assertEqual(
            process_report.get_institution_from_pi(institute_map, "user@harvard.edu"),
            "Harvard University",
        )
        self.assertEqual(
            process_report.get_institution_from_pi(institute_map, "fake"),
            "",
        )
        self.assertEqual(
            process_report.get_institution_from_pi(institute_map, "user@northeastern.edu"),
            "Northeastern University",
        )
|
||
|
||
class TestCredit0002(TestCase):
    """Checks application of the new-PI credit (code 0002)."""

    def setUp(self):
        data = {
            'Invoice Month': ['2024-03', '2024-03', '2024-03', '2024-03', '2024-03', '2024-03'],
            'Manager (PI)': ['PI1', 'PI1', 'PI2', 'PI3', 'PI4', 'PI4'],
            'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE', 'ProjectF'],
            'Cost': [10, 100, 10000, 5000, 800, 1000]
        }
        self.dataframe = pandas.DataFrame(data)
        old_pi = ['PI2,2023-09', 'PI3,2024-02', 'PI4,2024-03']  # Case with old and new pi in pi file

        # Close the file explicitly so its contents are flushed to disk
        # before the code under test reopens it by name.
        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') as old_pi_file:
            for pi in old_pi:
                old_pi_file.write(pi + "\n")
        self.old_pi_file = old_pi_file.name

    def tearDown(self):
        os.remove(self.old_pi_file)

    def test_apply_credit_0002(self):
        dataframe = process_report.apply_credits_new_pi(self.dataframe, self.old_pi_file)

        self.assertTrue('Credit' in dataframe)
        self.assertTrue('Credit Code' in dataframe)
        self.assertTrue('Balance' in dataframe)

        non_credited_project = dataframe[pandas.isna(dataframe['Credit Code'])]
        credited_projects = dataframe[dataframe['Credit Code'] == '0002']

        # PI2 and PI3 were first billed in an earlier month: no credit.
        self.assertEqual(2, len(non_credited_project))
        self.assertEqual(non_credited_project.loc[2, 'Cost'], non_credited_project.loc[2, 'Balance'])
        self.assertEqual(non_credited_project.loc[3, 'Cost'], non_credited_project.loc[3, 'Balance'])

        # PI1 (not in the file) and PI4 (first month == invoice month) are new.
        self.assertEqual(4, len(credited_projects.index))
        self.assertTrue('PI2' not in credited_projects['Manager (PI)'].unique())
        self.assertTrue('PI3' not in credited_projects['Manager (PI)'].unique())

        self.assertEqual(10, credited_projects.loc[0, 'Credit'])
        self.assertEqual(100, credited_projects.loc[1, 'Credit'])
        self.assertEqual(800, credited_projects.loc[4, 'Credit'])
        self.assertEqual(200, credited_projects.loc[5, 'Credit'])

        self.assertEqual(0, credited_projects.loc[0, 'Balance'])
        self.assertEqual(0, credited_projects.loc[1, 'Balance'])
        self.assertEqual(0, credited_projects.loc[4, 'Balance'])
        self.assertEqual(800, credited_projects.loc[5, 'Balance'])
|
||
|
||
class TestValidateBillables(TestCase):
    """Checks that rows with an empty PI field are dropped."""

    def setUp(self):
        data = {
            'Manager (PI)': ['PI1', math.nan, 'PI1', 'PI2', 'PI2'],
            'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'],
        }
        self.dataframe = pandas.DataFrame(data)

    def test_validate_billables(self):
        self.assertEqual(1, len(self.dataframe[pandas.isna(self.dataframe['Manager (PI)'])]))
        # The function under test is named validate_pi_names in process_report.
        validated_df = process_report.validate_pi_names(self.dataframe)
        self.assertEqual(0, len(validated_df[pandas.isna(validated_df['Manager (PI)'])]))