Skip to content

Commit

Permalink
Added processing to apply project credits, determine institution name…
Browse files Browse the repository at this point in the history
… for each PI, and exporting HU and BU invoices
  • Loading branch information
QuanMPhm committed Apr 5, 2024
1 parent 1ec80b2 commit 12dbf39
Show file tree
Hide file tree
Showing 3 changed files with 245 additions and 8 deletions.
27 changes: 27 additions & 0 deletions process_report/institute_map.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"northeastern.edu" : "Northeastern University",
"bu.edu" : "Boston University",
"bentley.edu" : "Bentley",
"uri.edu" : "University of Rhode Island",
"redhat.com" : "Red Hat",
"childrens.harvard.edu" : "Boston Childrens Hospital",
"mclean.harvard.edu" : "McLean Hospital",
"meei.harvard.edu" : "Massachusetts Eye & Ear",
"dfci.harvard.edu" : "Dana-Farber Cancer Institute",
"bwh.harvard.edu" : "Brigham and Women's Hospital",
"bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center",
"harvard.edu" : "Harvard University",
"wpi.edu" : "Worcester Polytechnic Institute",
"mit.edu" : "Massachusetts Institute of Technology",
"umass.edu" : "University of Massachusetts Amherst",
"uml.edu" : "University of Massachusetts Lowell",
"codeforboston.org" : "Code For Boston",
"mmsh" : "Harvard University",
"gstuart" : "University of Massachusetts Amherst",
"rudolph" : "Boston Childrens Hospital",
"robbaron" : "Boston University",
"kmdalton" : "Harvard University",
"mzink" : "University of Massachusetts Amherst",
"yale.edu" : "Yale University",
"francesco.pontiggia" : "Harvard University"
}
163 changes: 155 additions & 8 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,101 @@
import argparse
import os
import sys

import json
import pandas


### Invoice field names
# Column headers of the invoice dataframes. The functions in this module
# index dataframes through these constants instead of repeating the literals.
INVOICE_DATE_FIELD = 'Invoice Month'  # parsed with format '%Y-%m' in get_invoice_date
PROJECT_FIELD = 'Project - Allocation'
PROJECT_ID_FIELD = 'Project - Allocation ID'
PI_FIELD = 'Manager (PI)'
INVOICE_EMAIL_FIELD = 'Invoice Email'
INVOICE_ADDRESS_FIELD = 'Invoice Address'
INSTITUTION_FIELD = 'Institution'  # filled in by add_institution
INSTITUTION_ID_FIELD = 'Institution - Specific Code'
SU_HOURS_FIELD = 'SU Hours (GBhr or SUhr)'
SU_TYPE_FIELD = 'SU Type'
COST_FIELD = 'Cost'
# The next three columns are inserted right after COST_FIELD by add_credits
CREDIT_FIELD = 'Credit'
CREDIT_CODE_FIELD = 'Credit Code'
BALANCE_FIELD = 'Balance'
###


def apply_credits_0001(dataframe):
    """Placeholder for credit code "0001"; no credit logic is implemented yet.

    The dataframe is returned unchanged. Returning it (rather than falling
    through and returning None) matters because add_credits rebinds the
    dataframe to the return value of every credit function it runs.
    """
    return dataframe


def apply_credits_0002(dataframe):
    """Apply the New PI Credit (credit code "0002") to the dataframe.

    The env var `C0002_OLD_PI` must point to a text file listing one old PI
    per line. Every PI in the dataframe that is not in that file receives up
    to 1000 of credit, consumed by the PI's projects in row order. For each
    project that receives credit, the Credit, Credit Code and Balance cells
    are written; all other rows are left untouched.

    If the env var is unset or the file does not exist, a message is printed
    and the process exits with status 1.

    Returns the same dataframe, modified in place.
    """
    credit_code = "0002"
    credit_amount = 1000

    old_pi_path = os.environ.get("C0002_OLD_PI")
    if old_pi_path is None:
        print("Applying credit 0002 failed. C0002_OLD_PI env var is not set")
        sys.exit(1)

    try:
        with open(old_pi_path) as old_pi_file:
            known_pis = {line.strip() for line in old_pi_file}
    except FileNotFoundError:
        print("Applying credit 0002 failed. Old PI file does not exist")
        sys.exit(1)

    print("Old pi list: ", known_pis)

    for pi_name in dataframe[PI_FIELD].unique():
        # Skip rows without a PI, and PIs that are not new
        if pandas.isna(pi_name) or pi_name in known_pis:
            continue

        remaining = credit_amount
        for idx, project in dataframe[dataframe[PI_FIELD] == pi_name].iterrows():
            cost = project[COST_FIELD]
            uses_up_credit = cost >= remaining

            # The project that exhausts the credit gets whatever is left;
            # earlier projects are covered in full.
            dataframe.at[idx, CREDIT_FIELD] = remaining if uses_up_credit else cost
            dataframe.at[idx, CREDIT_CODE_FIELD] = credit_code
            dataframe.at[idx, BALANCE_FIELD] = cost - remaining if uses_up_credit else 0

            if uses_up_credit:
                break
            remaining -= cost

    return dataframe


# Credit functions that add_credits runs, in list order. Each one is called
# with the dataframe, and its return value becomes the dataframe passed on.
applied_credits = [apply_credits_0002]


def get_institution_from_pi(pi_uname):
    """Return the institution name for a PI, or '' when none is known.

    The lookup table is read from `institute_map.json`, located in the same
    directory as this module. When `pi_uname` contains an '@', the text after
    the last '@' (the email domain) is the lookup key; otherwise the whole
    username is. A domain without an exact entry is retried with surrounding
    whitespace removed and lower-cased, because domain names are not case
    sensitive.

    A message is printed when no institution matches.
    """
    # Build the path with os.path: if __file__ were a bare relative filename,
    # dirname would be '' and plain string concatenation would point at
    # "/institute_map.json" in the filesystem root.
    map_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'institute_map.json'
    )
    with open(map_path, 'r', encoding='utf-8') as f:
        institute_map = json.load(f)

    if '@' in pi_uname:
        domain = pi_uname.rpartition('@')[2]
        institute_name = institute_map.get(domain, '')
        if institute_name == '':
            # Exact match failed; retry with the normalised domain
            institute_name = institute_map.get(domain.strip().lower(), '')
    else:
        institute_name = institute_map.get(pi_uname, '')

    if institute_name == '':
        print(f"PI name {pi_uname} does not match any institution!")

    return institute_name


def main():
"""Remove non-billable PIs and projects"""

Expand Down Expand Up @@ -41,6 +133,18 @@ def main():
default="pi_invoices",
help="Name of output folder containing pi-specific invoice csvs"
)
parser.add_argument(
"--HU-only",
required=False,
default="HU_only.csv",
help="Name of output csv for HU invoices"
)
parser.add_argument(
"--HU-BU",
required=False,
default="HU_BU.csv",
help="Name of output csv for HU and BU invoices"
)
args = parser.parse_args()
merged_dataframe = merge_csv(args.csv_files)

Expand All @@ -60,9 +164,13 @@ def main():

projects = list(set(projects + timed_projects_list))

merged_dataframe = add_credits(merged_dataframe)
merged_dataframe = add_institution(merged_dataframe)
billable_projects = remove_non_billables(merged_dataframe, pi, projects, args.output_file)
remove_billables(merged_dataframe, pi, projects, "non_billable.csv")
export_pi_billables(billable_projects, args.output_folder)
export_HU_only(billable_projects, args.HU_only)
export_HU_BU(billable_projects, args.HU_BU)


def merge_csv(files):
Expand All @@ -83,7 +191,7 @@ def get_invoice_date(dataframe):
Note that it only checks the first entry because it should
be the same for every row.
"""
invoice_date_str = dataframe['Invoice Month'][0]
invoice_date_str = dataframe[INVOICE_DATE_FIELD][0]
invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m')
return invoice_date

Expand All @@ -102,7 +210,7 @@ def timed_projects(timed_projects_file, invoice_date):

def remove_non_billables(dataframe, pi, projects, output_file):
    """Drop the rows that should not be billed, write the rest to `output_file`.

    A row is dropped when its PI is listed in `pi` or its project is listed
    in `projects`. The remaining rows are written as csv without the index
    column and returned.
    """
    pi_excluded = dataframe[PI_FIELD].isin(pi)
    project_excluded = dataframe[PROJECT_FIELD].isin(projects)

    billable = dataframe[~(pi_excluded | project_excluded)]
    billable.to_csv(output_file, index=False)
    return billable

Expand All @@ -112,21 +220,60 @@ def remove_billables(dataframe, pi, projects, output_file):
So this *keeps* the projects/pis that should not be billed.
"""
filtered_dataframe = dataframe[dataframe['Manager (PI)'].isin(pi) | dataframe['Project - Allocation'].isin(projects)]
filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)]
filtered_dataframe.to_csv(output_file, index=False)


def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
    """Write one invoice csv per PI into `output_folder`.

    Each file holds all rows of one PI and is named
    `<institution>_<pi>_<invoice month>.csv`. The invoice month is read from
    the first row of the dataframe and the institution from the PI's first
    row. Rows without a PI are skipped. The folder is created if missing.
    """
    # makedirs also creates missing parent folders and, with exist_ok, avoids
    # the check-then-create race of os.path.exists + os.mkdir
    os.makedirs(output_folder, exist_ok=True)

    if dataframe.empty:
        # Nothing to export; .iat[0] below would raise IndexError
        return

    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
    pi_list = dataframe[PI_FIELD].unique()

    for pi in pi_list:
        if pandas.isna(pi):
            continue
        pi_projects = dataframe[dataframe[PI_FIELD] == pi]
        pi_institution = pi_projects[INSTITUTION_FIELD].iat[0]
        file_name = f"{pi_institution}_{pi}_{invoice_month}.csv"
        pi_projects.to_csv(os.path.join(output_folder, file_name))



def add_credits(dataframe: pandas.DataFrame):
    """Insert the Credit, Credit Code and Balance columns and apply all credits.

    The three columns are inserted directly after the Cost column, in that
    order. Credit starts at 0.0 and Credit Code at None. Balance starts at the
    row's Cost, so a row that no credit function touches still owes its full
    cost instead of being reported with a balance of 0.

    Every function in `applied_credits` is then called in order, each
    receiving the dataframe returned by the previous one.

    The passed dataframe is modified in place; the dataframe returned by the
    last credit function is returned.
    """
    cost_position = dataframe.columns.get_loc(COST_FIELD)
    dataframe.insert(cost_position + 1, CREDIT_FIELD, 0.0)
    dataframe.insert(cost_position + 2, CREDIT_CODE_FIELD, None)
    # Cast to float so later fractional balances fit the column's dtype
    dataframe.insert(
        cost_position + 3, BALANCE_FIELD, dataframe[COST_FIELD].astype(float)
    )

    # Apply credits
    for credit_func in applied_credits:
        dataframe = credit_func(dataframe)

    return dataframe


def add_institution(dataframe: pandas.DataFrame):
    """Fill the Institution cell of every row that has a PI.

    The institution is looked up from the PI name with
    get_institution_from_pi. Rows without a PI are reported on stdout and
    left as they are. Returns the same dataframe, modified in place.
    """
    for idx, record in dataframe.iterrows():
        manager = record[PI_FIELD]
        if not pandas.isna(manager):
            dataframe.at[idx, INSTITUTION_FIELD] = get_institution_from_pi(manager)
            continue
        # No PI recorded for this row
        print(f"Project {record[PROJECT_FIELD]} has no PI")

    return dataframe


def export_HU_only(dataframe, output_file):
    """Write the rows whose institution is Harvard University to `output_file`."""
    is_harvard = dataframe[INSTITUTION_FIELD] == 'Harvard University'
    dataframe[is_harvard].to_csv(output_file)


def export_HU_BU(dataframe, output_file):
    """Write the Harvard University and Boston University rows to `output_file`."""
    institutions = dataframe[INSTITUTION_FIELD]
    selected = institutions.isin(['Harvard University', 'Boston University'])
    dataframe[selected].to_csv(output_file)


if __name__ == "__main__":
main()
63 changes: 63 additions & 0 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from unittest import TestCase
from unittest import skipIf
import tempfile
import pandas
import os
Expand Down Expand Up @@ -175,3 +176,65 @@ def test_export_pi(self):
self.assertNotIn('ProjectA', pi_df['Project - Allocation'].tolist())
self.assertNotIn('ProjectB', pi_df['Project - Allocation'].tolist())
self.assertNotIn('ProjectC', pi_df['Project - Allocation'].tolist())


class TestGetInstitute(TestCase):
    """Checks get_institution_from_pi against PI names with known answers."""

    def setUp(self):
        # 'Answer' holds the institution expected for the PI in the same
        # position. 'fake' has no '@' and is expected to match nothing ('').
        data = {
            'Manager (PI)': ['quanmp@bu.edu', 'c@mclean.harvard.edu', 'b@harvard.edu', 'fake', 'pi@northeastern.edu'],
            'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'],
            'Answer': ["Boston University", "McLean Hospital", "Harvard University", "", "Northeastern University"]
        }
        self.data = pandas.DataFrame(data)

    def test_get_pi_institution(self):
        for i, row in self.data.iterrows():
            # subTest reports every failing PI instead of stopping at the first
            with self.subTest(pi=row['Manager (PI)']):
                institution_name = process_report.get_institution_from_pi(row['Manager (PI)'])
                self.assertEqual(institution_name, row['Answer'])

@skipIf(process_report.apply_credits_0002 not in process_report.applied_credits, "Skipping test for credit 0002 because credit not enabled")
class TestCredit0002(TestCase):
    """Checks the New PI Credit (code 0002) as applied through add_credits."""

    def setUp(self):
        # PI2 and PI3 are listed as old PIs below, so only PI1 and PI4 should
        # receive credit.
        data = {
            'Manager (PI)': ['PI1', 'PI1', 'PI2', 'PI3', 'PI4', 'PI4'],
            'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE', 'ProjectF'],
            'Cost': [10, 100, 10000, 5000, 800, 1000]
        }
        self.dataframe = pandas.DataFrame(data)
        old_pi = ['PI2', 'PI3']

        # The with-block closes (and so flushes) the file before it is read
        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') as old_pi_file:
            old_pi_file.writelines(f"{pi}\n" for pi in old_pi)
        self.old_pi_file = old_pi_file.name

        # Remember the previous value so tearDown can restore the environment
        self.previous_old_pi_env = os.environ.get("C0002_OLD_PI")
        os.environ["C0002_OLD_PI"] = self.old_pi_file

    def tearDown(self):
        if self.previous_old_pi_env is None:
            os.environ.pop("C0002_OLD_PI", None)
        else:
            os.environ["C0002_OLD_PI"] = self.previous_old_pi_env
        os.remove(self.old_pi_file)

    def test_apply_credit_0002(self):
        dataframe = process_report.add_credits(self.dataframe)

        self.assertIn('Credit', dataframe)
        self.assertIn('Credit Code', dataframe)
        self.assertIn('Balance', dataframe)

        credited_projects = dataframe[dataframe['Credit Code'] == '0002']

        self.assertEqual(4, len(credited_projects.index))
        self.assertNotIn('PI2', credited_projects['Manager (PI)'].unique())
        self.assertNotIn('PI3', credited_projects['Manager (PI)'].unique())

        self.assertEqual(10, credited_projects[credited_projects['Project - Allocation'] == 'ProjectA']['Credit'].iloc[0])
        self.assertEqual(100, credited_projects[credited_projects['Project - Allocation'] == 'ProjectB']['Credit'].iloc[0])
        self.assertEqual(800, credited_projects[credited_projects['Project - Allocation'] == 'ProjectE']['Credit'].iloc[0])
        self.assertEqual(200, credited_projects[credited_projects['Project - Allocation'] == 'ProjectF']['Credit'].iloc[0])

        self.assertEqual(0, credited_projects[credited_projects['Project - Allocation'] == 'ProjectA']['Balance'].iloc[0])
        self.assertEqual(0, credited_projects[credited_projects['Project - Allocation'] == 'ProjectB']['Balance'].iloc[0])
        self.assertEqual(0, credited_projects[credited_projects['Project - Allocation'] == 'ProjectE']['Balance'].iloc[0])
        self.assertEqual(800, credited_projects[credited_projects['Project - Allocation'] == 'ProjectF']['Balance'].iloc[0])


0 comments on commit 12dbf39

Please sign in to comment.