From 37e6a45a55259a0e70046a22f4dd903cae276439 Mon Sep 17 00:00:00 2001
From: Quan Pham <qmpham2019@gmail.com>
Date: Wed, 3 Apr 2024 11:52:47 -0400
Subject: [PATCH] Added processing to apply project credits, determine the
 institution name for each PI, and export HU and BU invoices

---
 process_report/institute_map.json  |  36 ++++++
 process_report/process_report.py   | 189 +++++++++++++++++++++++++++--
 process_report/tests/unit_tests.py |  90 +++++++++++++-
 3 files changed, 303 insertions(+), 12 deletions(-)
 create mode 100644 process_report/institute_map.json

diff --git a/process_report/institute_map.json b/process_report/institute_map.json
new file mode 100644
index 0000000..cd2d60d
--- /dev/null
+++ b/process_report/institute_map.json
@@ -0,0 +1,36 @@
+{
+    "northeastern.edu"      : "Northeastern University",
+    "bu.edu"                : "Boston University",
+    "bentley.edu"           : "Bentley",
+    "uri.edu"               : "University of Rhode Island",
+    "redhat.com"            : "Red Hat",
+    "childrens.harvard.edu" : "Boston Childrens Hospital",
+    "mclean.harvard.edu"    : "McLean Hospital",
+    "meei.harvard.edu"      : "Massachusetts Eye & Ear",
+    "dfci.harvard.edu"      : "Dana-Farber Cancer Institute",
+    "bwh.harvard.edu"       : "Brigham and Women's Hospital",
+    "bidmc.harvard.edu"     : "Beth Israel Deaconess Medical Center",
+    "fas.harvard.edu"       : "Harvard University",
+    "cga.harvard.edu"       : "Harvard University",
+    "iq.harvard.edu"        : "Harvard University", 
+    "hks.harvard.edu"       : "Harvard University",
+    "hsph.harvard.edu"      : "Harvard University",
+    "seas.harvard.edu"      : "Harvard University",
+    "gse.harvard.edu"       : "Harvard University",
+    "gov.harvard.edu"       : "Harvard University",
+    "oeb.harvard.edu"       : "Harvard University",
+    "harvard.edu"           : "Harvard University",
+    "wpi.edu"               : "Worcester Polytechnic Institute",
+    "mit.edu"               : "Massachusetts Institute of Technology",
+    "umass.edu"             : "University of Massachusetts Amherst",
+    "uml.edu"               : "University of Massachusetts Lowell",
+    "codeforboston.org"     : "Code For Boston",
+    "yale.edu"              : "Yale University",
+    "mmsh"                  : "Harvard University",
+    "gstuart"               : "University of Massachusetts Amherst",
+    "rudolph"               : "Boston Childrens Hospital",
+    "robbaron"              : "Boston University",
+    "kmdalton"              : "Harvard University",
+    "mzink"                 : "University of Massachusetts Amherst",
+    "francesco.pontiggia"   : "Harvard University"
+}
diff --git a/process_report/process_report.py b/process_report/process_report.py
index cfcdc70..846cccf 100644
--- a/process_report/process_report.py
+++ b/process_report/process_report.py
@@ -1,9 +1,67 @@
 import argparse
 import os
+import sys
 
+import json
 import pandas
 
 
+### Invoice field names: column labels used to index the invoice dataframes
+INVOICE_DATE_FIELD = 'Invoice Month'  # parsed with format '%Y-%m' by get_invoice_date()
+PROJECT_FIELD = 'Project - Allocation'
+PROJECT_ID_FIELD = 'Project - Allocation ID'
+PI_FIELD = 'Manager (PI)'
+INVOICE_EMAIL_FIELD = 'Invoice Email'
+INVOICE_ADDRESS_FIELD = 'Invoice Address'
+INSTITUTION_FIELD = 'Institution'  # filled in by add_institution()
+INSTITUTION_ID_FIELD = 'Institution - Specific Code'
+SU_HOURS_FIELD = 'SU Hours (GBhr or SUhr)'
+SU_TYPE_FIELD = 'SU Type'
+COST_FIELD = 'Cost'
+CREDIT_FIELD = 'Credit'  # this and the next two columns are added by apply_credits_new_pi()
+CREDIT_CODE_FIELD = 'Credit Code'
+BALANCE_FIELD = 'Balance'
+###
+
+
+def get_institution_from_pi(institute_map, pi_uname):
+    """Return the institution mapped to the PI's email domain, or '' if unknown."""
+    # Usernames without '@' are looked up whole (edge-case keys such as "rudolph").
+    institution_name = institute_map.get(pi_uname.split('@')[-1], '')
+    if institution_name == '':
+        print(f"Warning: PI name {pi_uname} does not match any institution!")
+
+    return institution_name
+
+
+def load_institute_map() -> dict:
+    """Load the domain-to-institution mapping shipped next to this module."""
+    # Resolve relative to this file so the lookup does not depend on the CWD.
+    with open(os.path.join(os.path.dirname(__file__), 'institute_map.json')) as f:
+        return json.load(f)
+
+
+def load_old_pis(old_pi_file):
+    """Return {pi: first_invoice_month} parsed from `pi,month` lines of old_pi_file.
+
+    Blank lines are skipped. Exits with status 1 if the file is missing or unset.
+    """
+    try:
+        with open(old_pi_file) as f:
+            old_pi_dict = dict(row.strip().split(',') for row in f if row.strip())
+    except (FileNotFoundError, TypeError):  # TypeError: path is None (flag omitted)
+        print('Applying credit 0002 failed. Old PI file does not exist')
+        sys.exit(1)
+
+    return old_pi_dict
+
+
+def is_old_pi(old_pi_dict, pi, invoice_month):
+    """Return True if pi is listed with a first month other than invoice_month."""
+    # A PI whose first month IS this invoice month still counts as new.
+    return pi in old_pi_dict and old_pi_dict[pi] != invoice_month
+
+
 def main():
     """Remove non-billable PIs and projects"""
 
@@ -41,6 +99,23 @@ def main():
         default="pi_invoices",
         help="Name of output folder containing pi-specific invoice csvs"
     )
+    parser.add_argument(
+        "--HU-invoice-file",
+        required=False,
+        default="HU_only.csv",
+        help="Name of output csv for HU invoices"
+    )
+    parser.add_argument(
+        "--HU-BU-invoice-file",
+        required=False,
+        default="HU_BU.csv",
+        help="Name of output csv for HU and BU invoices"
+    )
+    parser.add_argument(
+        "--old-pi-file",
+        required=False,
+        help="Name of csv file listing previously billed PIs"
+    )
     args = parser.parse_args()
     merged_dataframe = merge_csv(args.csv_files)
 
@@ -60,9 +135,16 @@ def main():
 
     projects = list(set(projects + timed_projects_list))
 
-    billable_projects = remove_non_billables(merged_dataframe, pi, projects, args.output_file)
+    merged_dataframe = add_institution(merged_dataframe)
     remove_billables(merged_dataframe, pi, projects, "non_billable.csv")
+
+    billable_projects = remove_non_billables(merged_dataframe, pi, projects)
+    billable_projects = validate_pi_names(billable_projects)
+    credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file)
+    export_billables(credited_projects, args.output_file)
     export_pi_billables(billable_projects, args.output_folder)
+    export_HU_only(billable_projects, args.HU_invoice_file)
+    export_HU_BU(billable_projects, args.HU_BU_invoice_file)
 
 
 def merge_csv(files):
@@ -83,7 +165,7 @@ def get_invoice_date(dataframe):
     Note that it only checks the first entry because it should
     be the same for every row.
     """
-    invoice_date_str = dataframe['Invoice Month'][0]
+    invoice_date_str = dataframe[INVOICE_DATE_FIELD][0]
     invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m')
     return invoice_date
 
@@ -100,10 +182,9 @@ def timed_projects(timed_projects_file, invoice_date):
     return dataframe[mask]['Project'].to_list()
 
 
-def remove_non_billables(dataframe, pi, projects, output_file):
+def remove_non_billables(dataframe, pi, projects):
     """Removes projects and PIs that should not be billed from the dataframe"""
-    filtered_dataframe = dataframe[~dataframe['Manager (PI)'].isin(pi) & ~dataframe['Project - Allocation'].isin(projects)]
-    filtered_dataframe.to_csv(output_file, index=False)
+    filtered_dataframe = dataframe[~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects)]
     return filtered_dataframe
 
 
@@ -112,21 +193,107 @@ def remove_billables(dataframe, pi, projects, output_file):
 
     So this *keeps* the projects/pis that should not be billed.
     """
-    filtered_dataframe = dataframe[dataframe['Manager (PI)'].isin(pi) | dataframe['Project - Allocation'].isin(projects)]
+    filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)]
     filtered_dataframe.to_csv(output_file, index=False)
 
+
+def validate_pi_names(dataframe):
+    """Warn about every project with an empty PI field and drop those rows."""
+    missing_pi = dataframe[PI_FIELD].isna()
+    for project in dataframe.loc[missing_pi, PROJECT_FIELD]:
+        print(f'Warning: Project {project} has empty PI field')
+
+    return dataframe[~missing_pi]
+
+
+def export_billables(dataframe, output_file):  # writes dataframe to output_file as csv
+    dataframe.to_csv(output_file, index=False)  # index=False: omit the row-index column
+
+
 def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
     if not os.path.exists(output_folder):
         os.mkdir(output_folder)
 
-    invoice_month = dataframe['Invoice Month'].iat[0]
-    pi_list = dataframe['Manager (PI)'].unique()
+    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
+    pi_list = dataframe[PI_FIELD].unique()
 
     for pi in pi_list:
-        pi_projects = dataframe[dataframe['Manager (PI)'] == pi]
-        pi_instituition = pi_projects['Institution'].iat[0]
+        if pandas.isna(pi): 
+            continue
+        pi_projects = dataframe[dataframe[PI_FIELD] == pi]
+        pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0]
         pi_projects.to_csv(output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv")
-        
+
+
+def apply_credits_new_pi(dataframe, old_pi_file):
+    """Apply the new-PI credit (code 0002) to the projects of each new PI.
+
+    Adds the Credit, Credit Code and Balance columns to `dataframe` in place and
+    returns it. A PI counts as new unless `old_pi_file` lists them with a first
+    month other than this invoice month. Each new PI gets up to 1000 of credit,
+    spent on their projects in row order; Balance is Cost minus applied Credit.
+    """
+    new_pi_credit_code = "0002"
+    new_pi_credit_amount = 1000
+
+    dataframe[CREDIT_FIELD] = None
+    dataframe[CREDIT_CODE_FIELD] = None
+    # Start from the full cost so that old PIs, and projects reached after a new
+    # PI's credit ran out, are billed in full rather than left at a 0 balance.
+    dataframe[BALANCE_FIELD] = dataframe[COST_FIELD]
+
+    old_pi_dict = load_old_pis(old_pi_file)
+    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
+
+    for pi in dataframe[PI_FIELD].unique():
+        if is_old_pi(old_pi_dict, pi, invoice_month):
+            continue
+        remaining_credit = new_pi_credit_amount
+        for i, row in dataframe[dataframe[PI_FIELD] == pi].iterrows():
+            if remaining_credit == 0:
+                break
+            applied_credit = min(row[COST_FIELD], remaining_credit)
+            dataframe.at[i, CREDIT_FIELD] = applied_credit
+            dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code
+            dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - applied_credit
+            remaining_credit -= applied_credit
+
+    return dataframe
+
+
+def add_institution(dataframe: pandas.DataFrame):
+    """Fill the Institution column of every row from the row's PI username.
+
+    The lookup is done by `get_institution_from_pi()`, which takes the part of
+    the username after the last '@' (the whole username if it is not an email
+    address, e.g. "rudolph") and looks it up, as an exact match, in the
+    mapping loaded from `institute_map.json`.
+
+    E.g. "foo@bu.edu" matches "bu.edu", which maps to "Boston University".
+    A PI with no match gets an empty institution and a warning is printed;
+    rows without a PI are reported and their institution is left untouched.
+    """
+    institute_map = load_institute_map()
+    for i, row in dataframe.iterrows():
+        pi_name = row[PI_FIELD]
+        if pandas.isna(pi_name): 
+            print(f"Project {row[PROJECT_FIELD]} has no PI")
+        else: 
+            dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(institute_map, pi_name)
+
+    return dataframe
+
+
+def export_HU_only(dataframe, output_file):
+    """Write the rows whose institution is Harvard University to output_file."""
+    dataframe[dataframe[INSTITUTION_FIELD] == 'Harvard University'].to_csv(output_file)
+
+
+def export_HU_BU(dataframe, output_file):
+    """Write the Harvard University and Boston University rows to output_file."""
+    is_hu_or_bu = dataframe[INSTITUTION_FIELD].isin(['Harvard University', 'Boston University'])
+    dataframe[is_hu_or_bu].to_csv(output_file)
+
 
 if __name__ == "__main__":
     main()
diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py
index d63c7d4..05f2e6b 100644
--- a/process_report/tests/unit_tests.py
+++ b/process_report/tests/unit_tests.py
@@ -1,8 +1,11 @@
 from unittest import TestCase
+from unittest import skipIf
 import tempfile
 import pandas
 import os
+import math
 from textwrap import dedent
+
 from process_report import process_report
 
 class TestGetInvoiceDate(TestCase):
@@ -70,7 +73,8 @@ def tearDown(self):
         os.remove(self.output_file2.name)
 
     def test_remove_non_billables(self):
-        process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude, self.output_file.name)
+        billables_df = process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude)
+        process_report.export_billables(billables_df, self.output_file.name)
 
         result_df = pandas.read_csv(self.output_file.name)
 
@@ -175,3 +179,87 @@ def test_export_pi(self):
         self.assertNotIn('ProjectA', pi_df['Project - Allocation'].tolist())
         self.assertNotIn('ProjectB', pi_df['Project - Allocation'].tolist())
         self.assertNotIn('ProjectC', pi_df['Project - Allocation'].tolist())
+
+
+class TestGetInstitute(TestCase):
+    def test_get_pi_institution(self):
+        # The mapping is the function's first argument; an inline one avoids file I/O.
+        institute_map = {
+            "bu.edu": "Boston University",
+            "mclean.harvard.edu": "McLean Hospital",
+            "harvard.edu": "Harvard University",
+            "northeastern.edu": "Northeastern University",
+        }
+        expected = {
+            "quanmp@bu.edu": "Boston University",
+            "c@mclean.harvard.edu": "McLean Hospital",
+            "b@harvard.edu": "Harvard University",
+            "fake": "",
+            "pi@northeastern.edu": "Northeastern University",
+        }
+        for pi, institution in expected.items():
+            self.assertEqual(process_report.get_institution_from_pi(institute_map, pi), institution)
+
+
+class TestCredit0002(TestCase):
+    def setUp(self):
+        data = {
+            'Invoice Month': ['2024-03','2024-03','2024-03','2024-03','2024-03','2024-03'],
+            'Manager (PI)': ['PI1', 'PI1', 'PI2', 'PI3', 'PI4', 'PI4'],
+            'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE', 'ProjectF'],
+            'Cost': [10, 100, 10000, 5000, 800, 1000]
+        }
+        self.dataframe = pandas.DataFrame(data)
+        old_pi = ['PI2,2023-09', 'PI3,2024-02', 'PI4,2024-03'] # Case with old and new pi in pi file
+        # Leaving the with-block closes the file, so the lines are flushed before it is read.
+        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') as old_pi_file:
+            old_pi_file.writelines(pi + "\n" for pi in old_pi)
+        self.old_pi_file = old_pi_file.name
+
+    def tearDown(self):
+        os.remove(self.old_pi_file)
+
+    def test_apply_credit_0002(self):
+        dataframe = process_report.apply_credits_new_pi(self.dataframe, self.old_pi_file)
+
+        self.assertTrue('Credit' in dataframe)
+        self.assertTrue('Credit Code' in dataframe)
+        self.assertTrue('Balance' in dataframe)
+
+        non_credited_project = dataframe[pandas.isna(dataframe['Credit Code'])]
+        credited_projects = dataframe[dataframe['Credit Code'] == '0002']
+
+        self.assertEqual(2, len(non_credited_project))
+        self.assertEqual(non_credited_project.loc[2, 'Cost'], non_credited_project.loc[2, 'Balance'])
+        self.assertEqual(non_credited_project.loc[3, 'Cost'], non_credited_project.loc[3, 'Balance'])
+
+
+        self.assertEqual(4, len(credited_projects.index))
+        self.assertTrue('PI2' not in credited_projects['Manager (PI)'].unique())
+        self.assertTrue('PI3' not in credited_projects['Manager (PI)'].unique())
+
+        self.assertEqual(10, credited_projects.loc[0, 'Credit'])
+        self.assertEqual(100, credited_projects.loc[1, 'Credit'])
+        self.assertEqual(800, credited_projects.loc[4, 'Credit'])
+        self.assertEqual(200, credited_projects.loc[5, 'Credit'])
+
+        self.assertEqual(0, credited_projects.loc[0, 'Balance'])
+        self.assertEqual(0, credited_projects.loc[1, 'Balance'])
+        self.assertEqual(0, credited_projects.loc[4, 'Balance'])
+        self.assertEqual(800, credited_projects.loc[5, 'Balance'])
+
+
+class TestValidateBillables(TestCase):
+
+    def setUp(self):
+        # ProjectB is the only row whose PI field is empty.
+        data = {
+            'Manager (PI)': ['PI1', math.nan, 'PI1', 'PI2', 'PI2'],
+            'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'],
+        }
+        self.dataframe = pandas.DataFrame(data)
+
+    def test_validate_billables(self):
+        self.assertEqual(1, len(self.dataframe[pandas.isna(self.dataframe['Manager (PI)'])]))
+        validated_df = process_report.validate_pi_names(self.dataframe)
+        self.assertEqual(0, len(validated_df[pandas.isna(validated_df['Manager (PI)'])]))