Skip to content

Commit

Permalink
Merge pull request #19 from QuanMPhm/17/linting
Browse files Browse the repository at this point in the history
Added pre-commit linting check
  • Loading branch information
knikolla authored Apr 18, 2024
2 parents ad7ba1e + 05ba7fc commit cd5c6cc
Show file tree
Hide file tree
Showing 7 changed files with 349 additions and 199 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/pre-commit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

name: pre-commit

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
- uses: pre-commit/action@v3.0.1
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: check-merge-conflict
- id: end-of-file-fixer
- id: check-added-large-files
- id: check-case-conflict
- id: detect-private-key

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
hooks:
- id: ruff
- id: ruff-format
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,5 @@ In this example, `project foo` will not be billed for September 2023 and August

## Combine CSVs

This script also combines the 3 separate Invoice data CSVs into 1 Invoice CSV. It combines
This script also combines the 3 separate Invoice data CSVs into 1 Invoice CSV. It combines
OpenShift SU, OpenStack SU, and Storage SU data.
2 changes: 1 addition & 1 deletion process_report/institute_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center",
"fas.harvard.edu" : "Harvard University",
"cga.harvard.edu" : "Harvard University",
"iq.harvard.edu" : "Harvard University",
"iq.harvard.edu" : "Harvard University",
"hks.harvard.edu" : "Harvard University",
"hsph.harvard.edu" : "Harvard University",
"seas.harvard.edu" : "Harvard University",
Expand Down
144 changes: 81 additions & 63 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,35 @@


### Invoice field names
INVOICE_DATE_FIELD = 'Invoice Month'
PROJECT_FIELD = 'Project - Allocation'
PROJECT_ID_FIELD = 'Project - Allocation ID'
PI_FIELD = 'Manager (PI)'
INVOICE_EMAIL_FIELD = 'Invoice Email'
INVOICE_ADDRESS_FIELD = 'Invoice Address'
INSTITUTION_FIELD = 'Institution'
INSTITUTION_ID_FIELD = 'Institution - Specific Code'
SU_HOURS_FIELD = 'SU Hours (GBhr or SUhr)'
SU_TYPE_FIELD = 'SU Type'
COST_FIELD = 'Cost'
CREDIT_FIELD = 'Credit'
CREDIT_CODE_FIELD = 'Credit Code'
BALANCE_FIELD = 'Balance'
INVOICE_DATE_FIELD = "Invoice Month"
PROJECT_FIELD = "Project - Allocation"
PROJECT_ID_FIELD = "Project - Allocation ID"
PI_FIELD = "Manager (PI)"
INVOICE_EMAIL_FIELD = "Invoice Email"
INVOICE_ADDRESS_FIELD = "Invoice Address"
INSTITUTION_FIELD = "Institution"
INSTITUTION_ID_FIELD = "Institution - Specific Code"
SU_HOURS_FIELD = "SU Hours (GBhr or SUhr)"
SU_TYPE_FIELD = "SU Type"
COST_FIELD = "Cost"
CREDIT_FIELD = "Credit"
CREDIT_CODE_FIELD = "Credit Code"
BALANCE_FIELD = "Balance"
###


def get_institution_from_pi(institute_map, pi_uname):
institution_key = pi_uname.split('@')[-1]
institution_name = institute_map.get(institution_key, '')
institution_key = pi_uname.split("@")[-1]
institution_name = institute_map.get(institution_key, "")

if institution_name == '':
if institution_name == "":
print(f"Warning: PI name {pi_uname} does not match any institution!")

return institution_name


def load_institute_map() -> dict:
with open('institute_map.json', 'r') as f:
with open("institute_map.json", "r") as f:
institute_map = json.load(f)

return institute_map
Expand All @@ -46,18 +46,18 @@ def load_old_pis(old_pi_file):

try:
with open(old_pi_file) as f:
for pi_info in f:
pi, first_month = pi_info.strip().split(',')
for pi_info in f:
pi, first_month = pi_info.strip().split(",")
old_pi_dict[pi] = first_month
except FileNotFoundError:
print('Applying credit 0002 failed. Old PI file does not exist')
print("Applying credit 0002 failed. Old PI file does not exist")
sys.exit(1)

return old_pi_dict


def is_old_pi(old_pi_dict, pi, invoice_month):
if pi in old_pi_dict and old_pi_dict[pi] != invoice_month:
if pi in old_pi_dict and old_pi_dict[pi] != invoice_month:
return True
return False

Expand Down Expand Up @@ -97,24 +97,24 @@ def main():
"--output-folder",
required=False,
default="pi_invoices",
help="Name of output folder containing pi-specific invoice csvs"
help="Name of output folder containing pi-specific invoice csvs",
)
parser.add_argument(
"--HU-invoice-file",
required=False,
default="HU_only.csv",
help="Name of output csv for HU invoices"
help="Name of output csv for HU invoices",
)
parser.add_argument(
"--HU-BU-invoice-file",
required=False,
default="HU_BU.csv",
help="Name of output csv for HU and BU invoices"
help="Name of output csv for HU and BU invoices",
)
parser.add_argument(
"--old-pi-file",
required=False,
help="Name of csv file listing previously billed PIs"
help="Name of csv file listing previously billed PIs",
)
args = parser.parse_args()
merged_dataframe = merge_csv(args.csv_files)
Expand Down Expand Up @@ -167,7 +167,7 @@ def get_invoice_date(dataframe):
be the same for every row.
"""
invoice_date_str = dataframe[INVOICE_DATE_FIELD][0]
invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m')
invoice_date = pandas.to_datetime(invoice_date_str, format="%Y-%m")
return invoice_date


Expand All @@ -176,16 +176,22 @@ def timed_projects(timed_projects_file, invoice_date):
dataframe = pandas.read_csv(timed_projects_file)

# convert to pandas timestamp objects
dataframe['Start Date'] = pandas.to_datetime(dataframe['Start Date'], format="%Y-%m")
dataframe['End Date'] = pandas.to_datetime(dataframe['End Date'], format="%Y-%m")
dataframe["Start Date"] = pandas.to_datetime(
dataframe["Start Date"], format="%Y-%m"
)
dataframe["End Date"] = pandas.to_datetime(dataframe["End Date"], format="%Y-%m")

mask = (dataframe['Start Date'] <= invoice_date) & (invoice_date <= dataframe['End Date'])
return dataframe[mask]['Project'].to_list()
mask = (dataframe["Start Date"] <= invoice_date) & (
invoice_date <= dataframe["End Date"]
)
return dataframe[mask]["Project"].to_list()


def remove_non_billables(dataframe, pi, projects):
"""Removes projects and PIs that should not be billed from the dataframe"""
filtered_dataframe = dataframe[~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects)]
filtered_dataframe = dataframe[
~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects)
]
return filtered_dataframe


Expand All @@ -194,14 +200,16 @@ def remove_billables(dataframe, pi, projects, output_file):
So this *keeps* the projects/pis that should not be billed.
"""
filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)]
filtered_dataframe = dataframe[
dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)
]
filtered_dataframe.to_csv(output_file, index=False)


def validate_pi_names(dataframe):
invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])]
for i, row in invalid_pi_projects.iterrows():
print(f'Warning: Project {row[PROJECT_FIELD]} has empty PI field')
print(f"Warning: Project {row[PROJECT_FIELD]} has empty PI field")
dataframe = dataframe[~pandas.isna(dataframe[PI_FIELD])]

return dataframe
Expand All @@ -219,11 +227,13 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
pi_list = dataframe[PI_FIELD].unique()

for pi in pi_list:
if pandas.isna(pi):
if pandas.isna(pi):
continue
pi_projects = dataframe[dataframe[PI_FIELD] == pi]
pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0]
pi_projects.to_csv(output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv")
pi_projects.to_csv(
output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv"
)


def apply_credits_new_pi(dataframe, old_pi_file):
Expand Down Expand Up @@ -251,24 +261,24 @@ def apply_credits_new_pi(dataframe, old_pi_file):
project_cost = row[COST_FIELD]
applied_credit = min(project_cost, remaining_credit)

dataframe.at[i, CREDIT_FIELD] = applied_credit
dataframe.at[i, CREDIT_FIELD] = applied_credit
dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code
dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - applied_credit
remaining_credit -= applied_credit

if remaining_credit == 0:
break

return dataframe


def add_institution(dataframe: pandas.DataFrame):
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
a list of known institution email domains (e.g. bu.edu), or to several edge cases (e.g. rudolph) if
the username is not an email address.
Exact matches are then mapped to the corresponding institution name.
Exact matches are then mapped to the corresponding institution name.
E.g. "user@bu.edu" would match with "bu.edu", which maps to the institution name "Boston University"
Expand All @@ -277,42 +287,50 @@ def add_institution(dataframe: pandas.DataFrame):
institute_map = load_institute_map()
for i, row in dataframe.iterrows():
pi_name = row[PI_FIELD]
if pandas.isna(pi_name):
if pandas.isna(pi_name):
print(f"Project {row[PROJECT_FIELD]} has no PI")
else:
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(institute_map, pi_name)
else:
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
institute_map, pi_name
)

return dataframe


def export_HU_only(dataframe, output_file):
HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == 'Harvard University']
HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"]
HU_projects.to_csv(output_file)


def export_HU_BU(dataframe, output_file):
HU_BU_projects = dataframe[(dataframe[INSTITUTION_FIELD] == 'Harvard University') |
(dataframe[INSTITUTION_FIELD] == 'Boston University')]
HU_BU_projects.to_csv(output_file)
HU_BU_projects = dataframe[
(dataframe[INSTITUTION_FIELD] == "Harvard University")
| (dataframe[INSTITUTION_FIELD] == "Boston University")
]
HU_BU_projects.to_csv(output_file)


def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
lenovo_file_name = (
output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv"
)

lenovo_file_name = output_file or f'Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv'

LENOVO_SU_TYPES = ['OpenShift GPUA100SXM4', 'OpenStack GPUA100SXM4']
LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
SU_CHARGE_MULTIPLIER = 1

lenovo_df = dataframe[dataframe[SU_TYPE_FIELD].isin(LENOVO_SU_TYPES)][[
INVOICE_DATE_FIELD,
PROJECT_FIELD,
INSTITUTION_FIELD,
SU_HOURS_FIELD,
SU_TYPE_FIELD]]

lenovo_df.rename(columns={SU_HOURS_FIELD: 'SU Hours'}, inplace=True)
lenovo_df.insert(len(lenovo_df.columns), 'SU Charge', SU_CHARGE_MULTIPLIER)
lenovo_df['Charge'] = lenovo_df['SU Hours'] * lenovo_df['SU Charge']
lenovo_df = dataframe[dataframe[SU_TYPE_FIELD].isin(LENOVO_SU_TYPES)][
[
INVOICE_DATE_FIELD,
PROJECT_FIELD,
INSTITUTION_FIELD,
SU_HOURS_FIELD,
SU_TYPE_FIELD,
]
]

lenovo_df.rename(columns={SU_HOURS_FIELD: "SU Hours"}, inplace=True)
lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER)
lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"]
lenovo_df.to_csv(lenovo_file_name)


Expand Down
Loading

0 comments on commit cd5c6cc

Please sign in to comment.