-
Notifications
You must be signed in to change notification settings - Fork 0
/
bpt.py
119 lines (106 loc) · 5.69 KB
/
bpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
bpt.py consists of functions that streamline BPT data I/O for quicker analyses, storing each workbook
into a BPT class object. Each workbook within the BPT class is stored as a Pandas DataFrame object.
"""
import os
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import zipfile
import pandas as pd
def load_data_dict(dir_to_query:str) -> pd.DataFrame:
"""Find zipped directory containing the string 'dictionary' for a given year."""
data_dict_flag = re.compile("dictionary")
assert os.path.exists(dir_to_query), f"Path {dir_to_query} doesn't exist."
for _, _, files in os.walk(dir_to_query):
for file in files:
if data_dict_flag.search(file):
full_path_to_zip = os.path.join(dir_to_query, file)
if full_path_to_zip.endswith('.zip'):
zippedFile = zipfile.ZipFile(full_path_to_zip, "r")
for zippedObj in zippedFile.namelist():
if zippedObj.endswith('.csv'):
data_dict = pd.read_csv(zippedFile.open(zippedObj))
return data_dict
def load_sheet(dir_to_query:str, worksheet:str) -> pd.DataFrame:
"""
Loads sheets corresponding to different sections of BPT.
An exception would be sheet 5 (county data), which spans multiple text files.
Filename choices have the general form ma_x, where x denotes a specific sheet number.
"""
years_for_exception = ['2014', '2015']
for _, _, objs in os.walk(dir_to_query):
tmp = list(objs)
if any(x in dir_to_query for x in years_for_exception):
dir_of_interest = [d for d in tmp if('_ma' in d)]
else:
dir_of_interest = [d for d in tmp if(worksheet in d)]
for d in dir_of_interest:
zmaobj = zipfile.ZipFile(os.path.join(dir_to_query, d), "r")
with zmaobj as f:
for name in f.namelist():
if worksheet in name:
df = pd.read_csv(zmaobj.open(name),
sep='\t',
engine='python',
encoding='cp1252')
return df
def load_county_data(dir_to_query:str, worksheet:str="ma_5") -> pd.DataFrame:
"""Load up benchmark data, which spans multiple zipped directories or text files by county."""
county = []
ma5_flag = re.compile(worksheet)
years_for_exception = ['2014', '2015']
for _, _, objs in os.walk(dir_to_query):
tmp = list(objs)
for obj in tmp:
if ma5_flag.search(obj):
fullzippath = os.path.join(dir_to_query, obj)
zf = zipfile.ZipFile(fullzippath, "r")
for f in zf.namelist():
if any(x in dir_to_query for x in years_for_exception):
if 'ma_5' in f: # Needed to hardcode 'ma_5' search since worksheet needs to contain '_ma'
if f.endswith('.txt'):
tmp = pd.read_csv(zf.open(f),
sep='\t',
engine='python',
encoding='cp1252')
county.append(tmp)
else:
if f.endswith('.txt'):
tmp = pd.read_csv(zf.open(f),
sep='\t',
engine='python',
encoding='cp1252')
county.append(tmp)
df = pd.concat(county, axis=1)
return df
def replace_field_to_name(data_dict: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
"""Replace df identifiers with whole name for improving interpretibility."""
return df.rename(columns=dict(zip(data_dict['NAME'],
data_dict.iloc[:, data_dict.columns.str.contains(pat='FIELD')].squeeze())))
def data_loader(dirpath:str, worksheet:str, data_dict:pd.DataFrame):
"""Wrap functions above to load data into class field."""
df = load_sheet(dirpath, worksheet)
renamed_df = replace_field_to_name(data_dict, df)
return renamed_df.dropna(subset=['BID ID (H-number, Plan ID, Segment ID)'])
class BPT:
def __init__(self, yeardir):
self.year = yeardir.split('/')[-2]
self.data_dict = load_data_dict(yeardir)
print(f"LOADING {self.year} BPT DATA")
print("LOADING BASE PERIOD EXPERIENCE ('ma_1') DATA")
self.ma_1 = data_loader(yeardir, worksheet='ma_1', data_dict=self.data_dict)
print("LOADING PROJECTED ALLOWED COST ('ma_2') DATA")
self.ma_2 = data_loader(yeardir, worksheet='ma_2', data_dict=self.data_dict)
print("LOADING PROJECTED COST SHARING ('ma_3') DATA")
self.ma_3 = data_loader(yeardir, worksheet='ma_3', data_dict=self.data_dict)
print("LOADING PROJECTED REV REQUIREMENT ('ma_4') DATA")
self.ma_4 = data_loader(yeardir, worksheet='ma_4', data_dict=self.data_dict)
print("LOADING BENCHMARK ('ma_5') DATA")
self.ma_5 = data_loader(yeardir, worksheet='ma_5', data_dict=self.data_dict)
print("LOADING BID SUMMARY ('ma_6') DATA")
self.ma_6 = data_loader(yeardir, worksheet='ma_6', data_dict=self.data_dict)
print("LOADING OPTIONAL SUPPLEMENTAL BENEFITS ('ma_7') DATA")
self.ma_7 = data_loader(yeardir, worksheet='ma_7', data_dict=self.data_dict)