#
# Copyright 2024 Two Sigma Open Source, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pandas as pd
import numpy as np
from jinja2 import Environment, BaseLoader  # note: unused in this module


def get_top_5_most_freq(values):
    """
    Get the top-5 most frequent values.
    """
    series = pd.Series(list(values), dtype='object')
    # value_counts already sorts by descending frequency.
    return list(series.value_counts().iloc[:5].index)
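
# Usage sketch (hypothetical input, kept as a comment so importing the module
# stays side-effect free):
#   get_top_5_most_freq(['a', 'b', 'a', 'c', 'a', 'b'])  ->  ['a', 'b', 'c']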


def get_col_summary(df, data_dict=None):
    """
    Build a per-column summary string for a table.

    :param df: data table from the universe.
    :param data_dict: data dictionary mapping column name -> description.
    :return: column summary as a string.
    """
    inferred = df.infer_objects()
column_summary = ''
if data_dict is None:
data_dict = {}
for col in inferred.columns:
description = data_dict.get(col, None)
description = '' if description is None else description
        num_na = np.count_nonzero(inferred[col].isna())
        # Treat booleans, non-numeric columns, and all-NaN columns as categorical/text.
        if (pd.api.types.is_bool_dtype(inferred[col].dtype)
                or not pd.api.types.is_numeric_dtype(inferred[col].dtype)
                or inferred[col].isna().all()):
            try:
                average_size = inferred.loc[~inferred[col].isna(), col].astype(str).str.len().mean()
            except Exception:
                print(inferred.loc[~inferred[col].isna(), col])
                raise
            # Skip free-text columns whose values are too long to summarize usefully.
            if average_size > 40:
                continue
try:
num_unique = len(inferred[col].unique())
values = [val[:300] if isinstance(val, str) else val for val in inferred[col].values]
top_5_most_freq = get_top_5_most_freq(values)
first_five = values[:5]
column_summary += f'-col:{col}\n' + '\n'.join([
f'\t*{k}:{v}' for k,v in [('description', description), ('num_unique', num_unique), ('top_5_most_freq', top_5_most_freq), ('first_five', first_five), ('num_na', num_na)]
]) + '\n'
            except Exception:
                # Skip columns whose values cannot be summarized.
                pass
else:
mean = round(inferred[col].mean(), 3)
minimum = inferred[col].min()
maximum = inferred[col].max()
std = round(inferred[col].std(), 3)
quantiles = inferred[col].quantile([0.25, 0.5, 0.75])
q1 = round(quantiles[0.25], 3)
q2 = round(quantiles[0.5], 3)
q3 = round(quantiles[0.75], 3)
first_five = list(inferred[col].values[:5])
column_summary += f'-col:{col}\n' + '\n'.join([
f'\t*{k}:{v}' for k,v in [('description', description), ('mean', mean), ('std', std), ('min', minimum), ('0.25', q1), ('0.5', q2), ('0.75', q3), ('max', maximum), ('first_five', first_five), ('num_na', num_na)]
]) + '\n'
return column_summary
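
# Usage sketch (hypothetical DataFrame; output abbreviated):
#   df = pd.DataFrame({'age': [30, 41, None], 'city': ['NY', 'SF', 'NY']})
#   print(get_col_summary(df, data_dict={'age': 'Age in years'}))
# emits one '-col:<name>' block per column, e.g.:
#   -col:age
#       *description:Age in years
#       *mean:35.5
#       ...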


def col_prep(col):
    """
    Normalize a column name for the summary: lower-case, spell out '%',
    and keep only alphanumeric characters.
    """
    col = col.lower()
    col = col.replace('%', 'percent')
    return ''.join(ch for ch in col if ch.isalnum()).strip()
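
# e.g. col_prep('Profit %') -> 'profitpercent'; col_prep('Unnamed: 0') -> 'unnamed0'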


def df_reader(directory, max_rows):
    """
    Read a dataframe from a CSV path, capped at max_rows rows and the first
    30 columns, with normalized, de-duplicated column names. Used for Kaggle.
    """
    df = pd.read_csv(directory, nrows=max_rows)
    df.columns = [col_prep(col) for col in df.columns]
    sub_df = df[df.columns[:30]]
    # Drop duplicate column names that can arise after normalization.
    return sub_df.loc[:, ~sub_df.columns.duplicated()]
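
# Usage sketch ('train.csv' is a hypothetical path):
#   df = df_reader('train.csv', max_rows=1000)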


def is_good_name(col):
    """
    Filter out internal or placeholder column names.
    """
    for bad_name in ['tsdatasnowflake', 'unnamed']:
        if bad_name in col:
            return False
    return True


def df_reader_v2(directory, max_rows):
    """
    Read a dataframe from a CSV path, capped at max_rows rows and the first
    30 columns, keeping only columns that pass is_good_name. Used for Harvard.
    """
    df = pd.read_csv(directory, nrows=max_rows)
    df.columns = [col_prep(col) for col in df.columns]
    sub_df = df[df.columns[:30]]
    col_mask = [is_good_name(col) for col in sub_df.columns]
    sub_df = sub_df.loc[:, sub_df.columns[col_mask]]
    return sub_df.loc[:, ~sub_df.columns.duplicated()]
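

if __name__ == '__main__':
    # Smoke test on synthetic data (a minimal sketch, not part of the original
    # module): write a tiny CSV, read it back through df_reader_v2, and print
    # the column summary. 'Unnamed: 0' is dropped because col_prep normalizes
    # it to 'unnamed0', which is_good_name rejects.
    import os
    import tempfile

    frame = pd.DataFrame({
        'Profit %': [0.1, 0.2, 0.3],
        'Unnamed: 0': [1, 2, 3],
        'City': ['NY', 'SF', 'NY'],
    })
    tmp = tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False)
    tmp.close()
    frame.to_csv(tmp.name, index=False)
    try:
        df = df_reader_v2(tmp.name, max_rows=100)
        print(get_col_summary(df))
    finally:
        os.unlink(tmp.name)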