-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessing.py
345 lines (286 loc) · 12.1 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
'''
This module defines a TimeSeries object that will be used in all other components. It provides all the preprocessing
methods for time series data as specified in the Project 1 handout.
Authors: Yifeng Cui and Jacob Rammer
Group name: The Classy Coders
Most recent modification: 2/9/21
'''
import pandas as pd
import csv
import matplotlib.pyplot as plt
from datetime import date
from datetime import datetime, date, timedelta
import numpy as np
import re
import janitor # need to install
class TimeSeries:
    """
    Wrapper around a pandas DataFrame holding one time series, together
    with the preprocessing operations applied to it.

    The methods below treat the first column as the date column (see
    ``clip``) and the last column as the data column (see ``difference``,
    ``impute_outliers``, ``longest_continuous_run`` and ``split_data``).
    """

    # Leading "mm/dd/yyyy hh:mm" stamp accepted by assign_time. The pattern
    # is not anchored at the end, so trailing text (e.g. seconds) is ignored.
    _START_PATTERN = re.compile(
        r"^([0-9]{2})/([0-9]{2})/([0-9]{4}) ([0-9]{2}):([0-9]{2})")

    def __init__(self, df=None):
        """
        Create a time series, optionally around an existing dataframe.
        :param df: dataframe to wrap; anything that is not a pandas
            DataFrame is ignored and ``data`` stays None
        """
        self.data = None  # the wrapped dataframe
        if isinstance(df, pd.DataFrame):
            self.data = df
        # Filled by split_data; start empty so design_matrix can be called
        # before a split without raising AttributeError.
        self.train = []
        self.val = []
        self.test = []

    # ------------------------------------------------------------------
    # input / output
    # ------------------------------------------------------------------
    def read_from_file(self, file_name: str):
        """
        Read from a CSV file and create a Pandas dataframe
        :param file_name: name of the csv file to open
        :return: this time series (``data`` is left unchanged when the
            file does not exist; a message is printed instead)
        """
        try:
            self.data = pd.read_csv(file_name)
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        return self

    def write_to_file(self, file_name: str):
        """
        Write the wrapped dataframe to a CSV file
        :param file_name: name of the output file
        :return: void
        """
        # NOTE(review): to_csv also writes the index as an unnamed first
        # column, which read_from_file reads back as an ordinary column.
        self.data.to_csv(file_name)

    def assign_time(self, start: str, increment: int):
        """
        If a csv file does not include a date section, this method adds it.
        A "Date" column is inserted at position 0 and a "Time" column at
        position 1; row k receives ``start + k * increment`` hours.
        Expected input: 01/23/2021 12:30 (mm/dd/yyyy hh:mm)
        On bad input (unparseable start, invalid increment, missing data,
        or columns that already exist) a message is printed and the method
        returns without raising.
        :param start: The starting date of the time series; a datetime
            object is accepted as well as the string form above
        :type start: str
        :param increment: the time interval, in hours
        :type increment: int
        :return: void
        """
        try:
            if isinstance(start, datetime):
                first = start
            else:
                parsed = self._START_PATTERN.match(start)
                if parsed is None:
                    raise ValueError(
                        f"expected 'mm/dd/yyyy hh:mm', got {start!r}")
                month, day, year, hour, minute = map(int, parsed.groups())
                first = datetime(year=year, month=month, day=day,
                                 hour=hour, minute=minute)
            step = timedelta(hours=int(increment))
            stamps = [first + step * offset
                      for offset in range(len(self.data))]
            # Lists are assigned by position, so this works whatever index
            # labels the dataframe currently carries.
            self.data.insert(0, "Date", [stamp.date() for stamp in stamps])
            self.data.insert(1, "Time", [stamp.time() for stamp in stamps])
        except (AttributeError, OverflowError, TypeError, ValueError) as exc:
            print(f"Error! {exc}")

    # ------------------------------------------------------------------
    # selection and cleaning
    # ------------------------------------------------------------------
    def clip(self, starting_date, final_date):
        """
        This method extracts time series data from the dataframe
        within a specified date (starting_date) and ending date (final_date)
        :param starting_date: date str in the form of mm/dd/yyyy
        :type starting_date: str
        :param final_date: ending date in the form of mm/dd/yyyy
        :type final_date: str
        :return: TimeSeries with extracted data
        """
        first_date = self.data.columns[0]  # the date column is the first one
        # filter_date is not a pandas method; presumably it is the DataFrame
        # method registered by the `janitor` import of this module.
        # NOTE(review): confirm whether it converts the date column of
        # self.data in place and whether both bounds are inclusive.
        clipped = self.data.filter_date(first_date, starting_date, final_date)
        return TimeSeries(clipped)

    def denoise(self):
        """
        Denoise a time series. This is accomplished by first
        filling in missing values and then removing outliers.
        This method modifies self.data; the returned TimeSeries wraps
        that same dataframe object.
        :return: TimeSeries with denoised data
        """
        self.impute_missing()
        self.impute_outliers()
        return TimeSeries(self.data)

    def impute_missing(self):
        """
        Fill missing values (NaNs) in self.data.
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.bfill.html
        Each NaN is replaced by the next valid value below it in the same
        column (backward fill). NaNs with no valid value after them are
        left as they are.
        :returns: void
        """
        # DataFrame.bfill() is the replacement for the deprecated
        # fillna(method='bfill').
        self.data = self.data.bfill()

    def difference(self):
        """
        This method calculates the difference between consecutive rows of
        the data (last) column: each value minus the value in the following
        row, so the last row becomes NaN.
        This method only modifies the data column of the returned copy.
        :return: TimeSeries with difference calculated
        """
        result = self.data.copy()
        target = result.columns[-1]
        result[target] = self.data[target] - self.data[target].shift(-1)
        return TimeSeries(result)

    def impute_outliers(self):
        """
        Find and remove outliers from the dataframe
        Referenced: https://stackoverflow.com/questions/23199796/
        detect-and-exclude-outliers-in-pandas-data-frame
        Find the 1% and 99% quantiles of the data (last) column and keep
        only the rows whose value lies between them. Rows whose data value
        is NaN are dropped as well, because comparisons with NaN are false.
        The surviving rows keep their original index labels.
        :returns: void
        """
        values = self.data[self.data.columns[-1]]
        q_low = values.quantile(.01)
        q_high = values.quantile(.99)
        # Inclusive bounds: with strict comparisons a constant column (or a
        # column with many tied minimum/maximum values) would lose every
        # one of those rows.
        self.data = self.data[(values >= q_low) & (values <= q_high)]

    def longest_continuous_run(self):
        """
        This method finds the longest continuous run in a dataframe.
        A continuous run is a stretch of consecutive rows whose data (last)
        column is not missing (NaN). When several runs share the maximum
        length, the earliest one is returned.
        :return: TimeSeries with longest continuous run
        """
        missing = self.data.iloc[:, -1].isna().to_numpy()
        best_start = 0
        best_length = 0
        run_start = 0
        # A virtual missing value one past the end closes the final run, so
        # the run before the first NaN and the run after the last NaN are
        # both measured.
        for position, is_missing in enumerate(list(missing) + [True]):
            if not is_missing:
                continue
            run_length = position - run_start
            if run_length > best_length:
                best_start, best_length = run_start, run_length
            run_start = position + 1
        return TimeSeries(self.data.iloc[best_start:best_start + best_length])

    # ------------------------------------------------------------------
    # element-wise transformations
    # ------------------------------------------------------------------
    @staticmethod
    def _is_number_column(column):
        """Tell whether a column holds integers or floats of any width."""
        return (pd.api.types.is_integer_dtype(column.dtype)
                or pd.api.types.is_float_dtype(column.dtype))

    def _transform_numeric(self, transform):
        """Apply transform to every integer/float column of a copy."""
        new_df = self.data.copy()
        for col in new_df:
            if self._is_number_column(new_df[col]):
                new_df[col] = transform(new_df[col])
        return TimeSeries(new_df)

    @staticmethod
    def _min_max(column):
        """Rescale one column linearly onto [0, 1]."""
        low = column.min()
        span = column.max() - low
        if pd.notna(span) and span == 0:
            # Constant column: avoid 0/0 and map every value to 0.0.
            return (column - low) * 1.0
        return (column - low) / span

    @staticmethod
    def _z_score(column):
        """Centre one column on 0 and divide by its population std."""
        centered = column - column.mean()
        spread = column.std(ddof=0)
        if pd.notna(spread) and spread == 0:
            # Constant column: avoid 0/0 and map every value to 0.0.
            return centered * 1.0
        return centered / spread

    def scaling(self):
        """
        Produces a time series whose magnitudes are scaled so that the resulting
        magnitudes range in the interval [0,1]. Every integer or float
        column is scaled independently; a constant column becomes all 0.
        :returns: TimeSeries with scaled data
        """
        return self._transform_numeric(self._min_max)

    def standardize(self):
        """
        Produces a time series whose mean is 0 and variance is 1.
        Every integer or float column is standardized independently using
        its mean and population standard deviation (ddof=0); a constant
        column becomes all 0.
        :returns: Timeseries with standard data
        """
        return self._transform_numeric(self._z_score)

    def logarithm(self):
        """
        Produces a time series whose elements are the base-10 logarithm of
        the original elements. Applied to every integer or float column.
        :returns: Timeseries with logarithm'd data
        """
        return self._transform_numeric(np.log10)

    def cubic_root(self):
        """
        Produces a time series whose elements are the original elements' cubic root.
        Applied to every integer or float column; negative values yield
        their real (negative) cubic root.
        :return: TimeSeries with Cubic root
        """
        # np.cbrt is defined for negative inputs, unlike ``** (1 / 3)``,
        # which yields NaN for them.
        return self._transform_numeric(np.cbrt)

    # ------------------------------------------------------------------
    # train / validation / test material
    # ------------------------------------------------------------------
    @staticmethod
    def _cut(total, fraction):
        """Row position that lies ``fraction`` of the way through ``total``."""
        # The small epsilon absorbs float error in summed fractions
        # (e.g. 0.58 * 100 evaluating to 57.99999999999999).
        return int(total * fraction + 1e-9)

    def split_data(self, perc_training=.8, perc_valid=.01, perc_test=.19):
        """
        Splits a time series into training, validation, and testing according to the given percentages.
        Only the data (last) column is split. The three consecutive slices
        are stored as plain lists in ``self.train``, ``self.val`` and
        ``self.test``.
        :param perc_training: fraction of rows (0..1) used for training
        :param perc_valid: fraction of rows used for validation
        :param perc_test: fraction of rows used for testing
        :return: void
        """
        if self.data is None or self.data.shape[1] == 0:
            values = []  # nothing loaded: every split is empty
        else:
            values = self.data.iloc[:, -1].tolist()
        total = len(values)
        valid_share = perc_training + perc_valid
        test_share = valid_share + perc_test
        train_end = self._cut(total, perc_training)
        valid_end = self._cut(total, valid_share)
        test_end = self._cut(total, test_share)
        # Slice ends are already exclusive, so no "- 1" is applied; doing so
        # would silently drop the last sample of every split.
        self.train = values[:train_end]
        self.val = values[train_end:valid_end]
        self.test = values[valid_end:test_end]

    @staticmethod
    def _windows(series, input_length, output_length):
        """Slide an (input, output) window pair over series, one step at a time."""
        inputs, outputs = [], []
        window = input_length + output_length
        for start in range(len(series) - window + 1):
            middle = start + input_length
            inputs.append(series[start:middle])
            outputs.append(series[middle:middle + output_length])
        return inputs, outputs

    def design_matrix(self, input_index=0, output_index=25):
        """
        Build sliding-window samples from the training and test splits
        produced by ``split_data``. Each sample pairs ``input_index``
        consecutive values with the ``output_index`` values that follow
        them; the window advances one position per sample. The validation
        split is not used.
        :param input_index: number of values in each input window
        :param output_index: number of values in each output window
        :return: ((x_train, y_train), (x_test, y_test)) as lists of lists
        """
        train_pair = self._windows(self.train, input_index, output_index)
        test_pair = self._windows(self.test, input_index, output_index)
        return train_pair, test_pair

    def ts2db(self, input_file_name, perc_training, perc_valid, perc_test, input_index,
              output_index, output_file_name):
        """
        Read a file (when a name is given), split the data and build the
        design matrices.
        :param input_file_name: csv file to load; a falsy value keeps the
            data already held by this object
        :param perc_training: fraction of rows used for training
        :param perc_valid: fraction of rows used for validation
        :param perc_test: fraction of rows used for testing
        :param input_index: number of values in each input window
        :param output_index: number of values in each output window
        :param output_file_name: currently unused; nothing is written
        :return: x_train, y_train, x_test, y_test
        """
        if input_file_name:
            self.read_from_file(input_file_name)
        self.split_data(perc_training, perc_valid, perc_test)
        (x_train, y_train), (x_test, y_test) = self.design_matrix(
            input_index=input_index, output_index=output_index)
        return x_train, y_train, x_test, y_test