-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.py
110 lines (86 loc) · 3.84 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pandas as pd
import matplotlib.pyplot as plt
#################################################################################
# Helper Functions
#################################################################################
def sparql_to_df(g, q):
"""
Transforms sparql query result into a pandas dataframe called metadata, and uses this to retrieve timeseries data
Input: rdf graph g and sparql query q
Output: metadata and timeseries pandas dataframes
"""
# query the graph
res = g.query(q)
# convert to SparqlResult to a pandas dataframe with column names relfecting variables in sparql query
md = pd.DataFrame.from_records(data=list(res),columns=res.vars)
md = md.applymap(str) #convert everything to a string
md.columns = [x.strip() for x in md.columns]
md.drop_duplicates(inplace=True) #get rid of duplicate rows
for column in md:
if column == 'unit':
md[column] = md[column].str.split("unit/", expand=True).iloc[:,-1:]
else:
md[column] = md[column].str.split("#", expand=True).iloc[:,-1:]
# extract the ids from the metadata dataframe and add DATETIME
md.set_index('sensorid', inplace=True, drop=True)
ids = list(md.index)
ids.insert(0,'DATETIME')
# extract the location from the metadata dataframe
path = md['dblocation'][0]
# retrieve timeseries data
ts = get_ts_data(ids, path)
return md, ts
def get_ts_data(ids, path):
"""
Retrieves timeseries data using the ids given
Input: list containing timeseries ids
Output: pandas dataframe containing timeseries data for given id's
"""
ts = pd.read_csv(path, usecols=ids)
ts['DATETIME'] = pd.to_datetime(ts['DATETIME'])
ts.set_index('DATETIME', inplace=True, drop=True)
return ts
def missing_values(ts):
"""
Checks if the timeseries data contains missing values, and what percentage of data is missing
Input: timeseries pandas dataframe
Output: pandas dataframe, indexed by column name
"""
# row, column count
row, col = ts.shape
# count empty values and percentage of column empty
num_empty = ts.isnull().sum()
percent_empty = 100 * num_empty / row
# create a pandas table to store this information. Sort by percentage missing, descending order
mis_val_table = pd.concat([num_empty, percent_empty], axis=1).rename(columns = {0 : 'N. of Missing Values', 1 : '% of Values Missing'})
mis_val_table = mis_val_table[mis_val_table.iloc[:,1] != 0].sort_values('% of Values Missing', ascending=False).round()
print(f"The timeseries data has {col} columns. \n{len(mis_val_table.index)} of these have missing values.")
return mis_val_table
def timeseries_plot(timeseries,metadata, sensors=None, timeslice=None):
"""
Plots timeseries data for a given list of sensors and time period
Inputs: timeseries data as pandas dataframe, a list of sensors, timeslice as a tuple (YY-MM-DD HH-MM-SS, YY-MM-DD HH-MM-SS)
"""
if timeslice:
try:
timeseries = timeseries[timeslice[0]:timeslice[1]]
except:
print("Timeslice is invalid. Must be of format ('YY-MM-DD HH-MM-SS', 'YY-MM-DD HH-MM-SS') ")
return
if sensors:
try:
timeseries = timeseries[sensors]
except:
print("Sensors are invalid. Must be strings in a list. Check the naming")
return
elif not sensors:
sensors = list(timeseries.columns)
sensor_units = {}
for sensor in sensors:
sensor_units[sensor] = metadata["unit"][sensor]
if len(set(sensor_units.values())) != 1:
# there is more than one unit
print("NOTE: Sensors are of different units")
return
timeseries.plot(xlabel= "Time-Stamp",ylabel=list(sensor_units.values())[0])
plt.show()