utils.py
'''
Author: Pratheeksha Nair
'''
import pandas as pd
import numpy as np
import ast
import json
import plotly.graph_objects as go
from sklearn.linear_model import ElasticNet, LinearRegression

def get_feature_weights(X, Y):
    '''
    Returns the indices of the features that receive the most weight according to the selected points.
    I.e., if certain points were visually selected because they lie close to each other in a
    low-dimensional space, we want to identify which features in the higher-dimensional space
    may be responsible for that. For this, we fit a linear regression model on the
    selected/not-selected labels and rank the features by their learned coefficients.
    '''
    reg = LinearRegression().fit(X, Y)
    feature_weights = reg.coef_
    feature_weight_inds = np.argsort(feature_weights)[::-1][:4]  # top 4 out of 9 important features
    return feature_weight_inds
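
# Example usage (illustrative sketch; the feature matrix and selection labels
# below are hypothetical, not part of this module):
#
#   X = np.random.rand(100, 9)                  # high-dimensional features
#   Y = (X[:, 0] + X[:, 3] > 1).astype(int)     # 1 = point was visually selected
#   top_inds = get_feature_weights(X, Y)        # indices of the 4 most weighted features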

def get_summary(df):
    '''
    Returns a summary of the micro-cluster:
    - Number of ads
    - Number of phone numbers
    - Number of image URLs
    - Number of names
    - Span of days
    - Number of locations
    Each entry (except num_ads) is a [non-null count, unique count] pair.
    '''
    summary_stats = {}
    df['post_date'] = pd.to_datetime(df.post_date)
    df = df.sort_values(by='post_date')
    # df['name'] = df['name'].apply(ast.literal_eval)  # uncomment if names are stored as string-encoded lists
    summary_stats['num_ads'] = [len(df)]
    summary_stats['num_phone'] = [df.phone_num.count(), df.phone_num.nunique()]
    summary_stats['img_urls'] = [df.img_urls.count(), df.img_urls.nunique()]
    summary_stats['num_names'] = [df.name.count(), df.name.nunique()]
    summary_stats['num_loc'] = [df.location.count(), df.location.nunique()]
    summary_stats['num_days'] = [df.post_date.count(), df.post_date.nunique()]
    return summary_stats
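
# Example usage (illustrative sketch; the ads DataFrame below is hypothetical
# and only shows the columns this function expects):
#
#   ads = pd.DataFrame({
#       'post_date': ['2020-01-01', '2020-01-05'],
#       'phone_num': ['555-0100', '555-0100'],
#       'img_urls': ['a.jpg', 'b.jpg'],
#       'name': ['alice', 'bob'],
#       'location': ['Montreal', 'Toronto'],
#   })
#   stats = get_summary(ads)   # e.g. stats['num_phone'] == [2, 1]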

def plotlyfromjson(fpath):
    """Render a plotly figure from a json file"""
    with open(fpath, 'r', encoding='utf-8') as f:
        v = json.loads(f.read())
    fig = go.Figure(data=v['data'], layout=v['layout'])
    return fig
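
# Example usage (illustrative sketch; the path is hypothetical):
#
#   fig = plotlyfromjson('figures/cluster_embedding.json')
#   fig.show()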

def get_location_time_info(df, geoloc_info):
    '''
    Builds a DataFrame of ad counts per (meta cluster, post date, city), with
    latitude/longitude and a bucketed marker size for plotting on a map.
    '''
    ads_per_location_time_df = pd.DataFrame(columns=['date', 'lat', 'lon', 'count', 'location', 'meta_label'])
    lats = []
    lons = []
    counts = []
    plot_counts = []
    locations = []
    plot_txt = []
    dates = []
    meta_ids = []
    for i, cluster in df.groupby("Meta label"):
        # for id, grp in cluster.groupby("year-month"):
        for id, grp in cluster.groupby("post_date"):
            cities = grp['cleaned_loc'].unique()
            for city in cities:
                geo = geoloc_info.loc[city].drop_duplicates()
                try:
                    if city == 'Victoria':
                        # 'Victoria' maps to several rows; take the first set of coordinates
                        lats.append(geo['lat'][0])
                        lons.append(geo['lng'][0])
                    else:
                        lats.append(geo['lat'])
                        lons.append(geo['lng'])
                except Exception:
                    # Fall back to the free-text 'geolocation' column ("<lat> <lng>")
                    if isinstance(geo.geolocation, str):
                        gg = geo.geolocation.split()
                        if len(gg) > 2:
                            print(gg)
                    elif np.all(pd.isnull(geo.geolocation)):
                        # No coordinates available for this city; skip it
                        continue
                    else:
                        gg = geo.geolocation.values[0].split()
                    lats.append(gg[0])
                    lons.append(gg[1])
                cnt = grp[grp.cleaned_loc == city].ad_id.count()
                counts.append(cnt)
                dates.append(id.strftime("%Y-%m-%d"))
                # Bucket the raw ad count into a marker size for plotting
                if cnt <= 99:
                    plot_counts.append(5)
                elif cnt <= 499:
                    plot_counts.append(10)
                elif cnt <= 999:
                    plot_counts.append(20)
                elif cnt <= 4999:
                    plot_counts.append(50)
                else:
                    plot_counts.append(90)
                locations.append(city)
                plot_txt.append(city)
                meta_ids.append(i)
    ads_per_location_time_df['lat'] = lats
    ads_per_location_time_df['lon'] = lons
    ads_per_location_time_df['count'] = counts
    ads_per_location_time_df['location'] = locations
    ads_per_location_time_df['plot_text'] = plot_txt
    ads_per_location_time_df['date'] = dates
    ads_per_location_time_df['plot_counts'] = plot_counts
    ads_per_location_time_df['Meta Cluster ID'] = meta_ids
    ads_per_location_time_df.sort_values(by='date', inplace=True)
    return ads_per_location_time_df
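
# Example usage (illustrative sketch; both frames below are hypothetical and only
# show the columns this function expects: geoloc_info is indexed by city and
# carries 'lat'/'lng' columns plus a free-text 'geolocation' fallback):
#
#   ads = pd.DataFrame({
#       'Meta label': [0, 0],
#       'post_date': pd.to_datetime(['2020-01-01', '2020-01-02']),
#       'cleaned_loc': ['Montreal', 'Toronto'],
#       'ad_id': [1, 2],
#   })
#   geoloc_info = pd.DataFrame(
#       {'lat': [45.50, 43.70], 'lng': [-73.57, -79.42], 'geolocation': [None, None]},
#       index=['Montreal', 'Toronto'])
#   plot_df = get_location_time_info(ads, geoloc_info)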