# Features100K.py (forked from hadoov/GHRS)
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import itertools
import collections
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
def convert_categorical(df_X, _X):
    """One-hot encode column `_X` of df_X, replacing it with indicator columns."""
    values = np.array(df_X[_X])
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    # one-hot encode; sparse_output=False returns a dense array
    # (the keyword was named `sparse` in scikit-learn < 1.2)
    onehot_encoder = OneHotEncoder(sparse_output=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    df_X = df_X.drop(columns=_X)  # positional axis argument was removed in pandas 2.0
    for j in range(integer_encoded.max() + 1):
        df_X.insert(loc=j + 1, column=str(_X) + str(j + 1), value=onehot_encoded[:, j])
    return df_X
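# Illustrative usage (not part of the original pipeline): a toy frame such as
#   toy = pd.DataFrame({'UID': [1, 2, 3], 'gender': ['F', 'M', 'F']})
#   convert_categorical(toy, 'gender')
# returns columns UID, gender1, gender2 holding 0/1 indicators.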
alpha_coefs = [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045]
dataPath = 'datasets/ml-100k/'
# MovieLens 100K: ua.base is tab-separated (user, item, rating, timestamp),
# u.user is pipe-separated (user, age, gender, occupation, zip code)
df = pd.read_csv(dataPath + 'ua.base', sep='\t', engine='python', names=['UID', 'MID', 'rate', 'time'])
df_user = pd.read_csv(dataPath + 'u.user', sep='|', engine='python', names=['UID', 'age', 'gender', 'job', 'zip'])
df_user = convert_categorical(df_user, 'job')
df_user = convert_categorical(df_user, 'gender')
# bin ages into six brackets, then one-hot encode the bracket label
df_user['bin'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 100], labels=['1', '2', '3', '4', '5', '6'])
df_user['age'] = df_user['bin']
df_user = df_user.drop(columns='bin')
df_user = convert_categorical(df_user, 'age')
df_user = df_user.drop(columns='zip')
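# df_user now holds UID plus one-hot job, gender, and age-bracket columns;
# the loop below appends six normalized graph features for each alpha value.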
for alpha_coef in alpha_coefs:
    # count, for every user pair, how many movies they rated identically
    pairs = []
    grouped = df.groupby(['MID', 'rate'])
    for key, group in grouped:
        pairs.extend(list(itertools.combinations(group['UID'], 2)))
    counter = collections.Counter(pairs)
    alpha = alpha_coef * 1682  # 1682 = number of movies in ML-100K
    # keep pairs that co-rated at least alpha movies identically
    edge_list = [list(el) for el in counter if counter[el] >= alpha]
    G = nx.Graph()
    for el in edge_list:
        G.add_edge(el[0], el[1], weight=1)
        # self-loops keep both endpoints in the graph with nonzero degree
        G.add_edge(el[0], el[0], weight=1)
        G.add_edge(el[1], el[1], weight=1)

    # graph features, each normalized by its maximum over all users
    pr = nx.pagerank(G.to_directed())
    df_user['PR'] = df_user['UID'].map(pr)
    df_user['PR'] /= float(df_user['PR'].max())
    dc = nx.degree_centrality(G)
    df_user['CD'] = df_user['UID'].map(dc)
    df_user['CD'] /= float(df_user['CD'].max())
    cc = nx.closeness_centrality(G)
    df_user['CC'] = df_user['UID'].map(cc)
    df_user['CC'] /= float(df_user['CC'].max())
    bc = nx.betweenness_centrality(G)
    df_user['CB'] = df_user['UID'].map(bc)
    df_user['CB'] /= float(df_user['CB'].max())
    lc = nx.load_centrality(G)
    df_user['LC'] = df_user['UID'].map(lc)
    df_user['LC'] /= float(df_user['LC'].max())
    nd = nx.average_neighbor_degree(G, weight='weight')
    df_user['AND'] = df_user['UID'].map(nd)
    df_user['AND'] /= float(df_user['AND'].max())

    # drop the UID column; users absent from G get NaN features, filled with 0
    X_train = df_user[df_user.columns[1:]].fillna(0)
    X_train.to_pickle("data100k/x_train_alpha(" + str(alpha_coef) + ").pkl")
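# The saved frames can be reloaded later for model training, e.g.:
#   x = pd.read_pickle("data100k/x_train_alpha(0.005).pkl")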