-
Notifications
You must be signed in to change notification settings - Fork 2
/
DNGR.py
165 lines (111 loc) · 4.88 KB
/
DNGR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
# coding: utf-8
"""
Keras implementation of DNGR model. Generate embeddings for NG3, NG6 and NG9
of 20NewsGroup dataset. Evaluate with F1-score from MNB classifier and NMI score.
Also visualizes the embeddings with t-SNE.
Author: Apoorva Vinod Gorur
"""
import sys
import numpy as np
import warnings
import DNGR_utils as ut
import matplotlib.pyplot as plt
from argparse import ArgumentParser
#Stage 1 - Random Surfing
@ut.timer("Random_Surfing")
def random_surf(cosine_sim_matrix, num_hops, alpha):
    """Accumulate random-surfing probability matrices over ``num_hops`` steps.

    The similarity matrix is first passed through ``ut.scale_sim_matrix`` to
    obtain the matrix used for each hop. Starting from the identity matrix,
    every hop applies

        P_k = alpha * (P_{k-1} . adj) + (1 - alpha) * P_0

    and the per-hop matrices are summed.

    Args:
        cosine_sim_matrix: square similarity matrix (one row per node).
        num_hops: number of surfing steps to accumulate.
        alpha: weight of continuing the walk; ``1 - alpha`` is the weight of
            restarting from the initial identity distribution.

    Returns:
        The sum of the probability matrices produced by each hop.
    """
    size = len(cosine_sim_matrix)
    transition = ut.scale_sim_matrix(cosine_sim_matrix)

    # P_0: every node starts on itself.
    restart = np.eye(size, dtype='float32')
    current = restart.copy()
    accumulated = np.zeros((size, size), dtype='float32')

    for _ in range(num_hops):
        current = alpha * np.dot(current, transition) + (1 - alpha) * restart
        # Deliberately not in-place so dtype promotion matches plain addition.
        accumulated = accumulated + current

    return accumulated
#Stage 2 - PPMI Matrix
@ut.timer("Generating PPMI matrix")
def PPMI_matrix(A):
    """Build the positive pointwise mutual information (PPMI) matrix from ``A``.

    ``A`` is rescaled with ``ut.scale_sim_matrix``; each entry then becomes
    ``log(D * A[i, j] / (row_sum[i] * col_sum[j]))`` where ``D`` is the total
    sum of the rescaled matrix. Every entry that is not a finite positive
    number (``inf``, ``-inf``, ``NaN`` or negative) is replaced by 0.

    Args:
        A: square matrix produced by the random-surfing stage.

    Returns:
        The PPMI matrix, same shape as ``A``, containing only finite values
        greater than or equal to 0.
    """
    num_nodes = len(A)
    A = ut.scale_sim_matrix(A)

    row_sum = np.sum(A, axis=1).reshape(num_nodes, 1)
    col_sum = np.sum(A, axis=0).reshape(1, num_nodes)
    total = np.sum(col_sum)

    # Zero entries give log(0) = -inf and 0/0 gives NaN. Let numpy store those
    # values silently here (instead of warning) and clean them up just below.
    # NOTE(review): the diagonal is presumably zeroed by ut.scale_sim_matrix,
    # which is why such entries are expected - confirm against DNGR_utils.
    with np.errstate(divide='ignore', invalid='ignore'):
        PPMI = np.log(np.divide(np.multiply(total, A), np.dot(row_sum, col_sum)))

    # isfinite is False for inf, -inf and NaN. NaN must be handled explicitly:
    # `NaN < 0` is False, so it would otherwise survive the clipping below.
    PPMI[~np.isfinite(PPMI)] = 0.0
    # Keep only the positive part of the PMI.
    PPMI[PPMI < 0.0] = 0.0

    return PPMI
#Stage 3 - AutoEncoders
@ut.timer("Generating embeddings with AutoEncoders")
def sdae(PPMI, hidden_neurons):
    """Train a denoising auto-encoder on ``PPMI`` and return the bottleneck output.

    The input is corrupted with Gaussian noise, passed through one ReLU dense
    layer per entry of ``hidden_neurons`` (the last entry is the bottleneck),
    and decoded through the same sizes in reverse order back to the input
    width. The auto-encoder is trained to reconstruct ``PPMI`` with MSE loss.

    Args:
        PPMI: 2-D array; each row is one training sample.
        hidden_neurons: sequence of encoder layer sizes, outermost first.

    Returns:
        The bottleneck activations for every row of ``PPMI``, as returned by
        ``encoder.predict``.
    """
    # Local import so Keras is only loaded when this stage actually runs.
    # GaussianNoise is imported directly from keras.layers: the
    # `keras.layers.noise` submodule is not available in newer Keras releases.
    from keras.layers import Input, Dense, GaussianNoise
    from keras.models import Model

    # Input layer. Corrupt with Gaussian noise.
    inp = Input(shape=(PPMI.shape[1],))
    enc = GaussianNoise(0.2)(inp)

    # Encoding layers. The last layer is the bottleneck.
    for neurons in hidden_neurons:
        enc = Dense(neurons, activation='relu')(enc)

    # Decoding layers: mirror every encoder layer except the bottleneck,
    # innermost first. (The previous slice `[:-3]` skipped the layer at
    # index -3, so the decoder was not symmetric with the encoder.)
    dec = enc
    for neurons in reversed(hidden_neurons[:-1]):
        dec = Dense(neurons, activation='relu')(dec)
    dec = Dense(PPMI.shape[1], activation='relu')(dec)

    # Train the full auto-encoder to reconstruct its own input.
    auto_enc = Model(inputs=inp, outputs=dec)
    auto_enc.compile(optimizer='adam', loss='mse')
    auto_enc.fit(x=PPMI, y=PPMI, batch_size=10, epochs=5)

    # The encoder shares the trained layers; it is only used for inference.
    encoder = Model(inputs=inp, outputs=enc)
    encoder.compile(optimizer='adam', loss='mse')
    embeddings = encoder.predict(PPMI)

    return embeddings
@ut.timer("the whole process")
def process(args):
    """Validate the parsed options and run the three DNGR stages end to end.

    Reads ``group``, ``hops``, ``alpha`` and ``hidden_neurons`` from ``args``.
    Exits via ``sys.exit`` with an argparse-style message when an option is
    out of range. Otherwise it loads the data, builds the cosine-similarity
    matrix, runs random surfing, PPMI and the auto-encoder, then hands the
    embeddings to the metric and t-SNE helpers and shows the plot.

    Args:
        args: namespace holding the command-line options listed above.
    """
    group = args.group
    hops = args.hops
    alpha = args.alpha
    layer_sizes = args.hidden_neurons

    # Option validation, in the same order as the options are read.
    if num_hops_invalid := (hops < 1):
        sys.exit("DNGR: error: argument --hops: Max hops should be a positive whole number")
    if alpha > 1.0 or alpha < 0.0:
        sys.exit("DNGR: error: argument --alpha: Alpha's range is 0-1")
    if len(layer_sizes) < 3:
        sys.exit("DNGR: error: argument --hidden_neurons: Need a minimum of 3 hidden layers")

    # Load the selected news groups.
    text_corpus, file_names, target = ut.read_data(group)

    # The cosine-similarity matrix acts as the adjacency matrix of the graph.
    similarity = ut.get_cosine_sim_matrix(text_corpus)

    # Stage 1 (random surfing) -> Stage 2 (PPMI) -> Stage 3 (auto-encoder).
    surfed = random_surf(similarity, hops, alpha)
    ppmi = PPMI_matrix(surfed)
    embeddings = sdae(ppmi, layer_sizes)

    # Evaluation, then t-SNE visualisation of the embeddings.
    ut.compute_metrics(embeddings, target)
    ut.visualize_TSNE(embeddings, target)
    plt.show()
def main():
    """Parse the command-line options and run the DNGR pipeline.

    Options:
        --group: one of NG3, NG6, NG9 (default NG3).
        --hops: integer number of random-surfing hops (default 2).
        --alpha: float continuation probability for random surfing
            (default 0.98).
        --hidden_neurons: one or more integer layer sizes
            (default 512 256 128).

    All Python warnings are silenced before the pipeline starts.
    """
    parser = ArgumentParser('DNGR', description="This is a Keras implementaion of DNGR evaluating the 20NewsGroup dataset.")
    parser.add_argument('--group', default='NG3', const='NG3', nargs='?',
                        choices=['NG3', 'NG6', 'NG9'],
                        help='Choose the group to evaluate')
    parser.add_argument('--hops', default=2, type=int,
                        help='Maximum number of hops for Transition Matrix in Random surfing')
    # type=float is required: without it a value given on the command line is
    # kept as a string and the numeric range check in process() fails.
    parser.add_argument('--alpha', default=0.98, type=float,
                        help='Probability of (alpha) that surfing will go to next vertex, probability of (1-alpha) that surfing will return to original vertex and restart. Range 0-1')
    parser.add_argument('--hidden_neurons', default=[512, 256, 128], type=int, nargs='+',
                        help='Eg: \'512 256 128\' or \'256 128 64 32\'. Number of hidden neurons in auto-encoder layers. Make sure there are 3 or more layers')

    warnings.filterwarnings("ignore")
    args = parser.parse_args()
    process(args)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()