# helper_functions_LSTM_TF.py
import os
#https://stackoverflow.com/questions/37604289/tkinter-tclerror-no-display-name-and-no-display-environment-variable/43592515
#for running the pipeline through SSH
import matplotlib as mpl
if os.environ.get('DISPLAY', '') == '':
    mpl.use('Agg')
import json, nltk, re, string  # os is already imported above
# from sklearn.externals import joblib
import joblib
import numpy as np
import pickle, gzip
import pdb
import h5py
# import cPickle
import _pickle as cPickle
from ldl_utils import vectorize
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from collections import defaultdict,OrderedDict
from gensim.models import LdaModel #https://radimrehurek.com/gensim/utils.html#gensim.utils.SaveLoad.load
import shutil #copy lda max model to final folder
from helper_functions import save_to_json_foldercheck
import datetime
import pandas as pd
import wandb
# WANDB_NAME = "maxent-experiments"
# WANDB_NAME = "pooling-experiments"
# WANDB_NAME = "anuj_exps"
# WANDB_NAME = "tr_exps"
# WANDB_NAME = "pp_pldl_party"
# WANDB_NAME = "pp_pldl"
# WANDB_NAME = "DS-experiments"
# WANDB_NAME = "acl_exps"
WANDB_NAME = "crowdeval_exps"
from tqdm import tqdm
def gpu_fix_cuda():
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))
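# Note: on TF2-only setups the same effect can be achieved natively, e.g.:
#   for gpu in tf.config.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)
# The compat.v1 session path above is kept as-is for this pipeline.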
def save_keras_trained_model(MODEL_LOG_DIR, model):
    # output_name = cnn_name.split("_")[0]
    # model_dir = MODEL_LOG_DIR + output_name
    #
    # if not os.path.exists(model_dir):
    #     os.makedirs(model_dir)
    with open(MODEL_LOG_DIR, "wb") as file:
        cPickle.dump(model, file)
    del model  # delete the existing model from memory
#data/LSTM/jobQ1/jobQ1_CF_shuffle_lda_AVGprob.h5
def load_keras_model(MODEL_LOG_DIR, cnn_name):
    output_name = cnn_name.split("_")[0]
    model_dir = MODEL_LOG_DIR + output_name
    model = load_model(model_dir + '/' + cnn_name + '.h5')
    return model
def get_feature_vectors(fdict, data):
    #output = {}
    output = defaultdict(list)
    for item in data:
        # vect = vectorize(fdict, item["labels"])
        vect = list(item["labels"].values()) #list conversion through vectorize without frills
        item["message_id"] = int(item["message_id"])
        output[item["message_id"]] = vect
    return output
def read_labels_json(fdict, data):
    # identical to get_feature_vectors; kept under this name for existing callers
    #output = {}
    output = defaultdict(list)
    for item in data:
        # vect = vectorize(fdict, item["labels"])
        vect = list(item["labels"].values()) #list conversion through vectorize without frills
        item["message_id"] = int(item["message_id"])
        output[item["message_id"]] = vect
    return output
def compile_tweet_dict(json_list):
    result = {int(x["message_id"]): x["message"] for x in json_list}
    return result
def check_label_frequency(y):
    print(y.sum(axis=0))
    #print(y_test.sum(axis=0))
def myconverter(o):
    # json.dumps default= hook: render datetimes as strings
    if isinstance(o, datetime.datetime):
        return o.__str__()
def write_results_to_json(model_file, model_type, score, acc, y_test_KL, y_test_Mis, y_test_Nmis, epsilon, outputdir):
    result = defaultdict(list)
    results = defaultdict(list)
    model_info = defaultdict(list)
    model_info["Model path"] = model_file
    model_info["Model"] = model_type
    model_info["Timestamp"] = datetime.datetime.now()
    result["Test score"] = score
    result["Test accuracy"] = acc
    result["KL divergence"] = y_test_KL
    result["Mutual information score"] = y_test_Mis
    result["Normalized mutual information score"] = y_test_Nmis
    results["Evaluations"] = result
    results["Model info"] = model_info
    results["Epsilon"] = epsilon
    # pdb.set_trace()
    if not os.path.exists(os.path.dirname(outputdir)):
        os.makedirs(os.path.dirname(outputdir))
    with open(outputdir, 'a') as outfile:
        outfile.write(json.dumps(results, indent=4, default=myconverter))
    print("JSON file saved to " + outputdir)
def write_results_to_json_pandas(model_file, model_type, score, acc, y_test_KL, y_test_Mis, y_test_Nmis, epsilon, outputdir):
    results = defaultdict(list)
    results["Model path"] = model_file
    results["Model"] = model_type
    results["Timestamp"] = datetime.datetime.now()
    results["Test accuracy"] = acc
    results["KL divergence"] = y_test_KL
    results["Mutual information score"] = y_test_Mis
    results["Normalized mutual information score"] = y_test_Nmis
    # results["Evaluations"] = result
    results["Recall Macro"] = score['recall']['macro']
    results["Precision Macro"] = score['precision']['macro']
    results["F1 Macro"] = score['f1_macro']
    results["Epsilon"] = epsilon
    results_df = pd.DataFrame(results, index=[0])
    if not os.path.exists(os.path.dirname(outputdir)):
        os.makedirs(os.path.dirname(outputdir))
    if os.path.exists(outputdir):
        # append without header so repeated runs accumulate rows in one CSV
        results_df.to_csv(outputdir, mode='a', header=False, index=False)
    else:
        results_df.to_csv(outputdir, index=False)
def write_results_to_wandb(model_file, model_type, score, acc, y_test_KL, y_test_Mis, y_test_Nmis, epsilon, dataset, weight, wandb_project):
    results = defaultdict(list)
    # pass config to init (or use wandb.config.update); plain assignment to
    # wandb.config does not record the values with the run
    wandb.init(project=wandb_project, name=dataset, config={
        "model": model_type,
        "model_path": model_file,
        "dataset": dataset
    })
    # results["Timestamp"] = datetime.datetime.now()
    results["Test accuracy"] = acc
    results["KL divergence"] = y_test_KL
    results["Mutual information score"] = y_test_Mis
    results["Normalized mutual information score"] = y_test_Nmis
    results["Recall Macro"] = score['recall']['macro']
    results["Precision Macro"] = score['precision']['macro']
    results["F1 Macro"] = score['f1_macro']
    results["Epsilon"] = epsilon
    results["Dataset"] = dataset
    results["Weight"] = weight
    wandb.log(results)
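# The `score` argument above is assumed to be a nested dict shaped like
#   {"recall": {"macro": 0.71}, "precision": {"macro": 0.69}, "f1_macro": 0.70}
# (values made up for illustration), matching the keys read in both
# write_results_to_json_pandas and write_results_to_wandb.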
def write_results_to_json_only(results, outputdir):
    results["Timestamp"] = datetime.datetime.utcnow()
    if not os.path.exists(os.path.dirname(outputdir)):
        os.makedirs(os.path.dirname(outputdir))
    with open(outputdir, 'a') as outfile:
        outfile.write(json.dumps(results, indent=4, default=myconverter))
    print("JSON file saved to " + outputdir)
# def write_results_to_mongodb(model_file,process_id,score,acc,y_test_KL,y_test_Mis,y_test_Nmis,run_location,db_name,epsilon):
#     mongo_client = pymongo.MongoClient(MONGODB_URL)
#     mongo_db = mongo_client[db_name]
#     mongo_col = mongo_db[process_id]
#
#     for KL,Mis,Nmis in zip(y_test_KL,y_test_Mis,y_test_Nmis):
#         results = defaultdict(list)
#         results["Model path"] = model_file
#         results["Run Location"] = run_location
#         results["date"] = datetime.datetime.utcnow()
#         results["Test score"] = score
#         results["Test accuracy"] = acc
#         results["KL divergence"] = KL
#         results["Mutual information score"] = Mis
#         results["Normalized mutual information score"] = Nmis
#         results["Epsilon"] = epsilon
#
#         x = mongo_col.insert_one(results)
#
#     print("Result saved to the database")
def build_text_labels(message_dict, answer_counters):
    text = []
    labels = []
    for message_id in message_dict:
        text.append(message_dict[int(message_id)])
        labels.append(answer_counters[int(message_id)])
    labels = np.asarray(labels)
    return text, labels
def build_labels_dict(answer_counters):
    labels = []
    for message_id in answer_counters:
        labels.append(answer_counters[message_id])
    labels = np.asarray(labels)
    return labels
def write_predictions_to_json(predictions, data_dict, label_dict, output):
    data_to_write = {}
    predictions_to_write = []
    for message_id, pred_vect in zip(data_dict, predictions):
        message = data_dict[int(message_id)]
        labels = {x: y for x, y in zip(label_dict.values(), pred_vect[0].tolist())}
        predictions_to_write.append(OrderedDict([("message_id", message_id), ("message", message), ("labels", labels)]))
    data_to_write['dictionary'] = label_dict
    data_to_write['data'] = predictions_to_write
    save_to_json_foldercheck(data_to_write, output)
def write_predictions_to_json_cnn(predictions, data_dict, label_dict, output):
    # same as write_predictions_to_json, but CNN predictions arrive as flat
    # vectors, so no pred_vect[0] indexing
    data_to_write = {}
    predictions_to_write = []
    for message_id, pred_vect in zip(data_dict, predictions):
        message = data_dict[int(message_id)]
        labels = {x: y for x, y in zip(label_dict.values(), pred_vect.tolist())}
        predictions_to_write.append(OrderedDict([("message_id", message_id), ("message", message), ("labels", labels)]))
    data_to_write['dictionary'] = label_dict
    data_to_write['data'] = predictions_to_write
    save_to_json_foldercheck(data_to_write, output)
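# Resulting JSON layout (reconstructed from the code above):
#   {"dictionary": <label_dict>,
#    "data": [{"message_id": ..., "message": ..., "labels": {label: prob, ...}}, ...]}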
def keras_feature_prep(texts, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    # token represented by index
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print(len(sequences), len(word_index))
    features = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print(features, features.shape)
    return features, word_index
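# Illustrative usage (parameter values are arbitrary, not the pipeline's real settings):
#   features, word_index = keras_feature_prep(["first tweet", "another tweet here"],
#                                             MAX_NB_WORDS=20000, MAX_SEQUENCE_LENGTH=50)
#   # features: (2, 50) int array, zero-padded on the left; word_index maps token -> id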
def plot_NN_history(history_NN, NN_name, kind):
    # plt.style.use('ggplot')
    # note: newer Keras versions name these history keys 'accuracy'/'val_accuracy'
    plt.plot(history_NN.history['acc'])
    plt.plot(history_NN.history['val_acc'])
    plt.legend(['Learning Curve', 'Validation Curve'], loc='best')
    plt.title('%s accuracy' % kind)
    plt.xlabel('Iterations')
    plt.ylabel('Accuracy')
    plt.xticks(range(0, 26, 5))
    plt.yticks()
    plt.savefig(NN_name + ("_%s.pdf" % kind))
    #plt.savefig("figures/" + NN_name + ("_%s.pdf" % kind))
def plot_KN_history(x, y1, measure, folder_name, kind):
    # plt.style.use('ggplot')
    plt.plot(x, y1)
    plt.legend([measure], loc='best')
    plt.title('%s KL-Divergence' % kind)
    plt.xlabel('epsilon')
    plt.ylabel('KL-Divergence')
    #plt.xticks(range(0, 26, 5))
    plt.xticks()
    plt.yticks()
    plt.savefig(folder_name + ("/%s.pdf" % kind))
    #plt.savefig("figures/" + NN_name + ("_%s.pdf" % kind))
    plt.close()
def plot_KN_history_all(x, y1, y2, y3, y4, y5, folder_name, kind):
    # plt.style.use('ggplot')
    plt.plot(x, y1)
    plt.plot(x, y2)
    plt.plot(x, y3)
    plt.plot(x, y4)
    plt.plot(x, y5)
    plt.legend(['KL-Divergence', 'Chebyshev distance', 'Euclidean distance', 'Canberra distance', 'Cosine Similarity'], loc='best')
    plt.title('%s KL-Divergence' % kind)
    plt.xlabel('epsilon')
    plt.ylabel('KL-Divergence')
    #plt.xticks(range(0, 26, 5))
    plt.xticks()
    plt.yticks()
    plt.savefig(folder_name + ("/%s.pdf" % kind))
    plt.close()
    #plt.savefig("figures/" + NN_name + ("_%s.pdf" % kind))
def KLdivergence(P, Q):
    # from Q to P
    # https://datascience.stackexchange.com/a/26318/30372
    """
    Epsilon is used here to avoid conditional code for
    checking that neither P nor Q is equal to 0.
    """
    epsilon = 0.00001
    P = P + epsilon
    Q = Q + epsilon
    return np.sum(P * np.log(P / Q))
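# Toy sanity check (not part of the pipeline):
#   p = np.array([0.5, 0.5]); q = np.array([0.9, 0.1])
#   KLdivergence(p, p)  # ~0 for identical distributions
#   KLdivergence(p, q) != KLdivergence(q, p)  # KL is asymmetric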
def JSdivergence(P, Q):
    # https://datascience.stackexchange.com/a/26318/30372
    """
    Jensen-Shannon divergence: the symmetrized KL divergence of P and Q
    against their mixture M = 0.5 * (P + Q).
    """
    M = 0.5 * (P + Q)
    return 0.5 * KLdivergence(P, M) + 0.5 * KLdivergence(Q, M)
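# Unlike KL, JSdivergence(p, q) == JSdivergence(q, p), and with natural log its
# value is bounded above by log(2) for proper probability distributions.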
# def save_keras_trained_model(MODEL_LOG_DIR, model, cnn_name):
#     output_name = cnn_name.split("_")[0]
#     model_dir = MODEL_LOG_DIR + output_name
#
#     if not os.path.exists(model_dir):
#         os.makedirs(model_dir)
#
#     model.save(model_dir + '/' + cnn_name + '.h5')  # creates an HDF5 file
#     del model