# https://stackoverflow.com/questions/37604289/tkinter-tclerror-no-display-name-and-no-display-environment-variable/43592515
# For running the pipeline through SSH: fall back to the non-interactive Agg backend when no display is available.
import os
import matplotlib as mpl
if os.environ.get('DISPLAY', '') == '':
    mpl.use('Agg')
import argparse
import bnpy
import sys
import pdb
import numpy as np
from numpy import argmax, dot
from collections import defaultdict,OrderedDict
from ldl_utils import get_data_dict, get_feature_vectors, vectorize,read_json,compile_tweet_dict,save_label_dict,load_label_dict,save_label_vects,load_label_vects
from helper_functions import data_prep_bnpy,save_bnpy_model,load_bnpy_model,build_prob_distribution,map_probability_to_label,generate_topics_dict,create_folder,save_to_json_foldercheck
from FMM_utils import get_assignments,language_prep_bnpy
import pickle
import pandas as pd
from helper_functions_nlp import clean_text_for_sklean,build_bag_of_words,data_in_cluster_sklearn,save_trained_model_joblib_sklearn_nlp,prep_tokens_for_doc2vec,embed_to_vect,build_glove_embed,glove_embed_vects,text_hybrid_labels,hybrid_flag,data_prep_bnpy_glove,transform_bert_for_lda
from helper_functions import bnpy_find_kl,iteration_selection_bnpy,find_item_distribution_clusters_sklearn,get_ids_only,write_model_logs_to_json,relu
from tqdm import tqdm
import shutil
ITERATIONS = 10
q1_LOWER = 2
q1_UPPER = 12
q2_LOWER = 2
q2_UPPER = 12
q3_LOWER = 5
q3_UPPER = 20
DS_iter = 1
default_LOWER = 2
default_UPPER = 12
FMM_DPMM_Gamma = 0.5
TARGET = 'label'
iterations = 10
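# Note on the constants above: ITERATIONS is passed to bnpy.run as nTask (restarts within a single
# call), while the lowercase `iterations` controls how many candidate models are trained per cluster
# count before iteration_selection_bnpy keeps the best one. The q1/q2/q3 and default LOWER/UPPER
# pairs appear to be per-question sweep bounds, but only the --lower/--upper CLI arguments are
# consumed in this file.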
# Pre-training: load a JSON split and build one label-count vector per data item.
def FMM_preprocess(input_file_name, output_file_name):
    vects = defaultdict(list)
    tweet_dict = defaultdict(list)
    JSONfile = read_json(input_file_name)
    create_folder(output_file_name)  # creates the folder for saving the trained models
    tweet_dict = compile_tweet_dict(JSONfile["data"])
    JSONfile["dictionary"] = [str(x) for x in JSONfile["dictionary"]]
    (fdict, rdict) = get_data_dict(JSONfile["dictionary"])
    vects = get_feature_vectors(fdict, JSONfile["data"])
    print("Running FMM in Train mode on {} data items from {}.".format(len(vects), input_file_name))
    return vects, rdict, tweet_dict
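# Illustrative input shape only ("dictionary" and "data" are the keys actually read above; the
# item fields are hypothetical here, as the exact schema comes from ldl_utils):
# {"dictionary": ["choice_a", "choice_b"],
#  "data": [{"message_id": 1, "message": "...", "labels": {"choice_a": 3, "choice_b": 1}}, ...]}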
def transform_for_bnpy(vectors):
    # ReLU clips negative values so the embedding vectors are non-negative before bnpy prep.
    result_vectors = [relu(vector) for vector in vectors]  # 3 for 50 window size
    return result_vectors
def bnpy_train(tweetid_answer_counters, choices, lower, upper, message_dict, path_to_save, process_id, target,
               dev_vects, dev_tweet_dict, test_vects, test_tweet_dict, hybrid, train_vectors, dev_vectors, test_vectors):
    '''
    Train a bnpy finite multinomial mixture model over a sweep of cluster counts.
    :param tweetid_answer_counters: dictionary of the form {tweet_id: [ct_ans1, ct_ans2, ct_ans3, ...]}
    :param choices: possible answer choices
    :param lower: start value for the number of clusters the model will be trained with
    :param upper: end value (exclusive) for the number of clusters the model will be trained with
    :param message_dict: {tweet_id: message} dictionary for the training split
    :param path_to_save: name of the output directory
    :param process_id: experiment name used to prefix the output files
    :param target: selects the input representation: 1) 'bow' or 'glove', which cleans and embeds the
        text; 2) 'vectors', which expects .npy files of precomputed embedding vectors; or
        3) 'label', which experiments on the labels only
    :param hybrid: percentage weight for mixing text with labels (100 means labels only, False disables mixing)
    :return: None
    '''
    algName = 'VB'
    train_message_ids = get_ids_only(tweetid_answer_counters)
    if target == 'vectors':
        # train_vectors = transform_bert_for_lda(train_vectors)
        # dev_vectors = transform_bert_for_lda(dev_vectors)
        # test_vectors = transform_bert_for_lda(test_vectors)
        trn_pd = pd.DataFrame(train_vectors)
        trn_pd.columns = trn_pd.columns.astype(str)
        trn_dataset = bnpy.data.XData.from_dataframe(trn_pd)
        trn_vectors_values = trn_dataset.column_names
        bow_info_train = data_prep_bnpy_glove(train_vectors, trn_vectors_values)
        bow_info_dev = data_prep_bnpy_glove(dev_vectors, trn_vectors_values)
        bow_info_test = data_prep_bnpy_glove(test_vectors, trn_vectors_values)
        # algName = 'EM'
        # if "facebook" in path_to_save:
        #     algName = 'VB'
    if target == 'bow' or target == 'glove':
        train_messages, train_message_ids, train_cleaned_messages, train_tokens = clean_text_for_sklean(message_dict)
        dev_messages, dev_message_ids, dev_cleaned_messages, dev_tokens = clean_text_for_sklean(dev_tweet_dict)
        test_messages, test_message_ids, test_cleaned_messages, test_tokens = clean_text_for_sklean(test_tweet_dict)
        # train_vectors, sklearn_bow_model = build_bag_of_words(train_cleaned_messages)
        # dev_vectors = sklearn_bow_model.transform(dev_cleaned_messages)
        bow_info_train = language_prep_bnpy(train_cleaned_messages, train_cleaned_messages)
        bow_info_dev = language_prep_bnpy(train_cleaned_messages, dev_cleaned_messages)
        bow_info_test = language_prep_bnpy(train_cleaned_messages, test_cleaned_messages)
        if target == 'glove':
            vec_model = build_glove_embed(train_cleaned_messages)
            train_vectors, _ = glove_embed_vects(train_tokens, vec_model)
            train_vectors = [transform_for_bnpy(train_vector) for train_vector in train_vectors]
            trn_vectors = pd.DataFrame(train_vectors)
            trn_vectors.columns = trn_vectors.columns.astype(str)
            trn_dataset = bnpy.data.XData.from_dataframe(trn_vectors)
            trn_vectors_values = trn_dataset.column_names
            dev_cleaned_messages = list(prep_tokens_for_doc2vec(dev_cleaned_messages, tokens_only=True))
            dev_vectors, _ = glove_embed_vects(dev_cleaned_messages, vec_model)
            dev_vectors = [transform_for_bnpy(dev_vector) for dev_vector in dev_vectors]
            test_cleaned_messages = list(prep_tokens_for_doc2vec(test_cleaned_messages, tokens_only=True))
            test_vectors, _ = glove_embed_vects(test_cleaned_messages, vec_model)
            test_vectors = [transform_for_bnpy(test_vector) for test_vector in test_vectors]
            bow_info_train = data_prep_bnpy_glove(train_vectors, trn_vectors_values)
            bow_info_dev = data_prep_bnpy_glove(dev_vectors, trn_vectors_values)
            bow_info_test = data_prep_bnpy_glove(test_vectors, trn_vectors_values)
    if target == 'label' or hybrid == 100:
        ### convert the label counts to bag-of-words format ###
        bow_info_train = data_prep_bnpy(tweetid_answer_counters, choices.values())
        bow_info_dev = data_prep_bnpy(dev_vects, choices.values())
        bow_info_test = data_prep_bnpy(test_vects, choices.values())
        if hybrid and hybrid < 100:
            # mix text embeddings with label counts at the requested percentage
            train_vectors = text_hybrid_labels(train_vectors, tweetid_answer_counters, float(hybrid))
            column_names = [str(x) for x in range(len(train_vectors[0]))]
            dev_vectors = text_hybrid_labels(dev_vectors, dev_vects, float(hybrid))
            test_vectors = text_hybrid_labels(test_vectors, test_vects, float(hybrid))
            bow_info_train = data_prep_bnpy_glove(train_vectors, column_names)
            bow_info_dev = data_prep_bnpy_glove(dev_vectors, column_names)
            bow_info_test = data_prep_bnpy_glove(test_vectors, column_names)
        else:
            bow_info_train = data_prep_bnpy(tweetid_answer_counters, choices.values())
            bow_info_dev = data_prep_bnpy(dev_vects, choices.values())
            bow_info_test = data_prep_bnpy(test_vects, choices.values())
    ### create bnpy data objects ###
    trn_dataset = bnpy.data.BagOfWordsData(**bow_info_train)
    dev_dataset = bnpy.data.BagOfWordsData(**bow_info_dev)
    test_dataset = bnpy.data.BagOfWordsData(**bow_info_test)
    results_log_dict = {}
    ### train and save the mixture model for each cluster count ###
    for n_clusters in tqdm(range(lower, upper), desc="Training FMM Model"):
        trained_model = None
        info_dict = None
        kl = []
        results = {}
        # get the best model out of nTask runs:
        # https://bnpy.readthedocs.io/en/latest/examples/01_asterisk_K8/plot-01-demo=init_methods-model=mix+gauss.html?highlight=initname#initname-bregmankmeans
        for i in range(iterations):
            trained_model, info_dict = bnpy.run(trn_dataset, 'FiniteMixtureModel', 'Mult', algName,
                                                nLap=1000, convergeThr=0.0001, nTask=ITERATIONS,
                                                K=n_clusters, initname='randexamples',
                                                gamma0=FMM_DPMM_Gamma, lam=0.1, doWriteStdOut=False,
                                                logFunc=None, doSaveToDisk=False)
            # changing initname to 'randexamples' fixes a division error; the default is 'bregmankmeans'
            # normalize each cluster's word counts by its total to get per-cluster label distributions
            info_dict['Centroids'] = np.multiply(info_dict['SS'].WordCounts.transpose(),
                                                 np.reciprocal(info_dict['SS'].SumWordCounts)).transpose()
            info_dict['curr_loss'] = -1 * trained_model.calc_evidence(trn_dataset)
            LP = trained_model.calc_local_params(trn_dataset)
            preds = LP['resp']
            predictions, cluster_assignments = get_assignments(tweetid_answer_counters, preds)
            train_answer_counters = tweetid_answer_counters
            train_predict = predictions
            # save_bnpy_model(model_dir, trained_model, info_dict)
            kl.append(bnpy_find_kl(train_answer_counters, train_predict))
            results[i] = find_item_distribution_clusters_sklearn(cluster_assignments)
            # write train/dev/test predictions for this iteration into a temp folder
            bnpy_write_predictions(tweetid_answer_counters, predictions, cluster_assignments, choices, info_dict, message_dict,
                                   path_to_save + "/CL" + str(n_clusters) + "/temp" + str(i) + "/" + process_id + "_train.json")
            LP = trained_model.calc_local_params(dev_dataset)
            preds = LP['resp']
            predictions, cluster_assignments = get_assignments(dev_vects, preds)
            bnpy_write_predictions(dev_vects, predictions, cluster_assignments, choices, info_dict, dev_tweet_dict,
                                   path_to_save + "/CL" + str(n_clusters) + "/temp" + str(i) + "/" + process_id + "_dev.json")
            LP = trained_model.calc_local_params(test_dataset)
            preds = LP['resp']
            predictions, cluster_assignments = get_assignments(test_vects, preds)
            bnpy_write_predictions(test_vects, predictions, cluster_assignments, choices, info_dict, test_tweet_dict,
                                   path_to_save + "/CL" + str(n_clusters) + "/temp" + str(i) + "/" + process_id + "_test.json")
        # keep the iteration with the best KL divergence, then replace the temp folders with final logs
        results_log_dict[n_clusters], train_pred, dev_pred, test_pred = iteration_selection_bnpy(
            kl, results, path_to_save + "/CL" + str(n_clusters) + "/temp", n_clusters, process_id)
        shutil.rmtree(path_to_save + "/CL" + str(n_clusters))
        write_model_logs_to_json(path_to_save + "/CL" + str(n_clusters), train_pred, process_id + "_train")
        write_model_logs_to_json(path_to_save + "/CL" + str(n_clusters), dev_pred, process_id + "_dev")
        write_model_logs_to_json(path_to_save + "/CL" + str(n_clusters), test_pred, process_id + "_test")
    results_log_dict["exp_name"] = process_id
    write_model_logs_to_json(path_to_save, results_log_dict, "cluster_log")
    print("Completed FMM Training")
def bnpy_write_predictions(tweetid_answer_counters, predictions, cluster_assignments, choices, info_dict, message_dict, path_to_save):
    predictions_to_write = []
    data_to_write = {}
    for data_item, prediction, cluster_assignment in zip(tweetid_answer_counters, predictions, cluster_assignments):
        labels = map_probability_to_label(choices, prediction)
        predictions_to_write.append(OrderedDict([("message_id", data_item),
                                                 ("message", message_dict[int(data_item)]),
                                                 ("cluster", cluster_assignment + 1),
                                                 ("labels", labels)]))
    data_to_write["data"] = predictions_to_write
    data_to_write["dictionary"] = list(choices.values())  # list() keeps dict_values JSON-serializable
    data_to_write['topics_dict'] = generate_topics_dict(info_dict['Centroids'])
    save_to_json_foldercheck(data_to_write, path_to_save)
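# Illustrative output shape (values hypothetical), assembled by bnpy_write_predictions above:
# {"data": [{"message_id": "42", "message": "...", "cluster": 2, "labels": {...}}, ...],
#  "dictionary": ["choice_a", "choice_b", ...],
#  "topics_dict": {...per-cluster label distributions derived from info_dict['Centroids']...}}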
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", help="Input training file JSON name")
    parser.add_argument("--dev_file", help="Input dev file JSON name")
    parser.add_argument("--test_file", help="Input test file JSON name")
    parser.add_argument("--input_train_file_vectors", help="Input training file vectors .npy name", default=False)
    parser.add_argument("--dev_file_vectors", help="Input dev file vectors .npy name", default=False)
    parser.add_argument("--test_file_vectors", help="Input test file vectors .npy name", default=False)
    parser.add_argument("--nlp_flag", help="NLP data flag: bow, glove, vectors, or label", default=False)
    parser.add_argument("--lower", help="Lower limit for the cluster-count sweep")
    parser.add_argument("--upper", help="Upper limit for the cluster-count sweep")
    parser.add_argument("--output_file", help="Output file name")
    parser.add_argument("--folder_name", help="Main folder name", default=False)
    parser.add_argument("--hybrid", help="Hybrid model of text + labels", default=False)
    args = parser.parse_args()
    nlp_flag = args.nlp_flag
    train_embed = args.input_train_file_vectors
    dev_embed = args.dev_file_vectors
    test_embed = args.test_file_vectors
    hybrid = hybrid_flag(args.hybrid)
    if nlp_flag != "bow" and nlp_flag != "glove" and nlp_flag != "vectors":
        nlp_flag = 'label'  # any unrecognized flag falls back to label-only training
    if nlp_flag == "vectors":
        train_embed = np.load(train_embed, allow_pickle=True)
        dev_embed = np.load(dev_embed, allow_pickle=True)
        test_embed = np.load(test_embed, allow_pickle=True)
    train_vects, rdict, tweet_dict = FMM_preprocess(args.input_file, args.folder_name)
    dev_vects, rdict, dev_tweet_dict = FMM_preprocess(args.dev_file, args.folder_name)
    test_vects, rdict, test_tweet_dict = FMM_preprocess(args.test_file, args.folder_name)
    bnpy_train(train_vects, rdict, int(args.lower), int(args.upper), tweet_dict, args.folder_name,
               args.output_file, nlp_flag, dev_vects, dev_tweet_dict, test_vects, test_tweet_dict,
               hybrid, train_embed, dev_embed, test_embed)


if __name__ == '__main__':
    main()
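# Example invocation (hypothetical paths; flags match the argparse definitions in main()):
# python fmm_train.py --input_file data/train.json --dev_file data/dev.json --test_file data/test.json \
#     --nlp_flag label --lower 2 --upper 12 --output_file jobQ1_fmm --folder_name models/fmm_jobQ1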
# #https://stackoverflow.com/questions/37604289/tkinter-tclerror-no-display-name-and-no-display-environment-variable/43592515
# #for running the pipeline through SSH
# import os
# import matplotlib as mpl
# if os.environ.get('DISPLAY','') == '':
#     print('no display found. Using non-interactive Agg backend')
#     mpl.use('Agg')
# import argparse
# import sys
# import pdb
# from FMM_utils import bnpy_train_model
# from ldl_utils import get_data_dict, get_feature_vectors, vectorize, read_json, compile_tweet_dict, save_label_dict, load_label_dict, save_label_vects, load_label_vects
# from collections import defaultdict  # https://stackoverflow.com/questions/5900578/how-does-collections-defaultdict-work
# from helper_functions import create_folder
# import pickle
# #python FMM_train.py --input data/jobQ123_BOTH/processed/jobQ1_BOTH/split/jobQ1_BOTH_train.json --clusters 5 --output data/jobQ1_BOTH/jobQ1_BOTH_split_fmm.pkl
# #Constants for LDA Data
# #Pre_Training
# def FMM_preprocess(upper, input_file_name, output_file_name):
#     vects = defaultdict(list)
#     tweet_dict = defaultdict(list)
#     JSONfile = read_json(input_file_name)
#     #create_folder(output_file_name) #creates the folder for saving LDA models
#     tweet_dict = compile_tweet_dict(JSONfile["data"])
#     (fdict, rdict) = get_data_dict(JSONfile["dictionary"])
#     vects = get_feature_vectors(fdict, JSONfile["data"])
#     print("Running FMM in Train mode on {} Tweets on {}.".format(len(vects), id))
#     lower = 1
#     bnpy_train_model(vects, rdict, lower, upper, tweet_dict, output_file_name, output_file_name, "fmm", "label")
#
# def main():
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--input_file", help="Input training file JSON name")
#     parser.add_argument("--clusters", help="Lower Limit")
#     #parser.add_argument("--upper", help="Upper Limit")
#     parser.add_argument("--output_file", help="Output file name")
#     #parser.add_argument("--folder_name", help="Main folder name")
#     args = parser.parse_args()
#     FMM_preprocess(int(args.clusters), args.input_file, args.output_file)
#
# if __name__ == '__main__':
#     main()