-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlda_train.py
235 lines (204 loc) · 12.5 KB
/
lda_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# Example: python lda_train.py --train_file data/jobQ123_BOTH/processed/jobQ1_BOTH/split/jobQ1_BOTH_train.json --dev_file data/jobQ123_BOTH/processed/jobQ1_BOTH/split/jobQ1_BOTH_dev.json --lower 2 --upper 12 --iterations 5 --output_file jobQ1_BOTH_split_lda --folder_name data/jobQ1_BOTH/lda
#https://stackoverflow.com/questions/37604289/tkinter-tclerror-no-display-name-and-no-display-environment-variable/43592515
#for running the pipeline through SSH
import os
import matplotlib as mpl
# With no DISPLAY set (or set to an empty string) there is no screen to draw on,
# so switch matplotlib to the 'Agg' backend before anything tries to plot.
if not os.environ.get('DISPLAY', ''):
    mpl.use('Agg')
# from sklearn.cluster import KMeans
# import gensim
from tqdm import tqdm
import os, math, sys, json, collections
from scipy.stats import entropy
import numpy as np
import joblib
from label_vectorization import get_ans_pct_vectors,get_assignments,tests,get_perplexity
from helper_functions import write_model_logs_to_json,read_labeled_data_sklearn,create_folder,get_index_of_best_iteration,save_trained_model_joblib,save_max_sklearn_model_trained,save_trained_model_joblib_sklearn,KLdivergence,median,create_folder
from helper_functions import sklearn_find_kl,iteration_selection_sklearn,find_item_distribution_clusters_sklearn,get_ids_only
from helper_functions_nlp import clean_text_for_sklean,build_bag_of_words,data_in_cluster_sklearn,save_trained_model_joblib_sklearn_nlp,prep_tokens_for_doc2vec,embed_to_vect,build_glove_embed,glove_embed_vects,text_hybrid_labels,hybrid_flag,transform_bert_for_lda
import argparse
import sys
from collections import Counter
import pdb
import pandas as pd
from ldl_utils import read_json
import shutil
from sklearn.decomposition import LatentDirichletAllocation
# Path to pre-trained GloVe Twitter vectors (100-d per the file name).
# Not referenced by any function defined in this file.
pretrained_emb = "data/lexicons/glove.twitter.27B/glove.twitter.27B.100d.txt"
# doc2vec parameters (not referenced by any function defined in this file)
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #number of parallel processes
# Measure name that model_selection() hands to model_selection_lda_sklearn();
# "cross" is the one measure that is minimised there, all others are maximised.
model_selection_measure = "cross"
iterations = 10
# v = {"entropy": entropee, "max": maxy, "distance": scores, "centroid": centroidy, "cross": cross}
def train_dev_lda_selection(train_answer_counters,dev_answer_counters, ITERATIONS, LOWER, UPPER, output_name, folder_name):
    """Fit LDA on label vectors and keep one fitted model per cluster count.

    For every cluster count in ``range(LOWER, UPPER)`` an LDA model is fitted
    ``ITERATIONS`` times.  Each fit is stored in a temporary folder
    ``<folder_name>/logs/models/CL<k>/temp<i>``.  ``iteration_selection_sklearn``
    then chooses among the fits (it receives the per-fit KL scores and item
    distributions), the temporary ``CL<k>`` tree is deleted, and the chosen
    model plus its cluster info are written to ``<folder_name>/logs/models``.
    After all cluster counts are processed a ``cluster_log`` summary is written.

    Args:
        train_answer_counters: training items, as returned by
            ``read_labeled_data_sklearn``.
        dev_answer_counters: dev items in the same format.
        ITERATIONS: number of LDA fits per cluster count.
        LOWER: first cluster count to try (inclusive).
        UPPER: cluster count at which to stop (exclusive).
        output_name: experiment name; stored in the log and passed to the
            model-saving helper.
        folder_name: root output folder; ``logs/models`` below it is assumed
            to exist already (``preprocess_data`` creates it).

    Returns:
        None.  All results are written to disk.
    """
    models_dir = folder_name + "/logs/models"
    train_vectors = get_ans_pct_vectors(train_answer_counters)
    train_message_ids = get_ids_only(train_answer_counters)
    # NOTE(review): the dev vectors are computed but not used by the training
    # below; the call is kept so behaviour on malformed dev data is unchanged.
    dev_vectors = get_ans_pct_vectors(dev_answer_counters)
    results_log_dict = {}
    for n_clusters in tqdm(range(LOWER, UPPER)):
        print(n_clusters)
        cluster_dir = models_dir + "/CL" + str(n_clusters)
        info_name = "cluster_info_" + str(n_clusters)
        kl = []
        results = {}
        # Honour the caller-supplied ITERATIONS rather than a module constant.
        for i in range(ITERATIONS):
            temp_dir = cluster_dir + "/temp" + str(i)
            # No random_state is passed, so each fit uses a fresh random
            # initialisation; that is what makes repeated fits worthwhile.
            # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
            clusterer = LatentDirichletAllocation(n_components=n_clusters, learning_method='online')
            train_predict = clusterer.fit_transform(train_vectors)
            # Collapse the per-item topic weights to one hard cluster id each.
            train_predict = map_to_clusters(train_predict)
            cluster_distributions = data_in_cluster_sklearn(train_predict, n_clusters, train_message_ids, train_answer_counters)
            kl.append(sklearn_find_kl(train_answer_counters, train_predict, cluster_distributions))
            results[i] = find_item_distribution_clusters_sklearn(train_predict)
            create_folder(temp_dir)
            write_model_logs_to_json(temp_dir, cluster_distributions, info_name)
            save_trained_model_joblib_sklearn_nlp(temp_dir, clusterer, output_name, n_clusters)
        model, cluster_distributions, results_log_dict[n_clusters] = iteration_selection_sklearn(kl, results, cluster_dir + "/temp", n_clusters)
        # The per-fit folders are only needed until one fit has been selected.
        shutil.rmtree(cluster_dir)
        write_model_logs_to_json(models_dir, cluster_distributions, info_name)
        save_trained_model_joblib_sklearn_nlp(models_dir + "/", model, output_name, n_clusters)
    results_log_dict["exp_name"] = output_name
    write_model_logs_to_json(models_dir + "/", results_log_dict, "cluster_log")
    print ("Completed LDA Training")
def transform_for_lda(vectors):
    """Return a new list holding ``1 + element`` for each element of ``vectors``.

    NOTE(review): presumably used to shift embedding values upward before
    they are fed to LDA - confirm against the caller.
    """
    return [1 + component for component in vectors]
def train_dev_lda_nlp(train_answer_counters,dev_answer_counters, ITERATIONS, LOWER, UPPER, output_name, folder_name,label_dict,train_message_dict,dev_message_dict,glove,hybrid,train_vects,dev_vects):
    """Fit LDA on text embeddings and keep one fitted model per cluster count.

    The training vectors come from pre-computed BERT vectors
    (``glove == "bert"``) or from GloVe embeddings built on the cleaned
    training messages (``glove`` true).  When ``hybrid`` is truthy the vectors
    are additionally combined with the label data via ``text_hybrid_labels``.

    For every cluster count in ``range(LOWER, UPPER)`` an LDA model is fitted
    ``ITERATIONS`` times into temporary
    ``<folder_name>/logs/models/CL<k>/temp<i>`` folders.
    ``iteration_selection_sklearn`` chooses among the fits, the temporary tree
    is removed, and the chosen model and its cluster info are written to
    ``<folder_name>/logs/models``.  A ``cluster_log`` summary is written last.

    Args:
        train_answer_counters: training items, as returned by
            ``read_labeled_data_sklearn``.
        dev_answer_counters: dev items (not used by this function).
        ITERATIONS: number of LDA fits per cluster count.
        LOWER: first cluster count to try (inclusive).
        UPPER: cluster count at which to stop (exclusive).
        output_name: experiment name; stored in the log and passed to the
            model-saving helper.
        folder_name: root output folder; ``logs/models`` below it is assumed
            to exist already.
        label_dict: not used by this function.
        train_message_dict: training messages to clean and tokenise.
        dev_message_dict: dev messages (cleaned, but not used afterwards).
        glove: ``"bert"`` to use ``train_vects``; ``True`` (or the strings
            ``"true"``, ``"1"``, ``"yes"`` as delivered by the command line)
            to build GloVe embeddings.
        hybrid: falsy to disable; otherwise converted with ``float`` and
            passed to ``text_hybrid_labels``.
        train_vects: pre-computed training vectors for the ``"bert"`` mode.
        dev_vects: pre-computed dev vectors for the ``"bert"`` mode.

    Raises:
        ValueError: if ``glove`` selects neither embedding source.  Previously
            this surfaced later as a ``NameError`` on ``train_vectors``.
    """
    models_dir = folder_name + "/logs/models"
    train_messages, train_message_ids, train_cleaned_messages, train_tokens = clean_text_for_sklean(train_message_dict)
    # NOTE(review): the cleaned dev data is not used below; call kept as-is.
    dev_messages, dev_message_ids, dev_cleaned_messages, dev_tokens = clean_text_for_sklean(dev_message_dict)

    # argparse delivers strings, so "--glove True" arrives as "True" and would
    # never satisfy a plain `glove == True` test; normalise both forms here.
    if isinstance(glove, str):
        glove_choice = glove.strip().lower()
    else:
        glove_choice = "true" if glove == True else ""  # noqa: E712

    if glove_choice == "bert":
        train_vectors = transform_bert_for_lda(train_vects)
        # NOTE(review): dev vectors are transformed but not used afterwards.
        dev_vectors = transform_bert_for_lda(dev_vects)
    elif glove_choice in ("true", "1", "yes"):
        vec_model = build_glove_embed(train_cleaned_messages)
        train_vectors, _ = glove_embed_vects(train_tokens, vec_model)
        train_vectors = [transform_for_lda(train_vector) for train_vector in train_vectors]
        vec_model.save(models_dir + "/lda_glove.dict")
    else:
        raise ValueError(
            "train_dev_lda_nlp needs glove='bert' or a true glove flag to build "
            "training vectors; got " + repr(glove)
        )

    if hybrid:
        train_vectors = text_hybrid_labels(train_vectors, train_answer_counters, float(hybrid))

    results_log_dict = {}
    for n_clusters in tqdm(range(LOWER, UPPER)):
        cluster_dir = models_dir + "/CL" + str(n_clusters)
        info_name = "cluster_info_" + str(n_clusters)
        kl = []
        results = {}
        # Honour the caller-supplied ITERATIONS rather than a module constant.
        for i in range(ITERATIONS):
            temp_dir = cluster_dir + "/temp" + str(i)
            # No random_state is passed, so each fit uses a fresh random
            # initialisation.
            # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
            clusterer = LatentDirichletAllocation(n_components=n_clusters, learning_method='batch')
            train_predict = clusterer.fit_transform(train_vectors)
            # Collapse the per-item topic weights to one hard cluster id each.
            train_predict = map_to_clusters(train_predict)
            cluster_distributions = data_in_cluster_sklearn(train_predict, n_clusters, train_message_ids, train_answer_counters)
            kl.append(sklearn_find_kl(train_answer_counters, train_predict, cluster_distributions))
            results[i] = find_item_distribution_clusters_sklearn(train_predict)
            create_folder(temp_dir)
            write_model_logs_to_json(temp_dir, cluster_distributions, info_name)
            save_trained_model_joblib_sklearn_nlp(temp_dir, clusterer, output_name, n_clusters)
        model, cluster_distributions, results_log_dict[n_clusters] = iteration_selection_sklearn(kl, results, cluster_dir + "/temp", n_clusters)
        # The per-fit folders are only needed until one fit has been selected.
        shutil.rmtree(cluster_dir)
        write_model_logs_to_json(models_dir, cluster_distributions, info_name)
        save_trained_model_joblib_sklearn_nlp(models_dir + "/", model, output_name, n_clusters)
    results_log_dict["exp_name"] = output_name
    write_model_logs_to_json(models_dir + "/", results_log_dict, "cluster_log")
    print ("Completed LDA NLP Training")
def map_to_clusters(predictions):
    """Turn per-item score rows into hard cluster ids.

    Args:
        predictions: iterable of score rows (for example the 2-D array
            returned by ``LatentDirichletAllocation.fit_transform``).  Rows
            may be NumPy arrays or plain sequences.

    Returns:
        A list of Python ``int``; entry ``k`` is the index of the largest
        value in row ``k``.  When several positions share the maximum, the
        first one wins.  An empty ``predictions`` yields ``[]``.

    Raises:
        ValueError: if a row is empty.
    """
    # np.argmax returns the first index of the maximum, which is the same
    # tie-break the earlier max()/np.where() implementation applied.
    return [int(np.argmax(row)) for row in predictions]
def model_selection(cluster_log,output_dir,output_name,LOWER, UPPER):
    """Pick the best cluster count from ``cluster_log`` and export its model.

    The winning cluster count and iteration are chosen by
    ``model_selection_lda_sklearn`` using the module-level
    ``model_selection_measure``.  The matching pickle path below
    ``output_dir`` is then handed to ``save_max_sklearn_model_trained``.
    """
    best_cluster, best_iteration = model_selection_lda_sklearn(
        cluster_log, model_selection_measure, LOWER, UPPER
    )
    # <output_dir>/logs/models/CL<k>/Iter<i>.pkl
    path_pieces = [
        output_dir,
        '/logs/models/CL',
        str(best_cluster),
        '/',
        "Iter",
        str(best_iteration),
        '.pkl',
    ]
    model_path = "".join(path_pieces)
    save_max_sklearn_model_trained(model_path, output_dir, output_name)
    print("Model training for LDA completed cluster number: {} and saved to {}".format(best_cluster, model_path))
def model_selection_lda_sklearn(cluster_log, measure_name,LOWER, UPPER):
    """Select the cluster count with the best value of ``measure_name``.

    Args:
        cluster_log: mapping ``cluster count -> log entry``.  Each entry must
            provide ``measure_name`` and ``"max_iteration"``.
        measure_name: key of the measure to compare.  ``"cross"`` is
            minimised; every other measure is maximised.
        LOWER: first cluster count to consider (inclusive).
        UPPER: cluster count at which to stop (exclusive).

    Returns:
        ``(cluster_count, max_iteration)`` of the winning entry.  On ties the
        largest tied cluster count wins (comparisons are non-strict).  If no
        later entry compares as at least as good - for example because the
        measures are NaN - the ``LOWER`` entry is returned.

    Raises:
        ValueError: if ``range(LOWER, UPPER)`` is empty.
        KeyError: if an entry or one of its keys is missing.
    """
    if LOWER >= UPPER:
        raise ValueError(
            "empty cluster range: LOWER=" + str(LOWER) + ", UPPER=" + str(UPPER)
        )
    print(measure_name)
    minimise = measure_name == "cross"
    # Seed the search with the first candidate so the result is always a real
    # cluster count and max_iteration is always bound.
    first_entry = cluster_log[LOWER]
    max_meas_idx = LOWER
    max_meas = first_entry[measure_name]
    max_iteration = first_entry["max_iteration"]
    for n_clusters in range(LOWER + 1, UPPER):
        entry = cluster_log[n_clusters]
        target_values = entry[measure_name]
        if minimise:
            is_at_least_as_good = target_values <= max_meas
        else:
            is_at_least_as_good = target_values >= max_meas
        if is_at_least_as_good:
            max_meas_idx = n_clusters
            max_meas = target_values
            max_iteration = entry["max_iteration"]
    print(max_meas_idx, max_meas,max_iteration)
    return max_meas_idx,max_iteration
#save_max_lda_model_trained(output_folder + "/" + str(max_meas_idx) + "_topic.lda",max_model_location)
def preprocess_data(input_train_file_name,input_dev_file_name,folder_name):
    """Create the output folder layout and read the train and dev files.

    Creates ``folder_name``, ``folder_name/logs`` and
    ``folder_name/logs/models`` through ``create_folder``, then reads both
    input files with ``read_labeled_data_sklearn``.

    Returns:
        ``(train_answer_counters, dev_answer_counters, label_dict,
        train_message_dict, dev_message_dict)``.  ``label_dict`` is the one
        read from the dev file; the one from the training file is discarded.
    """
    create_folder(folder_name)
    for sub_folder in ("/logs", "/logs/models"):
        create_folder(folder_name + sub_folder)

    train_data = read_labeled_data_sklearn(input_train_file_name)
    train_answer_counters, train_message_dict, _ = train_data
    dev_data = read_labeled_data_sklearn(input_dev_file_name)
    dev_answer_counters, dev_message_dict, label_dict = dev_data
    return (
        train_answer_counters,
        dev_answer_counters,
        label_dict,
        train_message_dict,
        dev_message_dict,
    )
def main():
    """Parse the command line, load the data and run the requested LDA training.

    ``--train_file``, ``--dev_file``, ``--lower``, ``--upper`` and
    ``--folder_name`` are mandatory; omitting any of them used to fail later
    with a ``TypeError``.  ``--iterations`` falls back to the module-level
    ``iterations`` value when omitted.  ``--nlp_data`` switches to the
    text-based pipeline; because argparse delivers strings, any non-empty
    value (even ``"False"``) enables it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", help="Input training file JSON name", required=True)
    parser.add_argument("--train_file_vects", help="Input training vects .npy", default=False)
    parser.add_argument("--dev_file", help="Input dev file JSON name", required=True)
    parser.add_argument("--dev_file_vects", help="Input dev vects .npy", default=False)
    # Let argparse convert and validate the numeric options so that bad input
    # is reported as a usage error instead of a traceback.
    parser.add_argument("--lower", help="Lower Limit", type=int, required=True)
    parser.add_argument("--upper", help="Upper Limit", type=int, required=True)
    parser.add_argument("--iterations", help="Number of iterations", type=int, default=iterations)
    parser.add_argument("--output_file", help="Output file name", default=False)
    parser.add_argument("--folder_name", help="Main folder name", required=True)
    parser.add_argument("--nlp_data", help="NLP Data", default=False)
    parser.add_argument("--glove", help="Glove Embeddings", default=False)
    parser.add_argument("--hybrid", help="Hybrid of Text + Labels", default=False)
    args = parser.parse_args()

    nlp_flag = args.nlp_data
    glove = args.glove
    hybrid = hybrid_flag(args.hybrid)

    # SECURITY NOTE: allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code.  Only pass vector files from a trusted source.
    train_vects = args.train_file_vects
    if train_vects:
        train_vects = np.load(train_vects, allow_pickle=True)
    dev_vects = args.dev_file_vects
    if dev_vects:
        dev_vects = np.load(dev_vects, allow_pickle=True)

    #Reading Data
    train_answer_counters, dev_answer_counters, label_dict, train_message_dict, dev_message_dict = preprocess_data(args.train_file, args.dev_file, args.folder_name)

    if (nlp_flag):
        train_dev_lda_nlp(train_answer_counters, dev_answer_counters, args.iterations, args.lower, args.upper, args.output_file, args.folder_name, label_dict, train_message_dict, dev_message_dict, glove, hybrid, train_vects, dev_vects)
    else:
        train_dev_lda_selection(train_answer_counters, dev_answer_counters, args.iterations, args.lower, args.upper, args.output_file, args.folder_name)


if __name__ == '__main__':
    main()