-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathCNN_train_vects.py
146 lines (125 loc) · 5.96 KB
/
CNN_train_vects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
#https://stackoverflow.com/questions/37604289/tkinter-tclerror-no-display-name-and-no-display-environment-variable/43592515
#for running the pipeline through SSH
import matplotlib as mpl
if os.environ.get('DISPLAY','') == '':
mpl.use('Agg')
import argparse
import sys
import pdb
import numpy as np
import keras
import tensorflow as tf
from keras.layers import Dense, Embedding, Input
from keras.layers import Conv1D, MaxPooling1D, Concatenate, Flatten, Dropout
from keras.models import Model, Sequential
from ldl_utils import get_data_dict, vectorize,read_json
from helper_functions_LSTM_TF import get_feature_vectors,compile_tweet_dict,check_label_frequency,build_text_labels,keras_feature_prep,plot_NN_history,gpu_fix_cuda
from LSTM_utils import LSTM_training,LSTM_and_embedding_layer_train
from collections import defaultdict #https://stackoverflow.com/questions/5900578/how-does-collections-defaultdict-work
from helper_functions import create_folder,validate_file_location,convert_to_majority,delete_model_drive
import os
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 1000 # words limit in each doc
ITERATIONS = 100
EPOCHS = 25
BATCHSIZE = 32
q1_LOWER = 2
q1_UPPER = 12
q2_LOWER = 2
q2_UPPER = 12
q3_LOWER = 5
q3_UPPER = 20
DS_iter = 1
def CNN_pre_processing(folder_name, input_train_file_name,input_dev_file_name, output_file_name,majority,train_vects,dev_vects):
train_answer_counters = defaultdict(list)
JSONfile = read_json(input_train_file_name)
create_folder(folder_name) #creates the folder for saving LSTM model
train_message_dict = compile_tweet_dict(JSONfile["data"])
(fdict, choices) = get_data_dict(JSONfile["dictionary"])
train_answer_counters = get_feature_vectors(fdict, JSONfile["data"])
dev_answer_counters = defaultdict(list)
JSONfile = read_json(input_dev_file_name)
dev_message_dict = compile_tweet_dict(JSONfile["data"])
(fdict, rdict) = get_data_dict(JSONfile["dictionary"])
dev_answer_counters = get_feature_vectors(fdict, JSONfile["data"])
if majority=="True":
print ("Majority Label")
train_answer_counters = convert_to_majority(train_answer_counters)
dev_answer_counters = convert_to_majority(dev_answer_counters)
#glove_lexicon = open(glove, 'r', encoding='utf-8')
train_text = []
train_labels = []
train_text,train_labels = build_text_labels(train_message_dict,train_answer_counters)
print(len(train_text), len(train_labels))
dev_text = []
dev_labels = []
dev_text,dev_labels = build_text_labels(dev_message_dict,dev_answer_counters)
# train_features, train_word_index = keras_feature_prep(train_text,MAX_NB_WORDS,MAX_SEQUENCE_LENGTH)
# dev_features, dev_word_index = keras_feature_prep(dev_text,MAX_NB_WORDS,MAX_SEQUENCE_LENGTH)
# word_index = train_word_index
# print(train_features.shape, train_labels.shape)
x_train = np.load(train_vects,allow_pickle=True)
x_dev = np.load(dev_vects,allow_pickle=True)
print(x_train, x_train.shape)
print(x_dev, x_dev.shape)
#print(x_test, x_test.shape)
y_train = train_labels
y_dev = dev_labels
print(y_train, y_train.shape)
print(y_dev, y_dev.shape)
#print(y_test, y_test.shape)
check_label_frequency(y_train)
check_label_frequency(y_dev)
pred_dim = train_labels.shape[1]
CNN_training(x_train, y_train, x_dev, y_dev, pred_dim, output_file_name)
def CNN_training(x_train, y_train, x_dev, y_dev, pred_dim,model_output):
gpu_fix_cuda()
tb = [keras.callbacks.TensorBoard(log_dir='./dllogs')]
main_input = Input(shape=(len(x_train[0]),1), dtype='float32') #changed from jobs dataset for image
x_train = np.array(x_train)
x_train = np.expand_dims(x_train,axis=-1)
y_train = np.array(y_train)
#x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
x_dev = np.array(x_dev)
x_dev = np.expand_dims(x_dev,axis=-1)
#x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1]))
y_dev = np.array(y_dev)
x = Conv1D(128, 5, activation='relu')(main_input)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(10)(x)
x = Flatten()(x)
x = Dropout(0.25)(x)
x = Dense(128, activation='relu')(x)
preds = Dense(pred_dim, activation='softmax')(x)
model = Model(inputs=main_input, outputs=preds)
model.compile(optimizer='adam', loss='kullback_leibler_divergence', metrics=['accuracy'])
print(model.summary())
history_NN = model.fit(x_train, y_train, batch_size=BATCHSIZE, epochs=EPOCHS, callbacks=tb, validation_data=(x_dev, y_dev))
# plot_model(model, to_file="figures/" + NN_name + "_CNN_layers.pdf", show_shapes=False)
# SVG(model_to_dot(model).create(prog='dot', format='svg'))
predicted_probabilities = model.predict(x_train, batch_size=BATCHSIZE)
delete_model_drive(model_output)
model.save(model_output)
del model #delete model for saving space
# plot_NN_history(history_NN, model_output, "CNN")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_train_file", help="Input training file JSON name")
parser.add_argument("--input_train_file_vects", help="Input training file JSON name")
parser.add_argument("--input_dev_file", help="Input dev file JSON name")
parser.add_argument("--input_dev_file_vects", help="Input dev file JSON name")
parser.add_argument("--output_model_file", help="Output model file name")
parser.add_argument("--folder_name", help="Main folder name")
parser.add_argument("--majority", help="Flag for majority",default=False)
#parser.add_argument("--output_pred_name", help="Output JSON predictions")
args = parser.parse_args()
input_vects = args.input_train_file_vects
dev_vects = args.input_dev_file_vects
CNN_pre_processing(args.folder_name, args.input_train_file,args.input_dev_file, args.output_model_file,args.majority,input_vects,dev_vects)
if __name__ == '__main__':
main()