init.py
def main():
    # Imports
    import pandas as pd
    import mxnet as mx
    from mxnet import nd, autograd, gluon
    from mxnet.gluon.data import ArrayDataset
    from mxnet.gluon.data import DataLoader
    import numpy as np
    import random

    # Constants
    extension = '.csv'
    categories = ['Excellent', 'Very_good', 'Good', 'Average', 'Poor']
    MAX_ITEMS_PER_CATEGORY = 80000

    # Load the data from the pickled cache if it exists, otherwise rebuild it from the CSV files
    try:
        data = pd.read_pickle('pickleddata.pkl')
    except FileNotFoundError:
        data = None

    if data is None:
        data = pd.DataFrame(data={'X': [], 'Y': []})
        for index, category in enumerate(categories):
            df = pd.read_csv(category + extension, encoding='utf8')
            df = pd.DataFrame(data={'X': (df['Review'])[:MAX_ITEMS_PER_CATEGORY], 'Y': index})
            data = pd.concat([data, df])  # DataFrame.append was removed in recent pandas
            print('{}: {} reviews'.format(category, len(df)))

        # Shuffle the samples
        data = data.sample(frac=1)
        data.reset_index(drop=True, inplace=True)

        # Cache the assembled dataset
        pd.to_pickle(data, 'pickleddata.pkl')

    print('Value counts:\n', data['Y'].value_counts())
    for i, cat in enumerate(categories):
        print(i, cat)
    print(data.head())
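    # Each document is quantized at the character level: every character maps to a one-hot
    # column over the 69-symbol alphabet below, giving a fixed-size (69, 1014) float32
    # matrix per review, with out-of-alphabet characters left as all-zero columns.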
    # Creating the dataset
    ALPHABET = list(
        "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")  # the 69 characters specified in the paper
    ALPHABET_INDEX = {letter: index for index, letter in enumerate(ALPHABET)}  # {'a': 0, 'b': 1, ...}
    FEATURE_LEN = 1014  # maximum length in characters for one document
    NUM_WORKERS = 0  # number of workers used for data loading
    BATCH_SIZE = 128  # number of documents per batch

    def encode(text):
        """One-hot encode a document into a (len(ALPHABET), FEATURE_LEN) matrix."""
        encoded = np.zeros([len(ALPHABET), FEATURE_LEN], dtype='float32')
        # Quantize the characters in reverse order (latest characters first, as in the paper),
        # keeping at most FEATURE_LEN characters
        review = text.lower()[::-1][:FEATURE_LEN]
        for i, letter in enumerate(review):
            if letter in ALPHABET_INDEX:
                encoded[ALPHABET_INDEX[letter]][i] = 1
        return encoded
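    # For reference: encode('Great course!') returns a float32 array of shape (69, 1014)
    # with at most one 1 per column, which Conv1D consumes as (channels=69, width=1014)
    # once batched by the DataLoader below.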
    def transform(x, y):
        return encode(x), y

    # 80/20 train/test split
    split = 0.8
    split_index = int(split * len(data))
    train_data_X = data['X'][:split_index].to_numpy()  # as_matrix() was removed in recent pandas
    train_data_Y = data['Y'][:split_index].to_numpy()
    test_data_X = data['X'][split_index:].to_numpy()
    test_data_Y = data['Y'][split_index:].to_numpy()

    train_dataset = ArrayDataset(train_data_X, train_data_Y).transform(transform)
    test_dataset = ArrayDataset(test_data_X, test_data_Y).transform(transform)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE,
                                  num_workers=NUM_WORKERS, last_batch='rollover')
    test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE,
                                 num_workers=NUM_WORKERS, last_batch='rollover')

    ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu()
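    # Network definition: a character-level convolutional network in the spirit of the
    # "Crepe" model (Zhang et al., character-level CNNs for text classification):
    # six Conv1D layers with max pooling after the first, second and last convolution,
    # followed by two dropout-regularized dense layers and a final dense output layer
    # with one unit per rating category.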
    NUM_FILTERS = 256  # number of convolutional filters per convolutional layer
    NUM_OUTPUTS = len(categories)  # number of classes
    FULLY_CONNECTED = 1024  # number of units in the fully connected dense layers
    DROPOUT_RATE = 0.5  # probability of dropping a unit
    LEARNING_RATE = 0.0001  # learning rate of the gradient descent
    MOMENTUM = 0.9  # momentum of the SGD optimizer
    WDECAY = 0.00001  # weight decay, a regularization term limiting the size of the weights

    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=7, activation='relu'))
        net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
        net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=7, activation='relu'))
        net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
        net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
        net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
        net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
        net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
        net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
        net.add(gluon.nn.Flatten())
        net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
        net.add(gluon.nn.Dropout(DROPOUT_RATE))
        net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
        net.add(gluon.nn.Dropout(DROPOUT_RATE))
        net.add(gluon.nn.Dense(NUM_OUTPUTS))
    print(net)

    hybridize = True  # compile the network for a speed improvement; in-depth debugging is then no longer possible
    # load_params = True  # load a pre-trained model

    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    if hybridize:
        net.hybridize(static_alloc=True, static_shape=True)

    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': LEARNING_RATE,
                             'wd': WDECAY,
                             'momentum': MOMENTUM})

    def evaluate_accuracy(data_iterator, net):
        acc = mx.metric.Accuracy()
        for i, (data, label) in enumerate(data_iterator):
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            output = net(data)
            prediction = nd.argmax(output, axis=1)
            acc.update(preds=prediction, labels=label)
        return acc.get()[1]
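    # Training loop. start_epoch = 6 suggests training is being resumed from an earlier run
    # (see the commented-out load_params flag above); with number_epochs = 7 only epoch 6
    # runs here. The reported loss is an exponential moving average controlled by
    # smoothing_constant, and a parameter checkpoint tagged with the test accuracy is
    # written after each epoch.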
    start_epoch = 6
    number_epochs = 7
    smoothing_constant = 0.01

    for e in range(start_epoch, number_epochs):
        for i, (review, label) in enumerate(train_dataloader):
            review = review.as_in_context(ctx)
            label = label.as_in_context(ctx)
            with autograd.record():
                output = net(review)
                loss = softmax_cross_entropy(output, label)
            loss.backward()
            trainer.step(review.shape[0])

            # moving average of the loss
            curr_loss = nd.mean(loss)
            moving_loss = (curr_loss if (i == 0)
                           else (1 - smoothing_constant) * moving_loss + smoothing_constant * curr_loss)
            if i % 200 == 0:
                print('Batch {}: Instant loss {:.4f}, Moving loss {:.4f}'.format(
                    i, curr_loss.asscalar(), moving_loss.asscalar()))

        test_accuracy = evaluate_accuracy(test_dataloader, net)
        # Save the model using the gluon params format
        net.save_parameters('crepe_epoch_{}_test_acc_{}.params'.format(e, int(test_accuracy * 10000) / 100))
        print("Epoch {}. Loss: {:.4f}, Test_acc {:.4f}".format(e, moving_loss.asscalar(), test_accuracy))

    # Export the symbol and parameters so the model can be reloaded without the Python code
    net.export('crepe', epoch=number_epochs)
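    # Qualitative check: sample 50 random reviews and compare the predicted category
    # against the labelled one. Note that the samples are drawn from the whole DataFrame,
    # so some of them come from the training split.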
    for i in range(50):
        index = random.randint(0, len(data) - 1)  # randint is inclusive on both ends
        review = data['X'][index]
        label = categories[int(data['Y'][index])]
        print(review)
        print('\nCategory: {}\n'.format(label))
        encoded = nd.array([encode(review)], ctx=ctx)
        output = net(encoded)
        predicted = categories[np.argmax(output[0].asnumpy())]
        if predicted == label:
            print('Correctly predicted the right category')
        else:
            print('Incorrectly predicted {}'.format(predicted))
    # Try the network on a hand-written review
    review_title = "Good stuff"
    review = "This course is definitely better than the previous one"
    print(review_title)
    print(review + '\n')

    encoded = nd.array([encode(review + " | " + review_title)], ctx=ctx)
    output = net(encoded)
    # softmax over the raw scores to get per-category probabilities
    softmax = nd.exp(output) / nd.sum(nd.exp(output))[0]
    predicted = categories[np.argmax(output[0].asnumpy())]
    print('Predicted: {}\n'.format(predicted))
    for i, val in enumerate(categories):
        print(val, float(int(softmax[0][i].asnumpy() * 1000) / 10), '%')


if __name__ == "__main__":
    main()