-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
99 lines (77 loc) · 3.37 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader
from config import EMB_PATH
from dataloading import SentenceDataset
from models import BaselineDNN
from training import train_dataset, eval_dataset
from utils.load_datasets import load_MR, load_Semeval2017A
from utils.load_embeddings import load_word_vectors
# Suppress scikit-learn's UndefinedMetricWarning for the whole run.
warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)

########################################################
# Configuration
########################################################

# Pretrained word embeddings: download a set of your choice, for example
# http://nlp.stanford.edu/data/glove.6B.zip, and place the file in the
# /embeddings folder.
# EMB_DIM has to match the dimensionality of the vectors in EMBEDDINGS.
EMB_DIM = 50
EMBEDDINGS = os.path.join(EMB_PATH, "glove.6B.50d.txt")

# Handed to the model as `trainable_emb`.
EMB_TRAINABLE = False

EPOCHS = 50
BATCH_SIZE = 128
DATASET = "MR"  # options: "MR", "Semeval2017A"

# Run on a CUDA-compatible GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
########################################################
# Define PyTorch datasets and dataloaders
########################################################

# load word embeddings
print("loading word embeddings...")
word2idx, idx2word, embeddings = load_word_vectors(EMBEDDINGS, EMB_DIM)

# load the raw data
if DATASET == "Semeval2017A":
    X_train, y_train, X_test, y_test = load_Semeval2017A()
elif DATASET == "MR":
    X_train, y_train, X_test, y_test = load_MR()
else:
    raise ValueError("Invalid dataset")

# EX1 - convert data labels from strings to integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping (a test label
# never seen during fitting makes `transform` raise ValueError).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
n_classes = label_encoder.classes_.size

# Define our PyTorch-based Dataset
train_set = SentenceDataset(X_train, y_train, word2idx)
test_set = SentenceDataset(X_test, y_test, word2idx)

# EX4/EX7 - Define our PyTorch-based DataLoader.
# Training batches are reshuffled every epoch; the test set keeps its order.
# NOTE(review): relies on DataLoader's default collation, i.e. assumes
# SentenceDataset yields equally-shaped samples - confirm against dataloading.py.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)
#############################################################################
# Model Definition (Model, Loss Function, Optimizer)
#############################################################################

# EX8 - one output unit per class found in the labels
model = BaselineDNN(output_size=n_classes,
                    embeddings=embeddings,
                    trainable_emb=EMB_TRAINABLE)

# move the model weights to cpu or gpu
model.to(DEVICE)
print(model)

# EX8 - loss function.
# NOTE(review): CrossEntropyLoss assumes the model emits one raw logit per
# class and that train_dataset/eval_dataset pass integer class indices as
# targets - confirm against models.py and training.py.
criterion = torch.nn.CrossEntropyLoss()

# We optimize ONLY those parameters that are trainable (p.requires_grad==True)
parameters = [p for p in model.parameters() if p.requires_grad]

# NOTE(review): Adam with its default hyper-parameters is a starting point,
# not a tuned choice.
optimizer = torch.optim.Adam(parameters)
#############################################################################
# Training Pipeline
#############################################################################
for epoch in range(1, EPOCHS + 1):
    # train the model for one epoch
    train_dataset(epoch, train_loader, model, criterion, optimizer)

    # evaluate the performance of the model, on both data sets
    train_loss, (y_train_gold, y_train_pred) = eval_dataset(train_loader,
                                                            model,
                                                            criterion)
    test_loss, (y_test_gold, y_test_pred) = eval_dataset(test_loader,
                                                         model,
                                                         criterion)

    # Report the losses instead of silently discarding the evaluation results.
    # No format spec is applied, so this works for whatever type eval_dataset
    # returns as the loss.
    print(f"epoch {epoch}/{EPOCHS}: "
          f"train loss = {train_loss}, test loss = {test_loss}")