# train_conll2003.py
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
from transformers import BertTokenizerFast
import utils.utils_conll2003 as E
import utils.utils_Generic as G
from utils.utils_training_testing import eval_conll2003
from nets.Bert_Only import BertNER
if __name__ == '__main__':
    # ----------------------------------------------------#
    # save_path: Path to save the JSON file
    # download_path: Path to the conll2003 dataset
    # if_downloaded: Whether the conll2003 dataset has already been downloaded
    # pretrained_model_name: Which pretrained BERT model to use
    # ----------------------------------------------------#
    save_path = 'dataset/conll2003.jsonl'
    download_path = 'dataset/conll2003_NER'
    if_downloaded = True
    pretrained_model_name = 'All_Bert_Pretrained_Models/bert-base-uncased'
    # ----------------------------------------------------#
    # Training parameters
    # epoch_num: Number of training epochs
    # batch_size: Batch size
    # lr: Learning rate
    # ----------------------------------------------------#
    epoch_num = 1
    batch_size = 2
    lr = 2e-5
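    # Note: with epoch_num = 1, the periodic checkpoint below (written every
    # 5 epochs) never fires; raise epoch_num if you want weights saved.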
    # ----------------------------------------------------#
    # Download the dataset
    # ----------------------------------------------------#
    label_list, categories = E.get_json_data(save_path=save_path, if_downloaded=if_downloaded,
                                             download_path=download_path)
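    # Assumption: get_json_data (from utils/utils_conll2003.py) returns the full
    # NER tag list (e.g. B-PER, I-PER, ...) plus the entity categories; the exact
    # contract lives in that utility module.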
    # ----------------------------------------------------#
    # Read in the data and get the dataloaders
    # The train ratio defaults to 0.8; change it by passing an argument to "load_json"
    # There is no test set; if you need one, modify a few lines in "load_json"
    # ----------------------------------------------------#
    train_data, val_data = E.load_json(save_path)
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
    train_loader = G.get_dataloader(train_data, tokenizer, categories=categories, mode='Train')
    val_loader = G.get_dataloader(val_data, tokenizer, categories=categories, mode='Val')
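    # Assumption: each batch yields (tokenized_inputs, targets), with one label id
    # per wordpiece and -100 at positions to ignore. batch_size is defined above
    # but not passed here; whether get_dataloader reads it or uses its own default
    # depends on utils/utils_Generic.py.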
    # ----------------------------------------------------#
    # Get the model and put it on the GPU
    # ----------------------------------------------------#
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertNER(len(label_list), bert_model_type=pretrained_model_name)
    model.to(device)
    # ----------------------------------------------------#
    # Optimizer and loss function
    # ----------------------------------------------------#
    optimizer = Adam(model.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss(ignore_index=-100)
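    # ignore_index=-100 makes the loss skip masked positions: -100 is PyTorch's
    # default ignore_index and the Hugging Face convention for labels on
    # [CLS]/[SEP] and subword continuations in token classification.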
    # ----------------------------------------------------#
    # Start training
    # ----------------------------------------------------#
    print('\nStart training!!!\n')
    for epoch in range(epoch_num):
        total_batches = len(train_loader)
        with tqdm(total=total_batches, desc=f'Epoch {epoch + 1}/{epoch_num}', unit='batch') as pbar:
            for data in train_loader:
                model.train()
                tokenized_inputs, targets = data
                tokenized_inputs, targets = tokenized_inputs.to(device), targets.to(device)
                targets = targets.view(-1)
                outputs = model(tokenized_inputs)
                loss = loss_func(outputs, targets)
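                # Assumed shapes: BertNER returns logits already flattened to
                # (batch * seq_len, num_labels), matching the flattened targets;
                # otherwise outputs would need a .view(-1, len(label_list)) here.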
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                pbar.update(1)
                pbar.set_postfix(loss=loss.item())
        with torch.no_grad():
            model.eval()
            precision, recall, f1 = eval_conll2003(val_loader, model, device, categories)
            print(f'Epoch: {epoch + 1:02d}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
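        # Note: torch.save does not create missing directories, so the logs/
        # folder is assumed to exist before the first checkpoint is written.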
        if (epoch + 1) % 5 == 0:
            sub_path = int(f1 * 1000)
            ckpt_path = f'logs/model_f1_{sub_path}.pth'  # separate variable so the dataset path in save_path is not overwritten
            torch.save(model.state_dict(), ckpt_path)
    print('\nFinished Training!!!\n')