huggingface_bert.py
# BSD 3-Clause License
#
# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the psutil authors nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification
from patrickstar.runtime import initialize_engine
from patrickstar.utils import get_rank
from imdb_dataset import get_dataset
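
# Note: get_dataset comes from the example's imdb_dataset helper; judging from
# its usage below, it returns tokenized train/validation/test splits of the
# IMDB reviews, and the path assumes aclImdb_v1.tar.gz has been extracted
# under /root (the helper's exact behavior is an assumption, not documented here).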
# Uncomment these lines when doing multiprocess training
# torch.distributed.init_process_group(backend='nccl')
# torch.cuda.set_device(get_rank())
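# A typical multi-GPU launch would then be (a sketch; the exact launcher
# depends on your torch version, e.g. torchrun on newer releases):
#   python -m torch.distributed.launch --nproc_per_node=8 huggingface_bert.py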
train_dataset, _, test_dataset = get_dataset("/root/aclImdb")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def model_func():
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    # For large models, uncomment the following line to enable gradient checkpointing
    # model.gradient_checkpointing_enable()
    return model
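
# PatrickStar takes a constructor function rather than a built model so the
# engine can intercept parameter allocation during construction and place the
# weights into its chunk-based memory manager (a hedged reading of the API).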

config = {
    # The same format as DeepSpeed's optimizer config:
    # https://www.deepspeed.ai/docs/config-json/#optimizer-parameters
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 5e-5,
            "betas": (0.9, 0.999),
            "eps": 1e-6,
            "weight_decay": 0,
            "use_hybrid_adam": True,
        },
    },
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "initial_scale_power": 2 ** 3,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },
    "default_chunk_size": 64 * 1024 * 1024,
    "release_after_init": False,
    "use_cpu_embedding": False,
}
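
# A hedged reading of the PatrickStar-specific knobs above:
# - use_hybrid_adam: lets optimizer states live on CPU and GPU adaptively
#   instead of being pinned to a single device.
# - loss_scale: 0 requests dynamic fp16 loss scaling (DeepSpeed semantics),
#   tuned by initial_scale_power / loss_scale_window / hysteresis /
#   min_loss_scale.
# - default_chunk_size: the number of elements per memory chunk that the
#   engine moves between CPU and GPU.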

model, optim = initialize_engine(
    model_func=model_func, local_rank=get_rank(), config=config
)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
print("train loss:")
for i, batch in enumerate(train_loader):
    optim.zero_grad()
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]
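    # As in DeepSpeed's engine API, backward is routed through the engine
    # (model.backward rather than loss.backward) so it can apply fp16 loss
    # scaling and manage chunk movement.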
    model.backward(loss)
    optim.step()
    print(i, loss.item())
    if i == 10:
        break
model.eval()
print("test loss:")
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
for i, batch in enumerate(test_loader):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]
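    # When labels are supplied, outputs[1] holds the classification logits, so
    # an accuracy tally could be added here, e.g. (sketch):
    #   correct += (outputs[1].argmax(dim=-1) == labels).sum().item()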
    print(i, loss.item())
    if i == 5:
        break