Commit
add GQA, BLOOM, remove APEX (#97)
* bug fix

* update

* gqa (#11)

* message

* message

* message

---------

Co-authored-by: yaning zhang <[email protected]>

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: yaning zhang <[email protected]>
ydli-ai and Jenine-321 authored Oct 8, 2023
1 parent 9f63aae commit 0ccdcf5
Showing 33 changed files with 346 additions and 268 deletions.
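Only a subset of the 33 changed files is reproduced below, mostly the APEX removal in the finetuning scripts plus the new 175B BLOOM config; the grouped-query attention (GQA) changes named in the title are not shown in these hunks. As a rough orientation only, here is a minimal PyTorch sketch of grouped-query attention — an illustration of the general technique, not TencentPretrain's implementation; the class and parameter names (e.g. kv_heads_num) are assumptions.

# Minimal grouped-query attention sketch (illustrative only, not this repo's code).
# Assumes hidden_size divides evenly by heads_num and heads_num by kv_heads_num;
# causal masking is omitted for brevity.
import torch
import torch.nn as nn

class GroupedQueryAttention(nn.Module):
    def __init__(self, hidden_size, heads_num, kv_heads_num):
        super().__init__()
        self.head_dim = hidden_size // heads_num
        self.heads_num = heads_num
        self.kv_heads_num = kv_heads_num
        self.q_proj = nn.Linear(hidden_size, heads_num * self.head_dim)
        # Fewer key/value heads than query heads is what distinguishes GQA from MHA.
        self.k_proj = nn.Linear(hidden_size, kv_heads_num * self.head_dim)
        self.v_proj = nn.Linear(hidden_size, kv_heads_num * self.head_dim)
        self.o_proj = nn.Linear(heads_num * self.head_dim, hidden_size)

    def forward(self, x):
        b, t, _ = x.shape
        q = self.q_proj(x).view(b, t, self.heads_num, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(b, t, self.kv_heads_num, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(b, t, self.kv_heads_num, self.head_dim).transpose(1, 2)
        # Each key/value head is shared by heads_num // kv_heads_num query heads.
        group = self.heads_num // self.kv_heads_num
        k = k.repeat_interleave(group, dim=1)
        v = v.repeat_interleave(group, dim=1)
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.head_dim ** 0.5
        probs = torch.softmax(scores, dim=-1)
        out = torch.matmul(probs, v).transpose(1, 2).reshape(b, t, -1)
        return self.o_proj(out)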
8 changes: 0 additions & 8 deletions finetune/run_c3.py
@@ -160,14 +160,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
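The same removal repeats across the finetuning scripts below: the apex.amp setup and the scale_loss wrapper around loss.backward() are simply deleted. If mixed-precision finetuning is still wanted, PyTorch's built-in AMP is the usual substitute for apex.amp; the following is a self-contained sketch of that pattern (a generic illustration with a toy model and data, not code from this commit).

# Sketch of native PyTorch mixed precision, the common replacement for apex.amp.
# Toy setup so the example runs end to end on a CUDA machine.
import torch
import torch.nn as nn

device = "cuda"
model = nn.Linear(16, 2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

src = torch.randn(8, 16, device=device)
tgt = torch.randint(0, 2, (8,), device=device)

optimizer.zero_grad()
with torch.cuda.amp.autocast():               # run the forward pass in mixed precision
    loss = nn.CrossEntropyLoss()(model(src), tgt)
scaler.scale(loss).backward()                 # scale the loss to avoid fp16 gradient underflow
scaler.step(optimizer)                        # unscales gradients, skips the step on inf/nan
scaler.update()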
8 changes: 0 additions & 8 deletions finetune/run_chid.py
@@ -179,14 +179,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
14 changes: 1 addition & 13 deletions finetune/run_classifier.py
@@ -185,11 +185,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     if args.use_adv and args.adv_type == "fgm":
         args.adv_method.attack(epsilon=args.fgm_epsilon)
@@ -310,14 +306,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
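After these two hunks, the backward pass in run_classifier.py is unconditional. Condensed from the context lines above, the post-commit control flow looks roughly like this; the forward-call signature and the omitted details (such as the adversarial-training branch) are assumptions, not a verbatim excerpt.

# Rough paraphrase of the simplified training step; exact signatures in the repo may differ.
def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch):
    model.zero_grad()
    loss, _ = model(src_batch, tgt_batch, seg_batch)   # assumed forward signature
    if torch.cuda.device_count() > 1:
        loss = torch.mean(loss)                        # average DataParallel's per-GPU losses
    loss.backward()                                    # apex scale_loss wrapper removed
    optimizer.step()
    scheduler.step()
    return loss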
8 changes: 1 addition & 7 deletions finetune/run_classifier_cv.py
@@ -95,13 +95,7 @@ def main():
     model = model.to(args.device)
     load_or_initialize_parameters(args, model)
     optimizer, scheduler = build_optimizer(args, model)
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
+
     if torch.cuda.device_count() > 1:
         model = torch.nn.DataParallel(model)
     args.model = model
8 changes: 1 addition & 7 deletions finetune/run_classifier_grid.py
@@ -74,13 +74,7 @@ def main():
     model = model.to(args.device)
     load_or_initialize_parameters(args, model)
     optimizer, scheduler = build_optimizer(args, model)
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
+
     if torch.cuda.device_count() > 1:
         model = torch.nn.DataParallel(model)
     args.model = model
8 changes: 0 additions & 8 deletions finetune/run_classifier_mt.py
@@ -158,14 +158,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
14 changes: 1 addition & 13 deletions finetune/run_classifier_multi_label.py
@@ -126,11 +126,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     if args.use_adv and args.adv_type == "fgm":
         args.adv_method.attack(epsilon=args.fgm_epsilon)
@@ -234,14 +230,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
14 changes: 1 addition & 13 deletions finetune/run_classifier_prompt.py
@@ -131,11 +131,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
@@ -257,14 +253,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
14 changes: 1 addition & 13 deletions finetune/run_classifier_siamese.py
@@ -181,11 +181,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
@@ -288,14 +284,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
13 changes: 1 addition & 12 deletions finetune/run_cmrc.py
@@ -159,11 +159,7 @@ def train(args, model, optimizer, scheduler, src_batch, seg_batch, start_positio
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
@@ -394,13 +390,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer,opt_level=args.fp16_opt_level)
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
8 changes: 0 additions & 8 deletions finetune/run_dbqa.py
@@ -179,14 +179,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer,opt_level = args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
8 changes: 0 additions & 8 deletions finetune/run_image_classifier.py
@@ -149,14 +149,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
13 changes: 1 addition & 12 deletions finetune/run_ner.py
@@ -145,11 +145,7 @@ def train(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch):
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
@@ -288,13 +284,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level = args.fp16_opt_level)
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
8 changes: 0 additions & 8 deletions finetune/run_regression.py
@@ -147,14 +147,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
14 changes: 1 addition & 13 deletions finetune/run_simcse.py
@@ -202,14 +202,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
@@ -245,11 +237,7 @@ def main():
     tgt_batch = torch.arange(similarity_matrix.size(0), device=similarity_matrix.device, dtype=torch.long)
     loss = nn.CrossEntropyLoss()(similarity_matrix, tgt_batch)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
14 changes: 1 addition & 13 deletions finetune/run_speech2text.py
@@ -149,11 +149,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_in_batch, tgt_
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
@@ -259,14 +255,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
14 changes: 1 addition & 13 deletions finetune/run_text2text.py
@@ -141,11 +141,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_in_batch, tgt_
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)

-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()

     optimizer.step()
     scheduler.step()
@@ -262,14 +258,6 @@ def main():

     optimizer, scheduler = build_optimizer(args, model)

-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
21 changes: 21 additions & 0 deletions models/bloom/175b_config.json
@@ -0,0 +1,21 @@
+{
+    "emb_size": 14336,
+    "feedforward_size": 57344,
+    "hidden_size": 14336,
+    "hidden_act": "gelu",
+    "heads_num": 112,
+    "layers_num": 70,
+    "dropout": 0.0,
+    "data_processor": "lm",
+    "embedding": ["word"],
+    "remove_transformer_bias": false,
+    "has_lmtarget_bias": false,
+    "remove_embedding_layernorm": true,
+    "encoder": "transformer",
+    "mask": "causal",
+    "layernorm_positioning": "pre",
+    "target": ["lm"],
+    "tie_weights": true,
+    "alibi_position_embedding": true,
+    "layer_number_scale": true
+}
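For reference, this config matches the published BLOOM-176B architecture: hidden_size 14336 with heads_num 112 gives 128-dimensional heads (14336 / 112 = 128), layers_num is 70, layer norm is pre-positioned on a causal language model, and positions are encoded with ALiBi biases rather than learned embeddings ("alibi_position_embedding": true). The sketch below shows the standard ALiBi per-head slope schedule as a generic illustration of what that flag implies; it is not TencentPretrain's code, and the function name alibi_slopes is hypothetical.

import math

def alibi_slopes(heads_num):
    # Standard ALiBi slope schedule (Press et al.); non-power-of-two head counts
    # such as 112 borrow extra slopes from the double-resolution sequence.
    def power_of_two_slopes(n):
        start = 2 ** (-8.0 / n)
        return [start ** (i + 1) for i in range(n)]
    if math.log2(heads_num).is_integer():
        return power_of_two_slopes(heads_num)
    closest = 2 ** math.floor(math.log2(heads_num))
    extra = power_of_two_slopes(2 * closest)[0::2][: heads_num - closest]
    return power_of_two_slopes(closest) + extra

slopes = alibi_slopes(112)   # one slope per head for the config above
print(len(slopes))           # 112; each head adds a bias of -slope * query/key distance to its scores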