# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
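# Usage sketch (the environment variables below are the ones this script reads;
# the example values are illustrative, not recommendations):
#   GPUS=4 BATCH_SIZE=16 PER_DEVICE_BATCH_SIZE=1 bash run_train.sh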
set -x
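# Distributed training setup, overridable via environment variables.
# Effective global batch size = PER_DEVICE_BATCH_SIZE * GPUS * GRADIENT_ACC,
# so the gradient-accumulation steps are derived from that identity below.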
GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-32}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
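# Hybrid parallelism: the sharding degree is derived so that
# tensor_parallel_degree * sharding_parallel_degree == GPUS.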
tensor_parallel_degree=${tensor_parallel_degree:-1}
sharding_parallel_degree=$((GPUS / tensor_parallel_degree))
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3
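# Checkpoints, logs, and VisualDL records are all written under OUTPUT_DIR.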
OUTPUT_DIR='work_dirs/got_ocr_20'
mkdir -p "$OUTPUT_DIR"
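# Resume placeholder; "None" appears unused by the launch command below.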
TRAINING_MODEL_RESUME="None"
TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'
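# Single-node collective launch: paddle.distributed.launch starts one worker per GPU.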
# Note: --freeze_vision_tower is False below; set it to True for stage-3 training.
TRAINING_PYTHON="python -m paddle.distributed.launch --master ${MASTER} --nnodes 1 --nproc_per_node ${GPUS} --rank 0 --ips ${TRAINER_INSTANCES} --run_mode=collective"
${TRAINING_PYTHON} --log_dir ${OUTPUT_DIR}/paddle_distributed_logs \
paddlemix/examples/GOT_OCR_2_0/train_GOT.py \
--do_train \
--model_name_or_path "stepfun-ai/GOT-OCR2_0" \
--output_dir ${OUTPUT_DIR} \
--logging_dir ${OUTPUT_DIR}/logs \
--meta_path paddlemix/examples/GOT_OCR_2_0/configs/demo_dataset.json \
--overwrite_output_dir True \
--dataloader_num_workers 8 \
--bf16 True \
--fp16 False \
--fp16_opt_level "O2" \
--num_train_epochs 1 \
--per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
--gradient_accumulation_steps ${GRADIENT_ACC} \
--freeze_vision_tower False \
--use_im_start_end True \
--max_seq_length 8192 \
--recompute False \
--max_grad_norm 1.0 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.001 \
--optim "adamw" \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "visualdl" \
--tensor_parallel_degree=${tensor_parallel_degree} \
--sharding_parallel_degree=${sharding_parallel_degree} \
--pipeline_parallel_degree=1 \
--sep_parallel_degree=1 \
--sharding="stage1" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"