From 8b116777dade6f1b41ef9d9c900f4f65c6f1daf3 Mon Sep 17 00:00:00 2001 From: anxiang Date: Sun, 21 Mar 2021 15:04:27 +0800 Subject: [PATCH] Simplify running commands(single node and multi nodes) 1. Update training logs(glint360k) 2. Update install docs 3. Fix distributed training --- recognition/arcface_torch/README.md | 63 +++++++++-------------- recognition/arcface_torch/config.py | 3 ++ recognition/arcface_torch/docs/eval.md | 13 +++-- recognition/arcface_torch/docs/install.md | 36 +++++++++++++ recognition/arcface_torch/run.sh | 4 +- recognition/arcface_torch/train.py | 14 ++--- 6 files changed, 81 insertions(+), 52 deletions(-) create mode 100644 recognition/arcface_torch/docs/install.md diff --git a/recognition/arcface_torch/README.md b/recognition/arcface_torch/README.md index 5d26018b2..cd8614c4a 100644 --- a/recognition/arcface_torch/README.md +++ b/recognition/arcface_torch/README.md @@ -1,48 +1,34 @@ # Arcface Pytorch (Distributed Version of ArcFace) - ## Contents ## Set Up ```shell torch >= 1.6.0 -``` - -## Train on a single node -If you want to use 8 GPU to train, you should set `--nproc_per_node=8` and set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ` -If you want to use 4 GPU to train, you should set `--nproc_per_node=4` and set `CUDA_VISIBLE_DEVICES=0,1,2,3` -If you want to use 1 GPU to train, you should set `--nproc_per_node=1` ... +``` +For more details, see [install.md](docs/install.md) in docs. +## Training +### 1. Single node, 1 GPU: ```shell -export OMP_NUM_THREADS=4 -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python -m torch.distributed.launch \ ---nproc_per_node=8 --nnodes=1 \ ---node_rank=0 --master_addr="127.0.0.1" \ ---master_port=1234 train.py -ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh +python -m torch.distributed.launch --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py ``` - -## Train on multi-node +### 2. 
Single node, 8 GPUs: ```shell -pass +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py ``` - -## Evaluation +### 3. Multiple nodes, each node 8 GPUs: +Node 0: +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py +``` +Node 1: ```shell -# model-prefix your model path -# image-path your IJBC path -# result-dir your result path -# network your backbone -CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \ ---model-prefix ms1mv3_arcface_r50/backbone.pth \ ---image-path IJB_release/IJBC \ ---result-dir ms1mv3_arcface_r50 \ ---batch-size 128 \ ---job ms1mv3_arcface_r50 \ ---target IJBC \ ---network iresnet50 +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py ``` + + +## Evaluation IJBC More details see [eval.md](docs/eval.md) in docs. ## Speed Benchmark @@ -89,14 +75,12 @@ All Model Can be found in here. 
### Glint360k | Datasets | log |backbone | IJBC(1e-05) | IJBC(1e-04) |agedb30|cfp_fp|lfw | | :---: | :--- |:--- | :--- | :--- |:--- |:--- |:--- | -| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100/training.log) |r100 | 96.19 | 97.39 | 98.52 | 99.26 | 99.83 | -| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|r100-fp16-sample-0.1 | 95.95 | 97.35 | 98.57 | 99.30 | 99.85 | -| Glint360k-Cosface | - | - | - | - | - | - | - | -| Glint360k-Cosface | - | - | - | - | - | - | - | -| Glint360k-Cosface | - | - | - | - | - | - | - | - - - +| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r18_fp16_0.1/training.log) |r18-fp16-0.1 | 93.16 | 95.33 | 97.72 | 97.73 | 99.77 | +| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r34_fp16_0.1/training.log) |r34-fp16-0.1 | 95.16 | 96.56 | 98.33 | 98.78 | 99.82 | +| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r50_fp16_0.1/training.log) |r50-fp16-0.1 | 95.61 | 96.97 | 98.38 | 99.20 | 99.83 | +| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|r100-fp16-0.1 | 95.88 | 97.32 | 98.48 | 99.29 | 99.82 | + +0.1 means sample rate is 0.1. More details see [eval.md](docs/modelzoo.md) in docs. @@ -121,3 +105,4 @@ We test on PyTorch versions 1.6.0, 1.7.1, and 1.8.0. 
Please create an issue if y year={2020} } ``` +7 \ No newline at end of file diff --git a/recognition/arcface_torch/config.py b/recognition/arcface_torch/config.py index 0d76b016c..5fc70e7a6 100644 --- a/recognition/arcface_torch/config.py +++ b/recognition/arcface_torch/config.py @@ -37,6 +37,9 @@ def lr_step_func(epoch): config.lr_func = lr_step_func elif config.dataset == "glint360k": + # make training faster + # our RAM is 256G + # mount -t tmpfs -o size=140G tmpfs /train_tmp config.rec = "/train_tmp/glint360k" config.num_classes = 360232 config.num_image = 17091657 diff --git a/recognition/arcface_torch/docs/eval.md b/recognition/arcface_torch/docs/eval.md index 759fbf51b..c529670b6 100644 --- a/recognition/arcface_torch/docs/eval.md +++ b/recognition/arcface_torch/docs/eval.md @@ -1,15 +1,20 @@ ## Eval IJBC ```shell +# model-prefix your model path +# image-path your IJBC path +# result-dir your result path +# network your backbone CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \ ---model-prefix tmp_models/backbone.pth \ ---image-path /data/anxiang/IJB_release/IJBC \ ---result-dir result \ +--model-prefix ms1mv3_arcface_r50/backbone.pth \ +--image-path IJB_release/IJBC \ +--result-dir ms1mv3_arcface_r50 \ --batch-size 128 \ ---job cosface \ +--job ms1mv3_arcface_r50 \ --target IJBC \ --network iresnet50 ``` ## Eval MegaFace +pass diff --git a/recognition/arcface_torch/docs/install.md b/recognition/arcface_torch/docs/install.md new file mode 100644 index 000000000..33b751506 --- /dev/null +++ b/recognition/arcface_torch/docs/install.md @@ -0,0 +1,36 @@ +## v1.7.1 +### Linux and Windows +```shell +# CUDA 11.0 +pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 10.2 +pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 + +# CUDA 10.1 +pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 
9.2 +pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CPU only +pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html +``` + + +## v1.6.0 + +### Linux and Windows +```shell +# CUDA 10.2 +pip install torch==1.6.0 torchvision==0.7.0 + +# CUDA 10.1 +pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 9.2 +pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html + +# CPU only +pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html +``` \ No newline at end of file diff --git a/recognition/arcface_torch/run.sh b/recognition/arcface_torch/run.sh index 26de7c51c..3371a39b1 100644 --- a/recognition/arcface_torch/run.sh +++ b/recognition/arcface_torch/run.sh @@ -1,4 +1,2 @@ -export OMP_NUM_THREADS=4 -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 \ ---node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh diff --git a/recognition/arcface_torch/train.py b/recognition/arcface_torch/train.py index bf84e6c98..220c23107 100644 --- a/recognition/arcface_torch/train.py +++ b/recognition/arcface_torch/train.py @@ -22,11 +22,13 @@ def main(args): - dist.init_process_group(backend='nccl', init_method='env://') + + world_size = int(os.environ['WORLD_SIZE']) + rank = int(os.environ['RANK']) + dist_url = "tcp://{}:{}".format(os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"]) + dist.init_process_group(backend='nccl', init_method=dist_url, rank=rank, 
world_size=world_size) local_rank = args.local_rank torch.cuda.set_device(local_rank) - rank = dist.get_rank() - world_size = dist.get_world_size() if not os.path.exists(cfg.output) and rank is 0: os.makedirs(cfg.output) @@ -124,8 +126,8 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description='PyTorch ArcFace Training') parser.add_argument('--local_rank', type=int, default=0, help='local_rank') - parser.add_argument('--network', type=str, default="iresnet50", help="backbone network") - parser.add_argument('--loss', type=str, default="ArcFace", help="loss function") - parser.add_argument('--resume', type=int, default=0, help="model resuming") + parser.add_argument('--network', type=str, default='iresnet50', help='backbone network') + parser.add_argument('--loss', type=str, default='ArcFace', help='loss function') + parser.add_argument('--resume', type=int, default=0, help='model resuming') args_ = parser.parse_args() main(args_)