From 920e25b282e99d41eb20d3993485b73d6cc428ba Mon Sep 17 00:00:00 2001 From: Xingyu Xie Date: Thu, 1 Sep 2022 18:50:19 +0800 Subject: [PATCH] code submission --- CV/MAE/README.md | 126 ++ CV/MAE/adan.py | 154 +++ CV/MAE/engine_finetune.py | 131 ++ CV/MAE/engine_pretrain.py | 83 ++ CV/MAE/exp_results/MAE/base/log_base_ft.txt | 100 ++ .../MAE/base/log_base_pretrain.txt | 800 +++++++++++ CV/MAE/exp_results/MAE/large/log_large_ft.txt | 50 + .../MAE/large/log_large_pretrain.txt | 801 +++++++++++ CV/MAE/main_finetune.py | 391 ++++++ CV/MAE/main_linprobe.py | 316 +++++ CV/MAE/main_pretrain.py | 277 ++++ CV/MAE/models_mae.py | 250 ++++ CV/MAE/models_vit.py | 74 + CV/MAE/util/crop.py | 42 + CV/MAE/util/datasets.py | 65 + CV/MAE/util/lars.py | 47 + CV/MAE/util/lr_decay.py | 76 + CV/MAE/util/lr_sched.py | 21 + CV/MAE/util/misc.py | 366 +++++ CV/MAE/util/pos_embed.py | 96 ++ CV/timm/README.md | 79 ++ CV/timm/adan.py | 154 +++ .../ConvNext/small/args_cvnext_150.yaml | 111 ++ .../ConvNext/small/args_cvnext_300.yaml | 111 ++ .../ConvNext/small/summary_cvnext_150.csv | 162 +++ .../ConvNext/small/summary_cvnext_300.csv | 311 +++++ .../ResNet/Res50/args_res50_100.yaml | 111 ++ .../ResNet/Res50/args_res50_200.yaml | 111 ++ .../ResNet/Res50/args_res50_300.yaml | 112 ++ .../ResNet/Res50/summary_res50_100.csv | 111 ++ .../ResNet/Res50/summary_res50_200.csv | 211 +++ .../ResNet/Res50/summary_res50_300.csv | 311 +++++ .../exp_results/ViT/base/args_vit-B_150.yaml | 112 ++ .../exp_results/ViT/base/args_vit-B_300.yaml | 112 ++ .../ViT/base/summary_vit-B_150.csv | 161 +++ .../ViT/base/summary_vit-B_300.csv | 311 +++++ .../ViT/small/args_vit-s_150-I.yaml | 113 ++ .../exp_results/ViT/small/args_vit-s_150.yaml | 111 ++ .../ViT/small/args_vit-s_300-I.yaml | 113 ++ .../exp_results/ViT/small/args_vit-s_300.yaml | 111 ++ .../ViT/small/summary_vit-s_150-I.csv | 171 +++ .../ViT/small/summary_vit-s_150.csv | 162 +++ .../ViT/small/summary_vit-s_300-I.csv | 311 +++++ .../ViT/small/summary_vit-s_300.csv | 311 +++++ CV/timm/optim_factory.py | 343 +++++ CV/timm/sam.py | 62 + CV/timm/supervised.md | 168 +++ CV/timm/train.py | 830 +++++++++++ NLP/BERT/README.md | 213 +++ NLP/BERT/adan.py | 231 ++++ NLP/BERT/config/finetuning/acc_test.py | 116 ++ NLP/BERT/config/finetuning/cola-adan.yaml | 59 + NLP/BERT/config/finetuning/cola.yaml | 59 + NLP/BERT/config/finetuning/mnli-adan.yaml | 59 + NLP/BERT/config/finetuning/mnli.yaml | 59 + NLP/BERT/config/finetuning/qnli-adan.yaml | 59 + NLP/BERT/config/finetuning/qnli.yaml | 59 + NLP/BERT/config/finetuning/qqp-adan.yaml | 59 + NLP/BERT/config/finetuning/qqp.yaml | 59 + NLP/BERT/config/finetuning/rte-adan.yaml | 59 + NLP/BERT/config/finetuning/rte.yaml | 59 + NLP/BERT/config/finetuning/sst_2-adan.yaml | 59 + NLP/BERT/config/finetuning/sst_2.yaml | 59 + NLP/BERT/config/finetuning/sts_b-adan.yaml | 58 + NLP/BERT/config/finetuning/sts_b.yaml | 58 + NLP/BERT/config/pretraining/base.yaml | 42 + NLP/BERT/config/pretraining/bert-adan.yaml | 52 + NLP/BERT/config/pretraining/bert-base.yaml | 54 + .../pretrain/full_config-adam.yaml | 376 +++++ .../pretrain/full_config-adan.yaml | 376 +++++ NLP/Transformer-XL/README.md | 92 ++ NLP/Transformer-XL/adan.py | 154 +++ NLP/Transformer-XL/data_utils.py | 273 ++++ NLP/Transformer-XL/eval.py | 122 ++ NLP/Transformer-XL/exp_results/log-100k.txt | 649 +++++++++ NLP/Transformer-XL/exp_results/log-200k.txt | 1224 +++++++++++++++++ NLP/Transformer-XL/exp_results/log-50k.txt | 360 +++++ NLP/Transformer-XL/exp_results/log-adam.txt | 1224 +++++++++++++++++ 
NLP/Transformer-XL/mem_transformer.py | 812 +++++++++++ NLP/Transformer-XL/run_wt103_adan.sh | 46 + NLP/Transformer-XL/train.py | 581 ++++++++ NLP/Transformer-XL/utils/adaptive_softmax.py | 90 ++ NLP/Transformer-XL/utils/data_parallel.py | 91 ++ NLP/Transformer-XL/utils/exp_utils.py | 40 + .../utils/log_uniform_sampler.py | 147 ++ .../utils/proj_adaptive_softmax.py | 151 ++ NLP/Transformer-XL/utils/vocabulary.py | 163 +++ README.md | 135 ++ adan.py | 154 +++ 89 files changed, 18455 insertions(+) create mode 100644 CV/MAE/README.md create mode 100644 CV/MAE/adan.py create mode 100644 CV/MAE/engine_finetune.py create mode 100644 CV/MAE/engine_pretrain.py create mode 100644 CV/MAE/exp_results/MAE/base/log_base_ft.txt create mode 100644 CV/MAE/exp_results/MAE/base/log_base_pretrain.txt create mode 100644 CV/MAE/exp_results/MAE/large/log_large_ft.txt create mode 100644 CV/MAE/exp_results/MAE/large/log_large_pretrain.txt create mode 100644 CV/MAE/main_finetune.py create mode 100644 CV/MAE/main_linprobe.py create mode 100644 CV/MAE/main_pretrain.py create mode 100644 CV/MAE/models_mae.py create mode 100644 CV/MAE/models_vit.py create mode 100644 CV/MAE/util/crop.py create mode 100644 CV/MAE/util/datasets.py create mode 100644 CV/MAE/util/lars.py create mode 100644 CV/MAE/util/lr_decay.py create mode 100644 CV/MAE/util/lr_sched.py create mode 100644 CV/MAE/util/misc.py create mode 100644 CV/MAE/util/pos_embed.py create mode 100644 CV/timm/README.md create mode 100644 CV/timm/adan.py create mode 100644 CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml create mode 100644 CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml create mode 100644 CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv create mode 100644 CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv create mode 100644 CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml create mode 100644 CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml create mode 100644 CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml create mode 100644 CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv create mode 100644 CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv create mode 100644 CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv create mode 100644 CV/timm/exp_results/ViT/base/args_vit-B_150.yaml create mode 100644 CV/timm/exp_results/ViT/base/args_vit-B_300.yaml create mode 100644 CV/timm/exp_results/ViT/base/summary_vit-B_150.csv create mode 100644 CV/timm/exp_results/ViT/base/summary_vit-B_300.csv create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_150.yaml create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_300.yaml create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_150.csv create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_300.csv create mode 100644 CV/timm/optim_factory.py create mode 100644 CV/timm/sam.py create mode 100644 CV/timm/supervised.md create mode 100644 CV/timm/train.py create mode 100644 NLP/BERT/README.md create mode 100644 NLP/BERT/adan.py create mode 100644 NLP/BERT/config/finetuning/acc_test.py create mode 100644 NLP/BERT/config/finetuning/cola-adan.yaml create mode 100644 NLP/BERT/config/finetuning/cola.yaml create mode 100644 NLP/BERT/config/finetuning/mnli-adan.yaml 
create mode 100644 NLP/BERT/config/finetuning/mnli.yaml
create mode 100644 NLP/BERT/config/finetuning/qnli-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/qnli.yaml
create mode 100644 NLP/BERT/config/finetuning/qqp-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/qqp.yaml
create mode 100644 NLP/BERT/config/finetuning/rte-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/rte.yaml
create mode 100644 NLP/BERT/config/finetuning/sst_2-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/sst_2.yaml
create mode 100644 NLP/BERT/config/finetuning/sts_b-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/sts_b.yaml
create mode 100644 NLP/BERT/config/pretraining/base.yaml
create mode 100644 NLP/BERT/config/pretraining/bert-adan.yaml
create mode 100644 NLP/BERT/config/pretraining/bert-base.yaml
create mode 100644 NLP/BERT/exp_results/pretrain/full_config-adam.yaml
create mode 100644 NLP/BERT/exp_results/pretrain/full_config-adan.yaml
create mode 100644 NLP/Transformer-XL/README.md
create mode 100644 NLP/Transformer-XL/adan.py
create mode 100644 NLP/Transformer-XL/data_utils.py
create mode 100644 NLP/Transformer-XL/eval.py
create mode 100644 NLP/Transformer-XL/exp_results/log-100k.txt
create mode 100644 NLP/Transformer-XL/exp_results/log-200k.txt
create mode 100644 NLP/Transformer-XL/exp_results/log-50k.txt
create mode 100644 NLP/Transformer-XL/exp_results/log-adam.txt
create mode 100644 NLP/Transformer-XL/mem_transformer.py
create mode 100644 NLP/Transformer-XL/run_wt103_adan.sh
create mode 100644 NLP/Transformer-XL/train.py
create mode 100644 NLP/Transformer-XL/utils/adaptive_softmax.py
create mode 100644 NLP/Transformer-XL/utils/data_parallel.py
create mode 100644 NLP/Transformer-XL/utils/exp_utils.py
create mode 100644 NLP/Transformer-XL/utils/log_uniform_sampler.py
create mode 100644 NLP/Transformer-XL/utils/proj_adaptive_softmax.py
create mode 100644 NLP/Transformer-XL/utils/vocabulary.py
create mode 100644 README.md
create mode 100644 adan.py

diff --git a/CV/MAE/README.md b/CV/MAE/README.md
new file mode 100644
index 0000000..918c011
--- /dev/null
+++ b/CV/MAE/README.md
@@ -0,0 +1,126 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+We provide instructions for modifying the official training and fine-tuning files used in [MAE](https://github.com/facebookresearch/mae) so that you can use Adan to train MAE. **Please follow the MAE instructions to install the necessary packages.**
+
+
+
+## Environment
+
+Our experiments for this task are based on the following package versions.
+
+```python
+torch.__version__ = '1.7.1+cu110'
+torchvision.__version__ = '0.8.2+cu110'
+timm.__version__ = '0.4.5'
+torchaudio.__version__ = '0.7.2'
+```
+If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:mae](https://hub.docker.com/repository/docker/xyxie/adan-image).
+
+
+
+## Usage of Adan for MAE
+
+### Two steps to use Adan
+
+**Step 1.** Add the following parameters to `main_pretrain.py` and `main_finetune.py`.
+
+```python
+parser.add_argument('--use-adan', action='store_true', default=False, help='whether to use Adan')
+parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm of the gradient is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use opt default 1e-8 in adan)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use opt default [0.98, 0.92, 0.99] in Adan)')
+```
+
+* `use-adan`: whether to use Adan. The default optimizer is AdamW.
+
+* `max-grad-norm`: determines whether to perform gradient clipping.
+
+* `opt-eps`: optimizer epsilon to avoid the bad case where the second-order moment is zero.
+
+* `opt-betas`: optimizer betas for Adan.
+
+
+
+**Step 2.** Create the Adan optimizer as follows. In this step, you can directly replace the vanilla optimizer creation:
+
+```python
+# following timm: set wd as 0 for bias and norm layers
+param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
+if args.use_adan:
+    if args.bias_decay:
+        param = model_without_ddp.parameters()
+    else:
+        param = param_groups
+        args.weight_decay = 0.0
+    optimizer = Adan(param, weight_decay=args.weight_decay,
+                     lr=args.lr, betas=args.opt_betas,
+                     eps=args.opt_eps, max_grad_norm=args.max_grad_norm)
+else:
+    optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+```
+
+
+
+## MAE Pre-training
+
+```python
+python main_pretrain.py \
+    --batch_size 256 --accum_iter 1 \
+    --model ${MODEL_NAME} --norm_pix_loss --mask_ratio 0.75 \
+    --epochs 800 \
+    --lr ${LR} --weight_decay 0.02 --warmup_epochs ${WR_EPOCH} \
+    --min_lr ${MIN_LR} \
+    --opt-betas 0.98 0.92 0.90 --opt-eps 1e-8 --max-grad-norm 10.0 \
+    --use-adan \
+    --data_path ${IMAGENET_DIR} \
+    --output_dir ${OUT_DIR}
+```
+
+- The pre-training file `main_pretrain.py` comes from [MAE](https://github.com/facebookresearch/mae).
+- We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
+- There are some differences between the hyper-parameters for MAE-Base and MAE-Large:
+
+|           |      MODEL_NAME       |   LR   | MIN_LR | WR_EPOCH |
+| :-------: | :-------------------: | :----: | :----: | :------: |
+| MAE-Base  | mae_vit_base_patch16  | 2.0e-3 |  1e-8  |    40    |
+| MAE-Large | mae_vit_large_patch16 | 2.2e-3 |  1e-4  |    80    |
+
+
+## MAE Fine-tuning
+
+```python
+python main_finetune.py \
+    --accum_iter 1 \
+    --batch_size 256 \
+    --model ${MODEL_NAME} \
+    --finetune ${PATH to Pre-trained Model} \
+    --epochs ${EPOCH} \
+    --lr 1.5e-2 --layer_decay ${LAYER_DECAY} \
+    --min-lr ${MIN_LR} \
+    --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 0 \
+    --use-adan --warmup-epochs ${WR_EPOCH} \
+    --weight_decay ${WD} --drop_path ${DROP_PATH} \
+    --mixup 0.8 --cutmix 1.0 --reprob 0.25 \
+    --dist_eval --data_path ${IMAGENET_DIR}
+```
+
+- The fine-tuning file `main_finetune.py` comes from [MAE](https://github.com/facebookresearch/mae).
+- We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
+
+- There are some differences between the hyper-parameters for MAE-Base and MAE-Large:
+
+|           |    MODEL_NAME     | EPOCH | MIN_LR | LAYER_DECAY | WR_EPOCH |  WD  | DROP_PATH |
+| :-------: | :---------------: | :---: | :----: | :---------: | :------: | :--: | :-------: |
+| MAE-Base  | vit_base_patch16  |  100  |  1e-6  |    0.65     |    40    | 5e-3 |    0.1    |
+| MAE-Large | vit_large_patch16 |  50   |  1e-5  |    0.75     |    10    | 1e-3 |    0.2    |
+
+
+
+## Results and Logs
+
+|                | MAE-Base | MAE-Large |
+| :------------: | :------: | :-------: |
+| Top-1 Acc. (%) |   83.8   |   85.9    |
+| download | [log-pretrain](./exp_results/MAE/base/log_base_pretrain.txt)/[log-finetune](./exp_results/MAE/base/log_base_ft.txt)/model | [log-pretrain](./exp_results/MAE/large/log_large_pretrain.txt)/[log-finetune](./exp_results/MAE/large/log_large_ft.txt)/model |
+
diff --git a/CV/MAE/adan.py b/CV/MAE/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/CV/MAE/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from timm.utils import *
+
+
+class Adan(Optimizer):
+    """
+    Implements a PyTorch variant of Adan.
+
+    Adan was proposed in
+    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
+    https://arxiv.org/abs/2208.06677
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float, float], optional): coefficients used for computing
+            running averages of gradient and its norm. (default: (0.98, 0.92, 0.99))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability.
+            (default: 1e-8)
+        weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0)
+        max_grad_norm (float, optional): value used to clip
+            global grad norm (default: 0.0 no clip)
+        no_prox (bool): how to perform the decoupled weight decay (default: False)
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
+                 weight_decay=0.0, max_grad_norm=0.0, no_prox=False):
+        if not 0.0 <= max_grad_norm:
+            raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm))
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= betas[2] < 1.0:
+            raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2]))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay,
+                        max_grad_norm=max_grad_norm, no_prox=no_prox)
+        super(Adan, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(Adan, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('no_prox', False)
+
+    @torch.no_grad()
+    def restart_opt(self):
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                if p.requires_grad:
+                    state = self.state[p]
+                    # State initialization
+
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p)
+                    # Exponential moving average of gradient difference
+                    state['exp_avg_diff'] = torch.zeros_like(p)
+
+    @torch.no_grad()
+    def step(self):
+        """
+        Performs a single optimization step.
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/CV/MAE/engine_finetune.py b/CV/MAE/engine_finetune.py new file mode 100644 index 0000000..3b0fcbd --- /dev/null +++ b/CV/MAE/engine_finetune.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import math +import sys +from typing import Iterable, Optional + +import torch + +from timm.data import Mixup +from timm.utils import accuracy + +import util.misc as misc +import util.lr_sched as lr_sched + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, + mixup_fn: Optional[Mixup] = None, log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + with torch.cuda.amp.autocast(): + outputs = model(samples) + loss = criterion(outputs, targets) + + loss_value = loss.item() + + + + loss /= accum_iter + loss_scaler(loss, optimizer, clip_grad=max_norm, + parameters=model.parameters(), create_graph=False, + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + min_lr = 10. + max_lr = 0. + for group in optimizer.param_groups: + min_lr = min(min_lr, group["lr"]) + max_lr = max(max_lr, group["lr"]) + + metric_logger.update(lr=max_lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if not math.isfinite(loss_value_reduce): + print("Loss is {}, stopping training".format(loss_value_reduce)) + sys.exit(1) + if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. 
+ """ + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', max_lr, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(data_loader, model, device): + criterion = torch.nn.CrossEntropyLoss() + + metric_logger = misc.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + + for batch in metric_logger.log_every(data_loader, 10, header): + images = batch[0] + target = batch[-1] + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + loss = criterion(output, target) + + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + batch_size = images.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} \ No newline at end of file diff --git a/CV/MAE/engine_pretrain.py b/CV/MAE/engine_pretrain.py new file mode 100644 index 0000000..8f41b63 --- /dev/null +++ b/CV/MAE/engine_pretrain.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import math +import sys +from typing import Iterable + +import torch + +import util.misc as misc +import util.lr_sched as lr_sched + + +def train_one_epoch(model: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (samples, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + samples = samples.to(device, non_blocking=True) + + with torch.cuda.amp.autocast(): + loss, _, _ = model(samples, mask_ratio=args.mask_ratio) + + loss_value = loss.item() + + + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if not math.isfinite(loss_value_reduce): + print("Loss is {}, stopping training".format(loss_value_reduce)) + sys.exit(1) + if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. 
+ """ + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} \ No newline at end of file diff --git a/CV/MAE/exp_results/MAE/base/log_base_ft.txt b/CV/MAE/exp_results/MAE/base/log_base_ft.txt new file mode 100644 index 0000000..10d975f --- /dev/null +++ b/CV/MAE/exp_results/MAE/base/log_base_ft.txt @@ -0,0 +1,100 @@ +{"train_lr": 0.00018719999999999997, "train_loss": 6.220443151950836, "test_loss": 3.146200124025345, "test_acc1": 38.718000028839114, "test_acc5": 68.56600002685546, "epoch": 0, "n_parameters": 86567656} +{"train_lr": 0.0005622000000000001, "train_loss": 4.947992331647873, "test_loss": 1.8401640093326568, "test_acc1": 59.50200001586914, "test_acc5": 84.50000004486084, "epoch": 1, "n_parameters": 86567656} +{"train_lr": 0.0009372000000000001, "train_loss": 4.558939405536652, "test_loss": 1.5222917068004609, "test_acc1": 65.57200004241943, "test_acc5": 88.05600001373291, "epoch": 2, "n_parameters": 86567656} +{"train_lr": 0.0013122000000000001, "train_loss": 4.343958680105209, "test_loss": 1.3651889091730118, "test_acc1": 68.48600001678467, "test_acc5": 89.68400004119873, "epoch": 3, "n_parameters": 86567656} +{"train_lr": 0.0016872, "train_loss": 4.213459821510315, "test_loss": 1.3220995950698853, "test_acc1": 70.23400000396728, "test_acc5": 90.71000001220703, "epoch": 4, "n_parameters": 86567656} +{"train_lr": 0.002062199999999999, "train_loss": 4.090291801166535, "test_loss": 1.228445138335228, "test_acc1": 71.64600002532958, "test_acc5": 91.52000002593994, "epoch": 5, "n_parameters": 86567656} +{"train_lr": 0.002437199999999999, "train_loss": 4.008170649766922, "test_loss": 1.185943089723587, "test_acc1": 72.97000003448487, "test_acc5": 91.96400001800536, "epoch": 6, "n_parameters": 86567656} +{"train_lr": 0.002812200000000001, "train_loss": 3.933375036430359, "test_loss": 1.1420067197084427, "test_acc1": 73.51400005615234, "test_acc5": 92.32200002044678, "epoch": 7, "n_parameters": 86567656} +{"train_lr": 0.0031871999999999994, "train_loss": 3.883904767179489, "test_loss": 1.154113737642765, "test_acc1": 74.26400002593994, "test_acc5": 92.74000003173828, "epoch": 8, "n_parameters": 86567656} +{"train_lr": 0.0035621999999999993, "train_loss": 3.82627286157608, "test_loss": 1.1308922132849692, "test_acc1": 74.69400002990723, "test_acc5": 93.01800005096436, "epoch": 9, "n_parameters": 86567656} +{"train_lr": 0.003937199999999998, "train_loss": 3.7571802374839782, "test_loss": 1.0861055210232735, "test_acc1": 75.23200000030518, "test_acc5": 93.20800005615234, "epoch": 10, "n_parameters": 86567656} +{"train_lr": 0.0043122, "train_loss": 3.7400177223205566, "test_loss": 1.0553014129400253, "test_acc1": 75.61800002456665, "test_acc5": 93.4340000265503, "epoch": 11, "n_parameters": 86567656} +{"train_lr": 0.004687200000000001, "train_loss": 3.6843111953258516, "test_loss": 1.0651562806963921, "test_acc1": 76.1400000088501, "test_acc5": 93.58800003448486, "epoch": 12, "n_parameters": 86567656} +{"train_lr": 0.0050622, "train_loss": 3.663368493080139, "test_loss": 1.0271986865997313, "test_acc1": 76.15800001403808, "test_acc5": 93.72600003509521, "epoch": 13, "n_parameters": 86567656} +{"train_lr": 0.0054372, "train_loss": 3.62431772274971, 
"test_loss": 1.0381141716241837, "test_acc1": 76.57800002990723, "test_acc5": 93.96000005645752, "epoch": 14, "n_parameters": 86567656} +{"train_lr": 0.0058122, "train_loss": 3.5911776705265046, "test_loss": 1.025875808596611, "test_acc1": 76.83400003021241, "test_acc5": 93.91400003479004, "epoch": 15, "n_parameters": 86567656} +{"train_lr": 0.0061872, "train_loss": 3.558628352546692, "test_loss": 1.0111908143758774, "test_acc1": 76.96200001129151, "test_acc5": 93.98400003753662, "epoch": 16, "n_parameters": 86567656} +{"train_lr": 0.006562199999999997, "train_loss": 3.543642453479767, "test_loss": 0.9803780218958855, "test_acc1": 77.42599999786377, "test_acc5": 94.18000003753662, "epoch": 17, "n_parameters": 86567656} +{"train_lr": 0.0069372, "train_loss": 3.5387943693637847, "test_loss": 0.9889326795935631, "test_acc1": 77.75000002258301, "test_acc5": 94.21000005096435, "epoch": 18, "n_parameters": 86567656} +{"train_lr": 0.007312200000000001, "train_loss": 3.499820264673233, "test_loss": 0.9561663055419922, "test_acc1": 77.62800001464844, "test_acc5": 94.35400003509521, "epoch": 19, "n_parameters": 86567656} +{"train_lr": 0.007687199999999997, "train_loss": 3.4993985069274904, "test_loss": 0.9616166499257087, "test_acc1": 77.71800002593994, "test_acc5": 94.36600001068115, "epoch": 20, "n_parameters": 86567656} +{"train_lr": 0.0080622, "train_loss": 3.460160034608841, "test_loss": 0.9682401439547539, "test_acc1": 78.13600002288818, "test_acc5": 94.4920000137329, "epoch": 21, "n_parameters": 86567656} +{"train_lr": 0.0084372, "train_loss": 3.4610034842967985, "test_loss": 0.9606137126684189, "test_acc1": 77.98400003875733, "test_acc5": 94.52200002197266, "epoch": 22, "n_parameters": 86567656} +{"train_lr": 0.008812199999999997, "train_loss": 3.436005702972412, "test_loss": 0.9578395664691925, "test_acc1": 78.23200000976563, "test_acc5": 94.63200002716064, "epoch": 23, "n_parameters": 86567656} +{"train_lr": 0.009187199999999998, "train_loss": 3.421230093240738, "test_loss": 0.9625651022791862, "test_acc1": 78.46600002075195, "test_acc5": 94.70200002471924, "epoch": 24, "n_parameters": 86567656} +{"train_lr": 0.009562199999999996, "train_loss": 3.4071589920043945, "test_loss": 0.9450574347376823, "test_acc1": 78.4380000125122, "test_acc5": 94.63400004058838, "epoch": 25, "n_parameters": 86567656} +{"train_lr": 0.009937199999999998, "train_loss": 3.3995270119667054, "test_loss": 0.9474635648727417, "test_acc1": 78.45400002380372, "test_acc5": 94.70600004058838, "epoch": 26, "n_parameters": 86567656} +{"train_lr": 0.010312199999999997, "train_loss": 3.3837591108322145, "test_loss": 0.9371258601546287, "test_acc1": 78.65400002624511, "test_acc5": 94.7400000402832, "epoch": 27, "n_parameters": 86567656} +{"train_lr": 0.010687199999999997, "train_loss": 3.370058294558525, "test_loss": 0.9240103733539581, "test_acc1": 78.72200001831055, "test_acc5": 94.82600001617432, "epoch": 28, "n_parameters": 86567656} +{"train_lr": 0.011062199999999998, "train_loss": 3.3644386556625365, "test_loss": 0.9514913991093635, "test_acc1": 78.53200000701904, "test_acc5": 94.69600005126954, "epoch": 29, "n_parameters": 86567656} +{"train_lr": 0.011437199999999996, "train_loss": 3.352064240050316, "test_loss": 0.9473175323009491, "test_acc1": 78.84600002838135, "test_acc5": 94.88000001922607, "epoch": 30, "n_parameters": 86567656} +{"train_lr": 0.011812200000000004, "train_loss": 3.343575291442871, "test_loss": 0.8956420955061912, "test_acc1": 78.94000000701904, "test_acc5": 95.00600002227783, "epoch": 31, 
"n_parameters": 86567656} +{"train_lr": 0.012187199999999997, "train_loss": 3.3357870742559435, "test_loss": 0.9404549324512481, "test_acc1": 78.7719999963379, "test_acc5": 94.98000003814697, "epoch": 32, "n_parameters": 86567656} +{"train_lr": 0.012562199999999997, "train_loss": 3.319555434727669, "test_loss": 0.9541194596886635, "test_acc1": 78.83600001220704, "test_acc5": 94.94200002471923, "epoch": 33, "n_parameters": 86567656} +{"train_lr": 0.012937200000000005, "train_loss": 3.31564322450161, "test_loss": 0.9260695472359657, "test_acc1": 79.08000001220704, "test_acc5": 94.95400003814697, "epoch": 34, "n_parameters": 86567656} +{"train_lr": 0.013312200000000005, "train_loss": 3.309744071626663, "test_loss": 0.920071712732315, "test_acc1": 79.07800001220703, "test_acc5": 94.88000004058838, "epoch": 35, "n_parameters": 86567656} +{"train_lr": 0.013687199999999998, "train_loss": 3.2949381719112396, "test_loss": 0.8983643808960915, "test_acc1": 79.15200002593994, "test_acc5": 95.06000004058838, "epoch": 36, "n_parameters": 86567656} +{"train_lr": 0.014062200000000004, "train_loss": 3.27849584107399, "test_loss": 0.9205039608478546, "test_acc1": 79.14400001251221, "test_acc5": 94.99400002471924, "epoch": 37, "n_parameters": 86567656} +{"train_lr": 0.014437199999999996, "train_loss": 3.2822228091716767, "test_loss": 0.8969993716478348, "test_acc1": 79.23400001312255, "test_acc5": 95.05600001953125, "epoch": 38, "n_parameters": 86567656} +{"train_lr": 0.0148122, "train_loss": 3.2667752093315126, "test_loss": 0.8877088937163353, "test_acc1": 79.5680000289917, "test_acc5": 94.98600002471923, "epoch": 39, "n_parameters": 86567656} +{"train_lr": 0.014996581744440903, "train_loss": 3.265050085401535, "test_loss": 0.8858077436685562, "test_acc1": 79.5340000366211, "test_acc5": 95.0980000491333, "epoch": 40, "n_parameters": 86567656} +{"train_lr": 0.014976050594237029, "train_loss": 3.2547211683750152, "test_loss": 0.8691368037462235, "test_acc1": 79.7200000289917, "test_acc5": 95.13200002441407, "epoch": 41, "n_parameters": 86567656} +{"train_lr": 0.014935028122712916, "train_loss": 3.241869815301895, "test_loss": 0.8789618426561355, "test_acc1": 79.56200002838135, "test_acc5": 95.1620000302124, "epoch": 42, "n_parameters": 86567656} +{"train_lr": 0.014873626769611584, "train_loss": 3.2315730389356614, "test_loss": 0.8627288854122162, "test_acc1": 79.916000050354, "test_acc5": 95.27200001678467, "epoch": 43, "n_parameters": 86567656} +{"train_lr": 0.014792014831773887, "train_loss": 3.202810504412651, "test_loss": 0.8483913645148278, "test_acc1": 80.12600003234863, "test_acc5": 95.39000004089355, "epoch": 44, "n_parameters": 86567656} +{"train_lr": 0.014690416001848693, "train_loss": 3.1993762041330336, "test_loss": 0.8779441103339195, "test_acc1": 79.98000003997802, "test_acc5": 95.40600001953125, "epoch": 45, "n_parameters": 86567656} +{"train_lr": 0.014569108755166755, "train_loss": 3.187324148273468, "test_loss": 0.8693245351314545, "test_acc1": 80.12200002899169, "test_acc5": 95.39200002746583, "epoch": 46, "n_parameters": 86567656} +{"train_lr": 0.014428425586459301, "train_loss": 3.1755990085601806, "test_loss": 0.8747495183348656, "test_acc1": 80.1980000241089, "test_acc5": 95.35000002197266, "epoch": 47, "n_parameters": 86567656} +{"train_lr": 0.014268752098512963, "train_loss": 3.155262738776207, "test_loss": 0.8533616551756859, "test_acc1": 80.38600003753662, "test_acc5": 95.45800000335693, "epoch": 48, "n_parameters": 86567656} +{"train_lr": 0.014090525945259457, "train_loss": 
3.1335597133159636, "test_loss": 0.8487207201123238, "test_acc1": 80.55600003723144, "test_acc5": 95.59800004394532, "epoch": 49, "n_parameters": 86567656} +{"train_lr": 0.013894235632196493, "train_loss": 3.1262916456222536, "test_loss": 0.8581038397550583, "test_acc1": 80.52800002960205, "test_acc5": 95.61000001708985, "epoch": 50, "n_parameters": 86567656} +{"train_lr": 0.013680419177428186, "train_loss": 3.1185608382940293, "test_loss": 0.8400468546152114, "test_acc1": 80.67600002471924, "test_acc5": 95.60000000610351, "epoch": 51, "n_parameters": 86567656} +{"train_lr": 0.01344966263699487, "train_loss": 3.1101703361988067, "test_loss": 0.8435578069090843, "test_acc1": 80.63000002166748, "test_acc5": 95.73200002227783, "epoch": 52, "n_parameters": 86567656} +{"train_lr": 0.01320259849853417, "train_loss": 3.096173421573639, "test_loss": 0.8371610799431801, "test_acc1": 81.05600002746581, "test_acc5": 95.72600002532958, "epoch": 53, "n_parameters": 86567656} +{"train_lr": 0.012939903947676419, "train_loss": 3.085701132774353, "test_loss": 0.8331937485933304, "test_acc1": 80.97600001831054, "test_acc5": 95.77200003051757, "epoch": 54, "n_parameters": 86567656} +{"train_lr": 0.012662299011925835, "train_loss": 3.0661963938474655, "test_loss": 0.8213660365343094, "test_acc1": 80.88600003204346, "test_acc5": 95.75800004089355, "epoch": 55, "n_parameters": 86567656} +{"train_lr": 0.012370544587115206, "train_loss": 3.057591044449806, "test_loss": 0.8165697306394577, "test_acc1": 81.3160000213623, "test_acc5": 95.83400003814697, "epoch": 56, "n_parameters": 86567656} +{"train_lr": 0.012065440351843355, "train_loss": 3.0463304943323135, "test_loss": 0.8060084617137909, "test_acc1": 81.3940000354004, "test_acc5": 95.85600003845215, "epoch": 57, "n_parameters": 86567656} +{"train_lr": 0.011747822575611631, "train_loss": 3.026911934018135, "test_loss": 0.7906639388203621, "test_acc1": 81.43200004058838, "test_acc5": 95.98400001434327, "epoch": 58, "n_parameters": 86567656} +{"train_lr": 0.011418561826667327, "train_loss": 3.0174565678834915, "test_loss": 0.800553865134716, "test_acc1": 81.60400002746582, "test_acc5": 95.92800002471924, "epoch": 59, "n_parameters": 86567656} +{"train_lr": 0.01107856058583667, "train_loss": 3.008225770974159, "test_loss": 0.7980207592248917, "test_acc1": 81.7600000326538, "test_acc5": 96.03000002502442, "epoch": 60, "n_parameters": 86567656} +{"train_lr": 0.010728750772887586, "train_loss": 2.9927734558582304, "test_loss": 0.8040242698788643, "test_acc1": 81.70800001647949, "test_acc5": 96.02200004943847, "epoch": 61, "n_parameters": 86567656} +{"train_lr": 0.01037009119220237, "train_loss": 2.9911838919639586, "test_loss": 0.7953862142562866, "test_acc1": 81.8280000354004, "test_acc5": 96.05800003082275, "epoch": 62, "n_parameters": 86567656} +{"train_lr": 0.010003564904761585, "train_loss": 2.9719505157470705, "test_loss": 0.7940907120704651, "test_acc1": 81.78600001922608, "test_acc5": 96.01400002502442, "epoch": 63, "n_parameters": 86567656} +{"train_lr": 0.009630176533642159, "train_loss": 2.963402523112297, "test_loss": 0.7831828370690346, "test_acc1": 81.97200001159668, "test_acc5": 96.0800000253296, "epoch": 64, "n_parameters": 86567656} +{"train_lr": 0.009250949510415432, "train_loss": 2.94107414290905, "test_loss": 0.7819095095992088, "test_acc1": 82.04200000030518, "test_acc5": 96.19600002777099, "epoch": 65, "n_parameters": 86567656} +{"train_lr": 0.008866923269992324, "train_loss": 2.9405784098625185, "test_loss": 0.7755044403672219, "test_acc1": 
82.14000001678467, "test_acc5": 96.21200004425049, "epoch": 66, "n_parameters": 86567656} +{"train_lr": 0.00847915040160449, "train_loss": 2.9315583733558657, "test_loss": 0.7770591515302658, "test_acc1": 82.12400002716065, "test_acc5": 96.27200004425049, "epoch": 67, "n_parameters": 86567656} +{"train_lr": 0.008088693763730424, "train_loss": 2.9024018899202346, "test_loss": 0.7656473967432976, "test_acc1": 82.40400001159668, "test_acc5": 96.2580000253296, "epoch": 68, "n_parameters": 86567656} +{"train_lr": 0.0076966235708742035, "train_loss": 2.89498575322628, "test_loss": 0.7626381632685661, "test_acc1": 82.49200003051757, "test_acc5": 96.27400003356934, "epoch": 69, "n_parameters": 86567656} +{"train_lr": 0.0073040144601820185, "train_loss": 2.864926402044296, "test_loss": 0.7629410058259964, "test_acc1": 82.72200000610351, "test_acc5": 96.35200002532959, "epoch": 70, "n_parameters": 86567656} +{"train_lr": 0.006911942545936348, "train_loss": 2.8626769891262054, "test_loss": 0.7633640518784524, "test_acc1": 82.480000027771, "test_acc5": 96.36400005249024, "epoch": 71, "n_parameters": 86567656} +{"train_lr": 0.006521482470001542, "train_loss": 2.8596914556503297, "test_loss": 0.7628568401932716, "test_acc1": 82.56400003570556, "test_acc5": 96.32000005767823, "epoch": 72, "n_parameters": 86567656} +{"train_lr": 0.006133704456305189, "train_loss": 2.8379484293699266, "test_loss": 0.7544870236515999, "test_acc1": 82.728000027771, "test_acc5": 96.4060000415039, "epoch": 73, "n_parameters": 86567656} +{"train_lr": 0.0057496713774287216, "train_loss": 2.825576261472702, "test_loss": 0.7530808946490288, "test_acc1": 82.57800004364013, "test_acc5": 96.29800003082275, "epoch": 74, "n_parameters": 86567656} +{"train_lr": 0.005370435841347517, "train_loss": 2.821367850232124, "test_loss": 0.7521498575806618, "test_acc1": 82.79800002227783, "test_acc5": 96.43200002807617, "epoch": 75, "n_parameters": 86567656} +{"train_lr": 0.004997037306305652, "train_loss": 2.8102516706705094, "test_loss": 0.7544213259220123, "test_acc1": 82.91400002807617, "test_acc5": 96.54000005767823, "epoch": 76, "n_parameters": 86567656} +{"train_lr": 0.004630499231733052, "train_loss": 2.7939453321933745, "test_loss": 0.7465361738204956, "test_acc1": 82.92000003570557, "test_acc5": 96.50400005523682, "epoch": 77, "n_parameters": 86567656} +{"train_lr": 0.004271826273014355, "train_loss": 2.7792876475334167, "test_loss": 0.7468091368675231, "test_acc1": 83.13600001190186, "test_acc5": 96.53200005767822, "epoch": 78, "n_parameters": 86567656} +{"train_lr": 0.003922001527798389, "train_loss": 2.7769425602436066, "test_loss": 0.7462005805969238, "test_acc1": 83.0420000515747, "test_acc5": 96.48400005249023, "epoch": 79, "n_parameters": 86567656} +{"train_lr": 0.0035819838413957477, "train_loss": 2.7580106742620467, "test_loss": 0.7476282814145088, "test_acc1": 83.30400002258301, "test_acc5": 96.52800004699706, "epoch": 80, "n_parameters": 86567656} +{"train_lr": 0.0032527051786505347, "train_loss": 2.755719056749344, "test_loss": 0.7485474190115928, "test_acc1": 83.2220000415039, "test_acc5": 96.54400005249023, "epoch": 81, "n_parameters": 86567656} +{"train_lr": 0.0029350680694894497, "train_loss": 2.733246625614166, "test_loss": 0.7376724031567573, "test_acc1": 83.25400003051757, "test_acc5": 96.54200004974365, "epoch": 82, "n_parameters": 86567656} +{"train_lr": 0.0026299431351500197, "train_loss": 2.725854627633095, "test_loss": 0.7386321437358856, "test_acc1": 83.2880000064087, "test_acc5": 96.5220000390625, "epoch": 83, 
"n_parameters": 86567656} +{"train_lr": 0.0023381667018682875, "train_loss": 2.730336236548424, "test_loss": 0.7432589226961136, "test_acc1": 83.51000001190185, "test_acc5": 96.53000004425049, "epoch": 84, "n_parameters": 86567656} +{"train_lr": 0.0020605385085667566, "train_loss": 2.7197771132946014, "test_loss": 0.7351065069437027, "test_acc1": 83.53600001495361, "test_acc5": 96.56200005523682, "epoch": 85, "n_parameters": 86567656} +{"train_lr": 0.0017978195148255654, "train_loss": 2.703706868839264, "test_loss": 0.7340468415617942, "test_acc1": 83.50800002014161, "test_acc5": 96.56400005523682, "epoch": 86, "n_parameters": 86567656} +{"train_lr": 0.0015507298151451832, "train_loss": 2.7080028397798537, "test_loss": 0.7307805678248406, "test_acc1": 83.53800004180908, "test_acc5": 96.64000005249024, "epoch": 87, "n_parameters": 86567656} +{"train_lr": 0.0013199466652174393, "train_loss": 2.700676307797432, "test_loss": 0.7344949060678482, "test_acc1": 83.53000002807617, "test_acc5": 96.63800004699706, "epoch": 88, "n_parameters": 86567656} +{"train_lr": 0.0011061026256147478, "train_loss": 2.689790417647362, "test_loss": 0.7330510130524636, "test_acc1": 83.54800002838135, "test_acc5": 96.6440000390625, "epoch": 89, "n_parameters": 86567656} +{"train_lr": 0.0009097838279855404, "train_loss": 2.6793027224302293, "test_loss": 0.7295243856310845, "test_acc1": 83.56600002258301, "test_acc5": 96.64400005523682, "epoch": 90, "n_parameters": 86567656} +{"train_lr": 0.0007315283685081682, "train_loss": 2.678568696594238, "test_loss": 0.7311912828683853, "test_acc1": 83.71000003631592, "test_acc5": 96.63600004180908, "epoch": 91, "n_parameters": 86567656} +{"train_lr": 0.0005718248330066727, "train_loss": 2.6540179803133013, "test_loss": 0.7337997442483902, "test_acc1": 83.62800002532958, "test_acc5": 96.64800004974366, "epoch": 92, "n_parameters": 86567656} +{"train_lr": 0.00043111095777100694, "train_loss": 2.6582588331222534, "test_loss": 0.7270043037831784, "test_acc1": 83.62800000091553, "test_acc5": 96.68400004974366, "epoch": 93, "n_parameters": 86567656} +{"train_lr": 0.0003097724297522902, "train_loss": 2.6649514198064805, "test_loss": 0.7313019317388535, "test_acc1": 83.72400000915528, "test_acc5": 96.64400004974365, "epoch": 94, "n_parameters": 86567656} +{"train_lr": 0.0002081418294216848, "train_loss": 2.6643570078611374, "test_loss": 0.7312484115362168, "test_acc1": 83.72200002807617, "test_acc5": 96.63200004974365, "epoch": 95, "n_parameters": 86567656} +{"train_lr": 0.00012649771919044005, "train_loss": 2.6634057205677033, "test_loss": 0.7318920171260834, "test_acc1": 83.75600001708985, "test_acc5": 96.65400004699707, "epoch": 96, "n_parameters": 86567656} +{"train_lr": 6.506387988968701e-05, "train_loss": 2.6628331270217895, "test_loss": 0.7311200454831124, "test_acc1": 83.77000001983643, "test_acc5": 96.64400004699706, "epoch": 97, "n_parameters": 86567656} +{"train_lr": 2.4008697402735765e-05, "train_loss": 2.6537726917743685, "test_loss": 0.7295740690827369, "test_acc1": 83.79800002258301, "test_acc5": 96.67800005249024, "epoch": 98, "n_parameters": 86567656} +{"train_lr": 3.4447011310720205e-06, "train_loss": 2.6564826879262924, "test_loss": 0.7303899875283242, "test_acc1": 83.80000002258301, "test_acc5": 96.79000005249023, "epoch": 99, "n_parameters": 86567656} diff --git a/CV/MAE/exp_results/MAE/base/log_base_pretrain.txt b/CV/MAE/exp_results/MAE/base/log_base_pretrain.txt new file mode 100644 index 0000000..aa05f86 --- /dev/null +++ 
b/CV/MAE/exp_results/MAE/base/log_base_pretrain.txt @@ -0,0 +1,800 @@ +{"train_lr": 2.4920064e-05, "train_loss": 1.029866009569168, "epoch": 0} +{"train_lr": 7.492006399999998e-05, "train_loss": 0.826913864672184, "epoch": 1} +{"train_lr": 0.00012492006400000006, "train_loss": 0.7834360855937004, "epoch": 2} +{"train_lr": 0.00017492006399999992, "train_loss": 0.7394427828550338, "epoch": 3} +{"train_lr": 0.00022492006399999994, "train_loss": 0.7072458884239197, "epoch": 4} +{"train_lr": 0.0002749200639999999, "train_loss": 0.6952311094760895, "epoch": 5} +{"train_lr": 0.00032492006400000004, "train_loss": 0.6914466771841049, "epoch": 6} +{"train_lr": 0.00037492006400000006, "train_loss": 0.6886494241833687, "epoch": 7} +{"train_lr": 0.00042492006399999987, "train_loss": 0.6822491272926331, "epoch": 8} +{"train_lr": 0.00047492006400000044, "train_loss": 0.6565844881653786, "epoch": 9} +{"train_lr": 0.0005249200640000002, "train_loss": 0.6322732063770294, "epoch": 10} +{"train_lr": 0.0005749200639999999, "train_loss": 0.6173936517953873, "epoch": 11} +{"train_lr": 0.0006249200639999998, "train_loss": 0.5982944983005524, "epoch": 12} +{"train_lr": 0.0006749200640000005, "train_loss": 0.5760513068437576, "epoch": 13} +{"train_lr": 0.000724920064, "train_loss": 0.558085383951664, "epoch": 14} +{"train_lr": 0.0007749200639999998, "train_loss": 0.5438177249908447, "epoch": 15} +{"train_lr": 0.0008249200639999997, "train_loss": 0.5321405304253102, "epoch": 16} +{"train_lr": 0.0008749200639999996, "train_loss": 0.5226460194289684, "epoch": 17} +{"train_lr": 0.0009249200639999996, "train_loss": 0.5143405359089375, "epoch": 18} +{"train_lr": 0.0009749200640000003, "train_loss": 0.5078899907290936, "epoch": 19} +{"train_lr": 0.0010249200639999999, "train_loss": 0.5023593332231044, "epoch": 20} +{"train_lr": 0.0010749200640000002, "train_loss": 0.4975539141893387, "epoch": 21} +{"train_lr": 0.0011249200639999997, "train_loss": 0.49327956531047823, "epoch": 22} +{"train_lr": 0.001174920064000001, "train_loss": 0.48933574736714364, "epoch": 23} +{"train_lr": 0.001224920064, "train_loss": 0.4860016059339046, "epoch": 24} +{"train_lr": 0.0012749200639999999, "train_loss": 0.48271030520796776, "epoch": 25} +{"train_lr": 0.0013249200639999996, "train_loss": 0.48013067763447764, "epoch": 26} +{"train_lr": 0.0013749200639999988, "train_loss": 0.47759121404886246, "epoch": 27} +{"train_lr": 0.001424920064, "train_loss": 0.4752511966407299, "epoch": 28} +{"train_lr": 0.0014749200639999995, "train_loss": 0.47316404255628586, "epoch": 29} +{"train_lr": 0.001524920064000001, "train_loss": 0.4711698636054993, "epoch": 30} +{"train_lr": 0.001574920064, "train_loss": 0.4693278255581856, "epoch": 31} +{"train_lr": 0.0016249200640000012, "train_loss": 0.46783283928632735, "epoch": 32} +{"train_lr": 0.0016749200639999998, "train_loss": 0.46624995667934416, "epoch": 33} +{"train_lr": 0.0017249200640000002, "train_loss": 0.46449592822194097, "epoch": 34} +{"train_lr": 0.0017749200639999995, "train_loss": 0.46306066621541975, "epoch": 35} +{"train_lr": 0.0018249200639999994, "train_loss": 0.4618008250772953, "epoch": 36} +{"train_lr": 0.0018749200640000001, "train_loss": 0.46067862812876703, "epoch": 37} +{"train_lr": 0.0019249200639999994, "train_loss": 0.4598328410089016, "epoch": 38} +{"train_lr": 0.001974920064, "train_loss": 0.4584417822599411, "epoch": 39} +{"train_lr": 0.0019999971657724378, "train_loss": 0.4574848892211914, "epoch": 40} +{"train_lr": 0.001999980105982979, "train_loss": 0.45612444841861727, "epoch": 
41} +{"train_lr": 0.0019999459593779763, "train_loss": 0.4549763072431087, "epoch": 42} +{"train_lr": 0.001999894726540914, "train_loss": 0.4538106670200825, "epoch": 43} +{"train_lr": 0.00199982640834721, "train_loss": 0.45279076434373855, "epoch": 44} +{"train_lr": 0.0019997410059642406, "train_loss": 0.45213532138466833, "epoch": 45} +{"train_lr": 0.001999638520851299, "train_loss": 0.45124949448108675, "epoch": 46} +{"train_lr": 0.0019995189547595644, "train_loss": 0.45040599479079246, "epoch": 47} +{"train_lr": 0.0019993823097320955, "train_loss": 0.44934970703125, "epoch": 48} +{"train_lr": 0.00199922858810378, "train_loss": 0.44850586191415787, "epoch": 49} +{"train_lr": 0.001999057792501294, "train_loss": 0.44782372673153875, "epoch": 50} +{"train_lr": 0.0019988699258430644, "train_loss": 0.4471781154215336, "epoch": 51} +{"train_lr": 0.001998664991339216, "train_loss": 0.44638750190138815, "epoch": 52} +{"train_lr": 0.001998442992491514, "train_loss": 0.44575022593736646, "epoch": 53} +{"train_lr": 0.001998203933093299, "train_loss": 0.4452553203165531, "epoch": 54} +{"train_lr": 0.0019979478172294477, "train_loss": 0.4447741946578026, "epoch": 55} +{"train_lr": 0.0019976746492762656, "train_loss": 0.4442489013373852, "epoch": 56} +{"train_lr": 0.001997384433901443, "train_loss": 0.4437332093179226, "epoch": 57} +{"train_lr": 0.0019970771760639545, "train_loss": 0.4431314005434513, "epoch": 58} +{"train_lr": 0.001996752881013995, "train_loss": 0.44259421687722206, "epoch": 59} +{"train_lr": 0.0019964115542928738, "train_loss": 0.4422669967353344, "epoch": 60} +{"train_lr": 0.0019960532017329145, "train_loss": 0.4417998247206211, "epoch": 61} +{"train_lr": 0.0019956778294573777, "train_loss": 0.44136346296072004, "epoch": 62} +{"train_lr": 0.0019952854438803442, "train_loss": 0.44111803106069564, "epoch": 63} +{"train_lr": 0.0019948760517065917, "train_loss": 0.44124629287719724, "epoch": 64} +{"train_lr": 0.001994449659931513, "train_loss": 0.44030142896175384, "epoch": 65} +{"train_lr": 0.001994006275840954, "train_loss": 0.4400160102546215, "epoch": 66} +{"train_lr": 0.001993545907011146, "train_loss": 0.4394983403027058, "epoch": 67} +{"train_lr": 0.0019930685613085016, "train_loss": 0.43934193002581595, "epoch": 68} +{"train_lr": 0.001992574246889553, "train_loss": 0.43939045332074167, "epoch": 69} +{"train_lr": 0.0019920629722007623, "train_loss": 0.43876420689821244, "epoch": 70} +{"train_lr": 0.001991534745978413, "train_loss": 0.4383896221101284, "epoch": 71} +{"train_lr": 0.00199098957724843, "train_loss": 0.43805962885022165, "epoch": 72} +{"train_lr": 0.001990427475326234, "train_loss": 0.4378157087266445, "epoch": 73} +{"train_lr": 0.0019898484498166082, "train_loss": 0.4375223469555378, "epoch": 74} +{"train_lr": 0.001989252510613496, "train_loss": 0.4371592809557915, "epoch": 75} +{"train_lr": 0.0019886396678998547, "train_loss": 0.43709583897590637, "epoch": 76} +{"train_lr": 0.001988009932147472, "train_loss": 0.43670710064172746, "epoch": 77} +{"train_lr": 0.0019873633141167983, "train_loss": 0.4365320971310139, "epoch": 78} +{"train_lr": 0.0019866998248567525, "train_loss": 0.4362802542924881, "epoch": 79} +{"train_lr": 0.0019860194757045316, "train_loss": 0.43601478363275525, "epoch": 80} +{"train_lr": 0.001985322278285424, "train_loss": 0.4358106957912445, "epoch": 81} +{"train_lr": 0.001984608244512617, "train_loss": 0.43634837040901187, "epoch": 82} +{"train_lr": 0.0019838773865869753, "train_loss": 0.4354546032965183, "epoch": 83} +{"train_lr": 
0.001983129716996845, "train_loss": 0.43514904779195784, "epoch": 84} +{"train_lr": 0.0019823652485178316, "train_loss": 0.4349977902054787, "epoch": 85} +{"train_lr": 0.0019815839942125928, "train_loss": 0.4347984156310558, "epoch": 86} +{"train_lr": 0.001980785967430611, "train_loss": 0.4346069442749023, "epoch": 87} +{"train_lr": 0.001979971181807968, "train_loss": 0.43437728793025016, "epoch": 88} +{"train_lr": 0.0019791396512670954, "train_loss": 0.43415831446647646, "epoch": 89} +{"train_lr": 0.0019782913900165513, "train_loss": 0.43396707623004915, "epoch": 90} +{"train_lr": 0.001977426412550794, "train_loss": 0.43384554549455645, "epoch": 91} +{"train_lr": 0.0019765447336498893, "train_loss": 0.43357861334681513, "epoch": 92} +{"train_lr": 0.0019756463683793042, "train_loss": 0.43340606517791747, "epoch": 93} +{"train_lr": 0.0019747313320896127, "train_loss": 0.43321831868886945, "epoch": 94} +{"train_lr": 0.001973799640416274, "train_loss": 0.43303715973496437, "epoch": 95} +{"train_lr": 0.001972851309279318, "train_loss": 0.4329533623635769, "epoch": 96} +{"train_lr": 0.001971886354883114, "train_loss": 0.4327393824696541, "epoch": 97} +{"train_lr": 0.0019709047937160624, "train_loss": 0.4326424191534519, "epoch": 98} +{"train_lr": 0.00196990664255034, "train_loss": 0.432473964703083, "epoch": 99} +{"train_lr": 0.001968891918441605, "train_loss": 0.4323611004710197, "epoch": 100} +{"train_lr": 0.0019678606387286746, "train_loss": 0.43220903632044794, "epoch": 101} +{"train_lr": 0.0019668128210332835, "train_loss": 0.4320597696781158, "epoch": 102} +{"train_lr": 0.001965748483259745, "train_loss": 0.431807940363884, "epoch": 103} +{"train_lr": 0.0019646676435946544, "train_loss": 0.43166343091726306, "epoch": 104} +{"train_lr": 0.0019635703205065857, "train_loss": 0.43165171412825587, "epoch": 105} +{"train_lr": 0.001962456532745752, "train_loss": 0.4314692829966545, "epoch": 106} +{"train_lr": 0.001961326299343718, "train_loss": 0.43120107041597366, "epoch": 107} +{"train_lr": 0.0019601796396130477, "train_loss": 0.43122232078313827, "epoch": 108} +{"train_lr": 0.001959016573147011, "train_loss": 0.43111382276415827, "epoch": 109} +{"train_lr": 0.001957837119819182, "train_loss": 0.43085430263876917, "epoch": 110} +{"train_lr": 0.0019566412997831803, "train_loss": 0.43075852123498914, "epoch": 111} +{"train_lr": 0.0019554291334722604, "train_loss": 0.4307229078769684, "epoch": 112} +{"train_lr": 0.001954200641599004, "train_loss": 0.43064907240271566, "epoch": 113} +{"train_lr": 0.001952955845154955, "train_loss": 0.4303462516546249, "epoch": 114} +{"train_lr": 0.0019516947654102353, "train_loss": 0.430285882461071, "epoch": 115} +{"train_lr": 0.0019504174239132258, "train_loss": 0.4302071396172047, "epoch": 116} +{"train_lr": 0.0019491238424901728, "train_loss": 0.43007939971089365, "epoch": 117} +{"train_lr": 0.0019478140432448074, "train_loss": 0.42995246585607527, "epoch": 118} +{"train_lr": 0.0019464880485579859, "train_loss": 0.4299236563742161, "epoch": 119} +{"train_lr": 0.0019451458810873046, "train_loss": 0.42975575862526894, "epoch": 120} +{"train_lr": 0.0019437875637666987, "train_loss": 0.4296689044058323, "epoch": 121} +{"train_lr": 0.0019424131198060568, "train_loss": 0.4295571488142014, "epoch": 122} +{"train_lr": 0.001941022572690844, "train_loss": 0.4295304506480694, "epoch": 123} +{"train_lr": 0.0019396159461816677, "train_loss": 0.4295160254955292, "epoch": 124} +{"train_lr": 0.0019381932643138978, "train_loss": 0.4293148836731911, "epoch": 125} +{"train_lr": 
0.0019367545513972342, "train_loss": 0.4292679689407349, "epoch": 126} +{"train_lr": 0.0019352998320153279, "train_loss": 0.42902680062055587, "epoch": 127} +{"train_lr": 0.0019338291310252987, "train_loss": 0.4290702934384346, "epoch": 128} +{"train_lr": 0.001932342473557387, "train_loss": 0.4288153325974941, "epoch": 129} +{"train_lr": 0.0019308398850144532, "train_loss": 0.4288082259654999, "epoch": 130} +{"train_lr": 0.0019293213910715973, "train_loss": 0.4287457905650139, "epoch": 131} +{"train_lr": 0.0019277870176756878, "train_loss": 0.42868427852988245, "epoch": 132} +{"train_lr": 0.0019262367910449316, "train_loss": 0.4285197732448578, "epoch": 133} +{"train_lr": 0.0019246707376684355, "train_loss": 0.4284260827243328, "epoch": 134} +{"train_lr": 0.0019230888843057212, "train_loss": 0.42834890897870065, "epoch": 135} +{"train_lr": 0.001921491257986288, "train_loss": 0.42836915620565413, "epoch": 136} +{"train_lr": 0.001919877886009182, "train_loss": 0.4281206827223301, "epoch": 137} +{"train_lr": 0.0019182487959424652, "train_loss": 0.4281239497900009, "epoch": 138} +{"train_lr": 0.0019166040156227992, "train_loss": 0.4281063589513302, "epoch": 139} +{"train_lr": 0.0019149435731549388, "train_loss": 0.42810667996406554, "epoch": 140} +{"train_lr": 0.001913267496911266, "train_loss": 0.42796390196084977, "epoch": 141} +{"train_lr": 0.001911575815531295, "train_loss": 0.4278896269261837, "epoch": 142} +{"train_lr": 0.001909868557921197, "train_loss": 0.42768696791529653, "epoch": 143} +{"train_lr": 0.0019081457532532941, "train_loss": 0.42768659583330154, "epoch": 144} +{"train_lr": 0.0019064074309655585, "train_loss": 0.42750629413723945, "epoch": 145} +{"train_lr": 0.0019046536207611357, "train_loss": 0.42743830469250677, "epoch": 146} +{"train_lr": 0.0019028843526077868, "train_loss": 0.42752327723503114, "epoch": 147} +{"train_lr": 0.0019010996567374366, "train_loss": 0.42736736696958544, "epoch": 148} +{"train_lr": 0.0018992995636456075, "train_loss": 0.4276691595375538, "epoch": 149} +{"train_lr": 0.0018974841040909193, "train_loss": 0.4272169639468193, "epoch": 150} +{"train_lr": 0.0018956533090945722, "train_loss": 0.4271261396050453, "epoch": 151} +{"train_lr": 0.0018938072099398014, "train_loss": 0.42715407326221466, "epoch": 152} +{"train_lr": 0.0018919458381713458, "train_loss": 0.4283799661755562, "epoch": 153} +{"train_lr": 0.0018900692255949196, "train_loss": 0.4275367811322212, "epoch": 154} +{"train_lr": 0.0018881774042766428, "train_loss": 0.42704309683442115, "epoch": 155} +{"train_lr": 0.0018862704065425273, "train_loss": 0.4270126509964466, "epoch": 156} +{"train_lr": 0.0018843482649779047, "train_loss": 0.42692647844552994, "epoch": 157} +{"train_lr": 0.0018824110124268732, "train_loss": 0.42698997198939326, "epoch": 158} +{"train_lr": 0.0018804586819917309, "train_loss": 0.42690860251784324, "epoch": 159} +{"train_lr": 0.0018784913070324137, "train_loss": 0.42671867433190347, "epoch": 160} +{"train_lr": 0.0018765089211659387, "train_loss": 0.4265693518280983, "epoch": 161} +{"train_lr": 0.0018745115582658063, "train_loss": 0.42649883098006247, "epoch": 162} +{"train_lr": 0.0018724992524614336, "train_loss": 0.42644061017632484, "epoch": 163} +{"train_lr": 0.0018704720381375868, "train_loss": 0.42639319202899933, "epoch": 164} +{"train_lr": 0.0018684299499337567, "train_loss": 0.4262290573775768, "epoch": 165} +{"train_lr": 0.0018663730227436023, "train_loss": 0.42622224123477936, "epoch": 166} +{"train_lr": 0.0018643012917143186, "train_loss": 
0.4262301009774208, "epoch": 167} +{"train_lr": 0.0018622147922460915, "train_loss": 0.426052722042799, "epoch": 168} +{"train_lr": 0.0018601135599914326, "train_loss": 0.42615303208231925, "epoch": 169} +{"train_lr": 0.0018579976308546009, "train_loss": 0.4259471821427345, "epoch": 170} +{"train_lr": 0.0018558670409909955, "train_loss": 0.42602315420508385, "epoch": 171} +{"train_lr": 0.001853721826806509, "train_loss": 0.4258790579974651, "epoch": 172} +{"train_lr": 0.001851562024956937, "train_loss": 0.4258593296408653, "epoch": 173} +{"train_lr": 0.0018493876723473352, "train_loss": 0.42584737250208854, "epoch": 174} +{"train_lr": 0.0018471988061313895, "train_loss": 0.4257563955247402, "epoch": 175} +{"train_lr": 0.001844995463710784, "train_loss": 0.42564952899217606, "epoch": 176} +{"train_lr": 0.0018427776827345638, "train_loss": 0.425537062728405, "epoch": 177} +{"train_lr": 0.0018405455010984842, "train_loss": 0.42545911307930945, "epoch": 178} +{"train_lr": 0.0018382989569443692, "train_loss": 0.42556525562405584, "epoch": 179} +{"train_lr": 0.0018360380886594638, "train_loss": 0.4254311235845089, "epoch": 180} +{"train_lr": 0.0018337629348757645, "train_loss": 0.4255017030119896, "epoch": 181} +{"train_lr": 0.0018314735344693734, "train_loss": 0.42531982975006105, "epoch": 182} +{"train_lr": 0.0018291699265598315, "train_loss": 0.4253919682562351, "epoch": 183} +{"train_lr": 0.0018268521505094332, "train_loss": 0.42532408665418625, "epoch": 184} +{"train_lr": 0.0018245202459225905, "train_loss": 0.4252872289419174, "epoch": 185} +{"train_lr": 0.0018221742526451034, "train_loss": 0.42515819770097735, "epoch": 186} +{"train_lr": 0.0018198142107635346, "train_loss": 0.4251448391377926, "epoch": 187} +{"train_lr": 0.001817440160604478, "train_loss": 0.4251269141793251, "epoch": 188} +{"train_lr": 0.001815052142733913, "train_loss": 0.42496180029511454, "epoch": 189} +{"train_lr": 0.001812650197956469, "train_loss": 0.4249068469822407, "epoch": 190} +{"train_lr": 0.001810234367314759, "train_loss": 0.4250109994530678, "epoch": 191} +{"train_lr": 0.001807804692088656, "train_loss": 0.424981004846096, "epoch": 192} +{"train_lr": 0.0018053612137946117, "train_loss": 0.4247819488167763, "epoch": 193} +{"train_lr": 0.00180290397418492, "train_loss": 0.4246981884419918, "epoch": 194} +{"train_lr": 0.0018004330152470427, "train_loss": 0.4247443710744381, "epoch": 195} +{"train_lr": 0.001797948379202839, "train_loss": 0.42464397926330566, "epoch": 196} +{"train_lr": 0.001795450108507886, "train_loss": 0.4248728358566761, "epoch": 197} +{"train_lr": 0.0017929382458507345, "train_loss": 0.42464299993515014, "epoch": 198} +{"train_lr": 0.001790412834152188, "train_loss": 0.42458462185263635, "epoch": 199} +{"train_lr": 0.0017878739165645666, "train_loss": 0.4244833302080631, "epoch": 200} +{"train_lr": 0.0017853215364709624, "train_loss": 0.4244596959531307, "epoch": 201} +{"train_lr": 0.001782755737484517, "train_loss": 0.4243732154786587, "epoch": 202} +{"train_lr": 0.0017801765634476482, "train_loss": 0.42429113371372223, "epoch": 203} +{"train_lr": 0.0017775840584313269, "train_loss": 0.4243124633014202, "epoch": 204} +{"train_lr": 0.0017749782667343087, "train_loss": 0.4242242727458477, "epoch": 205} +{"train_lr": 0.0017723592328823872, "train_loss": 0.42419893629550937, "epoch": 206} +{"train_lr": 0.0017697270016276267, "train_loss": 0.424120762860775, "epoch": 207} +{"train_lr": 0.0017670816179475896, "train_loss": 0.4241485097467899, "epoch": 208} +{"train_lr": 0.0017644231270445914, 
"train_loss": 0.4241050954818726, "epoch": 209} +{"train_lr": 0.0017617515743449002, "train_loss": 0.42401950508356095, "epoch": 210} +{"train_lr": 0.0017590670054979855, "train_loss": 0.4240114216029644, "epoch": 211} +{"train_lr": 0.0017563694663757193, "train_loss": 0.4239511508405209, "epoch": 212} +{"train_lr": 0.001753659003071607, "train_loss": 0.4239001268327236, "epoch": 213} +{"train_lr": 0.0017509356618999798, "train_loss": 0.4238909521043301, "epoch": 214} +{"train_lr": 0.0017481994893952333, "train_loss": 0.42393678986430167, "epoch": 215} +{"train_lr": 0.0017454505323109951, "train_loss": 0.42374238679409026, "epoch": 216} +{"train_lr": 0.0017426888376193663, "train_loss": 0.42381820154190064, "epoch": 217} +{"train_lr": 0.0017399144525100897, "train_loss": 0.423696692097187, "epoch": 218} +{"train_lr": 0.0017371274243897503, "train_loss": 0.4236421609342098, "epoch": 219} +{"train_lr": 0.0017343278008809635, "train_loss": 0.42375850692987443, "epoch": 220} +{"train_lr": 0.0017315156298215765, "train_loss": 0.42361214114427564, "epoch": 221} +{"train_lr": 0.0017286909592638356, "train_loss": 0.423529484629631, "epoch": 222} +{"train_lr": 0.001725853837473557, "train_loss": 0.4241306705236435, "epoch": 223} +{"train_lr": 0.001723004312929336, "train_loss": 0.4236619794726372, "epoch": 224} +{"train_lr": 0.0017201424343216843, "train_loss": 0.423469975990057, "epoch": 225} +{"train_lr": 0.001717268250552199, "train_loss": 0.4235054041683674, "epoch": 226} +{"train_lr": 0.0017143818107327635, "train_loss": 0.4234480388879776, "epoch": 227} +{"train_lr": 0.001711483164184661, "train_loss": 0.42325710557699203, "epoch": 228} +{"train_lr": 0.0017085723604377695, "train_loss": 0.423364332896471, "epoch": 229} +{"train_lr": 0.001705649449229696, "train_loss": 0.4232913333415985, "epoch": 230} +{"train_lr": 0.0017027144805049166, "train_loss": 0.42325079972147944, "epoch": 231} +{"train_lr": 0.0016997675044139638, "train_loss": 0.4232346039891243, "epoch": 232} +{"train_lr": 0.0016968085713125144, "train_loss": 0.42314266840815545, "epoch": 233} +{"train_lr": 0.001693837731760583, "train_loss": 0.4230865432739258, "epoch": 234} +{"train_lr": 0.001690855036521616, "train_loss": 0.42304754146933554, "epoch": 235} +{"train_lr": 0.0016878605365616413, "train_loss": 0.4229908716440201, "epoch": 236} +{"train_lr": 0.0016848542830484078, "train_loss": 0.42301636381149293, "epoch": 237} +{"train_lr": 0.0016818363273504887, "train_loss": 0.4229623642385006, "epoch": 238} +{"train_lr": 0.0016788067210364202, "train_loss": 0.42286261225342753, "epoch": 239} +{"train_lr": 0.0016757655158738203, "train_loss": 0.4231466094911098, "epoch": 240} +{"train_lr": 0.0016727127638284855, "train_loss": 0.4228993058741093, "epoch": 241} +{"train_lr": 0.0016696485170635351, "train_loss": 0.4227703313648701, "epoch": 242} +{"train_lr": 0.001666572827938487, "train_loss": 0.42268667768239976, "epoch": 243} +{"train_lr": 0.0016634857490083828, "train_loss": 0.4226205878555775, "epoch": 244} +{"train_lr": 0.001660387333022884, "train_loss": 0.4228058986365795, "epoch": 245} +{"train_lr": 0.0016572776329253699, "train_loss": 0.4226573086321354, "epoch": 246} +{"train_lr": 0.0016541567018520343, "train_loss": 0.4226382351756096, "epoch": 247} +{"train_lr": 0.0016510245931309836, "train_loss": 0.42262957600951195, "epoch": 248} +{"train_lr": 0.001647881360281309, "train_loss": 0.42256055372953416, "epoch": 249} +{"train_lr": 0.0016447270570121876, "train_loss": 0.42256660661697387, "epoch": 250} +{"train_lr": 
0.0016415617372219618, "train_loss": 0.4224966368377209, "epoch": 251} +{"train_lr": 0.001638385454997211, "train_loss": 0.422463566839695, "epoch": 252} +{"train_lr": 0.00163519826461184, "train_loss": 0.42237583945393564, "epoch": 253} +{"train_lr": 0.0016320002205261264, "train_loss": 0.42226354267001154, "epoch": 254} +{"train_lr": 0.0016287913773858353, "train_loss": 0.4222660710632801, "epoch": 255} +{"train_lr": 0.0016255717900212328, "train_loss": 0.4222601546764374, "epoch": 256} +{"train_lr": 0.0016223415134461888, "train_loss": 0.4221981988191605, "epoch": 257} +{"train_lr": 0.0016191006028572102, "train_loss": 0.4222234422802925, "epoch": 258} +{"train_lr": 0.0016158491136325235, "train_loss": 0.42203234511613846, "epoch": 259} +{"train_lr": 0.0016125871013311073, "train_loss": 0.4220574823975563, "epoch": 260} +{"train_lr": 0.0016093146216917486, "train_loss": 0.42207971769571306, "epoch": 261} +{"train_lr": 0.0016060317306321, "train_loss": 0.42206390278339384, "epoch": 262} +{"train_lr": 0.0016027384842477105, "train_loss": 0.4220549532175064, "epoch": 263} +{"train_lr": 0.0015994349388110693, "train_loss": 0.4220111142575741, "epoch": 264} +{"train_lr": 0.001596121150770662, "train_loss": 0.4219424910187721, "epoch": 265} +{"train_lr": 0.0015927971767499772, "train_loss": 0.4220361890375614, "epoch": 266} +{"train_lr": 0.0015894630735465585, "train_loss": 0.4218697710752487, "epoch": 267} +{"train_lr": 0.001586118898131038, "train_loss": 0.42191931760907175, "epoch": 268} +{"train_lr": 0.0015827647076461402, "train_loss": 0.4217308155596256, "epoch": 269} +{"train_lr": 0.0015794005594057226, "train_loss": 0.4218587208151817, "epoch": 270} +{"train_lr": 0.0015760265108938055, "train_loss": 0.4217528022646904, "epoch": 271} +{"train_lr": 0.001572642619763563, "train_loss": 0.421679973757267, "epoch": 272} +{"train_lr": 0.0015692489438363627, "train_loss": 0.4217708421468735, "epoch": 273} +{"train_lr": 0.001565845541100755, "train_loss": 0.4216467033326626, "epoch": 274} +{"train_lr": 0.001562432469711511, "train_loss": 0.42169866006374357, "epoch": 275} +{"train_lr": 0.0015590097879886, "train_loss": 0.42156042192578314, "epoch": 276} +{"train_lr": 0.001555577554416206, "train_loss": 0.42161925470232964, "epoch": 277} +{"train_lr": 0.0015521358276417347, "train_loss": 0.42153908587694167, "epoch": 278} +{"train_lr": 0.0015486846664748033, "train_loss": 0.4215101927101612, "epoch": 279} +{"train_lr": 0.0015452241298862248, "train_loss": 0.4214610160768032, "epoch": 280} +{"train_lr": 0.0015417542770070323, "train_loss": 0.42141325249671935, "epoch": 281} +{"train_lr": 0.0015382751671274308, "train_loss": 0.42136881043314933, "epoch": 282} +{"train_lr": 0.0015347868596958091, "train_loss": 0.42135062956213953, "epoch": 283} +{"train_lr": 0.0015312894143177202, "train_loss": 0.42139784327149393, "epoch": 284} +{"train_lr": 0.0015277828907548521, "train_loss": 0.4214449079275131, "epoch": 285} +{"train_lr": 0.001524267348924025, "train_loss": 0.4213422214746475, "epoch": 286} +{"train_lr": 0.0015207428488961414, "train_loss": 0.42129106523990634, "epoch": 287} +{"train_lr": 0.0015172094508951826, "train_loss": 0.42131546414494514, "epoch": 288} +{"train_lr": 0.0015136672152971753, "train_loss": 0.4212149278342724, "epoch": 289} +{"train_lr": 0.0015101162026291506, "train_loss": 0.42106798495054243, "epoch": 290} +{"train_lr": 0.001506556473568119, "train_loss": 0.42114609475135806, "epoch": 291} +{"train_lr": 0.0015029880889400262, "train_loss": 0.42112620157003405, "epoch": 
292} +{"train_lr": 0.001499411109718721, "train_loss": 0.4210899014830589, "epoch": 293} +{"train_lr": 0.001495825597024904, "train_loss": 0.42102030997872353, "epoch": 294} +{"train_lr": 0.0014922316121251074, "train_loss": 0.4210644329071045, "epoch": 295} +{"train_lr": 0.0014886292164306054, "train_loss": 0.42094684926271436, "epoch": 296} +{"train_lr": 0.001485018471496406, "train_loss": 0.420904375231266, "epoch": 297} +{"train_lr": 0.001481399439020176, "train_loss": 0.4209073343873024, "epoch": 298} +{"train_lr": 0.0014777721808411927, "train_loss": 0.4208303572535515, "epoch": 299} +{"train_lr": 0.0014741367589392984, "train_loss": 0.420840155172348, "epoch": 300} +{"train_lr": 0.001470493235433814, "train_loss": 0.42079361829161643, "epoch": 301} +{"train_lr": 0.0014668416725825066, "train_loss": 0.420751271378994, "epoch": 302} +{"train_lr": 0.0014631821327805124, "train_loss": 0.42070620072484016, "epoch": 303} +{"train_lr": 0.0014595146785592672, "train_loss": 0.4206903719842434, "epoch": 304} +{"train_lr": 0.00145583937258545, "train_loss": 0.4207553890287876, "epoch": 305} +{"train_lr": 0.001452156277659891, "train_loss": 0.42065541954040525, "epoch": 306} +{"train_lr": 0.0014484654567165239, "train_loss": 0.4206352675974369, "epoch": 307} +{"train_lr": 0.0014447669728213, "train_loss": 0.420621358191967, "epoch": 308} +{"train_lr": 0.0014410608891710992, "train_loss": 0.42055000742673876, "epoch": 309} +{"train_lr": 0.0014373472690926664, "train_loss": 0.4204847206771374, "epoch": 310} +{"train_lr": 0.0014336261760415228, "train_loss": 0.4204664314568043, "epoch": 311} +{"train_lr": 0.0014298976736008813, "train_loss": 0.42043020857572555, "epoch": 312} +{"train_lr": 0.001426161825480565, "train_loss": 0.42040675433278085, "epoch": 313} +{"train_lr": 0.0014224186955159059, "train_loss": 0.42036361072659495, "epoch": 314} +{"train_lr": 0.001418668347666667, "train_loss": 0.42047722015976907, "epoch": 315} +{"train_lr": 0.001414910846015954, "train_loss": 0.4203856301009655, "epoch": 316} +{"train_lr": 0.0014111462547690917, "train_loss": 0.42029333937168123, "epoch": 317} +{"train_lr": 0.0014073746382525652, "train_loss": 0.4202634672820568, "epoch": 318} +{"train_lr": 0.0014035960609128908, "train_loss": 0.4202524435698986, "epoch": 319} +{"train_lr": 0.0013998105873155328, "train_loss": 0.4202196150660515, "epoch": 320} +{"train_lr": 0.0013960182821437879, "train_loss": 0.4202494762778282, "epoch": 321} +{"train_lr": 0.001392219210197692, "train_loss": 0.42015298603773116, "epoch": 322} +{"train_lr": 0.0013884134363928921, "train_loss": 0.4201268202781677, "epoch": 323} +{"train_lr": 0.001384601025759574, "train_loss": 0.419998131608963, "epoch": 324} +{"train_lr": 0.001380782043441313, "train_loss": 0.42012175452113154, "epoch": 325} +{"train_lr": 0.0013769565546939756, "train_loss": 0.4199835945248604, "epoch": 326} +{"train_lr": 0.001373124624884616, "train_loss": 0.4199679668843746, "epoch": 327} +{"train_lr": 0.0013692863194903408, "train_loss": 0.41995242735147476, "epoch": 328} +{"train_lr": 0.0013654417040971938, "train_loss": 0.419914648014307, "epoch": 329} +{"train_lr": 0.0013615908443990496, "train_loss": 0.420291811478138, "epoch": 330} +{"train_lr": 0.0013577338061964764, "train_loss": 0.42001431497335434, "epoch": 331} +{"train_lr": 0.0013538706553956092, "train_loss": 0.41982507169246674, "epoch": 332} +{"train_lr": 0.0013500014580070398, "train_loss": 0.4198868880212307, "epoch": 333} +{"train_lr": 0.0013461262801446774, "train_loss": 0.41981268939375876, 
"epoch": 334} +{"train_lr": 0.0013422451880246203, "train_loss": 0.4197859157443047, "epoch": 335} +{"train_lr": 0.0013383582479640172, "train_loss": 0.4197030574500561, "epoch": 336} +{"train_lr": 0.0013344655263799582, "train_loss": 0.41963064196109773, "epoch": 337} +{"train_lr": 0.0013305670897883135, "train_loss": 0.41961017757058144, "epoch": 338} +{"train_lr": 0.0013266630048026041, "train_loss": 0.41970578683018683, "epoch": 339} +{"train_lr": 0.00132275333813287, "train_loss": 0.4195633431851864, "epoch": 340} +{"train_lr": 0.001318838156584536, "train_loss": 0.41961616225242615, "epoch": 341} +{"train_lr": 0.0013149175270572404, "train_loss": 0.419570646417141, "epoch": 342} +{"train_lr": 0.0013109915165437332, "train_loss": 0.41954742604494094, "epoch": 343} +{"train_lr": 0.0013070601921287021, "train_loss": 0.4194275137424469, "epoch": 344} +{"train_lr": 0.001303123620987628, "train_loss": 0.41942608463168146, "epoch": 345} +{"train_lr": 0.0012991818703856554, "train_loss": 0.4194012299001217, "epoch": 346} +{"train_lr": 0.0012952350076764231, "train_loss": 0.41930887975096703, "epoch": 347} +{"train_lr": 0.00129128310030093, "train_loss": 0.4193776847779751, "epoch": 348} +{"train_lr": 0.0012873262157863646, "train_loss": 0.4193605829834938, "epoch": 349} +{"train_lr": 0.0012833644217449664, "train_loss": 0.4192501567542553, "epoch": 350} +{"train_lr": 0.0012793977858728675, "train_loss": 0.4192127873659134, "epoch": 351} +{"train_lr": 0.00127542637594893, "train_loss": 0.4193183109641075, "epoch": 352} +{"train_lr": 0.0012714502598335897, "train_loss": 0.4197004640817642, "epoch": 353} +{"train_lr": 0.0012674695054677005, "train_loss": 0.41917526848316194, "epoch": 354} +{"train_lr": 0.0012634841808713748, "train_loss": 0.41914406824707984, "epoch": 355} +{"train_lr": 0.0012594943541428109, "train_loss": 0.4190760906219482, "epoch": 356} +{"train_lr": 0.0012555000934571397, "train_loss": 0.419058157235384, "epoch": 357} +{"train_lr": 0.0012515014670652586, "train_loss": 0.419025038343668, "epoch": 358} +{"train_lr": 0.0012474985432926558, "train_loss": 0.4189569546878338, "epoch": 359} +{"train_lr": 0.001243491390538254, "train_loss": 0.4190199301660061, "epoch": 360} +{"train_lr": 0.0012394800772732412, "train_loss": 0.4189968164920807, "epoch": 361} +{"train_lr": 0.0012354646720398926, "train_loss": 0.41894257601499557, "epoch": 362} +{"train_lr": 0.001231445243450402, "train_loss": 0.41891998412013054, "epoch": 363} +{"train_lr": 0.0012274218601857198, "train_loss": 0.41875716477632524, "epoch": 364} +{"train_lr": 0.0012233945909943611, "train_loss": 0.4189456850349903, "epoch": 365} +{"train_lr": 0.001219363504691245, "train_loss": 0.41879626615047455, "epoch": 366} +{"train_lr": 0.0012153286701565129, "train_loss": 0.4187718325734138, "epoch": 367} +{"train_lr": 0.0012112901563343563, "train_loss": 0.4186474060893059, "epoch": 368} +{"train_lr": 0.0012072480322318328, "train_loss": 0.4186492353022099, "epoch": 369} +{"train_lr": 0.0012032023669176915, "train_loss": 0.4186739155292511, "epoch": 370} +{"train_lr": 0.0011991532295211936, "train_loss": 0.41872328140735626, "epoch": 371} +{"train_lr": 0.001195100689230918, "train_loss": 0.4187125334382057, "epoch": 372} +{"train_lr": 0.0011910448152936013, "train_loss": 0.4187649071574211, "epoch": 373} +{"train_lr": 0.00118698567701294, "train_loss": 0.41862554777264593, "epoch": 374} +{"train_lr": 0.001182923343748406, "train_loss": 0.418472838807106, "epoch": 375} +{"train_lr": 0.0011788578849140647, "train_loss": 
0.4184612022995949, "epoch": 376} +{"train_lr": 0.00117478936997739, "train_loss": 0.418489412689209, "epoch": 377} +{"train_lr": 0.001170717868458082, "train_loss": 0.4183967174947262, "epoch": 378} +{"train_lr": 0.001166643449926863, "train_loss": 0.4184104426383972, "epoch": 379} +{"train_lr": 0.0011625661840043084, "train_loss": 0.41830870187282565, "epoch": 380} +{"train_lr": 0.0011584861403596384, "train_loss": 0.41836045224666596, "epoch": 381} +{"train_lr": 0.0011544033887095435, "train_loss": 0.41827855964899063, "epoch": 382} +{"train_lr": 0.0011503179988169893, "train_loss": 0.41831054545640944, "epoch": 383} +{"train_lr": 0.001146230040490009, "train_loss": 0.41834157658815385, "epoch": 384} +{"train_lr": 0.0011421395835805358, "train_loss": 0.41814741303920744, "epoch": 385} +{"train_lr": 0.0011380466979831925, "train_loss": 0.41818422635793684, "epoch": 386} +{"train_lr": 0.0011339514536341003, "train_loss": 0.418172834277153, "epoch": 387} +{"train_lr": 0.001129853920509686, "train_loss": 0.4181679087162018, "epoch": 388} +{"train_lr": 0.0011257541686254895, "train_loss": 0.4180516511440277, "epoch": 389} +{"train_lr": 0.0011216522680349492, "train_loss": 0.41804933690428736, "epoch": 390} +{"train_lr": 0.0011175482888282399, "train_loss": 0.4180680680811405, "epoch": 391} +{"train_lr": 0.0011134423011310347, "train_loss": 0.4179827343761921, "epoch": 392} +{"train_lr": 0.0011093343751033356, "train_loss": 0.4179728990733623, "epoch": 393} +{"train_lr": 0.0011052245809382672, "train_loss": 0.4179292483329773, "epoch": 394} +{"train_lr": 0.0011011129888608734, "train_loss": 0.41794585397839545, "epoch": 395} +{"train_lr": 0.0010969996691269118, "train_loss": 0.41799348885416987, "epoch": 396} +{"train_lr": 0.0010928846920216773, "train_loss": 0.41798636142015455, "epoch": 397} +{"train_lr": 0.0010887681278587693, "train_loss": 0.4178778306603432, "epoch": 398} +{"train_lr": 0.0010846500469789088, "train_loss": 0.417874898070097, "epoch": 399} +{"train_lr": 0.0010805305197487387, "train_loss": 0.4177279465615749, "epoch": 400} +{"train_lr": 0.001076409616559617, "train_loss": 0.4177750180602074, "epoch": 401} +{"train_lr": 0.001072287407826403, "train_loss": 0.41769819692969323, "epoch": 402} +{"train_lr": 0.0010681639639862738, "train_loss": 0.4177047014296055, "epoch": 403} +{"train_lr": 0.0010640393554975105, "train_loss": 0.41777194578647614, "epoch": 404} +{"train_lr": 0.001059913652838287, "train_loss": 0.4176086929380894, "epoch": 405} +{"train_lr": 0.0010557869265054776, "train_loss": 0.41784522614479064, "epoch": 406} +{"train_lr": 0.0010516592470134524, "train_loss": 0.4175245689034462, "epoch": 407} +{"train_lr": 0.0010475306848928647, "train_loss": 0.41752753249406815, "epoch": 408} +{"train_lr": 0.0010434013106894533, "train_loss": 0.41756826764941213, "epoch": 409} +{"train_lr": 0.0010392711949628248, "train_loss": 0.417456934183836, "epoch": 410} +{"train_lr": 0.00103514040828526, "train_loss": 0.4174256393015385, "epoch": 411} +{"train_lr": 0.001031009021240512, "train_loss": 0.4173729620695114, "epoch": 412} +{"train_lr": 0.0010268771044225837, "train_loss": 0.41735645656585696, "epoch": 413} +{"train_lr": 0.0010227447284345357, "train_loss": 0.4173586934030056, "epoch": 414} +{"train_lr": 0.0010186119638872688, "train_loss": 0.4173215918242931, "epoch": 415} +{"train_lr": 0.001014478881398324, "train_loss": 0.4172305813729763, "epoch": 416} +{"train_lr": 0.0010103455515906839, "train_loss": 0.4172926494061947, "epoch": 417} +{"train_lr": 
0.0010062120450915484, "train_loss": 0.41724464458227156, "epoch": 418} +{"train_lr": 0.0010020784325311383, "train_loss": 0.4171691377878189, "epoch": 419} +{"train_lr": 0.0009979447845414845, "train_loss": 0.4171639740407467, "epoch": 420} +{"train_lr": 0.000993811171755231, "train_loss": 0.41711280400156975, "epoch": 421} +{"train_lr": 0.0009896776648044105, "train_loss": 0.4171565491616726, "epoch": 422} +{"train_lr": 0.0009855443343192564, "train_loss": 0.41707179708480835, "epoch": 423} +{"train_lr": 0.0009814112509269812, "train_loss": 0.4170188140451908, "epoch": 424} +{"train_lr": 0.0009772784852505741, "train_loss": 0.4170406273066998, "epoch": 425} +{"train_lr": 0.0009731461079075985, "train_loss": 0.41711612367033957, "epoch": 426} +{"train_lr": 0.0009690141895089831, "train_loss": 0.4170222055196762, "epoch": 427} +{"train_lr": 0.0009648828006578134, "train_loss": 0.41702341947555543, "epoch": 428} +{"train_lr": 0.0009607520119481245, "train_loss": 0.41691789889335634, "epoch": 429} +{"train_lr": 0.0009566218939636999, "train_loss": 0.4168686304748058, "epoch": 430} +{"train_lr": 0.0009524925172768602, "train_loss": 0.41675924023389815, "epoch": 431} +{"train_lr": 0.0009483639524472556, "train_loss": 0.416854871147871, "epoch": 432} +{"train_lr": 0.000944236270020672, "train_loss": 0.4168514198482037, "epoch": 433} +{"train_lr": 0.0009401095405278129, "train_loss": 0.416757233697176, "epoch": 434} +{"train_lr": 0.0009359838344831006, "train_loss": 0.41670299382805825, "epoch": 435} +{"train_lr": 0.0009318592223834629, "train_loss": 0.4166590934753418, "epoch": 436} +{"train_lr": 0.0009277357747071485, "train_loss": 0.41663964110612867, "epoch": 437} +{"train_lr": 0.0009236135619124953, "train_loss": 0.4165965996146202, "epoch": 438} +{"train_lr": 0.0009194926544367453, "train_loss": 0.41651798075437546, "epoch": 439} +{"train_lr": 0.0009153731226948438, "train_loss": 0.4164986294090748, "epoch": 440} +{"train_lr": 0.0009112550370782172, "train_loss": 0.416437136977911, "epoch": 441} +{"train_lr": 0.0009071384679535845, "train_loss": 0.41646568976044657, "epoch": 442} +{"train_lr": 0.0009030234856617595, "train_loss": 0.41643167090415956, "epoch": 443} +{"train_lr": 0.0008989101605164331, "train_loss": 0.41652717319726945, "epoch": 444} +{"train_lr": 0.0008947985628029826, "train_loss": 0.41643864707946776, "epoch": 445} +{"train_lr": 0.000890688762777271, "train_loss": 0.4163997540950775, "epoch": 446} +{"train_lr": 0.000886580830664437, "train_loss": 0.41629073085188867, "epoch": 447} +{"train_lr": 0.000882474836657711, "train_loss": 0.41629329221844674, "epoch": 448} +{"train_lr": 0.0008783708509171996, "train_loss": 0.4162357913553715, "epoch": 449} +{"train_lr": 0.0008742689435686971, "train_loss": 0.41620945250988006, "epoch": 450} +{"train_lr": 0.0008701691847024806, "train_loss": 0.4162765025675297, "epoch": 451} +{"train_lr": 0.0008660716443721178, "train_loss": 0.4162437853038311, "epoch": 452} +{"train_lr": 0.0008619763925932672, "train_loss": 0.4160850357532501, "epoch": 453} +{"train_lr": 0.000857883499342485, "train_loss": 0.41610757068395615, "epoch": 454} +{"train_lr": 0.0008537930345560229, "train_loss": 0.4160928344607353, "epoch": 455} +{"train_lr": 0.0008497050681286344, "train_loss": 0.4160299357116222, "epoch": 456} +{"train_lr": 0.0008456196699123931, "train_loss": 0.4160114522874355, "epoch": 457} +{"train_lr": 0.0008415369097154786, "train_loss": 0.4159763306438923, "epoch": 458} +{"train_lr": 0.0008374568573009967, "train_loss": 0.41603127918839455, 
"epoch": 459} +{"train_lr": 0.0008333795823857903, "train_loss": 0.41598988704681394, "epoch": 460} +{"train_lr": 0.0008293051546392356, "train_loss": 0.4159310473740101, "epoch": 461} +{"train_lr": 0.0008252336436820601, "train_loss": 0.415981581813097, "epoch": 462} +{"train_lr": 0.0008211651190851549, "train_loss": 0.41578987702727316, "epoch": 463} +{"train_lr": 0.000817099650368378, "train_loss": 0.41580206685066223, "epoch": 464} +{"train_lr": 0.0008130373069993725, "train_loss": 0.41571778808832166, "epoch": 465} +{"train_lr": 0.0008089781583923796, "train_loss": 0.41572712430357933, "epoch": 466} +{"train_lr": 0.0008049222739070492, "train_loss": 0.41574336388111116, "epoch": 467} +{"train_lr": 0.0008008697228472562, "train_loss": 0.4156535137236118, "epoch": 468} +{"train_lr": 0.0007968205744599162, "train_loss": 0.4156419990241528, "epoch": 469} +{"train_lr": 0.0007927748979338039, "train_loss": 0.4155884074151516, "epoch": 470} +{"train_lr": 0.0007887327623983688, "train_loss": 0.4156378916442394, "epoch": 471} +{"train_lr": 0.0007846942369225543, "train_loss": 0.4155085301876068, "epoch": 472} +{"train_lr": 0.0007806593905136176, "train_loss": 0.4155448598384857, "epoch": 473} +{"train_lr": 0.0007766282921159524, "train_loss": 0.4154927032291889, "epoch": 474} +{"train_lr": 0.0007726010106099076, "train_loss": 0.41546733177900314, "epoch": 475} +{"train_lr": 0.0007685776148106116, "train_loss": 0.4154239023923874, "epoch": 476} +{"train_lr": 0.0007645581734668001, "train_loss": 0.41537334437966345, "epoch": 477} +{"train_lr": 0.0007605427552596346, "train_loss": 0.4154018564403057, "epoch": 478} +{"train_lr": 0.0007565314288015307, "train_loss": 0.4154400738298893, "epoch": 479} +{"train_lr": 0.0007525242626349935, "train_loss": 0.41526551213860513, "epoch": 480} +{"train_lr": 0.0007485213252314344, "train_loss": 0.41534639605283735, "epoch": 481} +{"train_lr": 0.0007445226849900115, "train_loss": 0.41517547313570974, "epoch": 482} +{"train_lr": 0.0007405284102364519, "train_loss": 0.41520622568130494, "epoch": 483} +{"train_lr": 0.0007365385692218902, "train_loss": 0.4151805117607117, "epoch": 484} +{"train_lr": 0.0007325532301217024, "train_loss": 0.4151923873543739, "epoch": 485} +{"train_lr": 0.0007285724610343378, "train_loss": 0.415164637196064, "epoch": 486} +{"train_lr": 0.0007245963299801566, "train_loss": 0.41513061105012894, "epoch": 487} +{"train_lr": 0.0007206249049002679, "train_loss": 0.41505594806075097, "epoch": 488} +{"train_lr": 0.000716658253655366, "train_loss": 0.414974316573143, "epoch": 489} +{"train_lr": 0.0007126964440245807, "train_loss": 0.4150249040722847, "epoch": 490} +{"train_lr": 0.0007087395437043058, "train_loss": 0.41488441542983057, "epoch": 491} +{"train_lr": 0.00070478762030705, "train_loss": 0.4148910955309868, "epoch": 492} +{"train_lr": 0.0007008407413602802, "train_loss": 0.41482908695936205, "epoch": 493} +{"train_lr": 0.000696898974305269, "train_loss": 0.4148184060752392, "epoch": 494} +{"train_lr": 0.000692962386495939, "train_loss": 0.4147723206758499, "epoch": 495} +{"train_lr": 0.0006890310451977145, "train_loss": 0.4147319468975067, "epoch": 496} +{"train_lr": 0.0006851050175863707, "train_loss": 0.41478028755784035, "epoch": 497} +{"train_lr": 0.0006811843707468876, "train_loss": 0.41468667683005334, "epoch": 498} +{"train_lr": 0.0006772691716723045, "train_loss": 0.4146990427553654, "epoch": 499} +{"train_lr": 0.0006733594872625652, "train_loss": 0.4146124188661575, "epoch": 500} +{"train_lr": 0.0006694553843233956, 
"train_loss": 0.41460838395953176, "epoch": 501} +{"train_lr": 0.000665556929565141, "train_loss": 0.4146430678844452, "epoch": 502} +{"train_lr": 0.0006616641896016334, "train_loss": 0.41454836529493333, "epoch": 503} +{"train_lr": 0.0006577772309490656, "train_loss": 0.4145636016011238, "epoch": 504} +{"train_lr": 0.0006538961200248293, "train_loss": 0.4144807538509369, "epoch": 505} +{"train_lr": 0.0006500209231464063, "train_loss": 0.4144996554195881, "epoch": 506} +{"train_lr": 0.0006461517065302167, "train_loss": 0.41448832686543463, "epoch": 507} +{"train_lr": 0.0006422885362904992, "train_loss": 0.41441225247979163, "epoch": 508} +{"train_lr": 0.0006384314784381729, "train_loss": 0.41435343540906905, "epoch": 509} +{"train_lr": 0.000634580598879715, "train_loss": 0.41432497901916504, "epoch": 510} +{"train_lr": 0.0006307359634160299, "train_loss": 0.4141963863253593, "epoch": 511} +{"train_lr": 0.0006268976377413344, "train_loss": 0.414292369222641, "epoch": 512} +{"train_lr": 0.0006230656874420206, "train_loss": 0.41412822899222373, "epoch": 513} +{"train_lr": 0.000619240177995549, "train_loss": 0.41413087169528007, "epoch": 514} +{"train_lr": 0.0006154211747693183, "train_loss": 0.4142154009103775, "epoch": 515} +{"train_lr": 0.0006116087430195577, "train_loss": 0.4141320895433426, "epoch": 516} +{"train_lr": 0.0006078029478902082, "train_loss": 0.41410443152785303, "epoch": 517} +{"train_lr": 0.0006040038544118062, "train_loss": 0.41408611317276955, "epoch": 518} +{"train_lr": 0.0006002115275003778, "train_loss": 0.41404432806372643, "epoch": 519} +{"train_lr": 0.0005964260319563274, "train_loss": 0.41398654327988627, "epoch": 520} +{"train_lr": 0.0005926474324633267, "train_loss": 0.41391335440278054, "epoch": 521} +{"train_lr": 0.0005888757935872201, "train_loss": 0.41387077738046646, "epoch": 522} +{"train_lr": 0.0005851111797749066, "train_loss": 0.41397186594605445, "epoch": 523} +{"train_lr": 0.0005813536553532483, "train_loss": 0.4139413024187088, "epoch": 524} +{"train_lr": 0.0005776032845279719, "train_loss": 0.4138102644562721, "epoch": 525} +{"train_lr": 0.0005738601313825683, "train_loss": 0.4137563929796219, "epoch": 526} +{"train_lr": 0.0005701242598771955, "train_loss": 0.41368419902324677, "epoch": 527} +{"train_lr": 0.0005663957338475891, "train_loss": 0.4137420842349529, "epoch": 528} +{"train_lr": 0.0005626746170039725, "train_loss": 0.41368860872387886, "epoch": 529} +{"train_lr": 0.0005589609729299664, "train_loss": 0.41366335294246676, "epoch": 530} +{"train_lr": 0.0005552548650815012, "train_loss": 0.4136664641916752, "epoch": 531} +{"train_lr": 0.0005515563567857334, "train_loss": 0.41364744307994844, "epoch": 532} +{"train_lr": 0.0005478655112399664, "train_loss": 0.4135095750927925, "epoch": 533} +{"train_lr": 0.0005441823915105678, "train_loss": 0.4134646384775639, "epoch": 534} +{"train_lr": 0.0005405070605318911, "train_loss": 0.4135267463207245, "epoch": 535} +{"train_lr": 0.0005368395811052013, "train_loss": 0.41346614977121354, "epoch": 536} +{"train_lr": 0.000533180015897602, "train_loss": 0.41341310681700705, "epoch": 537} +{"train_lr": 0.0005295284274409709, "train_loss": 0.41329991322159765, "epoch": 538} +{"train_lr": 0.0005258848781308736, "train_loss": 0.41334705371260644, "epoch": 539} +{"train_lr": 0.0005222494302255165, "train_loss": 0.41332384219169616, "epoch": 540} +{"train_lr": 0.0005186221458446746, "train_loss": 0.41331451881527903, "epoch": 541} +{"train_lr": 0.0005150030869686313, "train_loss": 0.4133096279680729, "epoch": 542} 
+{"train_lr": 0.0005113923154371142, "train_loss": 0.41333488993048667, "epoch": 543} +{"train_lr": 0.0005077898929482494, "train_loss": 0.41325920339226724, "epoch": 544} +{"train_lr": 0.0005041958810574948, "train_loss": 0.41315065550804136, "epoch": 545} +{"train_lr": 0.0005006103411766005, "train_loss": 0.41317506961226463, "epoch": 546} +{"train_lr": 0.0004970333345725481, "train_loss": 0.4131286765635014, "epoch": 547} +{"train_lr": 0.0004934649223665127, "train_loss": 0.41310886276960374, "epoch": 548} +{"train_lr": 0.0004899051655328116, "train_loss": 0.4130334359705448, "epoch": 549} +{"train_lr": 0.0004863541248978668, "train_loss": 0.4128937359213829, "epoch": 550} +{"train_lr": 0.00048281186113916804, "train_loss": 0.413018404263258, "epoch": 551} +{"train_lr": 0.00047927843478422894, "train_loss": 0.4130131136238575, "epoch": 552} +{"train_lr": 0.00047575390620955427, "train_loss": 0.41287013072967527, "epoch": 553} +{"train_lr": 0.00047223833563961505, "train_loss": 0.4128674404680729, "epoch": 554} +{"train_lr": 0.00046873178314581177, "train_loss": 0.41282065522670747, "epoch": 555} +{"train_lr": 0.00046523430864545227, "train_loss": 0.412801239490509, "epoch": 556} +{"train_lr": 0.00046174597190072565, "train_loss": 0.4127468424975872, "epoch": 557} +{"train_lr": 0.0004582668325176823, "train_loss": 0.41264703783988954, "epoch": 558} +{"train_lr": 0.000454796949945214, "train_loss": 0.412723533976078, "epoch": 559} +{"train_lr": 0.0004513363834740404, "train_loss": 0.4126707662463188, "epoch": 560} +{"train_lr": 0.0004478851922356962, "train_loss": 0.4126263898909092, "epoch": 561} +{"train_lr": 0.0004444434352015155, "train_loss": 0.4126412259161472, "epoch": 562} +{"train_lr": 0.0004410111711816321, "train_loss": 0.4125005640268326, "epoch": 563} +{"train_lr": 0.0004375884588239656, "train_loss": 0.41252969363331793, "epoch": 564} +{"train_lr": 0.0004341753566132277, "train_loss": 0.4123950005233288, "epoch": 565} +{"train_lr": 0.0004307719228699184, "train_loss": 0.4124559945344925, "epoch": 566} +{"train_lr": 0.0004273782157493301, "train_loss": 0.41243081186413766, "epoch": 567} +{"train_lr": 0.00042399429324055236, "train_loss": 0.41239919402599334, "epoch": 568} +{"train_lr": 0.0004206202131654863, "train_loss": 0.4123308017849922, "epoch": 569} +{"train_lr": 0.000417256033177851, "train_loss": 0.4123557644248009, "epoch": 570} +{"train_lr": 0.00041390181076219907, "train_loss": 0.41225514442920685, "epoch": 571} +{"train_lr": 0.0004105576032329374, "train_loss": 0.41224638593196866, "epoch": 572} +{"train_lr": 0.0004072234677333462, "train_loss": 0.4121566233634949, "epoch": 573} +{"train_lr": 0.000403899461234601, "train_loss": 0.4122022950172424, "epoch": 574} +{"train_lr": 0.0004005856405348028, "train_loss": 0.4122386267721653, "epoch": 575} +{"train_lr": 0.00039728206225800316, "train_loss": 0.4121166242182255, "epoch": 576} +{"train_lr": 0.0003939887828532405, "train_loss": 0.41211722364425657, "epoch": 577} +{"train_lr": 0.00039070585859357225, "train_loss": 0.41196879163384437, "epoch": 578} +{"train_lr": 0.00038743334557511883, "train_loss": 0.4120268380403519, "epoch": 579} +{"train_lr": 0.00038417129971609465, "train_loss": 0.4120321435570717, "epoch": 580} +{"train_lr": 0.0003809197767558675, "train_loss": 0.4119880166888237, "epoch": 581} +{"train_lr": 0.00037767883225399033, "train_loss": 0.41182354040145874, "epoch": 582} +{"train_lr": 0.00037444852158926347, "train_loss": 0.4119151137650013, "epoch": 583} +{"train_lr": 0.00037122889995878434, 
"train_loss": 0.41178027091026304, "epoch": 584} +{"train_lr": 0.00036802002237700215, "train_loss": 0.41189671708345416, "epoch": 585} +{"train_lr": 0.0003648219436747815, "train_loss": 0.4118088481903076, "epoch": 586} +{"train_lr": 0.00036163471849846445, "train_loss": 0.41158689913749696, "epoch": 587} +{"train_lr": 0.00035845840130893473, "train_loss": 0.4116609573543072, "epoch": 588} +{"train_lr": 0.00035529304638068815, "train_loss": 0.4116432239770889, "epoch": 589} +{"train_lr": 0.0003521387078009091, "train_loss": 0.4116695198178291, "epoch": 590} +{"train_lr": 0.0003489954394685392, "train_loss": 0.4116169459104538, "epoch": 591} +{"train_lr": 0.000345863295093364, "train_loss": 0.4115727410554886, "epoch": 592} +{"train_lr": 0.0003427423281950851, "train_loss": 0.41158620098233223, "epoch": 593} +{"train_lr": 0.00033963259210241883, "train_loss": 0.41150212720036505, "epoch": 594} +{"train_lr": 0.00033653413995217435, "train_loss": 0.41141462765336034, "epoch": 595} +{"train_lr": 0.00033344702468834903, "train_loss": 0.4113722758948803, "epoch": 596} +{"train_lr": 0.00033037129906122623, "train_loss": 0.41129573442935946, "epoch": 597} +{"train_lr": 0.0003273070156264704, "train_loss": 0.41129547247886655, "epoch": 598} +{"train_lr": 0.0003242542267442306, "train_loss": 0.4113450105786324, "epoch": 599} +{"train_lr": 0.0003212129845782456, "train_loss": 0.411285870462656, "epoch": 600} +{"train_lr": 0.0003181833410949536, "train_loss": 0.41130744271874425, "epoch": 601} +{"train_lr": 0.00031516534806260186, "train_loss": 0.4112095928132534, "epoch": 602} +{"train_lr": 0.00031215905705036536, "train_loss": 0.41113772990703584, "epoch": 603} +{"train_lr": 0.0003091645194274621, "train_loss": 0.41113032053112986, "epoch": 604} +{"train_lr": 0.0003061817863622778, "train_loss": 0.4110891651570797, "epoch": 605} +{"train_lr": 0.00030321090882149234, "train_loss": 0.41110460319519043, "epoch": 606} +{"train_lr": 0.0003002519375692042, "train_loss": 0.41107726674675943, "epoch": 607} +{"train_lr": 0.00029730492316606825, "train_loss": 0.4110739596545696, "epoch": 608} +{"train_lr": 0.0002943699159684297, "train_loss": 0.4109533204615116, "epoch": 609} +{"train_lr": 0.00029144696612746454, "train_loss": 0.41088306730389595, "epoch": 610} +{"train_lr": 0.0002885361235883199, "train_loss": 0.41095819348096846, "epoch": 611} +{"train_lr": 0.0002856374380892637, "train_loss": 0.41093446829319, "epoch": 612} +{"train_lr": 0.00028275095916083335, "train_loss": 0.41092277715802195, "epoch": 613} +{"train_lr": 0.00027987673612499026, "train_loss": 0.41091884284615515, "epoch": 614} +{"train_lr": 0.00027701481809427403, "train_loss": 0.41077308706641197, "epoch": 615} +{"train_lr": 0.0002741652539709704, "train_loss": 0.41076149238944054, "epoch": 616} +{"train_lr": 0.0002713280924462657, "train_loss": 0.41067302731275557, "epoch": 617} +{"train_lr": 0.00026850338199942207, "train_loss": 0.4106996956408024, "epoch": 618} +{"train_lr": 0.0002656911708969498, "train_loss": 0.41060551152825353, "epoch": 619} +{"train_lr": 0.0002628915071917763, "train_loss": 0.41057525554299357, "epoch": 620} +{"train_lr": 0.0002601044387224285, "train_loss": 0.4105493293166161, "epoch": 621} +{"train_lr": 0.0002573300131122188, "train_loss": 0.41071004919409754, "epoch": 622} +{"train_lr": 0.00025456827776842376, "train_loss": 0.41045111640691756, "epoch": 623} +{"train_lr": 0.00025181927988148265, "train_loss": 0.410511493909359, "epoch": 624} +{"train_lr": 0.0002490830664241836, "train_loss": 
0.4104461461484432, "epoch": 625} +{"train_lr": 0.0002463596841508659, "train_loss": 0.4104572146654129, "epoch": 626} +{"train_lr": 0.00024364917959661644, "train_loss": 0.41034869700074195, "epoch": 627} +{"train_lr": 0.00024095159907648234, "train_loss": 0.41023331859111783, "epoch": 628} +{"train_lr": 0.0002382669886846699, "train_loss": 0.41035697820782663, "epoch": 629} +{"train_lr": 0.0002355953942937644, "train_loss": 0.4102578080415726, "epoch": 630} +{"train_lr": 0.00023293686155394203, "train_loss": 0.41025401488542557, "epoch": 631} +{"train_lr": 0.00023029143589219285, "train_loss": 0.41027388836741446, "epoch": 632} +{"train_lr": 0.00022765916251154313, "train_loss": 0.4101309650480747, "epoch": 633} +{"train_lr": 0.00022504008639028075, "train_loss": 0.41018197714686394, "epoch": 634} +{"train_lr": 0.00022243425228119063, "train_loss": 0.4102461946487427, "epoch": 635} +{"train_lr": 0.00021984170471078866, "train_loss": 0.41012363595962525, "epoch": 636} +{"train_lr": 0.00021726248797855976, "train_loss": 0.41003916486501696, "epoch": 637} +{"train_lr": 0.0002146966461562013, "train_loss": 0.4100011553347111, "epoch": 638} +{"train_lr": 0.00021214422308687, "train_loss": 0.4099870161771774, "epoch": 639} +{"train_lr": 0.00020960526238443468, "train_loss": 0.409950205296278, "epoch": 640} +{"train_lr": 0.00020707980743272803, "train_loss": 0.40993198407888415, "epoch": 641} +{"train_lr": 0.00020456790138480746, "train_loss": 0.40987456869482997, "epoch": 642} +{"train_lr": 0.00020206958716221631, "train_loss": 0.4099106639921665, "epoch": 643} +{"train_lr": 0.00019958490745425211, "train_loss": 0.40992944944500925, "epoch": 644} +{"train_lr": 0.00019711390471723525, "train_loss": 0.40970903441905976, "epoch": 645} +{"train_lr": 0.00019465662117378513, "train_loss": 0.4097623137831688, "epoch": 646} +{"train_lr": 0.00019221309881209726, "train_loss": 0.4097091728568077, "epoch": 647} +{"train_lr": 0.00018978337938522675, "train_loss": 0.4097723929464817, "epoch": 648} +{"train_lr": 0.00018736750441037523, "train_loss": 0.4096767637908459, "epoch": 649} +{"train_lr": 0.00018496551516817997, "train_loss": 0.4096685712814331, "epoch": 650} +{"train_lr": 0.00018257745270201065, "train_loss": 0.4095007773041725, "epoch": 651} +{"train_lr": 0.00018020335781726479, "train_loss": 0.40950128165483474, "epoch": 652} +{"train_lr": 0.0001778432710806747, "train_loss": 0.4095606074631214, "epoch": 653} +{"train_lr": 0.00017549723281960988, "train_loss": 0.40949765983819963, "epoch": 654} +{"train_lr": 0.00017316528312139175, "train_loss": 0.40952413992881775, "epoch": 655} +{"train_lr": 0.00017084746183260703, "train_loss": 0.4094638512015343, "epoch": 656} +{"train_lr": 0.00016854380855842624, "train_loss": 0.4094694583117962, "epoch": 657} +{"train_lr": 0.00016625436266192763, "train_loss": 0.40931712368130685, "epoch": 658} +{"train_lr": 0.00016397916326342497, "train_loss": 0.4093422090888023, "epoch": 659} +{"train_lr": 0.000161718249239798, "train_loss": 0.4092994294703007, "epoch": 660} +{"train_lr": 0.0001594716592238298, "train_loss": 0.4093663468182087, "epoch": 661} +{"train_lr": 0.00015723943160354516, "train_loss": 0.40929065743684767, "epoch": 662} +{"train_lr": 0.00015502160452155516, "train_loss": 0.4092123525619507, "epoch": 663} +{"train_lr": 0.00015281821587440569, "train_loss": 0.40918805617690085, "epoch": 664} +{"train_lr": 0.00015062930331192866, "train_loss": 0.4091305765867233, "epoch": 665} +{"train_lr": 0.0001484549042366004, "train_loss": 0.40919655148983003, 
"epoch": 666} +{"train_lr": 0.0001462950558029027, "train_loss": 0.40923569843173024, "epoch": 667} +{"train_lr": 0.0001441497949166853, "train_loss": 0.40912016796469686, "epoch": 668} +{"train_lr": 0.00014201915823453798, "train_loss": 0.4091143898308277, "epoch": 669} +{"train_lr": 0.00013990318216316309, "train_loss": 0.4091158373832703, "epoch": 670} +{"train_lr": 0.00013780190285875329, "train_loss": 0.4089883540272713, "epoch": 671} +{"train_lr": 0.0001357153562263738, "train_loss": 0.40893249164819717, "epoch": 672} +{"train_lr": 0.00013364357791935063, "train_loss": 0.409016412883997, "epoch": 673} +{"train_lr": 0.0001315866033386586, "train_loss": 0.40892095088362695, "epoch": 674} +{"train_lr": 0.00012954446763231708, "train_loss": 0.4089177478671074, "epoch": 675} +{"train_lr": 0.00012751720569479193, "train_loss": 0.4089482992887497, "epoch": 676} +{"train_lr": 0.00012550485216639558, "train_loss": 0.40890288605093955, "epoch": 677} +{"train_lr": 0.0001235074414326978, "train_loss": 0.40893578273653985, "epoch": 678} +{"train_lr": 0.00012152500762393668, "train_loss": 0.40879338170886037, "epoch": 679} +{"train_lr": 0.00011955758461443642, "train_loss": 0.40870585800409315, "epoch": 680} +{"train_lr": 0.0001176052060220283, "train_loss": 0.408755088865757, "epoch": 681} +{"train_lr": 0.00011566790520747518, "train_loss": 0.4087392102777958, "epoch": 682} +{"train_lr": 0.00011374571527390314, "train_loss": 0.40866463065743447, "epoch": 683} +{"train_lr": 0.0001118386690662345, "train_loss": 0.4087050619006157, "epoch": 684} +{"train_lr": 0.00010994679917062744, "train_loss": 0.4086720600247383, "epoch": 685} +{"train_lr": 0.000108070137913918, "train_loss": 0.40857414263486863, "epoch": 686} +{"train_lr": 0.00010620871736307003, "train_loss": 0.40863434770703316, "epoch": 687} +{"train_lr": 0.00010436256932462424, "train_loss": 0.40859491340518, "epoch": 688} +{"train_lr": 0.00010253172534415723, "train_loss": 0.40860966989994046, "epoch": 689} +{"train_lr": 0.00010071621670574097, "train_loss": 0.408625454801321, "epoch": 690} +{"train_lr": 9.891607443140929e-05, "train_loss": 0.40844214201569556, "epoch": 691} +{"train_lr": 9.713132928062657e-05, "train_loss": 0.40843296210169794, "epoch": 692} +{"train_lr": 9.536201174976322e-05, "train_loss": 0.40837096125483513, "epoch": 693} +{"train_lr": 9.360815207157413e-05, "train_loss": 0.4083694005072117, "epoch": 694} +{"train_lr": 9.186978021468215e-05, "train_loss": 0.4084002661764622, "epoch": 695} +{"train_lr": 9.014692588306594e-05, "train_loss": 0.40843813487291336, "epoch": 696} +{"train_lr": 8.84396185155527e-05, "train_loss": 0.40834322509765625, "epoch": 697} +{"train_lr": 8.67478872853143e-05, "train_loss": 0.4083211016476154, "epoch": 698} +{"train_lr": 8.507176109937047e-05, "train_loss": 0.4082286029994488, "epoch": 699} +{"train_lr": 8.341126859809256e-05, "train_loss": 0.40823151443004607, "epoch": 700} +{"train_lr": 8.176643815471623e-05, "train_loss": 0.40823154353499413, "epoch": 701} +{"train_lr": 8.013729787485531e-05, "train_loss": 0.40827645783424377, "epoch": 702} +{"train_lr": 7.852387559602257e-05, "train_loss": 0.40825580505132675, "epoch": 703} +{"train_lr": 7.692619888715302e-05, "train_loss": 0.4081780993103981, "epoch": 704} +{"train_lr": 7.534429504813323e-05, "train_loss": 0.4081855354487896, "epoch": 705} +{"train_lr": 7.377819110933544e-05, "train_loss": 0.4082311128556728, "epoch": 706} +{"train_lr": 7.222791383115492e-05, "train_loss": 0.4081001627087593, "epoch": 707} +{"train_lr": 
7.069348970355303e-05, "train_loss": 0.40801326141357425, "epoch": 708} +{"train_lr": 6.917494494560436e-05, "train_loss": 0.40805929116606715, "epoch": 709} +{"train_lr": 6.767230550504895e-05, "train_loss": 0.4080538489818573, "epoch": 710} +{"train_lr": 6.618559705784932e-05, "train_loss": 0.4080111927628517, "epoch": 711} +{"train_lr": 6.471484500775038e-05, "train_loss": 0.40799329899549486, "epoch": 712} +{"train_lr": 6.326007448584706e-05, "train_loss": 0.4080479858994484, "epoch": 713} +{"train_lr": 6.182131035015343e-05, "train_loss": 0.4079994874477387, "epoch": 714} +{"train_lr": 6.0398577185179195e-05, "train_loss": 0.4078952370584011, "epoch": 715} +{"train_lr": 5.8991899301508436e-05, "train_loss": 0.40794192504286764, "epoch": 716} +{"train_lr": 5.7601300735385406e-05, "train_loss": 0.4079172481238842, "epoch": 717} +{"train_lr": 5.62268052483022e-05, "train_loss": 0.4078769870400429, "epoch": 718} +{"train_lr": 5.4868436326594996e-05, "train_loss": 0.40775742872953413, "epoch": 719} +{"train_lr": 5.352621718104013e-05, "train_loss": 0.4078458012342453, "epoch": 720} +{"train_lr": 5.220017074646012e-05, "train_loss": 0.4077809689939022, "epoch": 721} +{"train_lr": 5.089031968132945e-05, "train_loss": 0.40774403147697447, "epoch": 722} +{"train_lr": 4.959668636738903e-05, "train_loss": 0.4077515964627266, "epoch": 723} +{"train_lr": 4.831929290926272e-05, "train_loss": 0.407721921145916, "epoch": 724} +{"train_lr": 4.705816113408049e-05, "train_loss": 0.40768695514798164, "epoch": 725} +{"train_lr": 4.5813312591104704e-05, "train_loss": 0.4076756275653839, "epoch": 726} +{"train_lr": 4.458476855136227e-05, "train_loss": 0.40769834047555925, "epoch": 727} +{"train_lr": 4.3372550007281185e-05, "train_loss": 0.4076857505738735, "epoch": 728} +{"train_lr": 4.217667767233175e-05, "train_loss": 0.4076809181332588, "epoch": 729} +{"train_lr": 4.0997171980672597e-05, "train_loss": 0.4076770887076855, "epoch": 730} +{"train_lr": 3.9834053086801805e-05, "train_loss": 0.4075366601884365, "epoch": 731} +{"train_lr": 3.868734086521197e-05, "train_loss": 0.40765976741313936, "epoch": 732} +{"train_lr": 3.7557054910051054e-05, "train_loss": 0.40767239355444906, "epoch": 733} +{"train_lr": 3.644321453478749e-05, "train_loss": 0.4076313421726227, "epoch": 734} +{"train_lr": 3.5345838771880166e-05, "train_loss": 0.40756957579255104, "epoch": 735} +{"train_lr": 3.4264946372453015e-05, "train_loss": 0.40758756697773935, "epoch": 736} +{"train_lr": 3.3200555805974955e-05, "train_loss": 0.40753637469410897, "epoch": 737} +{"train_lr": 3.215268525994395e-05, "train_loss": 0.40759654030799863, "epoch": 738} +{"train_lr": 3.1121352639576464e-05, "train_loss": 0.4075975024521351, "epoch": 739} +{"train_lr": 3.0106575567501452e-05, "train_loss": 0.4074262948334217, "epoch": 740} +{"train_lr": 2.9108371383459213e-05, "train_loss": 0.4075006844162941, "epoch": 741} +{"train_lr": 2.8126757144005083e-05, "train_loss": 0.4073981125712395, "epoch": 742} +{"train_lr": 2.7161749622217994e-05, "train_loss": 0.40740938003063204, "epoch": 743} +{"train_lr": 2.6213365307414162e-05, "train_loss": 0.4074016982078552, "epoch": 744} +{"train_lr": 2.5281620404864564e-05, "train_loss": 0.4073709517121315, "epoch": 745} +{"train_lr": 2.4366530835519025e-05, "train_loss": 0.40737549446821214, "epoch": 746} +{"train_lr": 2.3468112235733392e-05, "train_loss": 0.4074480685114861, "epoch": 747} +{"train_lr": 2.2586379957002727e-05, "train_loss": 0.407499808126688, "epoch": 748} +{"train_lr": 2.1721349065698846e-05, 
"train_loss": 0.40737112711071966, "epoch": 749} +{"train_lr": 2.087303434281305e-05, "train_loss": 0.40735656403303144, "epoch": 750} +{"train_lr": 2.0041450283703275e-05, "train_loss": 0.40729228178858756, "epoch": 751} +{"train_lr": 1.9226611097846807e-05, "train_loss": 0.40734857454895973, "epoch": 752} +{"train_lr": 1.842853070859705e-05, "train_loss": 0.407330923384428, "epoch": 753} +{"train_lr": 1.7647222752945838e-05, "train_loss": 0.40724869443178174, "epoch": 754} +{"train_lr": 1.688270058129047e-05, "train_loss": 0.4072960561275482, "epoch": 755} +{"train_lr": 1.6134977257205462e-05, "train_loss": 0.407342313015461, "epoch": 756} +{"train_lr": 1.5404065557219386e-05, "train_loss": 0.40731965934634207, "epoch": 757} +{"train_lr": 1.4689977970596522e-05, "train_loss": 0.40725169029831887, "epoch": 758} +{"train_lr": 1.3992726699123512e-05, "train_loss": 0.4072245597243309, "epoch": 759} +{"train_lr": 1.3312323656900852e-05, "train_loss": 0.40719416123628616, "epoch": 760} +{"train_lr": 1.2648780470139173e-05, "train_loss": 0.4072723692417145, "epoch": 761} +{"train_lr": 1.2002108476960741e-05, "train_loss": 0.4072173948287964, "epoch": 762} +{"train_lr": 1.1372318727205755e-05, "train_loss": 0.40724221390485765, "epoch": 763} +{"train_lr": 1.0759421982243326e-05, "train_loss": 0.40718788425326347, "epoch": 764} +{"train_lr": 1.0163428714787861e-05, "train_loss": 0.40724759435653685, "epoch": 765} +{"train_lr": 9.584349108719813e-06, "train_loss": 0.40717979621887207, "epoch": 766} +{"train_lr": 9.022193058912006e-06, "train_loss": 0.4071979228913784, "epoch": 767} +{"train_lr": 8.476970171060192e-06, "train_loss": 0.40717670152187346, "epoch": 768} +{"train_lr": 7.948689761519278e-06, "train_loss": 0.40705007915496827, "epoch": 769} +{"train_lr": 7.437360857143847e-06, "train_loss": 0.4072156092405319, "epoch": 770} +{"train_lr": 6.942992195134097e-06, "train_loss": 0.4070780915558338, "epoch": 771} +{"train_lr": 6.465592222886441e-06, "train_loss": 0.4070723837614059, "epoch": 772} +{"train_lr": 6.0051690978492155e-06, "train_loss": 0.40715753821730616, "epoch": 773} +{"train_lr": 5.561730687383275e-06, "train_loss": 0.40711742687225344, "epoch": 774} +{"train_lr": 5.135284568627556e-06, "train_loss": 0.407139888215065, "epoch": 775} +{"train_lr": 4.725838028369653e-06, "train_loss": 0.4070862729489803, "epoch": 776} +{"train_lr": 4.333398062921207e-06, "train_loss": 0.40716656067967416, "epoch": 777} +{"train_lr": 3.957971377998454e-06, "train_loss": 0.4070811638891697, "epoch": 778} +{"train_lr": 3.599564388607613e-06, "train_loss": 0.40715316613912583, "epoch": 779} +{"train_lr": 3.258183218935257e-06, "train_loss": 0.4070707754790783, "epoch": 780} +{"train_lr": 2.9338337022436484e-06, "train_loss": 0.407094335603714, "epoch": 781} +{"train_lr": 2.626521380771149e-06, "train_loss": 0.4070680266022682, "epoch": 782} +{"train_lr": 2.3362515056374043e-06, "train_loss": 0.4070445769608021, "epoch": 783} +{"train_lr": 2.0630290367537063e-06, "train_loss": 0.407051396137476, "epoch": 784} +{"train_lr": 1.8068586427382016e-06, "train_loss": 0.4070886338174343, "epoch": 785} +{"train_lr": 1.5677447008361348e-06, "train_loss": 0.4070832368195057, "epoch": 786} +{"train_lr": 1.3456912968450236e-06, "train_loss": 0.4070368420124054, "epoch": 787} +{"train_lr": 1.140702225044881e-06, "train_loss": 0.40696477791666985, "epoch": 788} +{"train_lr": 9.527809881333541e-07, "train_loss": 0.40711091704964636, "epoch": 789} +{"train_lr": 7.819307971659009e-07, "train_loss": 0.40703405417203903, 
"epoch": 790} +{"train_lr": 6.281545715008838e-07, "train_loss": 0.4070916808605194, "epoch": 791} +{"train_lr": 4.91454938749716e-07, "train_loss": 0.4070300230205059, "epoch": 792} +{"train_lr": 3.7183423473196524e-07, "train_loss": 0.40707525467276573, "epoch": 793} +{"train_lr": 2.6929450343540397e-07, "train_loss": 0.40699523387551306, "epoch": 794} +{"train_lr": 1.8383749698112992e-07, "train_loss": 0.4070662397742271, "epoch": 795} +{"train_lr": 1.1546467559359906e-07, "train_loss": 0.40707216830849646, "epoch": 796} +{"train_lr": 6.417720757569029e-08, "train_loss": 0.4070766533434391, "epoch": 797} +{"train_lr": 2.9975969288707755e-08, "train_loss": 0.4070338776230812, "epoch": 798} +{"train_lr": 1.2861545137461837e-08, "train_loss": 0.4070114720463753, "epoch": 799} diff --git a/CV/MAE/exp_results/MAE/large/log_large_ft.txt b/CV/MAE/exp_results/MAE/large/log_large_ft.txt new file mode 100644 index 0000000..209bfb4 --- /dev/null +++ b/CV/MAE/exp_results/MAE/large/log_large_ft.txt @@ -0,0 +1,50 @@ +{"train_lr": 0.0007476019200000001, "train_loss": 5.9094133159518245, "test_loss": 1.7714076134562493, "test_acc1": 61.33637235611582, "test_acc5": 84.77687142609177, "epoch": 0, "n_parameters": 304326632} +{"train_lr": 0.0022476019200000003, "train_loss": 4.501337738275528, "test_loss": 1.1959131537377834, "test_acc1": 72.27087332465598, "test_acc5": 91.66066860084875, "epoch": 1, "n_parameters": 304326632} +{"train_lr": 0.0037476019200000004, "train_loss": 4.119643689954281, "test_loss": 1.0854404755681752, "test_acc1": 75.52783110144804, "test_acc5": 93.39011516131733, "epoch": 2, "n_parameters": 304326632} +{"train_lr": 0.005247601920000002, "train_loss": 3.9008864871740343, "test_loss": 1.0289268112555146, "test_acc1": 76.92938261289896, "test_acc5": 94.09788868386092, "epoch": 3, "n_parameters": 304326632} +{"train_lr": 0.006747601919999998, "train_loss": 3.76051225707531, "test_loss": 0.9720380315184594, "test_acc1": 78.21497122713639, "test_acc5": 94.63371721293326, "epoch": 4, "n_parameters": 304326632} +{"train_lr": 0.00824760192, "train_loss": 3.651956864875555, "test_loss": 0.9415295435115695, "test_acc1": 78.97672746285214, "test_acc5": 95.09756876746584, "epoch": 5, "n_parameters": 304326632} +{"train_lr": 0.009747601920000001, "train_loss": 3.5677191224038602, "test_loss": 0.9388785093277693, "test_acc1": 79.57453616627957, "test_acc5": 95.29950415058465, "epoch": 6, "n_parameters": 304326632} +{"train_lr": 0.011247601919999997, "train_loss": 3.507449230492115, "test_loss": 0.9052619117870927, "test_acc1": 80.08437302847818, "test_acc5": 95.49944016815986, "epoch": 7, "n_parameters": 304326632} +{"train_lr": 0.012747601919999994, "train_loss": 3.4423172294437885, "test_loss": 0.8388488055765628, "test_acc1": 80.4342610673575, "test_acc5": 95.76935380052772, "epoch": 8, "n_parameters": 304326632} +{"train_lr": 0.014247601920000002, "train_loss": 3.3948125799477102, "test_loss": 0.8529021150618792, "test_acc1": 80.73616445743343, "test_acc5": 95.86732244598355, "epoch": 9, "n_parameters": 304326632} +{"train_lr": 0.01499233375709719, "train_loss": 3.342990658354759, "test_loss": 0.8151264287903905, "test_acc1": 81.03206976010719, "test_acc5": 95.96529109723585, "epoch": 10, "n_parameters": 304326632} +{"train_lr": 0.014946245730243689, "train_loss": 3.288912183743715, "test_loss": 0.8095201044529676, "test_acc1": 81.51191621381963, "test_acc5": 96.16522712243801, "epoch": 11, "n_parameters": 304326632} +{"train_lr": 0.01485427994899793, "train_loss": 3.238141927015781, 
"test_loss": 0.7871933653950691, "test_acc1": 82.07973450799821, "test_acc5": 96.36716250067556, "epoch": 12, "n_parameters": 304326632} +{"train_lr": 0.014717003412983015, "train_loss": 3.1956452232837678, "test_loss": 0.7688306730240584, "test_acc1": 82.2496801315022, "test_acc5": 96.52111323888074, "epoch": 13, "n_parameters": 304326632} +{"train_lr": 0.014535262477692571, "train_loss": 3.1652532088041307, "test_loss": 0.7522821754962206, "test_acc1": 82.66154833756725, "test_acc5": 96.58309339943104, "epoch": 14, "n_parameters": 304326632} +{"train_lr": 0.014310177636427614, "train_loss": 3.121457608240843, "test_loss": 0.7477796772867441, "test_acc1": 82.73952337029799, "test_acc5": 96.67906269169892, "epoch": 15, "n_parameters": 304326632} +{"train_lr": 0.014043136612082945, "train_loss": 3.0966577651739122, "test_loss": 0.753467806391418, "test_acc1": 82.9974408353359, "test_acc5": 96.78502878132953, "epoch": 16, "n_parameters": 304326632} +{"train_lr": 0.013735785801373714, "train_loss": 3.0689808761537076, "test_loss": 0.7341048694401979, "test_acc1": 83.14339413813727, "test_acc5": 96.79302621802991, "epoch": 17, "n_parameters": 304326632} +{"train_lr": 0.01339002012425247, "train_loss": 3.029768516147137, "test_loss": 0.725501059666276, "test_acc1": 83.34532951271389, "test_acc5": 96.81301982526358, "epoch": 18, "n_parameters": 304326632} +{"train_lr": 0.01300797134109743, "train_loss": 3.0120413874208927, "test_loss": 0.7309531949833036, "test_acc1": 83.50927706414586, "test_acc5": 97.00095968694924, "epoch": 19, "n_parameters": 304326632} +{"train_lr": 0.012591994909700855, "train_loss": 2.9821670488238334, "test_loss": 0.7118158831447363, "test_acc1": 83.61924186945724, "test_acc5": 97.01895393230026, "epoch": 20, "n_parameters": 304326632} +{"train_lr": 0.012144655463088535, "train_loss": 2.962305991309881, "test_loss": 0.7047568802535534, "test_acc1": 83.74520156128774, "test_acc5": 97.07493601513458, "epoch": 21, "n_parameters": 304326632} +{"train_lr": 0.011668710997704269, "train_loss": 2.938569626682997, "test_loss": 0.7103257965296507, "test_acc1": 83.9051503784292, "test_acc5": 97.10092768666078, "epoch": 22, "n_parameters": 304326632} +{"train_lr": 0.01116709586944475, "train_loss": 2.91352473244071, "test_loss": 0.7010805677436293, "test_acc1": 84.26103648877037, "test_acc5": 97.12492002376135, "epoch": 23, "n_parameters": 304326632} +{"train_lr": 0.010642902702379645, "train_loss": 2.8938853970646856, "test_loss": 0.692104572802782, "test_acc1": 84.34101090580701, "test_acc5": 97.2508797091852, "epoch": 24, "n_parameters": 304326632} +{"train_lr": 0.010099363321695844, "train_loss": 2.874984144228697, "test_loss": 0.6802691061235965, "test_acc1": 84.30902114603967, "test_acc5": 97.22488802759142, "epoch": 25, "n_parameters": 304326632} +{"train_lr": 0.009539828828420426, "train_loss": 2.852267661267519, "test_loss": 0.6850866706669331, "test_acc1": 84.41898594143599, "test_acc5": 97.29486562941827, "epoch": 26, "n_parameters": 304326632} +{"train_lr": 0.00896774893876856, "train_loss": 2.837763201504946, "test_loss": 0.6828102863952518, "test_acc1": 84.65091173876118, "test_acc5": 97.3268554027616, "epoch": 27, "n_parameters": 304326632} +{"train_lr": 0.008386650715495802, "train_loss": 2.81947190862298, "test_loss": 0.6762189302407206, "test_acc1": 84.7188899800782, "test_acc5": 97.34884836501368, "epoch": 28, "n_parameters": 304326632} +{"train_lr": 0.00780011682238341, "train_loss": 2.8003201848089696, "test_loss": 0.6725861196033657, "test_acc1": 
84.82285671179217, "test_acc5": 97.32285668235212, "epoch": 29, "n_parameters": 304326632} +{"train_lr": 0.007211763435924688, "train_loss": 2.7866385659873485, "test_loss": 0.671936163790524, "test_acc1": 84.95481448881304, "test_acc5": 97.38883556682028, "epoch": 30, "n_parameters": 304326632} +{"train_lr": 0.006625217950394574, "train_loss": 2.7746526652514936, "test_loss": 0.6678782022558153, "test_acc1": 84.89283432917799, "test_acc5": 97.4168266078561, "epoch": 31, "n_parameters": 304326632} +{"train_lr": 0.006044096613757472, "train_loss": 2.7576689450562, "test_loss": 0.6610171441733838, "test_acc1": 85.12675947130145, "test_acc5": 97.46681061770316, "epoch": 32, "n_parameters": 304326632} +{"train_lr": 0.00547198223229625, "train_loss": 2.7347684874773024, "test_loss": 0.6683760618418455, "test_acc1": 85.13675626942688, "test_acc5": 97.39683300702906, "epoch": 33, "n_parameters": 304326632} +{"train_lr": 0.004912402081419917, "train_loss": 2.723790532976389, "test_loss": 0.6556776543706655, "test_acc1": 85.26271595713884, "test_acc5": 97.47680741643875, "epoch": 34, "n_parameters": 304326632} +{"train_lr": 0.004368806158837928, "train_loss": 2.7088236126720906, "test_loss": 0.654360967874527, "test_acc1": 85.24072299839516, "test_acc5": 97.47280869541913, "epoch": 35, "n_parameters": 304326632} +{"train_lr": 0.003844545914176986, "train_loss": 2.694744017738104, "test_loss": 0.6538684133067727, "test_acc1": 85.33869164430858, "test_acc5": 97.53278950156115, "epoch": 36, "n_parameters": 304326632} +{"train_lr": 0.0033428535861796433, "train_loss": 2.6908254801392557, "test_loss": 0.6542927216365934, "test_acc1": 85.39467373049877, "test_acc5": 97.53878758217536, "epoch": 37, "n_parameters": 304326632} +{"train_lr": 0.002866822274877639, "train_loss": 2.671278304463625, "test_loss": 0.6524978142604232, "test_acc1": 85.49464174439643, "test_acc5": 97.49280229456822, "epoch": 38, "n_parameters": 304326632} +{"train_lr": 0.0024193868716016085, "train_loss": 2.657200170958042, "test_loss": 0.650126696806401, "test_acc1": 85.59660910492285, "test_acc5": 97.52879077825345, "epoch": 39, "n_parameters": 304326632} +{"train_lr": 0.0020033059644001382, "train_loss": 2.652334677708149, "test_loss": 0.6520910476334393, "test_acc1": 85.52263278619495, "test_acc5": 97.50279909849014, "epoch": 40, "n_parameters": 304326632} +{"train_lr": 0.001621144830427048, "train_loss": 2.6431411161601543, "test_loss": 0.647436778191477, "test_acc1": 85.6365963131361, "test_acc5": 97.54478566339972, "epoch": 41, "n_parameters": 304326632} +{"train_lr": 0.0012752596201547688, "train_loss": 2.637372990643978, "test_loss": 0.6462450991012156, "test_acc1": 85.61260398945898, "test_acc5": 97.54678502360446, "epoch": 42, "n_parameters": 304326632} +{"train_lr": 0.0009677828309231273, "train_loss": 2.6305615900933743, "test_loss": 0.6458461854793132, "test_acc1": 85.75455856750352, "test_acc5": 97.53878758278552, "epoch": 43, "n_parameters": 304326632} +{"train_lr": 0.0007006101593841485, "train_loss": 2.627352162593603, "test_loss": 0.6431183713674545, "test_acc1": 85.75455856231719, "test_acc5": 97.5627799058525, "epoch": 44, "n_parameters": 304326632} +{"train_lr": 0.0004753888139017931, "train_loss": 2.6245033386409284, "test_loss": 0.6450332224182784, "test_acc1": 85.80654192580981, "test_acc5": 97.56877798524638, "epoch": 45, "n_parameters": 304326632} +{"train_lr": 0.0002935073589646598, "train_loss": 2.6220774190187455, "test_loss": 0.6432638500258326, "test_acc1": 85.85252721585758, "test_acc5": 
97.56078054442744, "epoch": 46, "n_parameters": 304326632} +{"train_lr": 0.00015608715422415792, "train_loss": 2.611486408829689, "test_loss": 0.6422065225988627, "test_acc1": 85.82453617009305, "test_acc5": 97.57077734545112, "epoch": 47, "n_parameters": 304326632} +{"train_lr": 6.397544093936805e-05, "train_loss": 2.6108330062150955, "test_loss": 0.6433782994002104, "test_acc1": 85.822536808668, "test_acc5": 97.57677542606532, "epoch": 48, "n_parameters": 304326632} +{"train_lr": 1.7740118452942777e-05, "train_loss": 2.6155946560740473, "test_loss": 0.6427758732996881, "test_acc1": 85.822536808668, "test_acc5": 97.5807741464748, "epoch": 49, "n_parameters": 304326632} diff --git a/CV/MAE/exp_results/MAE/large/log_large_pretrain.txt b/CV/MAE/exp_results/MAE/large/log_large_pretrain.txt new file mode 100644 index 0000000..b3f4d30 --- /dev/null +++ b/CV/MAE/exp_results/MAE/large/log_large_pretrain.txt @@ -0,0 +1,801 @@ +{"train_lr": 1.3705929487179487e-05, "train_loss": 1.0373671979237444, "epoch": 0} +{"train_lr": 4.1205929487179494e-05, "train_loss": 0.8163748006873692, "epoch": 1} +{"train_lr": 6.870592948717947e-05, "train_loss": 0.7898846722196023, "epoch": 2} +{"train_lr": 9.62059294871795e-05, "train_loss": 0.7556995776995348, "epoch": 3} +{"train_lr": 0.00012370592948717955, "train_loss": 0.7204586103892862, "epoch": 4} +{"train_lr": 0.00015120592948717948, "train_loss": 0.6970280320025407, "epoch": 5} +{"train_lr": 0.0001787059294871795, "train_loss": 0.6892808590036554, "epoch": 6} +{"train_lr": 0.00020620592948717952, "train_loss": 0.6760739260412848, "epoch": 7} +{"train_lr": 0.0002337059294871796, "train_loss": 0.6467630795549411, "epoch": 8} +{"train_lr": 0.00026120592948717953, "train_loss": 0.6119912476577343, "epoch": 9} +{"train_lr": 0.0002887059294871795, "train_loss": 0.591552123773652, "epoch": 10} +{"train_lr": 0.0003162059294871794, "train_loss": 0.577067206100298, "epoch": 11} +{"train_lr": 0.0003437059294871795, "train_loss": 0.5598926345567004, "epoch": 12} +{"train_lr": 0.0003712059294871795, "train_loss": 0.5453465787502818, "epoch": 13} +{"train_lr": 0.0003987059294871796, "train_loss": 0.5339593999475861, "epoch": 14} +{"train_lr": 0.00042620592948717975, "train_loss": 0.5245551809173029, "epoch": 15} +{"train_lr": 0.0004537059294871794, "train_loss": 0.5173753621116376, "epoch": 16} +{"train_lr": 0.0004812059294871794, "train_loss": 0.5108209133554155, "epoch": 17} +{"train_lr": 0.0005087059294871794, "train_loss": 0.5050460415659472, "epoch": 18} +{"train_lr": 0.0005362059294871794, "train_loss": 0.5001554909842805, "epoch": 19} +{"train_lr": 0.0005637059294871797, "train_loss": 0.4958586446188677, "epoch": 20} +{"train_lr": 0.0005912059294871796, "train_loss": 0.4919821908757186, "epoch": 21} +{"train_lr": 0.0006187059294871795, "train_loss": 0.4885007034903631, "epoch": 22} +{"train_lr": 0.0006462059294871793, "train_loss": 0.48533707800715303, "epoch": 23} +{"train_lr": 0.0006737059294871794, "train_loss": 0.48238299978682053, "epoch": 24} +{"train_lr": 0.0007012059294871796, "train_loss": 0.4795845612280596, "epoch": 25} +{"train_lr": 0.0007287059294871798, "train_loss": 0.47709798404815584, "epoch": 26} +{"train_lr": 0.0007562059294871797, "train_loss": 0.4748512744031942, "epoch": 27} +{"train_lr": 0.0007837059294871795, "train_loss": 0.4727442269362748, "epoch": 28} +{"train_lr": 0.0008112059294871793, "train_loss": 0.4707687391201034, "epoch": 29} +{"train_lr": 0.0008387059294871796, "train_loss": 0.46924415775216544, "epoch": 30} +{"train_lr": 
0.0008662059294871798, "train_loss": 0.46733421087265015, "epoch": 31} +{"train_lr": 0.0008937059294871797, "train_loss": 0.46589025970584214, "epoch": 32} +{"train_lr": 0.0009212059294871793, "train_loss": 0.46424756009871954, "epoch": 33} +{"train_lr": 0.0009487059294871794, "train_loss": 0.46277184823814493, "epoch": 34} +{"train_lr": 0.0009762059294871795, "train_loss": 0.4613388040377639, "epoch": 35} +{"train_lr": 0.0010037059294871799, "train_loss": 0.4599568355494203, "epoch": 36} +{"train_lr": 0.0010312059294871796, "train_loss": 0.4587650656347903, "epoch": 37} +{"train_lr": 0.0010587059294871793, "train_loss": 0.4574355971671116, "epoch": 38} +{"train_lr": 0.0010862059294871797, "train_loss": 0.4563278357904309, "epoch": 39} +{"train_lr": 0.0011137059294871793, "train_loss": 0.45536527213437533, "epoch": 40} +{"train_lr": 0.0011412059294871797, "train_loss": 0.4542963448494004, "epoch": 41} +{"train_lr": 0.00116870592948718, "train_loss": 0.45362750644115013, "epoch": 42} +{"train_lr": 0.0011962059294871796, "train_loss": 0.4524291767201458, "epoch": 43} +{"train_lr": 0.0012237059294871793, "train_loss": 0.4514670613138244, "epoch": 44} +{"train_lr": 0.0012512059294871795, "train_loss": 0.45056493411986875, "epoch": 45} +{"train_lr": 0.0012787059294871797, "train_loss": 0.4497934035622539, "epoch": 46} +{"train_lr": 0.0013062059294871792, "train_loss": 0.448925889061334, "epoch": 47} +{"train_lr": 0.0013337059294871796, "train_loss": 0.4482692235943455, "epoch": 48} +{"train_lr": 0.0013612059294871794, "train_loss": 0.4476346656656227, "epoch": 49} +{"train_lr": 0.0013887059294871796, "train_loss": 0.44682418112643063, "epoch": 50} +{"train_lr": 0.0014162059294871793, "train_loss": 0.44604185191042817, "epoch": 51} +{"train_lr": 0.0014437059294871795, "train_loss": 0.445422636401744, "epoch": 52} +{"train_lr": 0.0014712059294871795, "train_loss": 0.444676601423476, "epoch": 53} +{"train_lr": 0.0014987059294871796, "train_loss": 0.44410661035456145, "epoch": 54} +{"train_lr": 0.0015262059294871796, "train_loss": 0.4435697843780359, "epoch": 55} +{"train_lr": 0.0015537059294871798, "train_loss": 0.4429366707067507, "epoch": 56} +{"train_lr": 0.0015812059294871795, "train_loss": 0.4423868660158358, "epoch": 57} +{"train_lr": 0.0016087059294871797, "train_loss": 0.4419644352179976, "epoch": 58} +{"train_lr": 0.0016362059294871794, "train_loss": 0.4413508808550735, "epoch": 59} +{"train_lr": 0.0016637059294871796, "train_loss": 0.4408118412292634, "epoch": 60} +{"train_lr": 0.0016912059294871796, "train_loss": 0.44038724107369304, "epoch": 61} +{"train_lr": 0.0017187059294871791, "train_loss": 0.43994122875543934, "epoch": 62} +{"train_lr": 0.001746205929487179, "train_loss": 0.4393991921783592, "epoch": 63} +{"train_lr": 0.0017737059294871797, "train_loss": 0.43902113603857845, "epoch": 64} +{"train_lr": 0.0018012059294871797, "train_loss": 0.43856765599384046, "epoch": 65} +{"train_lr": 0.0018287059294871792, "train_loss": 0.43815263809982496, "epoch": 66} +{"train_lr": 0.0018562059294871796, "train_loss": 0.4385444735302232, "epoch": 67} +{"train_lr": 0.00188370592948718, "train_loss": 0.4378033945253358, "epoch": 68} +{"train_lr": 0.0019112059294871802, "train_loss": 0.4373112537342912, "epoch": 69} +{"train_lr": 0.0019387059294871795, "train_loss": 0.4368607692796594, "epoch": 70} +{"train_lr": 0.0019662059294871794, "train_loss": 0.43645675210521007, "epoch": 71} +{"train_lr": 0.0019937059294871796, "train_loss": 0.43612490208126986, "epoch": 72} +{"train_lr": 
0.002021205929487179, "train_loss": 0.4361463249929679, "epoch": 73} +{"train_lr": 0.0020487059294871796, "train_loss": 0.4355239907506471, "epoch": 74} +{"train_lr": 0.0020762059294871793, "train_loss": 0.4350612056381905, "epoch": 75} +{"train_lr": 0.0021037059294871795, "train_loss": 0.4349484308055626, "epoch": 76} +{"train_lr": 0.0021312059294871797, "train_loss": 0.4347404240606687, "epoch": 77} +{"train_lr": 0.002158705929487179, "train_loss": 0.4341731424807595, "epoch": 78} +{"train_lr": 0.002186205929487179, "train_loss": 0.4338370120415512, "epoch": 79} +{"train_lr": 0.002199996684251048, "train_loss": 0.4335061630460983, "epoch": 80} +{"train_lr": 0.002199976725863753, "train_loss": 0.4331624563275956, "epoch": 81} +{"train_lr": 0.0021999367774331083, "train_loss": 0.43274621699208343, "epoch": 82} +{"train_lr": 0.002199876839719668, "train_loss": 0.43239927467985606, "epoch": 83} +{"train_lr": 0.002199796913864568, "train_loss": 0.4320713832753543, "epoch": 84} +{"train_lr": 0.002199697001389479, "train_loss": 0.43173469559779054, "epoch": 85} +{"train_lr": 0.002199577104196586, "train_loss": 0.4313527816631951, "epoch": 86} +{"train_lr": 0.0021994372245685645, "train_loss": 0.4310783812035926, "epoch": 87} +{"train_lr": 0.0021992773651685147, "train_loss": 0.4307854121688228, "epoch": 88} +{"train_lr": 0.002199097529039938, "train_loss": 0.43050752383155316, "epoch": 89} +{"train_lr": 0.002198897719606647, "train_loss": 0.43016116209149075, "epoch": 90} +{"train_lr": 0.0021986779406727294, "train_loss": 0.4297766865093786, "epoch": 91} +{"train_lr": 0.0021984381964224556, "train_loss": 0.4297368807676368, "epoch": 92} +{"train_lr": 0.0021981784914202134, "train_loss": 0.4293370681683509, "epoch": 93} +{"train_lr": 0.0021978988306104136, "train_loss": 0.42910005276700336, "epoch": 94} +{"train_lr": 0.0021975992193173943, "train_loss": 0.42879289225973666, "epoch": 95} +{"train_lr": 0.0021972796632453166, "train_loss": 0.42852139729970634, "epoch": 96} +{"train_lr": 0.0021969401684780723, "train_loss": 0.4283285259287088, "epoch": 97} +{"train_lr": 0.0021965807414791516, "train_loss": 0.42814426130065936, "epoch": 98} +{"train_lr": 0.0021962013890915295, "train_loss": 0.427771221443366, "epoch": 99} +{"train_lr": 0.002195802118537524, "train_loss": 0.42763554915272367, "epoch": 100} +{"train_lr": 0.0021953829374186744, "train_loss": 0.42753871774766594, "epoch": 101} +{"train_lr": 0.002194943853715583, "train_loss": 0.42720191766364646, "epoch": 102} +{"train_lr": 0.002194484875787771, "train_loss": 0.427074841169927, "epoch": 103} +{"train_lr": 0.0021940060123735164, "train_loss": 0.4268490781929965, "epoch": 104} +{"train_lr": 0.0021935072725896877, "train_loss": 0.42664367280518395, "epoch": 105} +{"train_lr": 0.0021929886659315715, "train_loss": 0.4263747133887731, "epoch": 106} +{"train_lr": 0.0021924502022726967, "train_loss": 0.42692178616431564, "epoch": 107} +{"train_lr": 0.0021918918918646256, "train_loss": 0.42619890832186985, "epoch": 108} +{"train_lr": 0.0021913137453367865, "train_loss": 0.4259361302724872, "epoch": 109} +{"train_lr": 0.0021907157736962605, "train_loss": 0.425667358828016, "epoch": 110} +{"train_lr": 0.0021900979883275615, "train_loss": 0.42560386141905415, "epoch": 111} +{"train_lr": 0.0021894604009924366, "train_loss": 0.4254387913253875, "epoch": 112} +{"train_lr": 0.0021888030238296262, "train_loss": 0.4252314920602844, "epoch": 113} +{"train_lr": 0.0021881258693546408, "train_loss": 0.4251124603518595, "epoch": 114} +{"train_lr": 
0.0021874289504595305, "train_loss": 0.4249423658146929, "epoch": 115} +{"train_lr": 0.00218671228041263, "train_loss": 0.42477498546791953, "epoch": 116} +{"train_lr": 0.0021859758728582953, "train_loss": 0.42456992419185835, "epoch": 117} +{"train_lr": 0.0021852197418166675, "train_loss": 0.4244430363805105, "epoch": 118} +{"train_lr": 0.0021844439016833928, "train_loss": 0.4242876783484975, "epoch": 119} +{"train_lr": 0.0021836483672293488, "train_loss": 0.42407468885171395, "epoch": 120} +{"train_lr": 0.0021828331536003654, "train_loss": 0.4240114364790945, "epoch": 121} +{"train_lr": 0.0021819982763169312, "train_loss": 0.423764265790128, "epoch": 122} +{"train_lr": 0.0021811437512739154, "train_loss": 0.42374736052125883, "epoch": 123} +{"train_lr": 0.0021802695947402357, "train_loss": 0.4245943425862023, "epoch": 124} +{"train_lr": 0.0021793758233585704, "train_loss": 0.42387092797658765, "epoch": 125} +{"train_lr": 0.002178462454145044, "train_loss": 0.4235223626288084, "epoch": 126} +{"train_lr": 0.0021775295044888857, "train_loss": 0.4233784673269838, "epoch": 127} +{"train_lr": 0.002176576992152116, "train_loss": 0.4231366920732678, "epoch": 128} +{"train_lr": 0.0021756049352691944, "train_loss": 0.4230839658337526, "epoch": 129} +{"train_lr": 0.002174613352346683, "train_loss": 0.4228322916807464, "epoch": 130} +{"train_lr": 0.002173602262262889, "train_loss": 0.4232935921396487, "epoch": 131} +{"train_lr": 0.0021725716842675145, "train_loss": 0.4229748474720579, "epoch": 132} +{"train_lr": 0.0021715216379812764, "train_loss": 0.42267333947790736, "epoch": 133} +{"train_lr": 0.0021704521433955426, "train_loss": 0.422408729346875, "epoch": 134} +{"train_lr": 0.0021693632208719493, "train_loss": 0.4223583231841286, "epoch": 135} +{"train_lr": 0.002168254891142009, "train_loss": 0.42228768690704155, "epoch": 136} +{"train_lr": 0.002167127175306729, "train_loss": 0.4221280742621718, "epoch": 137} +{"train_lr": 0.002165980094836185, "train_loss": 0.4220910801444776, "epoch": 138} +{"train_lr": 0.002164813671569137, "train_loss": 0.42186729532654565, "epoch": 139} +{"train_lr": 0.002163627927712607, "train_loss": 0.421843397601221, "epoch": 140} +{"train_lr": 0.0021624228858414477, "train_loss": 0.4217382113908967, "epoch": 141} +{"train_lr": 0.0021611985688979166, "train_loss": 0.4215304550732701, "epoch": 142} +{"train_lr": 0.0021599550001912458, "train_loss": 0.42138542972194654, "epoch": 143} +{"train_lr": 0.0021586922033971913, "train_loss": 0.4213122255377806, "epoch": 144} +{"train_lr": 0.002157410202557581, "train_loss": 0.421302269359167, "epoch": 145} +{"train_lr": 0.002156109022079862, "train_loss": 0.42114945856006575, "epoch": 146} +{"train_lr": 0.0021547886867366393, "train_loss": 0.42119269031517875, "epoch": 147} +{"train_lr": 0.0021534492216651966, "train_loss": 0.420911968464796, "epoch": 148} +{"train_lr": 0.0021520906523670095, "train_loss": 0.4209221635097399, "epoch": 149} +{"train_lr": 0.0021507130047072865, "train_loss": 0.420738841407001, "epoch": 150} +{"train_lr": 0.00214931630491445, "train_loss": 0.4208859864932795, "epoch": 151} +{"train_lr": 0.0021479005795796537, "train_loss": 0.42067305678621125, "epoch": 152} +{"train_lr": 0.00214646585565626, "train_loss": 0.42045262417732143, "epoch": 153} +{"train_lr": 0.0021450121604593515, "train_loss": 0.4205632483473239, "epoch": 154} +{"train_lr": 0.002143539521665188, "train_loss": 0.4204747405094214, "epoch": 155} +{"train_lr": 0.002142047967310689, "train_loss": 0.42016615892927617, "epoch": 156} 
+{"train_lr": 0.002140537525792898, "train_loss": 0.420656357312766, "epoch": 157} +{"train_lr": 0.002139008225868444, "train_loss": 0.42010926239741725, "epoch": 158} +{"train_lr": 0.002137460096652994, "train_loss": 0.4202159770215169, "epoch": 159} +{"train_lr": 0.0021358931676206975, "train_loss": 0.42006659239996225, "epoch": 160} +{"train_lr": 0.0021343074686036253, "train_loss": 0.42007251099969906, "epoch": 161} +{"train_lr": 0.002132703029791194, "train_loss": 0.41983775014523417, "epoch": 162} +{"train_lr": 0.0021310798817296174, "train_loss": 0.41982228738458777, "epoch": 163} +{"train_lr": 0.002129438055321287, "train_loss": 0.41973125434611946, "epoch": 164} +{"train_lr": 0.0021277775818242138, "train_loss": 0.4196543410265197, "epoch": 165} +{"train_lr": 0.002126098492851418, "train_loss": 0.419556195089498, "epoch": 166} +{"train_lr": 0.0021244008203703327, "train_loss": 0.41950905472875977, "epoch": 167} +{"train_lr": 0.0021226845967021965, "train_loss": 0.41944392684262055, "epoch": 168} +{"train_lr": 0.0021209498545214367, "train_loss": 0.4193810497541936, "epoch": 169} +{"train_lr": 0.00211919662685504, "train_loss": 0.4192442288831211, "epoch": 170} +{"train_lr": 0.0021174249470819317, "train_loss": 0.4192189235234251, "epoch": 171} +{"train_lr": 0.002115634848932345, "train_loss": 0.4191800984374892, "epoch": 172} +{"train_lr": 0.0021138263664871684, "train_loss": 0.4190960426050692, "epoch": 173} +{"train_lr": 0.0021119995341772973, "train_loss": 0.4190245867611315, "epoch": 174} +{"train_lr": 0.0021101543867829906, "train_loss": 0.4189993502596059, "epoch": 175} +{"train_lr": 0.0021082909594331923, "train_loss": 0.4187265991245229, "epoch": 176} +{"train_lr": 0.0021064092876048723, "train_loss": 0.4187703061824999, "epoch": 177} +{"train_lr": 0.0021045094071223494, "train_loss": 0.4187772857914798, "epoch": 178} +{"train_lr": 0.0021025913541566133, "train_loss": 0.4186587428453211, "epoch": 179} +{"train_lr": 0.0021006551652246208, "train_loss": 0.41868259694176513, "epoch": 180} +{"train_lr": 0.0020987008771886275, "train_loss": 0.41851956458487666, "epoch": 181} +{"train_lr": 0.0020967285272554524, "train_loss": 0.41851956319983286, "epoch": 182} +{"train_lr": 0.0020947381529758, "train_loss": 0.4204516486169245, "epoch": 183} +{"train_lr": 0.002092729792243523, "train_loss": 0.41863377986308664, "epoch": 184} +{"train_lr": 0.0020907034832949195, "train_loss": 0.4185516889451836, "epoch": 185} +{"train_lr": 0.0020886592647079852, "train_loss": 0.41843239219787604, "epoch": 186} +{"train_lr": 0.0020865971754017044, "train_loss": 0.4183382890545405, "epoch": 187} +{"train_lr": 0.002084517254635278, "train_loss": 0.4181868398400883, "epoch": 188} +{"train_lr": 0.0020824195420073976, "train_loss": 0.4181370502934815, "epoch": 189} +{"train_lr": 0.0020803040774554945, "train_loss": 0.418083170022911, "epoch": 190} +{"train_lr": 0.0020781709012549616, "train_loss": 0.4179465582904716, "epoch": 191} +{"train_lr": 0.0020760200540183996, "train_loss": 0.4180231848373436, "epoch": 192} +{"train_lr": 0.0020738515766948354, "train_loss": 0.4179109181247604, "epoch": 193} +{"train_lr": 0.002071665510568953, "train_loss": 0.41795996805199254, "epoch": 194} +{"train_lr": 0.0020694618972603037, "train_loss": 0.4180147959343277, "epoch": 195} +{"train_lr": 0.002067240778722506, "train_loss": 0.4179162073802824, "epoch": 196} +{"train_lr": 0.0020650021972424553, "train_loss": 0.41772405684698755, "epoch": 197} +{"train_lr": 0.002062746195439519, "train_loss": 0.4176427618009396, 
"epoch": 198} +{"train_lr": 0.002060472816264713, "train_loss": 0.4175360346457754, "epoch": 199} +{"train_lr": 0.002058182102999905, "train_loss": 0.4175078834598072, "epoch": 200} +{"train_lr": 0.002055874099256973, "train_loss": 0.4175237088959712, "epoch": 201} +{"train_lr": 0.0020535488489769813, "train_loss": 0.41736443084963143, "epoch": 202} +{"train_lr": 0.0020512063964293406, "train_loss": 0.41743514193401027, "epoch": 203} +{"train_lr": 0.0020488467862109726, "train_loss": 0.41724888548457945, "epoch": 204} +{"train_lr": 0.0020464700632454582, "train_loss": 0.4175108184482759, "epoch": 205} +{"train_lr": 0.0020440762727821694, "train_loss": 0.41730758319728267, "epoch": 206} +{"train_lr": 0.002041665460395431, "train_loss": 0.4171531482014614, "epoch": 207} +{"train_lr": 0.002039237671983636, "train_loss": 0.4171176812856291, "epoch": 208} +{"train_lr": 0.002036792953768375, "train_loss": 0.4171120857533354, "epoch": 209} +{"train_lr": 0.002034331352293559, "train_loss": 0.41690345452680516, "epoch": 210} +{"train_lr": 0.0020318529144245315, "train_loss": 0.4169688862360393, "epoch": 211} +{"train_lr": 0.0020293576873471747, "train_loss": 0.4169356456110015, "epoch": 212} +{"train_lr": 0.0020268457185670195, "train_loss": 0.41681159019935876, "epoch": 213} +{"train_lr": 0.002024317055908329, "train_loss": 0.41887578073566634, "epoch": 214} +{"train_lr": 0.0020217717475131958, "train_loss": 0.41701536552789503, "epoch": 215} +{"train_lr": 0.0020192098418406177, "train_loss": 0.4168762993873455, "epoch": 216} +{"train_lr": 0.0020166313876655924, "train_loss": 0.41685889998319536, "epoch": 217} +{"train_lr": 0.002014036434078168, "train_loss": 0.4166946614113374, "epoch": 218} +{"train_lr": 0.0020114250304825213, "train_loss": 0.41653254462812, "epoch": 219} +{"train_lr": 0.002008797226596011, "train_loss": 0.4166686459360883, "epoch": 220} +{"train_lr": 0.0020061530724482363, "train_loss": 0.4163997006208564, "epoch": 221} +{"train_lr": 0.00200349261838008, "train_loss": 0.4164510855021385, "epoch": 222} +{"train_lr": 0.0020008159150427538, "train_loss": 0.4164321756629178, "epoch": 223} +{"train_lr": 0.0019981230133968306, "train_loss": 0.4163135129253929, "epoch": 224} +{"train_lr": 0.0019954139647112732, "train_loss": 0.41683639441198933, "epoch": 225} +{"train_lr": 0.001992688820562465, "train_loss": 0.4164539910763359, "epoch": 226} +{"train_lr": 0.0019899476328332256, "train_loss": 0.4162949403353895, "epoch": 227} +{"train_lr": 0.001987190453711815, "train_loss": 0.41620151580979997, "epoch": 228} +{"train_lr": 0.0019844173356909473, "train_loss": 0.4163096353907186, "epoch": 229} +{"train_lr": 0.0019816283315667966, "train_loss": 0.4160788968706933, "epoch": 230} +{"train_lr": 0.001978823494437979, "train_loss": 0.4160326524124218, "epoch": 231} +{"train_lr": 0.001976002877704551, "train_loss": 0.4159600309108217, "epoch": 232} +{"train_lr": 0.00197316653506699, "train_loss": 0.415864022105383, "epoch": 233} +{"train_lr": 0.001970314520525169, "train_loss": 0.4159824102579688, "epoch": 234} +{"train_lr": 0.0019674468883773347, "train_loss": 0.4159344154505584, "epoch": 235} +{"train_lr": 0.0019645636932190706, "train_loss": 0.41571531779108906, "epoch": 236} +{"train_lr": 0.0019616649899422568, "train_loss": 0.41572490451523125, "epoch": 237} +{"train_lr": 0.0019587508337340223, "train_loss": 0.41574745898385745, "epoch": 238} +{"train_lr": 0.0019558212800757026, "train_loss": 0.41571679237322545, "epoch": 239} +{"train_lr": 0.0019528763847417802, "train_loss": 
0.41566707138330317, "epoch": 240} +{"train_lr": 0.0019499162037988121, "train_loss": 0.4154396512730716, "epoch": 241} +{"train_lr": 0.001946940793604378, "train_loss": 0.4155334876641297, "epoch": 242} +{"train_lr": 0.0019439502108059982, "train_loss": 0.4154881267683007, "epoch": 243} +{"train_lr": 0.0019409445123400604, "train_loss": 0.4154404973467955, "epoch": 244} +{"train_lr": 0.0019379237554307278, "train_loss": 0.41536261062794483, "epoch": 245} +{"train_lr": 0.001934887997588859, "train_loss": 0.4153592953733049, "epoch": 246} +{"train_lr": 0.0019318372966109106, "train_loss": 0.4152674611586218, "epoch": 247} +{"train_lr": 0.0019287717105778263, "train_loss": 0.4153060437783074, "epoch": 248} +{"train_lr": 0.0019256912978539496, "train_loss": 0.4151951908521975, "epoch": 249} +{"train_lr": 0.0019225961170858967, "train_loss": 0.41521607185355747, "epoch": 250} +{"train_lr": 0.0019194862272014467, "train_loss": 0.4151945383920788, "epoch": 251} +{"train_lr": 0.001916361687408424, "train_loss": 0.415128694429325, "epoch": 252} +{"train_lr": 0.0019132225571935563, "train_loss": 0.4152043274042603, "epoch": 253} +{"train_lr": 0.0019100688963213624, "train_loss": 0.41502150900375384, "epoch": 254} +{"train_lr": 0.0019069007648329988, "train_loss": 0.414975148676417, "epoch": 255} +{"train_lr": 0.0019037182230451216, "train_loss": 0.41492671117437285, "epoch": 256} +{"train_lr": 0.0019005213315487395, "train_loss": 0.414867433364121, "epoch": 257} +{"train_lr": 0.0018973101512080564, "train_loss": 0.41478415592334783, "epoch": 258} +{"train_lr": 0.0018940847431593185, "train_loss": 0.41493314027320594, "epoch": 259} +{"train_lr": 0.0018908451688096474, "train_loss": 0.4148071248932049, "epoch": 260} +{"train_lr": 0.001887591489835866, "train_loss": 0.4147998127692307, "epoch": 261} +{"train_lr": 0.0018843237681833364, "train_loss": 0.4147368201997895, "epoch": 262} +{"train_lr": 0.0018810420660647636, "train_loss": 0.41473585022434306, "epoch": 263} +{"train_lr": 0.0018777464459590254, "train_loss": 0.41463008197322965, "epoch": 264} +{"train_lr": 0.0018744369706099827, "train_loss": 0.4145144334791276, "epoch": 265} +{"train_lr": 0.0018711137030252738, "train_loss": 0.4145721539299792, "epoch": 266} +{"train_lr": 0.0018677767064751189, "train_loss": 0.4146482014688305, "epoch": 267} +{"train_lr": 0.0018644260444911289, "train_loss": 0.4145143874652254, "epoch": 268} +{"train_lr": 0.001861061780865072, "train_loss": 0.41513582520509285, "epoch": 269} +{"train_lr": 0.001857683979647683, "train_loss": 0.4145510424561321, "epoch": 270} +{"train_lr": 0.0018542927051474255, "train_loss": 0.4143607781405967, "epoch": 271} +{"train_lr": 0.0018508880219292774, "train_loss": 0.4143475139519582, "epoch": 272} +{"train_lr": 0.0018474699948134992, "train_loss": 0.4142985286591097, "epoch": 273} +{"train_lr": 0.001844038688874402, "train_loss": 0.4142709164969766, "epoch": 274} +{"train_lr": 0.0018405941694391048, "train_loss": 0.4141980246312391, "epoch": 275} +{"train_lr": 0.0018371365020862912, "train_loss": 0.4141822525670227, "epoch": 276} +{"train_lr": 0.0018336657526449639, "train_loss": 0.4141156807667218, "epoch": 277} +{"train_lr": 0.0018301819871931874, "train_loss": 0.41410747529246295, "epoch": 278} +{"train_lr": 0.0018266852720568382, "train_loss": 0.41405237226699215, "epoch": 279} +{"train_lr": 0.0018231756738083295, "train_loss": 0.41394138641249484, "epoch": 280} +{"train_lr": 0.0018196532592653519, "train_loss": 0.4139768362093048, "epoch": 281} +{"train_lr": 
0.0018161180954896032, "train_loss": 0.41397458218778366, "epoch": 282} +{"train_lr": 0.0018125702497855084, "train_loss": 0.41386487913461256, "epoch": 283} +{"train_lr": 0.0018090097896989272, "train_loss": 0.4139637210054132, "epoch": 284} +{"train_lr": 0.0018054367830158936, "train_loss": 0.41381767440515643, "epoch": 285} +{"train_lr": 0.0018018512977613032, "train_loss": 0.4137953261170202, "epoch": 286} +{"train_lr": 0.0017982534021976266, "train_loss": 0.41396266024094075, "epoch": 287} +{"train_lr": 0.0017946431648236123, "train_loss": 0.4137683888085378, "epoch": 288} +{"train_lr": 0.00179102065437297, "train_loss": 0.41367859775737786, "epoch": 289} +{"train_lr": 0.0017873859398130803, "train_loss": 0.4136714078157615, "epoch": 290} +{"train_lr": 0.0017837390903436671, "train_loss": 0.4135731390247551, "epoch": 291} +{"train_lr": 0.0017800801753954888, "train_loss": 0.4136636662834252, "epoch": 292} +{"train_lr": 0.0017764092646290154, "train_loss": 0.41356596955432534, "epoch": 293} +{"train_lr": 0.0017727264279330912, "train_loss": 0.4135198926714321, "epoch": 294} +{"train_lr": 0.0017690317354236186, "train_loss": 0.4135080915219031, "epoch": 295} +{"train_lr": 0.0017653252574422209, "train_loss": 0.41334341784031725, "epoch": 296} +{"train_lr": 0.001761607064554894, "train_loss": 0.41334728990645647, "epoch": 297} +{"train_lr": 0.0017578772275506705, "train_loss": 0.41330422608193773, "epoch": 298} +{"train_lr": 0.0017541358174402676, "train_loss": 0.41324203011866373, "epoch": 299} +{"train_lr": 0.0017503829054547454, "train_loss": 0.4132311620110741, "epoch": 300} +{"train_lr": 0.0017466185630441384, "train_loss": 0.4132360761197141, "epoch": 301} +{"train_lr": 0.0017428428618760945, "train_loss": 0.4132467000745237, "epoch": 302} +{"train_lr": 0.0017390558738345284, "train_loss": 0.41321784359677577, "epoch": 303} +{"train_lr": 0.00173525767101823, "train_loss": 0.4131651461745302, "epoch": 304} +{"train_lr": 0.001731448325739506, "train_loss": 0.4131434768384609, "epoch": 305} +{"train_lr": 0.0017276279105227959, "train_loss": 0.41309410180801, "epoch": 306} +{"train_lr": 0.0017237964981033048, "train_loss": 0.41297986341986614, "epoch": 307} +{"train_lr": 0.0017199541614255998, "train_loss": 0.4128857530438556, "epoch": 308} +{"train_lr": 0.001716100973642235, "train_loss": 0.413681590425161, "epoch": 309} +{"train_lr": 0.001712237008112346, "train_loss": 0.4130775839275418, "epoch": 310} +{"train_lr": 0.00170836233840027, "train_loss": 0.4129094842564649, "epoch": 311} +{"train_lr": 0.0017044770382741352, "train_loss": 0.4129503610400626, "epoch": 312} +{"train_lr": 0.001700581181704449, "train_loss": 0.41287012439734566, "epoch": 313} +{"train_lr": 0.00169667484286271, "train_loss": 0.41280730115548253, "epoch": 314} +{"train_lr": 0.001692758096119979, "train_loss": 0.41272441369409746, "epoch": 315} +{"train_lr": 0.0016888310160454662, "train_loss": 0.41262197143768364, "epoch": 316} +{"train_lr": 0.0016848936774051166, "train_loss": 0.4126468203926029, "epoch": 317} +{"train_lr": 0.0016809461551601822, "train_loss": 0.41255873617513156, "epoch": 318} +{"train_lr": 0.0016769885244657956, "train_loss": 0.412636836590723, "epoch": 319} +{"train_lr": 0.0016730208606695412, "train_loss": 0.412544735134221, "epoch": 320} +{"train_lr": 0.001669043239310017, "train_loss": 0.41250999425000584, "epoch": 321} +{"train_lr": 0.0016650557361153995, "train_loss": 0.41247300328448033, "epoch": 322} +{"train_lr": 0.0016610584270020066, "train_loss": 0.41246316529129845, "epoch": 
323} +{"train_lr": 0.0016570513880728383, "train_loss": 0.41240897299184537, "epoch": 324} +{"train_lr": 0.0016530346956161383, "train_loss": 0.4123140548934969, "epoch": 325} +{"train_lr": 0.0016490084261039418, "train_loss": 0.4124350955274004, "epoch": 326} +{"train_lr": 0.0016449726561906196, "train_loss": 0.4123261533050726, "epoch": 327} +{"train_lr": 0.0016409274627114101, "train_loss": 0.4122270453775015, "epoch": 328} +{"train_lr": 0.0016368729226809665, "train_loss": 0.4122691912463126, "epoch": 329} +{"train_lr": 0.001632809113291888, "train_loss": 0.41216078330165684, "epoch": 330} +{"train_lr": 0.0016287361119132467, "train_loss": 0.4120873586029913, "epoch": 331} +{"train_lr": 0.0016246539960891194, "train_loss": 0.41207945707122773, "epoch": 332} +{"train_lr": 0.001620562843537104, "train_loss": 0.4120459109634304, "epoch": 333} +{"train_lr": 0.0016164627321468496, "train_loss": 0.41213970735239297, "epoch": 334} +{"train_lr": 0.001612353739978566, "train_loss": 0.41200300023699993, "epoch": 335} +{"train_lr": 0.0016082359452615441, "train_loss": 0.4119489185410576, "epoch": 336} +{"train_lr": 0.0016041094263926547, "train_loss": 0.41217702930458844, "epoch": 337} +{"train_lr": 0.0015999742619348728, "train_loss": 0.41226495031971866, "epoch": 338} +{"train_lr": 0.0015958305306157678, "train_loss": 0.4119313608389348, "epoch": 339} +{"train_lr": 0.001591678311326011, "train_loss": 0.41189253890218264, "epoch": 340} +{"train_lr": 0.0015875176831178716, "train_loss": 0.4119056661386425, "epoch": 341} +{"train_lr": 0.0015833487252037124, "train_loss": 0.4116876139395082, "epoch": 342} +{"train_lr": 0.0015791715169544858, "train_loss": 0.4117441733684152, "epoch": 343} +{"train_lr": 0.0015749861378982126, "train_loss": 0.4116757833727229, "epoch": 344} +{"train_lr": 0.0015707926677184783, "train_loss": 0.4117640209718583, "epoch": 345} +{"train_lr": 0.0015665911862529113, "train_loss": 0.41165625816509605, "epoch": 346} +{"train_lr": 0.001562381773491659, "train_loss": 0.41153176439006645, "epoch": 347} +{"train_lr": 0.0015581645095758788, "train_loss": 0.41154465329451245, "epoch": 348} +{"train_lr": 0.0015539394747961911, "train_loss": 0.4115032056907717, "epoch": 349} +{"train_lr": 0.0015497067495911672, "train_loss": 0.41149046644568443, "epoch": 350} +{"train_lr": 0.0015454664145457997, "train_loss": 0.4113970931279115, "epoch": 351} +{"train_lr": 0.0015412185503899496, "train_loss": 0.41137130759381807, "epoch": 352} +{"train_lr": 0.0015369632379968283, "train_loss": 0.41134841808189565, "epoch": 353} +{"train_lr": 0.0015327005583814536, "train_loss": 0.41121035462352806, "epoch": 354} +{"train_lr": 0.0015284305926990987, "train_loss": 0.4112332000886687, "epoch": 355} +{"train_lr": 0.0015241534222437516, "train_loss": 0.4111234651108344, "epoch": 356} +{"train_lr": 0.0015198691284465764, "train_loss": 0.41117768977971697, "epoch": 357} +{"train_lr": 0.0015155777928743523, "train_loss": 0.41120130031739766, "epoch": 358} +{"train_lr": 0.0015112794972279191, "train_loss": 0.4111418041120021, "epoch": 359} +{"train_lr": 0.0015069743233406332, "train_loss": 0.41106258124674266, "epoch": 360} +{"train_lr": 0.0015026623531767976, "train_loss": 0.411072268562678, "epoch": 361} +{"train_lr": 0.0014983436688301081, "train_loss": 0.41097060625608534, "epoch": 362} +{"train_lr": 0.001494018352522093, "train_loss": 0.4110512023206609, "epoch": 363} +{"train_lr": 0.001489686486600536, "train_loss": 0.4109220164260851, "epoch": 364} +{"train_lr": 0.001485348153537923, "train_loss": 
0.4108758811832955, "epoch": 365} +{"train_lr": 0.0014810034359298602, "train_loss": 0.41088385198217553, "epoch": 366} +{"train_lr": 0.001476652416493508, "train_loss": 0.4108179884109025, "epoch": 367} +{"train_lr": 0.0014722951780660042, "train_loss": 0.4108165610724917, "epoch": 368} +{"train_lr": 0.0014679318036028908, "train_loss": 0.41075373621872413, "epoch": 369} +{"train_lr": 0.001463562376176525, "train_loss": 0.410711961481959, "epoch": 370} +{"train_lr": 0.0014591869789745055, "train_loss": 0.41074939695700335, "epoch": 371} +{"train_lr": 0.0014548056952980906, "train_loss": 0.41064891304808837, "epoch": 372} +{"train_lr": 0.0014504186085606062, "train_loss": 0.41055555843437713, "epoch": 373} +{"train_lr": 0.001446025802285859, "train_loss": 0.4104764893865929, "epoch": 374} +{"train_lr": 0.0014416273601065466, "train_loss": 0.4106733149025016, "epoch": 375} +{"train_lr": 0.0014372233657626709, "train_loss": 0.410539361090065, "epoch": 376} +{"train_lr": 0.0014328139030999325, "train_loss": 0.41042309052024323, "epoch": 377} +{"train_lr": 0.00142839905606815, "train_loss": 0.4104917037497776, "epoch": 378} +{"train_lr": 0.0014239789087196419, "train_loss": 0.4103732809621411, "epoch": 379} +{"train_lr": 0.0014195535452076445, "train_loss": 0.41036086723518866, "epoch": 380} +{"train_lr": 0.0014151230497846973, "train_loss": 0.41025182133456933, "epoch": 381} +{"train_lr": 0.0014106875068010517, "train_loss": 0.4103283556297613, "epoch": 382} +{"train_lr": 0.0014062470007030464, "train_loss": 0.41025463273175633, "epoch": 383} +{"train_lr": 0.001401801616031522, "train_loss": 0.41028842999814796, "epoch": 384} +{"train_lr": 0.0013973514374201934, "train_loss": 0.4101260701636187, "epoch": 385} +{"train_lr": 0.0013928965495940433, "train_loss": 0.4101290691148442, "epoch": 386} +{"train_lr": 0.001388437037367717, "train_loss": 0.4100582876977009, "epoch": 387} +{"train_lr": 0.0013839729856439005, "train_loss": 0.4100109167003001, "epoch": 388} +{"train_lr": 0.0013795044794117017, "train_loss": 0.41002871754030007, "epoch": 389} +{"train_lr": 0.0013750316037450382, "train_loss": 0.4100142954609906, "epoch": 390} +{"train_lr": 0.0013705544438010152, "train_loss": 0.4099733572101029, "epoch": 391} +{"train_lr": 0.0013660730848183047, "train_loss": 0.40993832916212386, "epoch": 392} +{"train_lr": 0.001361587612115522, "train_loss": 0.40984915221074164, "epoch": 393} +{"train_lr": 0.0013570981110896019, "train_loss": 0.40986981642289233, "epoch": 394} +{"train_lr": 0.0013526046672141716, "train_loss": 0.4098925541471451, "epoch": 395} +{"train_lr": 0.0013481073660379268, "train_loss": 0.4097335473455202, "epoch": 396} +{"train_lr": 0.0013436062931829961, "train_loss": 0.4097142314914471, "epoch": 397} +{"train_lr": 0.0013391015343433242, "train_loss": 0.40965941716104937, "epoch": 398} +{"train_lr": 0.0013345931752830203, "train_loss": 0.40957096070110893, "epoch": 399} +{"train_lr": 0.0013300813018347428, "train_loss": 0.40958923276048154, "epoch": 400} +{"train_lr": 0.0013255659998980631, "train_loss": 0.40965150356985247, "epoch": 401} +{"train_lr": 0.001321047355437815, "train_loss": 0.4096374787443962, "epoch": 402} +{"train_lr": 0.0013165254544824816, "train_loss": 0.4095211476236821, "epoch": 403} +{"train_lr": 0.0013120003831225341, "train_loss": 0.4094003884551617, "epoch": 404} +{"train_lr": 0.0013074722275088128, "train_loss": 0.4093821646842676, "epoch": 405} +{"train_lr": 0.0013029410738508687, "train_loss": 0.40934318331225467, "epoch": 406} +{"train_lr": 
0.0012984070084153404, "train_loss": 0.40935628237023663, "epoch": 407} +{"train_lr": 0.001293870117524294, "train_loss": 0.4092580980549638, "epoch": 408} +{"train_lr": 0.0012893304875535958, "train_loss": 0.40928569577861196, "epoch": 409} +{"train_lr": 0.001284788204931254, "train_loss": 0.40930103614197993, "epoch": 410} +{"train_lr": 0.0012802433561357833, "train_loss": 0.4091632225676082, "epoch": 411} +{"train_lr": 0.0012756960276945543, "train_loss": 0.40913745740321106, "epoch": 412} +{"train_lr": 0.0012711463061821455, "train_loss": 0.4091320280642368, "epoch": 413} +{"train_lr": 0.0012665942782186948, "train_loss": 0.4090705737053679, "epoch": 414} +{"train_lr": 0.0012620400304682543, "train_loss": 0.40911481409476924, "epoch": 415} +{"train_lr": 0.0012574836496371338, "train_loss": 0.40898072150631404, "epoch": 416} +{"train_lr": 0.001252925222472262, "train_loss": 0.4089827490463041, "epoch": 417} +{"train_lr": 0.0012483648357595157, "train_loss": 0.4089191631408026, "epoch": 418} +{"train_lr": 0.0012438025763220866, "train_loss": 0.4089194155799655, "epoch": 419} +{"train_lr": 0.0012392385310188183, "train_loss": 0.4088360767107075, "epoch": 420} +{"train_lr": 0.0012346727867425544, "train_loss": 0.40889794748718256, "epoch": 421} +{"train_lr": 0.0012301054304184812, "train_loss": 0.4087364257384951, "epoch": 422} +{"train_lr": 0.0012255365490024856, "train_loss": 0.4087256219787284, "epoch": 423} +{"train_lr": 0.0012209662294794788, "train_loss": 0.4086236726068772, "epoch": 424} +{"train_lr": 0.0012163945588617594, "train_loss": 0.40868301555299413, "epoch": 425} +{"train_lr": 0.0012118216241873432, "train_loss": 0.40864022688355106, "epoch": 426} +{"train_lr": 0.0012072475125183195, "train_loss": 0.408639843435361, "epoch": 427} +{"train_lr": 0.0012026723109391762, "train_loss": 0.4084960335704426, "epoch": 428} +{"train_lr": 0.0011980961065551578, "train_loss": 0.40850723951828116, "epoch": 429} +{"train_lr": 0.0011935189864905992, "train_loss": 0.4084427037741989, "epoch": 430} +{"train_lr": 0.0011889410378872717, "train_loss": 0.408406377546131, "epoch": 431} +{"train_lr": 0.0011843623479027132, "train_loss": 0.4084319865319114, "epoch": 432} +{"train_lr": 0.0011797830037085834, "train_loss": 0.4082929293404166, "epoch": 433} +{"train_lr": 0.0011752030924889923, "train_loss": 0.4083061993856413, "epoch": 434} +{"train_lr": 0.001170622701438853, "train_loss": 0.40826553131979054, "epoch": 435} +{"train_lr": 0.0011660419177622026, "train_loss": 0.40831295249219507, "epoch": 436} +{"train_lr": 0.0011614608286705634, "train_loss": 0.408208545213804, "epoch": 437} +{"train_lr": 0.001156879521381265, "train_loss": 0.40810751629312736, "epoch": 438} +{"train_lr": 0.0011522980831157985, "train_loss": 0.4089228991920558, "epoch": 439} +{"train_lr": 0.0011477166010981405, "train_loss": 0.40827475297145355, "epoch": 440} +{"train_lr": 0.0011431351625531072, "train_loss": 0.40814876497890323, "epoch": 441} +{"train_lr": 0.001138553854704682, "train_loss": 0.4081494444772267, "epoch": 442} +{"train_lr": 0.0011339727647743652, "train_loss": 0.4080424102423235, "epoch": 443} +{"train_lr": 0.0011293919799795042, "train_loss": 0.4079775022557722, "epoch": 444} +{"train_lr": 0.0011248115875316382, "train_loss": 0.40800062047305685, "epoch": 445} +{"train_lr": 0.0011202316746348369, "train_loss": 0.4078693060676018, "epoch": 446} +{"train_lr": 0.0011156523284840427, "train_loss": 0.40788094251930046, "epoch": 447} +{"train_lr": 0.0011110736362634, "train_loss": 0.4078162476921884, 
"epoch": 448} +{"train_lr": 0.0011064956851446132, "train_loss": 0.4077284878376537, "epoch": 449} +{"train_lr": 0.0011019185622852719, "train_loss": 0.4077657826322441, "epoch": 450} +{"train_lr": 0.001097342354827195, "train_loss": 0.4077323899896911, "epoch": 451} +{"train_lr": 0.0010927671498947784, "train_loss": 0.40761750450250334, "epoch": 452} +{"train_lr": 0.001088193034593329, "train_loss": 0.40754589111580014, "epoch": 453} +{"train_lr": 0.0010836200960074077, "train_loss": 0.4076227524962563, "epoch": 454} +{"train_lr": 0.001079048421199174, "train_loss": 0.40763028256463796, "epoch": 455} +{"train_lr": 0.0010744780972067251, "train_loss": 0.4075469539548533, "epoch": 456} +{"train_lr": 0.0010699092110424448, "train_loss": 0.4075042612325305, "epoch": 457} +{"train_lr": 0.0010653418496913364, "train_loss": 0.40739803072900915, "epoch": 458} +{"train_lr": 0.0010607761001093785, "train_loss": 0.40746555620064145, "epoch": 459} +{"train_lr": 0.0010562120492218607, "train_loss": 0.4072514477645596, "epoch": 460} +{"train_lr": 0.0010516497839217333, "train_loss": 0.40732319879596335, "epoch": 461} +{"train_lr": 0.0010470893910679514, "train_loss": 0.40721929031543624, "epoch": 462} +{"train_lr": 0.0010425309574838217, "train_loss": 0.4074741626988189, "epoch": 463} +{"train_lr": 0.0010379745699553473, "train_loss": 0.40718286079115784, "epoch": 464} +{"train_lr": 0.0010334203152295809, "train_loss": 0.4070995915442323, "epoch": 465} +{"train_lr": 0.001028868280012966, "train_loss": 0.4070817599228273, "epoch": 466} +{"train_lr": 0.0010243185509696913, "train_loss": 0.407057137398694, "epoch": 467} +{"train_lr": 0.001019771214720041, "train_loss": 0.40704172161610747, "epoch": 468} +{"train_lr": 0.0010152263578387406, "train_loss": 0.40708873046036714, "epoch": 469} +{"train_lr": 0.0010106840668533167, "train_loss": 0.40687227406753945, "epoch": 470} +{"train_lr": 0.0010061444282424387, "train_loss": 0.40689455137814945, "epoch": 471} +{"train_lr": 0.001001607528434284, "train_loss": 0.4067392822337122, "epoch": 472} +{"train_lr": 0.0009970734538048858, "train_loss": 0.4067725698159148, "epoch": 473} +{"train_lr": 0.0009925422906764867, "train_loss": 0.4067414591673953, "epoch": 474} +{"train_lr": 0.000988014125315904, "train_loss": 0.40677410726531005, "epoch": 475} +{"train_lr": 0.000983489043932877, "train_loss": 0.40671809860027563, "epoch": 476} +{"train_lr": 0.0009789671326784328, "train_loss": 0.4066885843520793, "epoch": 477} +{"train_lr": 0.0009744484776432449, "train_loss": 0.40656357107815355, "epoch": 478} +{"train_lr": 0.0009699331648559909, "train_loss": 0.40661479317499566, "epoch": 479} +{"train_lr": 0.0009654212802817167, "train_loss": 0.4065199022372373, "epoch": 480} +{"train_lr": 0.0009609129098201999, "train_loss": 0.40652067499915856, "epoch": 481} +{"train_lr": 0.0009564081393043194, "train_loss": 0.40650237360909486, "epoch": 482} +{"train_lr": 0.0009519070544984084, "train_loss": 0.4064606928911347, "epoch": 483} +{"train_lr": 0.0009474097410966353, "train_loss": 0.406358272028275, "epoch": 484} +{"train_lr": 0.0009429162847213638, "train_loss": 0.4062820214581174, "epoch": 485} +{"train_lr": 0.0009384267709215272, "train_loss": 0.4062754138229558, "epoch": 486} +{"train_lr": 0.0009339412851709953, "train_loss": 0.40622046690147656, "epoch": 487} +{"train_lr": 0.0009294599128669512, "train_loss": 0.4062196460212223, "epoch": 488} +{"train_lr": 0.0009249827393282664, "train_loss": 0.40624284703666586, "epoch": 489} +{"train_lr": 0.000920509849793868, 
"train_loss": 0.4063993097575477, "epoch": 490} +{"train_lr": 0.0009160413294211269, "train_loss": 0.4061269706401687, "epoch": 491} +{"train_lr": 0.0009115772632842303, "train_loss": 0.40604672604538977, "epoch": 492} +{"train_lr": 0.0009071177363725607, "train_loss": 0.4059948067312153, "epoch": 493} +{"train_lr": 0.0009026628335890832, "train_loss": 0.4060036432541286, "epoch": 494} +{"train_lr": 0.0008982126397487258, "train_loss": 0.4059653549413316, "epoch": 495} +{"train_lr": 0.0008937672395767638, "train_loss": 0.4058932272018865, "epoch": 496} +{"train_lr": 0.0008893267177072082, "train_loss": 0.40586398070893037, "epoch": 497} +{"train_lr": 0.0008848911586811962, "train_loss": 0.40570423821620166, "epoch": 498} +{"train_lr": 0.0008804606469453758, "train_loss": 0.4057079857477966, "epoch": 499} +{"train_lr": 0.0008760352668503046, "train_loss": 0.4057402282427901, "epoch": 500} +{"train_lr": 0.0008716151026488423, "train_loss": 0.40573455984345996, "epoch": 501} +{"train_lr": 0.0008672002384945409, "train_loss": 0.4056789746675163, "epoch": 502} +{"train_lr": 0.0008627907584400527, "train_loss": 0.40564147108652365, "epoch": 503} +{"train_lr": 0.0008583867464355204, "train_loss": 0.40551826204113567, "epoch": 504} +{"train_lr": 0.0008539882863269843, "train_loss": 0.40549777155049527, "epoch": 505} +{"train_lr": 0.0008495954618547843, "train_loss": 0.40548197221822846, "epoch": 506} +{"train_lr": 0.0008452083566519666, "train_loss": 0.4053351033795386, "epoch": 507} +{"train_lr": 0.0008408270542426921, "train_loss": 0.40544413105966765, "epoch": 508} +{"train_lr": 0.0008364516380406403, "train_loss": 0.4054094929522715, "epoch": 509} +{"train_lr": 0.0008320821913474302, "train_loss": 0.40523585497449416, "epoch": 510} +{"train_lr": 0.000827718797351028, "train_loss": 0.4052047186847537, "epoch": 511} +{"train_lr": 0.0008233615391241664, "train_loss": 0.4051514037568361, "epoch": 512} +{"train_lr": 0.0008190104996227606, "train_loss": 0.405203840847557, "epoch": 513} +{"train_lr": 0.0008146657616843306, "train_loss": 0.40509661564675087, "epoch": 514} +{"train_lr": 0.0008103274080264235, "train_loss": 0.4051370158767662, "epoch": 515} +{"train_lr": 0.0008059955212450415, "train_loss": 0.40509992099713343, "epoch": 516} +{"train_lr": 0.0008016701838130633, "train_loss": 0.4050084149841076, "epoch": 517} +{"train_lr": 0.0007973514780786778, "train_loss": 0.40498947312470335, "epoch": 518} +{"train_lr": 0.0007930394862638177, "train_loss": 0.4048468506005473, "epoch": 519} +{"train_lr": 0.0007887342904625922, "train_loss": 0.40491496573369473, "epoch": 520} +{"train_lr": 0.0007844359726397224, "train_loss": 0.40479989495510477, "epoch": 521} +{"train_lr": 0.0007801446146289847, "train_loss": 0.4047400512440035, "epoch": 522} +{"train_lr": 0.0007758602981316503, "train_loss": 0.40473769646054375, "epoch": 523} +{"train_lr": 0.0007715831047149271, "train_loss": 0.40474676929462033, "epoch": 524} +{"train_lr": 0.0007673131158104147, "train_loss": 0.40466389988955015, "epoch": 525} +{"train_lr": 0.0007630504127125459, "train_loss": 0.40458019682242036, "epoch": 526} +{"train_lr": 0.0007587950765770436, "train_loss": 0.4046392260053649, "epoch": 527} +{"train_lr": 0.0007545471884193728, "train_loss": 0.4045530325965956, "epoch": 528} +{"train_lr": 0.0007503068291132018, "train_loss": 0.40456746879200906, "epoch": 529} +{"train_lr": 0.0007460740793888594, "train_loss": 0.40441401888771605, "epoch": 530} +{"train_lr": 0.0007418490198317987, "train_loss": 0.4043100023170551, "epoch": 531} 
+{"train_lr": 0.0007376317308810632, "train_loss": 0.4043588912896573, "epoch": 532} +{"train_lr": 0.0007334222928277559, "train_loss": 0.4043358557362062, "epoch": 533} +{"train_lr": 0.0007292207858135094, "train_loss": 0.40429400393548304, "epoch": 534} +{"train_lr": 0.0007250272898289608, "train_loss": 0.40428032110540724, "epoch": 535} +{"train_lr": 0.0007208418847122287, "train_loss": 0.40419535100674975, "epoch": 536} +{"train_lr": 0.0007166646501473936, "train_loss": 0.4041401115932669, "epoch": 537} +{"train_lr": 0.0007124956656629803, "train_loss": 0.4041405795709206, "epoch": 538} +{"train_lr": 0.0007083350106304438, "train_loss": 0.4040476569237235, "epoch": 539} +{"train_lr": 0.0007041827642626584, "train_loss": 0.4039582103007258, "epoch": 540} +{"train_lr": 0.0007000390056124096, "train_loss": 0.40393333622696215, "epoch": 541} +{"train_lr": 0.0006959038135708897, "train_loss": 0.4039792067204148, "epoch": 542} +{"train_lr": 0.0006917772668661943, "train_loss": 0.40400743056446886, "epoch": 543} +{"train_lr": 0.0006876594440618228, "train_loss": 0.40387270907656503, "epoch": 544} +{"train_lr": 0.0006835504235551869, "train_loss": 0.40385030512996495, "epoch": 545} +{"train_lr": 0.0006794502835761145, "train_loss": 0.40380048998094237, "epoch": 546} +{"train_lr": 0.0006753591021853594, "train_loss": 0.40375170164490837, "epoch": 547} +{"train_lr": 0.0006712769572731192, "train_loss": 0.40375785952887666, "epoch": 548} +{"train_lr": 0.0006672039265575479, "train_loss": 0.40368185934121126, "epoch": 549} +{"train_lr": 0.0006631400875832792, "train_loss": 0.40351521058115536, "epoch": 550} +{"train_lr": 0.0006590855177199493, "train_loss": 0.40354250425783295, "epoch": 551} +{"train_lr": 0.0006550402941607243, "train_loss": 0.4034807008589642, "epoch": 552} +{"train_lr": 0.0006510044939208292, "train_loss": 0.40340881089632136, "epoch": 553} +{"train_lr": 0.0006469781938360838, "train_loss": 0.40350077035598075, "epoch": 554} +{"train_lr": 0.0006429614705614375, "train_loss": 0.40342369057739585, "epoch": 555} +{"train_lr": 0.0006389544005695102, "train_loss": 0.4034082487506123, "epoch": 556} +{"train_lr": 0.0006349570601491407, "train_loss": 0.40329981361253137, "epoch": 557} +{"train_lr": 0.0006309695254039274, "train_loss": 0.40330997984617567, "epoch": 558} +{"train_lr": 0.0006269918722507841, "train_loss": 0.4032074278519035, "epoch": 559} +{"train_lr": 0.0006230241764184931, "train_loss": 0.40314884478142726, "epoch": 560} +{"train_lr": 0.0006190665134462633, "train_loss": 0.40318546131348765, "epoch": 561} +{"train_lr": 0.0006151189586822944, "train_loss": 0.40302897839902496, "epoch": 562} +{"train_lr": 0.0006111815872823375, "train_loss": 0.40307248276598656, "epoch": 563} +{"train_lr": 0.0006072544742082678, "train_loss": 0.40295639399212235, "epoch": 564} +{"train_lr": 0.0006033376942266588, "train_loss": 0.4029202505480498, "epoch": 565} +{"train_lr": 0.0005994313219073551, "train_loss": 0.40293739083557367, "epoch": 566} +{"train_lr": 0.0005955354316220552, "train_loss": 0.40285234694452715, "epoch": 567} +{"train_lr": 0.0005916500975428925, "train_loss": 0.4027972887628354, "epoch": 568} +{"train_lr": 0.00058777539364103, "train_loss": 0.40278459172576475, "epoch": 569} +{"train_lr": 0.0005839113936852423, "train_loss": 0.4027620741996007, "epoch": 570} +{"train_lr": 0.0005800581712405198, "train_loss": 0.40272315271879333, "epoch": 571} +{"train_lr": 0.0005762157996666634, "train_loss": 0.4025773255405231, "epoch": 572} +{"train_lr": 0.000572384352116889, 
"train_loss": 0.40262824610078657, "epoch": 573} +{"train_lr": 0.0005685639015364357, "train_loss": 0.4025623084893689, "epoch": 574} +{"train_lr": 0.000564754520661175, "train_loss": 0.4025112026944183, "epoch": 575} +{"train_lr": 0.0005609562820162276, "train_loss": 0.4024656550743832, "epoch": 576} +{"train_lr": 0.0005571692579145825, "train_loss": 0.40243863600353974, "epoch": 577} +{"train_lr": 0.000553393520455719, "train_loss": 0.4023528701106373, "epoch": 578} +{"train_lr": 0.0005496291415242374, "train_loss": 0.40238917958683884, "epoch": 579} +{"train_lr": 0.0005458761927884844, "train_loss": 0.40230784869979686, "epoch": 580} +{"train_lr": 0.0005421347456991955, "train_loss": 0.40238204727080673, "epoch": 581} +{"train_lr": 0.0005384048714881292, "train_loss": 0.40223376255613774, "epoch": 582} +{"train_lr": 0.0005346866411667144, "train_loss": 0.4021565411777164, "epoch": 583} +{"train_lr": 0.0005309801255246968, "train_loss": 0.4021812020204006, "epoch": 584} +{"train_lr": 0.0005272853951287912, "train_loss": 0.4020746909440137, "epoch": 585} +{"train_lr": 0.0005236025203213388, "train_loss": 0.40203427048459744, "epoch": 586} +{"train_lr": 0.0005199315712189664, "train_loss": 0.40197034500455725, "epoch": 587} +{"train_lr": 0.0005162726177112542, "train_loss": 0.40197522756464493, "epoch": 588} +{"train_lr": 0.0005126257294594024, "train_loss": 0.4018897909987479, "epoch": 589} +{"train_lr": 0.000508990975894907, "train_loss": 0.4019340915271105, "epoch": 590} +{"train_lr": 0.0005053684262182351, "train_loss": 0.4018403323343358, "epoch": 591} +{"train_lr": 0.000501758149397512, "train_loss": 0.40179195118327743, "epoch": 592} +{"train_lr": 0.000498160214167204, "train_loss": 0.40182342882065153, "epoch": 593} +{"train_lr": 0.0004945746890268112, "train_loss": 0.40156844701283634, "epoch": 594} +{"train_lr": 0.0004910016422395642, "train_loss": 0.40166771351001584, "epoch": 595} +{"train_lr": 0.0004874411418311232, "train_loss": 0.4015691514986639, "epoch": 596} +{"train_lr": 0.0004838932555882831, "train_loss": 0.40157083718953895, "epoch": 597} +{"train_lr": 0.0004803580510576859, "train_loss": 0.40150746444347674, "epoch": 598} +{"train_lr": 0.0004768355955445271, "train_loss": 0.40146687768626577, "epoch": 599} +{"train_lr": 0.0004733259561112843, "train_loss": 0.4013656457104028, "epoch": 600} +{"train_lr": 0.00046982919957643183, "train_loss": 0.4014010253177287, "epoch": 601} +{"train_lr": 0.0004663453925131751, "train_loss": 0.401316442541802, "epoch": 602} +{"train_lr": 0.0004628746012481774, "train_loss": 0.40135061838186514, "epoch": 603} +{"train_lr": 0.00045941689186030244, "train_loss": 0.4013017872258878, "epoch": 604} +{"train_lr": 0.00045597233017935225, "train_loss": 0.4012205607514494, "epoch": 605} +{"train_lr": 0.0004525409817848158, "train_loss": 0.4010732845307734, "epoch": 606} +{"train_lr": 0.0004491229120046211, "train_loss": 0.40112392098989147, "epoch": 607} +{"train_lr": 0.0004457181859138885, "train_loss": 0.40095535995295417, "epoch": 608} +{"train_lr": 0.0004423268683336966, "train_loss": 0.40107519237491757, "epoch": 609} +{"train_lr": 0.0004389490238298424, "train_loss": 0.40089200041555345, "epoch": 610} +{"train_lr": 0.0004355847167116164, "train_loss": 0.4009603075026415, "epoch": 611} +{"train_lr": 0.0004322340110305767, "train_loss": 0.4007888589806568, "epoch": 612} +{"train_lr": 0.0004288969705793297, "train_loss": 0.40085823017542654, "epoch": 613} +{"train_lr": 0.00042557365889031546, "train_loss": 0.4008245808311189, "epoch": 614} 
+{"train_lr": 0.00042226413923459786, "train_loss": 0.40080666332804143, "epoch": 615} +{"train_lr": 0.00041896847462066024, "train_loss": 0.40067557709960216, "epoch": 616} +{"train_lr": 0.0004156867277932069, "train_loss": 0.4006406409122671, "epoch": 617} +{"train_lr": 0.0004124189612319663, "train_loss": 0.40067053648034257, "epoch": 618} +{"train_lr": 0.000409165237150504, "train_loss": 0.4006024836652124, "epoch": 619} +{"train_lr": 0.00040592561749503553, "train_loss": 0.40058071185082483, "epoch": 620} +{"train_lr": 0.0004027001639432505, "train_loss": 0.40051370228413874, "epoch": 621} +{"train_lr": 0.0003994889379031339, "train_loss": 0.40047323810330665, "epoch": 622} +{"train_lr": 0.0003962920005118015, "train_loss": 0.40047235801242864, "epoch": 623} +{"train_lr": 0.0003931094126343328, "train_loss": 0.40038400971426225, "epoch": 624} +{"train_lr": 0.00038994123486261274, "train_loss": 0.4003337500109457, "epoch": 625} +{"train_lr": 0.0003867875275141802, "train_loss": 0.4003118855269769, "epoch": 626} +{"train_lr": 0.0003836483506310766, "train_loss": 0.4002860259079637, "epoch": 627} +{"train_lr": 0.0003805237639787045, "train_loss": 0.40020315256268263, "epoch": 628} +{"train_lr": 0.00037741382704469054, "train_loss": 0.4001016740937932, "epoch": 629} +{"train_lr": 0.00037431859903775094, "train_loss": 0.40014096555443335, "epoch": 630} +{"train_lr": 0.0003712381388865644, "train_loss": 0.40007645417398846, "epoch": 631} +{"train_lr": 0.00036817250523865294, "train_loss": 0.4001130870818041, "epoch": 632} +{"train_lr": 0.000365121756459261, "train_loss": 0.3999421138059682, "epoch": 633} +{"train_lr": 0.000362085950630249, "train_loss": 0.39993893454830426, "epoch": 634} +{"train_lr": 0.00035906514554898285, "train_loss": 0.3999607272577496, "epoch": 635} +{"train_lr": 0.00035605939872723774, "train_loss": 0.39988730190536725, "epoch": 636} +{"train_lr": 0.00035306876739010003, "train_loss": 0.39983918937221646, "epoch": 637} +{"train_lr": 0.0003500933084748797, "train_loss": 0.39978164734426314, "epoch": 638} +{"train_lr": 0.00034713307863002557, "train_loss": 0.39978201805542296, "epoch": 639} +{"train_lr": 0.0003441881342140461, "train_loss": 0.3997511507597012, "epoch": 640} +{"train_lr": 0.00034125853129443856, "train_loss": 0.399678559621605, "epoch": 641} +{"train_lr": 0.0003383443256466194, "train_loss": 0.3995659738414897, "epoch": 642} +{"train_lr": 0.00033544557275286366, "train_loss": 0.39954135730826795, "epoch": 643} +{"train_lr": 0.00033256232780124785, "train_loss": 0.39953629148061365, "epoch": 644} +{"train_lr": 0.00032969464568459927, "train_loss": 0.39953539276925415, "epoch": 645} +{"train_lr": 0.000326842580999452, "train_loss": 0.3994370355951385, "epoch": 646} +{"train_lr": 0.00032400618804500746, "train_loss": 0.39941193949944603, "epoch": 647} +{"train_lr": 0.0003211855208220971, "train_loss": 0.39936334144873303, "epoch": 648} +{"train_lr": 0.0003183806330321605, "train_loss": 0.39932101693911815, "epoch": 649} +{"train_lr": 0.0003155915780762176, "train_loss": 0.39929802753986454, "epoch": 650} +{"train_lr": 0.000312818409053854, "train_loss": 0.3992980943324092, "epoch": 651} +{"train_lr": 0.0003100611787622107, "train_loss": 0.3992614940686438, "epoch": 652} +{"train_lr": 0.0003073199396949779, "train_loss": 0.3991799572637926, "epoch": 653} +{"train_lr": 0.0003045947440413965, "train_loss": 0.39913937841685343, "epoch": 654} +{"train_lr": 0.00030188564368526324, "train_loss": 0.39915586874271053, "epoch": 655} +{"train_lr": 
0.00029919269020394336, "train_loss": 0.398969745153012, "epoch": 656} +{"train_lr": 0.00029651593486738974, "train_loss": 0.3989916533315315, "epoch": 657} +{"train_lr": 0.0002938554286371653, "train_loss": 0.398980718314982, "epoch": 658} +{"train_lr": 0.0002912112221654737, "train_loss": 0.3989473644626112, "epoch": 659} +{"train_lr": 0.00028858336579419536, "train_loss": 0.39888661709697676, "epoch": 660} +{"train_lr": 0.00028597190955392625, "train_loss": 0.39888876083438307, "epoch": 661} +{"train_lr": 0.00028337690316303, "train_loss": 0.3988901605346025, "epoch": 662} +{"train_lr": 0.0002807983960266869, "train_loss": 0.3988267953406112, "epoch": 663} +{"train_lr": 0.00027823643723595644, "train_loss": 0.3987751852434415, "epoch": 664} +{"train_lr": 0.0002756910755668407, "train_loss": 0.3987223649672113, "epoch": 665} +{"train_lr": 0.0002731623594793579, "train_loss": 0.3986610113046108, "epoch": 666} +{"train_lr": 0.0002706503371166151, "train_loss": 0.39863646618771154, "epoch": 667} +{"train_lr": 0.0002681550563038991, "train_loss": 0.398625210312625, "epoch": 668} +{"train_lr": 0.0002656765645477588, "train_loss": 0.39857109432789284, "epoch": 669} +{"train_lr": 0.00026321490903510463, "train_loss": 0.3984749709578374, "epoch": 670} +{"train_lr": 0.0002607701366323092, "train_loss": 0.3985634923297673, "epoch": 671} +{"train_lr": 0.0002607701366323092, "train_loss": 0.39847029329468614, "epoch": 671} +{"train_lr": 0.00025834229388431527, "train_loss": 0.3984167390019418, "epoch": 672} +{"train_lr": 0.00025593142701374873, "train_loss": 0.3984153713219059, "epoch": 673} +{"train_lr": 0.0002535375819200397, "train_loss": 0.39833352874474937, "epoch": 674} +{"train_lr": 0.0002511608041785483, "train_loss": 0.39829322964084357, "epoch": 675} +{"train_lr": 0.0002488011390396965, "train_loss": 0.3982728718785951, "epoch": 676} +{"train_lr": 0.000246458631428107, "train_loss": 0.3982159129093186, "epoch": 677} +{"train_lr": 0.00024413332594174845, "train_loss": 0.3982398451025335, "epoch": 678} +{"train_lr": 0.0002418252668510853, "train_loss": 0.39817184433699226, "epoch": 679} +{"train_lr": 0.00023953449809823558, "train_loss": 0.39808873023885566, "epoch": 680} +{"train_lr": 0.0002372610632961341, "train_loss": 0.3980418292369741, "epoch": 681} +{"train_lr": 0.00023500500572770275, "train_loss": 0.39808892941651625, "epoch": 682} +{"train_lr": 0.00023276636834502533, "train_loss": 0.39800545562488526, "epoch": 683} +{"train_lr": 0.00023054519376853095, "train_loss": 0.39798971789423376, "epoch": 684} +{"train_lr": 0.000228341524286182, "train_loss": 0.39801577686403805, "epoch": 685} +{"train_lr": 0.00022615540185266786, "train_loss": 0.39793494881059116, "epoch": 686} +{"train_lr": 0.00022398686808860945, "train_loss": 0.3979601376266099, "epoch": 687} +{"train_lr": 0.00022183596427976347, "train_loss": 0.3978403454903179, "epoch": 688} +{"train_lr": 0.0002197027313762382, "train_loss": 0.39773378970149237, "epoch": 689} +{"train_lr": 0.00021758720999171227, "train_loss": 0.39777832595894164, "epoch": 690} +{"train_lr": 0.00021548944040266456, "train_loss": 0.39771761881330836, "epoch": 691} +{"train_lr": 0.0002134094625476033, "train_loss": 0.3976772154848545, "epoch": 692} +{"train_lr": 0.0002113473160263091, "train_loss": 0.3975796118014468, "epoch": 693} +{"train_lr": 0.000209303040099079, "train_loss": 0.3975841266085179, "epoch": 694} +{"train_lr": 0.00020727667368597986, "train_loss": 0.3975522827225713, "epoch": 695} +{"train_lr": 0.00020526825536610726, "train_loss": 
0.39762424131353885, "epoch": 696} +{"train_lr": 0.00020327782337685118, "train_loss": 0.3976167627783397, "epoch": 697} +{"train_lr": 0.0002013054156131673, "train_loss": 0.3974667288130149, "epoch": 698} +{"train_lr": 0.00019935106962685635, "train_loss": 0.39743793080859363, "epoch": 699} +{"train_lr": 0.00019741482262584887, "train_loss": 0.3974320717418614, "epoch": 700} +{"train_lr": 0.00019549671147349638, "train_loss": 0.3973614268649656, "epoch": 701} +{"train_lr": 0.00019359677268787083, "train_loss": 0.39733912353702366, "epoch": 702} +{"train_lr": 0.0001917150424410675, "train_loss": 0.39732482824892473, "epoch": 703} +{"train_lr": 0.00018985155655851815, "train_loss": 0.39729455607429814, "epoch": 704} +{"train_lr": 0.00018800635051830793, "train_loss": 0.3972467629937455, "epoch": 705} +{"train_lr": 0.00018617945945049967, "train_loss": 0.39721219141322833, "epoch": 706} +{"train_lr": 0.00018437091813646575, "train_loss": 0.3971436861514424, "epoch": 707} +{"train_lr": 0.00018258076100822665, "train_loss": 0.3972315847103556, "epoch": 708} +{"train_lr": 0.000180809022147793, "train_loss": 0.3970368156263318, "epoch": 709} +{"train_lr": 0.00017905573528651913, "train_loss": 0.39711282955399024, "epoch": 710} +{"train_lr": 0.000177320933804459, "train_loss": 0.3970147223952107, "epoch": 711} +{"train_lr": 0.00017560465072973276, "train_loss": 0.3970198181236927, "epoch": 712} +{"train_lr": 0.00017390691873789602, "train_loss": 0.3969494656044751, "epoch": 713} +{"train_lr": 0.0001722277701513185, "train_loss": 0.39695064426184845, "epoch": 714} +{"train_lr": 0.0001705672369385691, "train_loss": 0.39693007428640836, "epoch": 715} +{"train_lr": 0.00016892535071380598, "train_loss": 0.39690777982692593, "epoch": 716} +{"train_lr": 0.00016730214273617719, "train_loss": 0.3968320999813911, "epoch": 717} +{"train_lr": 0.00016569764390922197, "train_loss": 0.3967816275723565, "epoch": 718} +{"train_lr": 0.0001641118847802857, "train_loss": 0.39692675177222836, "epoch": 719} +{"train_lr": 0.00016254489553993575, "train_loss": 0.3967170494889172, "epoch": 720} +{"train_lr": 0.00016099670602138892, "train_loss": 0.39678743287909013, "epoch": 721} +{"train_lr": 0.000159467345699942, "train_loss": 0.3967261496746244, "epoch": 722} +{"train_lr": 0.00015795684369241075, "train_loss": 0.3967392017873816, "epoch": 723} +{"train_lr": 0.00015646522875657626, "train_loss": 0.3966885056310835, "epoch": 724} +{"train_lr": 0.0001549925292906367, "train_loss": 0.3965789718248953, "epoch": 725} +{"train_lr": 0.00015353877333266702, "train_loss": 0.39661051045195794, "epoch": 726} +{"train_lr": 0.00015210398856008514, "train_loss": 0.3965992741495705, "epoch": 727} +{"train_lr": 0.00015068820228912496, "train_loss": 0.39652254913921636, "epoch": 728} +{"train_lr": 0.00014929144147431605, "train_loss": 0.39651879141978824, "epoch": 729} +{"train_lr": 0.0001479137327079715, "train_loss": 0.39652866908074474, "epoch": 730} +{"train_lr": 0.0001465551022196798, "train_loss": 0.39642498921602964, "epoch": 731} +{"train_lr": 0.0001452155758758071, "train_loss": 0.3964142541741379, "epoch": 732} +{"train_lr": 0.00014389517917900418, "train_loss": 0.3964455056613168, "epoch": 733} +{"train_lr": 0.00014259393726772084, "train_loss": 0.39640393293498516, "epoch": 734} +{"train_lr": 0.00014131187491572722, "train_loss": 0.39643978867799234, "epoch": 735} +{"train_lr": 0.00014004901653164286, "train_loss": 0.3962538736776855, "epoch": 736} +{"train_lr": 0.00013880538615847047, "train_loss": 0.3962314602148791, 
"epoch": 737} +{"train_lr": 0.00013758100747314012, "train_loss": 0.39621207925777596, "epoch": 738} +{"train_lr": 0.00013637590378605678, "train_loss": 0.39627872894589716, "epoch": 739} +{"train_lr": 0.00013519009804065788, "train_loss": 0.39609781629704416, "epoch": 740} +{"train_lr": 0.000134023612812975, "train_loss": 0.3961053301717561, "epoch": 741} +{"train_lr": 0.00013287647031120598, "train_loss": 0.39621051645372063, "epoch": 742} +{"train_lr": 0.00013174869237529024, "train_loss": 0.39617534446565866, "epoch": 743} +{"train_lr": 0.00013064030047649377, "train_loss": 0.39614509605169773, "epoch": 744} +{"train_lr": 0.00012955131571700112, "train_loss": 0.3961230623297011, "epoch": 745} +{"train_lr": 0.00012848175882951195, "train_loss": 0.396136003274781, "epoch": 746} +{"train_lr": 0.00012743165017684786, "train_loss": 0.3959994736360386, "epoch": 747} +{"train_lr": 0.00012640100975156387, "train_loss": 0.3958685474846369, "epoch": 748} +{"train_lr": 0.00012538985717556808, "train_loss": 0.39600011887542236, "epoch": 749} +{"train_lr": 0.00012439821169974797, "train_loss": 0.39602591505405516, "epoch": 750} +{"train_lr": 0.00012342609220360385, "train_loss": 0.39595969730856806, "epoch": 751} +{"train_lr": 0.00012247351719488973, "train_loss": 0.39595074674042946, "epoch": 752} +{"train_lr": 0.00012154050480926074, "train_loss": 0.3958544527592424, "epoch": 753} +{"train_lr": 0.0001206270728099278, "train_loss": 0.39586364505334926, "epoch": 754} +{"train_lr": 0.0001197332385873192, "train_loss": 0.395963006802142, "epoch": 755} +{"train_lr": 0.00011885901915875058, "train_loss": 0.3957122047342217, "epoch": 756} +{"train_lr": 0.00011800443116809937, "train_loss": 0.39583009899032706, "epoch": 757} +{"train_lr": 0.00011716949088548901, "train_loss": 0.3957149375600215, "epoch": 758} +{"train_lr": 0.00011635421420697925, "train_loss": 0.3958443301753738, "epoch": 759} +{"train_lr": 0.00011555861665426263, "train_loss": 0.3956724483346662, "epoch": 760} +{"train_lr": 0.00011478271337436975, "train_loss": 0.3958091593037049, "epoch": 761} +{"train_lr": 0.00011402651913938054, "train_loss": 0.39575345273046064, "epoch": 762} +{"train_lr": 0.0001132900483461433, "train_loss": 0.39577234872222805, "epoch": 763} +{"train_lr": 0.00011257331501600035, "train_loss": 0.3956845926377588, "epoch": 764} +{"train_lr": 0.00011187633279452117, "train_loss": 0.3957371470810941, "epoch": 765} +{"train_lr": 0.00011119911495124251, "train_loss": 0.39570510602639747, "epoch": 766} +{"train_lr": 0.00011054167437941602, "train_loss": 0.3956630747460832, "epoch": 767} +{"train_lr": 0.00010990402359576233, "train_loss": 0.39567503699352247, "epoch": 768} +{"train_lr": 0.00010928617474023332, "train_loss": 0.39571998205680686, "epoch": 769} +{"train_lr": 0.00010868813957578054, "train_loss": 0.3956101418216116, "epoch": 770} +{"train_lr": 0.00010810992948813149, "train_loss": 0.3955000247179459, "epoch": 771} +{"train_lr": 0.00010755155548557293, "train_loss": 0.39557026598590594, "epoch": 772} +{"train_lr": 0.00010701302819874079, "train_loss": 0.3955783303707647, "epoch": 773} +{"train_lr": 0.00010649435788041832, "train_loss": 0.3955380746629089, "epoch": 774} +{"train_lr": 0.00010599555440534079, "train_loss": 0.3954764198249158, "epoch": 775} +{"train_lr": 0.00010551662727000747, "train_loss": 0.39549409868852353, "epoch": 776} +{"train_lr": 0.00010505758559250056, "train_loss": 0.39542940607031757, "epoch": 777} +{"train_lr": 0.00010461843811231193, "train_loss": 0.3954961236046914, "epoch": 778} 
+{"train_lr": 0.00010419919319017639, "train_loss": 0.39550039982303786, "epoch": 779} +{"train_lr": 0.00010379985880791331, "train_loss": 0.39544806897771567, "epoch": 780} +{"train_lr": 0.00010342044256827326, "train_loss": 0.3954047037479587, "epoch": 781} +{"train_lr": 0.00010306095169479492, "train_loss": 0.3954766464855474, "epoch": 782} +{"train_lr": 0.00010272139303166615, "train_loss": 0.3953952655727521, "epoch": 783} +{"train_lr": 0.00010240177304359433, "train_loss": 0.39547297515822816, "epoch": 784} +{"train_lr": 0.0001021020978156836, "train_loss": 0.39534901476536805, "epoch": 785} +{"train_lr": 0.00010182237305331808, "train_loss": 0.39531362823282296, "epoch": 786} +{"train_lr": 0.00010156260408205405, "train_loss": 0.3954241632883891, "epoch": 787} +{"train_lr": 0.00010132279584751836, "train_loss": 0.3952771816163873, "epoch": 788} +{"train_lr": 0.0001011029529153142, "train_loss": 0.3954083356791391, "epoch": 789} +{"train_lr": 0.00010090307947093394, "train_loss": 0.39527252094987303, "epoch": 790} +{"train_lr": 0.00010072317931967978, "train_loss": 0.3953342796780933, "epoch": 791} +{"train_lr": 0.00010056325588659148, "train_loss": 0.39531643394058424, "epoch": 792} +{"train_lr": 0.00010042331221638053, "train_loss": 0.3953475776182201, "epoch": 793} +{"train_lr": 0.00010030335097337291, "train_loss": 0.3953579601873524, "epoch": 794} +{"train_lr": 0.00010020337444145742, "train_loss": 0.39527962923061866, "epoch": 795} +{"train_lr": 0.00010012338452404336, "train_loss": 0.39528626886805374, "epoch": 796} +{"train_lr": 0.00010006338274402353, "train_loss": 0.3952554755826266, "epoch": 797} +{"train_lr": 0.0001000233702437451, "train_loss": 0.39523102716805464, "epoch": 798} +{"train_lr": 0.00010000334778498856, "train_loss": 0.39522883877193987, "epoch": 799} diff --git a/CV/MAE/main_finetune.py b/CV/MAE/main_finetune.py new file mode 100644 index 0000000..2f30421 --- /dev/null +++ b/CV/MAE/main_finetune.py @@ -0,0 +1,391 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import argparse +from ast import arg +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +from timm.models.helpers import load_state_dict + +import timm + +#assert timm.__version__ == "0.3.2" # version check +from timm.models.layers import trunc_normal_ +from timm.data.mixup import Mixup +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from adan import Adan + +import util.lr_decay as lrd +import util.misc as misc +from util.datasets import build_dataset +from util.pos_embed import interpolate_pos_embed +from util.misc import NativeScalerWithGradNormCount as NativeScaler + +import models_vit + +from engine_finetune import train_one_epoch, evaluate + +def get_args_parser(): + parser = argparse.ArgumentParser('MAE fine-tuning for image classification', add_help=False) + parser.add_argument('--batch_size', default=64, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=50, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--input_size', default=224, type=int, + help='images input size') + + parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT', + help='Drop path rate (default: 0.1)') + + # Optimizer parameters + parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--layer_decay', type=float, default=0.75, + help='layer-wise lr decay from ELECTRA/BEiT') + + parser.add_argument('--min-lr', type=float, default=1e-6, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup-epochs', type=int, default=5, metavar='N', + help='epochs to warmup LR') + + + parser.add_argument('--use-adan', action='store_true', default=True, + help='whether to use Adan') + parser.add_argument('--max-grad-norm', type=float, default=0.0, + help='max grad norm (default: 0.0 for no clip)') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--bias-decay', action='store_true', default=False, + help='whether to decay bias term') + + # Augmentation parameters + parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT', + 
help='Color jitter factor (enabled only when not using Auto/RandAug)') + parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original" (default: rand-m9-mstd0.5-inc1)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + + # * Random Erase params + parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') + parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + + # * Mixup params + parser.add_argument('--mixup', type=float, default=0, + help='mixup alpha, mixup enabled if > 0.') + parser.add_argument('--cutmix', type=float, default=0, + help='cutmix alpha, cutmix enabled if > 0.') + parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup_prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup_switch_prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup_mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + + # * Finetuning params + parser.add_argument('--finetune', default='', + help='finetune from checkpoint') + parser.add_argument('--global_pool', action='store_true') + parser.set_defaults(global_pool=True) + parser.add_argument('--cls_token', action='store_false', dest='global_pool', + help='Use class token instead of global pool for classification') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--nb_classes', default=1000, type=int, + help='number of the classification types') + + parser.add_argument('--output_dir', default=None, + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='./output_dir/', + help='path where to tensorboard log') + parser.add_argument('--device', default='cuda:0', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', + help='resume from checkpoint') + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true', + help='Perform evaluation only') + parser.add_argument('--dist_eval', action='store_true', default=False, + help='Enabling distributed evaluation (recommended during training for faster monitoring)') + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank',
default=0, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + + return parser + + +def main(args): + args.device = 'cuda:0' + args.world_size = 1 + args.rank = 0 # global rank + args.gpu = 0 + #misc.init_distributed_mode(args) + misc.init_distributed_ddpjob(args) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + dataset_train = build_dataset(is_train=True, args=args) + dataset_val = build_dataset(is_train=False, args=args) + + if True: # args.distributed: + num_tasks = misc.get_world_size() + global_rank = misc.get_rank() + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True) # shuffle=True to reduce monitor bias + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + if misc.is_main_process() and args.log_dir is not None and not args.eval: + TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.datetime.now()) + + args.log_dir = args.log_dir+ 'mae-' + TIMESTAMP + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + print("Mixup is activated!") + mixup_fn = Mixup( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.nb_classes) + + model = models_vit.__dict__[args.model]( + num_classes=args.nb_classes, + drop_path_rate=args.drop_path, + global_pool=args.global_pool, + ) + + if args.finetune and not args.eval: + #checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load pre-trained checkpoint from: %s" % args.finetune) + checkpoint_model = load_state_dict(args.finetune) + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + if args.global_pool: + assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'} + else: + assert set(msg.missing_keys) == {'head.weight', 'head.bias'} + + + # manually initialize fc layer + trunc_normal_(model.head.weight, std=1e-5) + + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params (M): %.2f' % (n_parameters / 1.e6)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + + # build optimizer with layer-wise lr decay (lrd) + + if args.use_adan: + param_groups = lrd.param_groups_lrd(model_without_ddp, args.weight_decay, + no_weight_decay_list=[] if args.bias_decay else model_without_ddp.no_weight_decay(), + layer_decay=args.layer_decay + ) + optimizer = Adan(param_groups, weight_decay=args.weight_decay, + lr=args.lr, betas=args.opt_betas, eps=args.opt_eps, max_grad_norm=args.max_grad_norm + ) + else: + param_groups = lrd.param_groups_lrd(model_without_ddp, args.weight_decay, + no_weight_decay_list=model_without_ddp.no_weight_decay(), + layer_decay=args.layer_decay + ) + optimizer = torch.optim.AdamW(param_groups, lr=args.lr) + #print(optimizer) + loss_scaler = NativeScaler() + + if mixup_fn is not None: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif args.smoothing > 0.: + criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + criterion = torch.nn.CrossEntropyLoss() + + print("criterion = %s" % str(criterion)) + + + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if args.eval: + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + exit(0) + + print(f"Start training for {args.epochs}
epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + args.clip_grad, mixup_fn, + log_writer=log_writer, + args=args + ) + if args.output_dir and (epoch+1) % 10 == 0: + misc.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + max_accuracy = max(max_accuracy, test_stats["acc1"]) + print(f'Max accuracy: {max_accuracy:.2f}%') + + if log_writer is not None: + log_writer.add_scalar('perf/test_acc1', test_stats['acc1'], epoch) + log_writer.add_scalar('perf/test_acc5', test_stats['acc5'], epoch) + log_writer.add_scalar('perf/test_loss', test_stats['loss'], epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.log_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.log_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) \ No newline at end of file diff --git a/CV/MAE/main_linprobe.py b/CV/MAE/main_linprobe.py new file mode 100644 index 0000000..2d3f241 --- /dev/null +++ b/CV/MAE/main_linprobe.py @@ -0,0 +1,316 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- + +import argparse +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import timm + +assert timm.__version__ == "0.3.2" # version check +from timm.models.layers import trunc_normal_ + +import util.misc as misc +from util.pos_embed import interpolate_pos_embed +from util.misc import NativeScalerWithGradNormCount as NativeScaler +from util.lars import LARS +from util.crop import RandomResizedCrop + +import models_vit + +from engine_finetune import train_one_epoch, evaluate + + +def get_args_parser(): + parser = argparse.ArgumentParser('MAE linear probing for image classification', add_help=False) + parser.add_argument('--batch_size', default=512, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=90, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + # Optimizer parameters + parser.add_argument('--weight_decay', type=float, default=0, + help='weight decay (default: 0 for linear probe following MoCo v1)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=0.1, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup_epochs', type=int, default=10, metavar='N', + help='epochs to warmup LR') + + # * Finetuning params + parser.add_argument('--finetune', default='', + help='finetune from checkpoint') + parser.add_argument('--global_pool', action='store_true') + parser.set_defaults(global_pool=False) + parser.add_argument('--cls_token', action='store_false', dest='global_pool', + help='Use class token instead of global pool for classification') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--nb_classes', default=1000, type=int, + help='number of the classification types') + + parser.add_argument('--output_dir', default='./output_dir', + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='./output_dir', + help='path where to tensorboard log') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', + help='resume from checkpoint') + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true', + help='Perform evaluation only') + parser.add_argument('--dist_eval', action='store_true', 
default=False, + help='Enabling distributed evaluation (recommended during training for faster monitor') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + + return parser + + +def main(args): + misc.init_distributed_mode(args) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # linear probe: weak augmentation + transform_train = transforms.Compose([ + RandomResizedCrop(224, interpolation=3), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + transform_val = transforms.Compose([ + transforms.Resize(256, interpolation=3), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + dataset_train = datasets.ImageFolder(os.path.join(args.data_path, 'train'), transform=transform_train) + dataset_val = datasets.ImageFolder(os.path.join(args.data_path, 'val'), transform=transform_val) + print(dataset_train) + print(dataset_val) + + if True: # args.distributed: + num_tasks = misc.get_world_size() + global_rank = misc.get_rank() + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. 
' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True) # shuffle=True to reduce monitor bias + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + if global_rank == 0 and args.log_dir is not None and not args.eval: + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + model = models_vit.__dict__[args.model]( + num_classes=args.nb_classes, + global_pool=args.global_pool, + ) + + if args.finetune and not args.eval: + checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load pre-trained checkpoint from: %s" % args.finetune) + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + if args.global_pool: + assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'} + else: + assert set(msg.missing_keys) == {'head.weight', 'head.bias'} + + # manually initialize fc layer: following MoCo v3 + trunc_normal_(model.head.weight, std=0.01) + + # for linear prob only + # hack: revise model's head with BN + model.head = torch.nn.Sequential(torch.nn.BatchNorm1d(model.head.in_features, affine=False, eps=1e-6), model.head) + # freeze all but the head + for _, p in model.named_parameters(): + p.requires_grad = False + for _, p in model.head.named_parameters(): + p.requires_grad = True + + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params (M): %.2f' % (n_parameters / 1.e6)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + + optimizer = LARS(model_without_ddp.head.parameters(), lr=args.lr, weight_decay=args.weight_decay) + print(optimizer) + loss_scaler = NativeScaler() + + criterion = torch.nn.CrossEntropyLoss() + + print("criterion = %s" % str(criterion)) + + misc.load_model(args=args, 
model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if args.eval: + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + exit(0) + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + max_norm=None, + log_writer=log_writer, + args=args + ) + if args.output_dir: + misc.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + max_accuracy = max(max_accuracy, test_stats["acc1"]) + print(f'Max accuracy: {max_accuracy:.2f}%') + + if log_writer is not None: + log_writer.add_scalar('perf/test_acc1', test_stats['acc1'], epoch) + log_writer.add_scalar('perf/test_acc5', test_stats['acc5'], epoch) + log_writer.add_scalar('perf/test_loss', test_stats['loss'], epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/CV/MAE/main_pretrain.py b/CV/MAE/main_pretrain.py new file mode 100644 index 0000000..2f134f5 --- /dev/null +++ b/CV/MAE/main_pretrain.py @@ -0,0 +1,277 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets +from adan import Adan +from timm.models import resume_checkpoint + +import timm + +#assert timm.__version__ == "0.3.2" # version check +import timm.optim.optim_factory as optim_factory +from timm.utils import * + +import util.misc as misc +from util.misc import NativeScalerWithGradNormCount as NativeScaler + +import models_mae + +from engine_pretrain import train_one_epoch + + +def get_args_parser(): + parser = argparse.ArgumentParser('MAE pre-training', add_help=False) + parser.add_argument('--batch_size', default=64, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=400, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='mae_vit_large_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--input_size', default=224, type=int, + help='images input size') + + parser.add_argument('--mask_ratio', default=0.75, type=float, + help='Masking ratio (percentage of removed patches).') + + parser.add_argument('--norm_pix_loss', action='store_true', + help='Use (per-patch) normalized pixels as targets for computing loss') + parser.set_defaults(norm_pix_loss=False) + + # Optimizer parameters + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', + help='epochs to warmup LR') + + + parser.add_argument('--use-adan', action='store_true', default=False, + help='whether to use Adan') + parser.add_argument('--max-grad-norm', type=float, default=0.0, + help='max grad norm (default: 0.0 for no clip)') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--bias-decay', action='store_true', default=False, + help='whether to decay bias term') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + + parser.add_argument('--output_dir', default=None, + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='./pretrain_dir/', + help='path where to tensorboard log') + 
parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default=None, + help='resume from checkpoint') + parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + + return parser + + +def main(args): + misc.init_distributed_mode(args) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # simple augmentation + transform_train = transforms.Compose([ + transforms.RandomResizedCrop(args.input_size, scale=(0.2, 1.0), interpolation=3), # 3 is bicubic + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + dataset_train = datasets.ImageFolder(os.path.join(args.data_path, 'train'), transform=transform_train) + print(dataset_train) + + if True: # args.distributed: + num_tasks = misc.get_world_size() + global_rank = misc.get_rank() + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + + if misc.is_main_process() and args.log_dir is not None: + TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.datetime.now()) + + args.log_dir = args.log_dir+ 'mae-' + TIMESTAMP + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + # define the model + model = models_mae.__dict__[args.model](norm_pix_loss=args.norm_pix_loss) + + model.to(device) + + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], 
find_unused_parameters=True) + model_without_ddp = model.module + + # following timm: set wd as 0 for bias and norm layers + param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay) + if args.use_adan: + if args.bias_decay: + param = model_without_ddp.parameters() + else: + param = param_groups + args.weight_decay = 0.0 + optimizer = Adan(param, weight_decay=args.weight_decay, + lr=args.lr, betas=args.opt_betas, eps = args.opt_eps, max_grad_norm=args.max_grad_norm + ) + else: + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + resume_epoch = None + + if not args.resume: + resume_path = os.path.join(args.output_dir, "last.pth.tar") + print(resume_path, os.path.isfile(resume_path)) + if os.path.isfile(resume_path): args.resume = resume_path + + if args.resume: + resume_epoch = resume_checkpoint( + model_without_ddp, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=misc.is_main_process()) + if resume_epoch is not None: + args.start_epoch = resume_epoch + + #misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + print(f"Start training for {args.epochs} epochs") + saver = None + if misc.is_main_process() and args.output_dir is not None: + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, amp_scaler=loss_scaler, + checkpoint_dir=args.output_dir, recovery_dir=args.output_dir, decreasing=True, max_history=2) + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args + ) + if saver is not None: + # save proper checkpoint with eval metric + + saver.save_checkpoint(epoch, train_stats['loss']) + # if args.output_dir and (epoch % 25 == 0 or epoch + 1 == args.epochs): + # misc.save_model( + # args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + # loss_scaler=loss_scaler, epoch=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/CV/MAE/models_mae.py b/CV/MAE/models_mae.py new file mode 100644 index 0000000..880e28f --- /dev/null +++ b/CV/MAE/models_mae.py @@ -0,0 +1,250 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +from functools import partial + +import torch +import torch.nn as nn + +from timm.models.vision_transformer import PatchEmbed, Block + +from util.pos_embed import get_2d_sincos_pos_embed + + +class MaskedAutoencoderViT(nn.Module): + """ Masked Autoencoder with VisionTransformer backbone + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, + embed_dim=1024, depth=24, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False): + super().__init__() + + # -------------------------------------------------------------------------- + # MAE encoder specifics + self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False) # fixed sin-cos embedding + + self.blocks = nn.ModuleList([ + Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + # -------------------------------------------------------------------------- + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) + + self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, decoder_embed_dim), requires_grad=False) # fixed sin-cos embedding + + self.decoder_blocks = nn.ModuleList([ + Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer) + for i in range(decoder_depth)]) + + self.decoder_norm = norm_layer(decoder_embed_dim) + self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size**2 * in_chans, bias=True) # decoder to patch + # -------------------------------------------------------------------------- + + self.norm_pix_loss = norm_pix_loss + + self.initialize_weights() + + def initialize_weights(self): + # initialization + # initialize (and freeze) pos_embed by sin-cos embedding + pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=True) + self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=True) + self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.) 
+ torch.nn.init.normal_(self.cls_token, std=.02) + torch.nn.init.normal_(self.mask_token, std=.02) + + # initialize nn.Linear and nn.LayerNorm + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def patchify(self, imgs): + """ + imgs: (N, 3, H, W) + x: (N, L, patch_size**2 *3) + """ + p = self.patch_embed.patch_size[0] + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 + + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x = torch.einsum('nchpwq->nhwpqc', x) + x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) + return x + + def unpatchify(self, x): + """ + x: (N, L, patch_size**2 *3) + imgs: (N, 3, H, W) + """ + p = self.patch_embed.patch_size[0] + h = w = int(x.shape[1]**.5) + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, 3)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p)) + return imgs + + def random_masking(self, x, mask_ratio): + """ + Perform per-sample random masking by per-sample shuffling. + Per-sample shuffling is done by argsort random noise. + x: [N, L, D], sequence + """ + N, L, D = x.shape # batch, length, dim + len_keep = int(L * (1 - mask_ratio)) + + noise = torch.rand(N, L, device=x.device) # noise in [0, 1] + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) + + # generate the binary mask: 0 is keep, 1 is remove + mask = torch.ones([N, L], device=x.device) + mask[:, :len_keep] = 0 + # unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return x_masked, mask, ids_restore + + def forward_encoder(self, x, mask_ratio): + # embed patches + x = self.patch_embed(x) + + # add pos embed w/o cls token + x = x + self.pos_embed[:, 1:, :] + + # masking: length -> length * mask_ratio + x, mask, ids_restore = self.random_masking(x, mask_ratio) + + # append cls token + cls_token = self.cls_token + self.pos_embed[:, :1, :] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + # apply Transformer blocks + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + + return x, mask, ids_restore + + def forward_decoder(self, x, ids_restore): + # embed tokens + x = self.decoder_embed(x) + + # append mask tokens to sequence + mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1) + x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token + x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])) # unshuffle + x = torch.cat([x[:, :1, :], x_], dim=1) # append cls token + + # add pos embed + x = x + self.decoder_pos_embed + + # apply Transformer blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # predictor projection + x = self.decoder_pred(x) + + # remove cls token + x = x[:, 1:, :] + + return x + + def forward_loss(self, imgs, pred, mask): + """ + imgs: [N, 3, H, W] + pred: [N, L, p*p*3] + 
mask: [N, L], 0 is keep, 1 is remove, + """ + target = self.patchify(imgs) + if self.norm_pix_loss: + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.e-6)**.5 + + loss = (pred - target) ** 2 + loss = loss.mean(dim=-1) # [N, L], mean loss per patch + + loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches + return loss + + def forward(self, imgs, mask_ratio=0.75): + latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio) + pred = self.forward_decoder(latent, ids_restore) # [N, L, p*p*3] + loss = self.forward_loss(imgs, pred, mask) + return loss, pred, mask + + +def mae_vit_base_patch16_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=16, embed_dim=768, depth=12, num_heads=12, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def mae_vit_large_patch16_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def mae_vit_huge_patch14_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +# set recommended archs +mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b # decoder: 512 dim, 8 blocks +mae_vit_large_patch16 = mae_vit_large_patch16_dec512d8b # decoder: 512 dim, 8 blocks +mae_vit_huge_patch14 = mae_vit_huge_patch14_dec512d8b # decoder: 512 dim, 8 blocks diff --git a/CV/MAE/models_vit.py b/CV/MAE/models_vit.py new file mode 100644 index 0000000..2244a17 --- /dev/null +++ b/CV/MAE/models_vit.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +from functools import partial + +import torch +import torch.nn as nn + +import timm.models.vision_transformer + + +class VisionTransformer(timm.models.vision_transformer.VisionTransformer): + """ Vision Transformer with support for global average pooling + """ + def __init__(self, global_pool=False, **kwargs): + super(VisionTransformer, self).__init__(**kwargs) + + self.global_pool = global_pool + if self.global_pool: + norm_layer = kwargs['norm_layer'] + embed_dim = kwargs['embed_dim'] + self.fc_norm = norm_layer(embed_dim) + + del self.norm # remove the original norm + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + if self.global_pool: + x = x[:, 1:, :].mean(dim=1) # global pool without cls token + outcome = self.fc_norm(x) + else: + x = self.norm(x) + outcome = x[:, 0] + + return outcome + + +def vit_base_patch16(**kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_large_patch16(**kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_huge_patch14(**kwargs): + model = VisionTransformer( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model \ No newline at end of file diff --git a/CV/MAE/util/crop.py b/CV/MAE/util/crop.py new file mode 100644 index 0000000..fcb2612 --- /dev/null +++ b/CV/MAE/util/crop.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch + +from torchvision import transforms +from torchvision.transforms import functional as F + + +class RandomResizedCrop(transforms.RandomResizedCrop): + """ + RandomResizedCrop for matching TF/TPU implementation: no for-loop is used. + This may lead to results different with torchvision's version. 
+ Following BYOL's TF code: + https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206 + """ + @staticmethod + def get_params(img, scale, ratio): + width, height = F._get_image_size(img) + area = height * width + + target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item() + log_ratio = torch.log(torch.tensor(ratio)) + aspect_ratio = torch.exp( + torch.empty(1).uniform_(log_ratio[0], log_ratio[1]) + ).item() + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + w = min(w, width) + h = min(h, height) + + i = torch.randint(0, height - h + 1, size=(1,)).item() + j = torch.randint(0, width - w + 1, size=(1,)).item() + + return i, j, h, w \ No newline at end of file diff --git a/CV/MAE/util/datasets.py b/CV/MAE/util/datasets.py new file mode 100644 index 0000000..0dde1f4 --- /dev/null +++ b/CV/MAE/util/datasets.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +import os +import PIL + +from torchvision import datasets, transforms + +from timm.data import create_transform +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + +def build_dataset(is_train, args): + transform = build_transform(is_train, args) + + root = os.path.join(args.data_path, 'train' if is_train else 'val') + dataset = datasets.ImageFolder(root, transform=transform) + + print(dataset) + + return dataset + + +def build_transform(is_train, args): + mean = IMAGENET_DEFAULT_MEAN + std = IMAGENET_DEFAULT_STD + # train transform + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation='bicubic', + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + mean=mean, + std=std, + ) + return transform + + # eval transform + t = [] + if args.input_size <= 224: + crop_pct = 224 / 256 + else: + crop_pct = 1.0 + size = int(args.input_size / crop_pct) + t.append( + transforms.Resize(size, interpolation=PIL.Image.BICUBIC), # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(mean, std)) + return transforms.Compose(t) diff --git a/CV/MAE/util/lars.py b/CV/MAE/util/lars.py new file mode 100644 index 0000000..509c5f6 --- /dev/null +++ b/CV/MAE/util/lars.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# LARS optimizer, implementation from MoCo v3: +# https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- + +import torch + + +class LARS(torch.optim.Optimizer): + """ + LARS optimizer, no rate scaling or weight decay for parameters <= 1D. 
+ """ + def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient) + super().__init__(params, defaults) + + @torch.no_grad() + def step(self): + for g in self.param_groups: + for p in g['params']: + dp = p.grad + + if dp is None: + continue + + if p.ndim > 1: # if not normalization gamma/beta or bias + dp = dp.add(p, alpha=g['weight_decay']) + param_norm = torch.norm(p) + update_norm = torch.norm(dp) + one = torch.ones_like(param_norm) + q = torch.where(param_norm > 0., + torch.where(update_norm > 0, + (g['trust_coefficient'] * param_norm / update_norm), one), + one) + dp = dp.mul(q) + + param_state = self.state[p] + if 'mu' not in param_state: + param_state['mu'] = torch.zeros_like(p) + mu = param_state['mu'] + mu.mul_(g['momentum']).add_(dp) + p.add_(mu, alpha=-g['lr']) \ No newline at end of file diff --git a/CV/MAE/util/lr_decay.py b/CV/MAE/util/lr_decay.py new file mode 100644 index 0000000..7fa11f1 --- /dev/null +++ b/CV/MAE/util/lr_decay.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# ELECTRA https://github.com/google-research/electra +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import json + + +def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75): + """ + Parameter groups for layer-wise lr decay + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 + """ + param_group_names = {} + param_groups = {} + + num_layers = len(model.blocks) + 1 + + layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1)) + + for n, p in model.named_parameters(): + if not p.requires_grad: + continue + + # no decay: all 1D parameters and model specific ones + if p.ndim == 1 or n in no_weight_decay_list: + g_decay = "no_decay" + this_decay = 0. + else: + g_decay = "decay" + this_decay = weight_decay + + layer_id = get_layer_id_for_vit(n, num_layers) + group_name = "layer_%d_%s" % (layer_id, g_decay) + + if group_name not in param_group_names: + this_scale = layer_scales[layer_id] + + param_group_names[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + param_groups[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + + param_group_names[group_name]["params"].append(n) + param_groups[group_name]["params"].append(p) + + # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) + + return list(param_groups.values()) + + +def get_layer_id_for_vit(name, num_layers): + """ + Assign a parameter with its layer id + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33 + """ + if name in ['cls_token', 'pos_embed']: + return 0 + elif name.startswith('patch_embed'): + return 0 + elif name.startswith('blocks'): + return int(name.split('.')[1]) + 1 + else: + return num_layers \ No newline at end of file diff --git a/CV/MAE/util/lr_sched.py b/CV/MAE/util/lr_sched.py new file mode 100644 index 0000000..4cb682b --- /dev/null +++ b/CV/MAE/util/lr_sched.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +def adjust_learning_rate(optimizer, epoch, args): + """Decay the learning rate with half-cycle cosine after warmup""" + if epoch < args.warmup_epochs: + lr = args.lr * epoch / args.warmup_epochs + else: + lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ + (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) + for param_group in optimizer.param_groups: + if "lr_scale" in param_group: + param_group["lr"] = lr * param_group["lr_scale"] + else: + param_group["lr"] = lr + return lr diff --git a/CV/MAE/util/misc.py b/CV/MAE/util/misc.py new file mode 100644 index 0000000..b62001a --- /dev/null +++ b/CV/MAE/util/misc.py @@ -0,0 +1,366 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import builtins +import datetime +import os +import time +from collections import defaultdict, deque +from pathlib import Path + +import torch +import torch.distributed as dist +from torch._six import inf + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if v is None: + continue + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if torch.cuda.is_available(): + log_msg.append('max mem: {memory:.0f}') + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + builtin_print = builtins.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + force = force or (get_world_size() > 8) + if is_master or force: + now = datetime.datetime.now().time() + builtin_print('[{}] '.format(now), end='') # print with time 
stamp + builtin_print(*args, **kwargs) + + builtins.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_ddpjob(args=None): + """ + initialize the ddp job + """ + if not dist.is_available() or not dist.is_initialized(): + try: + os.environ['MASTER_PORT'] = '40101' + torch.distributed.init_process_group( + backend='nccl') + except Exception: + world_size, rank = 1, 0 + print('distributed training not available') + print(Exception) + world_size = dist.get_world_size() + rank = dist.get_rank() + assert rank >= 0 + args.gpu = args.rank + args.world_size, args.rank = world_size, rank + + args.distributed = True + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + #torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +def init_distributed_mode(args): + if args.dist_on_itp: + args.rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']) + os.environ['LOCAL_RANK'] = str(args.gpu) + os.environ['RANK'] = str(args.rank) + os.environ['WORLD_SIZE'] = str(args.world_size) + # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] + elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + setup_for_distributed(is_master=True) # hack + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}, gpu {}'.format( + args.rank, args.dist_url, args.gpu), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +class NativeScalerWithGradNormCount: + state_dict_key = "amp_scaler" + + def __init__(self): + self._scaler = torch.cuda.amp.GradScaler() + + def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True): + self._scaler.scale(loss).backward(create_graph=create_graph) + if update_grad: + if clip_grad is not None: + assert parameters is not None + self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place + norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) + else: + self._scaler.unscale_(optimizer) + norm = get_grad_norm_(parameters) + self._scaler.step(optimizer) + self._scaler.update() + else: + norm = None + return norm + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + self._scaler.load_state_dict(state_dict) + + +def get_grad_norm_(parameters, norm_type: 
float = 2.0) -> torch.Tensor: + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + norm_type = float(norm_type) + if len(parameters) == 0: + return torch.tensor(0.) + device = parameters[0].grad.device + if norm_type == inf: + total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) + else: + total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + return total_norm + + +def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler): + output_dir = Path(args.output_dir) + epoch_name = str(epoch) + if loss_scaler is not None: + checkpoint_paths = [output_dir / ('checkpoint-%s.pth' % epoch_name)] + for checkpoint_path in checkpoint_paths: + to_save = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'epoch': epoch, + 'scaler': loss_scaler.state_dict(), + 'args': args, + } + + save_on_master(to_save, checkpoint_path) + else: + client_state = {'epoch': epoch} + model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state) + + +def load_model(args, model_without_ddp, optimizer, loss_scaler): + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + print("Resume checkpoint %s" % args.resume) + if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval): + optimizer.load_state_dict(checkpoint['optimizer']) + args.start_epoch = checkpoint['epoch'] + 1 + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + print("With optim & sched!") + + +def all_reduce_mean(x): + world_size = get_world_size() + if world_size > 1: + x_reduce = torch.tensor(x).cuda() + dist.all_reduce(x_reduce) + x_reduce /= world_size + return x_reduce.item() + else: + return x \ No newline at end of file diff --git a/CV/MAE/util/pos_embed.py b/CV/MAE/util/pos_embed.py new file mode 100644 index 0000000..6acf8bd --- /dev/null +++ b/CV/MAE/util/pos_embed.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# Position embedding utils +# -------------------------------------------------------- + +import numpy as np + +import torch + +# -------------------------------------------------------- +# 2D sine-cosine position embedding +# References: +# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +# -------------------------------------------------------- +# Interpolate position embeddings for high-resolution +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- +def interpolate_pos_embed(model, checkpoint_model): + if 'pos_embed' in checkpoint_model: + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed diff --git a/CV/timm/README.md b/CV/timm/README.md new file mode 100644 index 0000000..3592aba --- /dev/null +++ b/CV/timm/README.md @@ -0,0 +1,79 @@ +# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models + +For vision tasks, our implementation is based on the official [`timm`](https://github.com/rwightman/pytorch-image-models). To reproduce our results, please first refer to [`timm`](https://github.com/rwightman/pytorch-image-models) and install it. Then you can follow the following two steps to reproduce our experiments in paper. + + + +## Environment + +Our experiments for this task are based on the following pkg version. + +```python +torch.__version__ = '1.10.0+cu113' +torchvision.__version__ = '0.11.1+cu113' +timm.__version__ = '0.6.1' +torchaudio.__version__ = '0.10.0+cu113' +``` + +Note that our timm is a developer version. If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:timm](https://hub.docker.com/repository/docker/xyxie/adan-image). 
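+
+As a quick sanity check that the environment and the optimizer work together, the snippet below builds Adan directly from the `adan.py` in this folder and runs a few steps on a toy model. This is only an illustrative sketch (the model, data, and loop are placeholders, not part of the released training scripts); the actual timm integration is described in the next section.
+
+```python
+import torch
+import torch.nn as nn
+
+from adan import Adan  # adan.py from this directory
+
+# Toy model and random data, used only to exercise the optimizer.
+model = nn.Linear(10, 2)
+inputs, targets = torch.randn(8, 10), torch.randint(0, 2, (8,))
+criterion = nn.CrossEntropyLoss()
+
+# Hyper-parameters follow the defaults of the Adan constructor in adan.py;
+# weight_decay=0.02 mirrors the default of the --weight-decay argument below.
+optimizer = Adan(model.parameters(), lr=1e-3, betas=(0.98, 0.92, 0.99),
+                 weight_decay=0.02, max_grad_norm=0.0, no_prox=False)
+
+for _ in range(5):
+    optimizer.zero_grad()
+    loss = criterion(model(inputs), targets)
+    loss.backward()
+    optimizer.step()  # this Adan.step() takes no closure argument
+```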
+
+
+
+## Usage of Adan in timm
+
+### Two steps to use Adan
+
+**Step 1.** Add the Adan-dependent hyper-parameters to `train.py` by adding the following arguments:
+
+```python
+parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clip)')
+parser.add_argument('--weight-decay', type=float, default=0.02, help='weight decay, similar one used in AdamW (default: 0.02)')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where second-order moment is zero (default: None, use opt default 1e-8 in adan)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use opt default [0.98, 0.92, 0.99] in Adan)')
+parser.add_argument('--no-prox', action='store_true', default=False, help='whether to perform weight decay like AdamW (default=False)')
+parser.add_argument('--bias-decay', action='store_true', default=False, help='perform the weight decay on bias term (default=False)')
+
+```
+
+* `bias-decay`: It decides whether or not to perform weight decay on 1) the bias terms, 2) BN parameters, and 3) other 1-d parameters, which are all filtered out by the default setting in timm.
+
+* `no-prox`: It determines the update rule for parameters with weight decay. By default, Adan updates the parameters in the way presented in Algorithm 1 of the paper:
+
+  $$\boldsymbol{\theta}_{k+1} = (1+\lambda \eta)^{-1}\left[\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k)\right],$$
+
+  But one can also update the parameters like AdamW:
+
+  $$\boldsymbol{\theta}_{k+1} = (1-\lambda \eta)\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k).$$
+
+  **In all experiments, we set `no-prox=False` in our paper.**
+
+
+
+**Step 2.** Create the Adan optimizer as follows. In this step, we directly replace the vanilla optimizer creation with the following three substeps.
+
+i) add Adan into `optim_factory.py`:
+  ```python
+  elif opt_lower == 'adan':
+      optimizer = Adan(parameters, **opt_args)
+  ```
+
+ii) import the optimizer creator from `optim_factory` into your training file `train.py`:
+
+  ```python
+  from optim_factory import create_optimizer
+  ```
+
+iii) replace the vanilla creator (`optimizer = create_optimizer(args, model)`) in the training file `train.py` with Adan:
+
+  ```python
+  opt_lower = args.opt.lower()
+  if opt_lower == 'adan':
+      args.opt_args = {'max_grad_norm': args.max_grad_norm, 'no_prox': args.no_prox}
+  optimizer = create_optimizer(args, model, filter_bias_and_bn=not args.bias_decay)
+  ```
+
+
+
+## ImageNet-1K Training Recipe
+
+We provide the specific commands and hyper-parameters for ViTs, ResNets and ConvNexts in this [recipe](./supervised.md).
+
diff --git a/CV/timm/adan.py b/CV/timm/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/CV/timm/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import torch +from torch.optim.optimizer import Optimizer +from timm.utils import * + + +class Adan(Optimizer): + """ + Implements a pytorch variant of Adan + + Adan was proposed in + Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. + https://arxiv.org/abs/2208.06677 + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float, flot], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml b/CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml new file mode 100644 index 0000000..30485ea --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml @@ -0,0 +1,111 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 150 +eval_metric: top1 +experiment: +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 0.0001 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: convnext_tiny_hnf +model_ema: false +model_ema_decay: 
0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/cvnext +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: bicubic +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.04 +workers: 8 diff --git a/CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml b/CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml new file mode 100644 index 0000000..09de86f --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml @@ -0,0 +1,111 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.016 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 0.0001 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: convnext_tiny_hnf +model_ema: true +model_ema_decay: 0.9999 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.9 +opt_eps: 1.0e-08 +output: ./exp_results/cvnext +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 150 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv b/CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv new file mode 100644 index 0000000..eb20334 --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv @@ -0,0 +1,162 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.949120010648455,6.94758,0.098,0.5 +1,6.862144197736468,6.67267,0.7260000094604492,2.8660000064086915 +2,6.758432626724243,6.235245,2.4060000006103515,7.79000000793457 
+3,6.59022855758667,5.71968375,5.324000036621094,14.974000053710938 +4,6.390288966042655,5.161505,9.559999970703124,24.06800000732422 +5,6.191993509020124,4.61118375,15.203999997558594,34.33800004638672 +6,5.93863810811724,4.02059625,22.75800002685547,45.19399994628906 +7,5.739998238427298,3.502533125,29.72400002319336,54.630000031738284 +8,5.5293778691973,3.0821,36.9499999987793,62.65999991210938 +9,5.182862758636475,2.7918725,42.444000041503905,68.35999993652344 +10,5.155720744814191,2.5169634375,46.800000048828124,72.43000000976562 +11,4.918254239218576,2.3485275,50.37200013916016,75.46400005371093 +12,4.932929992675781,2.1857096875,53.25000006347656,77.90200017333984 +13,4.708190441131592,2.069689375,55.221999931640624,79.64600004882813 +14,4.694507598876953,1.96074125,57.45999997558594,80.91000001220704 +15,4.671949318477085,1.8996378125,58.692000075683595,82.26000005859375 +16,4.54720984186445,1.8131734375,59.98400012207031,83.24400005859376 +17,4.458590064729963,1.8123903125,60.53799998535156,83.96999998291015 +18,4.3758848905563354,1.7519709375,61.49800001953125,84.48200008544922 +19,4.340606348855155,1.6863440625,62.391999909667966,85.24799992919922 +20,4.472063779830933,1.68092125,62.82999993164062,85.42400000732422 +21,4.378427471433367,1.679071875,63.790000107421875,86.00400018310548 +22,4.375464507511684,1.648258125,63.99400006347656,86.01799992919922 +23,4.337998969214303,1.62242875,64.24999989746094,86.27799989990234 +24,4.298710312162127,1.59491171875,64.766,86.731999921875 +25,4.22202069418771,1.5892434375,65.10400015625,86.99000014892579 +26,4.316074218068804,1.56527109375,65.22599997558594,87.11799984375 +27,4.191796098436628,1.53780734375,65.55000013183594,87.21200010009765 +28,4.222038558551243,1.575911875,65.37400005859375,87.27199994873047 +29,4.135323320116315,1.5107128125,66.06800002441406,87.29200002441407 +30,4.22194378716605,1.545961875,66.02200002929688,87.62000001953125 +31,4.185418827193124,1.51114390625,66.05799994873047,87.71000005126953 +32,4.096941930907113,1.49620984375,65.96999989990235,87.7519999975586 +33,4.215072478566851,1.4948609375,66.21000018554687,87.73399987060547 +34,4.163071274757385,1.5585396875,66.666,87.91400005126953 +35,4.114607776914324,1.5304534375,66.01000005371094,87.63400002685547 +36,4.118212870189121,1.5224515625,66.20600007080078,87.93999994384765 +37,4.204281193869455,1.5179465625,66.38400010253906,87.89600007080078 +38,4.24107871736799,1.52187875,66.32999999755859,87.8100000732422 +39,4.1643034390040805,1.515391875,66.61999997558594,87.92400002197266 +40,4.070860539163862,1.56338328125,66.33000002685547,87.8499999975586 +41,4.184106230735779,1.514239375,66.48600007568359,87.71399991943359 +42,4.172222001211984,1.50769546875,66.104,87.7760001538086 +43,4.3232389858790805,1.57834875,65.96399992675781,87.65399987060547 +44,4.174148797988892,1.5443553125,65.89199994628906,87.63599997070312 +45,4.1315469571522305,1.51145125,65.72400005615235,87.67199999511719 +46,4.283388665744236,1.565783125,66.0200000805664,87.78800002197265 +47,4.183656964983259,1.525795,65.64399995117188,87.71999999511719 +48,4.234571712357657,1.50795125,65.86199995117188,87.81200017578125 +49,4.198768717902047,1.5564540625,65.36000006591797,87.1520000830078 +50,4.219038724899292,1.5349934375,65.90200008056641,87.5440000756836 +51,4.127701095172337,1.515018125,65.67200011230469,87.53599997070313 +52,4.239894764763968,1.53229078125,65.62400016601562,87.49200007324218 +53,4.229601979255676,1.5197615625,65.15399995605469,87.35199997314453 
+54,4.19003392968859,1.51075296875,65.7220001538086,87.46800004638672 +55,4.192435826574053,1.5914375,65.48400008544922,87.322000078125 +56,4.1021279607500345,1.54005890625,65.38599997802734,87.42000010498047 +57,4.20039519241878,1.5680734375,65.5919998828125,87.57800007568359 +58,4.324156045913696,1.59799640625,64.72200002441406,87.17800012939453 +59,4.239287546702793,1.5902246875,64.84800000732422,86.97599997558594 +60,4.065577728407724,1.44326765625,68.30400000244141,89.15400017333984 +61,4.120057480675833,1.386383125,68.40999994628906,89.24200007324218 +62,4.0325968606131415,1.39331453125,68.53599994628907,89.37199999267578 +63,4.132853150367737,1.42474296875,68.66400004394531,89.28800007080078 +64,4.020219087600708,1.3872065625,68.5939999975586,89.42399996826173 +65,4.013268879481724,1.39158640625,68.34799997070313,89.32800001953125 +66,4.005090730530875,1.3511690625,68.99200005371094,89.52000014648438 +67,3.9683394602366855,1.36299828125,69.29600005126953,89.69600004638671 +68,4.024942125592913,1.340035625,69.46800006835937,89.66600001708984 +69,3.9339629071099416,1.35916578125,69.40000004394531,89.7560000439453 +70,3.9678512130464827,1.35286515625,69.51199994628907,89.79800007324219 +71,4.023373927388873,1.360429375,69.92400012695313,90.07600001953125 +72,3.969754219055176,1.33380875,69.88200002197266,90.17200012451171 +73,3.9590375082833424,1.28158140625,70.1680000756836,90.32199991699218 +epoch,train_loss,eval_loss,eval_top1,eval_top5 +74,3.872572592326573,1.335075,70.19199993652344,90.28600004394531 +75,3.909360408782959,1.312461875,70.52400001708985,90.5280001196289 +76,3.8855138335909163,1.28012890625,70.78799993408204,90.43199994140625 +77,3.919007114001683,1.31800015625,70.6560000390625,90.4380000732422 +78,3.891653691019331,1.336391875,70.90000001708984,90.46399996337891 +79,3.894351737839835,1.28790734375,71.00600001953126,90.64600007080078 +80,3.823104841368539,1.262434375,71.15400001953125,90.76999993896484 +81,3.910448908805847,1.25446109375,71.07600009521484,90.81999999267578 +82,3.924909387316023,1.2769846875,71.60600012207031,90.9519999658203 +83,3.7137380497796193,1.2824403125,71.43800014648437,90.88200001464844 +84,3.8967798948287964,1.24836609375,71.78200009765625,91.15599999267579 +85,3.8117938893181935,1.2596334375,71.88600006347656,91.25200017333984 +86,3.9488723278045654,1.22986734375,72.25800017089844,91.46800001464844 +87,3.7670505387442454,1.22124546875,72.36199996826171,91.43000004394531 +88,3.825251579284668,1.199299375,72.53400001708984,91.48800014892578 +89,3.860643812588283,1.19397125,72.801999921875,91.58000007324219 +90,3.779147114072527,1.20414328125,72.74400001464844,91.69600009277343 +91,3.759223989077977,1.191304375,72.82799999023437,91.7520001196289 +92,3.7066173212868825,1.16685125,73.24599995605469,91.93600001464844 +93,3.715459874698094,1.1712415625,72.98600003417968,91.88200006591796 +94,3.826604655810765,1.16880703125,73.4040000390625,91.98200006591797 +95,3.775054318564279,1.15899234375,73.8859999584961,92.25399999023438 +96,3.782100932938712,1.166160625,73.78200000976562,92.13400014404297 +97,3.743901354925973,1.16621484375,73.6680001171875,92.34799999023437 +98,3.742660846029009,1.1303665625,73.92599998535157,92.48599998779297 +99,3.6611821992056712,1.13907421875,74.32999998291015,92.30999993652344 +100,3.7334849323545183,1.13430890625,74.36000008544922,92.46600009277344 +101,3.634329523359026,1.1071696875,74.45400001220703,92.61800016845703 +102,3.664542010852269,1.114905625,74.62600011230468,92.6480000366211 
+103,3.5832586458751132,1.0819065625,75.06000008789063,92.80000006591797 +104,3.64421922819955,1.0801809375,75.07000005615234,93.16000000976562 +105,3.607390114239284,1.0793553125,75.23599990722656,93.0760001171875 +106,3.5108348301478793,1.07566765625,75.54999998291015,93.04799990966796 +107,3.588543857846941,1.0673165625,75.23600003662109,93.10200000976562 +108,3.5450944219316756,1.05819828125,75.74000008544922,93.2359999609375 +109,3.506815637860979,1.040446875,75.8360001147461,93.39200006591797 +110,3.4770743335996355,1.05782140625,75.7600000415039,93.24000001220703 +111,3.5809445721762523,1.0449415625,75.99400006347656,93.38399998779298 +112,3.569081289427621,1.04761125,76.47800013427734,93.5019999584961 +113,3.497449823788234,1.03080390625,76.34600006347657,93.6019998803711 +114,3.4141811473029002,1.0365765625,76.51400006103516,93.5659999609375 +115,3.488074677331107,1.02879859375,76.75000016601562,93.72600006347656 +116,3.478419746671404,1.00970078125,76.97399990966797,93.84199998535156 +117,3.5675860132489885,1.01771640625,76.95800005859375,93.8620000366211 +118,3.424144216946193,0.9859259375,77.22600010742188,93.91600014404297 +119,3.377763560840062,0.9717196875,77.44999998535157,94.06000008789063 +120,3.500486288751875,0.97851296875,77.43200000488281,94.13199998535156 +121,3.3951018367494856,0.96721609375,77.89600000488281,94.10600008789062 +122,3.4327589103153775,0.972241875,77.86000005615234,94.26000006347657 +123,3.356873188699995,0.95380078125,77.99800005615235,94.25000008789063 +124,3.366360766547067,0.94531078125,78.0940000024414,94.36000000732422 +125,3.2791837453842163,0.93246,78.37000002685546,94.53400008789062 +126,3.3443728174482072,0.944398125,78.48199998046876,94.55600000976563 +127,3.332287073135376,0.93921078125,78.58600000244141,94.56599993164062 +128,3.276544622012547,0.9311209375,78.60599998291016,94.67399998291016 +129,3.270776629447937,0.930044375,78.72600005371093,94.63600008789062 +130,3.1806167364120483,0.91067625,79.11799989990234,94.85600008789062 +131,3.2413502761295865,0.9131765625,79.11400010742187,94.90999993164063 +132,3.3152259417942593,0.91413046875,79.33600010742188,94.94599988037109 +133,3.2410558121544972,0.91534203125,79.33800002685547,94.95599998291016 +134,3.1902000393186296,0.8926434375,79.64600013183593,95.03800008789062 +135,3.2619236537388394,0.87898453125,79.832000078125,95.11199995605469 +136,3.1733152525765553,0.8800503125,79.84800002929687,95.14799993408204 +137,3.2605464458465576,0.87765875,79.98400005859375,95.22400003662109 +138,3.1617833205631802,0.88012453125,80.00399994873047,95.24800003417968 +139,3.149062837873186,0.8756025,80.15600005371094,95.29200005859374 +140,3.11025937965938,0.8646725,80.29399990234376,95.40000006103516 +141,3.0998826708112444,0.8566525,80.40999998046875,95.44000000976563 +142,3.127287915774754,0.8480178125,80.42199994873047,95.53199993164063 +143,3.0388451644352505,0.855489375,80.51200018554688,95.50399995605468 +144,3.076501284326826,0.847876875,80.738000078125,95.59399995605469 +145,3.1298107419695174,0.84905984375,80.82400002929687,95.57399995605469 +146,3.075501118387495,0.84291765625,81.01200005126952,95.6200001123047 +147,3.054636767932347,0.83939078125,80.97999997802734,95.68599998046875 +148,3.1356202363967896,0.8339446875,80.97600008056641,95.58800011230468 +149,2.961950966290065,0.8238534375,81.13400012939454,95.69599995605469 +150,2.983371581350054,0.8275609375,81.26999989746093,95.69399998046875 +151,2.9757753951208934,0.8230825,81.20200000244141,95.65000003417968 
+152,2.9356598343167986,0.82456109375,81.38800002685547,95.78400008544922 +153,2.976205723626273,0.82176859375,81.40400002685547,95.74199998291016 +154,3.0084752525602068,0.82321359375,81.47600010253906,95.78000011230469 +155,2.9824235098702565,0.819495625,81.59399997558593,95.75999998291016 +156,3.010391661099025,0.8145009375,81.54799997558594,95.79999992919922 +157,3.0055614709854126,0.80819,81.62000002685546,95.77600006103516 +158,3.0069884743009294,0.81687484375,81.694000078125,95.80000003417969 +159,2.9622493471418108,0.81747,81.71600012695312,95.86999987792969 diff --git a/CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv b/CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv new file mode 100644 index 0000000..4d3763d --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.94980594090053,6.941925,0.148,0.5639999999332428 +1,6.870656830923898,6.93398,0.14,0.6059999999332428 +2,6.832324981689453,6.9268075,0.14,0.6260000001525879 +3,6.744790928704398,6.85027,0.356,1.423999999732971 +4,6.6263234955923895,6.82559625,0.3519999999332428,1.680000004119873 +5,6.48770945412772,6.8010825,0.445999999294281,1.8840000047302246 +6,6.354598147528512,6.7766625,0.5019999987411499,2.0840000004577637 +7,6.1707538196018765,6.75051,0.5359999960327149,2.357999996337891 +8,6.007028205054147,6.724545,0.5940000102233887,2.5400000085449217 +9,5.772588491439819,6.6968475,0.6180000048828125,2.764000009155273 +10,5.631790944508144,6.66867,0.6499999992370605,2.8380000115966797 +11,5.379027741295951,6.63972,0.7039999964904785,2.983999981994629 +12,5.221321684973581,6.6121825,0.7779999993896485,3.107999990234375 +13,5.053121021815708,6.590085,0.8179999989318848,3.2280000225830077 +14,4.956123658588955,6.572545,0.8080000015258789,3.3900000115966797 +15,4.799027442932129,6.5652675,0.812000001373291,3.498000008239746 +16,4.720616817474365,6.5723075,0.8280000000762939,3.607999997253418 +17,4.573836837496076,6.59066375,0.8199999999237061,3.646000002441406 +18,4.5748593126024515,6.61353875,0.8399999986267089,3.5579999966430664 +19,4.5739074775150845,6.6250425,0.8559999960327148,3.7140000021362303 +20,4.318449326923916,6.61105625,0.937999998626709,3.906000018005371 +21,4.435781615121024,6.56041875,1.032000001525879,4.251999999084473 +22,4.302624361855643,6.4759325,1.2600000071716309,4.878000020751953 +23,4.414106096540179,6.3724225,1.5319999990844726,5.741999996643067 +24,4.211065360477993,6.27572125,1.7319999960327148,6.6620000073242185 +25,4.247744934899466,6.19649875,1.955999998626709,7.483999992370605 +26,4.282609258379255,6.13518,2.046000001068115,8.096000004882812 +27,4.169072525841849,6.0866725,2.1599999997711183,8.647999996643067 +28,4.122129150799343,6.0465125,2.2039999970245363,9.036000012817382 +29,4.049461347716195,6.01146125,2.181999998474121,9.56800002380371 +30,4.055705479213169,5.97383875,2.218000001220703,10.153999978637696 +31,4.163602726800101,5.93298875,2.298000004272461,10.857999990234376 +32,4.081817013876779,5.88474,2.490000000228882,11.744000009765625 +33,4.099612167903355,5.82870625,2.7820000045776365,12.73000000793457 +34,4.05392244883946,5.77154125,3.1359999996948242,13.936000022583007 +35,4.031520996774946,5.70855,3.588000005950928,15.310000037231445 +36,3.9380472217287337,5.63284125,4.223999998474121,16.89000000366211 +37,4.008964129856655,5.55018625,5.134000010375977,18.955999996948243 +38,3.9080281938825334,5.4650375,6.213999992675781,21.212000052490236 
+39,3.9426505735942294,5.37652875,7.4720000238037105,23.502000013427736 +40,3.8746714421680997,5.275275,8.935999995117188,25.958000014648437 +41,3.925470062664577,5.16479625,10.686000018310548,28.565999997558595 +42,3.8827988420213972,5.038985,12.961999990234375,31.956000020751954 +43,3.8133669240134105,4.8973225,15.324,35.33399998168945 +44,3.895782709121704,4.735395,18.004000031738283,39.12999999145508 +45,3.840534363474165,4.5540025,20.978000014648437,43.07599996826172 +46,3.8127967289515903,4.3406675,24.086000004882813,47.30399996459961 +47,3.791675567626953,4.10886875,27.554000001220704,51.619999987792966 +48,3.899762204715184,3.8762425,30.89399997314453,55.756 +49,3.8846767289297923,3.6193125,34.578000009765624,59.768000014648436 +50,3.8196579899106706,3.34842375,38.15000000610352,63.612000004882816 +51,3.7278830664498463,3.084740625,41.78199998046875,67.10199993652344 +52,3.827213168144226,2.84354625,45.26399999267578,70.4120000024414 +53,3.8210743835994174,2.6217921875,48.41599998535156,73.43599991210938 +54,3.938135640961783,2.4278509375,51.35800005126953,75.99800005859375 +55,3.804831658090864,2.259463125,53.92400004394531,78.37399994873047 +56,3.7662359986986433,2.108823125,56.36400003417969,80.34200004638672 +57,3.874816451753889,1.9794275,58.644000029296876,82.09400001708984 +58,3.7855028424944197,1.8671025,60.698000024414064,83.56799993896485 +59,3.8578923600060597,1.772823125,62.44000004638672,84.77200006347657 +60,3.776338151523045,1.688523125,64.03600006591797,85.80600006103515 +61,3.821035214832851,1.613735,65.38999995849609,86.73000008544922 +62,3.7199710437229703,1.547733125,66.50600000976563,87.50199997558593 +63,3.8106864350182668,1.49222375,67.56400003173827,88.19799989746093 +64,3.8047622782843455,1.44100609375,68.55600005615234,88.803999921875 +65,3.781612345150539,1.39522640625,69.3880000024414,89.3560000756836 +66,3.7649945531572615,1.351588125,70.10600010253906,89.83200002197266 +67,3.679371884890965,1.3106146875,70.83200012695312,90.23799999267578 +68,3.7694370406014577,1.27566546875,71.43799999511718,90.60799996582031 +69,3.90033187185015,1.247449375,71.96400009765625,90.90999999023437 +70,3.809638108525957,1.22405953125,72.35200017578126,91.14399999023438 +71,3.7749327761786327,1.20222625,72.7679999951172,91.36999999023438 +72,3.8541347810200284,1.18338953125,73.09400004638672,91.56199999023437 +73,3.7692117520741055,1.16706453125,73.45000001953125,91.69999993896485 +74,3.89384697164808,1.15245609375,73.72600004394532,91.83999993896484 +75,3.818265676498413,1.13920421875,73.88999993896485,91.96599999023438 +76,3.808180570602417,1.12641375,74.09400001464844,92.09400006835938 +77,3.7770608493259976,1.11587984375,74.3200000390625,92.20400006835938 +78,3.7761710030691966,1.10574734375,74.42600009033202,92.34000006835937 +79,3.851380297115871,1.096963125,74.5920001171875,92.43800006835937 +80,3.7436849049159457,1.08872375,74.73000016845702,92.52200006835938 +81,3.7914348670414517,1.0825409375,74.7980001171875,92.58200006835938 +82,3.8620016745158603,1.075705,74.9819999609375,92.6440000415039 +83,3.7995089633124217,1.07101390625,75.0600000390625,92.68200004150391 +84,3.820492676326207,1.067599375,75.14399993652344,92.69400004150391 +85,3.9248864139829363,1.06450140625,75.22599998779297,92.74400011962891 +86,3.7503830705370222,1.06045515625,75.2979999609375,92.8420001196289 +87,3.758145349366324,1.05718203125,75.36800006347656,92.87600011962891 +88,3.7711183173315868,1.0540571875,75.46799998535157,92.89000011962891 
+89,3.7123160702841624,1.05168203125,75.51800006347656,92.94000017089844 +90,3.8062860454831804,1.04932546875,75.58600008789062,92.99200017089844 +91,3.8366364240646362,1.047400625,75.58600008789062,93.0200001196289 +92,3.79878500529698,1.0462640625,75.67000008789063,93.04199999023437 +93,3.8638044936316356,1.04370953125,75.70600008789063,93.08600001708984 +94,3.8578367233276367,1.04192125,75.75400013916015,93.12000001708985 +95,3.7778862374169484,1.040408125,75.79199995849609,93.11000001708985 +96,3.797469445637294,1.03935296875,75.79799995849609,93.14600001708985 +97,3.851992062159947,1.0373315625,75.86000013916015,93.20200006835937 +98,3.881648983274187,1.0358784375,75.89800013916016,93.18800006835937 +99,3.7989996331078664,1.03511015625,75.97000006103515,93.19600001708984 +100,3.832390921456473,1.034321875,76.06399993164062,93.20800001708984 +101,3.8957015786852156,1.0339253125,76.08000003417969,93.22400001708985 +102,3.7361060891832625,1.03406046875,76.1099999560547,93.2260001196289 +103,3.8049339226314,1.0340321875,76.11400003417968,93.24200009277344 +104,3.7306436811174666,1.03438734375,76.11799995605469,93.27800009277344 +105,3.7872143302645003,1.03508734375,76.15199990478516,93.27000009277344 +106,3.8432654482977733,1.03528734375,76.20999995605469,93.28200014404297 +107,3.743649329457964,1.03604734375,76.25600008300782,93.27400006591797 +108,3.827305112566267,1.0367990625,76.22400010986328,93.27800006591796 +109,3.873527799333845,1.0386921875,76.22000005859375,93.29600001464844 +110,3.8782293115343367,1.03986390625,76.21600005859375,93.31000001464844 +111,3.7775590079171315,1.041655625,76.17200005859375,93.33200006591797 +112,3.800403901508876,1.04380390625,76.15000000732422,93.34000006591796 +113,3.9012868915285384,1.04627703125,76.1580000341797,93.33400006591796 +114,3.846208725656782,1.0487984375,76.15199990478516,93.34800006591797 +115,3.8475638798304965,1.0521446875,76.17799998291015,93.33400014404297 +116,3.9795446395874023,1.05490609375,76.14799998291015,93.36200006591797 +117,3.8443203142711093,1.0580040625,76.08199990478515,93.3640001171875 +118,3.8250525849206105,1.06173859375,76.05199998291016,93.3480001171875 +119,3.907123395374843,1.0642565625,76.06800000976563,93.33600006591797 +120,3.8155746970857893,1.06640765625,76.09999998291016,93.32000006591797 +121,3.8923403535570418,1.0678690625,76.08800000976562,93.30000006591797 +122,3.843483175550188,1.069791875,76.04999995849609,93.32200009277344 +123,3.855623040880476,1.0719815625,76.02399990722657,93.31200014404297 +124,3.8870831046785628,1.07447265625,76.03199995849609,93.33200014404296 +125,3.8356190749577115,1.076770625,76.02800008789062,93.28600009277343 +126,3.8387703554970876,1.07683890625,76.02400000976563,93.30600001464843 +127,3.9135857139314925,1.0763871875,76.0479998803711,93.31400001464844 +128,3.792390823364258,1.07619203125,76.06800006103515,93.2779999633789 +129,3.850563202585493,1.07544859375,76.04999998291015,93.2720000415039 +130,3.738796762057713,1.07485546875,76.00599998291015,93.24200009277344 +131,3.8326881442751204,1.0729271875,75.95000006103515,93.22200001464844 +132,3.893867118018014,1.07098234375,75.92200008789062,93.21400001464843 +133,3.903687221663339,1.068804375,75.92000000976563,93.2240000415039 +134,3.8506924084254672,1.0672546875,75.85400000976563,93.20200009277343 +135,3.8837316717420305,1.065745,75.8399999584961,93.20600004150391 +136,3.8928286177771434,1.06427671875,75.77599990722656,93.18800004150391 +137,3.981452601296561,1.06258359375,75.71999998535156,93.18200009277344 
+138,3.909976840019226,1.06103390625,75.68400001220704,93.18800014404297 +139,3.9253520454679216,1.059399375,75.62400006347656,93.16000009277344 +140,3.8020264761788503,1.0577296875,75.60999998535156,93.16800009277344 +141,3.922452688217163,1.05619515625,75.60600006347656,93.15600009277344 +142,3.8550586189542497,1.05484890625,75.63400006347656,93.13400014404297 +143,3.8236265863691057,1.053694375,75.5499999609375,93.16800014404296 +144,3.9549256563186646,1.05310125,75.54199993408203,93.16200009277344 +145,3.936390927859715,1.05281296875,75.5200000390625,93.1260000415039 +146,3.8756578990391324,1.052428125,75.53200001220704,93.0940000415039 +147,3.8966123376573836,1.051775,75.49600014160156,93.11800009277344 +148,3.8537066323416576,1.05180328125,75.51200001220703,93.10600009277344 +149,3.905484369822911,1.05247984375,75.47600001220704,93.06800009277343 +150,3.780659999166216,1.0510046875,75.49200014160157,93.1120001953125 +151,3.650853753089905,1.046048125,75.58200006347656,93.1580001953125 +152,3.614588141441345,1.0403984375,75.72000001220704,93.2160001953125 +153,3.5736235891069685,1.03362390625,75.81200006347656,93.29400014404297 +154,3.616161755153111,1.02613765625,75.98200008789063,93.35400009277343 +155,3.5855596917016164,1.01847828125,76.13000008789062,93.43000006591797 +156,3.59866692338671,1.0102871875,76.26600000976562,93.50000006591797 +157,3.5309522322246005,1.00186296875,76.43600008789062,93.60800001464844 +158,3.574401548930577,0.9941953125,76.5759999584961,93.69000006591797 +159,3.51989597933633,0.98644765625,76.71000000976562,93.73400006591797 +160,3.6196948971067155,0.9791965625,76.82399995849609,93.7780001171875 +161,3.605029055050441,0.9721903125,76.94599995849609,93.8140001171875 +162,3.5420662505286082,0.9652475,77.05399995849609,93.8980001171875 +163,3.5894347429275513,0.958748125,77.22999998291016,93.93400006591797 +164,3.5978579180581227,0.95259359375,77.36000003417969,93.99200006591796 +165,3.6617026329040527,0.9469590625,77.46600008544922,94.02400006591797 +166,3.6058098588671004,0.94118453125,77.55200006103516,94.07200006591796 +167,3.58823892048427,0.9359265625,77.6520000341797,94.11400006591796 +168,3.541081871305193,0.930876875,77.73800003417969,94.16400006591797 +169,3.545100109917777,0.926250625,77.81400008544922,94.22399998779296 +170,3.602257422038487,0.92191265625,77.86400008544922,94.2580000390625 +171,3.507282631737845,0.91811125,77.97200000732421,94.2920001171875 +172,3.5846358367374966,0.91407328125,78.04000005859375,94.31400009033203 +173,3.4164858715874806,0.91026359375,78.15199998046874,94.36200014160156 +174,3.5214629684175764,0.9067821875,78.21600010986329,94.41000009033203 +175,3.723686303411211,0.90326078125,78.24600010986327,94.44000009033203 +176,3.567773597581046,0.9000628125,78.34000005859374,94.45200014160156 +177,3.531922306333269,0.8968096875,78.44199992919921,94.47600014160156 +178,3.445431743349348,0.8934965625,78.48400000732421,94.50800014160156 +179,3.4728594166891917,0.89035171875,78.5360000341797,94.55200014160157 +180,3.494410361562456,0.88745859375,78.59400008544922,94.59200001220704 +181,3.5128171103341237,0.88491375,78.65999992919922,94.65800009033204 +182,3.4463326930999756,0.88252890625,78.74200005859375,94.66800009033203 +183,3.429210696901594,0.8797275,78.83600005859375,94.70400014160157 +184,3.430286169052124,0.87714265625,78.88200008544922,94.76800014160156 +185,3.5126789127077376,0.87460609375,78.96800008544922,94.77200014160157 +186,3.5115831749779836,0.8723978125,78.97000008544921,94.79600014160157 
+187,3.4737915652138844,0.8702578125,79.03600000732422,94.81200014160156 +188,3.536697966711862,0.86798125,79.06000008544922,94.82200006347657 +189,3.5705651896340505,0.865568125,79.10999998046876,94.8680000366211 +190,3.5216511828558787,0.86370328125,79.19199998046875,94.89800008789062 +191,3.343783582959856,0.86171015625,79.27199995361327,94.91200008789062 +192,3.306029898779733,0.8600184375,79.31600003173828,94.95000008789063 +193,3.4795777116503035,0.858201875,79.39200003173828,94.96200013916015 +194,3.360287530081613,0.85623359375,79.46000008300781,94.97000013916016 +195,3.383433989116124,0.85435703125,79.5000000048828,95.02400013916015 +196,3.4374169622148787,0.85294046875,79.56400000488281,95.00400013916015 +197,3.403316753251212,0.8514921875,79.58800000488282,95.02000019042968 +198,3.339807084628514,0.84972390625,79.65599997802734,95.05800019042968 +199,3.453535488673619,0.8484521875,79.69200005615234,95.09800013916016 +200,3.419135877064296,0.8466790625,79.75600005615235,95.11400013916015 +201,3.3291687795094083,0.84469078125,79.77400005615235,95.14400008789063 +202,3.3989278929574147,0.84291421875,79.80600005615234,95.17600008789063 +203,3.4345146928514754,0.84117421875,79.91200013427735,95.22200013916016 +204,3.3367595842906406,0.8393425,79.96200013427735,95.24400013916015 +205,3.405371512685503,0.83769421875,80.00800005615234,95.26000013916016 +206,3.2920469726834978,0.83589765625,80.03600018554687,95.29600013916016 +207,3.39451459475926,0.83424109375,80.09400010742188,95.28600008789063 +208,3.3381849186761037,0.83268109375,80.13600013183594,95.3060000366211 +209,3.3369586978639876,0.8314128125,80.16200013183594,95.3240000366211 +210,3.305583425930568,0.82968453125,80.21800005371094,95.3260000366211 +211,3.383475865636553,0.82795625,80.23200005371093,95.3560000366211 +212,3.3413387877600536,0.82657625,80.2960000805664,95.3620000366211 +213,3.3035824469157626,0.82492453125,80.35200013183594,95.37600008789063 +214,3.2925713743482317,0.82351625,80.39800013183594,95.40600008789062 +215,3.2677473170416698,0.82188453125,80.4540000805664,95.4220000366211 +216,3.29270339012146,0.82025625,80.50200013183594,95.4300001147461 +217,3.3200165714536394,0.8187796875,80.54400013183594,95.4600001147461 +218,3.228957329477583,0.81721140625,80.56200013183594,95.4820001147461 +219,3.257698552949088,0.81583140625,80.62400013183594,95.4980000366211 +220,3.324594702039446,0.814423125,80.67800013183594,95.5219999584961 +221,3.2105934960501537,0.8128865625,80.70600005371094,95.5579999584961 +222,3.2700722898755754,0.81163,80.76599997558594,95.56599995849609 +223,3.184366192136492,0.81002171875,80.80000005371093,95.56199995849609 +224,3.2554157972335815,0.80895,80.83400010498048,95.5759999584961 +225,3.2719230311257497,0.80763,80.91000010498047,95.56599995849609 +226,3.1703891924449374,0.8061734375,80.95800010498047,95.58599995849609 +227,3.182521172932216,0.8049934375,81.01800010498047,95.61599995849609 +228,3.215354255267552,0.80380515625,81.05000002685547,95.63400003662109 +229,3.199547358921596,0.80252515625,81.09199994873048,95.65000000976562 +230,3.2207604135785783,0.80131515625,81.17199994873047,95.64200000976562 +231,3.1240655524390086,0.80026515625,81.21600002685547,95.67800000976563 +232,3.257808906691415,0.799116875,81.25400002685546,95.69599993164063 +233,3.1133551938193187,0.79786859375,81.30399997558594,95.70799993164063 +234,3.1397553852626254,0.79670203125,81.33199997558594,95.71799993164062 +235,3.212665115083967,0.79555375,81.36999997558594,95.72199993164062 
+236,3.134836418288095,0.79439890625,81.40200005371094,95.73199993164063 +237,3.1265358924865723,0.79340890625,81.43400005371093,95.74599993164063 +238,3.2117801904678345,0.79225890625,81.4740000024414,95.76199993164063 +239,3.1193034989493236,0.791180625,81.5239999243164,95.77199998291016 +240,3.1610439334596907,0.79030234375,81.56599987304688,95.76399993164063 +241,3.126009328024728,0.78928234375,81.58000005371093,95.78399993164062 +242,3.0744248969214305,0.7885540625,81.6080000024414,95.79999993164063 +243,3.0661239453724454,0.7877840625,81.6440000024414,95.81200000976563 +244,3.02728225503649,0.78692578125,81.67000000244141,95.80199993164062 +245,3.1005813223975047,0.78626578125,81.6900000805664,95.81999993164062 +246,3.105734280177525,0.78565578125,81.70800005371093,95.78999993164062 +247,3.098195024899074,0.7848375,81.71600005371094,95.79399993164063 +248,3.142967564719064,0.7842675,81.72600000244141,95.79799993164063 +249,3.0174715518951416,0.7835175,81.79399997558593,95.81999993164062 +250,3.0780578000204906,0.78284921875,81.84599997558594,95.83799993164062 +251,3.147279364722116,0.7824209375,81.87599997558594,95.82799993164062 +252,3.034480486597334,0.78215265625,81.90000002685547,95.83799993164062 +253,3.027377588408334,0.781624375,81.91000002685547,95.85399993164063 +254,3.063859905515398,0.7815178125,81.93600002685547,95.84799993164063 +255,3.042998433113098,0.78106953125,81.97199997558593,95.86599998291015 +256,3.0484637873513356,0.78063296875,81.99199997558594,95.90400003417969 +257,3.0794162239347185,0.7802546875,82.00999997558594,95.9200000341797 +258,3.045973709651402,0.78008984375,82.0399999243164,95.9260000341797 +259,2.928838150841849,0.77996984375,82.08999997558594,95.91800003417968 +260,2.9149784360613142,0.7799015625,82.07999997558593,95.93000003417968 +261,2.9356118951525008,0.7798015625,82.1379999243164,95.92800003417969 +262,3.0385569674628123,0.7799215625,82.1639999243164,95.9260000341797 +263,2.981449672154018,0.77984984375,82.19600000244141,95.93599995605469 +264,2.9820726939610074,0.779828125,82.20799997558593,95.94399995605468 +265,2.8757408687046597,0.7799146875,82.21999997558594,95.96799995605468 +266,3.0372165271214078,0.7797746875,82.22999997558594,95.95600003417968 +267,2.8532401663916453,0.77982640625,82.25199992431641,95.96800003417968 +268,2.8727880716323853,0.779848125,82.27599992431641,95.98200003417969 +269,2.9443894113813127,0.780148125,82.28199992431641,96.00000003417969 +270,2.9520383221762523,0.780188125,82.31200000244141,95.99200003417968 +271,2.9796100854873657,0.780598125,82.29799995117187,95.98600003417968 +272,2.91614978654044,0.780838125,82.34400002929688,95.98800003417969 +273,2.9470558166503906,0.78107984375,82.31199997802734,95.98200003417969 +274,2.9149969816207886,0.781378125,82.33599997802735,95.98800003417969 +275,2.8980303491864885,0.781628125,82.33400002929687,95.99400003417969 +276,2.8834817920412337,0.78223640625,82.3740000805664,95.99799995605468 +277,2.893464684486389,0.78245640625,82.42600008056641,95.99799995605468 +278,2.866437315940857,0.78304296875,82.4220000805664,96.00600000732422 +279,2.9361328056880405,0.78362125,82.4180000024414,95.99600005859375 +280,2.8688106536865234,0.7841978125,82.40600005371094,95.99600000732421 +281,2.8958534002304077,0.784674375,82.41600000244141,95.99000000732421 +282,2.9111480712890625,0.78514265625,82.4360000024414,96.00000000732422 +283,2.8539947952542986,0.7859809375,82.42000005371094,95.98400000732421 +284,2.901869671685355,0.78666921875,82.40999997558593,95.96399995605469 
+285,2.789713059152876,0.78732578125,82.4179999243164,95.94800000732423 +286,2.8766787222453525,0.7881540625,82.43599997558594,95.93400000732422 +287,2.9195314816066196,0.78879234375,82.45199997558593,95.91799995605469 +288,2.8812105655670166,0.78950890625,82.43399997558593,95.91400000732422 +289,2.846291661262512,0.79023890625,82.43599997558594,95.90600000732422 +290,2.831061363220215,0.7908171875,82.42200002685547,95.90200000732422 +291,2.8955056497028897,0.79149375,82.43400002685547,95.89600000732422 +292,2.8761453798839023,0.79212203125,82.42600002685546,95.89400000732422 +293,2.8942482812064037,0.7927603125,82.39200002685547,95.88200000732422 +294,2.8691021544592723,0.793396875,82.39000010498047,95.87200000732422 +295,2.7985400472368513,0.79409515625,82.37400015625,95.85000000732421 +296,2.83943886416299,0.7948034375,82.372000234375,95.84200000732422 +297,2.764699237687247,0.79532171875,82.388000234375,95.83800000732423 +298,2.783741457121713,0.79605828125,82.408000234375,95.84000000732422 +299,2.8964389903204784,0.7964965625,82.414000234375,95.83600000732422 +300,2.8764584915978566,0.79715484375,82.418000234375,95.82600005859375 +301,2.8418020009994507,0.79767140625,82.410000234375,95.83000005859375 +302,2.878923177719116,0.79832140625,82.412000234375,95.82400005859375 +303,2.882906266621181,0.79883140625,82.39600015625,95.81200005859375 +304,2.912467394556318,0.7992396875,82.40000015625,95.80600005859375 +305,2.8378712109157016,0.79983625,82.39400020751953,95.81200005859375 +306,2.797353148460388,0.80029625,82.41200020751953,95.80800000732422 +307,2.8914946487971713,0.80090453125,82.44800012939453,95.80200000732422 +308,2.7958712577819824,0.80121453125,82.43600012939453,95.81000000732422 +309,2.9499638080596924,0.80185453125,82.44200012939453,95.81200000732422 diff --git a/CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml b/CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml new file mode 100644 index 0000000..b65af94 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: true +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: 0.95 +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 100 +eval_metric: top1 +experiment: e100-aug0-w60-minlr1e6-wrlr1e9-initRdm-bias-lr3e2 +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.03 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-06 +mixup: 0.1 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: resnet50 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/res50-epoch- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 
+recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.0 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-09 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml b/CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml new file mode 100644 index 0000000..81258c2 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: true +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: 0.95 +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 200 +eval_metric: top1 +experiment: e200-aug0-w60-minlr1e4-wrlr1e9-initRdm-bias +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 0.0001 +mixup: 0.1 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: resnet50 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/res50-epoch- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.0 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-09 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml b/CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml new file mode 100644 index 0000000..2c5fcf3 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml @@ -0,0 +1,112 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: true +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: 0.95 +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: res50-aug0-retrain +gp: null +hflip: 0.5 
+img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.1 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: resnet50 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/res50-epoch- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.0 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: bicubic +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-06 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv b/CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv new file mode 100644 index 0000000..ecb81ca --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv @@ -0,0 +1,111 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7045409509113857,6.9416,0.064,0.418 +1,0.058665430905031304,6.89746625,0.3079999999332428,1.2019999998664856 +2,0.007796582133908357,6.2966525,1.7179999993896484,5.899999989013672 +3,0.007212148014722126,5.116435,8.078000043945313,21.984000035400392 +4,0.006597742538100907,4.30874625,16.604000009765624,37.32800003540039 +5,0.006309278409129807,3.7494875,24.503999986572264,48.297999992675784 +6,0.00587210977183921,3.23308,31.903999926757812,57.66999989746094 +7,0.005444032173337681,2.87593875,38.16399994140625,63.99200002685547 +8,0.0054282506462186575,2.59584875,43.517999924316406,69.46200001464844 +9,0.005179691860186202,2.359841875,47.206000029296874,72.58200003417969 +10,0.004889545729383826,2.1719675,50.609999997558596,75.55400000244141 +11,0.00470197234036667,2.1567584375,51.69199992919922,76.44600010253906 +12,0.004586202425083944,1.98930375,54.606000112304685,78.83600004394532 +13,0.004271666053682566,1.8706825,56.328000031738284,80.30800010009766 +14,0.004447908040934375,1.806950625,58.472000075683596,81.5399999633789 +15,0.0041762767692229575,1.7647315625,58.741999968261716,82.09000006103516 +16,0.004471837143812861,1.708065625,60.30200004394531,82.98200011230469 +17,0.004270398956058281,1.67571921875,61.048000041503904,83.32200005859374 +18,0.004100026030625615,1.65201375,61.26000004150391,83.77400021972656 +19,0.0041242205105455855,1.63376078125,61.504000068359375,84.07800001220703 +20,0.004059118734273527,1.67590984375,60.91800009765625,83.5019999584961 +21,0.0041561292850279385,1.63649734375,61.82800004882812,84.22399995361329 +22,0.004249815163867814,1.5946559375,62.68000001220703,84.70800006347656 +23,0.0039470667751239875,1.64520578125,61.93799990234375,84.07400013427734 +24,0.003988273092545569,1.671076875,61.05600004394531,83.42199993164063 +25,0.004096939311628895,1.7034496875,61.12399993652344,83.56399995605469 
+26,0.004087086118358586,1.60285265625,62.73200006347656,84.75999995605468 +27,0.00399751916328179,1.61492046875,62.43800003662109,84.32400010742188 +28,0.003949649166315794,1.701069375,60.77399994628906,83.2460001147461 +29,0.004051400797574648,1.6202353125,62.64599990722656,84.67000006103515 +30,0.004139024115699742,1.6344540625,62.20200006591797,84.12200026855469 +31,0.003921386137205575,1.62690984375,62.05000011474609,84.17200008544921 +32,0.00411509963617261,1.68366421875,61.46400011474609,83.86600005859376 +33,0.003911659786743777,1.67565765625,60.84800007324219,83.32999993408202 +34,0.00395727701418634,1.62554953125,62.0080000390625,84.16199998291016 +35,0.004033969731868378,1.71603296875,60.70599999267578,83.0460000390625 +36,0.004010531336202153,1.6436690625,62.05400001953125,84.18800013916015 +37,0.0039575622982478565,1.67731390625,61.35800016845703,83.65600013671875 +38,0.0039316649615232435,1.61552953125,62.22400010986328,84.39000005126952 +39,0.003873389430477151,1.63947921875,61.81200003662109,84.1440000366211 +40,0.004065845494291612,1.653141875,61.8460001147461,83.82200008789063 +41,0.004109910373309893,1.714169375,60.308000017089846,83.24199985595703 +42,0.003946930452782128,1.94490875,56.48200006103516,79.57000004638672 +43,0.0041138056798705035,1.6267740625,61.803999931640625,84.29799997802735 +44,0.004048073315061629,1.62808609375,62.09799998291015,84.28800000976563 +45,0.0039734537546922055,1.784985625,59.12400004882812,82.1780000390625 +46,0.0038987650768831372,1.713120625,60.78800010498047,83.26599994628906 +47,0.0040997504090358105,1.88673,57.57800005615234,80.4180000415039 +48,0.003935285162047616,1.6685634375,61.34400001220703,83.64799995605469 +49,0.004107319034769067,1.7783765625,59.22000000244141,82.05199999023438 +50,0.00387493397907487,1.6779953125,61.276,83.92200001464843 +51,0.004015801890221026,1.847471875,58.37599998046875,81.37399998779297 +52,0.003935897473378905,1.859410625,58.18199997802734,81.15000001708984 +53,0.004190738429315388,1.821818125,58.34600005615234,81.56200009277343 +54,0.004043174558319151,1.823231875,58.122000075683594,81.2140000390625 +55,0.004158310043359441,1.86400625,57.84399987792969,81.45800022460938 +56,0.003960915591700801,1.7923175,58.804000024414066,81.96200001220703 +57,0.004142970977617162,1.7743928125,59.36600004394531,82.41600017333984 +58,0.004029840646710779,1.7658021875,59.30400007080078,82.15200016845704 +59,0.004218896684635963,1.88195375,56.881999975585934,80.56000011474609 +60,0.0036925061971747448,1.3517940625,67.70000002197266,88.15399987304687 +61,0.0035992927150800824,1.34404765625,68.08800004882812,88.23600020751954 +62,0.003520481986925006,1.283674375,69.1300000805664,88.94400007568359 +63,0.003616590718073504,1.3082865625,68.802000078125,88.65599994384766 +64,0.0036838793894276023,1.27181484375,69.44200001953125,89.27800005126953 +65,0.003572586092299649,1.29942640625,69.78399999267579,89.41400007324219 +66,0.0036129531716661794,1.2370415625,70.27599992431641,89.516 +67,0.0032376082381233573,1.2114928125,70.86000002197265,90.03600010009765 +68,0.0035054978714989765,1.224236875,70.44400004394531,89.89400004394531 +69,0.0034192517466310945,1.23175109375,70.51399994628906,89.67800012451171 +70,0.00328368427498,1.19048328125,71.48400014648438,90.30600015136719 +71,0.00327613196402256,1.16209390625,71.9240000366211,90.69200007080079 +72,0.0030484608806935804,1.16013578125,71.9080000390625,90.63800004394531 +73,0.0034537422138133217,1.1457075,72.4540000390625,90.89400011962891 
+74,0.003460384572723082,1.13635015625,72.41000006835938,90.91400004638672 +75,0.0033204310374068363,1.12647875,72.77400001464844,91.23800009521484 +76,0.0032639388061527696,1.113355625,72.89800006347656,91.27400009521484 +77,0.0032552302914804648,1.1143825,72.92800009033203,91.40200001708985 +78,0.003150941720897598,1.0993584375,73.49799998779297,91.53000014892578 +79,0.0031130987585389186,1.0650625,74.15800011474609,92.03200009521484 +80,0.0032726521603763103,1.0721525,74.11400006591796,91.98600006591796 +81,0.00320629304873624,1.0649465625,74.26599995605469,92.1619999633789 +82,0.0029540062449606402,1.0372840625,74.79800008789063,92.30600001708984 +83,0.003026906833318727,1.0280375,75.05400014160156,92.53800022460938 +84,0.0029979831805186613,1.017864375,75.4720000366211,92.63999999267578 +85,0.00299135923186051,0.99109765625,75.92600000732422,92.9679999633789 +86,0.003011097732399191,0.99155703125,75.93799998291016,92.82800001464844 +87,0.003033405419306031,0.970643125,76.38000008789062,93.0640001171875 +88,0.0028323159287018435,0.9561534375,76.69000000976563,93.1739999633789 +89,0.0030302958163831916,0.9529859375,76.86200008544922,93.20600006591796 +90,0.0030514331634289454,0.9512065625,77.03400000976562,93.26600009033203 +91,0.002754983675133969,0.9374346875,77.17200000488282,93.41999996337891 +92,0.002925087830850056,0.92438484375,77.47400018554687,93.53399991210938 +93,0.002743347780779004,0.9260734375,77.55600011230469,93.63999993652344 +94,0.0028534684097394347,0.95646546875,77.48199992675781,93.63399998779298 +95,0.0028282569421987447,0.91486703125,77.77999995361328,93.66999996337891 +96,0.0026793425869462745,0.90815390625,77.85000003173828,93.7900001171875 +97,0.002686592417636088,0.909225625,78.00000013427734,93.78399993652344 +98,0.002937979913050575,0.90744421875,77.98200003173828,93.79199998779296 +99,0.002853604283050767,0.90461453125,78.05800000488281,93.84199993652344 +100,0.002864615060389042,0.9053496875,78.0300000830078,93.79799998779296 +101,0.002886664870727275,0.9070628125,78.00999995361327,93.78799993652343 +102,0.002906581253877708,0.91363046875,77.93799995361329,93.71799998779296 +103,0.0030246374164042728,0.90368484375,78.14200010986328,93.83399998779296 +104,0.0028219220860462102,0.906053125,78.03600005615235,93.78400006591797 +105,0.002867467302296843,0.90486140625,78.06800013427734,93.80799998779297 +106,0.002776414771298213,0.90622484375,78.1760000830078,93.84400001464844 +107,0.0027404509518029435,0.90221796875,78.09400000488282,93.82399998779297 +108,0.002886704235736813,0.90330140625,78.10999998046876,93.80999993652344 +109,0.0028225835911663516,0.9019365625,78.07000000488281,93.81399993652344 diff --git a/CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv b/CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv new file mode 100644 index 0000000..1da7189 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv @@ -0,0 +1,211 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7045409551688603,6.9416,0.064,0.418 +1,0.05861631261983088,6.88152125,0.3300000003051758,1.4160000054168702 +2,0.007787905666711075,6.29743625,1.7679999932861328,6.330000021362305 +3,0.007318386575207114,5.39208375,6.4000000115966795,18.032000087890626 +4,0.006826370256021619,4.711741875,12.58400001525879,30.035999998779296 +5,0.006564456370792219,4.215930625,18.40000004638672,39.540000072021485 +6,0.006210722494870424,3.7064575,25.034000032958986,48.896000021972654 +7,0.005833921089236226,3.446033125,29.666000029296875,54.72999997558594 
+8,0.005793575064412185,3.0451746875,35.78199996948242,61.63399999023437 +9,0.005482399137690663,2.92318375,37.8639999609375,62.89000001953125 +10,0.005182175398139017,2.4888303125,44.8599998828125,70.50999987060547 +11,0.004980468157944935,2.3753028125,47.31599997314453,72.85000001708984 +12,0.004848868419815387,2.1763934375,50.68400004394531,75.72199998046875 +13,0.004497935130660023,2.02472625,53.710000114746094,78.10400005126954 +14,0.004636649873905948,1.9788840625,54.92200000244141,78.86800007324219 +15,0.004339669598266482,1.876849375,56.504000158691404,80.39399998779297 +16,0.0046054747654125094,1.8213928125,57.803999970703124,81.25400013671874 +17,0.00438071893794196,1.7728221875,59.07799999511719,82.16399990478516 +18,0.004191712393159313,1.7204253125,59.88599999267578,82.56599991210938 +19,0.004200312824520681,1.6947209375,60.43600009277344,83.2559999609375 +20,0.0041146951095600215,1.64887125,61.5360000390625,83.89399993408203 +21,0.004193632979877293,1.6154840625,62.46800001953125,84.26399980224609 +22,0.0042644668470269865,1.620426875,62.13600014404297,84.38800008300781 +23,0.003946975472250155,1.58272265625,63.15000013427734,84.93799995361329 +24,0.003950901828440172,1.62863140625,62.05200009765625,83.91600006347656 +25,0.004059900596205678,1.64938625,62.29600000732422,84.31999992919921 +26,0.004035455657036177,1.580548125,63.11400011474609,84.89199998291015 +27,0.003935754764825106,1.5290325,64.00800005615234,85.40400005371093 +28,0.0038723718441490617,1.5349790625,63.911999926757815,85.51600012939453 +29,0.003965828401435699,1.6118196875,63.035999931640625,84.76599995605469 +30,0.004045715455764106,1.5477725,63.98800006103516,85.71800008056641 +31,0.003791363294502454,1.50541140625,64.66399998291016,85.84799989990235 +32,0.003994816814416221,1.872206875,59.20399997314453,81.1340001953125 +33,0.003794607290598963,1.4659615625,65.19200009277344,86.57399998046876 +34,0.0038328385313174556,1.4683784375,65.23600006103516,86.53399998291016 +35,0.0038947787834331393,1.5209565625,64.42000000732422,85.83799984863282 +36,0.003859427624515125,1.47193203125,65.54000002685547,86.57600013427735 +37,0.003790942220283406,1.5083471875,64.5499999267578,86.07199995361329 +38,0.0037716488753046307,1.50174578125,64.79599992675782,86.10999997558594 +39,0.003689929044672421,1.510310625,64.64600000488281,86.02999997802735 +40,0.0038831187578450355,1.4518178125,65.59200008544921,86.8760000024414 +41,0.00391619885340333,1.5053915625,64.53599995117187,86.06000013183593 +42,0.003753877860227866,1.4715946875,65.40400002685547,86.4160000805664 +43,0.003907359188555607,1.5655059375,63.231999907226566,85.17599997802735 +44,0.003843014344706067,1.48220671875,65.15600012695313,86.44200002685547 +45,0.003759625999789153,1.49491625,64.71200010986328,86.38999997802735 +46,0.0036789896964494672,1.48738609375,65.48199987548828,86.51200010742187 +47,0.003882888887476708,1.479825,65.29599989990234,86.59600005126953 +48,0.0036989124824426006,1.4772675,65.192000078125,86.40999995117187 +49,0.0038785873080736826,1.504194375,64.83399998535157,85.74599995117188 +50,0.0036308450757392813,1.4936209375,65.20000003662109,86.48599998291016 +51,0.003771561148044254,1.488496875,65.1560000366211,86.57800005371094 +52,0.0036744583963549565,1.5306975,64.27400009521484,85.84400010986329 +53,0.0039386395863922575,1.505415,65.27399990234375,86.40400000488282 +54,0.003786725890157478,1.5121053125,64.41799998779297,86.02000000976562 +55,0.0038960346885557684,1.584905,63.384000056152345,85.33400023681641 
+56,0.003697914753242263,1.4781703125,65.27399997802735,86.6660000805664 +57,0.0038609286670440008,1.48826421875,65.09000000976563,86.31200005615234 +58,0.00374445816435452,1.4656303125,65.4440000341797,86.49399990234375 +59,0.003939676181679326,1.520185625,64.42199998779297,85.91200008544922 +60,0.0036717986554971765,1.4740965625,65.33400005615235,86.43400010498047 +61,0.0036946918782112853,1.4400040625,66.137999921875,86.8360000756836 +62,0.003649507200212351,1.3948309375,66.91000002929688,87.45000002929687 +63,0.003765647050126323,1.419415,66.84600002441407,87.29800000244141 +64,0.00385747043349381,1.38540703125,67.19199996826171,87.59400010009766 +65,0.003746751995225038,1.4373690625,66.85599995117188,87.50800013183594 +66,0.0038159869810832398,1.35951734375,67.73400008300781,87.92200005371093 +67,0.0034571332590920584,1.37397765625,67.47000008300782,87.77199995117188 +68,0.003730148426257074,1.4091596875,66.44200002685547,87.34000018310547 +69,0.003659855990138437,1.3601834375,67.68800010498047,87.981999921875 +70,0.0035387545524697217,1.35465234375,68.02799999511718,88.44 +71,0.003558939788490534,1.4536840625,65.95800008544921,86.66000021240234 +72,0.0033511826103287084,1.39861125,67.14400007324218,87.55200005126953 +73,0.00376773886715195,1.3392646875,68.31200001953125,88.25800012939453 +74,0.0037749758422640817,1.3295978125,68.16599992431641,88.37799981933594 +75,0.0036681361629494597,1.40448625,66.6980000366211,87.508 +76,0.0036327216907271315,1.34860953125,68.05599997070313,88.05400010253906 +77,0.0036319279045398745,1.3575459375,67.70800001953126,88.14200004638671 +78,0.0035495711656819496,1.36476296875,68.11400007080078,88.26000025390626 +79,0.0035365247999184896,1.31430375,68.72399997558594,88.56599997558594 +80,0.003713787134204592,1.3273484375,68.40599997070312,88.72199999511719 +81,0.0036787415821371333,1.30792578125,68.79600008300781,88.896 +82,0.003430017004055636,1.31487703125,68.60600012207031,88.50999997070312 +83,0.0035406785318627954,1.33942859375,68.50599997314453,88.37800015625 +84,0.0035228457834039417,1.33490109375,68.78400005126953,88.79600002685547 +85,0.0035429883615246843,1.33512765625,68.27599989013672,88.48199999755859 +86,0.0035690352420455645,1.2936003125,69.10999991943359,88.992 +87,0.0036141484244061367,1.30547953125,68.96799993896484,88.92200022705079 +88,0.003423843566062195,1.30081328125,68.81600005126953,88.84600005126953 +89,0.003634033741296402,1.3100946875,68.73999997070312,88.90600017822265 +90,0.003681087350871946,1.3103096875,69.52200002685547,88.89200004882812 +91,0.0033962452351780875,1.2537275,70.15800012207032,89.61000002197265 +92,0.0035749949581388918,1.25576875,69.95000001708985,89.52200002197266 +93,0.003407541712346886,1.26281765625,70.04200010009765,89.61800015136718 +94,0.0035377658371414456,1.33862625,69.27399997070313,88.9799999194336 +95,0.003494631831667253,1.2840921875,69.66400005126953,89.08999997314453 +96,0.0033580272824370433,1.243096875,70.43400004394532,89.8300000756836 +97,0.003372354432940483,1.40749484375,66.95800008789062,87.59000015625 +98,0.0036157858557999134,1.25091953125,70.18200007568359,89.58599994384765 +99,0.003527536356289472,1.21663265625,70.91600010009766,89.88200017822265 +100,0.0035386210906186272,1.23658109375,70.51199994384766,89.73999997314453 +101,0.003551432181016675,1.23664953125,70.65799999755859,89.79200008056641 +102,0.0035552934610417913,1.245574375,70.66399996582031,89.824000078125 +103,0.0036766345479658674,1.2083915625,70.92600004882813,90.03399994628906 
+104,0.0034643861831032802,1.24649953125,70.41199998779297,89.57199996582031 +105,0.003494899670061256,1.24356671875,70.44800020263672,89.80400010253906 +106,0.0033947998890653253,1.2432778125,70.20399994140625,89.6799999975586 +107,0.0033498970858220544,1.26618046875,69.61600013183593,89.30800002441406 +108,0.0034940940760342138,1.20514609375,71.09400004394531,90.22199994384765 +109,0.003417208025764142,1.185001875,71.54799999267578,90.50400009765625 +110,0.0032839904306456447,1.211754375,71.20400001708984,90.03000002441406 +111,0.003404544184117445,1.192296875,71.55799993652344,90.51399999267578 +112,0.0032217274752578567,1.216126875,70.7519999609375,90.08199999267578 +113,0.0033759328237335596,1.2004621875,71.39199996826171,90.21800022949219 +114,0.003175128933175334,1.18758890625,71.57000001708984,90.21599994384766 +115,0.0032200828760064076,1.1713734375,71.9300001147461,90.56200012207032 +116,0.0032189975026994944,1.1896378125,72.10400007324219,90.7679999951172 +117,0.0035298727120139767,1.17690640625,71.79800009277344,90.63999996582031 +118,0.0032351285418761627,1.1448571875,72.57400001220704,90.86400012451172 +119,0.0032762797116967185,1.139448125,72.48800014648438,91.06800007080078 +120,0.0032881099863776137,1.16669484375,72.26800007324219,90.61600004638672 +121,0.0034034981758200695,1.160696875,72.14400001464844,90.59199999511719 +122,0.0033585052172254238,1.14411265625,72.40999996582032,90.99000009521484 +123,0.003353612048418394,1.13490546875,72.60200006835937,91.03800004394532 +124,0.003222887670355184,1.1164625,73.02000006347656,91.31800004638671 +125,0.0033358727482014467,1.17029984375,71.98200007324219,90.94200007080079 +126,0.003147848233181451,1.11883234375,72.99600007324219,91.11600004882813 +127,0.00330801319796592,1.1164684375,73.37800008789063,91.31800006835938 +128,0.003165309433825314,1.13151640625,72.79200010009765,91.10200004638672 +129,0.003169606639338391,1.10438015625,73.2600000390625,91.41600007324219 +130,0.003111145625423108,1.1232259375,73.14799991210937,91.18200017333984 +131,0.003257711268296199,1.12542625,73.11000020019532,91.16200017333985 +132,0.0032982677720221026,1.10749890625,73.3720001171875,91.4680000732422 +133,0.003284811640956572,1.08792484375,73.93399993652343,91.64800014892577 +134,0.003277899364807776,1.0648290625,74.2020000390625,92.00999996582031 +135,0.0031747023708053996,1.0842659375,73.8239999609375,91.83600006835937 +136,0.0031947052172784296,1.124199375,73.76800000976563,91.64800001708984 +137,0.003041988188800003,1.0820375,73.96600001708984,91.67200004638671 +138,0.003274818416684866,1.0491440625,74.68800008544922,92.05400009277344 +139,0.0032263360252337797,1.06187390625,74.43999998291015,92.05400014648437 +140,0.003107036247716418,1.06291078125,74.50599998779298,91.89599993896485 +141,0.0031503743957728148,1.0556015625,74.59600016357422,92.1040001196289 +142,0.003153187089732715,1.05592671875,74.43599993408203,91.99400007080078 +143,0.0030785591141985996,1.0333478125,75.0699999584961,92.36800004394532 +144,0.0030362975916692187,1.039278125,74.78800008789062,92.23000006835937 +145,0.0029685184958257844,1.01897828125,75.26999998535156,92.51400012207031 +146,0.0030120556142979433,1.01427953125,75.45599995605468,92.49000001953125 +147,0.0029117654942508253,1.024656875,75.39200024414062,92.52600006835938 +148,0.003095526248216629,0.99868203125,75.65999999023437,92.81400007080079 +149,0.002969694423622319,1.023748125,75.40800006591797,92.5580001196289 +150,0.0030562643826540026,1.00439859375,75.65000000732422,92.63400011962891 
+151,0.003044905440349664,1.0112996875,75.57199993408203,92.63200009521485 +152,0.0029704225016757846,0.99755734375,75.90399993408204,92.74999998779298 +153,0.002950280306062528,1.00189,75.8440000366211,92.81200004150391 +154,0.003015570342540741,0.97426734375,76.23200000732422,92.91200009521485 +155,0.002881033279533897,0.97787875,76.3000000366211,93.01800009521484 +156,0.0029676160608817425,0.997861875,75.86400006103516,92.86600011962891 +157,0.002893779011044119,0.97974515625,76.3899999609375,93.02000014648438 +158,0.0027411910206345575,0.9752015625,76.4300000390625,93.0559999658203 +159,0.0030133193525086555,0.98281015625,76.63000009033203,93.06800014404297 +160,0.0027467000597555724,0.9500196875,76.94399998291016,93.4259999609375 +161,0.002747313435455518,0.97383375,76.54000011230468,93.1820000390625 +162,0.002890611666121653,0.96070921875,76.83600008056641,93.23000014404298 +163,0.002992227241130812,0.9530984375,77.13000003417969,93.44800001220703 +164,0.0028335172184077756,0.9447990625,77.23600008544922,93.41000006591797 +165,0.0027590213243716528,0.95350171875,77.05399995849609,93.4360001147461 +166,0.002805237242552851,0.9368121875,77.47000003417969,93.55600001464843 +167,0.003104101467345442,0.9339853125,77.32400000976563,93.60000006835938 +168,0.0028203485999256372,0.930644375,77.66000008544921,93.59800009033204 +169,0.002985484631998198,0.93263171875,77.48199998291015,93.58799998779297 +170,0.0026641425377290162,0.92875015625,77.77600008544921,93.7500000390625 +171,0.0026267553164091495,0.9258865625,77.95000000488281,93.7359999633789 +172,0.002781675280337887,0.91648203125,77.94000010986328,93.82800001220703 +173,0.0028434929637504475,0.908438125,78.14000002929687,93.97399990966797 +174,0.0027169642936704414,0.90687953125,78.13000010986327,93.9280001147461 +175,0.0026101735087909867,0.9099146875,78.14600000732422,93.9720001171875 +176,0.0026994317138035384,0.90558171875,78.24800000732422,94.04800003662109 +177,0.0027551356802827547,0.9109775,78.37600010986328,94.00599998779298 +178,0.0025562551704102327,0.8944078125,78.60200008544922,94.0939999609375 +179,0.002841701381839812,0.8946471875,78.53800000732421,94.09000006347657 +180,0.0027144267556390594,0.89255328125,78.61000005859376,94.11800009033203 +181,0.0025879032078332137,0.88817375,78.77999995849609,94.24800006835937 +182,0.0025061716358842595,0.87781390625,78.87999997802734,94.26600009033203 +183,0.0027128129066633327,0.88093078125,78.88400003417969,94.29800001220703 +184,0.0026006640899660332,0.87767390625,78.98200013427734,94.2580000366211 +185,0.002634142176248133,0.870630625,79.04800016113282,94.4480000366211 +186,0.002722469574239637,0.8782084375,79.07400005859375,94.34199998535156 +187,0.0027721369572515997,0.881635625,78.97800003417969,94.24000013916016 +188,0.0025335990191836444,0.87690703125,79.06000005615235,94.3679999609375 +189,0.0024866706392328653,0.87624640625,79.25999990234375,94.3780001171875 +190,0.002724298608622381,0.86884984375,79.15400000732421,94.36600014160156 +191,0.0026475561815979226,0.873798125,79.23800013427734,94.39600006347656 +192,0.002496325452479401,0.86303046875,79.34600003173828,94.4899999609375 +193,0.0025580572463305934,0.861071953125,79.37200005615234,94.5380000366211 +194,0.0026442011751766714,0.87250765625,79.26800010742187,94.39000008789063 +195,0.002566711910601173,0.86696109375,79.29600008300781,94.39599998535157 +196,0.002543845430149564,0.864356875,79.45799995361328,94.45399998535156 +197,0.0026796250770400676,0.869255625,79.30799997802734,94.41199998779297 
+198,0.0025170722676973257,0.86292703125,79.37000005615235,94.43599990966797 +199,0.0025656953387494597,0.861311875,79.44200008300781,94.45999990966797 +200,0.0026176332030445337,0.8598421875,79.50999995361327,94.49400001220702 +201,0.0025168933124015374,0.85830453125,79.6120000024414,94.46600001220703 +202,0.002508296282030642,0.86487796875,79.5180002368164,94.44999998535157 +203,0.0024724971070619567,0.86042703125,79.44800000488281,94.47799998535156 +204,0.002350900338829628,0.87450015625,79.59199995117187,94.55400003662109 +205,0.0025996306545234154,0.86581359375,79.53600005859376,94.4879999609375 +206,0.0024244988869343486,0.8577853125,79.56600018798828,94.52199998535156 +207,0.0025348346680402756,0.85924640625,79.54200005859374,94.53600001220703 +208,0.002649968007712492,0.86089328125,79.55800011230468,94.4879998828125 +209,0.002461412771871047,0.85889765625,79.65000003417968,94.51399993408204 diff --git a/CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv b/CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv new file mode 100644 index 0000000..07953cb --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.5879408035959516,6.93980125,0.066,0.406 +1,0.043263234331139495,6.88107375,0.284,1.3320000038146973 +2,0.00782376863727612,6.48283375,1.3799999908447265,4.72400003540039 +3,0.0074164533455457005,5.53255375,5.5160000164794925,15.882000014648437 +4,0.006907936850828784,4.86912625,10.956000001831054,26.94600002441406 +5,0.0066305674346429965,4.3362075,16.86999999267578,37.32200001953125 +6,0.006266916303762368,3.833835,23.227999986572264,46.63400005126953 +7,0.005900568095967174,3.424026875,29.399999946289064,54.26000005371094 +8,0.005858304411438959,3.161556875,33.99200005981445,59.326000024414064 +9,0.005546203548354762,2.7578753125,40.40000000854492,66.20200010742188 +10,0.00525083123440189,2.556459375,43.81000014160156,69.40399999023437 +11,0.005052656168118119,2.41526875,46.781999951171876,72.16199999023438 +12,0.004906764154189399,2.200563125,50.682000043945315,75.4040000024414 +13,0.00454613400091018,2.1156203125,51.83600006103516,76.44200002685547 +14,0.004688152045543704,2.03107625,53.548000114746095,77.93799999511718 +15,0.004370159785529333,1.9354171875,55.46199996826172,79.38000001708984 +16,0.004650838440284133,1.88189921875,56.684000048828125,80.42000008789063 +17,0.0044214187655597925,1.80287,58.39000005126953,81.53800007080078 +18,0.004206506818133805,1.7846809375,58.36399997558594,81.52400004638672 +19,0.004208372756173568,1.7458278125,59.268000092773434,82.38799999023438 +20,0.0041131798976234025,1.67181078125,60.89600001708985,83.4620000390625 +21,0.004204317411806967,1.6355546875,61.79599997070312,84.05200006347657 +22,0.004256514399977667,1.6303496875,61.71199993652344,83.87599995605468 +23,0.0039758168693099704,1.60533265625,62.700000061035155,84.75600000732422 +24,0.003976539592258632,1.6761265625,61.06800004394531,83.21199998291016 +25,0.0040600252936461145,1.589869375,63.34200011474609,85.14399998046875 +26,0.004057688239429679,1.5644690625,63.53000001220703,85.30600000488282 +27,0.003951762303976076,1.6300059375,61.744000146484375,84.00400013671874 +28,0.003902679558710328,1.5681228125,63.68600006835938,85.09000015869141 +29,0.0039712132420390844,1.5614875,63.81000000732422,85.46199989990234 +30,0.004059286156137075,1.6156275,62.969999956054686,84.69800002685547 +31,0.0037791924218514134,1.55970734375,63.461999990234375,85.07399998046876 
+32,0.00400861116525318,1.528990625,65.12599995605468,86.2820000805664 +33,0.003777677742099123,1.5097675,64.3220000390625,85.72199998779297 +34,0.003831957051131342,1.49948203125,64.96999995849609,85.94799997558594 +35,0.003889977266745908,1.4704921875,65.59200006347656,86.37600003173829 +36,0.0038687303396207945,1.505684375,64.81400005859375,85.99400013671875 +37,0.003778126955564533,1.509783125,65.04800008544922,85.91000012939453 +38,0.0037710025374378476,1.514193125,64.50600008544922,85.78200002685547 +39,0.0037076565703111036,1.56284796875,63.95599998535156,85.43200000732422 +40,0.003900448991251843,1.48128640625,65.16400001220703,86.3820000805664 +41,0.003922918129579297,1.55287359375,63.76600001220703,85.54400005859375 +42,0.0037474680658695953,1.55009421875,63.562000139160155,85.33400003417968 +43,0.0039003821023340735,1.4944140625,64.84599987304688,86.15999994873047 +44,0.003832862579396793,1.47437,65.29599987304688,86.49000002929688 +45,0.0037390387317697915,1.50843,64.82800001220703,86.0080000024414 +46,0.0036831728149471538,1.551256875,64.07600013916016,85.61399997558594 +47,0.0038603152248210142,1.4796565625,65.42000011230469,86.53800008300782 +48,0.0037030736483367427,1.4785609375,65.19200016113281,86.40799997558594 +49,0.0038562817332733956,1.452211875,65.87800013183593,86.71200007568359 +50,0.0036413500285042183,1.4864984375,65.09800009277343,86.43200000732422 +51,0.003774097821276103,1.53024046875,64.63200011962891,86.05800010986329 +52,0.0036625074821391274,1.472685625,65.56800011230469,86.5720000024414 +53,0.0039101472523595604,1.5525409375,64.06400000976562,85.66800003173829 +54,0.003761170930894358,1.53140046875,63.95000006835937,85.6660000805664 +55,0.0038942551805770825,1.5284903125,64.98600010742187,86.25600005615235 +56,0.0037146693128826363,1.471130625,65.2040000366211,86.56799989990235 +57,0.003862551646307111,1.566090625,63.550000102539066,85.44199998779297 +58,0.0037471780586721642,1.5615784375,63.66200009277344,85.21000003417969 +59,0.003933556095164802,1.505938125,64.50000008789063,86.06800003173828 +60,0.0037123816353934152,1.51548203125,64.28000006835937,85.95799982666016 +61,0.0037444233894348145,1.4667553125,65.69800008300781,86.56999997558594 +62,0.0037000667569892748,1.5014478125,64.77600014160156,86.08999998046875 +63,0.003817097150853702,1.47620203125,65.53200003173828,86.6259999243164 +64,0.003908832723807011,1.44793703125,65.99399989257813,87.16600000244141 +65,0.003810447491040187,1.50632359375,65.92000006103515,86.72400008056641 +66,0.003877482027746737,1.44935671875,65.74400009033204,86.75600010498047 +67,0.0035001660996515837,1.474410625,65.61999997802734,86.56799994873047 +68,0.0037858569329338415,1.44278734375,65.89400003417968,86.91399995361328 +69,0.0037039401774693814,1.4219121875,66.46199997802735,87.15800005126952 +70,0.003584539318191154,1.4854034375,65.58600012939453,86.43000002441406 +71,0.0036189446691423655,1.408791875,66.81999999511719,87.19399994628907 +72,0.0034194375288539697,1.5136809375,64.56200000488282,85.982 +73,0.0038340060273185372,1.50853046875,64.8480001196289,86.02799995605469 +74,0.0038377702169652495,1.48404703125,65.19000013916016,86.21399998046876 +75,0.0037398780696094036,1.44281453125,66.09400000732421,87.01200010253906 +76,0.0036941264157316516,1.419920625,66.66200010253907,87.30800004638672 +77,0.0036912159329014166,1.39577015625,67.10200002197266,87.50799999267578 +78,0.003628838441467711,1.43978140625,66.44999997558594,87.26600007568359 +79,0.0036196053926167743,1.428256875,66.31400001220703,86.93199995361329 
+80,0.0037884804021034923,1.41467578125,66.88999984375,87.37200010009765 +81,0.0037704520260116886,1.4212840625,66.73599994873047,87.291999921875 +82,0.003521366626955569,1.36810109375,67.35,87.92200017822266 +83,0.0036201796195070657,1.431815,66.478,87.4620000024414 +84,0.0036126957441280994,1.4206084375,67.08600015625,87.38800002685547 +85,0.00364691173724298,1.383635625,67.43399997314454,87.75199992431641 +86,0.003684012784755656,1.42890765625,66.46799995605468,87.32400002929687 +87,0.0037029437787298647,1.39296953125,67.16,87.6620000756836 +88,0.0035368107492104173,1.4194915625,66.504,87.33000005126954 +89,0.003732426424643823,1.389775,67.05400003173828,87.55000002441406 +90,0.003774909635207483,1.46568671875,66.54600000732422,86.95400013427735 +91,0.0035257768052230987,1.38999359375,67.09600002929687,87.77599981933594 +92,0.003705113460975034,1.395430625,67.05200005371094,87.70200002441406 +93,0.003521749783041222,1.326335,68.68200012451172,88.75400012451172 +94,0.003645537537522614,1.43553203125,67.51400010498047,87.94600010009766 +95,0.0036176966968923807,1.3430171875,68.53400003417968,88.60199997802735 +96,0.0035008633276447654,1.4016096875,67.02000000488282,87.51800000244141 +97,0.0034988391811826398,1.3573815625,68.02600003173828,88.43799987304688 +98,0.003750821475737861,1.37476375,67.54199997802735,87.8860000805664 +99,0.003664890703346048,1.35086578125,67.89200000488282,88.11200015625 +100,0.003684502377706979,1.373255,67.54400005615234,88.03999997558594 +101,0.003695012818622802,1.349871875,68.15200005126952,88.49599999511719 +102,0.0037163359811529517,1.3368015625,68.33599986816407,88.5100000024414 +103,0.003837666840159467,1.338334375,68.08000004638671,88.18199999755859 +104,0.0036296833007197294,1.34355421875,68.29200007080078,88.41799999267577 +105,0.0036591265151011093,1.3459440625,68.00600002441406,88.34000020263672 +106,0.0035502492849315915,1.33519578125,67.93999994873047,88.24399994628907 +107,0.0035088361806369255,1.38043453125,67.594,87.79400002441406 +108,0.0036571137108174817,1.3370221875,68.07799997802735,88.24600005615234 +109,0.003598236129619181,1.2989975,69.07599989013671,88.95199996826172 +110,0.0034665650288973537,1.31067546875,68.76000000488281,88.83600010253906 +111,0.003577601580348398,1.32908,68.58800009765625,88.47400002197266 +112,0.003416676722866084,1.29249703125,69.26600007568359,88.92599997070313 +113,0.00355812518059143,1.3232515625,68.66800005615234,88.62000005126953 +114,0.0033563452307134867,1.3571640625,67.93200002929687,87.85999994873048 +115,0.003407049791089126,1.304571875,68.76200012939454,88.95200007324219 +116,0.0034234707543094245,1.36884828125,68.73400020996094,88.44599994873047 +117,0.0037440506940973656,1.3250934375,68.96199994384766,88.82800005126953 +118,0.003433559023376022,1.2825096875,69.25600004638672,89.04400001953125 +119,0.003482125499950988,1.2820234375,69.37399999267578,89.04400001953125 +120,0.0035127438105908887,1.3135090625,68.70999995361328,88.90399997558593 +121,0.003636225060160671,1.28286640625,69.41800004394531,89.19200002197266 +122,0.003607004914166672,1.282149375,69.41400001708985,89.23599999267579 +123,0.0035776219363989575,1.26638484375,69.63000007080078,89.21400012451171 +124,0.0034578298052240697,1.292155625,69.1800000756836,88.85800012695313 +125,0.0035633945371955633,1.32452921875,68.92799995605469,88.77600002441406 +126,0.0034291247803983943,1.314371875,68.6280000439453,88.6499999975586 +127,0.003587623642358397,1.3392128125,68.4520000805664,88.366000078125 
+128,0.00341518730523863,1.32833703125,68.80799996582031,88.56399999511719 +129,0.003439400078994887,1.33987890625,68.01200016601562,88.39799997558593 +130,0.0033969029152233687,1.2661765625,69.92799991699219,89.17799996826172 +131,0.003517004212231508,1.280429375,69.7639999975586,89.32800004638672 +132,0.0035932111287755625,1.268138125,69.91200006835938,89.30200010009766 +133,0.0035588327861790147,1.35987171875,68.31999997802734,88.35400010009765 +134,0.0035531476106760757,1.229270625,70.50999999267579,89.74399997558594 +135,0.0034332048380747437,1.27554859375,69.87000007080078,89.38400004882813 +136,0.0034692909269194517,1.29049078125,70.19400004150391,89.62000009765624 +137,0.003353963855520955,1.2432678125,70.31400006591797,89.86000012207032 +138,0.003599305687073086,1.23844671875,70.32800001953125,89.83000004882813 +139,0.003525197905089174,1.266065625,70.21199999267579,89.37800012695313 +140,0.003439630115670817,1.23350921875,70.19000002685547,89.75200005126953 +141,0.003482519233200167,1.2829453125,69.83200005126953,89.30800020751953 +142,0.003490099500465606,1.2773321875,69.42800010009766,89.03599991943359 +143,0.0034282832805599484,1.230988125,70.60599994140625,89.96600009765625 +144,0.003393945932787444,1.25483703125,70.05200005126953,89.49400004882813 +145,0.003305191340457116,1.23083140625,70.49599996582032,89.67000017578125 +146,0.0033644404861011674,1.24529484375,70.15399996826172,89.82200012939452 +147,0.003268734019781862,1.2247296875,70.85000004394531,89.94799994140625 +148,0.0034619672889156,1.20769796875,70.98200001953126,90.25799994628906 +149,0.003331201466997819,1.26503203125,70.04799997070313,89.41799996826173 +150,0.003441011516510376,1.22800796875,70.52600007324219,89.92000022949219 +151,0.00342923730412232,1.205493125,71.00600012451171,90.21399999267578 +152,0.0033721947111189365,1.22785359375,71.00600011962891,89.97600004394532 +153,0.003331308303001736,1.2461840625,70.60000002197266,89.81000007568359 +154,0.0034347615770197342,1.2066878125,71.1920001196289,90.10000007080077 +155,0.0033012595626392533,1.20977125,71.22600006835937,90.11400007080078 +156,0.0033992239041253924,1.19793546875,71.61599994628907,90.45199983642578 +157,0.003318158394124891,1.21407890625,70.66000012451173,90.06199994628906 +158,0.003163406525605491,1.214715625,70.97999996582031,90.17000004882813 +159,0.0034511078681264606,1.1941971875,71.82799997314453,90.43200017578125 +160,0.003197109093889594,1.14000453125,72.43999991455078,91.09000006835937 +161,0.00321825232822448,1.16305515625,72.20600001708985,90.66999988769531 +162,0.003342022620407598,1.17439734375,71.66399989013672,90.62800009765625 +163,0.003477680164256266,1.1905534375,71.69600007324219,90.53199997070313 +164,0.0032866454649982707,1.1906775,71.70800004394532,90.56200006835938 +165,0.0032659856702334116,1.2040075,70.90800007080078,90.26400004638671 +166,0.0032735190553856747,1.18966578125,71.4379999194336,90.41599999267578 +167,0.003588428072232221,1.16072765625,72.15400001708984,90.81000009765626 +168,0.0033078217280230354,1.18827859375,71.74999993896485,90.47200014892579 +169,0.0034906826580741574,1.1408671875,72.46199991699218,90.97000004638672 +170,0.003191659942136279,1.18764828125,71.94600004150391,90.45000017333984 +171,0.003139835665933788,1.17025375,71.83200004638672,90.51200007324219 +172,0.0032830593242709127,1.1612790625,72.09000010253907,90.82200010009765 +173,0.003370747352684183,1.1359815625,72.7260000390625,91.09599991455079 +174,0.003248639587712075,1.1737246875,71.98399989257813,90.80200009765625 
+175,0.0031608384368675096,1.14259765625,72.85200001464844,91.11199999511719 +176,0.00323448857359056,1.1602953125,72.21999999755859,90.78799991699219 +177,0.003318667661265603,1.17726234375,71.84000002441407,90.60200001953125 +178,0.003132707821870489,1.14985984375,72.55800009521484,91.00800006835938 +179,0.0033881253330036998,1.156435625,72.46000006835938,90.93800001953124 +180,0.003284998570701906,1.13349390625,72.88999998779298,91.17200001464843 +181,0.003157229528629354,1.11932859375,73.05000004394532,91.31800009521484 +182,0.003064800286665559,1.116170625,73.36400009277344,91.28400001708984 +183,0.003283920614714069,1.128514375,72.93400013916016,91.1900001977539 +184,0.0031627657091511147,1.126823125,73.07000009277344,91.20600004394531 +185,0.003216694774372237,1.10957890625,73.2940000366211,91.41999993896485 +186,0.0033008489491684096,1.13548015625,73.0499999609375,91.4460001196289 +187,0.0033268413805801955,1.12621671875,73.36800007080078,91.34600009765624 +188,0.0031068910445485797,1.097263125,73.77199993896484,91.81600006835937 +189,0.003080885707666831,1.15398984375,72.1059999975586,90.68200002197266 +190,0.003289500534135316,1.09940046875,73.4820000390625,91.57600020019531 +191,0.0032341959553637673,1.11165171875,73.72000004638672,91.77999996582031 +192,0.0030639911502865808,1.12018734375,73.13600001464843,91.37000009521485 +193,0.0031219612075281994,1.1085696875,73.60000001464844,91.43200004638672 +194,0.0031889582086088403,1.09505875,73.93799988769531,91.7100001196289 +195,0.003157629132536905,1.08983859375,74.01399996337891,91.66600009521484 +196,0.0031145006152135985,1.07532953125,74.01800006835937,91.70999991455078 +197,0.0032537023736430065,1.0866959375,74.32400001708984,91.9120000415039 +198,0.0030873163071061882,1.11645671875,73.21000017089844,91.39600006835937 +199,0.0031369839229487945,1.09043328125,74.05000006347656,91.89799999023438 +200,0.003181640906924648,1.08010234375,74.06400006591797,91.92799999267578 +201,0.0030805699227909955,1.0735140625,74.08799998535156,91.88400011962891 +202,0.0030592053164062755,1.04242984375,74.6680000415039,92.2660000415039 +203,0.0030181118054315448,1.0612190625,74.59600008789063,92.12800022460938 +204,0.0028991151734122206,1.06336171875,74.3419999609375,92.03400002441406 +205,0.003133522512923394,1.06863453125,74.36200003417969,92.07399998535156 +206,0.0029669138769220027,1.057049375,74.50799993652343,92.20400006591797 +207,0.003045719815418124,1.03255125,75.06799996337891,92.34800001708984 +208,0.0031389754731208086,1.06708421875,74.44199999023438,92.11400006835937 +209,0.002939954483216362,1.0265696875,75.12800008789063,92.5659999633789 +210,0.0030027222487011124,1.0509209375,74.48200006591797,91.98199998779297 +211,0.0030858172768993036,1.04638328125,75.16400000976563,92.2900000415039 +212,0.003049486216955951,1.01229046875,75.49800005859375,92.74400006835937 +213,0.003054107938494001,1.0149840625,75.43199993164062,92.66000001464843 +214,0.0030523278817002264,1.027435,75.20000001464844,92.33399996582031 +215,0.00295353280047753,1.02725765625,75.22200014160157,92.46599999023438 +216,0.002916792208062751,1.03820640625,75.47200008544922,92.53200009277344 +217,0.0030544213950634003,1.00248921875,75.63400006347656,92.6440000415039 +218,0.0031409591070509385,1.003148125,75.68800013671876,92.84999993652343 +219,0.00295333845341312,0.99967875,75.9020000390625,92.76399998779297 +220,0.0028556018535579953,0.997283125,75.80200006347657,92.58000014648438 +221,0.002862908594709422,0.999586875,75.85599988037109,92.67600004638672 
+222,0.003056368823828442,1.00927765625,75.65600011474609,92.75400001464844 +223,0.003060362451443715,1.01704703125,75.67000011230469,92.81000009277344 +224,0.003175580724408584,0.9809190625,76.30400014160156,93.06399993408203 +225,0.002951170567290059,0.97987609375,76.25200006103516,93.04399996582032 +226,0.0028978271542915274,0.9713578125,76.16400000732422,93.23800006591797 +227,0.00277037569321692,0.9814646875,76.33599998291015,93.0980001196289 +228,0.0030606386883716497,0.9775634375,76.3620001147461,93.1340001171875 +229,0.002817842926430915,0.97441875,76.29000006347657,93.1499999633789 +230,0.0028577848404113737,0.98003734375,76.37199998291015,92.96600001464844 +231,0.002933824185415038,0.96455765625,76.66000000976562,93.1620000390625 +232,0.0028675607671695097,0.96486984375,76.67999993164062,93.26999998779297 +233,0.002932505465910903,0.9552828125,76.96200006103516,93.37800014404297 +234,0.0027431468811950515,0.954749375,76.96599998291016,93.40400001464843 +235,0.0027674867971135037,0.95912953125,76.77600008544921,93.30399993652344 +236,0.002751236149509038,0.96058203125,77.07400000976563,93.38400006591797 +237,0.0029287314641156365,0.95341609375,77.00400006103516,93.34000019287109 +238,0.002752234344370663,0.94035875,77.1740000366211,93.60999998779297 +239,0.00272264369829957,0.9491415625,77.13200001220703,93.42399999267577 +240,0.002795018887679492,0.9399025,77.42600003173828,93.4739999609375 +241,0.0028743272414430976,0.96436625,77.27999990234375,93.37800006591797 +242,0.002643940305071218,0.93623484375,77.42999998046875,93.56800009277343 +243,0.00276781537104398,0.93184546875,77.41600011230469,93.58800009277344 +244,0.0026543704360457404,0.9522346875,77.53400003417968,93.65799998779296 +245,0.002833746772791658,0.92415828125,77.69200003173827,93.7400000390625 +246,0.0028131523369146244,0.92019875,77.76400005615234,93.74200011474609 +247,0.002630403365141579,0.930619375,77.6620000366211,93.71200001464844 +248,0.002686031994276813,0.91615609375,77.89200005859375,93.78799998779297 +249,0.0027636841405183077,0.91213,77.9479999584961,93.82599993652343 +250,0.00263610525455858,0.91414609375,77.94400008789063,93.87000001220703 +251,0.0028250382248578326,0.90730046875,78.12200010986328,93.99800014160157 +252,0.0027282563969492912,0.89905328125,78.41600000244141,94.11400009033203 +253,0.0026974499092570375,0.90280859375,78.29199998291016,93.99600009033203 +254,0.0026165787795824663,0.90059046875,78.35999990234374,94.02800016601563 +255,0.0028645797033927272,0.896670625,78.30000006103515,94.0480000390625 +256,0.0027296430697398527,0.89341390625,78.28800003417969,94.12399993408204 +257,0.002628813514352909,0.8924690625,78.43999998046876,94.10399990722657 +258,0.0027438735456338952,0.89788578125,78.59200005859375,94.09800001464843 +259,0.0027608362558696952,0.88773546875,78.65600008544922,94.1600000390625 +260,0.002709025625205998,0.8862396875,78.82799995605468,94.15000008789063 +261,0.002753045010779585,0.885668203125,78.84200003173828,94.15800000976563 +262,0.0027249641716480255,0.87703578125,78.86000005859376,94.31800001220704 +263,0.002667704613746277,0.88469171875,79.01000010986328,94.24399993164063 +264,0.0027875113633594344,0.8815678125,78.92600000732422,94.18200001464844 +265,0.002604053189445819,0.88435046875,78.95800010742188,94.13800014160157 +266,0.0025744007268388358,0.87147875,79.06400008300781,94.3639999609375 +267,0.0025899515354207586,0.8690709375,79.12400013671875,94.34799998535156 +268,0.0025270525864990695,0.8703253125,79.16600013671875,94.38400000976563 
+269,0.0027527200457240853,0.86244484375,79.30800016113281,94.4319998828125 +270,0.002652591543405184,0.86984296875,79.37399995361328,94.38599998779297 +271,0.0025154544107083765,0.878178125,79.30400008300781,94.3060001147461 +272,0.0027743227214419414,0.86343484375,79.41800008544922,94.45400001220703 +273,0.0025572667862953885,0.861257578125,79.62200013671875,94.4799999609375 +274,0.002562160320979144,0.85878140625,79.47599992675781,94.47600001220704 +275,0.0026652730801807983,0.87701125,79.4160001586914,94.44400001220703 +276,0.002546968720188098,0.85214609375,79.69800002929688,94.45800009033204 +277,0.002406195496275489,0.85422359375,79.63799998046875,94.47999993408203 +278,0.0025625270292428987,0.857310859375,79.64600005371094,94.46000001220703 +279,0.002560538156623287,0.852581640625,79.76600005615235,94.55200016601563 +280,0.0023881655069999397,0.85405609375,79.73400010742188,94.55800013916016 +281,0.002368844230659306,0.8560896875,79.77799997802734,94.58600000976563 +282,0.0025179104247529593,0.847653125,79.85799995117188,94.62799993408203 +283,0.002443302289715835,0.854506875,79.83200010742188,94.74800000976562 +284,0.0025897356016295297,0.85438046875,79.94799992675782,94.62800000976563 +285,0.0025441833173057865,0.845479609375,79.88599995361328,94.6980000390625 +286,0.002364877677921738,0.847043125,79.91400003173828,94.65400006347656 +287,0.0024518951873428057,0.8473740625,80.00000008056641,94.66400006347656 +288,0.0025553761183151175,0.8408471875,80.07599995117188,94.74600001220703 +289,0.0024772981996648014,0.84096609375,79.95999997802734,94.7600000366211 +290,0.002522468126179384,0.845085859375,79.92799995361328,94.65000008789063 +291,0.00250109241876219,0.844013984375,80.0679999267578,94.69999993408203 +292,0.0023949523539548473,0.843519921875,80.05000000732421,94.73600003662109 +293,0.0024601881991007496,0.839418828125,80.11200008300781,94.71800003662109 +294,0.002333784642230187,0.83979578125,80.04199995361328,94.71800011474609 +295,0.0023810978995503058,0.84091578125,80.07200015869141,94.74600006347656 +296,0.0023521651829858975,0.838931640625,80.10800002929687,94.75399990722656 +297,0.0024202836211770773,0.8369471875,80.11600005615234,94.73000011474609 +298,0.0024537391810944037,0.838296484375,80.08600002929687,94.72600016601562 +299,0.0024408193421550095,0.837796171875,80.09599992675781,94.72999993408203 +300,0.0024034588714130223,0.84015,80.04599992675782,94.7660000366211 +301,0.002540342717631055,0.83971515625,80.10600002929688,94.7340001147461 +302,0.002468660681708051,0.842360625,80.17199997802734,94.7820000366211 +303,0.0024969897266211255,0.837438515625,80.14799997802734,94.75799998535156 +304,0.0025321109652785318,0.83988890625,80.04600005615234,94.71000000976562 +305,0.002433182222635618,0.838136484375,80.03999997802734,94.72999993408203 +306,0.0024769810760127647,0.83927546875,80.17199997802734,94.78199998535156 +307,0.0026157021389475892,0.84361546875,80.15200000488281,94.73600011474609 +308,0.002351050132087299,0.837729296875,80.07200003173828,94.73600016601563 +309,0.0023475157213397324,0.846922734375,80.21200000488281,94.76799998535157 diff --git a/CV/timm/exp_results/ViT/base/args_vit-B_150.yaml b/CV/timm/exp_results/ViT/base/args_vit-B_150.yaml new file mode 100644 index 0000000..ab05c6d --- /dev/null +++ b/CV/timm/exp_results/ViT/base/args_vit-B_150.yaml @@ -0,0 +1,112 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 3 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: true +bn_eps: null +bn_momentum: null +bn_tf: false 
+channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 150 +eval_metric: top1 +experiment: '' +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-08 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_base_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-base-ori- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-06 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/base/args_vit-B_300.yaml b/CV/timm/exp_results/ViT/base/args_vit-B_300.yaml new file mode 100644 index 0000000..e41f508 --- /dev/null +++ b/CV/timm/exp_results/ViT/base/args_vit-B_300.yaml @@ -0,0 +1,112 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 3 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: true +bn_eps: null +bn_momentum: null +bn_tf: false +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: '' +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_base_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-base-ori- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false 
+resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/base/summary_vit-B_150.csv b/CV/timm/exp_results/ViT/base/summary_vit-B_150.csv new file mode 100644 index 0000000..da0cd6d --- /dev/null +++ b/CV/timm/exp_results/ViT/base/summary_vit-B_150.csv @@ -0,0 +1,161 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.969629015241351,6.8789,0.35,1.4740000004577636 +1,6.7701307364872525,6.092885,2.923999998779297,9.55200001953125 +2,6.520314659391131,5.52451375,6.120000012207031,17.63199996826172 +3,6.309063332421439,5.1006775,10.359999992675782,25.637999995117188 +4,6.200977563858032,4.6926425,14.590000002441407,33.050000032958984 +5,5.966246536799839,4.211550625,19.92799998046875,41.69199997314453 +6,5.886366980416434,3.823265625,25.436000042724608,48.86799998535156 +7,5.636490276881626,3.476820625,30.502000009765624,55.28800012695312 +8,5.441435030528477,3.2040934375,34.54999999145508,60.13599993408203 +9,5.333667687007359,3.0704365625,36.912000010986326,62.384000053710935 +10,5.33064079284668,2.809416875,40.71199998535156,66.99200001953125 +11,5.050014563969204,2.70141125,43.060000002441406,68.93799999023437 +12,5.04610105923244,2.50888125,46.04800005126953,71.58800000488282 +13,4.8556502887180875,2.483266875,47.44400007080078,72.83800002929688 +14,4.818053586142404,2.33415125,49.5940000366211,75.03200020996094 +15,4.764411279133388,2.301555625,50.29400001464844,75.524000078125 +16,4.74158375603812,2.1787871875,51.97400000244141,76.99400014648438 +17,4.725761686052595,2.1432540625,53.04199998046875,77.7640001196289 +18,4.6711210523332864,2.1266396875,54.010000048828125,78.6920000415039 +19,4.646009683609009,2.0637640625,54.994000053710934,79.66799999267577 +20,4.620888267244611,2.0288465625,55.64800001953125,80.16399999023437 +21,4.584996495928083,1.99706625,56.465999943847656,80.57400009033204 +22,4.563190596444266,1.951900625,57.1580001171875,81.28200008789062 +23,4.4821431296212335,1.8932825,57.884000017089846,81.89999993164062 +24,4.455127239227295,1.87278,58.68000009033203,82.47600006347656 +25,4.406286137444632,1.86544875,58.723999990234375,82.26600011230468 +26,4.310754571642194,1.8313215625,59.26600009033203,82.67000000732422 +27,4.404716219220843,1.7840215625,60.09199994140625,83.1980000366211 +28,4.299009203910828,1.8148903125,59.758000092773436,83.15800011230469 +29,4.417932408196585,1.7890609375,60.447999982910154,83.57200008300781 +30,4.2956497328622,1.75007546875,61.16000005615234,83.83599997802735 +31,4.3164447375706265,1.7511821875,60.878000087890626,83.95799990234374 +32,4.325539588928223,1.7671178125,60.750000036621095,83.98800003662109 +33,4.260358010019575,1.72828484375,61.24400002929688,84.25600013427734 +34,4.185277155467442,1.740508125,61.1720001171875,84.13599995605469 +35,4.1936653682163785,1.7270590625,61.37399996337891,84.41800008789062 +36,4.253177131925311,1.7454315625,61.321999990234374,84.40199992919922 +37,4.308102743966239,1.726250625,61.394000048828126,84.24999998779298 +38,4.234909108706883,1.74166625,61.54800003417969,84.20600000488281 +39,4.306120225361416,1.71573078125,61.391999929199216,84.43199990234375 
+40,4.217206188610622,1.71197203125,61.92000005615235,84.634000078125 +41,4.3120207616261075,1.70625515625,61.70000008544922,84.65799997558594 +42,4.250123279435294,1.690705,61.85000001464844,84.68599995361328 +43,4.315731082643781,1.70799375,61.53600005859375,84.68000005615234 +44,4.222789577075413,1.7375190625,61.842000007324216,84.6680000805664 +45,4.26564964226314,1.714016875,61.938000009765624,84.95400005859375 +46,4.3635857445853095,1.6910959375,61.89200005859375,84.74799995361329 +47,4.227936165673392,1.6967903125,62.11200010986328,85.02400005371094 +48,4.254791617393494,1.7004075,62.08599998779297,84.89600003173828 +49,4.355360167367118,1.6895821875,62.13000006835937,84.76999997802734 +50,4.265195778438023,1.7516853125,61.66599998046875,84.66400010742187 +51,4.264554177011762,1.69786328125,62.00400006103516,84.77399995361328 +52,4.327261243547712,1.7127665625,61.6620001171875,84.65600000488281 +53,4.337813939367022,1.72115828125,61.97000008056641,84.78199995361328 +54,4.316329751695905,1.67919625,61.95400005859375,84.94600008789062 +55,4.249390431812832,1.723519375,61.83200000976562,84.64400006347657 +56,4.298370599746704,1.71378625,61.31800004150391,84.59600013427735 +57,4.244845850127084,1.74009125,61.500000036621095,84.44800005615234 +58,4.326196159635272,1.7223190625,61.487999965820315,84.40999990478515 +59,4.326049634388515,1.79321,60.632000007324216,83.9800001123047 +60,4.1043886968067715,1.54936984375,64.7080000048828,86.68400012939453 +61,4.025504384722028,1.5333053125,65.3160000341797,87.17000005126953 +62,4.114333816937038,1.52166609375,65.672,87.32400002197265 +63,4.03487799848829,1.50415203125,65.83000016113282,87.293999921875 +64,3.998051575251988,1.53302625,65.81399997558594,87.40800002441406 +65,4.01603765147073,1.5050271875,66.26600002441407,87.7180000732422 +66,4.131799561636789,1.49615859375,67.01799997314453,87.83800012695312 +67,4.02579082761492,1.4637478125,66.96800007568359,88.12800001953126 +68,4.021304403032575,1.4824240625,67.07400000244141,88.09800013183593 +69,3.9552708864212036,1.4422015625,67.18400006835938,88.29600010009766 +70,3.9504153047289168,1.460476875,67.14399994628906,88.22599994384765 +71,3.9517369951520647,1.40621890625,67.93799994628907,88.46599989501954 +72,3.9281171900885448,1.44610921875,67.65400004882812,88.40000004882812 +73,3.9567974976130893,1.4171990625,67.68600004882812,88.59200002197265 +74,3.9092021669660295,1.44212796875,68.24400012207032,88.59599991699218 +75,3.908873404775347,1.38805734375,68.40799997070313,89.02599991699219 +76,3.88528687613351,1.405209375,68.75000004638672,88.99200009765624 +77,3.881950242178781,1.40530421875,68.8119999658203,89.16600001953125 +78,3.855154871940613,1.36586625,69.13000004882812,89.17800010498047 +79,3.817075729370117,1.37695109375,69.27999994384766,89.5919999169922 +80,3.7851529121398926,1.3624575,69.51800014648437,89.53199991210937 +81,3.905322245189122,1.3414584375,69.5499999975586,89.51800002685547 +82,3.7586053950445995,1.3092875,69.9420000756836,89.78000007324219 +83,3.751699788229806,1.32302875,69.99599988769532,89.96599999023438 +84,3.8931176321847096,1.32061453125,70.19800007324218,89.98600012451172 +85,3.709507261003767,1.31129953125,70.26800009521484,90.21999994140624 +86,3.7826418536049977,1.2817078125,70.78999999267577,90.25800007080078 +87,3.6400119747434343,1.29241625,70.62000014648437,90.35200001953125 +88,3.758640170097351,1.27716546875,71.0879999633789,90.71399999023437 +89,3.6318452187946866,1.2621228125,71.35400004150391,90.60800001953125 
+90,3.651788149561201,1.252781875,71.63599993652343,90.72799996582032 +91,3.7197152887071883,1.2491325,71.77199999267579,90.88599999023438 +92,3.7757417304175243,1.26371796875,71.76600007080079,90.90400006835938 +93,3.6193600382123674,1.25542765625,71.77400006591797,90.88800001708984 +94,3.6238814422062466,1.2174590625,72.23000012207031,91.18000001708984 +95,3.536820190293448,1.24194640625,72.40800001464844,91.32400009033204 +96,3.5582499844687328,1.2054115625,72.8439999609375,91.49200021972656 +97,3.6898646354675293,1.18531578125,72.8999999584961,91.5640001953125 +98,3.549690229552133,1.18031859375,73.15199987792968,91.80200001464844 +99,3.609755516052246,1.1682584375,73.5279998828125,91.81599998779296 +100,3.657796195575169,1.16523125,73.63399996337891,91.82000017089844 +101,3.569818241255624,1.16046578125,73.75399990722656,91.91600009521484 +102,3.62766364642552,1.15534203125,73.86800000976562,92.15999999023437 +103,3.544077685901097,1.14937140625,73.92800006591797,92.08600009033204 +104,3.5154461520058766,1.11958328125,74.30400001220703,92.44999991210938 +105,3.5504840782710483,1.1220875,74.61600006103515,92.41400009277343 +106,3.4753070218222484,1.1120759375,74.76800016845704,92.60200016601563 +107,3.5267016206468855,1.10029203125,74.79000003417968,92.67399990966797 +108,3.444872396332877,1.11157,75.01600021972656,92.60800017089844 +109,3.4604526417595998,1.10123546875,75.36000003662109,92.75999991210938 +110,3.4083507571901595,1.0809575,75.45199990478515,92.8700001953125 +111,3.3957954985754832,1.065394375,75.60800009033203,92.9800001953125 +112,3.3272638150623868,1.06037859375,75.64400016845703,93.13599996337891 +113,3.4289666414260864,1.0603453125,76.15000003417968,93.15600006347657 +114,3.388340336935861,1.04481484375,76.2539999609375,93.1620001171875 +115,3.3944766351154874,1.0165696875,76.64199998291015,93.47000006347656 +116,3.3446701083864485,1.03141125,76.69600002929687,93.43200009033202 +117,3.3022158316203525,1.0210525,76.63599995361328,93.60599998779297 +118,3.3074265718460083,1.017645625,77.00199995605469,93.68400009277343 +119,3.213198951312474,0.99452734375,77.18400003173828,93.70600011474609 +120,3.2595878498894826,0.9907528125,77.4020000830078,93.94799990722656 +121,3.2362237998417447,0.9886540625,77.52999995605468,93.9280000390625 +122,3.153636063848223,0.97030625,77.73000010742187,94.03999990966797 +123,3.1741700853620256,0.9702671875,77.92399989990234,94.0639999609375 +124,3.184590901647295,0.9583778125,78.30200003173829,94.16600001220704 +125,3.116585901805333,0.94181203125,78.68399998046876,94.4280001147461 +126,3.1041476896830966,0.952405625,78.46399998046876,94.32799998779296 +127,3.1573141642979214,0.94671078125,78.60999987548828,94.4000000415039 +128,3.180657318660191,0.92641671875,78.80400003173828,94.5760001171875 +129,3.101477725165231,0.9277609375,78.93800012939452,94.5620001147461 +130,3.069905706814357,0.92532625,79.19000002929687,94.69400016845704 +131,3.1060594660895213,0.9192690625,79.23400010742188,94.7280000366211 +132,2.992018461227417,0.90406015625,79.7039999975586,94.79200016845704 +133,3.000976528440203,0.90748828125,79.68400000244141,94.87200000976563 +134,2.9952284267970493,0.887784375,79.83800000488282,94.9960001147461 +135,2.9843625681740895,0.8879746875,79.90600010498046,94.98200006347656 +136,2.9764948231833324,0.88236796875,80.11400005371094,95.09400001220703 +137,2.939366579055786,0.88322875,80.27399997802735,95.0480001171875 +138,2.916310088975089,0.8718796875,80.26000010742187,95.16400003662109 
+139,2.8832543236868724,0.8712409375,80.50000010986328,95.16800011474609 +140,2.9356773921421597,0.862911875,80.52800015869141,95.25000008789063 +141,2.8584332977022444,0.851140625,80.74800013183594,95.21599998535156 +142,2.907580545970372,0.85194359375,80.95000003173828,95.31199995849609 +143,2.8931364502225603,0.84861859375,80.87800005615235,95.33600000976563 +144,2.8906786952699934,0.844526875,80.96400008300782,95.37000006103516 +145,2.856494903564453,0.8433371875,80.98600010498046,95.4320000341797 +146,2.8453703948429654,0.84349796875,81.0280000024414,95.42200001220704 +147,2.7883094208581105,0.8313503125,81.18400000244141,95.5220000366211 +148,2.82052743434906,0.8334771875,81.28800010742188,95.45600003662109 +149,2.815722806113107,0.83443375,81.30000005371093,95.50399998535157 +150,2.8363174029759,0.82525375,81.26799997802735,95.56000006347656 +151,2.8583740166255405,0.82702125,81.48200010498047,95.57200013916015 +152,2.857897468975612,0.82399515625,81.5140000024414,95.61800006103516 +153,2.807421122278486,0.8223240625,81.50000000244141,95.59400008789062 +154,2.799610444477626,0.81890265625,81.52800005371094,95.61000013916015 +155,2.7337716477257863,0.82232546875,81.63599997558593,95.62000013916015 +156,2.7986813272748674,0.8176771875,81.53799997802734,95.64000013916015 +157,2.7346041883741106,0.81801578125,81.62399995117187,95.64200013916016 +158,2.7378521987370084,0.8184275,81.6320000024414,95.62400013916016 +159,2.7272439684186662,0.8178109375,81.66199989990234,95.63600013916016 diff --git a/CV/timm/exp_results/ViT/base/summary_vit-B_300.csv b/CV/timm/exp_results/ViT/base/summary_vit-B_300.csv new file mode 100644 index 0000000..b8e7525 --- /dev/null +++ b/CV/timm/exp_results/ViT/base/summary_vit-B_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.969629015241351,6.8789,0.35,1.4740000004577636 +1,6.7701307364872525,6.092885,2.923999998779297,9.55200001953125 +2,6.520314659391131,5.52451375,6.120000012207031,17.63199996826172 +3,6.309063332421439,5.1006775,10.359999992675782,25.637999995117188 +4,6.200977563858032,4.6926425,14.590000002441407,33.050000032958984 +5,5.966246536799839,4.211550625,19.92799998046875,41.69199997314453 +6,5.886366980416434,3.823265625,25.436000042724608,48.86799998535156 +7,5.636490276881626,3.476820625,30.502000009765624,55.28800012695312 +8,5.441435030528477,3.2040934375,34.54999999145508,60.13599993408203 +9,5.333667687007359,3.0704365625,36.912000010986326,62.384000053710935 +10,5.33064079284668,2.809416875,40.71199998535156,66.99200001953125 +11,5.050014563969204,2.70141125,43.060000002441406,68.93799999023437 +12,5.04610105923244,2.50888125,46.04800005126953,71.58800000488282 +13,4.8556502887180875,2.483266875,47.44400007080078,72.83800002929688 +14,4.818053586142404,2.33415125,49.5940000366211,75.03200020996094 +15,4.764411279133388,2.301555625,50.29400001464844,75.524000078125 +16,4.74158375603812,2.1787871875,51.97400000244141,76.99400014648438 +17,4.725761686052595,2.1432540625,53.04199998046875,77.7640001196289 +18,4.6711210523332864,2.1266396875,54.010000048828125,78.6920000415039 +19,4.646009683609009,2.0637640625,54.994000053710934,79.66799999267577 +20,4.620888267244611,2.0288465625,55.64800001953125,80.16399999023437 +21,4.584996495928083,1.99706625,56.465999943847656,80.57400009033204 +22,4.563190596444266,1.951900625,57.1580001171875,81.28200008789062 +23,4.4821431296212335,1.8932825,57.884000017089846,81.89999993164062 +24,4.455127239227295,1.87278,58.68000009033203,82.47600006347656 
+25,4.406286137444632,1.86544875,58.723999990234375,82.26600011230468 +26,4.310754571642194,1.8313215625,59.26600009033203,82.67000000732422 +27,4.404716219220843,1.7840215625,60.09199994140625,83.1980000366211 +28,4.299009203910828,1.8148903125,59.758000092773436,83.15800011230469 +29,4.417932408196585,1.7890609375,60.447999982910154,83.57200008300781 +30,4.2956497328622,1.75007546875,61.16000005615234,83.83599997802735 +31,4.3164447375706265,1.7511821875,60.878000087890626,83.95799990234374 +32,4.325539588928223,1.7671178125,60.750000036621095,83.98800003662109 +33,4.260358010019575,1.72828484375,61.24400002929688,84.25600013427734 +34,4.185277155467442,1.740508125,61.1720001171875,84.13599995605469 +35,4.1936653682163785,1.7270590625,61.37399996337891,84.41800008789062 +36,4.253177131925311,1.7454315625,61.321999990234374,84.40199992919922 +37,4.308102743966239,1.726250625,61.394000048828126,84.24999998779298 +38,4.234909108706883,1.74166625,61.54800003417969,84.20600000488281 +39,4.306120225361416,1.71573078125,61.391999929199216,84.43199990234375 +40,4.217206188610622,1.71197203125,61.92000005615235,84.634000078125 +41,4.3120207616261075,1.70625515625,61.70000008544922,84.65799997558594 +42,4.250123279435294,1.690705,61.85000001464844,84.68599995361328 +43,4.315731082643781,1.70799375,61.53600005859375,84.68000005615234 +44,4.222789577075413,1.7375190625,61.842000007324216,84.6680000805664 +45,4.26564964226314,1.714016875,61.938000009765624,84.95400005859375 +46,4.3635857445853095,1.6910959375,61.89200005859375,84.74799995361329 +47,4.227936165673392,1.6967903125,62.11200010986328,85.02400005371094 +48,4.254791617393494,1.7004075,62.08599998779297,84.89600003173828 +49,4.355360167367118,1.6895821875,62.13000006835937,84.76999997802734 +50,4.265195778438023,1.7516853125,61.66599998046875,84.66400010742187 +51,4.264554177011762,1.69786328125,62.00400006103516,84.77399995361328 +52,4.327261243547712,1.7127665625,61.6620001171875,84.65600000488281 +53,4.337813939367022,1.72115828125,61.97000008056641,84.78199995361328 +54,4.316329751695905,1.67919625,61.95400005859375,84.94600008789062 +55,4.249390431812832,1.723519375,61.83200000976562,84.64400006347657 +56,4.298370599746704,1.71378625,61.31800004150391,84.59600013427735 +57,4.244845850127084,1.74009125,61.500000036621095,84.44800005615234 +58,4.326196159635272,1.7223190625,61.487999965820315,84.40999990478515 +59,4.326049634388515,1.79321,60.632000007324216,83.9800001123047 +60,4.213467495782035,1.6962178125,61.84800006347656,84.79200003173828 +61,4.179520555904934,1.67279703125,62.441999934082034,85.09599984863281 +62,4.2883595909391135,1.70774703125,62.03399993164062,85.20599997558594 +63,4.212563753128052,1.6541159375,62.82400009277344,85.45000010986328 +64,4.186365791729519,1.7163590625,62.156000139160156,85.09800008300782 +65,4.211346830640521,1.6630990625,62.92400000976563,85.53600000244141 +66,4.345460380826678,1.706836875,63.13999995605469,85.28199992431641 +67,4.232215166091919,1.6551934375,63.3160000756836,85.56600015136719 +68,4.2405494792120795,1.6708703125,62.974000100097655,85.73600003417968 +69,4.184299758502415,1.6155940625,63.50000006103516,86.09000020507813 +70,4.189510720116751,1.636130625,63.55400000732422,85.91000003173828 +71,4.201522128922599,1.60571,63.98600003662109,86.16800002929688 +72,4.148890921047756,1.64055875,63.548000114746095,86.05599995117187 +73,4.201035022735596,1.61706171875,63.74000010498047,85.9760001586914 +74,4.1622961929866245,1.64356109375,63.83399998779297,85.86400013183594 
+75,4.16710318837847,1.60072328125,64.19599997802734,86.190000078125 +76,4.1475348472595215,1.61889765625,64.25200005615234,86.16199997558594 +77,4.158843449183872,1.63870453125,64.10599987304687,86.37799992431641 +78,4.142371841839382,1.597145625,64.36000003173828,86.40400005615234 +79,4.1166762965066095,1.60502796875,64.42400010009766,86.38800015625 +80,4.078348330089024,1.5798840625,64.87199997314453,86.685999921875 +81,4.230167474065508,1.56397234375,64.74600016357422,86.58000002685547 +82,4.071969883782523,1.53146046875,64.90000001953125,86.85000005126953 +83,4.0647357021059305,1.57282546875,64.98799995361328,86.65799997314453 +84,4.239261967795236,1.576009375,64.71600006347656,86.48800005371093 +85,4.046512143952506,1.56221296875,65.11799990234375,86.87000002685546 +86,4.121064526694162,1.5594540625,65.17199989746094,86.90399991943359 +87,3.9984947443008423,1.5599678125,65.36000000976563,86.9120000805664 +88,4.116381389754159,1.56357,65.24000002685547,87.04400015625 +89,3.9988598312650407,1.525839375,65.47000005615234,87.23199997558594 +90,4.018885595457895,1.51683296875,65.566,86.97400004882813 +91,4.1075364010674615,1.5444475,65.75200002929688,87.06599999755859 +92,4.171327829360962,1.55207921875,65.69600000244141,87.11199997558593 +93,4.023313914026533,1.542966875,65.69600006103515,87.15200008056641 +94,4.040976541382926,1.52646296875,65.62000004882813,87.28999986816406 +95,3.9684804337365285,1.55841890625,65.53800002441406,87.0920001538086 +96,3.9950338431767056,1.53197,66.24799995117188,87.56800012939453 +97,4.128159182412284,1.50979484375,66.18000010009766,87.63800002197266 +98,3.992838842528207,1.47391,66.11200015136718,87.66200010253907 +99,4.059381195477077,1.4893584375,66.54200010253906,87.65599997558594 +100,4.133719955171857,1.4780228125,66.442,87.74200010498046 +101,4.041632890701294,1.51560859375,66.54800013183593,87.67999997314453 +102,4.1323743888310025,1.49358578125,66.4760000756836,87.74200018066406 +103,4.0429258687155585,1.48665953125,66.75,87.9560002319336 +104,4.048352837562561,1.47264984375,66.65400016113281,88.054000078125 +105,4.077164786202567,1.4712521875,66.66599989501952,87.98799999511719 +106,4.014335121427264,1.47394609375,67.20400008300781,88.08599997558593 +107,4.067411524908883,1.488710625,67.01800005371094,88.16200002685547 +108,3.9856631415230885,1.490495625,66.98200008056641,87.98000007568359 +109,4.032512954303196,1.49698,67.07999988769531,88.10799999267579 +110,3.985403231212071,1.43890265625,67.46600001953125,88.34000004638672 +111,3.983332174164908,1.4535665625,67.45200002197265,88.40200007324219 +112,3.9212536300931657,1.459085,67.27200002685547,88.27800002441407 +113,4.041322333472116,1.457323125,67.76400010742188,88.45400004882812 +114,4.005600690841675,1.44920921875,67.25399997802734,88.33800010009766 +115,4.040409803390503,1.41558953125,67.61199997558593,88.486000078125 +116,3.9900557483945573,1.45428828125,68.04399999267578,88.66000004882812 +117,3.9569373641695296,1.427186875,67.8319999975586,88.76600004882812 +118,3.972442524773734,1.43617828125,67.71599994628906,88.476 +119,3.8783712216785977,1.4222303125,68.13200005126953,88.59199994140624 +120,3.945246083395822,1.41395203125,68.50200001953125,88.76200007080078 +121,3.9348261015755788,1.42097109375,68.19000015136719,88.83000002197265 +122,3.8721287761415755,1.4167175,67.98600007324218,88.48600005126953 +123,3.8878585951668874,1.387705625,68.44000002685547,88.82600005371094 +124,3.931880303791591,1.38112578125,69.11600007568359,89.17199997070313 
+125,3.8602528401783536,1.38122140625,69.19399997558594,89.41599999511719 +126,3.847030554498945,1.40038890625,68.70000004394531,89.18599994384766 +127,3.9303902047021047,1.38554421875,68.95999997070312,89.22000004882813 +128,3.979553392955235,1.37263125,68.60000007080077,89.22199994384765 +129,3.9104709114347185,1.39174265625,68.886,89.12000007568359 +130,3.868299501282828,1.37742796875,69.01400005126953,89.38400004638672 +131,3.939802203859602,1.387974375,68.72800014648438,89.17399996826173 +132,3.788008655820574,1.3468928125,69.71600004394531,89.63200001708984 +133,3.8497856003897533,1.3657671875,69.5219999633789,89.34199998779297 +134,3.851942607334682,1.3613771875,69.44600002685547,89.54999994384765 +135,3.8319093329565868,1.35302671875,69.64799996582032,89.56000007080078 +136,3.841527921812875,1.33907203125,69.50600004638672,89.69800004638672 +137,3.8174962997436523,1.36149984375,69.75200015625,89.60599994384765 +138,3.7776987893240794,1.32647453125,69.90999996826172,89.81799994140626 +139,3.779037492615836,1.360001875,69.77799994140625,89.7060000732422 +140,3.858560698372977,1.334925625,69.74599997314454,89.55999997070313 +141,3.7706131083624705,1.3348275,69.85999994384765,89.70000009765624 +142,3.8451303924833025,1.354425,69.88000001220703,89.92000014648437 +143,3.827801857675825,1.31508015625,70.50800001464843,90.02399989013672 +144,3.819279636655535,1.30098140625,70.38399999267578,90.11600007080078 +145,3.793690732547215,1.308443125,70.5079999975586,90.10400004638672 +146,3.79205060005188,1.3126325,70.42800012451171,90.0999999975586 +147,3.7342803989137923,1.2962403125,70.72799999023438,90.37200001953126 +148,3.759069698197501,1.30338671875,70.68000002197266,90.24599991699219 +149,3.773835233279637,1.27972859375,71.06800010009766,90.3099999194336 +150,3.7902458224977766,1.30308015625,70.69000001464843,90.22200004638673 +151,3.833420293671744,1.30644046875,70.62199994140624,90.2620000439453 +152,3.829545021057129,1.2752184375,71.1980001953125,90.56400006591797 +153,3.762980546270098,1.2827546875,71.08000010253906,90.56200002197265 +154,3.765869344983782,1.2928425,70.85000004394531,90.53799999267578 +155,3.6597749335425243,1.253755,71.40400012695312,90.62800012451171 +156,3.7653644255229404,1.2428259375,71.46800002197266,90.85600004394531 +157,3.665971670831953,1.26535609375,71.70599999267579,90.76799986328125 +158,3.662470664296831,1.28085375,71.41400001953124,90.76199999267578 +159,3.660918814795358,1.24413703125,71.95200009521484,90.93999996826172 +160,3.651732785361154,1.23913953125,71.85800001220703,90.84800014648438 +161,3.6541929244995117,1.26079765625,72.04000009765625,90.94600007080078 +162,3.6284380640302385,1.254959375,71.92000004150391,90.97400002197266 +163,3.6653095313480923,1.23689171875,72.06400004394531,91.05199994140625 +164,3.5928574800491333,1.21643140625,72.2220000390625,91.23200014648438 +165,3.684508442878723,1.2473825,72.38599996582032,91.02999993896485 +166,3.6942974669592723,1.204405,72.53200002197265,91.2360000415039 +167,3.737000686781747,1.2211703125,72.65800006591797,91.2500000390625 +168,3.639409899711609,1.2167225,72.39999999023438,91.31800006835938 +169,3.6478631666728427,1.20903890625,72.64000004150391,91.39800001708984 +170,3.5844166789736067,1.2037178125,72.71000006591797,91.33400011962891 +171,3.5722510474068776,1.18691359375,72.85599991699219,91.61200009277344 +172,3.6715655667441234,1.2027809375,72.9860000415039,91.5980001977539 +173,3.5886222294398715,1.20322296875,73.31799988525391,91.71600009521484 
+174,3.651749236243112,1.19743671875,73.04200009033202,91.51000001464844 +175,3.640444346836635,1.1770146875,73.50600003662109,91.7240000415039 +176,3.6747266224452426,1.16991953125,73.7160000805664,91.8840000390625 +177,3.5942376341138567,1.16856515625,73.54999997802734,91.77000009033203 +178,3.6450414998190745,1.17093625,73.51600009033203,91.78600004150391 +179,3.5550800391605923,1.18241265625,73.34600000976563,91.67800014648438 +180,3.5467359849384854,1.16648359375,73.8679999633789,92.04199996337891 +181,3.5346290384020125,1.1711478125,73.9060000390625,92.02400006591797 +182,3.549855317388262,1.16440859375,73.89200006103516,92.00000014404297 +183,3.488757542201451,1.13834375,74.21400000732422,92.16600006835938 +184,3.5486171756471907,1.13467140625,74.07599998535156,92.15600006835938 +185,3.527870978627886,1.14563390625,74.24400008789063,92.31800004150391 +186,3.641656960759844,1.147888125,74.40600000732422,92.22600001464843 +187,3.4327066114970615,1.12195578125,74.64799995605469,92.37800001464844 +188,3.4440344912665233,1.1374628125,74.2739998852539,92.24399996337891 +189,3.5416121823447093,1.128978125,74.56999998779297,92.36800014404297 +190,3.4425646918160573,1.10468796875,74.84600006347657,92.49800009277344 +191,3.3866710492542813,1.12815515625,74.77600001220704,92.4780001928711 +192,3.5243432010923113,1.112049375,74.93199998535157,92.6079999609375 +193,3.458450981548854,1.1140653125,74.88200003417968,92.58999993408203 +194,3.5172935724258423,1.10661859375,74.9979999584961,92.81200004150391 +195,3.3919708728790283,1.0790059375,75.4440000341797,92.87799998779298 +196,3.4189445972442627,1.0813096875,75.53599990234375,93.16600000976563 +197,3.4953403643199374,1.08123953125,75.31799997802734,92.85200001220703 +198,3.4433101756232127,1.0640003125,75.66999990722657,92.9319998828125 +199,3.4136417593274797,1.1084825,75.48000001220703,92.84799998535156 +200,3.368078657558986,1.064356875,75.87599995361329,93.06600014404297 +201,3.3975088596343994,1.0529915625,75.77000008789062,93.12799998779298 +202,3.414073722703116,1.048718125,75.8860000805664,93.09199993164063 +203,3.4181587355477467,1.039999375,76.12000003417968,93.26000008789063 +204,3.3536973680768694,1.05147796875,75.97800000488282,93.1780000366211 +205,3.3996514763150896,1.0671740625,76.00800000732421,93.1180000366211 +206,3.2854509013039723,1.04157546875,76.19400008056641,93.25400000976562 +207,3.3939605951309204,1.0417709375,76.20799992919922,93.39599995849609 +208,3.4585547958101546,1.031876875,76.7360001123047,93.36799991210937 +209,3.267347148486546,1.0176971875,76.61599995849609,93.41000009033203 +210,3.2705468790871755,1.021806875,76.78999998291016,93.56000019287109 +211,3.363382646015712,1.02334453125,76.88600003417969,93.56400001464844 +212,3.28737325327737,1.024301875,76.76,93.71399995849609 +213,3.268887758255005,1.0145903125,76.89600000732422,93.67399993652344 +214,3.246021270751953,1.00449375,76.9939999560547,93.73800000732422 +215,3.3066403525216237,1.0040428125,77.26399998291015,93.73800003662109 +216,3.300972972597395,1.016209375,77.22200002685547,93.71200008544922 +217,3.2904276847839355,0.99096859375,77.3560000366211,93.84399990722656 +218,3.254083718572344,0.9851975,77.50800002929688,93.77400000976563 +219,3.3215164116450717,1.00614453125,77.46800006347657,93.92999993408203 +220,3.217401776994978,1.00295546875,77.57199992919922,93.89199993408204 +221,3.183136684553964,0.9760646875,77.79600018554687,94.05600006347656 +222,3.207250254494803,0.98017984375,77.83199990478515,94.1139999584961 
+223,3.1810088668550764,0.9771621875,78.04999997802734,94.12400011230469 +224,3.192355445453099,0.96112828125,78.28000018554687,94.09199990478515 +225,3.176583766937256,0.96966171875,78.17600005371094,94.17200001220704 +226,3.216641868863787,0.97431265625,78.21800010498048,94.18800008789063 +227,3.2392124107905795,0.9575475,78.15200011230469,94.25200001220703 +228,3.094020298549107,0.96108203125,78.412000078125,94.35999998291015 +229,3.042390619005476,0.94788421875,78.44799997802734,94.39000003417969 +230,3.1469024079186574,0.9359265625,78.59599995361329,94.4300001147461 +231,3.0367832354136874,0.9324596875,78.87600002685546,94.51400006103516 +232,3.1701260123934065,0.9365625,78.87000000976562,94.43600008789062 +233,3.1740969930376326,0.92394921875,78.981999921875,94.62600008789063 +234,3.021683692932129,0.93181640625,78.81200012695312,94.58799998291016 +235,3.066072804587228,0.93264640625,78.95200005371093,94.44799998535156 +236,3.029092584337507,0.9182596875,79.23400010986329,94.81799995849609 +237,3.0436436789376393,0.9156940625,79.14999995117188,94.7540000366211 +238,3.0316148485456194,0.91837046875,79.13999997558594,94.71800021728515 +239,3.1059979370662143,0.91385984375,79.19399998046875,94.80999995849609 +240,3.0071086372647966,0.9125759375,79.28199987060547,94.74799998291016 +241,3.076224752834865,0.89396609375,79.62000010742187,94.85999998291015 +242,2.9855558361325945,0.90599921875,79.66800010742188,94.96199998291016 +243,3.02846850667681,0.906094375,79.56800010253906,94.88000008789062 +244,2.9678858518600464,0.8839753125,79.7680000830078,95.02400000976563 +245,3.019363965306963,0.897726875,79.81999998046875,94.89400000976562 +246,2.9393607548304965,0.894104375,79.84200010742188,94.95000003417968 +247,2.954161967550005,0.89661734375,80.11799997558593,95.02600008544921 +248,2.9839111055646623,0.881688125,80.04600010498046,95.10400005859375 +249,2.8732495989118303,0.8762984375,79.99600015625,95.16200000976562 +250,2.926910638809204,0.8640259375,80.362,95.18600013916016 +251,2.9226092100143433,0.8752515625,80.348000078125,95.15000000976562 +252,2.936304875782558,0.87086,80.36000005371093,95.2120001123047 +253,2.893927880695888,0.862070625,80.55800004882812,95.31000000732422 +254,2.914907455444336,0.86268625,80.44600005615234,95.36200000976562 +255,2.9629796062197005,0.8679853125,80.69200012695312,95.32400003417969 +256,2.9341112545558383,0.85994359375,80.904000078125,95.3640001123047 +257,2.8932479109082903,0.856921875,80.853999921875,95.31600019042969 +258,2.8523381778172086,0.85901671875,80.86600015136719,95.40199992919922 +259,2.840928418295724,0.85219765625,80.96600013183594,95.41000016357422 +260,2.7624418565205167,0.848074375,81.03999997802734,95.40000000732422 +261,2.8604295594351634,0.84466109375,80.91399989746094,95.36400008544922 +262,2.836558222770691,0.848165,80.85000005371094,95.38600005859375 +263,2.789436902318682,0.8437884375,81.21199997070312,95.50800010986327 +264,2.8127035924366544,0.841511875,81.3199999975586,95.51599990234375 +265,2.7396664108548845,0.84411375,81.428,95.51200003417969 +266,2.682282737323216,0.836664375,81.16800010253907,95.45400000732423 +267,2.7652451481137956,0.8375390625,81.48000012939453,95.51200000732422 +268,2.7684823785509383,0.83409765625,81.415999921875,95.53999995605469 +269,2.7430005414145335,0.82644015625,81.54600012939453,95.53000011230469 +270,2.7615814208984375,0.826859296875,81.38000005126953,95.56999992919921 +271,2.730608412197658,0.827575625,81.52599997558593,95.62400008544923 
+272,2.743230836732047,0.8226790625,81.64000013427734,95.68600008789062 +273,2.7087179933275496,0.82627703125,81.59800013183593,95.66400013916015 +274,2.680653316634042,0.81819546875,81.693999921875,95.65600000732422 +275,2.655898758343288,0.82363890625,81.71600010253906,95.69400011230469 +276,2.6966289622443065,0.818948125,81.82200005615235,95.69800013916016 +277,2.657309651374817,0.8161665625,81.8339999243164,95.69400013916015 +278,2.722650715282985,0.816388125,81.85400008056641,95.69000016357423 +279,2.7371155534471785,0.81848859375,81.80799997558594,95.73600016357422 +280,2.762461543083191,0.8157059375,81.89800002929688,95.77000011230469 +281,2.6352199826921736,0.809033125,81.97000005371093,95.80400000976563 +282,2.6447329010282243,0.81242375,81.89800002685547,95.74600013916016 +283,2.7577838727406094,0.810340625,81.9660000805664,95.78400006103516 +284,2.6313655035836354,0.8106365625,81.99800002929688,95.73400019042968 +285,2.695423790386745,0.8138815625,81.975999921875,95.81200008544921 +286,2.639449800763811,0.80932921875,82.01800010498047,95.80600013916016 +287,2.6542039769036427,0.81090203125,82.06199995117187,95.74000016357422 +288,2.5680429254259383,0.81421203125,82.14400005371094,95.75600011230469 +289,2.655463797705514,0.80852859375,82.086000078125,95.79600006103516 +290,2.640289000102452,0.80881171875,82.03200005371093,95.80800013916016 +291,2.6459262541362216,0.807845625,82.07800005371094,95.79800011230469 +292,2.615483454295567,0.8087240625,82.0779999243164,95.79400011230469 +293,2.6161666086741855,0.80778140625,82.14400005371094,95.83800011230468 +294,2.5554496560777937,0.80624515625,82.15400005371093,95.80800011230468 +295,2.562917113304138,0.8091565625,82.12000010498046,95.82000011230468 +296,2.634465183530535,0.80806171875,82.18800002685546,95.81200006103515 +297,2.5788906131471907,0.80650515625,82.12200000244141,95.83600006103515 +298,2.6572596175330028,0.80463859375,82.16400000244141,95.8400001123047 +299,2.7056877442768643,0.8056234375,82.1639999243164,95.82000013916016 +300,2.6104579312460765,0.80488515625,82.25199989746094,95.84400013916015 +301,2.6317106655665805,0.804826875,82.17200000244141,95.81400013916016 +302,2.6293483631951466,0.80684,82.18999992431641,95.82400013916016 +303,2.580767955098833,0.80481828125,82.18399992431641,95.84400021728516 +304,2.5847002608435496,0.804366875,82.2060000024414,95.86200021728516 +305,2.5991972514561246,0.80402203125,82.1739999243164,95.84600019042969 +306,2.622671059199742,0.80541546875,82.18399997558593,95.83600006103515 +307,2.5870354686464583,0.804596875,82.2959999243164,95.85400019042969 +308,2.622898374285017,0.80611625,82.20199992431641,95.84400006103516 +309,2.689241988318307,0.80708,82.17600005371094,95.81600006103515 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml new file mode 100644 index 0000000..5bebb3e --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml @@ -0,0 +1,113 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +bn_tf: false +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 150 
+eval_metric: top1 +experiment: '' +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-08 +mixup: 0.2 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small-bs-test- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 1005 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_150.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_150.yaml new file mode 100644 index 0000000..84b7a54 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_150.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 150 +eval_metric: top1 +experiment: wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08 +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 
+warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml new file mode 100644 index 0000000..8cf5701 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml @@ -0,0 +1,113 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +bn_tf: false +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: bs4096 +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.02121 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-08 +mixup: 0.2 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small-bs-test- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 1005 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 80 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_300.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_300.yaml new file mode 100644 index 0000000..6703113 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_300.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: e300-wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08-bce +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.8 +mixup_mode: batch 
+mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv new file mode 100644 index 0000000..f926d13 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv @@ -0,0 +1,171 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7026468387671879,6.984695,0.082,0.46 +1,0.058770897665194104,6.9116125,0.106,0.526 +2,0.007911681064537593,6.91631375,0.1,0.508 +3,0.007928581509206976,6.6319525,0.8200000009155274,3.075999998779297 +4,0.007686727081558534,6.04628,3.808000006713867,11.112000046386719 +5,0.007372890499287418,5.32454875,8.540000014648438,21.948000028076173 +6,0.007114177669531533,4.822300625,13.483999995117188,30.73600007080078 +7,0.006873541445072208,4.345096875,18.62599999267578,38.88200006591797 +8,0.006686848022841981,3.96510875,23.81200007446289,46.102000041503906 +9,0.006508722136329327,3.558395,28.93600006591797,52.72000012207031 +10,0.006406569953209588,3.319361875,32.69800003173828,57.07200003662109 +11,0.006091187142633966,2.980923125,37.60000006103515,62.92200005371094 +12,0.005965045220883829,2.7659815625,41.365999982910154,66.40400014648438 +13,0.005911312930818115,2.6131971875,44.04200001953125,69.21200000732422 +14,0.005739207878442747,2.4368365625,46.747999990234376,71.56399995117188 +15,0.005549260514921376,2.315486875,49.20399995605469,73.99400002685547 +16,0.005579812019797308,2.2251215625,50.674000087890626,75.37799997802735 +17,0.005359911625938756,2.1386896875,52.33400008789062,76.75399994384766 +18,0.0053312217351049185,2.047164375,53.998000007324215,78.22200001953125 +19,0.005294654119227614,1.98766875,55.05999994628906,78.90800022460938 +20,0.005293804021286113,1.928628125,56.43800004882812,79.80200014648437 +21,0.005101200592304979,1.892119375,56.980000048828124,80.62000014892578 +22,0.005065899142729384,1.830405625,58.26800009765625,81.35800006347657 +23,0.005011323107672589,1.7809378125,59.08200006835938,82.07200008789063 +24,0.005041455550651465,1.7671984375,59.573999936523435,82.28799998779297 +25,0.005057569460145065,1.72392078125,60.189999987792966,82.9780000390625 +26,0.005011343031323382,1.697203125,60.678000061035156,83.40599985351562 +27,0.004803977141688977,1.6762446875,61.38600006835937,83.77799998291016 +28,0.004737243688266192,1.6374884375,61.80200000976563,84.30000013427734 +29,0.0048476228756564,1.61066359375,62.402000092773434,84.65000003173829 +30,0.004830248554104141,1.60866765625,62.512000166015625,84.74799990722656 +31,0.004853829142770597,1.6123709375,63.006000034179685,84.83400006103516 
+32,0.004932305309921503,1.5929675,63.099999912109375,85.05400000976563 +33,0.004792891841913972,1.568394375,63.276000119628904,85.31999998291016 +34,0.004692332952150277,1.58934375,63.035999912109375,85.20200000244141 +35,0.004577582768563714,1.54651234375,63.73400003417969,85.60999985351563 +36,0.004556031598310385,1.5503490625,63.59800005371094,85.50600002685547 +37,0.0046822375152260065,1.52474796875,64.22400008789063,85.93000013427735 +38,0.004657128559691566,1.55069484375,63.546000217285155,85.62999992675782 +39,0.004746380395122937,1.52210296875,64.32800003417968,85.91000010742188 +40,0.004707724354895098,1.5206871875,64.42799998291015,86.0240000830078 +41,0.004603428766131401,1.506674375,64.54200005859374,86.05000005371093 +42,0.004603030060284904,1.50159671875,64.60199995361329,86.00200010742188 +43,0.0047432629591120145,1.5179125,64.27400005859376,85.91999986816407 +44,0.004725775448605418,1.51297625,64.40400008789062,86.0840000024414 +45,0.004635986472879138,1.51435421875,64.39799995605469,86.12600018310548 +46,0.004731553606688976,1.51975109375,64.57999998291015,85.91800005859375 +47,0.004743808514571616,1.52170140625,64.19400000732422,86.22800018066407 +48,0.0046771604434720105,1.51519671875,64.61600005615234,86.22799995361328 +49,0.004706535787720766,1.4999521875,64.70200003173828,86.35599994873047 +50,0.004842441262943404,1.52595046875,64.25200006347656,86.07999998046876 +51,0.004725219449028373,1.5043696875,64.45800010986328,86.2180000024414 +52,0.004687858246532934,1.5141671875,64.41200015869141,86.05400005371094 +53,0.004687787432755742,1.548313125,63.66999997802734,85.65200002685548 +54,0.0047174037899822,1.52720578125,64.31400008300781,86.20199995117187 +55,0.004637726915201971,1.52093953125,64.1680000366211,86.0380000390625 +56,0.004833232844248414,1.52520125,64.17599990966796,86.19600010742188 +57,0.004762610686676843,1.52058671875,64.0140000366211,86.07000000488281 +58,0.004648298191438828,1.52946125,64.14999998779297,85.98200005859375 +59,0.0046070771225328955,1.5326153125,64.15800011474609,86.00000005371093 +60,0.004567398789471814,1.38797765625,67.20599997558594,87.92 +61,0.004383251969037312,1.36050390625,67.4280001586914,88.1979999975586 +62,0.004411891967590366,1.35785125,67.754000078125,88.27799996826172 +63,0.004354702425189316,1.377858125,67.62200020019532,88.17399994384766 +64,0.004435187638072031,1.338603125,68.13600005126953,88.3960001538086 +65,0.0044293701316096955,1.36339078125,67.78399989501953,88.31600004882813 +66,0.004406826853352998,1.3349828125,68.20400002441406,88.53000010009765 +67,0.004341115642871175,1.34182421875,68.06400010498047,88.522000078125 +68,0.00440527665029679,1.3345721875,68.53000009765626,88.51799996826172 +69,0.00445441366173327,1.31747984375,68.80199995117188,88.88400004882813 +70,0.0045289295459432265,1.31268578125,69.02600005126953,88.89000002685547 +71,0.004412627180239984,1.31370578125,68.77999997558594,88.77800002197266 +72,0.00447040267421731,1.2932334375,69.15000004638672,89.27399997314453 +73,0.004431776136958173,1.3080253125,68.917999921875,88.95799999755859 +74,0.004366434345554028,1.29256109375,69.2580001196289,89.16399991699218 +75,0.004449943878820964,1.27832703125,69.34399997558593,89.37399991699219 +76,0.004414989829196462,1.27328578125,69.49399997070313,89.35799989013672 +77,0.004385375467661236,1.2588965625,69.97000009765625,89.67000014404297 +78,0.004234651502754007,1.252145625,70.06600004882813,89.61200001953125 +79,0.004134277879659619,1.2481928125,70.17599991699218,89.73199991455078 
+80,0.004376141986410532,1.2435940625,70.29000009521485,89.80399999023437 +81,0.004350113000587693,1.25145546875,70.16599994384765,89.71599991699219 +82,0.0041788803480033365,1.23727890625,70.4640000390625,89.90800012207032 +83,0.004163048130327037,1.22594515625,70.7480000390625,90.1040001196289 +84,0.004177262308076024,1.220285,70.80000002441406,90.10800001953125 +85,0.004356617806479335,1.2096825,71.06799990966798,90.28000017333984 +86,0.004136818195027965,1.20626265625,71.00800006835938,90.45400009277344 +87,0.004320669054452862,1.1960878125,71.44600004394532,90.39599999267578 +88,0.004225688005265381,1.1825890625,71.57000004394531,90.68199999267578 +89,0.004137393403133112,1.18721875,71.5899999633789,90.6060000415039 +90,0.004129843686574272,1.1719803125,71.85000006835938,90.6959999609375 +91,0.004141489758954516,1.1739284375,71.73799994628907,90.95400009521484 +92,0.004072970527756427,1.15733859375,71.9239999584961,90.9779999609375 +93,0.004200898892512279,1.16603765625,71.97399991455079,90.74799993652344 +94,0.004249856541199344,1.1445459375,72.57999996337891,90.96800004150391 +95,0.004225575564695256,1.14039734375,72.46000006835938,91.07400001708984 +96,0.004005532079775419,1.131825,72.8800000390625,91.18599999023438 +97,0.004071374174340495,1.1173815625,73.15000009521485,91.3600000439453 +98,0.004082076717168093,1.122508125,72.95799993164063,91.48199988525391 +99,0.004027015063911676,1.1120584375,73.32999996337891,91.45199999267578 +100,0.004073423183789211,1.1090534375,73.0780001171875,91.6000000390625 +101,0.004202060867100954,1.10801828125,73.33600004394532,91.67000009765626 +102,0.004041028491753552,1.08485265625,73.82200001220703,91.8820000415039 +103,0.004068882670253515,1.08951828125,73.75799998779297,91.70599988769531 +104,0.004002831843016403,1.0773490625,73.9340000390625,91.92600014648437 +105,0.0039905716798135215,1.06997796875,74.22000014160156,91.96800014404297 +106,0.0039710661263338155,1.07303515625,74.20199993164063,92.1240000390625 +107,0.004002945015339979,1.0502475,74.67599990722657,92.20000014404297 +108,0.003870416233049972,1.04936078125,74.8459999584961,92.24200006835937 +109,0.00415139301081321,1.049835,74.65800008544922,92.3300000390625 +110,0.00399666149834437,1.0420678125,75.02599999023437,92.40200004150391 +111,0.004025361367634365,1.03837640625,75.07799998779296,92.33199998779297 +112,0.0038562153931707144,1.03210140625,75.1619999609375,92.61000009277343 +113,0.004027474771386811,1.018221875,75.49000013671875,92.70800006347656 +114,0.003928569860623351,1.010503125,75.47600003417969,92.82799998779296 +115,0.0037325743386255844,1.0082990625,75.82800008789063,92.87200016845703 +116,0.003692587238869497,0.99984109375,75.9919999584961,92.90200006347656 +117,0.003773627004453114,0.993427578125,75.82000000732423,93.03400008789062 +118,0.003733301069587469,0.98609140625,76.09799993164063,93.04599998779297 +119,0.0037581040690253887,0.98603875,76.28400003173829,93.15600011474609 +120,0.003757303347811103,0.9739171875,76.39200014160156,93.22600003662109 +121,0.003823640407063067,0.96441375,76.72800016113281,93.3140001147461 +122,0.0038722677688513485,0.95932984375,76.80000005859375,93.4180000366211 +123,0.0037423527599977596,0.95374203125,76.90800008056641,93.4300001147461 +124,0.003713275771588087,0.95002859375,77.07799997558594,93.65599998535156 +125,0.0037526132738483803,0.94809203125,77.04999998046875,93.59000001220703 +126,0.003595086995379201,0.9410465625,77.21200008789063,93.65400016601562 
+127,0.0037954666851354496,0.93331484375,77.58000000732422,93.78200016601562 +128,0.0036314339875908835,0.927356484375,77.58400000732422,93.86399990722656 +129,0.0036980636484388795,0.92431375,77.68600000488281,93.90999993408204 +130,0.003769875886583967,0.91506,77.9819999560547,94.0260001171875 +131,0.003566112119837531,0.90513046875,78.06400000732422,94.11600006347656 +132,0.003662788059695491,0.910395078125,78.0900000024414,94.13200021728515 +133,0.003594044263341597,0.9083621875,78.20199993164063,94.2340000366211 +134,0.003604894254489669,0.8975578125,78.35000010498047,94.22800000976562 +135,0.003663198523489492,0.892152578125,78.53599992675781,94.25000011230469 +136,0.0036279520552073207,0.89069171875,78.59200018310547,94.34800006103515 +137,0.0035507999127730727,0.885088671875,78.67200000488282,94.42800006103515 +138,0.003624363453127444,0.88099734375,78.83999992675781,94.41600008789062 +139,0.003613233383345817,0.88451125,78.92200000488282,94.43399993408202 +140,0.003535857324355415,0.873215546875,79.06199995117187,94.45999993164062 +141,0.0036269916953252895,0.872527578125,79.16999995117187,94.59400006103516 +142,0.003528143628500402,0.86826640625,79.25200000732421,94.56400006103516 +143,0.0034625070568706307,0.86327984375,79.30600000488282,94.64400013916016 +144,0.003433352885102587,0.85692328125,79.4280000805664,94.72600008789063 +145,0.0035239099857530425,0.86083546875,79.59600013427735,94.71600008789062 +146,0.003434724856301078,0.855509765625,79.66200005371094,94.77000003417969 +147,0.003357212558122618,0.855286328125,79.71399989990235,94.84200006103515 +148,0.003438713012396225,0.8520646875,79.78599995117187,94.8499999584961 +149,0.0034021507848852445,0.8481296875,79.88000002929688,94.8320000366211 +150,0.003489113911720259,0.84722625,79.9060000805664,94.82800006103515 +151,0.003307607523830874,0.843561953125,79.97799987548828,94.86000008789063 +152,0.0034429498482495546,0.843726328125,79.99799997802734,94.9439999584961 +153,0.0033417041413486004,0.842510546875,79.99800005615235,94.85000000976562 +154,0.003366844529019935,0.842011796875,80.04599989990234,94.90800013916015 +155,0.0034030966427443282,0.8417021875,80.05599989990235,94.91600000976563 +156,0.003337076399475336,0.841655,80.16599997802734,94.94600008789062 +157,0.0034864412487617563,0.8409853125,80.12999992675782,94.94600008789062 +158,0.0033682246659217136,0.8408253125,80.10799992675781,94.94800013916016 +159,0.003300395860735859,0.840429453125,80.10799992675781,94.95800013916016 +160,0.0034828968678734134,0.840489453125,80.10999992675781,94.95200013916016 +161,0.0033458996497626814,0.840411171875,80.10999992675781,94.95600013916015 +162,0.0033955154607870747,0.840509453125,80.10799992675781,94.95400013916016 +163,0.0034967419258984072,0.840469453125,80.11199992675782,94.95800013916016 +164,0.003338477507765804,0.840429453125,80.11399992675781,94.95600013916015 +165,0.0033603642701304386,0.840461171875,80.10799992675781,94.95600013916015 +166,0.003346549197366195,0.84041703125,80.11199992675782,94.95800013916016 +167,0.003418706906294184,0.840461171875,80.10799992675781,94.95200013916016 +168,0.0034616739389353563,0.840401171875,80.10999992675781,94.95600013916015 +169,0.003319057735747525,0.840421171875,80.10799992675781,94.95600013916015 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_150.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_150.csv new file mode 100644 index 0000000..cd3c530 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/summary_vit-s_150.csv @@ -0,0 +1,162 @@ 
+epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7088104273591723,6.98578625,0.076,0.4399999984359741 +1,0.0593021409586072,6.9105975,0.1,0.488 +2,0.007911931656833206,6.91798,0.098,0.534 +3,0.007876356664512838,6.6090775,0.7639999987792969,3.085999990234375 +4,0.007697046135685274,6.09368625,3.602,10.651999990844727 +5,0.007457644079944917,5.40907375,8.25999998046875,21.33200000854492 +6,0.007227104323516999,4.87317,13.316000041503907,30.212000043945313 +7,0.007069527537428907,4.469114375,17.83600005859375,37.67800001220703 +8,0.0068350267330450675,4.03336875,22.974000017089843,45.019999990234375 +9,0.006735124126342791,3.7360025,26.911999973144532,50.64200002685547 +10,0.0065133661098246065,3.4029515625,31.460000067138672,56.11799998535156 +11,0.006383622730416911,3.11413875,36.14199999267578,61.25599998046875 +12,0.006272536403100405,2.925640625,39.16599985107422,64.40200005371094 +13,0.006175674231989043,2.7787234375,41.937999921875,67.25399999023438 +14,0.006054158921220473,2.6632184375,44.03199999755859,69.6120000390625 +15,0.005941766081377864,2.490314375,46.57800006835937,71.7799999609375 +16,0.005736711734373655,2.3612234375,48.838000092773434,73.87800000732422 +17,0.005751167856422918,2.2801615625,50.37600013183594,75.05000002685547 +18,0.005688209020133529,2.2064034375,51.42600002685547,76.07400020019531 +19,0.0056528631996895585,2.11119125,53.12000000488281,77.693999921875 +20,0.005559766764885613,2.078486875,54.07400008544922,78.16200002929688 +21,0.0055256913349564585,2.02706875,54.952000029296876,79.01000000976562 +22,0.005470881537933435,1.982676875,55.996,79.79399999023437 +23,0.005492086954680937,1.9340134375,56.625999997558594,80.45800017578125 +24,0.005311453382351569,1.8639765625,57.63000004638672,81.27000006347656 +25,0.005362782394513488,1.8542515625,58.23199999267578,81.61199993896484 +26,0.0051889723898576835,1.82206859375,58.56200001220703,81.97000013916016 +27,0.005191617146400469,1.8077596875,59.058000065917966,82.1180000390625 +28,0.00539108006549733,1.76982375,59.733999992675784,82.49199993652344 +29,0.005350109356056366,1.75589625,59.6940001171875,82.95000000976563 +30,0.0051864461108509985,1.7444865625,60.152000092773434,83.12800008789063 +31,0.005111382908320853,1.7377140625,60.58999988769531,83.37200003662109 +32,0.005090781321216907,1.70070625,60.926000063476565,83.86600009033204 +33,0.0051537183046873125,1.697153125,61.247999968261716,84.04599993652344 +34,0.005152960141588535,1.69067796875,61.30999995849609,84.1339999560547 +35,0.005137387929218156,1.67412921875,61.55600004394531,84.29999990478515 +36,0.005153708858415484,1.6652390625,61.874000063476565,84.37599989990234 +37,0.005256490149934377,1.6635028125,62.1420000390625,84.33800000976562 +epoch,train_loss,eval_loss,eval_top1,eval_top5 +38,0.005088782769494823,1.63648578125,62.50000005859375,84.98400003417969 +39,0.005024506510900599,1.60348984375,62.58800013916016,85.09200006103515 +40,0.005171889306179115,1.62893296875,62.43400001220703,84.78599992675781 +41,0.00501966945427869,1.6393809375,62.182000063476565,84.9300000805664 +42,0.005019068584910461,1.6146625,63.00200009033203,84.96200003173828 +43,0.005028320310105171,1.610436875,62.552000107421875,85.07799995361329 +44,0.005035558216539877,1.603858125,62.870000009765626,85.24400005615234 +45,0.0051212664028363565,1.61266875,62.57800005371094,85.0480000830078 +46,0.005013669574899333,1.58476109375,62.805999936523435,85.38800003173829 +47,0.00513466597268624,1.60628,62.746000041503905,85.16800013427735 
+48,0.005029742705768773,1.60754875,62.69999995605469,85.11400005615235 +49,0.005068301722141249,1.60245984375,63.012000014648436,85.22000013427734 +50,0.005102636824761119,1.59829046875,62.91400001953125,85.22400008789063 +51,0.00508713665684419,1.60935359375,62.944000063476565,85.19000018554688 +52,0.005092570458405784,1.62416609375,63.045999956054686,85.21000006103516 +53,0.005107233600158777,1.60259453125,62.84600011230469,85.3239999267578 +54,0.004963167610445193,1.62584015625,62.63200004150391,85.03599997558594 +55,0.005057706331302013,1.603459375,62.89600008544922,85.3440000024414 +56,0.005091265742001789,1.60293578125,62.752000063476565,85.15000005859375 +57,0.005120393088353532,1.59812140625,63.020000061035155,85.27399992675781 +58,0.005047764762171677,1.62439109375,62.720000036621094,85.08199998046875 +59,0.005100339318492583,1.6105346875,62.749999982910154,84.96400005371093 +60,0.00489781451012407,1.461613125,65.8520000024414,87.048000078125 +61,0.004919247635241065,1.44690765625,66.18199997070313,87.34000010253907 +62,0.00476340061452772,1.41814203125,66.62199994140624,87.67599994628907 +63,0.004819929466715881,1.4123228125,66.75000002441406,87.91599994384765 +64,0.004664965140234146,1.400436875,67.19600012939453,87.9180000756836 +65,0.004728357174566814,1.39647765625,67.11599997558594,87.95000015380859 +66,0.0049002468253352815,1.38950890625,67.18599995117188,88.1239999975586 +67,0.004868564462023122,1.38209890625,67.43199994140625,88.11400010009766 +68,0.00472947655777846,1.37768984375,67.502,88.29799997070313 +69,0.0046727384241031745,1.3800528125,67.54400002197265,88.31799994384765 +70,0.004654625364180122,1.358063125,67.92600010253906,88.59000007568359 +71,0.004676780397338527,1.34764734375,68.39599994384766,88.79199999267578 +72,0.004702951027346509,1.3553028125,68.19200002685547,88.68800010253906 +73,0.0046924852566527465,1.33750328125,68.48599989746094,88.95200001953125 +74,0.004712799996403711,1.32582234375,68.80199999511719,88.89799999511719 +75,0.0048501147289893454,1.327525,68.80200002441406,88.93000002441406 +76,0.0047676527007882085,1.3018096875,69.15399994140625,89.15400007324219 +77,0.00481278362816998,1.3033221875,69.2420000415039,89.29799989013672 +78,0.004725964540349585,1.2860696875,69.49800007080079,89.53000012207032 +79,0.004516901980553355,1.2861878125,69.61000001953126,89.42199997070313 +80,0.004539829911664128,1.2745015625,69.77400001953124,89.53799994140626 +81,0.004735531651281885,1.269680625,69.93200001953124,89.53599989257812 +82,0.004495503480679223,1.2706703125,70.01799997070313,89.58400004882813 +83,0.004645188538623708,1.24389453125,70.34400017089844,89.99799993896484 +84,0.004592442519164511,1.253758125,70.1959999658203,89.82199997314453 +85,0.004540879412421158,1.23293046875,70.69799996582032,90.16799996582031 +86,0.0046499134706599375,1.23806453125,70.8520001171875,90.11600001953126 +87,0.004522715928032994,1.216608125,70.97000004394532,90.38200009521485 +88,0.004548228744949613,1.21265484375,71.22600014160156,90.31400009765625 +89,0.004482994155426111,1.19394625,71.35399998779297,90.5680001977539 +90,0.00460372755437025,1.18898640625,71.69999994140625,90.69400007080078 +91,0.0045170816925487346,1.18815234375,71.54599999023438,90.59600004150391 +92,0.004452806664630771,1.1944325,71.74200006347657,90.64400014404296 +93,0.004470930102148226,1.17952640625,71.97399998046875,90.85000001220703 +94,0.004519084235653281,1.1689840625,72.18800008544922,90.80399998779296 +95,0.004441033882488098,1.1511928125,72.30200016845703,91.02200006835938 
+96,0.004507575084322265,1.13273125,73.06600013671876,91.2280001196289 +97,0.004393214426402535,1.1395334375,72.67200000976563,91.32800006835937 +98,0.0044600961929453274,1.13176390625,72.93400001464843,91.3420001196289 +99,0.004350347677245736,1.12184359375,73.11000014160156,91.57600009277344 +100,0.004354501043313316,1.1171884375,73.28800006103516,91.6200000415039 +101,0.0043542285981987205,1.109035,73.32200009033203,91.71199986328125 +102,0.0044115336744913036,1.10323703125,73.56599995849609,91.8140000415039 +103,0.0043475014556731495,1.09522,73.8040000341797,91.8679999633789 +104,0.004279967563759003,1.07913765625,74.18800013671876,92.07800006591796 +105,0.004298488759169621,1.07189796875,74.11799988037109,92.08200001464844 +106,0.0043106886358665565,1.0691628125,74.25399992919922,92.29599993408203 +107,0.004302483650722674,1.0528375,74.71400006103515,92.38399998779298 +108,0.004286574815133852,1.052930625,74.78800005859375,92.34199998779297 +109,0.004312934420470681,1.039460625,74.91800000488281,92.5140001147461 +110,0.004205309669487178,1.0343,75.10400001220704,92.6020001171875 +111,0.004196903435513377,1.031161875,75.1299999584961,92.58799998535156 +112,0.004206354929400342,1.0310475,75.30400006103515,92.59400006591797 +113,0.004175113913203988,1.01361875,75.68399995361328,92.81600000976563 +114,0.004100877741750862,1.01069421875,75.83800018554687,92.9179999609375 +115,0.0040066726173141175,0.997464375,75.9339998779297,93.0339998828125 +116,0.00419412087649107,0.9867975,76.1880000341797,93.24999985595703 +117,0.004112885084136256,0.986059375,76.17600018798828,93.11800006347656 +118,0.004120496783538589,0.97795734375,76.47800013427734,93.30400000976563 +119,0.004059170971491507,0.969184375,76.74600011230469,93.32400014160156 +120,0.00405424738502396,0.96552375,76.85999995117187,93.3660000390625 +121,0.0040235080689724,0.9608075,77.030000078125,93.51000008789063 +122,0.004061605897732079,0.950822578125,76.9779999243164,93.67200006347656 +123,0.004020468215458095,0.943777890625,77.22800010742188,93.67000000976563 +124,0.004036017577163875,0.9434803125,77.14199994384765,93.77000003417969 +125,0.004021217980022941,0.938650078125,77.34199997558594,93.76799993164063 +126,0.003956006823240646,0.923268828125,77.82200008300781,93.97000008789063 +127,0.00397626292293093,0.9259103125,77.79600018310546,93.9640001147461 +128,0.0039319154041420135,0.91582265625,77.75799990478515,94.0440000366211 +129,0.003862393304838666,0.913869296875,78.03200002441406,94.07600000976562 +130,0.0039001869902546915,0.901537578125,78.20000018066406,94.08600000976563 +131,0.0038433120353147388,0.8982784375,78.3039998461914,94.24600000976562 +132,0.0038704367554081337,0.8959540625,78.38600012939453,94.2480000366211 +133,0.003911821055226028,0.8925590625,78.47600010498047,94.30200008789062 +134,0.0039027773642114232,0.88617625,78.67799997802734,94.44400008789063 +135,0.003936674761851984,0.883285546875,78.73000008544922,94.3879999584961 +136,0.0038679074329723206,0.880095859375,78.98600002685546,94.44399998291016 +137,0.003974030193473611,0.878472109375,78.96200000488281,94.42399998535156 +138,0.003772691740388317,0.873896171875,78.98399997558593,94.58800003417969 +139,0.0039038029998274787,0.87008015625,79.19400025878906,94.5519999584961 +140,0.0037787892256996463,0.865694140625,79.24800005126953,94.65600000976562 +141,0.003886453907138535,0.86606859375,79.28600008056641,94.5779999584961 +142,0.0038702244803841624,0.862626875,79.29200015625,94.63000008789062 
+143,0.0038205020495557357,0.8608634375,79.3959999243164,94.62800000976563 +144,0.0038622134597972035,0.86000546875,79.36399995117188,94.67600008789063 +145,0.0037754822988063097,0.858419609375,79.458000078125,94.64599990722657 +146,0.0038074126600154807,0.85653265625,79.47600002685547,94.67800006103515 +147,0.0037796468074832645,0.85589296875,79.49800018066406,94.7039999584961 +148,0.003756655995467944,0.85510296875,79.54800015625,94.68800000976563 +149,0.0038007416961980717,0.854795390625,79.50400002685546,94.71000000976562 +150,0.0037753373284691145,0.85447609375,79.55800015625,94.66200000976562 +151,0.0038181487803480457,0.854808515625,79.54400002685547,94.69600000976563 +152,0.0038581541240481393,0.85495265625,79.57000012939453,94.69599995849609 +153,0.0038464674559820977,0.85448265625,79.61000015625,94.68400000976563 +154,0.0038058483374438117,0.8545775,79.57600002685547,94.70600000976563 +155,0.003683173995731132,0.854753359375,79.57600018066407,94.69200000976562 +156,0.0037975836977628724,0.85438921875,79.56800015625,94.72000000976563 +157,0.003804615482554904,0.854045078125,79.55200012939453,94.70400000976562 +158,0.0037559159598978503,0.85405609375,79.60600020751953,94.71600000976562 +159,0.003807129604475839,0.854081953125,79.59000015625,94.71599990722656 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv new file mode 100644 index 0000000..434da21 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7044260501861572,6.985015,0.088,0.452 +1,0.0969994724728167,6.91954625,0.1,0.502 +2,0.007983872550539672,6.909185,0.1,0.542 +3,0.007909782463684678,6.91218625,0.1,0.51 +4,0.007911908673122525,6.90983625,0.164,0.6220000016784668 +5,0.007871961453929543,6.70395875,0.5879999995803833,2.2040000076293946 +6,0.007742398825939745,6.261975,2.1440000048828125,7.538000052490235 +7,0.0075181673164479434,5.76455625,5.369999989013672,14.986000063476563 +8,0.007321218785364181,5.29548125,8.772000046386719,22.36600001098633 +9,0.007111855666153133,4.853555,13.030000030517579,29.66600005493164 +10,0.006843417999334633,4.459345,17.36600002685547,36.45599997558594 +11,0.006791550025809556,4.12680875,21.54399997314453,42.666000051269535 +12,0.006450463959481567,3.7919925,26.05999996459961,48.98600003417969 +13,0.006386950903106481,3.51486375,29.708000098876955,53.302000068359376 +14,0.006161959143355489,3.275209375,33.4859999621582,57.85200018798828 +15,0.005905966216232628,3.0013975,37.48999988769531,62.220000068359376 +16,0.0059155591879971325,2.8378715625,40.1099999584961,65.31599997070313 +17,0.00580364337656647,2.67033625,43.25800007324219,68.38600009033203 +18,0.005784340493846685,2.54994375,45.37000004638672,70.3460000390625 +19,0.005498647165950388,2.3923209375,47.886000122070314,72.7140000048828 +20,0.005400298163294792,2.3392971875,49.22999998779297,73.68799997802735 +21,0.005284862301778048,2.2200078125,51.148000134277346,75.39200002929688 +22,0.0052393467631191015,2.2018809375,51.76199992675781,76.0319999975586 +23,0.005337623995728791,2.06775625,54.09399992431641,77.97599994873048 +24,0.005264734965749085,1.9937253125,55.292,79.01200001708985 +25,0.005198978411499411,1.9435671875,56.252000021972655,79.83599996337891 +26,0.005091208382509649,1.910471875,56.89400001953125,80.4799998828125 +27,0.005193100310862064,1.84141171875,58.048000166015626,81.1960000366211 
+28,0.004887306306045502,1.79133453125,59.0100001171875,81.94200009033203 +29,0.004921669606119394,1.74195515625,59.95999993896484,82.50400003417968 +30,0.00497161119710654,1.7719084375,59.681999990234374,82.49599990722656 +31,0.004979399731382728,1.7319225,60.844000092773435,82.97799988037109 +32,0.004785400058608502,1.64407359375,61.781999877929685,84.00999998535156 +33,0.004833479702938348,1.64234125,62.18799998779297,84.17799998291015 +34,0.004748103907331824,1.62056109375,62.69000000488281,84.48600005859375 +35,0.004687358741648495,1.57935859375,63.360000087890626,85.16400005859376 +36,0.004569473152514547,1.5563034375,63.79000008544922,85.58800013671875 +37,0.004761328222230077,1.54370125,63.81800003417969,85.52800001464844 +38,0.004718770156614482,1.52213484375,64.40800006591797,85.90199997802735 +39,0.004768356215208769,1.51102453125,64.6120000366211,86.04000013427735 +40,0.0047006625682115555,1.50302859375,64.84799997802735,86.34000010742187 +41,0.004745179205201566,1.48255546875,65.20000005859374,86.34799995361328 +42,0.004688572196755558,1.474654375,65.36199998046875,86.4659999243164 +43,0.004641626437660307,1.4526140625,65.774000078125,86.92200002685547 +44,0.004737225652206689,1.44859515625,65.90600008300781,86.77000010253906 +45,0.004549883306026459,1.43138296875,66.2820000805664,87.08999997558594 +46,0.004568657139316201,1.4419103125,66.08,86.97400012939453 +47,0.00449184695025906,1.4311253125,66.21400005859375,87.23800018310547 +48,0.004698772158008069,1.43329,66.484,87.15599995117188 +49,0.004482330055907369,1.416868125,66.71999997802735,87.39999995361327 +50,0.004466343321837485,1.40876625,66.59600003417968,87.35800015869141 +51,0.004509880149271339,1.42795875,66.515999921875,87.28400012939453 +52,0.004421624355018139,1.4103325,66.96200002685546,87.4540000756836 +53,0.004459893825696781,1.40280203125,67.00600000488281,87.694000078125 +54,0.004646702029276639,1.42107328125,66.31000005615235,87.36199984619141 +55,0.004660098522435874,1.40078390625,66.97999994873047,87.64400015869141 +56,0.004636922327335924,1.40150484375,67.00999997314453,87.54800015380859 +57,0.004505249846260995,1.3814259375,67.38599999511719,87.78999989501953 +58,0.004583987290970981,1.39887203125,66.95400008300781,87.64400000488281 +59,0.00445454369764775,1.3692078125,67.46200002441407,88.01799994873046 +60,0.004361059225630015,1.3869778125,67.11800007080078,87.80800005126953 +61,0.004440771474037319,1.3606525,67.41000005126953,88.12400005126953 +62,0.004636758414562792,1.39476765625,67.29600005615234,87.77800003173829 +63,0.004537882923614234,1.38170265625,67.14600000244141,87.98000002685546 +64,0.004445640544872731,1.4132978125,66.766,87.50000010009765 +65,0.004597496357746422,1.3838809375,67.57400002441406,87.97399996826172 +66,0.004552507074549794,1.3813365625,67.304,87.90600010009766 +67,0.004598443454597145,1.404621875,67.20599981445312,87.72800004882812 +68,0.004736300674267113,1.369383125,67.59199991943359,88.01800002197265 +69,0.004423990612849593,1.3800675,67.385999921875,88.02800010253907 +70,0.004376259341370314,1.375864375,67.59799997070313,88.063999921875 +71,0.004542485869023949,1.3974696875,67.25599991699218,87.80600002197265 +72,0.004551234073005617,1.38990765625,67.202,87.82200002441407 +73,0.004510169732384384,1.39546359375,67.02400010009765,87.88600002929688 +74,0.004502174386288971,1.3995421875,66.8800000024414,87.71800010498048 +75,0.004468847764655948,1.38606765625,67.2239999975586,87.90200002197265 +76,0.0044153109774924815,1.38774046875,67.27800010253907,88.01200010253906 
+77,0.0045601483434438705,1.376890625,67.42200004882812,88.01800007568359 +78,0.004543673770967871,1.3869546875,67.29800007568359,87.92600002685546 +79,0.004438304342329502,1.39851296875,67.14399997070312,87.66800002441406 +80,0.004577811749186367,1.31768515625,68.536000078125,88.84199996826172 +81,0.004345477733295411,1.333046875,68.45199994628906,88.6480000756836 +82,0.004302715067751706,1.32025875,68.541999921875,88.93399996826172 +83,0.004509917518589646,1.3180740625,68.74399991699218,88.9799999975586 +84,0.004441541852429509,1.29395828125,69.16400005126953,89.25600002197265 +85,0.0045066027087159455,1.2853759375,69.141999921875,89.24200010009766 +86,0.004326534806750715,1.2793428125,69.49200007324218,89.23199996826172 +87,0.004294669663067907,1.36364703125,67.84400005126953,88.23799997314453 +88,0.004422786645591259,1.2834696875,69.44999997070312,89.29600002441406 +89,0.004403424798510969,1.28864765625,69.30399994140625,89.11400004882813 +90,0.004319502564612776,1.28498984375,69.38199994628906,89.25600002197265 +91,0.004208565398585051,1.2912790625,69.4160000415039,89.10199996826172 +92,0.0044140288373455405,1.30274,69.19799999511719,89.04800001953124 +93,0.004330580122768879,1.31306734375,68.91400002685548,88.88199997314453 +94,0.004385846754303202,1.2693425,69.68000001953125,89.57000007080079 +95,0.0044691963703371584,1.29532671875,69.619999921875,89.28600004638672 +96,0.004273455881047994,1.2526528125,70.02000009277344,89.62800004638672 +97,0.004275580518878996,1.2551025,70.17200007080078,89.62199996582031 +98,0.004168680345173925,1.2476459375,70.2620000439453,89.69599997070313 +99,0.004312307632062584,1.25005265625,70.04200002197265,89.70800001708984 +100,0.004151400818955153,1.2461659375,70.03999999511718,89.72400004394531 +101,0.004309043200919405,1.2391,70.36400012695313,89.92399999511719 +102,0.004157550341915339,1.22961453125,70.57400012207032,89.94399997070313 +103,0.004356609890237451,1.24146640625,70.50399991699219,89.85400001708985 +104,0.004298999905586243,1.227674375,70.76399998779297,90.17600006835937 +105,0.00430909771239385,1.2422475,70.44799996582032,89.9480000756836 +106,0.004445596074219793,1.22154328125,70.83000002441406,90.21400007324219 +107,0.00423419097205624,1.21691765625,70.83599999511719,90.18800004638672 +108,0.004210363578749821,1.213593125,71.11400001708985,90.26200007080078 +109,0.00412930449238047,1.20937359375,71.05799998779297,90.20600001708985 +110,0.0042115405085496604,1.22068828125,70.8900000439453,90.17599996582031 +111,0.00418245664332062,1.2140925,70.8720000439453,90.29600002197266 +112,0.004309441312216222,1.22103734375,71.18000001464844,90.14400006591796 +113,0.0042236754088662565,1.19859234375,71.22199989257813,90.38400007080078 +114,0.004432787478435785,1.23499390625,70.92599997070313,90.13999996826172 +115,0.0041279447614215314,1.1996665625,71.40400001708984,90.31399991699219 +116,0.0042328120325692,1.200145,71.30199989013671,90.44000001708984 +117,0.004356413439381868,1.2043246875,71.31400015625,90.41000007324219 +118,0.004225354467052966,1.212551875,71.19400001464844,90.07199999511718 +119,0.004277279193047434,1.19622109375,71.25399997070312,90.31799999511719 +120,0.004156066454015672,1.18837296875,71.68400017578125,90.45000014648437 +121,0.004125220933929086,1.1742715625,71.88199993896484,90.68599991210938 +122,0.004092424380360171,1.17817953125,71.87799991699218,90.69000006835938 +123,0.004256274143699557,1.18016703125,71.61000009521484,90.67000014648437 +124,0.0042632205004338175,1.1741175,71.82400002197265,90.88000001708984 
+125,0.004216796689433977,1.179620625,71.8520000415039,90.77600012207031 +126,0.004339889041148126,1.173100625,71.86399999267579,90.83200006835938 +127,0.0042471098131500185,1.17969546875,71.81000006835937,90.75400001708984 +128,0.004166796104982495,1.155326875,72.28800001464843,90.86999999267579 +129,0.00414535662275739,1.1584121875,72.21200003662109,90.84199991210937 +130,0.004148415551753715,1.14498046875,72.60400009033204,90.93599993896484 +131,0.0042910316260531545,1.16384421875,72.20800009521484,90.78599996582031 +132,0.0040841237641870975,1.17078765625,72.27400009765626,90.73199996582031 +133,0.004122275277040899,1.15785671875,72.15199996582031,91.04800001953124 +134,0.0042202716576866806,1.139366875,72.58799993896484,91.17399996826173 +135,0.004182497912552208,1.149436875,72.56800004150391,91.06200004150391 +136,0.004343954788055271,1.14087421875,72.87200014160156,91.15600009765625 +137,0.004005039401818067,1.13817484375,72.76200008789063,91.28799993896484 +138,0.004071355739142746,1.13199,72.97399998535157,91.16200009521485 +139,0.00414348577032797,1.132341875,73.01999996337891,91.24399993896485 +140,0.0039712024736218154,1.1301503125,72.95399999511719,91.27600007080078 +141,0.004159792559221387,1.134925625,73.14199998779297,91.28000004150391 +142,0.004030889453133568,1.12694734375,73.06800001464843,91.26800006835937 +143,0.0040782393189147115,1.12359453125,72.86000001708985,91.48399983886719 +144,0.003809310757787898,1.12580953125,73.17400008789062,91.37000007080078 +145,0.003980578243499622,1.1145928125,73.1079999609375,91.3860001196289 +146,0.004046716610901058,1.11903015625,73.20999988525391,91.30800001464844 +147,0.004106871841941029,1.1137225,73.3460000390625,91.53399991455078 +148,0.003906197816831991,1.10643140625,73.63199998535156,91.68200006835937 +149,0.004178202943876386,1.10409578125,73.54999998535156,91.6900001171875 +150,0.00415997754316777,1.09609484375,73.84400001220703,91.62600009521485 +151,0.003989629592979327,1.09465078125,73.63000006835938,91.76400006835938 +152,0.00396822375478223,1.10190734375,73.51400012207031,91.68000016845703 +153,0.003976788342697546,1.09611203125,73.81400002197266,91.72199999023438 +154,0.004158306081080809,1.09639421875,73.66799993652344,91.80200019775391 +155,0.004064109758473933,1.08795421875,74.01999996582032,91.8660000439453 +156,0.004117032280191779,1.0851290625,74.0259998828125,91.8880000415039 +157,0.0039173789555206895,1.08440359375,73.87799995849609,91.80399998779296 +158,0.003975894884206355,1.08112921875,73.8619999609375,91.83800006835938 +159,0.004066657216753811,1.083331875,74.13599999023438,91.88200001708984 +160,0.004081751016201451,1.07107125,74.42000001220703,91.9859999633789 +161,0.003981336456490681,1.07378015625,74.17199985595703,91.9720000415039 +162,0.003877143404679373,1.06280453125,74.5020000366211,92.22400011474609 +163,0.004046246845973656,1.08194265625,74.23800003417969,91.9560000390625 +164,0.003967240249039605,1.06394828125,74.41600006347656,92.2520000415039 +165,0.004025593894766644,1.06165015625,74.64000000732422,92.18999998779297 +166,0.003986912866821513,1.0660328125,74.38600001220703,92.11399996582031 +167,0.003991044999565929,1.063330625,74.50000009033204,92.16400004394531 +168,0.0039660760085098445,1.0574934375,74.60000001708984,92.29800004150391 +169,0.0038451424334198236,1.04937484375,74.78200001464843,92.34000001464844 +170,0.00406502527766861,1.045675,74.56399998535156,92.56799988525391 +171,0.0038427552208304405,1.0538271875,74.69800006347656,92.2440000439453 
+172,0.004004607035312802,1.0508121875,74.89399998779297,92.32400004150391 +173,0.003920168557669967,1.03681453125,75.0600001171875,92.54000009277344 +174,0.0037876375718042254,1.03353546875,75.05800005859375,92.5699999609375 +175,0.0039634802378714085,1.03495625,75.12800010986328,92.4180000415039 +176,0.003979514702223241,1.02824296875,75.19799989990234,92.64400006591796 +177,0.0039475191733799875,1.03462125,75.27599995605469,92.57400004150391 +178,0.003931994579033926,1.02431625,75.34600003417968,92.65600001464844 +179,0.0038644576852675527,1.02364328125,75.30000000732421,92.70000009277344 +180,0.004069518763571978,1.021311875,75.40200000244141,92.7040001147461 +181,0.0038787248195149004,1.0157940625,75.4380000390625,92.80000001708984 +182,0.0038980625104159117,1.01297875,75.60400006103515,92.78600004150391 +183,0.0038413635338656604,1.01237828125,75.65799998046874,92.83199998779297 +184,0.0038715062255505472,1.001800625,75.8019999584961,92.97600024902344 +185,0.0037820974830538034,1.00903609375,75.73799990234374,92.93200011474609 +186,0.0039416955260094255,1.00611484375,75.94000008544921,92.97599998779297 +187,0.004028597992146388,1.00629703125,75.76399995605469,92.94800006591797 +188,0.0040608441340737045,0.9983109375,76.03400006347657,93.05399998779296 +189,0.0038682857411913574,0.99529265625,76.10000005859375,93.01600009033203 +190,0.0038625796150881797,0.9906225,76.3139999584961,93.20400006591797 +191,0.003907167032593861,0.99719609375,76.07200003173828,92.99600016845703 +192,0.003970563324401155,0.98546515625,76.22399998291016,93.12800011962891 +193,0.0037146424292586744,0.98384734375,76.23200005859375,93.1819998852539 +194,0.0038293678080663085,0.980125625,76.29599997558594,93.23399993652343 +195,0.003857848176266998,0.9789878125,76.48399992675782,93.21600014404297 +196,0.003651000588433817,0.9828165625,76.30399995117187,93.18599990966797 +197,0.003931012062821537,0.974150625,76.47400006103516,93.2300001171875 +198,0.0037729314935859293,0.971365625,76.54600000732422,93.2560001147461 +199,0.003835154144326225,0.962321875,76.75400010986328,93.34400006591797 +200,0.0037835679831914604,0.97207484375,76.69200006347656,93.4340000366211 +201,0.0036511396756395698,0.9725315625,76.69000003417969,93.28200001220704 +202,0.0035157224046997726,0.96689546875,76.70999995361328,93.40600013916016 +203,0.0037986902752891183,0.96763328125,76.93200002929687,93.41600006591797 +204,0.0037802516599185765,0.96447734375,76.83799990234375,93.46799998779296 +205,0.00364596422878094,0.95649390625,77.00600002929687,93.54799985839844 +206,0.004014570848084986,0.95046234375,77.07600008544922,93.56600014404297 +207,0.003854787297314033,0.952626875,77.16199995361328,93.54800001464844 +208,0.003810833120951429,0.95615046875,77.11800005371094,93.5760001171875 +209,0.0036461960698943585,0.94509078125,77.31000010742187,93.59200014160156 +210,0.0036390326858963817,0.93478578125,77.606,93.7100000390625 +211,0.003728658310137689,0.940257734375,77.36200002441406,93.70999993408203 +212,0.003695755498483777,0.934088046875,77.38800015869141,93.8620001171875 +213,0.00374451614334248,0.93637125,77.55400008056641,93.7719999609375 +214,0.0037858944560866803,0.9297875,77.57800008056641,93.73599990722656 +215,0.003787133755395189,0.929144453125,77.604000078125,93.8679999584961 +216,0.0037237268406897783,0.92438484375,77.86200003173828,93.7780000390625 +217,0.0038397773751057684,0.9217740625,77.89799988037109,94.01000011474609 +218,0.003692085068905726,0.9237778125,78.078000078125,93.8559999609375 
+219,0.0037746465823147446,0.92401,77.9880001586914,93.95799998535156 +220,0.003497931669699028,0.91463734375,78.17200003417969,94.03000001220703 +221,0.0035741630708798766,0.91683484375,78.10400008300782,94.02600009033203 +222,0.0037327913742046803,0.91270625,78.18000021240235,94.0819999584961 +223,0.003703387745190412,0.907024765625,78.2820000805664,94.11999998779297 +224,0.0035485914850141853,0.90498890625,78.47000013183593,94.14200001220703 +225,0.0035215062380302697,0.90400015625,78.47200010498047,94.1780000390625 +226,0.003617745591327548,0.901161171875,78.52600002685547,94.1599999609375 +227,0.003756721707759425,0.902990625,78.46400000244141,94.20800016601562 +228,0.0035172457282897085,0.898592890625,78.47000010498047,94.24400009033204 +229,0.003436287835938856,0.89840953125,78.6879999243164,94.25600014160156 +230,0.0035909943107981235,0.8956084375,78.79399997558593,94.21800001220703 +231,0.003540566220181063,0.8909171875,78.638000078125,94.29199998535157 +232,0.003629441751400009,0.892078515625,78.7260001586914,94.26399985595702 +233,0.0036266729002818465,0.8912621875,78.62400000488282,94.3320000390625 +234,0.0036107241467107087,0.88978859375,78.85000020751953,94.35600008789062 +235,0.003551934292772785,0.89019984375,79.11600000244141,94.36199990966797 +236,0.003503760090097785,0.88300671875,79.01200000244141,94.46399998535156 +237,0.003436240862356499,0.875820546875,79.03000018066406,94.56399993408203 +238,0.00340579726616852,0.879975703125,79.24400002929687,94.4900000366211 +239,0.0035596858360804617,0.870110234375,79.32800018310547,94.57400006347656 +240,0.0034809598000720143,0.87433234375,79.23400002441406,94.59199998779297 +241,0.0034409927029628307,0.873230625,79.2940000805664,94.57199998535157 +242,0.003514723590342328,0.86742953125,79.39000007568359,94.68400009033203 +243,0.003626737539889291,0.86634640625,79.39000008056641,94.74800014160157 +244,0.0036246690433472395,0.87029609375,79.3940000805664,94.70000008789063 +245,0.003732266020961106,0.87053484375,79.45600008300781,94.74200009033203 +246,0.0034438550064805895,0.86946375,79.6140000024414,94.72600009033204 +247,0.003378850087756291,0.86259046875,79.56600005615235,94.7200001147461 +248,0.0035159428080078214,0.8622246875,79.65200000244141,94.7580000366211 +249,0.0034493720449972898,0.85940671875,79.846,94.71400003662109 +250,0.003627320984378457,0.859804375,79.86799995361328,94.77600013916016 +251,0.003383415110874921,0.85597328125,79.78600005615235,94.87200008789063 +252,0.0033759995421860367,0.861890625,79.68400005615234,94.86600016845703 +253,0.003541645623045042,0.8537678125,79.80800003173829,94.86200009033203 +254,0.0036422949051484466,0.85650078125,79.91000005371093,94.81799993408202 +255,0.003406544477911666,0.854969375,79.98199992919922,94.78199993652343 +256,0.0033522049780003726,0.8533825,80.05600005615234,94.96200001220703 +257,0.0034389470529276878,0.8473271875,80.10600008300781,94.93000014160157 +258,0.0033865342556964606,0.84619328125,80.18000008300781,94.9299999584961 +259,0.003447333292569965,0.84689046875,80.1759999243164,94.96200006347657 +260,0.0032964720739983022,0.85028234375,80.15400003173828,94.96200003662109 +261,0.0034078260068781674,0.84569546875,80.26399992431641,95.0580000366211 +262,0.003493778232950717,0.84492140625,80.29800000244141,95.03200016601562 +263,0.003361152426805347,0.8409953125,80.38600010498047,94.9840000366211 +264,0.003216014476493001,0.8484996875,80.2539999243164,95.02400001220703 +265,0.0033724562090355903,0.84592109375,80.42600002929687,95.06999998535156 
+266,0.003271011490141973,0.845681875,80.36600010986328,95.04200009033202 +267,0.00325453162076883,0.84505921875,80.45000010742187,95.08800006347656 +268,0.003399371402338147,0.84084203125,80.54600021240235,95.13200006347657 +269,0.003290993539849296,0.84360375,80.48200000244141,95.03600011474609 +270,0.003396770596737042,0.843845,80.66800008300781,95.04000009033203 +271,0.0034957354655489326,0.8456625,80.51999995117187,95.11200008789062 +272,0.0033322387316729873,0.8467834375,80.5320001586914,95.06799998535156 +273,0.0032145300647243857,0.843624375,80.62000005371094,95.14200008789062 +274,0.0032572261116001755,0.83734109375,80.67399997802734,95.1919999609375 +275,0.0032410602434538305,0.8366184375,80.61999998046875,95.23799998535156 +276,0.0032682681048754603,0.83492609375,80.75799989990234,95.13600006347656 +277,0.003225842461688444,0.8364571875,80.77199997802734,95.26400014160156 +278,0.003334630251629278,0.83932546875,80.70800020996094,95.24000001220703 +279,0.0031491983390878886,0.8327865625,80.80600005371093,95.22600003662109 +280,0.0032511689059901983,0.8352259375,80.79400003173828,95.24200001220703 +281,0.003383599338121712,0.83481140625,80.80199997802734,95.24200001220703 +282,0.003208352194633335,0.83299375,80.82000005615234,95.28400006347657 +283,0.00338284092140384,0.83310734375,80.90600002929688,95.24800011474609 +284,0.0033502599399071187,0.8340803125,80.87400010742188,95.21000006347656 +285,0.0031701632833573967,0.83368890625,80.93800005615235,95.27200006347657 +286,0.0031970099080353975,0.8329978125,80.93600013427735,95.29400001220704 +287,0.003307257400592789,0.83367359375,81.02200005615235,95.29600006347657 +288,0.0032536371145397425,0.83250046875,80.9960000805664,95.22600006347656 +289,0.0030824737914372236,0.83073984375,81.0500000805664,95.2640001147461 +290,0.003224483778467402,0.83288328125,81.01600010742187,95.24000006347656 +291,0.0033304091775789857,0.8298646875,81.03000013427734,95.31200016601562 +292,0.0032747428049333394,0.83119640625,81.06000018554687,95.2640000366211 +293,0.003150499804178253,0.83247578125,81.04200005615235,95.27000016601562 +294,0.003187613532645628,0.83180609375,81.09000005615235,95.24800011474609 +295,0.0032579210528638214,0.83188265625,81.07800005615235,95.29000011474609 +296,0.003274351533036679,0.83074125,81.12400013427734,95.29000011474609 +297,0.0031184881809167564,0.83056609375,81.10000013427734,95.29200003662109 +298,0.0031399513536598533,0.83104609375,81.07200013427735,95.2900000366211 +299,0.003196138044586405,0.83090609375,81.09400013427734,95.2840000366211 +300,0.003246222360758111,0.83078609375,81.08800013427734,95.28000003662109 +301,0.0032320290338248014,0.83078609375,81.08800013427734,95.28000003662109 +302,0.0031107992690522224,0.83080609375,81.09200013427734,95.2760000366211 +303,0.0032131875632330775,0.83076609375,81.09400013427734,95.2880000366211 +304,0.003161734639434144,0.83076609375,81.09600013427735,95.2840000366211 +305,0.00320960785029456,0.83088609375,81.09200013427734,95.28000003662109 +306,0.0031181383528746665,0.83080609375,81.10000013427734,95.28000003662109 +307,0.0033232175337616354,0.83096609375,81.09600013427735,95.2780000366211 +308,0.0031707440793979913,0.83080609375,81.09400013427734,95.2820000366211 +309,0.0033417781232856214,0.83084609375,81.08800013427734,95.28600003662109 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_300.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_300.csv new file mode 100644 index 0000000..de0b724 --- /dev/null +++ 
b/CV/timm/exp_results/ViT/small/summary_vit-s_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7088104273591723,6.98578625,0.076,0.4399999984359741 +1,0.0593021409586072,6.9105975,0.1,0.488 +2,0.007911931656833206,6.91798,0.098,0.534 +3,0.007876356664512838,6.6090775,0.7639999987792969,3.085999990234375 +4,0.007697046135685274,6.09368625,3.602,10.651999990844727 +5,0.007457644079944917,5.40907375,8.25999998046875,21.33200000854492 +6,0.007227104323516999,4.87317,13.316000041503907,30.212000043945313 +7,0.007069527537428907,4.469114375,17.83600005859375,37.67800001220703 +8,0.0068350267330450675,4.03336875,22.974000017089843,45.019999990234375 +9,0.006735124126342791,3.7360025,26.911999973144532,50.64200002685547 +10,0.0065133661098246065,3.4029515625,31.460000067138672,56.11799998535156 +11,0.006383622730416911,3.11413875,36.14199999267578,61.25599998046875 +12,0.006272536403100405,2.925640625,39.16599985107422,64.40200005371094 +13,0.006175674231989043,2.7787234375,41.937999921875,67.25399999023438 +14,0.006054158921220473,2.6632184375,44.03199999755859,69.6120000390625 +15,0.005941766081377864,2.490314375,46.57800006835937,71.7799999609375 +16,0.005736711734373655,2.3612234375,48.838000092773434,73.87800000732422 +17,0.005751167856422918,2.2801615625,50.37600013183594,75.05000002685547 +18,0.005688209020133529,2.2064034375,51.42600002685547,76.07400020019531 +19,0.0056528631996895585,2.11119125,53.12000000488281,77.693999921875 +20,0.005559766764885613,2.078486875,54.07400008544922,78.16200002929688 +21,0.0055256913349564585,2.02706875,54.952000029296876,79.01000000976562 +22,0.005470881537933435,1.982676875,55.996,79.79399999023437 +23,0.005492086954680937,1.9340134375,56.625999997558594,80.45800017578125 +24,0.005311453382351569,1.8639765625,57.63000004638672,81.27000006347656 +25,0.005362782394513488,1.8542515625,58.23199999267578,81.61199993896484 +26,0.0051889723898576835,1.82206859375,58.56200001220703,81.97000013916016 +27,0.005191617146400469,1.8077596875,59.058000065917966,82.1180000390625 +28,0.00539108006549733,1.76982375,59.733999992675784,82.49199993652344 +29,0.005350109356056366,1.75589625,59.6940001171875,82.95000000976563 +30,0.0051864461108509985,1.7444865625,60.152000092773434,83.12800008789063 +31,0.005111382908320853,1.7377140625,60.58999988769531,83.37200003662109 +32,0.005090781321216907,1.70070625,60.926000063476565,83.86600009033204 +33,0.0051537183046873125,1.697153125,61.247999968261716,84.04599993652344 +34,0.005152960141588535,1.69067796875,61.30999995849609,84.1339999560547 +35,0.005137387929218156,1.67412921875,61.55600004394531,84.29999990478515 +36,0.005153708858415484,1.6652390625,61.874000063476565,84.37599989990234 +37,0.005256490149934377,1.6635028125,62.1420000390625,84.33800000976562 +38,0.005222479480185679,1.65173234375,62.20800006103516,84.648 +39,0.005251926835626364,1.63313015625,62.53800000244141,84.9480001586914 +40,0.005172699357249907,1.62525328125,62.58600004150391,84.86999993164062 +41,0.004970315178590161,1.6189990625,62.66999993652344,84.85200000488281 +42,0.004989458713680506,1.6085684375,63.08999998535156,85.12400005615234 +43,0.0052113881268139395,1.61122203125,62.83200006103515,85.20600000488281 +44,0.004982101365125605,1.61612,62.63399995605469,85.08999997802735 +45,0.0051351250149309635,1.59424671875,62.9319999609375,85.26199997558594 +46,0.005090858754036682,1.615645625,62.298000012207034,85.04000009033203 +47,0.005041462934709021,1.6103778125,62.868000163574216,85.03800008300782 
+48,0.005173007891114269,1.62044953125,62.80199998046875,85.2100000024414 +49,0.005058950777830822,1.595099375,62.86000011474609,85.27200005859375 +50,0.005100807940055217,1.600473125,62.80199993408203,85.1840000805664 +51,0.005033610355375069,1.58446359375,63.045999907226566,85.38199997558594 +52,0.005180339362206203,1.6024678125,62.946000009765626,85.21599995361328 +53,0.005104479579521077,1.58289453125,62.96600009033203,85.42200013916016 +54,0.005021374972004976,1.6025959375,62.80600016601562,85.30599995361328 +55,0.005073556743030038,1.60291203125,62.93400006347656,85.36799998291016 +56,0.005158487640853439,1.6101865625,62.609999965820315,85.11199992431641 +57,0.005101768133629646,1.6161715625,62.609999963378904,85.00000008300782 +58,0.005178365357486265,1.603305,62.91400003173828,85.12200016113282 +59,0.0050634183654827735,1.625264375,62.71600003173828,85.022000078125 +60,0.005082057423091361,1.56725828125,63.44800000732422,85.69399989746094 +61,0.0050182118679263765,1.56564359375,63.832000056152346,85.810000078125 +62,0.00500678809891854,1.5584078125,63.872000053710934,85.83000008300782 +63,0.004991384588980249,1.54848390625,63.90199997802734,85.92000010742187 +64,0.005057041134153094,1.54323890625,64.27800003173829,86.02400008544922 +65,0.005009956025917616,1.570284375,63.900000036621094,85.86200023925781 +66,0.004924139373802713,1.529154375,64.41200008544922,86.16600010498047 +67,0.004968045678521905,1.52668625,64.45400008300781,86.314000078125 +68,0.00501549882548196,1.54637953125,64.18199994628907,86.06799994628906 +69,0.005025971581095031,1.51549640625,64.45600005126953,86.40200005126952 +70,0.005019565312457936,1.5235915625,64.53800000244141,86.47599987060546 +71,0.005023190851456353,1.5237465625,64.57600013183594,86.47199987304687 +72,0.00493933616339096,1.5230834375,64.73199994873048,86.44000002685547 +73,0.004933323544849243,1.5054390625,64.89200008300782,86.49800016113281 +74,0.0049628756075565305,1.52728390625,64.57800010742187,86.49999987304687 +75,0.004948072434802141,1.51232859375,64.85399995117187,86.543999921875 +76,0.0048303686281932256,1.52575109375,64.89600000244141,86.41599994873047 +77,0.004780515329912305,1.47931625,65.19199997802734,86.89199994873047 +78,0.004977641993069223,1.48489359375,65.35600005615234,86.79800002929687 +79,0.004913635186052748,1.49372359375,65.28599992919922,86.68200005615235 +80,0.004917722561263612,1.4772965625,65.56600000976563,86.98600002685546 +81,0.004869485540049416,1.462256875,65.76,87.00200013183594 +82,0.004857529919328434,1.4718278125,65.48000002685546,87.06199989746094 +83,0.004841296434668558,1.4812390625,65.68200012695313,87.07199997314453 +84,0.004901787831581065,1.46519921875,65.88999995117187,87.14199989990234 +85,0.0048849687445908785,1.45171453125,66.12199997314453,87.18400002685547 +86,0.004908482444339565,1.46072125,65.798,87.24200007568359 +87,0.004881637370479959,1.44158515625,66.44599997558593,87.32600015625 +88,0.004839210604716625,1.44029125,66.1840001586914,87.31400005126953 +89,0.004845336212643555,1.43849734375,66.24600002685547,87.5639999975586 +90,0.004818427609279752,1.4360671875,66.23799998046874,87.4899999975586 +91,0.004752129615683641,1.43512328125,66.30399999511718,87.3679999975586 +92,0.004836552809657795,1.41703296875,66.68800015380859,87.75199981445313 +93,0.00478907478308039,1.4329465625,66.48200004882813,87.63600004638671 +94,0.004824953453083124,1.42722390625,66.40000004882812,87.70200009765625 +95,0.004850512669820871,1.42813828125,66.79999997802734,87.7480000024414 
+96,0.004876219467925174,1.43117953125,66.73400005126953,87.61599989257813 +97,0.004927711095660925,1.41865015625,66.74200005126953,87.89799994873047 +98,0.004851346236786672,1.40063,66.94000010742188,87.91400010498047 +99,0.004951916402205825,1.4060365625,66.91800007324218,87.90400015136719 +100,0.0047612665221095085,1.4090628125,66.87200010253906,87.78399997314453 +101,0.004887008507336889,1.401068125,67.07000002441406,88.0020000756836 +102,0.0047703505759792665,1.39104484375,67.23199989746094,88.02400012695313 +103,0.00487110427846866,1.3881403125,67.48200005371093,88.22999989501953 +104,0.004896886247609343,1.4029371875,67.3679999975586,88.10799997070312 +105,0.0048172368468450645,1.3930609375,67.28200001464843,88.07999999267578 +106,0.004833530435072524,1.38458515625,67.55199989746093,88.27199989501953 +107,0.0047719960233994895,1.394876875,67.36400005126953,88.13600001953125 +108,0.004808439041620919,1.3727871875,67.73999994140625,88.23200007324219 +109,0.004759473027661443,1.36038953125,67.80200002441406,88.47999994140625 +110,0.004733743013015815,1.37357859375,67.75000002685547,88.37600007568359 +111,0.004777830792590976,1.35628453125,67.966,88.52199996826172 +112,0.004750983955870781,1.36778015625,67.9559999194336,88.42800004638671 +113,0.004782901012471744,1.351753125,67.78600002685548,88.489999921875 +114,0.004859128035604954,1.368960625,67.9739999975586,88.39600005126952 +115,0.004820822006357568,1.348521875,68.20200010253906,88.6520001538086 +116,0.004778401726590735,1.345480625,68.45600007080078,88.76800004882813 +117,0.004646568293018001,1.340895,68.28199997314454,88.82000004638672 +118,0.004756226736520018,1.34797265625,68.43599994140625,88.7539999194336 +119,0.004734682850539684,1.34323171875,68.42199996826172,88.77000009765625 +120,0.0047223693358578855,1.3437465625,68.40399999511719,88.8380001538086 +121,0.00478786273327257,1.3476125,68.59599994628907,88.65000007324218 +122,0.004672181892341801,1.3225603125,68.87000006591796,88.85399996582031 +123,0.004665981911654983,1.3343040625,68.63799999511718,88.82599997070312 +124,0.0047178248475704876,1.33354453125,68.7540000415039,89.07199994140625 +125,0.004726118374882,1.32187890625,68.65200004638672,88.8800000756836 +126,0.004768286133185029,1.319091875,68.96800012451172,89.16199996826172 +127,0.004736629148413028,1.31102640625,68.91800007324218,89.17600004882813 +128,0.004740432536761675,1.30669734375,69.19400009765624,89.07200002197266 +129,0.004662513433556471,1.30038671875,69.16200002197266,89.32599999267578 +130,0.004642459863264646,1.3016015625,69.35200004882813,89.14799997070313 +131,0.004651597135567239,1.29885984375,69.18000006835938,89.18999981445313 +132,0.004526541002893022,1.28210890625,69.70399996582032,89.41400004882813 +133,0.004782311518543533,1.30472515625,69.34399999511719,89.38200009765625 +134,0.004684346761288387,1.2839378125,69.53999986328125,89.42399996826173 +135,0.004732772075970258,1.2874309375,69.61800012207031,89.59200004638672 +136,0.004705433168315462,1.28186265625,69.52399994140625,89.41600004638671 +137,0.004564720771408507,1.28739578125,69.6279999609375,89.51400010009766 +138,0.004698288693491902,1.26457640625,69.82800004150391,89.66599996826172 +139,0.004667031146319849,1.26491140625,70.13000006103516,89.74200014648437 +140,0.004615324449592403,1.26782265625,69.94600012207032,89.6339999951172 +141,0.004736929267112698,1.2716990625,69.92200002197265,89.66000007080078 +142,0.004645303856315357,1.26636546875,70.05000012207032,89.79600007080079 
+143,0.004589632619172335,1.27597921875,69.87999999267578,89.6260001513672 +144,0.004599113316674318,1.26832953125,70.03800010009766,89.70800017333984 +145,0.004497614356556109,1.24645171875,70.2940000390625,89.89200001953125 +146,0.004668201053781169,1.25213734375,70.25399991699219,89.90399996826172 +147,0.004643831202494246,1.2573528125,70.34800009277343,89.83199994140625 +148,0.004657145689374634,1.246579375,70.32000004638672,89.87399997314454 +149,0.004672455501609615,1.24891890625,70.46399994140624,89.97999988769531 +150,0.004561962260465536,1.2371015625,70.81199999267578,90.01200011962891 +151,0.004665072109284145,1.2461521875,70.58600006835937,89.99000004638673 +152,0.004581982303144676,1.24205828125,70.82200009277344,90.14599993896485 +153,0.004534704111782568,1.2320478125,70.91400018066406,90.11200009765625 +154,0.00448989266130541,1.219623125,71.05400009521485,90.25200001953125 +155,0.004539375891909003,1.216745,71.27199994140625,90.33400009521485 +156,0.004434717386694891,1.220150625,71.1020000415039,90.27199999267579 +157,0.004551438348633903,1.2224921875,70.9960001171875,90.40999991699219 +158,0.004494417591818741,1.2208475,71.21400014892578,90.29600012451172 +159,0.004556146516863789,1.21477640625,71.20599999511718,90.46600004638672 +160,0.0044869347808084315,1.209465625,71.1119999633789,90.34599983886719 +161,0.004461011549990092,1.19804203125,71.48400001953125,90.57199991699218 +162,0.004472280120743173,1.19293375,71.64600006835937,90.66400014648437 +163,0.0044560003693082505,1.18684765625,71.78400012207031,90.65399986083985 +164,0.004553499465276088,1.2062878125,71.5279999633789,90.49000004638673 +165,0.004505423562867301,1.18359390625,71.8559999633789,90.67399988769532 +166,0.004468879961807813,1.17607625,71.90200014160156,90.79999986083985 +167,0.004504834667646459,1.17856859375,72.16599996582032,90.99200007080078 +168,0.004461629715348993,1.170638125,72.18999996337891,90.82400007080078 +169,0.004403821259204831,1.17690703125,72.00999999511718,90.80199997070312 +170,0.004433724197692105,1.1584646875,72.29599998779297,91.04799996582031 +171,0.004493325383269361,1.17607875,72.1560000366211,90.89199991210937 +172,0.004585765568273408,1.176185625,72.15800007324219,91.04000004638672 +173,0.004505930235609412,1.16091203125,72.40199990478516,91.09399996582032 +174,0.0044098537348742995,1.154465,72.56800004394532,91.14999996826172 +175,0.004415984298767788,1.1441278125,72.65400006835938,91.23800001953126 +176,0.004470436135306954,1.14189859375,72.6019999609375,91.17000002197265 +177,0.004430129724953856,1.1509359375,72.72800001708984,91.16800004394531 +178,0.004526908675740872,1.1395303125,72.98599999023438,91.24800009521485 +179,0.004396590969658324,1.14365375,72.67000001220703,91.35200004394531 +180,0.004393110816766109,1.1443346875,72.87400000976562,91.38399980957031 +181,0.004384525552658098,1.12447671875,73.29599999023438,91.42199996826172 +182,0.004510728775390557,1.120158125,73.32200006835937,91.57599993896484 +183,0.004279247085962977,1.1233425,73.32999998535156,91.6160001171875 +184,0.004418941780126521,1.12287375,73.25199999023438,91.62199991210937 +185,0.0043414472934923,1.1175003125,73.23800009277343,91.62400004394532 +186,0.00444089132361114,1.1186775,73.1880000390625,91.63200001464844 +187,0.004242212129091578,1.1076759375,73.49599999023438,91.6920001171875 +188,0.004308835071112428,1.11382453125,73.61799998779297,91.66400009277343 +189,0.004377027907009635,1.09399609375,73.75999990722656,91.7500001953125 
+190,0.004387934026973588,1.09843328125,73.76799990722657,91.8679999609375 +191,0.004388023932863559,1.09843125,73.90999998779297,91.86400007080078 +192,0.004349396996466177,1.0851678125,74.0220000415039,91.89800009277344 +193,0.004335067600810102,1.089355625,73.96399995849609,91.92600001464844 +194,0.004343908280134201,1.0814921875,74.13400001464844,92.05199991210938 +195,0.004251544397058231,1.0836525,74.10800008789063,92.0840000390625 +196,0.004303004742333931,1.079254375,74.2260001147461,92.07999998779297 +197,0.004285441528606627,1.06234,74.47600001464843,92.23600001464844 +198,0.004263307001175625,1.0597365625,74.48800000976563,92.2820001171875 +199,0.004302812540637595,1.07631171875,74.16999998779296,92.24600006591797 +200,0.004331470600196293,1.068165625,74.74800008789063,92.23600006347657 +201,0.004283389409205743,1.0668040625,74.44800000732423,92.1400000390625 +202,0.004252366804783898,1.05651734375,74.55799998291016,92.20200001464843 +203,0.0042564044041293,1.05011671875,74.94400014160156,92.37599998779297 +204,0.004381564678624272,1.05434125,74.53800008544921,92.4000000390625 +205,0.004182761285587081,1.0424640625,75.0139999584961,92.45999990966797 +206,0.0041452854805226836,1.04821296875,74.94200003417969,92.46599993652343 +207,0.004292250378057361,1.03995625,75.07400000732422,92.5780001171875 +208,0.0043207124108448625,1.0382290625,75.07200006347657,92.5040000390625 +209,0.004227171286142298,1.03806,75.21599998291016,92.53199990966797 +210,0.0041973576382068655,1.0244478125,75.29000013671875,92.69000001464843 +211,0.00421309527674956,1.02601953125,75.39600011230469,92.7799999609375 +212,0.004208202340773174,1.02174296875,75.61000000732422,92.70400006591797 +213,0.0041728746977501684,1.01000328125,75.73200010742187,92.8560000366211 +214,0.004136975771481437,1.01413421875,75.54799995605468,92.78600009277343 +215,0.004179607378318906,1.002690625,75.89000006103515,92.90000001220703 +216,0.00414369604550302,1.0029115625,75.88200013427735,92.89199998535156 +217,0.004148620879277587,1.00303890625,76.00599998291015,92.96800001220703 +218,0.004154796828515828,0.99850484375,76.10200006347657,93.0359999609375 +219,0.004153257684915194,0.98883125,76.31000005371094,93.15800008789063 +220,0.004109900195284614,0.98340609375,76.32799989990234,93.21600006347656 +221,0.004117257976239281,0.9853646875,76.24999998046874,93.25400011474609 +222,0.00406772896115269,0.9858534375,76.40599997802734,93.22599993652344 +223,0.004105778061784804,0.98020984375,76.40200008544922,93.31200001220704 +224,0.004056159257223564,0.97794484375,76.45200003173828,93.29400006103516 +225,0.0040910857164167935,0.97563390625,76.60599998046875,93.2780000390625 +226,0.0040714993707037395,0.96293984375,76.86800010742188,93.49800008789063 +227,0.004126675110975546,0.96799328125,76.75399998046875,93.48799990722657 +228,0.00401083128859422,0.9581128125,76.94400013427735,93.48600008544922 +229,0.004122888103925756,0.95719859375,77.01600005859375,93.54200001220703 +230,0.004067675509889211,0.95604203125,77.01200008544922,93.5659999609375 +231,0.004003024942773793,0.95360828125,77.04200000488281,93.52799998291016 +232,0.004023495795471328,0.94518703125,77.21000000488282,93.67000014160156 +233,0.004062957230157086,0.95060203125,77.31199994873047,93.54399998535156 +234,0.003994461913992252,0.941239140625,77.35600005371094,93.75000000976563 +235,0.004072328746717956,0.9401640625,77.37200008300782,93.79200001220703 +236,0.0040370105499667784,0.93594515625,77.57600006103516,93.83400006347657 
+237,0.004102625684546573,0.925669375,77.69400013183594,93.85800008789063 +238,0.00400754701279636,0.92673109375,77.79000003173829,93.92000001220703 +239,0.003931260301864573,0.91868796875,77.82599993408203,93.91800019287109 +240,0.0040078962587618405,0.9184509375,77.88200008300781,94.08200009033203 +241,0.003989631010751639,0.9131821875,77.93200005859374,94.0940000390625 +242,0.0039024739027289407,0.910870078125,78.14800002929688,94.0840000390625 +243,0.003919692660149719,0.90910875,78.17200005615234,94.09400001220703 +244,0.003983062409263637,0.90479765625,78.35199989990234,94.1960000366211 +245,0.0039435730515314,0.90500796875,78.24200010253907,94.14600000976563 +246,0.003882105811499059,0.8973740625,78.44200002929688,94.28399992919923 +247,0.003944338681841535,0.8931828125,78.48599995361329,94.34200003662109 +248,0.0038875251609299865,0.89339703125,78.55400003173828,94.31999998535156 +249,0.003919344611598977,0.8847475,78.82599997802734,94.48600011474609 +250,0.003974659834057093,0.88667890625,78.75400015625,94.32399998291015 +251,0.0038648887337850673,0.88792609375,78.61599985107422,94.39000009033204 +252,0.003882348088414541,0.883708671875,78.84999997314453,94.4559999584961 +253,0.003899832156353763,0.879748046875,78.92999987548828,94.49600006103516 +254,0.003878143765697522,0.88182046875,78.83600008056641,94.39199998535156 +255,0.003834759500542922,0.87118734375,78.98400013427734,94.57800013916015 +256,0.003816165545556162,0.870868359375,79.09200005371093,94.67200011230469 +257,0.0038604919432795475,0.865966484375,79.19800002685547,94.68200011474609 +258,0.0037719939303185257,0.86741421875,79.24200005371094,94.66000011230469 +259,0.0038805989482040915,0.85983828125,79.34400005615234,94.6920000366211 +260,0.0038489396683871746,0.86072609375,79.45199995117187,94.76999998291015 +261,0.0037831025464194162,0.8598553125,79.38800010498046,94.7120001123047 +262,0.003775066928938031,0.850538515625,79.54200013427734,94.90200011230469 +263,0.00376021843736193,0.852878203125,79.61400018310547,94.87000019042969 +264,0.003743130630547447,0.850947734375,79.70000005371094,94.85000006103516 +265,0.0037950902645077023,0.848290625,79.73400005126953,94.89799998291015 +266,0.003691152353504939,0.842548671875,79.798000078125,94.94399993164062 +267,0.003776852207790528,0.845507109375,79.84200010498047,94.96200000976563 +268,0.0036705253878608346,0.84085453125,79.87599989746094,95.00599995849609 +269,0.0037077388260513544,0.838556796875,79.99599997314454,94.95600000976563 +270,0.003797434370166489,0.833672109375,80.11799997558593,95.03399998291016 +271,0.0037534147767083986,0.841295234375,79.98400010498047,95.01400008544923 +272,0.0037938125730891314,0.833252109375,80.12200005371093,95.09199998291015 +273,0.003618617625241833,0.833057890625,80.27400005371094,95.01599993164062 +274,0.0036791543077145305,0.830380390625,80.18800008056641,95.1000001147461 +275,0.0036934222360806806,0.827929140625,80.2340000024414,95.06199995849609 +276,0.0037091131920793225,0.824863046875,80.30000010498047,95.14200003417969 +277,0.0037969240552878807,0.822872421875,80.36600005371093,95.18600006103516 +278,0.0036822593870705794,0.821608828125,80.4,95.14200003417969 +279,0.0037219872132741977,0.8220225,80.388,95.21999998291015 +280,0.0036821293511560987,0.815946875,80.54399997314454,95.25400006103516 +281,0.003686774555327637,0.817045625,80.58000005126954,95.26800006103515 +282,0.003641096143318074,0.817729375,80.548,95.22400003417968 +283,0.0036904641560145785,0.816061484375,80.64400005126953,95.22400000732422 
+284,0.0035623855323397686,0.81314640625,80.63799994873047,95.33999998291016 +285,0.003658884570800832,0.812442265625,80.61200005126953,95.29000000976562 +286,0.003644167977784361,0.813734765625,80.68999994873047,95.24999998291015 +287,0.0036552690102585723,0.81131984375,80.77999999755859,95.32599998291016 +288,0.0036323205567896366,0.80879640625,80.77200002441407,95.32799998291016 +289,0.003605903275976224,0.809427109375,80.7480000024414,95.32399998291015 +290,0.0037104166944378187,0.810876796875,80.73000002685546,95.35599998291016 +291,0.0036719500785693526,0.80932296875,80.79399997314454,95.36000000976563 +292,0.0036519923014566302,0.807780859375,80.81200005126954,95.37199998291015 +293,0.0036492896199758562,0.809131171875,80.795999921875,95.35800003417968 +294,0.003632878784888557,0.80717015625,80.81000005126953,95.38600011230469 +295,0.003646290262362787,0.80859359375,80.8500001538086,95.36600006103515 +296,0.0035626571126548307,0.80803984375,80.84199997558594,95.36199998291016 +297,0.0036516755519966993,0.806596015625,80.88400005126954,95.38399998291015 +298,0.003551991772837937,0.807148125,80.84000002685546,95.37600003417968 +299,0.0035453016503847073,0.80710328125,80.86600005126954,95.33199998291016 +300,0.003604882995464972,0.807660546875,80.826000078125,95.37800003417969 +301,0.003637207838307534,0.807074296875,80.830000078125,95.36200003417969 +302,0.003711703832128218,0.807204296875,80.91999997314453,95.37599995605468 +303,0.003739510030884828,0.806617734375,80.88800002441407,95.38000003417969 +304,0.00353996387483286,0.807367421875,80.876,95.37800003417969 +305,0.0035504487376394017,0.807177421875,80.884000078125,95.37399998291015 +306,0.00364223014496799,0.80659125,80.90399994873047,95.35600003417969 +307,0.00370254267805389,0.8077678125,80.814000078125,95.36400003417968 +308,0.003639972086862794,0.806408125,80.862000078125,95.38200003417968 +309,0.0036769274322848234,0.80716296875,80.87200002685547,95.35400003417969 diff --git a/CV/timm/optim_factory.py b/CV/timm/optim_factory.py new file mode 100644 index 0000000..b0b9ae6 --- /dev/null +++ b/CV/timm/optim_factory.py @@ -0,0 +1,343 @@ +""" Optimizer Factory w/ Custom Weight Decay +Hacked together by / Copyright 2021 Ross Wightman +""" +import json +from itertools import islice +from typing import Optional, Callable, Tuple + +import torch +import torch.nn as nn +import torch.optim as optim + +from timm.models.helpers import group_parameters + +from timm.optim.adabelief import AdaBelief +from timm.optim.adafactor import Adafactor +from timm.optim.adahessian import Adahessian +from timm.optim.adamp import AdamP +from timm.optim.lamb import Lamb +from timm.optim.lars import Lars +from timm.optim.lookahead import Lookahead +from timm.optim.madgrad import MADGRAD +from timm.optim.nadam import Nadam +from timm.optim.nvnovograd import NvNovoGrad +from timm.optim.radam import RAdam +from timm.optim.rmsprop_tf import RMSpropTF +from timm.optim.sgdp import SGDP +from adan import Adan +from sam import SAM + +try: + from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD + has_apex = True +except ImportError: + has_apex = False + + +def param_groups_weight_decay( + model: nn.Module, + weight_decay=1e-5, + no_weight_decay_list=() +): + no_weight_decay_list = set(no_weight_decay_list) + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if param.ndim <= 1 or name.endswith(".bias") or name in no_weight_decay_list: + no_decay.append(param) + else: + 
decay.append(param) + + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + + +def _group(it, size): + it = iter(it) + return iter(lambda: tuple(islice(it, size)), ()) + + +def _layer_map(model, layers_per_group=12, num_groups=None): + def _in_head(n, hp): + if not hp: + return True + elif isinstance(hp, (tuple, list)): + return any([n.startswith(hpi) for hpi in hp]) + else: + return n.startswith(hp) + + head_prefix = getattr(model, 'pretrained_cfg', {}).get('classifier', None) + names_trunk = [] + names_head = [] + for n, _ in model.named_parameters(): + names_head.append(n) if _in_head(n, head_prefix) else names_trunk.append(n) + + # group non-head layers + num_trunk_layers = len(names_trunk) + if num_groups is not None: + layers_per_group = -(num_trunk_layers // -num_groups) + names_trunk = list(_group(names_trunk, layers_per_group)) + + num_trunk_groups = len(names_trunk) + layer_map = {n: i for i, l in enumerate(names_trunk) for n in l} + layer_map.update({n: num_trunk_groups for n in names_head}) + return layer_map + + +def param_groups_layer_decay( + model: nn.Module, + weight_decay: float = 0.05, + no_weight_decay_list: Tuple[str] = (), + layer_decay: float = .75, + end_layer_decay: Optional[float] = None, +): + """ + Parameter groups for layer-wise lr decay & weight decay + Based on BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 + """ + no_weight_decay_list = set(no_weight_decay_list) + param_group_names = {} # NOTE for debugging + param_groups = {} + + if hasattr(model, 'group_matcher'): + # FIXME interface needs more work + layer_map = group_parameters(model, model.group_matcher(coarse=False), reverse=True) + else: + # fallback + layer_map = _layer_map(model) + num_layers = max(layer_map.values()) + 1 + layer_max = num_layers - 1 + layer_scales = list(layer_decay ** (layer_max - i) for i in range(num_layers)) + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + # no decay: all 1D parameters and model specific ones + if param.ndim == 1 or name in no_weight_decay_list: + g_decay = "no_decay" + this_decay = 0. + else: + g_decay = "decay" + this_decay = weight_decay + + layer_id = layer_map.get(name, layer_max) + group_name = "layer_%d_%s" % (layer_id, g_decay) + + if group_name not in param_groups: + this_scale = layer_scales[layer_id] + param_group_names[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "param_names": [], + } + param_groups[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + + param_group_names[group_name]["param_names"].append(name) + param_groups[group_name]["params"].append(param) + + # FIXME temporary output to debug new feature + print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) + + return list(param_groups.values()) + + +def optimizer_kwargs(cfg): + """ cfg/argparse to kwargs helper + Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn. 
+ """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'layer_decay', None) is not None: + kwargs['layer_decay'] = cfg.layer_decay + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + + +def create_optimizer(args, model, filter_bias_and_bn=True): + """ Legacy optimizer factory for backwards compatibility. + NOTE: Use create_optimizer_v2 for new code. + """ + return create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=filter_bias_and_bn, + ) + + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + layer_decay: Optional[float] = None, + param_group_fn: Optional[Callable] = None, + **kwargs): + """ Create an optimizer. + + TODO currently the model is passed in and all parameters are selected for optimization. + For more general use an interface that allows selection of parameters to optimize and lr groups, one of: + * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion + * expose the parameters interface and leave it up to caller + + Args: + model_or_params (nn.Module): model containing parameters to optimize + opt: name of optimizer to create + lr: initial learning rate + weight_decay: weight decay to apply in optimizer + momentum: momentum for momentum based optimizers (others may use betas via kwargs) + filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay + **kwargs: extra optimizer specific kwargs to pass through + + Returns: + Optimizer + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + no_weight_decay = {} + if hasattr(model_or_params, 'no_weight_decay'): + no_weight_decay = model_or_params.no_weight_decay() + + if param_group_fn: + parameters = param_group_fn(model_or_params) + elif layer_decay is not None: + parameters = param_groups_layer_decay( + model_or_params, + weight_decay=weight_decay, + layer_decay=layer_decay, + no_weight_decay_list=no_weight_decay) + weight_decay = 0. + elif weight_decay and filter_bias_and_bn: + parameters = param_groups_weight_decay(model_or_params, weight_decay, no_weight_decay) + weight_decay = 0. 
+ else: + parameters = model_or_params.parameters() + else: + # iterable of parameters or param groups passed in + parameters = model_or_params + + opt_lower = opt.lower() + opt_split = opt_lower.split('_') + opt_lower = opt_split[-1] + if 'fused' in opt_lower: + assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' + + opt_args = dict(weight_decay=weight_decay, **kwargs) + if lr is not None: + opt_args.setdefault('lr', lr) + + # basic SGD & related + if opt_lower == 'sgd' or opt_lower == 'nesterov': + # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'sam': + opt_args.pop('eps', None) + optimizer = SAM(parameters, optim.SGD, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'adan': + optimizer = Adan(parameters, **opt_args) + elif opt_lower == 'momentum': + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'sgdp': + optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args) + + # adaptive + elif opt_lower == 'adam': + optimizer = optim.Adam(parameters, **opt_args) + elif opt_lower == 'adamw': + optimizer = optim.AdamW(parameters, **opt_args) + elif opt_lower == 'adamp': + optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) + elif opt_lower == 'nadam': + try: + # NOTE PyTorch >= 1.10 should have native NAdam + optimizer = optim.Nadam(parameters, **opt_args) + except AttributeError: + optimizer = Nadam(parameters, **opt_args) + elif opt_lower == 'radam': + optimizer = RAdam(parameters, **opt_args) + elif opt_lower == 'adamax': + optimizer = optim.Adamax(parameters, **opt_args) + elif opt_lower == 'adabelief': + optimizer = AdaBelief(parameters, rectify=False, **opt_args) + elif opt_lower == 'radabelief': + optimizer = AdaBelief(parameters, rectify=True, **opt_args) + elif opt_lower == 'adadelta': + optimizer = optim.Adadelta(parameters, **opt_args) + elif opt_lower == 'adagrad': + opt_args.setdefault('eps', 1e-8) + optimizer = optim.Adagrad(parameters, **opt_args) + elif opt_lower == 'adafactor': + optimizer = Adafactor(parameters, **opt_args) + elif opt_lower == 'lamb': + optimizer = Lamb(parameters, **opt_args) + elif opt_lower == 'lambc': + optimizer = Lamb(parameters, trust_clip=True, **opt_args) + elif opt_lower == 'larc': + optimizer = Lars(parameters, momentum=momentum, trust_clip=True, **opt_args) + elif opt_lower == 'lars': + optimizer = Lars(parameters, momentum=momentum, **opt_args) + elif opt_lower == 'nlarc': + optimizer = Lars(parameters, momentum=momentum, trust_clip=True, nesterov=True, **opt_args) + elif opt_lower == 'nlars': + optimizer = Lars(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'madgrad': + optimizer = MADGRAD(parameters, momentum=momentum, **opt_args) + elif opt_lower == 'madgradw': + optimizer = MADGRAD(parameters, momentum=momentum, decoupled_decay=True, **opt_args) + elif opt_lower == 'novograd' or opt_lower == 'nvnovograd': + optimizer = NvNovoGrad(parameters, **opt_args) + elif opt_lower == 'rmsprop': + optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args) + elif opt_lower == 'rmsproptf': + optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args) + + # second order + elif opt_lower == 'adahessian': + optimizer = Adahessian(parameters, **opt_args) + + # 
NVIDIA fused optimizers, require APEX to be installed + elif opt_lower == 'fusedsgd': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'fusedmomentum': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'fusedadam': + optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) + elif opt_lower == 'fusedadamw': + optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) + elif opt_lower == 'fusedlamb': + optimizer = FusedLAMB(parameters, **opt_args) + elif opt_lower == 'fusednovograd': + opt_args.setdefault('betas', (0.95, 0.98)) + optimizer = FusedNovoGrad(parameters, **opt_args) + + else: + assert False and "Invalid optimizer" + raise ValueError + + if len(opt_split) > 1: + if opt_split[0] == 'lookahead': + optimizer = Lookahead(optimizer) + + return optimizer diff --git a/CV/timm/sam.py b/CV/timm/sam.py new file mode 100644 index 0000000..61ae5c8 --- /dev/null +++ b/CV/timm/sam.py @@ -0,0 +1,62 @@ +import torch + + +class SAM(torch.optim.Optimizer): + def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs): + assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}" + + defaults = dict(rho=rho, adaptive=adaptive, **kwargs) + super(SAM, self).__init__(params, defaults) + + self.base_optimizer = base_optimizer(self.param_groups, **kwargs) + self.param_groups = self.base_optimizer.param_groups + + @torch.no_grad() + def first_step(self, zero_grad=False): + grad_norm = self._grad_norm() + for group in self.param_groups: + scale = group["rho"] / (grad_norm + 1e-12) + + for p in group["params"]: + if p.grad is None: continue + self.state[p]["old_p"] = p.data.clone() + e_w = (torch.pow(p, 2) if group["adaptive"] else 1.0) * p.grad * scale.to(p) + p.add_(e_w) # climb to the local maximum "w + e(w)" + + if zero_grad: self.zero_grad() + + @torch.no_grad() + def second_step(self, zero_grad=False): + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: continue + p.data = self.state[p]["old_p"] # get back to "w" from "w + e(w)" + + self.base_optimizer.step() # do the actual "sharpness-aware" update + + if zero_grad: self.zero_grad() + + @torch.no_grad() + def step(self, closure=None): + assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided" + closure = torch.enable_grad()(closure) # the closure should do a full forward-backward pass + + self.first_step(zero_grad=True) + closure() + self.second_step() + + def _grad_norm(self): + shared_device = self.param_groups[0]["params"][0].device # put everything on the same device, in case of model parallelism + norm = torch.norm( + torch.stack([ + ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device) + for group in self.param_groups for p in group["params"] + if p.grad is not None + ]), + p=2 + ) + return norm + + def load_state_dict(self, state_dict): + super().load_state_dict(state_dict) + self.base_optimizer.param_groups = self.param_groups \ No newline at end of file diff --git a/CV/timm/supervised.md b/CV/timm/supervised.md new file mode 100644 index 0000000..e266905 --- /dev/null +++ b/CV/timm/supervised.md @@ -0,0 +1,168 @@ +# Training recipes + +We provide the specific commonds and hyper-parameters for ViTs, ResNets and ConvNexts in this recipe. 
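+
+All of the recipes below select Adan through `optim_factory.py` (`--opt Adan` together with `--opt-betas`, `--opt-eps`, and `--max-grad-norm`). As a quick reference, the following is a minimal sketch, not part of the recipes themselves, of how the same optimizer can be built programmatically with `create_optimizer_v2`; it assumes the files in `CV/timm` are importable and `timm` is installed, the tiny placeholder model and the hyper-parameter values (mirroring the ViT commands below) are for illustration only, and extra keyword arguments are simply forwarded to the `Adan` constructor in `adan.py`.
+
+```python
+# Minimal sketch: build Adan through the factory in optim_factory.py.
+# The placeholder model and values are illustrative, not a recommended recipe.
+import torch.nn as nn
+
+from optim_factory import create_optimizer_v2
+
+model = nn.Linear(10, 2)                # placeholder; any nn.Module works
+optimizer = create_optimizer_v2(
+    model,
+    opt='adan',                         # dispatches to Adan(parameters, **opt_args)
+    lr=1.5e-2,
+    weight_decay=0.02,
+    filter_bias_and_bn=True,            # 1-D params (bias/norm) get weight_decay=0
+    betas=(0.98, 0.92, 0.99),           # forwarded to Adan via **kwargs
+    eps=1e-8,
+    max_grad_norm=5.0,                  # 0.0 disables Adan's built-in clipping
+    no_prox=False,                      # True applies the weight decay like AdamW
+)
+```
+
+`train.py` takes the same route: `--bias-decay` toggles `filter_bias_and_bn`, and `--max-grad-norm` / `--no-prox` are collected into `opt_args` before the factory is called.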
+
+
+
+## Training of ViT
+
+### 1) Training with Setting I
+
+This is a prevalent setting for training [ResNets](https://arxiv.org/abs/2110.00476). To train ViT-Small, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model deit_small_patch16_224 \
+    --sched cosine -j 10 \
+    --epochs ${EPOCH} --weight-decay 0.02 \
+    --opt Adan \
+    --lr 1.5e-2 --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 0.0 \
+    --warmup-lr 1e-8 --min-lr 1.0e-08 \
+    -b 256 --amp \
+    --aug-repeats 0 \
+    --warmup-epochs 60 \
+    --aa rand-m7-mstd0.5-inc1 \
+    --smoothing 0.1 \
+    --remode pixel \
+    --reprob 0.0 \
+    --bce \
+    --drop 0.0 --drop-path 0.05 \
+    --mixup 0.2 --cutmix 1.0 \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+
+After training, this command should give the following results. Note that this setting does not seem to improve the results of ViT-Base beyond those obtained with training Setting II (see below).
+
+|           | 150 Epoch | 300 Epoch |
+| :-------: | :-------: | :-------: |
+| ViT-Small |   80.1    |   81.1    |
+| download  | [config](./exp_results/ViT/small/args_vit-s_150-I.yaml)/[log](./exp_results/ViT/small/summary_vit-s_150-I.csv)/model | [config](./exp_results/ViT/small/args_vit-s_300-I.yaml)/[log](./exp_results/ViT/small/summary_vit-s_300-I.csv)/model |
+
+
+
+
+
+### 2) Training with Setting II
+
+This is the official setting used in [DeiT](https://github.com/facebookresearch/deit). Note that, without distillation, DeiTs and ViTs are the same models. To train ViT-Small, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model ${MODEL_NAME} \
+    --sched cosine -j 10 \
+    --epochs ${EPOCH} --weight-decay .02 \
+    --opt Adan \
+    --lr 1.5e-2 --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 5.0 \
+    --warmup-lr 1e-8 --min-lr 1e-5 \
+    -b 256 --amp \
+    --aug-repeats ${REP} \
+    --warmup-epochs 60 \
+    --aa ${AUG} \
+    --smoothing 0.1 \
+    --remode pixel \
+    --reprob 0.25 \
+    --drop 0.0 --drop-path 0.1 \
+    --mixup 0.8 --cutmix 1.0 \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+There are some differences between the hyper-parameters for ViT-Base and ViT-Small. `--bce` means using the binary cross-entropy loss.
+
+ | | MODEL_NAME | REP | AUG | BCE | Bias-Decay |
+ | --------- | :--------------------: | :--: | :------------------: | :---: | :--------: |
+ | ViT-Small | deit_small_patch16_224 | 0 | rand-m7-mstd0.5-inc1 | True | False |
+ | ViT-Base | deit_base_patch16_224 | 3 | rand-m9-mstd0.5-inc1 | False | True |
+
+After training, you should expect the following results. The results are sensitive to `warmup-lr` and `min-lr`.
+
+|           | 150 Epoch | 300 Epoch |
+| :-------: | :-------: | :-------: |
+| ViT-Small |   79.6    |   80.9    |
+| download  | [config](./exp_results/ViT/small/args_vit-s_150.yaml)/[log](./exp_results/ViT/small/summary_vit-s_150.csv)/model | [config](./exp_results/ViT/small/args_vit-s_300.yaml)/[log](./exp_results/ViT/small/summary_vit-s_300.csv)/model |
+| ViT-Base  |   81.7    |   82.3    |
+| download  | [config](./exp_results/ViT/base/args_vit-B_150.yaml)/[log](./exp_results/ViT/base/summary_vit-B_150.csv)/model | [config](./exp_results/ViT/base/args_vit-B_300.yaml)/[log](./exp_results/ViT/base/summary_vit-B_300.csv)/model |
+
+
+
+## ResNet
+This is the default setting used to train [ResNets](https://arxiv.org/abs/2110.00476). To train ResNet-50, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model resnet50 \
+    --sched cosine -j 8 \
+    --epochs ${EPOCH} --weight-decay .02 \
+    --opt Adan \
+    --lr ${LR} --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 5.0 \
+    --warmup-lr 1e-9 --min-lr 1.0e-05 --bias-decay \
+    -b 256 --amp \
+    --aug-repeats 0 \
+    --warmup-epochs 60 \
+    --aa rand-m7-mstd0.5-inc1 \
+    --smoothing 0.0 \
+    --remode pixel \
+    --crop-pct 0.95 \
+    --reprob 0.0 \
+    --bce \
+    --drop 0.0 --drop-path 0.05 \
+    --mixup 0.1 --cutmix 1.0 \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+
+When training for different numbers of epochs, we use slightly different learning rates, namely `LR = 3e-2` for `EPOCH = 100` and `LR = 1.5e-2` for `EPOCH = 200` and `300`. After training, you should get the following results:
+
+|           | 100 Epoch | 200 Epoch | 300 Epoch |
+| :-------: | :-------: | :-------: | :-------: |
+| ResNet-50 |   78.1    |   79.7    |   80.2    |
+| download  | [config](./exp_results/ResNet/Res50/args_res50_100.yaml)/[log](./exp_results/ResNet/Res50/summary_res50_100.csv)/model | [config](./exp_results/ResNet/Res50/args_res50_200.yaml)/[log](./exp_results/ResNet/Res50/summary_res50_200.csv)/model | [config](./exp_results/ResNet/Res50/args_res50_300.yaml)/[log](./exp_results/ResNet/Res50/summary_res50_300.csv)/model |
+
+
+
+## ConvNext
+
+This is the default setting for ConvNext-Tiny. To train it, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model convnext_tiny_hnf \
+    --sched cosine -j 8 \
+    --epochs ${EPOCH} --weight-decay .02 \
+    --opt Adan \
+    --lr 1.6e-2 --opt-betas 0.98 0.92 0.90 \
+    --opt-eps 1e-8 --max-grad-norm 0.0 \
+    --warmup-lr 1e-9 --min-lr 1.0e-05 --bias-decay \
+    -b 256 --amp \
+    --aug-repeats 0 \
+    --warmup-epochs 150 \
+    --aa rand-m7-mstd0.5-inc1 \
+    --smoothing 0.1 \
+    --remode pixel \
+    --reprob 0.25 \
+    --drop 0.0 --drop-path 0.1 \
+    --mixup 0.8 --cutmix 1.0 \
+    --model-ema \
+    --train-interpolation random \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+
+For this training, the performance is NOT sensitive to some hyper-parameters, such as `warmup-epochs` and `lr`, but whether `model-ema` is used plays a key role.
+
+You can use the config linked below to train ConvNext-Tiny for 150 epochs, in which we do not use `model-ema` (a short sketch of what this flag does follows).
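+
+The sketch below, with a placeholder model and loop and assuming timm's `ModelEmaV2`, shows the mechanism behind `--model-ema` in `train.py`: an EMA copy of the weights is updated after every optimizer step, and validation is run on that copy, which is why the flag matters so much here.
+
+```python
+# Minimal sketch of the --model-ema mechanism (placeholder model and data).
+import torch
+import torch.nn as nn
+from timm.utils import ModelEmaV2
+
+model = nn.Linear(10, 2)
+ema = ModelEmaV2(model, decay=0.9998)       # matches the --model-ema-decay default
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+for _ in range(3):                          # stand-in for the training loop
+    loss = model(torch.randn(4, 10)).sum()
+    loss.backward()
+    optimizer.step()
+    optimizer.zero_grad()
+    ema.update(model)                       # EMA weights track the live weights
+
+eval_model = ema.module                     # validation is run on the EMA copy
+```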
+ +This results should be: + +| | 150 Epoch | 300 Epoch | +| :-----------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| ConvNext-tiny | 81.7 | 82.4 | +| download | [config](./exp_results/ConvNext/small/args_cvnext_150.yaml)/[log](./exp_results/ConvNext/small/summary_cvnext_150.csv)/model | [config](./exp_results/ConvNext/small/args_cvnext_300.yaml)/[log](./exp_results/ConvNext/small/summary_cvnext_300.csv)/model | + diff --git a/CV/timm/train.py b/CV/timm/train.py new file mode 100644 index 0000000..975f284 --- /dev/null +++ b/CV/timm/train.py @@ -0,0 +1,830 @@ +#!/usr/bin/env python3 +""" ImageNet Training Script + +This is intended to be a lean and easily modifiable ImageNet training script that reproduces ImageNet +training results with some of the latest networks and training techniques. It favours canonical PyTorch +and standard Python style over trying to be able to 'do it all.' That said, it offers quite a few speed +and training result improvements over the usual PyTorch example scripts. Repurpose as you see fit. + +This script was started from an early version of the PyTorch ImageNet example +(https://github.com/pytorch/examples/tree/master/imagenet) + +NVIDIA CUDA specific speedups adopted from NVIDIA Apex examples +(https://github.com/NVIDIA/apex/tree/master/examples/imagenet) + +Hacked together by / Copyright 2020 Ross Wightman (https://github.com/rwightman) +""" +import argparse +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime + +import torch +import torch.nn as nn +import torchvision.utils +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from timm.data import create_dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import create_model, safe_model_name, resume_checkpoint, load_checkpoint,\ + convert_splitbn_model, model_parameters +from timm.utils import * +from timm.loss import * +#from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from optim_factory import create_optimizer +from timm.utils import ApexScaler, NativeScaler +#import timm.optim.optim_factory as optim_factory + +try: + from apex import amp + from apex.parallel import DistributedDataParallel as ApexDDP + from apex.parallel import convert_syncbn_model + has_apex = True +except ImportError: + has_apex = False + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +try: + import wandb + has_wandb = True +except ImportError: + has_wandb = False + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') + +# Dataset / Model parameters +parser.add_argument('--data-dir', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if 
empty)') +parser.add_argument('--train-split', metavar='NAME', default='train', + help='dataset train split (default: train)') +parser.add_argument('--val-split', metavar='NAME', default='validation', + help='dataset validation split (default: validation)') +parser.add_argument('--model', default='resnet50', type=str, metavar='MODEL', + help='Name of model to train (default: "resnet50"') +parser.add_argument('--pretrained', action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') +parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH', + help='Initialize model from this checkpoint (default: none)') +parser.add_argument('--resume', default=None, type=str, metavar='PATH', + help='Resume full model and optimizer state from checkpoint (default: none)') +parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') +parser.add_argument('--num-classes', type=int, default=None, metavar='N', + help='number of label classes (Model default if None)') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') +parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. --input-size 3 224 224), uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('-b', '--batch-size', type=int, default=128, metavar='N', + help='input batch size for training (default: 128)') +parser.add_argument('-vb', '--validation-batch-size', type=int, default=None, metavar='N', + help='validation batch size override (default: None)') + +# Optimizer parameters +parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') +parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight-decay', type=float, default=2e-5, + help='weight decay (default: 2e-5)') +parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') +parser.add_argument('--clip-mode', type=str, default='norm', + help='Gradient clipping mode. 
One of ("norm", "value", "agc")') +parser.add_argument('--max-grad-norm', type=float, default=0.0, + help='Max grad norm (same as clip gradient norm, default: 0.0, no clipping)') +parser.add_argument('--bias-decay', action='store_true', default=False, + help='Perform the weight decay on bias term (default=False)') +parser.add_argument('--no-prox', action='store_true', default=False, + help='Perform the weight decay update like AdamW (default=False)') + + +# Learning rate schedule parameters +parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') +parser.add_argument('--lr', type=float, default=0.05, metavar='LR', + help='learning rate (default: 0.05)') +parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') +parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') +parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') +parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') +parser.add_argument('--lr-cycle-decay', type=float, default=0.5, metavar='MULT', + help='amount to decay each learning rate cycle (default: 0.5)') +parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit, cycles enabled if > 1') +parser.add_argument('--lr-k-decay', type=float, default=1.0, + help='learning rate k-decay for cosine/poly (default: 1.0)') +parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') +parser.add_argument('--min-lr', type=float, default=1e-6, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') +parser.add_argument('--epochs', type=int, default=300, metavar='N', + help='number of epochs to train (default: 300)') +parser.add_argument('--epoch-repeats', type=float, default=0., metavar='N', + help='epoch repeat multiplier (number of times to repeat dataset epoch per train epoch).') +parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('--decay-epochs', type=float, default=100, metavar='N', + help='epoch interval to decay LR') +parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') +parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') +parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') +parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + +# Augmentation & regularization parameters +parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') +parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') +parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') +parser.add_argument('--hflip', 
type=float, default=0.5, + help='Horizontal flip training aug probability') +parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') +parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') +parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), +parser.add_argument('--aug-repeats', type=int, default=0, + help='Number of augmentation repetitions (distributed training only) (default: 0)') +parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') +parser.add_argument('--jsd-loss', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') +parser.add_argument('--bce-loss', action='store_true', default=False, + help='Enable BCE loss w/ Mixup/CutMix use.') +parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') +parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') +parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') +parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') +parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') +parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') +parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') +parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='reduce', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.') +parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + +# Misc +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=50, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--recovery-interval', type=int, default=0, metavar='N', + help='how many batches to wait before writing recovery checkpoint') +parser.add_argument('--checkpoint-hist', type=int, default=2, metavar='N', + help='number of checkpoints to keep (default: 10)') +parser.add_argument('-j', '--workers', type=int, default=4, metavar='N', + help='how many training processes to use (default: 4)') +parser.add_argument('--save-images', action='store_true', default=False, + help='save images of input bathes every log interval for debugging') +parser.add_argument('--amp', action='store_true', default=False, + help='use NVIDIA Apex AMP or Native AMP for mixed precision training') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--output', default='', type=str, metavar='PATH', + help='path to output folder (default: none, current dir)') +parser.add_argument('--experiment', default='', type=str, metavar='NAME', + help='name of train experiment, name of sub-folder for output') +parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC', + help='Best metric (default: "top1"') +parser.add_argument('--tta', type=int, default=0, metavar='N', + help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)') +parser.add_argument("--local_rank", default=0, type=int) +parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False, + help='use the multi-epochs-loader to save time at the beginning of every epoch') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') +parser.add_argument('--log-wandb', action='store_true', default=False, + help='log training and validation metrics to wandb') + + +def _parse_args(): + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, 'r') as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. 
+ args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + return args, args_text + + +def main(): + setup_default_logging() + args, args_text = _parse_args() + + if args.log_wandb: + if has_wandb: + wandb.init(project=args.experiment, config=args) + else: + _logger.warning("You've requested to log metrics to wandb but package not found. " + "Metrics not being logged to wandb, try `pip install wandb`") + + args.prefetcher = not args.no_prefetcher + args.distributed = False + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) > 1 + args.device = 'cuda:0' + args.world_size = 1 + args.rank = 0 # global rank + if args.distributed: + args.device = 'cuda:%d' % args.local_rank + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + args.world_size = torch.distributed.get_world_size() + args.rank = torch.distributed.get_rank() + _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' + % (args.rank, args.world_size)) + else: + _logger.info('Training with a single process on 1 GPUs.') + assert args.rank >= 0 + + # resolve AMP arguments based on PyTorch / Apex availability + use_amp = None + if args.amp: + # `--amp` chooses native amp before apex (APEX ver not actively maintained) + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + if args.apex_amp and has_apex: + use_amp = 'apex' + elif args.native_amp and has_native_amp: + use_amp = 'native' + elif args.apex_amp or args.native_amp: + _logger.warning("Neither APEX or native Torch AMP is available, using float32. " + "Install NVIDA apex or upgrade to PyTorch 1.6") + + random_seed(args.seed, args.rank) + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + scriptable=args.torchscript, + checkpoint_path=args.initial_checkpoint) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
+ args.num_classes = model.num_classes # FIXME handle model default vs config num_classes more elegantly + + if args.local_rank == 0: + _logger.info( + f'Model {safe_model_name(args.model)} created, param count:{sum([m.numel() for m in model.parameters()])}') + + data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # enable split bn (separate bn stats per batch-portion) + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + # move model to GPU, enable channels last layout if set + model.cuda() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + # setup synchronized BatchNorm for distributed training + if args.distributed and args.sync_bn: + assert not args.split_bn + if has_apex and use_amp == 'apex': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + + if args.torchscript: + assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model' + assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model' + model = torch.jit.script(model) + + opt_lower = args.opt.lower() + if opt_lower == 'adan': + args.opt_args = {'max_grad_norm': args.max_grad_norm, 'no_prox': args.no_prox} + optimizer = create_optimizer(args, model, filter_bias_and_bn = not args.bias_decay) + print(optimizer) + + + # setup automatic mixed-precision (AMP) loss scaling and op casting + amp_autocast = suppress # do nothing + loss_scaler = None + if use_amp == 'apex': + model, optimizer = amp.initialize(model, optimizer, opt_level='O1') + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + elif use_amp == 'native': + amp_autocast = torch.cuda.amp.autocast + loss_scaler = NativeScaler() + if args.local_rank == 0: + _logger.info('Using native Torch AMP. Training in mixed precision.') + else: + if args.local_rank == 0: + _logger.info('AMP not enabled. 
Training in float32.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.experiment: + output_dir = get_outdir(args.output if args.output else './output/train', args.experiment) + resume_path = os.path.join(output_dir, "last.pth.tar") + print(resume_path, os.path.exists(resume_path)) + if os.path.exists(resume_path) and not args.resume: args.resume = resume_path + + + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + # setup exponential moving average of model weights, SWA could be used here too + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEmaV2( + model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else None) + if args.resume: + load_checkpoint(model_ema.module, args.resume, use_ema=True) + + # setup distributed training + if args.distributed: + if has_apex and use_amp == 'apex': + # Apex DDP preferred unless native amp is activated + if args.local_rank == 0: + _logger.info("Using NVIDIA APEX DistributedDataParallel.") + model = ApexDDP(model, delay_allreduce=True) + else: + if args.local_rank == 0: + _logger.info("Using native Torch DistributedDataParallel.") + model = NativeDDP(model, device_ids=[args.local_rank]) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + # setup learning rate schedule and starting epoch + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + # create the train and eval datasets + dataset_train = create_dataset( + args.dataset, + root=args.data_dir, split=args.train_split, is_training=True, + batch_size=args.batch_size, repeats=args.epoch_repeats) + dataset_eval = create_dataset( + args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size) + + # setup mixup / cutmix + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + # wrap dataset in AugMix helper + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + # create data loaders w/ augmentation pipeiine + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_repeats=args.aug_repeats, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + loader_eval = create_loader( + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size or args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + # setup loss function + if args.jsd_loss: + assert num_aug_splits > 1 # JSD only valid with aug splits set + train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing) + elif mixup_active: + # smoothing is handled with mixup target transform which outputs sparse, soft targets + if args.bce_loss: + train_loss_fn = nn.BCEWithLogitsLoss() + else: + train_loss_fn = SoftTargetCrossEntropy() + elif args.smoothing: + if args.bce_loss: + train_loss_fn = BinaryCrossEntropy(smoothing=args.smoothing) + else: + train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + train_loss_fn = nn.CrossEntropyLoss() + train_loss_fn = train_loss_fn.cuda() + validate_loss_fn = nn.CrossEntropyLoss().cuda() + + # setup checkpoint saver and eval metric tracking + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + saver = None + output_dir = None + if args.rank == 0: + if args.experiment: + exp_name = args.experiment + else: + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + safe_model_name(args.model), + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(args.output if args.output else './output/train', exp_name) + decreasing = True if eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist) + with 
open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: + for epoch in range(start_epoch, num_epochs): + if args.distributed and hasattr(loader_train.sampler, 'set_epoch'): + loader_train.sampler.set_epoch(epoch) + + train_metrics = train_one_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.module, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + if output_dir is not None: + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None, log_wandb=args.log_wandb and has_wandb) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_one_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir=None, amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * len(loader) + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.cuda(), target.cuda() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, + clip_grad=args.clip_grad, clip_mode=args.clip_mode, + parameters=model_parameters(model, exclude_head='agc' in args.clip_mode), + create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + dispatch_clip_grad( + model_parameters(model, exclude_head='agc' in args.clip_mode), + value=args.clip_grad, mode=args.clip_mode) + optimizer.step() + + if 
model_ema is not None: + model_ema.update(model) + + torch.cuda.synchronize() + num_updates += 1 + batch_time_m.update(time.time() - end) + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:#.4g} ({loss.avg:#.3g}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. * batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.cuda() + target = target.cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.cuda.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if args.local_rank == 0 and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', 
top5_m.avg)])
+
+    return metrics
+
+
+if __name__ == '__main__':
+    main()
diff --git a/NLP/BERT/README.md b/NLP/BERT/README.md
new file mode 100644
index 0000000..7e8d3d9
--- /dev/null
+++ b/NLP/BERT/README.md
@@ -0,0 +1,213 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+
+
+## Installation of Fairseq
+
+Our experiments are based on the repo [Fairseq](https://github.com/facebookresearch/fairseq). For the requirements and installation of [Fairseq](https://github.com/facebookresearch/fairseq) and Apex, please refer to that repo.
+
+
+
+## Environment
+
+Our experiments for this task are based on the following package versions.
+
+```python
+torch.__version__ = '1.10.1+cu111'
+torchvision.__version__ = '0.11.2+cu111'
+torchaudio.__version__ = '0.10.1+cu111'
+fairseq.__version__ = '0.12.2'
+```
+
+If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:fairseq](https://hub.docker.com/repository/docker/xyxie/adan-image).
+
+
+
+## Usage of Adan in Fairseq
+
+### One step to use Adan
+
+Please first put the file [`adan.py`](./adan.py) into the directory `path/to/fairseq/fairseq/optim`. Then you can choose Adan as the optimizer in the config file. See the following example for pre-training:
+
+```yaml
+optimizer:
+  _name: adan
+  weight_decay: 0.02
+  adan_betas: (0.98,0.92,0.99)
+  adan_eps: 1e-08
+```
+
+
+
+## Pretraining
+
+The following steps are modified from [Fairseq-Roberta](https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.pretraining.md). For completeness, we list the key steps here.
+
+
+### 1) Preprocess the data
+
+Data should be preprocessed following the [language modeling format](https://github.com/facebookresearch/fairseq/tree/main/examples/language_model). That is, each document should be separated by an empty line (only needed with `--sample-break-mode complete_doc`), and all lines should be concatenated as a 1D text stream during training.
+
+
+
+In the following steps, we use the [Bookcorpus dataset](https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz) and [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:Database_download) to demonstrate how to preprocess raw text data with the GPT-2 BPE.
+
+#### i) Download the dataset:
+
+```bash
+wget https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz
+tar -zxvf books1.tar.gz -C ./bert-corpus/
+```
+
+```python
+# pip install datasets
+from datasets import load_dataset
+
+dataset = load_dataset("wikipedia", "20220301.en")
+```
+
+#### ii) Generate raw data:
+
+ - For the Wikipedia dataset, read each line of the JSON-lines file, replace the `\n` in the text field with a space, and write the line (with a trailing `\n`) to a new file `all_data.raw`.
+
+ - For the BookCorpus dataset, read the contents of each book, replace each `\n` with a space, and write the whole book as a single line in `all_data.raw`, terminated by `\n`.
+
+ - Split `all_data.raw` into `wiki.train.raw` and `wiki.valid.raw` with a 99:1 ratio, and set `wiki.test.raw = wiki.valid.raw` for compatibility with fairseq; these names match the encoding step in iii). A minimal sketch of these steps follows below.
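+
+Below is a minimal sketch of step ii) (our illustration, not part of the original pipeline): the BookCorpus glob path and the uniform shuffle before the 99:1 split are assumptions, and the outputs use the `train`/`valid`/`test` naming expected by step iii).
+
+```python
+import glob
+import random
+
+from datasets import load_dataset
+
+out_dir = "bert-corpus"
+wiki = load_dataset("wikipedia", "20220301.en", split="train")
+with open(f"{out_dir}/all_data.raw", "w") as out:
+    # Wikipedia: one article per line, newlines replaced by spaces.
+    for article in wiki:
+        out.write(article["text"].replace("\n", " ") + "\n")
+    # BookCorpus: one book per line (adjust the glob to wherever books1.tar.gz unpacked).
+    for book in sorted(glob.glob(f"{out_dir}/books1/epubtxt/*.txt")):
+        with open(book, errors="ignore") as f:
+            out.write(f.read().replace("\n", " ") + "\n")
+
+# 99:1 train/valid split; the test split simply reuses the valid split.
+with open(f"{out_dir}/all_data.raw") as f:
+    lines = f.readlines()
+random.seed(0)
+random.shuffle(lines)
+cut = int(len(lines) * 0.99)
+for name, part in {"train": lines[:cut], "valid": lines[cut:], "test": lines[cut:]}.items():
+    with open(f"{out_dir}/wiki.{name}.raw", "w") as f:
+        f.writelines(part)
+```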
+
+
+
+#### iii) Encode data with the GPT-2 BPE:
+
+```bash
+mkdir -p gpt2_bpe
+wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
+wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
+for SPLIT in train valid test; do \
+    python -m examples.roberta.multiprocessing_bpe_encoder \
+        --encoder-json gpt2_bpe/encoder.json \
+        --vocab-bpe gpt2_bpe/vocab.bpe \
+        --inputs bert-corpus/wiki.${SPLIT}.raw \
+        --outputs bert-corpus/wiki.${SPLIT}.bpe \
+        --keep-empty \
+        --workers 60; \
+done
+```
+
+
+
+#### iv) Binarize the data using the GPT-2 fairseq dictionary:
+
+```bash
+wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
+fairseq-preprocess \
+    --only-source \
+    --srcdict gpt2_bpe/dict.txt \
+    --trainpref bert-corpus/wiki.train.bpe \
+    --validpref bert-corpus/wiki.valid.bpe \
+    --testpref bert-corpus/wiki.test.bpe \
+    --destdir data-bin/bert-corpus \
+    --workers 60
+```
+
+
+
+### 2) Train BERT base
+
+Put the provided [config files](./config/pretraining) into the directory `path/to/fairseq/examples/roberta/config/pretraining`.
+
+```bash
+DATA_DIR=/path/to/fairseq/bert-corpus
+
+fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \
+--config-name ${NAME} task.data=$DATA_DIR \
+checkpoint.save_dir=/path/to/save_dir/
+
+```
+
+We can optionally resume training of the released BERT-base model by adding `checkpoint.restore_file=/path/to/model.pt`. Note that in our experiments, we use Adan to train BERT-base from scratch. You can use the following config files to train BERT-base with Adam or Adan:
+
+ | NAME | Optimizer | Config | Download |
+ | :-------: | :-------: | :----------------------------------------------------: | :------------------------------------------------------: |
+ | bert-base | Adam | [config](./exp_results/pretrain/full_config-adam.yaml) | [log](./exp_results/pretrain/hydra_train-adam.log)/model |
+ | bert-adan | Adan | [config](./exp_results/pretrain/full_config-adan.yaml) | [log](./exp_results/pretrain/hydra_train-adan.log)/model |
+
+The above command assumes training on 8x40GB A100 GPUs. Each GPU uses a batch size of 32 sequences (`dataset.batch_size`). If you have fewer GPUs or GPUs with less memory, you may need to reduce `dataset.batch_size` and increase `dataset.update_freq` to compensate. Alternatively, if you have more GPUs, you can decrease `dataset.update_freq` accordingly to speed up training.
+
+
+## Finetuning BERT-base on GLUE tasks
+
+### 1) Download the data from the [GLUE website](https://gluebenchmark.com/tasks) using the following commands:
+```bash
+wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
+python download_glue_data.py --data_dir glue_data --tasks all
+```
+There are some problems downloading `MRPC` and `MNLI`, so we skip the `MRPC` task and download the `MNLI` data from unofficial sources.
+
+
+
+### 2) Preprocess GLUE task data:
+
+```bash
+./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
+```
+- `glue_task_name` is one of the following: `{ALL, QQP, MNLI, QNLI, RTE, STS-B, SST-2, CoLA}`. Use `ALL` to preprocess all the GLUE tasks; see the example below.
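+
+For example (our illustration; run from the fairseq root after the download step above), to preprocess only the `RTE` task:
+
+```bash
+./examples/roberta/preprocess_GLUE_tasks.sh glue_data RTE
+```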
+
+
+
+### 3) Fine-tuning on GLUE tasks:
+
+Example fine-tuning command for the `RTE` task:
+```bash
+TASK=RTE;
+
+python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --avg_num 1 \
+--data_path /path/to/fairseq/GLUE/glue_data/$TASK \
+--bin_path /path/to/fairseq/GLUE/$TASK-bin \
+--pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \
+--finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \
+--task rte-adan
+```
+
+- `avg_num`: number of repetitions.
+
+- `data_path`: path to the data of the GLUE task, e.g., CoLA, MNLI, etc.
+
+- `bin_path`: similar to `data_path`, but the path to the binarized data after preprocessing.
+
+- `pre_path`: path to the pre-trained model.
+
+- `finetune_path`: path to save/load the fine-tuned model.
+
+- `task`: config name; please refer to the [fine-tuning](./config/finetuning) directory for the additional config files for each of the GLUE tasks.
+
+- These command-line arguments and hyperparameters were tested on one NVIDIA `A100` GPU with `40GB` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
+
+
+
+### 4) Inference on GLUE task
+After training the model in the previous step, we can run inference with the checkpoints in the `finetune_path` directory using the following command:
+
+```bash
+TASK=RTE;
+
+python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --inference \
+--data_path /path/to/fairseq/GLUE/glue_data/$TASK \
+--bin_path /path/to/fairseq/GLUE/$TASK-bin \
+--pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \
+--finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \
+--task rte-adan
+
+```
+
+This should give:
+
+| GLUE-Task | Metric                       |  Result   |                    Config                     |
+| --------- | :--------------------------- | :-------: | :-------------------------------------------: |
+| CoLA      | Matthews corr.               |   64.6    | [config](./config/finetuning/cola-adan.yaml)  |
+| SST-2     | Accuracy                     |   93.2    | [config](./config/finetuning/sst_2-adan.yaml) |
+| STS-B     | Pearson corr.                |   89.3    | [config](./config/finetuning/sts_b-adan.yaml) |
+| QQP       | Accuracy                     |   91.2    | [config](./config/finetuning/qqp-adan.yaml)   |
+| MNLI      | Matched acc./Mismatched acc. | 85.7/85.6 | [config](./config/finetuning/mnli-adan.yaml)  |
+| QNLI      | Accuracy                     |   91.3    | [config](./config/finetuning/qnli-adan.yaml)  |
+| RTE       | Accuracy                     |   73.3    | [config](./config/finetuning/rte-adan.yaml)   |
+
diff --git a/NLP/BERT/adan.py b/NLP/BERT/adan.py
new file mode 100644
index 0000000..65326ce
--- /dev/null
+++ b/NLP/BERT/adan.py
@@ -0,0 +1,231 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
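+
+# Summary of the update implemented in Adan.step() below (our annotation of the
+# code; g_t is the current gradient, g_{t-1} the previous one, t the step count):
+#   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t                                  (exp_avg)
+#   d_t = beta2 * d_{t-1} + (1 - beta2) * (g_t - g_{t-1})                      (exp_avg_diff)
+#   v_t = beta3 * v_{t-1} + (1 - beta3) * (g_t + beta2 * (g_t - g_{t-1}))**2   (exp_avg_sq)
+#   eta_t = lr / (sqrt(v_t / (1 - beta3**t)) + eps)
+#   step  = m_t / (1 - beta1**t) + beta2 * d_t / (1 - beta2**t)
+#   no_prox=True : p <- (1 - lr * weight_decay) * p - eta_t * step
+#   no_prox=False: p <- (p - eta_t * step) / (1 + lr * weight_decay)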
+ + +import logging +import math +from collections.abc import Collection +from dataclasses import dataclass, field +from typing import Any, List + +import torch +import torch.distributed as dist +import torch.optim +from fairseq.dataclass import FairseqDataclass +from fairseq.optim import FairseqOptimizer, register_optimizer +from omegaconf import II, OmegaConf + + +logger = logging.getLogger(__name__) + + +@dataclass +class FairseqAdanConfig(FairseqDataclass): + adan_betas: Any = field( + default=(0.98, 0.92, 0.99), metadata={"help": "betas for Adan optimizer"} + ) + adan_eps: float = field( + default=1e-8, metadata={"help": "epsilon for Adam optimizer"} + ) + weight_decay: float = field(default=0.0, metadata={"help": "weight decay"}) + + no_prox: bool = field( + default=False, metadata={"help": "wether to perform prox operator"} + ) + fp16_adan_stats: bool = field( + default=False, metadata={"help": "use FP16 stats (with automatic scaling)"} + ) + # TODO common vars below in parent + tpu: bool = II("common.tpu") + lr: List[float] = II("optimization.lr") + + +@register_optimizer("adan", dataclass=FairseqAdanConfig) +class FairseqAdan(FairseqOptimizer): + """ + Adan optimizer for fairseq. + """ + + def __init__(self, cfg: FairseqAdanConfig, params): + super().__init__(cfg) + fused_adan_cls = None + use_fused_adan = ( + fused_adan_cls is not None + and torch.cuda.is_available() + ) + if getattr(cfg, "tpu", False): + if self.cfg.fp16_adan_stats: + raise NotImplementedError("--fp16-adam-stats is only supported on GPU") + # on TPUs we use the Adam defined here, since it + # automatically casts gradients to FP32 + self._optimizer = Adan(params, **self.optimizer_config) + elif use_fused_adan: + raise NotImplementedError("--fp16-adam-stats is only supported on GPU") + else: + if self.cfg.fp16_adan_stats: + raise NotImplementedError( + "--fp16-adam-stats is only supported with FusedAdanV1" + ) + self._optimizer = Adan(params, **self.optimizer_config) + + @property + def optimizer_config(self): + """ + Return a kwarg dictionary that will be used to override optimizer + args stored in checkpoints. This allows us to load a checkpoint and + resume training using a different set of optimizer args, e.g., with a + different learning rate. + """ + return { + "lr": self.cfg.lr[0] + if isinstance(self.cfg.lr, Collection) + else self.cfg.lr, + "betas": eval(self.cfg.adan_betas) + if isinstance(self.cfg.adan_betas, str) + else OmegaConf.to_container(self.cfg.adan_betas), + "eps": self.cfg.adan_eps, + "weight_decay": self.cfg.weight_decay, + } + + def average_params(self): + """Reduce Params is only used during BMUF distributed training.""" + state_dict = self.optimizer.state_dict() + total_gpus = float(dist.get_world_size()) + + for _, value in state_dict["state"].items(): + value["exp_avg"] /= total_gpus + value["exp_avg_sq"] /= total_gpus + value['exp_avg_diff'] /= total_gpus + dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM) + dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM) + dist.all_reduce(value["exp_avg_diff"], op=dist.ReduceOp.SUM) + + +class Adan(torch.optim.Optimizer): + r"""Implements Adan algorithm. 
+ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.98, 0.92, 0.99)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + """ + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, no_prox = False): + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, no_prox = no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @property + def supports_memory_efficient_fp16(self): + return True + + @property + def supports_flat_params(self): + return True + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + p_data_fp32 = p.data + if p.data.dtype in {torch.float16, torch.bfloat16}: + p_data_fp32 = p_data_fp32.float() + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + state['exp_avg_diff'] = torch.zeros_like(p_data_fp32) + else: + state["exp_avg"] = state["exp_avg"].to(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) + state['exp_avg_diff'] = state['exp_avg_diff'].to(p_data_fp32) + + + grad = p.grad.data + if grad.dtype in {torch.float16, torch.bfloat16}: + grad = grad.float() + if grad.is_sparse: + raise RuntimeError( + "Adan does not support sparse gradients, please consider SparseAdam instead" + ) + + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + + copy_grad = grad.clone() + + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + + update = grad+beta2*diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # v_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg/bias_correction1+beta2*exp_avg_diff/bias_correction2) ).div_(denom) + + if group['no_prox']: + p_data_fp32.mul_(1 - group['lr'] * group['weight_decay']) + p_data_fp32.add_(update, alpha=-group['lr']) + else: + p_data_fp32.add_(update, alpha=-group['lr']) + p_data_fp32.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad + + if p.data.dtype in {torch.float16, torch.bfloat16}: + p.data.copy_(p_data_fp32) + return loss 
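+
+
+if __name__ == "__main__":
+    # Illustrative sanity check only (our addition, not used by the fairseq
+    # integration above): fit a toy linear regression with the plain Adan class.
+    torch.manual_seed(0)
+    model = torch.nn.Linear(10, 1)
+    target_w = torch.randn(10, 1)
+    optimizer = Adan(model.parameters(), lr=1e-3,
+                     betas=(0.98, 0.92, 0.99), weight_decay=0.02)
+    for _ in range(200):
+        x = torch.randn(64, 10)
+        loss = (model(x) - x @ target_w).pow(2).mean()
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    print(f"final toy loss: {loss.item():.4f}")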
diff --git a/NLP/BERT/config/finetuning/acc_test.py b/NLP/BERT/config/finetuning/acc_test.py new file mode 100644 index 0000000..efd67df --- /dev/null +++ b/NLP/BERT/config/finetuning/acc_test.py @@ -0,0 +1,116 @@ +import os +from fairseq.models.roberta import RobertaModel +import argparse +from scipy.stats import pearsonr +from sklearn.metrics import matthews_corrcoef + + +def get_acc(model_path, data_path, bin_path, task='rte'): + acc_list = [] + gold, pred = [], [] + roberta = RobertaModel.from_pretrained( + model_path, + checkpoint_file='checkpoint_best.pt', + data_name_or_path=bin_path#'RTE-bin' + ) + + label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] + ) + ncorrect, nsamples = 0, 0 + roberta.cuda() + roberta.eval() + if 'mnli' not in task: + dev_files = ['dev.tsv'] + else: dev_files = ['dev_mismatched.tsv', 'dev_matched.tsv'] + for dev_file in dev_files: + with open(os.path.join(data_path, dev_file)) as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + if 'rte' in task or 'qnli' in task: + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = roberta.encode(sent1, sent2) + elif 'qqp' in task: + sent1, sent2, target = tokens[3], tokens[4], tokens[5] + tokens = roberta.encode(sent1, sent2) + elif 'mnli' in task: + sent1, sent2, target = tokens[8], tokens[9], tokens[11] + tokens = roberta.encode(sent1, sent2) + elif 'mrpc' in task: + sent1, sent2, target = tokens[3], tokens[4], tokens[0] + tokens = roberta.encode(sent1, sent2) + elif 'sts_b' in task: + sent1, sent2, target = tokens[7], tokens[8], float(tokens[9]) + tokens = roberta.encode(sent1, sent2) + elif 'sst_2' in task: + sent, target = tokens[0], tokens[1] + tokens = roberta.encode(sent) + + elif 'cola' in task: + sent, target = tokens[3], tokens[1] + tokens = roberta.encode(sent) + if 'sts_b' not in task: + prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + + nsamples += 1 + if 'cola' in task: + target = int(target) + prediction_label = int(prediction_label) + pred.append(prediction_label) + gold.append(target) + + else: + features = roberta.extract_features(tokens) + predictions = 5.0 * roberta.model.classification_heads['sentence_classification_head'](features) + gold.append(target) + pred.append(predictions.item()) + if 'cola' in task: + out = matthews_corrcoef(gold, pred) + elif 'sts_b' in task: + out = pearsonr(gold, pred)[0] + else: out = float(ncorrect)/float(nsamples) + + acc_list.append(out) + return acc_list + + +parser = argparse.ArgumentParser(description='GLUE test for acc') +parser.add_argument('--avg_num', type=int, default=1, + help='number of try') +parser.add_argument('--pre_path', type=str, default='./baseline/checkpoint_20_1000000.pt', + help='path to pre-trained model') +parser.add_argument('--data_path', type=str, default='./GLUE/glue_data/STS-B', + help='path to data') +parser.add_argument('--bin_path', type=str, default='./GLUE/STS-B-bin', + help='path to -bin data') +parser.add_argument('--finetune_path', type=str, default='./bert-fintune/adam/STS-B/', + help='path to finetuned model') +parser.add_argument('--task', type=str, default='sts_b', + help='task of finetune') +parser.add_argument('--inference', action='store_true', default=False, + help='inference only') +args = parser.parse_args() + + +acc_avg = 0.0 +acc_avg2 = 0.0 +for _ in range(args.avg_num): + if 
not args.inference: + val = os.system(' fairseq-hydra-train --config-dir ./fairseq/examples/roberta/config/finetuning \ + --config-name {} \ + task.data={} checkpoint.restore_file={} \ + checkpoint.save_dir={}'.format(args.task, args.bin_path, args.pre_path, args.finetune_path)) + all_acc = get_acc(args.finetune_path, args.data_path, args.bin_path, args.task) + acc_avg+=all_acc[0] + if len(all_acc)>1: + acc_avg2+=all_acc[1] + +if acc_avg2>0: + print('Mismatched Accuracy1:{}, Matched Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num), float(acc_avg2)/float(args.avg_num))) +else: + print('AVG Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num))) + + \ No newline at end of file diff --git a/NLP/BERT/config/finetuning/cola-adan.yaml b/NLP/BERT/config/finetuning/cola-adan.yaml new file mode 100644 index 0000000..cddfbfe --- /dev/null +++ b/NLP/BERT/config/finetuning/cola-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/cola.yaml b/NLP/BERT/config/finetuning/cola.yaml new file mode 100644 index 0000000..ac76611 --- /dev/null +++ b/NLP/BERT/config/finetuning/cola.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/mnli-adan.yaml b/NLP/BERT/config/finetuning/mnli-adan.yaml new file mode 100644 index 0000000..8edf286 --- /dev/null +++ b/NLP/BERT/config/finetuning/mnli-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.92,0.999) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 1.0 + lr: [2.0e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/mnli.yaml b/NLP/BERT/config/finetuning/mnli.yaml new file mode 100644 index 0000000..5be10c3 --- /dev/null +++ b/NLP/BERT/config/finetuning/mnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qnli-adan.yaml b/NLP/BERT/config/finetuning/qnli-adan.yaml new file mode 100644 index 0000000..36f1bce --- /dev/null +++ b/NLP/BERT/config/finetuning/qnli-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.001 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qnli.yaml b/NLP/BERT/config/finetuning/qnli.yaml new file mode 100644 index 0000000..b4595b0 --- /dev/null +++ b/NLP/BERT/config/finetuning/qnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qqp-adan.yaml b/NLP/BERT/config/finetuning/qqp-adan.yaml new file mode 100644 index 0000000..df48414 --- /dev/null +++ b/NLP/BERT/config/finetuning/qqp-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.001 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qqp.yaml b/NLP/BERT/config/finetuning/qqp.yaml new file mode 100644 index 0000000..5a2b2ed --- /dev/null +++ b/NLP/BERT/config/finetuning/qqp.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/rte-adan.yaml b/NLP/BERT/config/finetuning/rte-adan.yaml new file mode 100644 index 0000000..c43f6e2 --- /dev/null +++ b/NLP/BERT/config/finetuning/rte-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/rte.yaml b/NLP/BERT/config/finetuning/rte.yaml new file mode 100644 index 0000000..7318465 --- /dev/null +++ b/NLP/BERT/config/finetuning/rte.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sst_2-adan.yaml b/NLP/BERT/config/finetuning/sst_2-adan.yaml new file mode 100644 index 0000000..ed79f63 --- /dev/null +++ b/NLP/BERT/config/finetuning/sst_2-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.92,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sst_2.yaml b/NLP/BERT/config/finetuning/sst_2.yaml new file mode 100644 index 0000000..a93ad2f --- /dev/null +++ b/NLP/BERT/config/finetuning/sst_2.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sts_b-adan.yaml b/NLP/BERT/config/finetuning/sts_b-adan.yaml new file mode 100644 index 0000000..6c4069f --- /dev/null +++ b/NLP/BERT/config/finetuning/sts_b-adan.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + regression_target: true + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-8 + +lr_scheduler: + _name: cosine + warmup_updates: 214 + +optimization: + clip_norm: 0.5 + lr: [4e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sts_b.yaml b/NLP/BERT/config/finetuning/sts_b.yaml new file mode 100644 index 0000000..2d49522 --- /dev/null +++ b/NLP/BERT/config/finetuning/sts_b.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + regression_target: true + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 214 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/pretraining/base.yaml b/NLP/BERT/config/pretraining/base.yaml new file mode 100644 index 0000000..9782990 --- /dev/null +++ b/NLP/BERT/config/pretraining/base.yaml @@ -0,0 +1,42 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + +dataset: + batch_size: 16 + ignore_unused_valid_subsets: true + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 0 + lr: [0.0005] + max_update: 125000 + update_freq: [16] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/pretraining/bert-adan.yaml b/NLP/BERT/config/pretraining/bert-adan.yaml new file mode 100644 index 0000000..b0e3ebb --- /dev/null +++ b/NLP/BERT/config/pretraining/bert-adan.yaml @@ -0,0 +1,52 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + save_interval: 5 + save_interval_updates: 50000 + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + + + +optimizer: + _name: adan + weight_decay: 0.02 + adan_betas: (0.98,0.92,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 5.0 + lr: [0.001] + max_update: 1000000 + update_freq: [1] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 + +distributed_training: + ddp_backend: no_c10d + +dataset: + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 50000 + batch_size: 32 + ignore_unused_valid_subsets: true diff --git a/NLP/BERT/config/pretraining/bert-base.yaml b/NLP/BERT/config/pretraining/bert-base.yaml new file mode 100644 index 0000000..f8ae660 --- /dev/null +++ b/NLP/BERT/config/pretraining/bert-base.yaml @@ -0,0 +1,54 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_dir: 'bert/baseline/' + no_epoch_checkpoints: true + save_interval: 5 + save_interval_updates: 50000 + +task: + _name: masked_lm + data: ??? 
+ sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + + + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 0 + lr: [0.0001] + max_update: 1000000 + update_freq: [1] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 + +distributed_training: + ddp_backend: no_c10d + +dataset: + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 50000 + batch_size: 32 + ignore_unused_valid_subsets: true + diff --git a/NLP/BERT/exp_results/pretrain/full_config-adam.yaml b/NLP/BERT/exp_results/pretrain/full_config-adam.yaml new file mode 100644 index 0000000..5a35e9b --- /dev/null +++ b/NLP/BERT/exp_results/pretrain/full_config-adam.yaml @@ -0,0 +1,376 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + hydra_help: ??? + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + output_subdir: .hydra + overrides: + hydra: [] + task: + - task.data=/dataset/common/bert-corpus-0729/ + job: + name: hydra_train + override_dirname: task.data=/dataset/common/bert-corpus-0729/ + id: ??? + num: ??? 
+ config_name: bert-base + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.0.7 + cwd: /vit-opt/fairseq + verbose: false +_name: null +common: + _name: null + no_progress_bar: false + log_interval: 200 + log_format: json + log_file: null + aim_repo: null + aim_run_hash: null + tensorboard_logdir: null + wandb_project: null + azureml_logging: false + seed: 1 + cpu: false + tpu: false + bf16: false + memory_efficient_bf16: false + fp16: true + memory_efficient_fp16: false + fp16_no_flatten_grads: false + fp16_init_scale: 128 + fp16_scale_window: null + fp16_scale_tolerance: 0.0 + on_cpu_convert_precision: false + min_loss_scale: 0.0001 + threshold_loss_scale: null + amp: false + amp_batch_retries: 2 + amp_init_scale: 128 + amp_scale_window: null + user_dir: null + empty_cache_freq: 0 + all_gather_list_size: 16384 + model_parallel_size: 1 + quantization_config_path: null + profile: false + reset_logging: false + suppress_crashes: false + use_plasma_view: false + plasma_path: /tmp/plasma +common_eval: + _name: null + path: null + post_process: null + quiet: false + model_overrides: '{}' + results_path: null +distributed_training: + _name: null + distributed_world_size: 8 + distributed_num_procs: 8 + distributed_rank: 0 + distributed_backend: nccl + distributed_init_method: null + distributed_port: -1 + device_id: 0 + distributed_no_spawn: false + ddp_backend: no_c10d + ddp_comm_hook: none + bucket_cap_mb: 25 + fix_batches_to_gpus: false + find_unused_parameters: false + gradient_as_bucket_view: false + fast_stat_sync: false + heartbeat_timeout: -1 + broadcast_buffers: false + slowmo_momentum: null + slowmo_base_algorithm: localsgd + localsgd_frequency: 3 + nprocs_per_node: 8 + pipeline_model_parallel: false + pipeline_balance: null + pipeline_devices: null + pipeline_chunks: 0 + pipeline_encoder_balance: null + pipeline_encoder_devices: null + pipeline_decoder_balance: null + pipeline_decoder_devices: null + pipeline_checkpoint: never + zero_sharding: none + fp16: ${common.fp16} + memory_efficient_fp16: ${common.memory_efficient_fp16} + tpu: ${common.tpu} + no_reshard_after_forward: false + fp32_reduce_scatter: false + cpu_offload: false + use_sharded_state: false + not_fsdp_flatten_parameters: false +dataset: + _name: null + num_workers: 1 + skip_invalid_size_inputs_valid_test: true + max_tokens: null + batch_size: 32 + required_batch_size_multiple: 8 + required_seq_len_multiple: 1 + dataset_impl: null + data_buffer_size: 10 + train_subset: train + valid_subset: valid + combine_valid_subsets: null + ignore_unused_valid_subsets: true + validate_interval: 5 + validate_interval_updates: 50000 + validate_after_updates: 0 + fixed_validation_seed: null + disable_validation: false + max_tokens_valid: ${dataset.max_tokens} + batch_size_valid: ${dataset.batch_size} + max_valid_steps: null + curriculum: 0 + gen_subset: test + num_shards: 1 + shard_id: 0 + grouped_shuffling: false + update_epoch_batch_itr: ${dataset.grouped_shuffling} + update_ordered_indices_seed: false +optimization: + _name: null + max_epoch: 0 + max_update: 1000000 + stop_time_hours: 0.0 + clip_norm: 0.0 + sentence_avg: false + update_freq: + - 1 + lr: + - 0.0001 + stop_min_lr: -1.0 + use_bmuf: false + skip_remainder_batch: false +checkpoint: + _name: null + save_dir: bert/baseline/ + restore_file: checkpoint_last.pt + continue_once: null + finetune_from_model: null + reset_dataloader: false + reset_lr_scheduler: false + reset_meters: false + 
reset_optimizer: false + optimizer_overrides: '{}' + save_interval: 5 + save_interval_updates: 50000 + keep_interval_updates: -1 + keep_interval_updates_pattern: -1 + keep_last_epochs: -1 + keep_best_checkpoints: -1 + no_save: false + no_epoch_checkpoints: true + no_last_checkpoints: false + no_save_optimizer_state: false + best_checkpoint_metric: loss + maximize_best_checkpoint_metric: false + patience: -1 + checkpoint_suffix: '' + checkpoint_shard_count: 1 + load_checkpoint_on_all_dp_ranks: false + write_checkpoints_asynchronously: false + model_parallel_size: ${common.model_parallel_size} +bmuf: + _name: null + block_lr: 1.0 + block_momentum: 0.875 + global_sync_iter: 50 + warmup_iterations: 500 + use_nbm: false + average_sync: false + distributed_world_size: ${distributed_training.distributed_world_size} +generation: + _name: null + beam: 5 + nbest: 1 + max_len_a: 0.0 + max_len_b: 200 + min_len: 1 + match_source_len: false + unnormalized: false + no_early_stop: false + no_beamable_mm: false + lenpen: 1.0 + unkpen: 0.0 + replace_unk: null + sacrebleu: false + score_reference: false + prefix_size: 0 + no_repeat_ngram_size: 0 + sampling: false + sampling_topk: -1 + sampling_topp: -1.0 + constraints: null + temperature: 1.0 + diverse_beam_groups: -1 + diverse_beam_strength: 0.5 + diversity_rate: -1.0 + print_alignment: null + print_step: false + lm_path: null + lm_weight: 0.0 + iter_decode_eos_penalty: 0.0 + iter_decode_max_iter: 10 + iter_decode_force_max_iter: false + iter_decode_with_beam: 1 + iter_decode_with_external_reranker: false + retain_iter_history: false + retain_dropout: false + retain_dropout_modules: null + decoding_format: null + no_seed_provided: false + eos_token: null +eval_lm: + _name: null + output_word_probs: false + output_word_stats: false + context_window: 0 + softmax_batch: 9223372036854775807 +interactive: + _name: null + buffer_size: 0 + input: '-' +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 +task: + _name: masked_lm + data: /dataset/common/bert-corpus-0729/ + sample_break_mode: complete + tokens_per_sample: 512 +criterion: masked_lm +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1.0e-06 +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 +scoring: null +bpe: null +tokenizer: null +ema: + _name: null + store_ema: false + ema_decay: 0.9999 + ema_start_update: 0 + ema_seed_model: null + ema_update_freq: 1 + ema_fp32: false diff --git a/NLP/BERT/exp_results/pretrain/full_config-adan.yaml b/NLP/BERT/exp_results/pretrain/full_config-adan.yaml new file mode 100644 index 0000000..7ec930a --- /dev/null +++ b/NLP/BERT/exp_results/pretrain/full_config-adan.yaml @@ -0,0 +1,376 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.job.name}.log + root: + level: INFO + handlers: + - 
console + - file + disable_existing_loggers: false + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + hydra_help: ??? + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + output_subdir: .hydra + overrides: + hydra: [] + task: + - task.data=/dataset/common/bert-corpus-0729/ + job: + name: hydra_train + override_dirname: task.data=/dataset/common/bert-corpus-0729/ + id: ??? + num: ??? + config_name: bert-adan2 + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.0.7 + cwd: /vit-opt/fairseq + verbose: false +_name: null +common: + _name: null + no_progress_bar: false + log_interval: 200 + log_format: json + log_file: null + aim_repo: null + aim_run_hash: null + tensorboard_logdir: null + wandb_project: null + azureml_logging: false + seed: 1 + cpu: false + tpu: false + bf16: false + memory_efficient_bf16: false + fp16: true + memory_efficient_fp16: false + fp16_no_flatten_grads: false + fp16_init_scale: 128 + fp16_scale_window: null + fp16_scale_tolerance: 0.0 + on_cpu_convert_precision: false + min_loss_scale: 0.0001 + threshold_loss_scale: null + amp: false + amp_batch_retries: 2 + amp_init_scale: 128 + amp_scale_window: null + user_dir: null + empty_cache_freq: 0 + all_gather_list_size: 16384 + model_parallel_size: 1 + quantization_config_path: null + profile: false + reset_logging: false + suppress_crashes: false + use_plasma_view: false + plasma_path: /tmp/plasma +common_eval: + _name: null + path: null + post_process: null + quiet: false + model_overrides: '{}' + results_path: null +distributed_training: + _name: null + distributed_world_size: 8 + distributed_num_procs: 8 + distributed_rank: 0 + distributed_backend: nccl + distributed_init_method: null + distributed_port: -1 + device_id: 0 + distributed_no_spawn: false + ddp_backend: no_c10d + ddp_comm_hook: none + bucket_cap_mb: 25 + fix_batches_to_gpus: false + find_unused_parameters: false + gradient_as_bucket_view: false + fast_stat_sync: false + heartbeat_timeout: -1 + broadcast_buffers: false + slowmo_momentum: null + slowmo_base_algorithm: localsgd + localsgd_frequency: 3 + nprocs_per_node: 8 + pipeline_model_parallel: false + pipeline_balance: null + pipeline_devices: null + pipeline_chunks: 0 + pipeline_encoder_balance: null + pipeline_encoder_devices: null + pipeline_decoder_balance: null + pipeline_decoder_devices: null + pipeline_checkpoint: never + zero_sharding: none + fp16: ${common.fp16} + memory_efficient_fp16: ${common.memory_efficient_fp16} + tpu: ${common.tpu} + no_reshard_after_forward: false + fp32_reduce_scatter: false + 
cpu_offload: false + use_sharded_state: false + not_fsdp_flatten_parameters: false +dataset: + _name: null + num_workers: 1 + skip_invalid_size_inputs_valid_test: true + max_tokens: null + batch_size: 32 + required_batch_size_multiple: 8 + required_seq_len_multiple: 1 + dataset_impl: null + data_buffer_size: 10 + train_subset: train + valid_subset: valid + combine_valid_subsets: null + ignore_unused_valid_subsets: true + validate_interval: 5 + validate_interval_updates: 50000 + validate_after_updates: 0 + fixed_validation_seed: null + disable_validation: false + max_tokens_valid: ${dataset.max_tokens} + batch_size_valid: ${dataset.batch_size} + max_valid_steps: null + curriculum: 0 + gen_subset: test + num_shards: 1 + shard_id: 0 + grouped_shuffling: false + update_epoch_batch_itr: ${dataset.grouped_shuffling} + update_ordered_indices_seed: false +optimization: + _name: null + max_epoch: 0 + max_update: 1000000 + stop_time_hours: 0.0 + clip_norm: 5.0 + sentence_avg: false + update_freq: + - 1 + lr: + - 0.001 + stop_min_lr: -1.0 + use_bmuf: false + skip_remainder_batch: false +checkpoint: + _name: null + save_dir: bert/adan2/ + restore_file: checkpoint_last.pt + continue_once: null + finetune_from_model: null + reset_dataloader: false + reset_lr_scheduler: false + reset_meters: false + reset_optimizer: false + optimizer_overrides: '{}' + save_interval: 5 + save_interval_updates: 50000 + keep_interval_updates: -1 + keep_interval_updates_pattern: -1 + keep_last_epochs: -1 + keep_best_checkpoints: -1 + no_save: false + no_epoch_checkpoints: true + no_last_checkpoints: false + no_save_optimizer_state: false + best_checkpoint_metric: loss + maximize_best_checkpoint_metric: false + patience: -1 + checkpoint_suffix: '' + checkpoint_shard_count: 1 + load_checkpoint_on_all_dp_ranks: false + write_checkpoints_asynchronously: false + model_parallel_size: ${common.model_parallel_size} +bmuf: + _name: null + block_lr: 1.0 + block_momentum: 0.875 + global_sync_iter: 50 + warmup_iterations: 500 + use_nbm: false + average_sync: false + distributed_world_size: ${distributed_training.distributed_world_size} +generation: + _name: null + beam: 5 + nbest: 1 + max_len_a: 0.0 + max_len_b: 200 + min_len: 1 + match_source_len: false + unnormalized: false + no_early_stop: false + no_beamable_mm: false + lenpen: 1.0 + unkpen: 0.0 + replace_unk: null + sacrebleu: false + score_reference: false + prefix_size: 0 + no_repeat_ngram_size: 0 + sampling: false + sampling_topk: -1 + sampling_topp: -1.0 + constraints: null + temperature: 1.0 + diverse_beam_groups: -1 + diverse_beam_strength: 0.5 + diversity_rate: -1.0 + print_alignment: null + print_step: false + lm_path: null + lm_weight: 0.0 + iter_decode_eos_penalty: 0.0 + iter_decode_max_iter: 10 + iter_decode_force_max_iter: false + iter_decode_with_beam: 1 + iter_decode_with_external_reranker: false + retain_iter_history: false + retain_dropout: false + retain_dropout_modules: null + decoding_format: null + no_seed_provided: false + eos_token: null +eval_lm: + _name: null + output_word_probs: false + output_word_stats: false + context_window: 0 + softmax_batch: 9223372036854775807 +interactive: + _name: null + buffer_size: 0 + input: '-' +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 +task: + _name: masked_lm + data: /dataset/common/bert-corpus-0729/ + sample_break_mode: complete + tokens_per_sample: 512 +criterion: masked_lm +optimizer: + _name: adan + weight_decay: 0.02 + adan_betas: (0.98,0.92,0.99) + adan_eps: 1.0e-08 
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 10000
+scoring: null
+bpe: null
+tokenizer: null
+ema:
+  _name: null
+  store_ema: false
+  ema_decay: 0.9999
+  ema_start_update: 0
+  ema_seed_model: null
+  ema_update_freq: 1
+  ema_fp32: false
diff --git a/NLP/Transformer-XL/README.md b/NLP/Transformer-XL/README.md
new file mode 100644
index 0000000..27aff32
--- /dev/null
+++ b/NLP/Transformer-XL/README.md
@@ -0,0 +1,92 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+We first provide instructions for modifying the official training files from [Transformer-XL](https://github.com/kimiyoung/transformer-xl) to support Adan. **For data preparation, please follow that repo.**
+
+
+
+## Environment
+
+As recommended by the official [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo, our experiments for this task are based on the following package version.
+
+```python
+torch.__version__ = '1.1.0'
+```
+
+
+
+## Usage of Adan for Transformer-XL
+
+### Two steps to use Adan
+
+**Step 1.** Add the following parameters to the file `train.py`.
+
+```python
+parser.add_argument('--optim', default='adam', type=str, choices=['adam', 'sgd', 'adagrad', 'adan'], help='optimizer to use.')
+parser.add_argument('--wd', type=float, default=0.02, help='weight decay (default: 0.02)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='Optimizer Betas (default: None, use opt default)')
+```
+
+* `optim`: the choice of optimizer. We add Adan to the choices.
+
+* `wd`: decoupled weight decay.
+
+* `opt-betas`: optimizer betas for Adan.
+
+
+
+**Step 2.** Replace the original optimizer creation with the following (the `elif` branch joins the existing optimizer-selection chain in `train.py`):
+
+```python
+from adan import Adan
+
+elif args.optim.lower() == 'adan':
+    if args.sample_softmax > 0:
+        dense_params, sparse_params = [], []
+        for param in model.parameters():
+            if param.size() == model.word_emb.weight.size():
+                sparse_params.append(param)
+            else:
+                dense_params.append(param)
+        optimizer_sparse = Adan(sparse_params, betas=args.opt_betas, lr=args.lr, weight_decay=args.wd)
+        optimizer = Adan(dense_params, lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
+    else:
+        optimizer = Adan(model.parameters(), lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
+
+```
+
+
+
+## Data Preparation
+
+See `bash getdata.sh` in the [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo.
+
+
+
+## Training and Evaluation
+
+- #### Training
+
+  `bash run_wt103_adan.sh train --work_dir PATH_TO_WORK_DIR`
+
+- #### Evaluation
+
+  `bash run_wt103_adan.sh eval --work_dir PATH_TO_WORK_DIR`
+
+
+
+- #### Tips for Experiments
+
+  - For Adan, we set `args.wd = 0.02` for all steps, which is consistent with the other experiments; the sketch after this list shows how these settings enter the optimizer.
+  - For the experiment using `steps = 50k`, we choose a slightly larger `LR`.
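+
+For reference, below is a minimal, stand-alone sketch (it is **not** part of `train.py`) of the Adan call that results from these settings. The `torch.nn.Linear` model is only a placeholder, and `lr`, `betas`, and `weight_decay` follow the 100k-step configuration recorded in [log-100k.txt](./exp_results/log-100k.txt); run it from `NLP/Transformer-XL/` so that `adan.py` is importable.
+
+```python
+import torch
+from adan import Adan
+
+model = torch.nn.Linear(410, 410)                 # placeholder for MemTransformerLM
+optimizer = Adan(model.parameters(),
+                 lr=1e-3,                         # lr of the 100k-step run
+                 betas=(0.9, 0.9, 0.999),         # opt_betas from log-100k.txt
+                 weight_decay=0.02)               # args.wd = 0.02 for all runs
+
+loss = model(torch.randn(8, 410)).pow(2).mean()   # dummy objective, one update
+loss.backward()
+optimizer.step()                                  # this repo's Adan.step() takes no closure
+optimizer.zero_grad()
+```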
+
+## Results and Logs
+
+ With different settings for `lr` and `max_step` in `run_wt103_adan.sh`, we have the following results:
+
+ | | LR | Steps | Test PPL | Download |
+ | ------------------- | :----: | :---: | :------: | :--------------------------------------: |
+ | Baseline (Adam) | 2.5e-4 | 200k | 24.2 | [log&config](./exp_results/log-adam.txt) |
+ | Transformer-XL-base | 1.5e-3 | 50k | 26.2 | [log&config](./exp_results/log-50k.txt) |
+ | Transformer-XL-base | 1e-3 | 100k | 24.2 | [log&config](./exp_results/log-100k.txt) |
+ | Transformer-XL-base | 1e-3 | 200k | 23.5 | [log&config](./exp_results/log-200k.txt) |
+
diff --git a/NLP/Transformer-XL/adan.py b/NLP/Transformer-XL/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/NLP/Transformer-XL/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from timm.utils import *
+
+
+class Adan(Optimizer):
+    """
+    Implements a PyTorch variant of Adan.
+
+    Adan was proposed in
+    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
+    https://arxiv.org/abs/2208.06677
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float, float], optional): coefficients used for computing
+            running averages of gradient and its norm. (default: (0.98, 0.92, 0.99))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability.
(default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/NLP/Transformer-XL/data_utils.py b/NLP/Transformer-XL/data_utils.py new file mode 100644 index 0000000..df762a7 --- /dev/null +++ b/NLP/Transformer-XL/data_utils.py @@ -0,0 +1,273 @@ +import os, sys +import glob + +from collections import Counter, OrderedDict +import numpy as np +import torch + +from utils.vocabulary import Vocab + +class LMOrderedIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None): + """ + data -- LongTensor -- the LongTensor is strictly ordered + """ + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + + # Work out how cleanly we can divide the dataset into bsz parts. + self.n_step = data.size(0) // bsz + + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, self.n_step * bsz) + + # Evenly divide the data across the bsz batches. 
+ self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + return data, target, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data = data.to(self.device) + target = target.to(self.device) + + yield data, target, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return 
sent_stream
+
+    def __iter__(self):
+        if self.shuffle:
+            np.random.shuffle(self.paths)
+
+        for path in self.paths:
+            # sent_stream is an iterator
+            sent_stream = self.get_sent_stream(path)
+            for batch in self.stream_iterator(sent_stream):
+                yield batch
+
+
+class Corpus(object):
+    def __init__(self, path, dataset, *args, **kwargs):
+        self.dataset = dataset
+        self.vocab = Vocab(*args, **kwargs)
+
+        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
+            self.vocab.count_file(os.path.join(path, 'train.txt'))
+            self.vocab.count_file(os.path.join(path, 'valid.txt'))
+            self.vocab.count_file(os.path.join(path, 'test.txt'))
+        elif self.dataset == 'wt103':
+            self.vocab.count_file(os.path.join(path, 'train.txt'))
+        elif self.dataset == 'lm1b':
+            train_path_pattern = os.path.join(
+                path, '1-billion-word-language-modeling-benchmark-r13output',
+                'training-monolingual.tokenized.shuffled', 'news.en-*')
+            train_paths = glob.glob(train_path_pattern)
+            # the vocab will load from file when build_vocab() is called
+
+        self.vocab.build_vocab()
+
+        if self.dataset in ['ptb', 'wt2', 'wt103']:
+            self.train = self.vocab.encode_file(
+                os.path.join(path, 'train.txt'), ordered=True)
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=True)
+            self.test = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=True)
+        elif self.dataset in ['enwik8', 'text8']:
+            self.train = self.vocab.encode_file(
+                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
+            self.test = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
+        elif self.dataset == 'lm1b':
+            self.train = train_paths
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
+            self.test = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
+
+    def get_iterator(self, split, *args, **kwargs):
+        if split == 'train':
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                kwargs['shuffle'] = True
+                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
+        elif split in ['valid', 'test']:
+            data = self.valid if split == 'valid' else self.test
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(data, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                data_iter = LMShuffledIterator(data, *args, **kwargs)
+
+        return data_iter
+
+
+def get_lm_corpus(datadir, dataset):
+    fn = os.path.join(datadir, 'cache.pt')
+    if os.path.exists(fn):
+        print('Loading cached dataset...')
+        corpus = torch.load(fn)
+    else:
+        print('Producing dataset {}...'.format(dataset))
+        kwargs = {}
+        if dataset in ['wt103', 'wt2']:
+            kwargs['special'] = ['<eos>']
+            kwargs['lower_case'] = False
+        elif dataset == 'ptb':
+            kwargs['special'] = ['<unk>']
+            kwargs['lower_case'] = True
+        elif dataset == 'lm1b':
+            kwargs['special'] = []
+            kwargs['lower_case'] = False
+            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
+        elif dataset in ['enwik8', 'text8']:
+            pass
+
+        corpus = Corpus(datadir, dataset, **kwargs)
+        torch.save(corpus, fn)
+
+    return corpus
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description='unit test')
+    parser.add_argument('--datadir', type=str, default='../data/text8',
+                        help='location of the data corpus')
+
parser.add_argument('--dataset', type=str, default='text8', + choices=['ptb', 'wt2', 'wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') + args = parser.parse_args() + + corpus = get_lm_corpus(args.datadir, args.dataset) + print('Vocab size : {}'.format(len(corpus.vocab.idx2sym))) diff --git a/NLP/Transformer-XL/eval.py b/NLP/Transformer-XL/eval.py new file mode 100644 index 0000000..eff3618 --- /dev/null +++ b/NLP/Transformer-XL/eval.py @@ -0,0 +1,122 @@ +# coding: utf-8 +import argparse +import time +import math +import os, sys + +import torch + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import get_logger + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/wikitext-103', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='wt103', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--split', type=str, default='all', + choices=['all', 'valid', 'test'], + help='which split to evaluate') +parser.add_argument('--batch_size', type=int, default=10, + help='batch size') +parser.add_argument('--tgt_len', type=int, default=5, + help='number of tokens to predict') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=0, + help='length of the retained previous heads') +parser.add_argument('--clamp_len', type=int, default=-1, + help='max positional embedding index') +parser.add_argument('--cuda', action='store_true', + help='use CUDA') +parser.add_argument('--work_dir', type=str, required=True, + help='path to the work_dir') +parser.add_argument('--no_log', action='store_true', + help='do not log the eval result') +parser.add_argument('--same_length', action='store_true', + help='set same length attention with masking') +args = parser.parse_args() +assert args.ext_len >= 0, 'extended context length must be non-negative' + +device = torch.device("cuda" if args.cuda else "cpu") + +# Get logger +logging = get_logger(os.path.join(args.work_dir, 'log.txt'), + log_=not args.no_log) + +# Load dataset +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) + +va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) + +# Load the best saved model. +with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) +model.backward_compatible() +model = model.to(device) + +logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( + args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + +model.reset_length(args.tgt_len, args.ext_len, args.mem_len) +if args.clamp_len > 0: + model.clamp_len = args.clamp_len +if args.same_length: + model.same_length = True + +############################################################################### +# Evaluation code +############################################################################### +def evaluate(eval_iter): + # Turn on evaluation mode which disables dropout. + model.eval() + total_len, total_loss = 0, 0. 
+ start_time = time.time() + with torch.no_grad(): + mems = tuple() + for idx, (data, target, seq_len) in enumerate(eval_iter): + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.item() + total_len += seq_len + total_time = time.time() - start_time + logging('Time : {:.2f}s, {:.2f}ms/segment'.format( + total_time, 1000 * total_time / (idx+1))) + return total_loss / total_len + +# Run on test data. +if args.split == 'all': + test_loss = evaluate(te_iter) + valid_loss = evaluate(va_iter) +elif args.split == 'valid': + valid_loss = evaluate(va_iter) + test_loss = None +elif args.split == 'test': + test_loss = evaluate(te_iter) + valid_loss = None + +def format_log(loss, split): + if args.dataset in ['enwik8', 'text8']: + log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( + split, loss, loss / math.log(2)) + else: + log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( + split, loss, math.exp(loss)) + return log_str + +log_str = '' +if valid_loss is not None: + log_str += format_log(valid_loss, 'valid') +if test_loss is not None: + log_str += format_log(test_loss, 'test') + +logging('=' * 100) +logging(log_str) +logging('=' * 100) diff --git a/NLP/Transformer-XL/exp_results/log-100k.txt b/NLP/Transformer-XL/exp_results/log-100k.txt new file mode 100644 index 0000000..00c50df --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-100k.txt @@ -0,0 +1,649 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adan + - lr : 0.001 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 3000 + - decay_rate : 0.5 + - lr_min : 1e-06 + - clip : 0.25 + - clip_nonemb : False + - max_step : 100000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220810-001355 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : [0.9, 0.9, 0.999] + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 6.67e-05 | ms/batch 742.71 | loss 8.90 | ppl 7366.806 +| epoch 1 step 400 | 400 batches | lr 0.000133 | ms/batch 761.92 | loss 6.85 | ppl 942.451 +| epoch 1 step 600 | 600 batches | lr 0.0002 | ms/batch 704.16 | loss 6.34 | ppl 567.781 +| epoch 1 step 800 | 800 batches | lr 0.000267 | ms/batch 669.19 | loss 6.06 | ppl 428.925 +| epoch 1 step 1000 | 1000 batches | lr 0.000333 | ms/batch 697.67 | loss 5.80 | ppl 330.968 
+| epoch 1 step 1200 | 1200 batches | lr 0.0004 | ms/batch 710.36 | loss 5.60 | ppl 270.691 +| epoch 1 step 1400 | 1400 batches | lr 0.000467 | ms/batch 726.18 | loss 5.43 | ppl 228.271 +| epoch 1 step 1600 | 1600 batches | lr 0.000533 | ms/batch 712.97 | loss 5.28 | ppl 196.416 +| epoch 1 step 1800 | 1800 batches | lr 0.0006 | ms/batch 695.31 | loss 5.15 | ppl 173.240 +| epoch 1 step 2000 | 2000 batches | lr 0.000667 | ms/batch 700.07 | loss 5.04 | ppl 154.584 +| epoch 1 step 2200 | 2200 batches | lr 0.000733 | ms/batch 681.35 | loss 4.93 | ppl 138.813 +| epoch 1 step 2400 | 2400 batches | lr 0.0008 | ms/batch 680.03 | loss 4.85 | ppl 128.135 +| epoch 1 step 2600 | 2600 batches | lr 0.000867 | ms/batch 672.90 | loss 4.76 | ppl 116.945 +| epoch 1 step 2800 | 2800 batches | lr 0.000933 | ms/batch 674.70 | loss 4.69 | ppl 108.587 +| epoch 1 step 3000 | 3000 batches | lr 0.001 | ms/batch 681.39 | loss 4.64 | ppl 103.975 +| epoch 1 step 3200 | 3200 batches | lr 0.000999 | ms/batch 693.50 | loss 4.58 | ppl 97.506 +| epoch 1 step 3400 | 3400 batches | lr 0.000999 | ms/batch 674.28 | loss 4.53 | ppl 93.139 +| epoch 1 step 3600 | 3600 batches | lr 0.000999 | ms/batch 693.74 | loss 4.45 | ppl 85.849 +| epoch 1 step 3800 | 3800 batches | lr 0.000998 | ms/batch 674.43 | loss 4.48 | ppl 88.153 +| epoch 1 step 4000 | 4000 batches | lr 0.000998 | ms/batch 672.46 | loss 4.43 | ppl 84.328 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2792.28s | valid loss 4.37 | valid ppl 78.835 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.000998 | ms/batch 736.53 | loss 4.38 | ppl 79.983 +| epoch 1 step 4400 | 4400 batches | lr 0.000997 | ms/batch 707.78 | loss 4.36 | ppl 78.055 +| epoch 1 step 4600 | 4600 batches | lr 0.000997 | ms/batch 716.77 | loss 4.34 | ppl 76.331 +| epoch 1 step 4800 | 4800 batches | lr 0.000996 | ms/batch 690.44 | loss 4.28 | ppl 72.184 +| epoch 1 step 5000 | 5000 batches | lr 0.000996 | ms/batch 673.77 | loss 4.31 | ppl 74.590 +| epoch 1 step 5200 | 5200 batches | lr 0.000995 | ms/batch 678.84 | loss 4.25 | ppl 70.193 +| epoch 1 step 5400 | 5400 batches | lr 0.000995 | ms/batch 677.47 | loss 4.20 | ppl 66.462 +| epoch 1 step 5600 | 5600 batches | lr 0.000994 | ms/batch 671.76 | loss 4.22 | ppl 67.988 +| epoch 1 step 5800 | 5800 batches | lr 0.000994 | ms/batch 690.14 | loss 4.21 | ppl 67.462 +| epoch 1 step 6000 | 6000 batches | lr 0.000993 | ms/batch 704.75 | loss 4.17 | ppl 64.509 +| epoch 1 step 6200 | 6200 batches | lr 0.000992 | ms/batch 714.31 | loss 4.14 | ppl 62.962 +| epoch 1 step 6400 | 6400 batches | lr 0.000992 | ms/batch 691.45 | loss 4.17 | ppl 64.894 +| epoch 1 step 6600 | 6600 batches | lr 0.000991 | ms/batch 713.05 | loss 4.11 | ppl 60.698 +| epoch 1 step 6800 | 6800 batches | lr 0.000991 | ms/batch 685.79 | loss 4.10 | ppl 60.561 +| epoch 1 step 7000 | 7000 batches | lr 0.00099 | ms/batch 700.60 | loss 4.11 | ppl 60.660 +| epoch 1 step 7200 | 7200 batches | lr 0.000989 | ms/batch 675.17 | loss 4.06 | ppl 57.759 +| epoch 1 step 7400 | 7400 batches | lr 0.000988 | ms/batch 702.69 | loss 4.05 | ppl 57.520 +| epoch 1 step 7600 | 7600 batches | lr 0.000988 | ms/batch 691.46 | loss 4.03 | ppl 56.370 +| epoch 1 step 7800 | 7800 batches | lr 0.000987 | ms/batch 677.30 | loss 4.05 | ppl 57.587 +| epoch 1 step 8000 | 8000 batches | lr 0.000986 | ms/batch 692.82 | loss 4.05 | ppl 57.212 
+---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 2775.07s | valid loss 3.93 | valid ppl 50.908 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.000985 | ms/batch 745.71 | loss 4.02 | ppl 55.804 +| epoch 1 step 8400 | 8400 batches | lr 0.000985 | ms/batch 703.07 | loss 4.03 | ppl 56.420 +| epoch 1 step 8600 | 8600 batches | lr 0.000984 | ms/batch 688.98 | loss 4.01 | ppl 55.313 +| epoch 1 step 8800 | 8800 batches | lr 0.000983 | ms/batch 700.17 | loss 4.02 | ppl 55.826 +| epoch 1 step 9000 | 9000 batches | lr 0.000982 | ms/batch 673.45 | loss 3.99 | ppl 54.215 +| epoch 1 step 9200 | 9200 batches | lr 0.000981 | ms/batch 691.53 | loss 3.98 | ppl 53.544 +| epoch 1 step 9400 | 9400 batches | lr 0.00098 | ms/batch 681.53 | loss 3.99 | ppl 53.802 +| epoch 1 step 9600 | 9600 batches | lr 0.000979 | ms/batch 705.40 | loss 4.00 | ppl 54.643 +| epoch 1 step 9800 | 9800 batches | lr 0.000978 | ms/batch 716.62 | loss 3.96 | ppl 52.276 +| epoch 1 step 10000 | 10000 batches | lr 0.000977 | ms/batch 679.81 | loss 3.97 | ppl 53.073 +| epoch 1 step 10200 | 10200 batches | lr 0.000976 | ms/batch 680.69 | loss 3.94 | ppl 51.218 +| epoch 1 step 10400 | 10400 batches | lr 0.000975 | ms/batch 677.39 | loss 3.93 | ppl 51.130 +| epoch 1 step 10600 | 10600 batches | lr 0.000974 | ms/batch 682.82 | loss 3.96 | ppl 52.328 +| epoch 1 step 10800 | 10800 batches | lr 0.000973 | ms/batch 675.32 | loss 3.92 | ppl 50.152 +| epoch 1 step 11000 | 11000 batches | lr 0.000972 | ms/batch 687.74 | loss 3.95 | ppl 52.112 +| epoch 1 step 11200 | 11200 batches | lr 0.000971 | ms/batch 687.73 | loss 3.93 | ppl 50.965 +| epoch 1 step 11400 | 11400 batches | lr 0.00097 | ms/batch 692.52 | loss 3.93 | ppl 50.818 +| epoch 2 step 11600 | 130 batches | lr 0.000969 | ms/batch 719.64 | loss 3.90 | ppl 49.417 +| epoch 2 step 11800 | 330 batches | lr 0.000968 | ms/batch 690.59 | loss 3.88 | ppl 48.186 +| epoch 2 step 12000 | 530 batches | lr 0.000967 | ms/batch 700.90 | loss 3.90 | ppl 49.205 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 2772.08s | valid loss 3.78 | valid ppl 43.627 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 12200 | 730 batches | lr 0.000966 | ms/batch 772.15 | loss 3.87 | ppl 47.839 +| epoch 2 step 12400 | 930 batches | lr 0.000964 | ms/batch 681.74 | loss 3.87 | ppl 47.878 +| epoch 2 step 12600 | 1130 batches | lr 0.000963 | ms/batch 692.52 | loss 3.90 | ppl 49.212 +| epoch 2 step 12800 | 1330 batches | lr 0.000962 | ms/batch 672.00 | loss 3.86 | ppl 47.513 +| epoch 2 step 13000 | 1530 batches | lr 0.000961 | ms/batch 699.31 | loss 3.85 | ppl 47.004 +| epoch 2 step 13200 | 1730 batches | lr 0.000959 | ms/batch 703.25 | loss 3.84 | ppl 46.727 +| epoch 2 step 13400 | 1930 batches | lr 0.000958 | ms/batch 694.76 | loss 3.85 | ppl 46.999 +| epoch 2 step 13600 | 2130 batches | lr 0.000957 | ms/batch 702.36 | loss 3.87 | ppl 47.877 +| epoch 2 step 13800 | 2330 batches | lr 0.000956 | ms/batch 714.52 | loss 3.84 | ppl 46.684 +| epoch 2 step 14000 | 2530 batches | lr 0.000954 | ms/batch 704.35 | loss 3.83 | ppl 45.921 +| epoch 2 step 14200 | 2730 batches | lr 0.000953 | ms/batch 701.29 | loss 3.80 | ppl 44.917 +| epoch 2 step 14400 | 2930 batches | lr 0.000951 | ms/batch 688.11 | loss 3.79 | 
ppl 44.149 +| epoch 2 step 14600 | 3130 batches | lr 0.00095 | ms/batch 704.84 | loss 3.80 | ppl 44.497 +| epoch 2 step 14800 | 3330 batches | lr 0.000949 | ms/batch 716.44 | loss 3.80 | ppl 44.659 +| epoch 2 step 15000 | 3530 batches | lr 0.000947 | ms/batch 695.23 | loss 3.76 | ppl 42.957 +| epoch 2 step 15200 | 3730 batches | lr 0.000946 | ms/batch 675.92 | loss 3.79 | ppl 44.272 +| epoch 2 step 15400 | 3930 batches | lr 0.000944 | ms/batch 680.85 | loss 3.78 | ppl 43.873 +| epoch 2 step 15600 | 4130 batches | lr 0.000943 | ms/batch 676.88 | loss 3.77 | ppl 43.466 +| epoch 2 step 15800 | 4330 batches | lr 0.000941 | ms/batch 690.26 | loss 3.78 | ppl 43.828 +| epoch 2 step 16000 | 4530 batches | lr 0.00094 | ms/batch 681.76 | loss 3.78 | ppl 43.855 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 2785.52s | valid loss 3.68 | valid ppl 39.575 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.000938 | ms/batch 761.98 | loss 3.74 | ppl 41.963 +| epoch 2 step 16400 | 4930 batches | lr 0.000937 | ms/batch 719.77 | loss 3.76 | ppl 42.816 +| epoch 2 step 16600 | 5130 batches | lr 0.000935 | ms/batch 682.43 | loss 3.75 | ppl 42.488 +| epoch 2 step 16800 | 5330 batches | lr 0.000934 | ms/batch 678.56 | loss 3.74 | ppl 42.072 +| epoch 2 step 17000 | 5530 batches | lr 0.000932 | ms/batch 702.18 | loss 3.73 | ppl 41.580 +| epoch 2 step 17200 | 5730 batches | lr 0.000931 | ms/batch 693.54 | loss 3.75 | ppl 42.350 +| epoch 2 step 17400 | 5930 batches | lr 0.000929 | ms/batch 682.69 | loss 3.73 | ppl 41.637 +| epoch 2 step 17600 | 6130 batches | lr 0.000927 | ms/batch 702.62 | loss 3.72 | ppl 41.292 +| epoch 2 step 17800 | 6330 batches | lr 0.000926 | ms/batch 676.86 | loss 3.75 | ppl 42.496 +| epoch 2 step 18000 | 6530 batches | lr 0.000924 | ms/batch 686.50 | loss 3.69 | ppl 40.096 +| epoch 2 step 18200 | 6730 batches | lr 0.000922 | ms/batch 678.10 | loss 3.70 | ppl 40.308 +| epoch 2 step 18400 | 6930 batches | lr 0.00092 | ms/batch 703.33 | loss 3.71 | ppl 40.840 +| epoch 2 step 18600 | 7130 batches | lr 0.000919 | ms/batch 690.96 | loss 3.69 | ppl 39.977 +| epoch 2 step 18800 | 7330 batches | lr 0.000917 | ms/batch 746.79 | loss 3.67 | ppl 39.106 +| epoch 2 step 19000 | 7530 batches | lr 0.000915 | ms/batch 676.15 | loss 3.69 | ppl 40.078 +| epoch 2 step 19200 | 7730 batches | lr 0.000913 | ms/batch 707.35 | loss 3.69 | ppl 40.034 +| epoch 2 step 19400 | 7930 batches | lr 0.000912 | ms/batch 674.04 | loss 3.68 | ppl 39.801 +| epoch 2 step 19600 | 8130 batches | lr 0.00091 | ms/batch 709.95 | loss 3.70 | ppl 40.300 +| epoch 2 step 19800 | 8330 batches | lr 0.000908 | ms/batch 685.00 | loss 3.69 | ppl 39.868 +| epoch 2 step 20000 | 8530 batches | lr 0.000906 | ms/batch 706.46 | loss 3.67 | ppl 39.391 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 | time: 2788.84s | valid loss 3.60 | valid ppl 36.475 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000904 | ms/batch 752.81 | loss 3.69 | ppl 40.136 +| epoch 2 step 20400 | 8930 batches | lr 0.000902 | ms/batch 688.44 | loss 3.69 | ppl 39.976 +| epoch 2 step 20600 | 9130 batches | lr 0.000901 | ms/batch 690.82 | loss 3.68 | ppl 39.641 +| epoch 2 step 20800 | 9330 batches | lr 0.000899 | ms/batch 698.88 
| loss 3.67 | ppl 39.207 +| epoch 2 step 21000 | 9530 batches | lr 0.000897 | ms/batch 700.37 | loss 3.71 | ppl 40.939 +| epoch 2 step 21200 | 9730 batches | lr 0.000895 | ms/batch 675.10 | loss 3.66 | ppl 38.940 +| epoch 2 step 21400 | 9930 batches | lr 0.000893 | ms/batch 694.48 | loss 3.67 | ppl 39.373 +| epoch 2 step 21600 | 10130 batches | lr 0.000891 | ms/batch 684.69 | loss 3.66 | ppl 38.760 +| epoch 2 step 21800 | 10330 batches | lr 0.000889 | ms/batch 729.00 | loss 3.67 | ppl 39.128 +| epoch 2 step 22000 | 10530 batches | lr 0.000887 | ms/batch 710.08 | loss 3.68 | ppl 39.746 +| epoch 2 step 22200 | 10730 batches | lr 0.000885 | ms/batch 693.05 | loss 3.65 | ppl 38.365 +| epoch 2 step 22400 | 10930 batches | lr 0.000883 | ms/batch 698.33 | loss 3.65 | ppl 38.293 +| epoch 2 step 22600 | 11130 batches | lr 0.000881 | ms/batch 713.05 | loss 3.69 | ppl 40.048 +| epoch 2 step 22800 | 11330 batches | lr 0.000879 | ms/batch 673.93 | loss 3.66 | ppl 38.769 +| epoch 3 step 23000 | 60 batches | lr 0.000877 | ms/batch 695.65 | loss 3.66 | ppl 38.901 +| epoch 3 step 23200 | 260 batches | lr 0.000875 | ms/batch 671.63 | loss 3.62 | ppl 37.173 +| epoch 3 step 23400 | 460 batches | lr 0.000873 | ms/batch 692.68 | loss 3.66 | ppl 38.720 +| epoch 3 step 23600 | 660 batches | lr 0.00087 | ms/batch 696.22 | loss 3.62 | ppl 37.317 +| epoch 3 step 23800 | 860 batches | lr 0.000868 | ms/batch 691.28 | loss 3.65 | ppl 38.609 +| epoch 3 step 24000 | 1060 batches | lr 0.000866 | ms/batch 699.25 | loss 3.64 | ppl 38.097 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 2785.75s | valid loss 3.55 | valid ppl 34.856 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000864 | ms/batch 771.85 | loss 3.63 | ppl 37.667 +| epoch 3 step 24400 | 1460 batches | lr 0.000862 | ms/batch 678.13 | loss 3.63 | ppl 37.615 +| epoch 3 step 24600 | 1660 batches | lr 0.00086 | ms/batch 676.14 | loss 3.62 | ppl 37.282 +| epoch 3 step 24800 | 1860 batches | lr 0.000857 | ms/batch 728.81 | loss 3.62 | ppl 37.511 +| epoch 3 step 25000 | 2060 batches | lr 0.000855 | ms/batch 694.21 | loss 3.66 | ppl 39.016 +| epoch 3 step 25200 | 2260 batches | lr 0.000853 | ms/batch 724.01 | loss 3.64 | ppl 37.938 +| epoch 3 step 25400 | 2460 batches | lr 0.000851 | ms/batch 678.12 | loss 3.62 | ppl 37.370 +| epoch 3 step 25600 | 2660 batches | lr 0.000848 | ms/batch 696.01 | loss 3.62 | ppl 37.468 +| epoch 3 step 25800 | 2860 batches | lr 0.000846 | ms/batch 694.04 | loss 3.56 | ppl 35.299 +| epoch 3 step 26000 | 3060 batches | lr 0.000844 | ms/batch 711.11 | loss 3.61 | ppl 37.126 +| epoch 3 step 26200 | 3260 batches | lr 0.000842 | ms/batch 723.43 | loss 3.61 | ppl 36.969 +| epoch 3 step 26400 | 3460 batches | lr 0.000839 | ms/batch 720.20 | loss 3.57 | ppl 35.667 +| epoch 3 step 26600 | 3660 batches | lr 0.000837 | ms/batch 684.79 | loss 3.59 | ppl 36.147 +| epoch 3 step 26800 | 3860 batches | lr 0.000835 | ms/batch 701.18 | loss 3.59 | ppl 36.331 +| epoch 3 step 27000 | 4060 batches | lr 0.000832 | ms/batch 706.21 | loss 3.60 | ppl 36.676 +| epoch 3 step 27200 | 4260 batches | lr 0.00083 | ms/batch 714.36 | loss 3.59 | ppl 36.233 +| epoch 3 step 27400 | 4460 batches | lr 0.000827 | ms/batch 692.59 | loss 3.59 | ppl 36.376 +| epoch 3 step 27600 | 4660 batches | lr 0.000825 | ms/batch 711.44 | loss 3.58 | ppl 35.999 +| epoch 3 step 27800 | 4860 batches | lr 0.000823 | 
ms/batch 728.11 | loss 3.57 | ppl 35.621 +| epoch 3 step 28000 | 5060 batches | lr 0.00082 | ms/batch 692.62 | loss 3.59 | ppl 36.065 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 2821.18s | valid loss 3.51 | valid ppl 33.444 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000818 | ms/batch 784.83 | loss 3.57 | ppl 35.469 +| epoch 3 step 28400 | 5460 batches | lr 0.000815 | ms/batch 676.58 | loss 3.55 | ppl 34.677 +| epoch 3 step 28600 | 5660 batches | lr 0.000813 | ms/batch 693.09 | loss 3.60 | ppl 36.443 +| epoch 3 step 28800 | 5860 batches | lr 0.00081 | ms/batch 692.23 | loss 3.57 | ppl 35.440 +| epoch 3 step 29000 | 6060 batches | lr 0.000808 | ms/batch 694.47 | loss 3.56 | ppl 35.226 +| epoch 3 step 29200 | 6260 batches | lr 0.000805 | ms/batch 679.24 | loss 3.56 | ppl 35.224 +| epoch 3 step 29400 | 6460 batches | lr 0.000803 | ms/batch 705.43 | loss 3.57 | ppl 35.528 +| epoch 3 step 29600 | 6660 batches | lr 0.0008 | ms/batch 716.64 | loss 3.52 | ppl 33.679 +| epoch 3 step 29800 | 6860 batches | lr 0.000798 | ms/batch 711.33 | loss 3.55 | ppl 34.776 +| epoch 3 step 30000 | 7060 batches | lr 0.000795 | ms/batch 730.14 | loss 3.54 | ppl 34.480 +| epoch 3 step 30200 | 7260 batches | lr 0.000793 | ms/batch 709.85 | loss 3.51 | ppl 33.497 +| epoch 3 step 30400 | 7460 batches | lr 0.00079 | ms/batch 685.34 | loss 3.54 | ppl 34.308 +| epoch 3 step 30600 | 7660 batches | lr 0.000788 | ms/batch 706.36 | loss 3.52 | ppl 33.834 +| epoch 3 step 30800 | 7860 batches | lr 0.000785 | ms/batch 699.03 | loss 3.53 | ppl 34.222 +| epoch 3 step 31000 | 8060 batches | lr 0.000783 | ms/batch 720.24 | loss 3.54 | ppl 34.453 +| epoch 3 step 31200 | 8260 batches | lr 0.00078 | ms/batch 673.26 | loss 3.53 | ppl 34.066 +| epoch 3 step 31400 | 8460 batches | lr 0.000777 | ms/batch 694.72 | loss 3.54 | ppl 34.454 +| epoch 3 step 31600 | 8660 batches | lr 0.000775 | ms/batch 708.28 | loss 3.53 | ppl 34.274 +| epoch 3 step 31800 | 8860 batches | lr 0.000772 | ms/batch 682.86 | loss 3.54 | ppl 34.392 +| epoch 3 step 32000 | 9060 batches | lr 0.000769 | ms/batch 688.85 | loss 3.54 | ppl 34.370 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 2806.41s | valid loss 3.46 | valid ppl 31.891 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000767 | ms/batch 786.16 | loss 3.52 | ppl 33.871 +| epoch 3 step 32400 | 9460 batches | lr 0.000764 | ms/batch 725.79 | loss 3.54 | ppl 34.633 +| epoch 3 step 32600 | 9660 batches | lr 0.000761 | ms/batch 700.74 | loss 3.54 | ppl 34.622 +| epoch 3 step 32800 | 9860 batches | lr 0.000759 | ms/batch 688.71 | loss 3.50 | ppl 33.131 +| epoch 3 step 33000 | 10060 batches | lr 0.000756 | ms/batch 714.76 | loss 3.55 | ppl 34.776 +| epoch 3 step 33200 | 10260 batches | lr 0.000753 | ms/batch 707.51 | loss 3.50 | ppl 32.988 +| epoch 3 step 33400 | 10460 batches | lr 0.000751 | ms/batch 683.71 | loss 3.53 | ppl 34.236 +| epoch 3 step 33600 | 10660 batches | lr 0.000748 | ms/batch 719.18 | loss 3.54 | ppl 34.467 +| epoch 3 step 33800 | 10860 batches | lr 0.000745 | ms/batch 745.78 | loss 3.49 | ppl 32.814 +| epoch 3 step 34000 | 11060 batches | lr 0.000742 | ms/batch 710.58 | loss 3.53 | ppl 34.283 +| epoch 3 step 34200 | 11260 
batches | lr 0.00074 | ms/batch 694.54 | loss 3.54 | ppl 34.583 +| epoch 3 step 34400 | 11460 batches | lr 0.000737 | ms/batch 688.33 | loss 3.51 | ppl 33.583 +| epoch 4 step 34600 | 190 batches | lr 0.000734 | ms/batch 682.61 | loss 3.49 | ppl 32.864 +| epoch 4 step 34800 | 390 batches | lr 0.000731 | ms/batch 713.82 | loss 3.50 | ppl 33.187 +| epoch 4 step 35000 | 590 batches | lr 0.000728 | ms/batch 709.46 | loss 3.49 | ppl 32.943 +| epoch 4 step 35200 | 790 batches | lr 0.000726 | ms/batch 684.47 | loss 3.51 | ppl 33.445 +| epoch 4 step 35400 | 990 batches | lr 0.000723 | ms/batch 721.54 | loss 3.49 | ppl 32.743 +| epoch 4 step 35600 | 1190 batches | lr 0.00072 | ms/batch 705.58 | loss 3.51 | ppl 33.363 +| epoch 4 step 35800 | 1390 batches | lr 0.000717 | ms/batch 715.79 | loss 3.50 | ppl 32.989 +| epoch 4 step 36000 | 1590 batches | lr 0.000714 | ms/batch 707.76 | loss 3.48 | ppl 32.568 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 2837.19s | valid loss 3.44 | valid ppl 31.101 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.000711 | ms/batch 744.09 | loss 3.49 | ppl 32.869 +| epoch 4 step 36400 | 1990 batches | lr 0.000709 | ms/batch 685.71 | loss 3.52 | ppl 33.861 +| epoch 4 step 36600 | 2190 batches | lr 0.000706 | ms/batch 702.84 | loss 3.51 | ppl 33.326 +| epoch 4 step 36800 | 2390 batches | lr 0.000703 | ms/batch 705.87 | loss 3.51 | ppl 33.286 +| epoch 4 step 37000 | 2590 batches | lr 0.0007 | ms/batch 693.72 | loss 3.48 | ppl 32.465 +| epoch 4 step 37200 | 2790 batches | lr 0.000697 | ms/batch 699.40 | loss 3.46 | ppl 31.888 +| epoch 4 step 37400 | 2990 batches | lr 0.000694 | ms/batch 697.96 | loss 3.48 | ppl 32.390 +| epoch 4 step 37600 | 3190 batches | lr 0.000691 | ms/batch 679.96 | loss 3.48 | ppl 32.335 +| epoch 4 step 37800 | 3390 batches | lr 0.000688 | ms/batch 692.96 | loss 3.48 | ppl 32.327 +| epoch 4 step 38000 | 3590 batches | lr 0.000685 | ms/batch 719.86 | loss 3.45 | ppl 31.410 +| epoch 4 step 38200 | 3790 batches | lr 0.000682 | ms/batch 708.23 | loss 3.47 | ppl 32.106 +| epoch 4 step 38400 | 3990 batches | lr 0.000679 | ms/batch 713.26 | loss 3.48 | ppl 32.539 +| epoch 4 step 38600 | 4190 batches | lr 0.000677 | ms/batch 720.48 | loss 3.46 | ppl 31.968 +| epoch 4 step 38800 | 4390 batches | lr 0.000674 | ms/batch 706.09 | loss 3.47 | ppl 32.081 +| epoch 4 step 39000 | 4590 batches | lr 0.000671 | ms/batch 706.32 | loss 3.48 | ppl 32.534 +| epoch 4 step 39200 | 4790 batches | lr 0.000668 | ms/batch 724.90 | loss 3.44 | ppl 31.078 +| epoch 4 step 39400 | 4990 batches | lr 0.000665 | ms/batch 684.94 | loss 3.49 | ppl 32.633 +| epoch 4 step 39600 | 5190 batches | lr 0.000662 | ms/batch 687.24 | loss 3.44 | ppl 31.273 +| epoch 4 step 39800 | 5390 batches | lr 0.000659 | ms/batch 721.71 | loss 3.42 | ppl 30.694 +| epoch 4 step 40000 | 5590 batches | lr 0.000656 | ms/batch 697.69 | loss 3.45 | ppl 31.450 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 2814.33s | valid loss 3.41 | valid ppl 30.132 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000653 | ms/batch 754.92 | loss 3.47 | ppl 32.025 +| epoch 4 step 40400 | 5990 batches | lr 0.00065 | ms/batch 694.46 | loss 3.44 | ppl 31.158 +| epoch 4 step 
40600 | 6190 batches | lr 0.000647 | ms/batch 676.98 | loss 3.44 | ppl 31.171 +| epoch 4 step 40800 | 6390 batches | lr 0.000644 | ms/batch 689.04 | loss 3.47 | ppl 32.015 +| epoch 4 step 41000 | 6590 batches | lr 0.000641 | ms/batch 685.40 | loss 3.40 | ppl 30.022 +| epoch 4 step 41200 | 6790 batches | lr 0.000638 | ms/batch 747.15 | loss 3.43 | ppl 30.725 +| epoch 4 step 41400 | 6990 batches | lr 0.000635 | ms/batch 705.11 | loss 3.44 | ppl 31.182 +| epoch 4 step 41600 | 7190 batches | lr 0.000632 | ms/batch 696.98 | loss 3.39 | ppl 29.650 +| epoch 4 step 41800 | 7390 batches | lr 0.000629 | ms/batch 702.79 | loss 3.42 | ppl 30.476 +| epoch 4 step 42000 | 7590 batches | lr 0.000626 | ms/batch 695.10 | loss 3.39 | ppl 29.763 +| epoch 4 step 42200 | 7790 batches | lr 0.000622 | ms/batch 715.71 | loss 3.42 | ppl 30.681 +| epoch 4 step 42400 | 7990 batches | lr 0.000619 | ms/batch 741.98 | loss 3.42 | ppl 30.604 +| epoch 4 step 42600 | 8190 batches | lr 0.000616 | ms/batch 705.83 | loss 3.41 | ppl 30.193 +| epoch 4 step 42800 | 8390 batches | lr 0.000613 | ms/batch 712.28 | loss 3.44 | ppl 31.079 +| epoch 4 step 43000 | 8590 batches | lr 0.00061 | ms/batch 724.30 | loss 3.41 | ppl 30.299 +| epoch 4 step 43200 | 8790 batches | lr 0.000607 | ms/batch 719.79 | loss 3.43 | ppl 30.914 +| epoch 4 step 43400 | 8990 batches | lr 0.000604 | ms/batch 699.25 | loss 3.42 | ppl 30.455 +| epoch 4 step 43600 | 9190 batches | lr 0.000601 | ms/batch 685.74 | loss 3.41 | ppl 30.187 +| epoch 4 step 43800 | 9390 batches | lr 0.000598 | ms/batch 719.13 | loss 3.42 | ppl 30.441 +| epoch 4 step 44000 | 9590 batches | lr 0.000595 | ms/batch 753.12 | loss 3.44 | ppl 31.043 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 2840.79s | valid loss 3.37 | valid ppl 29.010 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 0.000592 | ms/batch 773.20 | loss 3.41 | ppl 30.168 +| epoch 4 step 44400 | 9990 batches | lr 0.000589 | ms/batch 694.87 | loss 3.41 | ppl 30.196 +| epoch 4 step 44600 | 10190 batches | lr 0.000586 | ms/batch 724.33 | loss 3.40 | ppl 29.936 +| epoch 4 step 44800 | 10390 batches | lr 0.000582 | ms/batch 701.37 | loss 3.40 | ppl 30.038 +| epoch 4 step 45000 | 10590 batches | lr 0.000579 | ms/batch 724.47 | loss 3.43 | ppl 30.942 +| epoch 4 step 45200 | 10790 batches | lr 0.000576 | ms/batch 700.16 | loss 3.38 | ppl 29.477 +| epoch 4 step 45400 | 10990 batches | lr 0.000573 | ms/batch 699.42 | loss 3.42 | ppl 30.491 +| epoch 4 step 45600 | 11190 batches | lr 0.00057 | ms/batch 697.52 | loss 3.42 | ppl 30.633 +| epoch 4 step 45800 | 11390 batches | lr 0.000567 | ms/batch 716.39 | loss 3.41 | ppl 30.406 +| epoch 5 step 46000 | 120 batches | lr 0.000564 | ms/batch 697.18 | loss 3.39 | ppl 29.776 +| epoch 5 step 46200 | 320 batches | lr 0.000561 | ms/batch 688.95 | loss 3.38 | ppl 29.331 +| epoch 5 step 46400 | 520 batches | lr 0.000557 | ms/batch 702.04 | loss 3.41 | ppl 30.334 +| epoch 5 step 46600 | 720 batches | lr 0.000554 | ms/batch 714.74 | loss 3.37 | ppl 29.146 +| epoch 5 step 46800 | 920 batches | lr 0.000551 | ms/batch 694.28 | loss 3.38 | ppl 29.263 +| epoch 5 step 47000 | 1120 batches | lr 0.000548 | ms/batch 691.20 | loss 3.41 | ppl 30.380 +| epoch 5 step 47200 | 1320 batches | lr 0.000545 | ms/batch 709.55 | loss 3.38 | ppl 29.299 +| epoch 5 step 47400 | 1520 batches | lr 0.000542 | ms/batch 715.69 | loss 3.38 | ppl 
29.302 +| epoch 5 step 47600 | 1720 batches | lr 0.000539 | ms/batch 703.59 | loss 3.37 | ppl 29.087 +| epoch 5 step 47800 | 1920 batches | lr 0.000536 | ms/batch 684.68 | loss 3.40 | ppl 29.883 +| epoch 5 step 48000 | 2120 batches | lr 0.000532 | ms/batch 705.81 | loss 3.41 | ppl 30.359 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 2823.57s | valid loss 3.34 | valid ppl 28.152 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 48200 | 2320 batches | lr 0.000529 | ms/batch 771.37 | loss 3.39 | ppl 29.735 +| epoch 5 step 48400 | 2520 batches | lr 0.000526 | ms/batch 724.35 | loss 3.38 | ppl 29.266 +| epoch 5 step 48600 | 2720 batches | lr 0.000523 | ms/batch 709.33 | loss 3.36 | ppl 28.891 +| epoch 5 step 48800 | 2920 batches | lr 0.00052 | ms/batch 716.29 | loss 3.35 | ppl 28.605 +| epoch 5 step 49000 | 3120 batches | lr 0.000517 | ms/batch 701.20 | loss 3.37 | ppl 29.121 +| epoch 5 step 49200 | 3320 batches | lr 0.000514 | ms/batch 717.37 | loss 3.38 | ppl 29.440 +| epoch 5 step 49400 | 3520 batches | lr 0.00051 | ms/batch 687.15 | loss 3.34 | ppl 28.306 +| epoch 5 step 49600 | 3720 batches | lr 0.000507 | ms/batch 706.52 | loss 3.37 | ppl 29.021 +| epoch 5 step 49800 | 3920 batches | lr 0.000504 | ms/batch 722.49 | loss 3.36 | ppl 28.862 +| epoch 5 step 50000 | 4120 batches | lr 0.000501 | ms/batch 714.17 | loss 3.36 | ppl 28.886 +| epoch 5 step 50200 | 4320 batches | lr 0.000498 | ms/batch 685.39 | loss 3.37 | ppl 28.957 +| epoch 5 step 50400 | 4520 batches | lr 0.000495 | ms/batch 715.33 | loss 3.38 | ppl 29.372 +| epoch 5 step 50600 | 4720 batches | lr 0.000492 | ms/batch 718.29 | loss 3.34 | ppl 28.187 +| epoch 5 step 50800 | 4920 batches | lr 0.000488 | ms/batch 717.46 | loss 3.35 | ppl 28.583 +| epoch 5 step 51000 | 5120 batches | lr 0.000485 | ms/batch 722.98 | loss 3.35 | ppl 28.452 +| epoch 5 step 51200 | 5320 batches | lr 0.000482 | ms/batch 730.83 | loss 3.34 | ppl 28.284 +| epoch 5 step 51400 | 5520 batches | lr 0.000479 | ms/batch 705.06 | loss 3.34 | ppl 28.130 +| epoch 5 step 51600 | 5720 batches | lr 0.000476 | ms/batch 736.14 | loss 3.35 | ppl 28.474 +| epoch 5 step 51800 | 5920 batches | lr 0.000473 | ms/batch 709.48 | loss 3.35 | ppl 28.381 +| epoch 5 step 52000 | 6120 batches | lr 0.000469 | ms/batch 719.02 | loss 3.34 | ppl 28.123 +---------------------------------------------------------------------------------------------------- +| Eval 13 at step 52000 | time: 2861.73s | valid loss 3.32 | valid ppl 27.651 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 52200 | 6320 batches | lr 0.000466 | ms/batch 795.83 | loss 3.36 | ppl 28.824 +| epoch 5 step 52400 | 6520 batches | lr 0.000463 | ms/batch 697.32 | loss 3.30 | ppl 27.207 +| epoch 5 step 52600 | 6720 batches | lr 0.00046 | ms/batch 724.64 | loss 3.31 | ppl 27.379 +| epoch 5 step 52800 | 6920 batches | lr 0.000457 | ms/batch 734.21 | loss 3.33 | ppl 27.948 +| epoch 5 step 53000 | 7120 batches | lr 0.000454 | ms/batch 707.81 | loss 3.31 | ppl 27.522 +| epoch 5 step 53200 | 7320 batches | lr 0.000451 | ms/batch 704.60 | loss 3.28 | ppl 26.696 +| epoch 5 step 53400 | 7520 batches | lr 0.000448 | ms/batch 729.67 | loss 3.32 | ppl 27.541 +| epoch 5 step 53600 | 7720 batches | lr 0.000444 | ms/batch 709.88 | loss 3.31 | ppl 27.326 +| epoch 5 step 53800 | 7920 batches | lr 0.000441 | ms/batch 722.95 | 
loss 3.31 | ppl 27.348 +| epoch 5 step 54000 | 8120 batches | lr 0.000438 | ms/batch 728.94 | loss 3.32 | ppl 27.682 +| epoch 5 step 54200 | 8320 batches | lr 0.000435 | ms/batch 706.14 | loss 3.31 | ppl 27.518 +| epoch 5 step 54400 | 8520 batches | lr 0.000432 | ms/batch 723.15 | loss 3.30 | ppl 27.196 +| epoch 5 step 54600 | 8720 batches | lr 0.000429 | ms/batch 759.15 | loss 3.32 | ppl 27.670 +| epoch 5 step 54800 | 8920 batches | lr 0.000426 | ms/batch 692.95 | loss 3.32 | ppl 27.792 +| epoch 5 step 55000 | 9120 batches | lr 0.000423 | ms/batch 736.12 | loss 3.31 | ppl 27.454 +| epoch 5 step 55200 | 9320 batches | lr 0.000419 | ms/batch 709.42 | loss 3.30 | ppl 27.208 +| epoch 5 step 55400 | 9520 batches | lr 0.000416 | ms/batch 707.95 | loss 3.33 | ppl 28.072 +| epoch 5 step 55600 | 9720 batches | lr 0.000413 | ms/batch 691.25 | loss 3.30 | ppl 27.225 +| epoch 5 step 55800 | 9920 batches | lr 0.00041 | ms/batch 685.81 | loss 3.31 | ppl 27.293 +| epoch 5 step 56000 | 10120 batches | lr 0.000407 | ms/batch 709.93 | loss 3.30 | ppl 27.183 +---------------------------------------------------------------------------------------------------- +| Eval 14 at step 56000 | time: 2871.27s | valid loss 3.29 | valid ppl 26.758 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 56200 | 10320 batches | lr 0.000404 | ms/batch 784.81 | loss 3.31 | ppl 27.262 +| epoch 5 step 56400 | 10520 batches | lr 0.000401 | ms/batch 708.23 | loss 3.33 | ppl 27.876 +| epoch 5 step 56600 | 10720 batches | lr 0.000398 | ms/batch 718.78 | loss 3.29 | ppl 26.834 +| epoch 5 step 56800 | 10920 batches | lr 0.000395 | ms/batch 723.00 | loss 3.29 | ppl 26.727 +| epoch 5 step 57000 | 11120 batches | lr 0.000392 | ms/batch 730.49 | loss 3.34 | ppl 28.295 +| epoch 5 step 57200 | 11320 batches | lr 0.000389 | ms/batch 728.66 | loss 3.30 | ppl 27.060 +| epoch 6 step 57400 | 50 batches | lr 0.000386 | ms/batch 693.11 | loss 3.32 | ppl 27.563 +| epoch 6 step 57600 | 250 batches | lr 0.000382 | ms/batch 714.89 | loss 3.27 | ppl 26.241 +| epoch 6 step 57800 | 450 batches | lr 0.000379 | ms/batch 727.56 | loss 3.31 | ppl 27.269 +| epoch 6 step 58000 | 650 batches | lr 0.000376 | ms/batch 714.18 | loss 3.27 | ppl 26.327 +| epoch 6 step 58200 | 850 batches | lr 0.000373 | ms/batch 737.04 | loss 3.31 | ppl 27.365 +| epoch 6 step 58400 | 1050 batches | lr 0.00037 | ms/batch 722.31 | loss 3.28 | ppl 26.671 +| epoch 6 step 58600 | 1250 batches | lr 0.000367 | ms/batch 718.13 | loss 3.28 | ppl 26.642 +| epoch 6 step 58800 | 1450 batches | lr 0.000364 | ms/batch 758.91 | loss 3.29 | ppl 26.793 +| epoch 6 step 59000 | 1650 batches | lr 0.000361 | ms/batch 744.06 | loss 3.27 | ppl 26.246 +| epoch 6 step 59200 | 1850 batches | lr 0.000358 | ms/batch 737.10 | loss 3.28 | ppl 26.644 +| epoch 6 step 59400 | 2050 batches | lr 0.000355 | ms/batch 722.53 | loss 3.32 | ppl 27.782 +| epoch 6 step 59600 | 2250 batches | lr 0.000352 | ms/batch 738.70 | loss 3.29 | ppl 26.834 +| epoch 6 step 59800 | 2450 batches | lr 0.000349 | ms/batch 740.37 | loss 3.29 | ppl 26.765 +| epoch 6 step 60000 | 2650 batches | lr 0.000346 | ms/batch 722.84 | loss 3.29 | ppl 26.752 +---------------------------------------------------------------------------------------------------- +| Eval 15 at step 60000 | time: 2912.80s | valid loss 3.27 | valid ppl 26.281 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 60200 | 2850 batches | lr 0.000343 | 
ms/batch 774.99 | loss 3.23 | ppl 25.400 +| epoch 6 step 60400 | 3050 batches | lr 0.00034 | ms/batch 736.04 | loss 3.28 | ppl 26.615 +| epoch 6 step 60600 | 3250 batches | lr 0.000337 | ms/batch 723.86 | loss 3.27 | ppl 26.433 +| epoch 6 step 60800 | 3450 batches | lr 0.000334 | ms/batch 699.97 | loss 3.26 | ppl 25.944 +| epoch 6 step 61000 | 3650 batches | lr 0.000331 | ms/batch 699.08 | loss 3.26 | ppl 25.978 +| epoch 6 step 61200 | 3850 batches | lr 0.000328 | ms/batch 728.93 | loss 3.26 | ppl 26.106 +| epoch 6 step 61400 | 4050 batches | lr 0.000325 | ms/batch 698.87 | loss 3.28 | ppl 26.608 +| epoch 6 step 61600 | 4250 batches | lr 0.000322 | ms/batch 700.55 | loss 3.26 | ppl 26.047 +| epoch 6 step 61800 | 4450 batches | lr 0.000319 | ms/batch 743.96 | loss 3.27 | ppl 26.276 +| epoch 6 step 62000 | 4650 batches | lr 0.000317 | ms/batch 728.97 | loss 3.26 | ppl 26.099 +| epoch 6 step 62200 | 4850 batches | lr 0.000314 | ms/batch 731.16 | loss 3.25 | ppl 25.752 +| epoch 6 step 62400 | 5050 batches | lr 0.000311 | ms/batch 719.64 | loss 3.26 | ppl 26.134 +| epoch 6 step 62600 | 5250 batches | lr 0.000308 | ms/batch 760.40 | loss 3.25 | ppl 25.803 +| epoch 6 step 62800 | 5450 batches | lr 0.000305 | ms/batch 721.34 | loss 3.23 | ppl 25.210 +| epoch 6 step 63000 | 5650 batches | lr 0.000302 | ms/batch 717.89 | loss 3.27 | ppl 26.336 +| epoch 6 step 63200 | 5850 batches | lr 0.000299 | ms/batch 725.35 | loss 3.25 | ppl 25.735 +| epoch 6 step 63400 | 6050 batches | lr 0.000296 | ms/batch 686.94 | loss 3.24 | ppl 25.469 +| epoch 6 step 63600 | 6250 batches | lr 0.000293 | ms/batch 716.59 | loss 3.25 | ppl 25.788 +| epoch 6 step 63800 | 6450 batches | lr 0.000291 | ms/batch 707.89 | loss 3.25 | ppl 25.795 +| epoch 6 step 64000 | 6650 batches | lr 0.000288 | ms/batch 727.95 | loss 3.20 | ppl 24.511 +---------------------------------------------------------------------------------------------------- +| Eval 16 at step 64000 | time: 2885.83s | valid loss 3.25 | valid ppl 25.737 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 64200 | 6850 batches | lr 0.000285 | ms/batch 779.72 | loss 3.23 | ppl 25.290 +| epoch 6 step 64400 | 7050 batches | lr 0.000282 | ms/batch 687.37 | loss 3.23 | ppl 25.262 +| epoch 6 step 64600 | 7250 batches | lr 0.000279 | ms/batch 746.50 | loss 3.19 | ppl 24.366 +| epoch 6 step 64800 | 7450 batches | lr 0.000276 | ms/batch 718.93 | loss 3.22 | ppl 24.984 +| epoch 6 step 65000 | 7650 batches | lr 0.000274 | ms/batch 726.70 | loss 3.20 | ppl 24.541 +| epoch 6 step 65200 | 7850 batches | lr 0.000271 | ms/batch 719.23 | loss 3.22 | ppl 25.018 +| epoch 6 step 65400 | 8050 batches | lr 0.000268 | ms/batch 711.20 | loss 3.23 | ppl 25.214 +| epoch 6 step 65600 | 8250 batches | lr 0.000265 | ms/batch 717.61 | loss 3.21 | ppl 24.835 +| epoch 6 step 65800 | 8450 batches | lr 0.000262 | ms/batch 728.49 | loss 3.23 | ppl 25.206 +| epoch 6 step 66000 | 8650 batches | lr 0.00026 | ms/batch 730.31 | loss 3.21 | ppl 24.890 +| epoch 6 step 66200 | 8850 batches | lr 0.000257 | ms/batch 692.18 | loss 3.24 | ppl 25.410 +| epoch 6 step 66400 | 9050 batches | lr 0.000254 | ms/batch 735.80 | loss 3.22 | ppl 25.128 +| epoch 6 step 66600 | 9250 batches | lr 0.000251 | ms/batch 726.67 | loss 3.21 | ppl 24.728 +| epoch 6 step 66800 | 9450 batches | lr 0.000249 | ms/batch 691.71 | loss 3.23 | ppl 25.201 +| epoch 6 step 67000 | 9650 batches | lr 0.000246 | ms/batch 716.45 | loss 3.24 | ppl 25.548 +| epoch 6 step 67200 | 9850 batches 
| lr 0.000243 | ms/batch 721.99 | loss 3.19 | ppl 24.247 +| epoch 6 step 67400 | 10050 batches | lr 0.000241 | ms/batch 732.11 | loss 3.24 | ppl 25.416 +| epoch 6 step 67600 | 10250 batches | lr 0.000238 | ms/batch 732.60 | loss 3.19 | ppl 24.382 +| epoch 6 step 67800 | 10450 batches | lr 0.000235 | ms/batch 738.25 | loss 3.22 | ppl 25.058 +| epoch 6 step 68000 | 10650 batches | lr 0.000233 | ms/batch 728.29 | loss 3.23 | ppl 25.388 +---------------------------------------------------------------------------------------------------- +| Eval 17 at step 68000 | time: 2892.01s | valid loss 3.23 | valid ppl 25.318 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 68200 | 10850 batches | lr 0.00023 | ms/batch 761.27 | loss 3.18 | ppl 24.097 +| epoch 6 step 68400 | 11050 batches | lr 0.000227 | ms/batch 706.40 | loss 3.23 | ppl 25.283 +| epoch 6 step 68600 | 11250 batches | lr 0.000225 | ms/batch 763.81 | loss 3.24 | ppl 25.592 +| epoch 6 step 68800 | 11450 batches | lr 0.000222 | ms/batch 724.69 | loss 3.21 | ppl 24.756 +| epoch 7 step 69000 | 180 batches | lr 0.000219 | ms/batch 725.10 | loss 3.19 | ppl 24.390 +| epoch 7 step 69200 | 380 batches | lr 0.000217 | ms/batch 719.68 | loss 3.20 | ppl 24.464 +| epoch 7 step 69400 | 580 batches | lr 0.000214 | ms/batch 712.69 | loss 3.20 | ppl 24.451 +| epoch 7 step 69600 | 780 batches | lr 0.000212 | ms/batch 725.29 | loss 3.20 | ppl 24.622 +| epoch 7 step 69800 | 980 batches | lr 0.000209 | ms/batch 732.38 | loss 3.18 | ppl 24.086 +| epoch 7 step 70000 | 1180 batches | lr 0.000206 | ms/batch 744.68 | loss 3.21 | ppl 24.853 +| epoch 7 step 70200 | 1380 batches | lr 0.000204 | ms/batch 698.30 | loss 3.19 | ppl 24.298 +| epoch 7 step 70400 | 1580 batches | lr 0.000201 | ms/batch 693.41 | loss 3.19 | ppl 24.256 +| epoch 7 step 70600 | 1780 batches | lr 0.000199 | ms/batch 727.91 | loss 3.19 | ppl 24.231 +| epoch 7 step 70800 | 1980 batches | lr 0.000196 | ms/batch 689.58 | loss 3.22 | ppl 25.011 +| epoch 7 step 71000 | 2180 batches | lr 0.000194 | ms/batch 722.72 | loss 3.21 | ppl 24.789 +| epoch 7 step 71200 | 2380 batches | lr 0.000191 | ms/batch 720.35 | loss 3.20 | ppl 24.643 +| epoch 7 step 71400 | 2580 batches | lr 0.000189 | ms/batch 736.56 | loss 3.19 | ppl 24.315 +| epoch 7 step 71600 | 2780 batches | lr 0.000187 | ms/batch 713.16 | loss 3.17 | ppl 23.782 +| epoch 7 step 71800 | 2980 batches | lr 0.000184 | ms/batch 681.34 | loss 3.18 | ppl 24.050 +| epoch 7 step 72000 | 3180 batches | lr 0.000182 | ms/batch 712.65 | loss 3.19 | ppl 24.394 +---------------------------------------------------------------------------------------------------- +| Eval 18 at step 72000 | time: 2878.12s | valid loss 3.21 | valid ppl 24.850 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 72200 | 3380 batches | lr 0.000179 | ms/batch 749.92 | loss 3.19 | ppl 24.229 +| epoch 7 step 72400 | 3580 batches | lr 0.000177 | ms/batch 709.24 | loss 3.16 | ppl 23.648 +| epoch 7 step 72600 | 3780 batches | lr 0.000174 | ms/batch 732.91 | loss 3.18 | ppl 23.938 +| epoch 7 step 72800 | 3980 batches | lr 0.000172 | ms/batch 714.76 | loss 3.19 | ppl 24.213 +| epoch 7 step 73000 | 4180 batches | lr 0.00017 | ms/batch 719.33 | loss 3.18 | ppl 24.092 +| epoch 7 step 73200 | 4380 batches | lr 0.000167 | ms/batch 709.24 | loss 3.18 | ppl 24.057 +| epoch 7 step 73400 | 4580 batches | lr 0.000165 | ms/batch 750.40 | loss 3.20 | ppl 24.511 +| epoch 7 step 
73600 | 4780 batches | lr 0.000163 | ms/batch 732.09 | loss 3.15 | ppl 23.398 +| epoch 7 step 73800 | 4980 batches | lr 0.00016 | ms/batch 749.69 | loss 3.19 | ppl 24.322 +| epoch 7 step 74000 | 5180 batches | lr 0.000158 | ms/batch 732.47 | loss 3.16 | ppl 23.623 +| epoch 7 step 74200 | 5380 batches | lr 0.000156 | ms/batch 734.25 | loss 3.14 | ppl 23.147 +| epoch 7 step 74400 | 5580 batches | lr 0.000153 | ms/batch 705.61 | loss 3.16 | ppl 23.636 +| epoch 7 step 74600 | 5780 batches | lr 0.000151 | ms/batch 718.58 | loss 3.18 | ppl 24.164 +| epoch 7 step 74800 | 5980 batches | lr 0.000149 | ms/batch 718.67 | loss 3.16 | ppl 23.490 +| epoch 7 step 75000 | 6180 batches | lr 0.000147 | ms/batch 710.85 | loss 3.16 | ppl 23.495 +| epoch 7 step 75200 | 6380 batches | lr 0.000145 | ms/batch 724.50 | loss 3.19 | ppl 24.244 +| epoch 7 step 75400 | 6580 batches | lr 0.000142 | ms/batch 740.93 | loss 3.12 | ppl 22.548 +| epoch 7 step 75600 | 6780 batches | lr 0.00014 | ms/batch 745.37 | loss 3.15 | ppl 23.251 +| epoch 7 step 75800 | 6980 batches | lr 0.000138 | ms/batch 713.31 | loss 3.16 | ppl 23.564 +| epoch 7 step 76000 | 7180 batches | lr 0.000136 | ms/batch 720.59 | loss 3.11 | ppl 22.422 +---------------------------------------------------------------------------------------------------- +| Eval 19 at step 76000 | time: 2902.26s | valid loss 3.20 | valid ppl 24.479 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 76200 | 7380 batches | lr 0.000134 | ms/batch 762.44 | loss 3.14 | ppl 23.037 +| epoch 7 step 76400 | 7580 batches | lr 0.000131 | ms/batch 732.61 | loss 3.11 | ppl 22.458 +| epoch 7 step 76600 | 7780 batches | lr 0.000129 | ms/batch 695.86 | loss 3.15 | ppl 23.248 +| epoch 7 step 76800 | 7980 batches | lr 0.000127 | ms/batch 742.29 | loss 3.14 | ppl 23.190 +| epoch 7 step 77000 | 8180 batches | lr 0.000125 | ms/batch 752.96 | loss 3.13 | ppl 22.825 +| epoch 7 step 77200 | 8380 batches | lr 0.000123 | ms/batch 722.77 | loss 3.16 | ppl 23.556 +| epoch 7 step 77400 | 8580 batches | lr 0.000121 | ms/batch 719.94 | loss 3.14 | ppl 23.028 +| epoch 7 step 77600 | 8780 batches | lr 0.000119 | ms/batch 744.23 | loss 3.15 | ppl 23.304 +| epoch 7 step 77800 | 8980 batches | lr 0.000117 | ms/batch 750.43 | loss 3.15 | ppl 23.339 +| epoch 7 step 78000 | 9180 batches | lr 0.000115 | ms/batch 748.00 | loss 3.13 | ppl 22.849 +| epoch 7 step 78200 | 9380 batches | lr 0.000113 | ms/batch 748.11 | loss 3.15 | ppl 23.225 +| epoch 7 step 78400 | 9580 batches | lr 0.000111 | ms/batch 766.61 | loss 3.16 | ppl 23.632 +| epoch 7 step 78600 | 9780 batches | lr 0.000109 | ms/batch 760.63 | loss 3.14 | ppl 23.013 +| epoch 7 step 78800 | 9980 batches | lr 0.000107 | ms/batch 747.21 | loss 3.13 | ppl 22.924 +| epoch 7 step 79000 | 10180 batches | lr 0.000105 | ms/batch 735.24 | loss 3.13 | ppl 22.790 +| epoch 7 step 79200 | 10380 batches | lr 0.000103 | ms/batch 760.44 | loss 3.14 | ppl 23.063 +| epoch 7 step 79400 | 10580 batches | lr 0.000101 | ms/batch 758.52 | loss 3.16 | ppl 23.590 +| epoch 7 step 79600 | 10780 batches | lr 9.94e-05 | ms/batch 750.88 | loss 3.12 | ppl 22.600 +| epoch 7 step 79800 | 10980 batches | lr 9.75e-05 | ms/batch 754.39 | loss 3.14 | ppl 23.110 +| epoch 7 step 80000 | 11180 batches | lr 9.57e-05 | ms/batch 727.37 | loss 3.16 | ppl 23.628 +---------------------------------------------------------------------------------------------------- +| Eval 20 at step 80000 | time: 2972.05s | valid loss 3.18 | valid ppl 24.133 
+---------------------------------------------------------------------------------------------------- +| epoch 7 step 80200 | 11380 batches | lr 9.38e-05 | ms/batch 794.23 | loss 3.15 | ppl 23.294 +| epoch 8 step 80400 | 110 batches | lr 9.2e-05 | ms/batch 734.78 | loss 3.13 | ppl 22.874 +| epoch 8 step 80600 | 310 batches | lr 9.02e-05 | ms/batch 754.47 | loss 3.12 | ppl 22.589 +| epoch 8 step 80800 | 510 batches | lr 8.84e-05 | ms/batch 740.76 | loss 3.15 | ppl 23.330 +| epoch 8 step 81000 | 710 batches | lr 8.66e-05 | ms/batch 735.69 | loss 3.11 | ppl 22.359 +| epoch 8 step 81200 | 910 batches | lr 8.49e-05 | ms/batch 752.15 | loss 3.12 | ppl 22.600 +| epoch 8 step 81400 | 1110 batches | lr 8.31e-05 | ms/batch 742.53 | loss 3.15 | ppl 23.245 +| epoch 8 step 81600 | 1310 batches | lr 8.14e-05 | ms/batch 773.49 | loss 3.12 | ppl 22.646 +| epoch 8 step 81800 | 1510 batches | lr 7.97e-05 | ms/batch 760.43 | loss 3.12 | ppl 22.674 +| epoch 8 step 82000 | 1710 batches | lr 7.8e-05 | ms/batch 737.05 | loss 3.11 | ppl 22.328 +| epoch 8 step 82200 | 1910 batches | lr 7.63e-05 | ms/batch 733.76 | loss 3.14 | ppl 23.159 +| epoch 8 step 82400 | 2110 batches | lr 7.46e-05 | ms/batch 764.27 | loss 3.16 | ppl 23.570 +| epoch 8 step 82600 | 2310 batches | lr 7.3e-05 | ms/batch 772.41 | loss 3.14 | ppl 23.087 +| epoch 8 step 82800 | 2510 batches | lr 7.14e-05 | ms/batch 745.45 | loss 3.12 | ppl 22.685 +| epoch 8 step 83000 | 2710 batches | lr 6.98e-05 | ms/batch 755.61 | loss 3.12 | ppl 22.584 +| epoch 8 step 83200 | 2910 batches | lr 6.82e-05 | ms/batch 750.13 | loss 3.09 | ppl 22.066 +| epoch 8 step 83400 | 3110 batches | lr 6.66e-05 | ms/batch 748.21 | loss 3.12 | ppl 22.669 +| epoch 8 step 83600 | 3310 batches | lr 6.5e-05 | ms/batch 724.78 | loss 3.14 | ppl 23.128 +| epoch 8 step 83800 | 3510 batches | lr 6.35e-05 | ms/batch 740.45 | loss 3.10 | ppl 22.196 +| epoch 8 step 84000 | 3710 batches | lr 6.2e-05 | ms/batch 751.59 | loss 3.12 | ppl 22.623 +---------------------------------------------------------------------------------------------------- +| Eval 21 at step 84000 | time: 2998.13s | valid loss 3.17 | valid ppl 23.903 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 84200 | 3910 batches | lr 6.05e-05 | ms/batch 825.75 | loss 3.11 | ppl 22.467 +| epoch 8 step 84400 | 4110 batches | lr 5.9e-05 | ms/batch 733.29 | loss 3.12 | ppl 22.706 +| epoch 8 step 84600 | 4310 batches | lr 5.75e-05 | ms/batch 742.55 | loss 3.12 | ppl 22.669 +| epoch 8 step 84800 | 4510 batches | lr 5.6e-05 | ms/batch 751.39 | loss 3.14 | ppl 23.073 +| epoch 8 step 85000 | 4710 batches | lr 5.46e-05 | ms/batch 770.53 | loss 3.10 | ppl 22.104 +| epoch 8 step 85200 | 4910 batches | lr 5.32e-05 | ms/batch 739.47 | loss 3.11 | ppl 22.408 +| epoch 8 step 85400 | 5110 batches | lr 5.18e-05 | ms/batch 724.96 | loss 3.11 | ppl 22.412 +| epoch 8 step 85600 | 5310 batches | lr 5.04e-05 | ms/batch 741.18 | loss 3.10 | ppl 22.161 +| epoch 8 step 85800 | 5510 batches | lr 4.9e-05 | ms/batch 752.19 | loss 3.10 | ppl 22.286 +| epoch 8 step 86000 | 5710 batches | lr 4.77e-05 | ms/batch 746.66 | loss 3.11 | ppl 22.364 +| epoch 8 step 86200 | 5910 batches | lr 4.63e-05 | ms/batch 738.32 | loss 3.11 | ppl 22.427 +| epoch 8 step 86400 | 6110 batches | lr 4.5e-05 | ms/batch 759.33 | loss 3.10 | ppl 22.299 +| epoch 8 step 86600 | 6310 batches | lr 4.37e-05 | ms/batch 748.11 | loss 3.12 | ppl 22.675 +| epoch 8 step 86800 | 6510 batches | lr 4.25e-05 | ms/batch 745.24 | loss 3.07 
| ppl 21.580 +| epoch 8 step 87000 | 6710 batches | lr 4.12e-05 | ms/batch 745.61 | loss 3.08 | ppl 21.680 +| epoch 8 step 87200 | 6910 batches | lr 4e-05 | ms/batch 752.93 | loss 3.10 | ppl 22.089 +| epoch 8 step 87400 | 7110 batches | lr 3.87e-05 | ms/batch 604.82 | loss 3.09 | ppl 21.917 +| epoch 8 step 87600 | 7310 batches | lr 3.75e-05 | ms/batch 430.85 | loss 3.05 | ppl 21.129 +| epoch 8 step 87800 | 7510 batches | lr 3.63e-05 | ms/batch 430.44 | loss 3.09 | ppl 21.941 +| epoch 8 step 88000 | 7710 batches | lr 3.52e-05 | ms/batch 432.19 | loss 3.08 | ppl 21.673 +---------------------------------------------------------------------------------------------------- +| Eval 22 at step 88000 | time: 2776.62s | valid loss 3.16 | valid ppl 23.687 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 88200 | 7910 batches | lr 3.4e-05 | ms/batch 488.14 | loss 3.08 | ppl 21.771 +| epoch 8 step 88400 | 8110 batches | lr 3.29e-05 | ms/batch 430.18 | loss 3.09 | ppl 22.011 +| epoch 8 step 88600 | 8310 batches | lr 3.18e-05 | ms/batch 432.60 | loss 3.09 | ppl 21.873 +| epoch 8 step 88800 | 8510 batches | lr 3.07e-05 | ms/batch 432.02 | loss 3.08 | ppl 21.770 +| epoch 8 step 89000 | 8710 batches | lr 2.96e-05 | ms/batch 432.92 | loss 3.10 | ppl 22.144 +| epoch 8 step 89200 | 8910 batches | lr 2.86e-05 | ms/batch 431.36 | loss 3.10 | ppl 22.127 +| epoch 8 step 89400 | 9110 batches | lr 2.75e-05 | ms/batch 431.38 | loss 3.10 | ppl 22.138 +| epoch 8 step 89600 | 9310 batches | lr 2.65e-05 | ms/batch 430.48 | loss 3.08 | ppl 21.755 +| epoch 8 step 89800 | 9510 batches | lr 2.55e-05 | ms/batch 431.16 | loss 3.11 | ppl 22.437 +| epoch 8 step 90000 | 9710 batches | lr 2.45e-05 | ms/batch 429.64 | loss 3.09 | ppl 21.973 +| epoch 8 step 90200 | 9910 batches | lr 2.36e-05 | ms/batch 428.56 | loss 3.08 | ppl 21.767 +| epoch 8 step 90400 | 10110 batches | lr 2.26e-05 | ms/batch 429.16 | loss 3.09 | ppl 22.028 +| epoch 8 step 90600 | 10310 batches | lr 2.17e-05 | ms/batch 431.47 | loss 3.09 | ppl 21.880 +| epoch 8 step 90800 | 10510 batches | lr 2.08e-05 | ms/batch 430.01 | loss 3.11 | ppl 22.506 +| epoch 8 step 91000 | 10710 batches | lr 1.99e-05 | ms/batch 430.75 | loss 3.08 | ppl 21.691 +| epoch 8 step 91200 | 10910 batches | lr 1.9e-05 | ms/batch 431.30 | loss 3.07 | ppl 21.584 +| epoch 8 step 91400 | 11110 batches | lr 1.82e-05 | ms/batch 430.69 | loss 3.13 | ppl 22.905 +| epoch 8 step 91600 | 11310 batches | lr 1.73e-05 | ms/batch 431.02 | loss 3.09 | ppl 22.051 +| epoch 9 step 91800 | 40 batches | lr 1.65e-05 | ms/batch 429.67 | loss 3.11 | ppl 22.378 +| epoch 9 step 92000 | 240 batches | lr 1.57e-05 | ms/batch 430.81 | loss 3.06 | ppl 21.367 +---------------------------------------------------------------------------------------------------- +| Eval 23 at step 92000 | time: 1730.21s | valid loss 3.16 | valid ppl 23.602 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 92200 | 440 batches | lr 1.5e-05 | ms/batch 483.29 | loss 3.10 | ppl 22.199 +| epoch 9 step 92400 | 640 batches | lr 1.42e-05 | ms/batch 434.23 | loss 3.07 | ppl 21.539 +| epoch 9 step 92600 | 840 batches | lr 1.35e-05 | ms/batch 434.24 | loss 3.11 | ppl 22.439 +| epoch 9 step 92800 | 1040 batches | lr 1.28e-05 | ms/batch 432.72 | loss 3.07 | ppl 21.632 +| epoch 9 step 93000 | 1240 batches | lr 1.21e-05 | ms/batch 429.50 | loss 3.08 | ppl 21.800 +| epoch 9 step 93200 | 1440 batches | lr 1.14e-05 | ms/batch 
432.40 | loss 3.09 | ppl 22.049 +| epoch 9 step 93400 | 1640 batches | lr 1.07e-05 | ms/batch 431.08 | loss 3.07 | ppl 21.468 +| epoch 9 step 93600 | 1840 batches | lr 1.01e-05 | ms/batch 430.19 | loss 3.09 | ppl 21.946 +| epoch 9 step 93800 | 2040 batches | lr 9.47e-06 | ms/batch 431.40 | loss 3.13 | ppl 22.849 +| epoch 9 step 94000 | 2240 batches | lr 8.87e-06 | ms/batch 432.65 | loss 3.10 | ppl 22.092 +| epoch 9 step 94200 | 2440 batches | lr 8.29e-06 | ms/batch 429.09 | loss 3.10 | ppl 22.179 +| epoch 9 step 94400 | 2640 batches | lr 7.73e-06 | ms/batch 428.25 | loss 3.10 | ppl 22.114 +| epoch 9 step 94600 | 2840 batches | lr 7.19e-06 | ms/batch 428.08 | loss 3.05 | ppl 21.164 +| epoch 9 step 94800 | 3040 batches | lr 6.67e-06 | ms/batch 428.49 | loss 3.09 | ppl 22.038 +| epoch 9 step 95000 | 3240 batches | lr 6.17e-06 | ms/batch 430.82 | loss 3.09 | ppl 21.949 +| epoch 9 step 95200 | 3440 batches | lr 5.68e-06 | ms/batch 427.08 | loss 3.08 | ppl 21.680 +| epoch 9 step 95400 | 3640 batches | lr 5.22e-06 | ms/batch 428.74 | loss 3.07 | ppl 21.579 +| epoch 9 step 95600 | 3840 batches | lr 4.78e-06 | ms/batch 427.39 | loss 3.09 | ppl 21.879 +| epoch 9 step 95800 | 4040 batches | lr 4.35e-06 | ms/batch 427.67 | loss 3.10 | ppl 22.228 +| epoch 9 step 96000 | 4240 batches | lr 3.95e-06 | ms/batch 427.59 | loss 3.08 | ppl 21.796 +---------------------------------------------------------------------------------------------------- +| Eval 24 at step 96000 | time: 1726.61s | valid loss 3.16 | valid ppl 23.510 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 96200 | 4440 batches | lr 3.57e-06 | ms/batch 481.05 | loss 3.09 | ppl 21.968 +| epoch 9 step 96400 | 4640 batches | lr 3.2e-06 | ms/batch 426.74 | loss 3.09 | ppl 21.871 +| epoch 9 step 96600 | 4840 batches | lr 2.85e-06 | ms/batch 427.07 | loss 3.07 | ppl 21.565 +| epoch 9 step 96800 | 5040 batches | lr 2.53e-06 | ms/batch 436.58 | loss 3.09 | ppl 22.056 +| epoch 9 step 97000 | 5240 batches | lr 2.22e-06 | ms/batch 427.55 | loss 3.08 | ppl 21.784 +| epoch 9 step 97200 | 5440 batches | lr 1.94e-06 | ms/batch 426.99 | loss 3.05 | ppl 21.169 +| epoch 9 step 97400 | 5640 batches | lr 1.67e-06 | ms/batch 427.80 | loss 3.10 | ppl 22.104 +| epoch 9 step 97600 | 5840 batches | lr 1.42e-06 | ms/batch 429.61 | loss 3.09 | ppl 21.891 +| epoch 9 step 97800 | 6040 batches | lr 1.2e-06 | ms/batch 427.90 | loss 3.06 | ppl 21.431 +| epoch 9 step 98000 | 6240 batches | lr 9.88e-07 | ms/batch 431.01 | loss 3.08 | ppl 21.797 +| epoch 9 step 98200 | 6440 batches | lr 8.01e-07 | ms/batch 427.47 | loss 3.09 | ppl 21.956 +| epoch 9 step 98400 | 6640 batches | lr 6.33e-07 | ms/batch 427.01 | loss 3.04 | ppl 20.833 +| epoch 9 step 98600 | 6840 batches | lr 4.84e-07 | ms/batch 573.59 | loss 3.07 | ppl 21.489 +| epoch 9 step 98800 | 7040 batches | lr 3.56e-07 | ms/batch 711.47 | loss 3.07 | ppl 21.563 +| epoch 9 step 99000 | 7240 batches | lr 2.47e-07 | ms/batch 736.74 | loss 3.04 | ppl 20.823 +| epoch 9 step 99200 | 7440 batches | lr 1.58e-07 | ms/batch 708.78 | loss 3.05 | ppl 21.211 +| epoch 9 step 99400 | 7640 batches | lr 8.9e-08 | ms/batch 750.12 | loss 3.04 | ppl 20.909 +| epoch 9 step 99600 | 7840 batches | lr 3.96e-08 | ms/batch 726.05 | loss 3.07 | ppl 21.536 +| epoch 9 step 99800 | 8040 batches | lr 9.89e-09 | ms/batch 691.15 | loss 3.07 | ppl 21.509 +| epoch 9 step 100000 | 8240 batches | lr 0 | ms/batch 704.59 | loss 3.06 | ppl 21.301 
+---------------------------------------------------------------------------------------------------- +| Eval 25 at step 100000 | time: 2157.66s | valid loss 3.16 | valid ppl 23.503 +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.19 | test ppl 24.264 +==================================================================================================== diff --git a/NLP/Transformer-XL/exp_results/log-200k.txt b/NLP/Transformer-XL/exp_results/log-200k.txt new file mode 100644 index 0000000..62efb00 --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-200k.txt @@ -0,0 +1,1224 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adan + - lr : 0.001 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 3000 + - decay_rate : 0.5 + - lr_min : 1e-06 + - clip : 0.25 + - clip_nonemb : False + - max_step : 200000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220811-105308 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : [0.9, 0.9, 0.999] + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 6.67e-05 | ms/batch 776.32 | loss 8.90 | ppl 7366.806 +| epoch 1 step 400 | 400 batches | lr 0.000133 | ms/batch 706.08 | loss 6.85 | ppl 942.451 +| epoch 1 step 600 | 600 batches | lr 0.0002 | ms/batch 682.24 | loss 6.34 | ppl 567.781 +| epoch 1 step 800 | 800 batches | lr 0.000267 | ms/batch 727.20 | loss 6.06 | ppl 428.925 +| epoch 1 step 1000 | 1000 batches | lr 0.000333 | ms/batch 722.60 | loss 5.80 | ppl 330.968 +| epoch 1 step 1200 | 1200 batches | lr 0.0004 | ms/batch 707.72 | loss 5.60 | ppl 270.691 +| epoch 1 step 1400 | 1400 batches | lr 0.000467 | ms/batch 715.23 | loss 5.43 | ppl 228.271 +| epoch 1 step 1600 | 1600 batches | lr 0.000533 | ms/batch 717.15 | loss 5.28 | ppl 196.416 +| epoch 1 step 1800 | 1800 batches | lr 0.0006 | ms/batch 706.30 | loss 5.15 | ppl 173.240 +| epoch 1 step 2000 | 2000 batches | lr 0.000667 | ms/batch 692.22 | loss 5.04 | ppl 154.584 +| epoch 1 step 2200 | 2200 batches | lr 0.000733 | ms/batch 676.79 | loss 4.93 | ppl 138.813 +| 
epoch 1 step 2400 | 2400 batches | lr 0.0008 | ms/batch 692.14 | loss 4.85 | ppl 128.135 +| epoch 1 step 2600 | 2600 batches | lr 0.000867 | ms/batch 670.68 | loss 4.76 | ppl 116.945 +| epoch 1 step 2800 | 2800 batches | lr 0.000933 | ms/batch 709.41 | loss 4.69 | ppl 108.587 +| epoch 1 step 3000 | 3000 batches | lr 0.001 | ms/batch 684.10 | loss 4.64 | ppl 103.975 +| epoch 1 step 3200 | 3200 batches | lr 0.001 | ms/batch 705.82 | loss 4.58 | ppl 97.501 +| epoch 1 step 3400 | 3400 batches | lr 0.001 | ms/batch 696.96 | loss 4.53 | ppl 93.101 +| epoch 1 step 3600 | 3600 batches | lr 0.000999 | ms/batch 698.89 | loss 4.45 | ppl 85.852 +| epoch 1 step 3800 | 3800 batches | lr 0.000999 | ms/batch 728.79 | loss 4.48 | ppl 88.166 +| epoch 1 step 4000 | 4000 batches | lr 0.000999 | ms/batch 728.35 | loss 4.44 | ppl 84.369 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2837.46s | valid loss 4.37 | valid ppl 78.692 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.000999 | ms/batch 775.55 | loss 4.38 | ppl 79.980 +| epoch 1 step 4400 | 4400 batches | lr 0.000999 | ms/batch 703.47 | loss 4.36 | ppl 78.094 +| epoch 1 step 4600 | 4600 batches | lr 0.000999 | ms/batch 740.85 | loss 4.34 | ppl 76.334 +| epoch 1 step 4800 | 4800 batches | lr 0.000999 | ms/batch 705.75 | loss 4.28 | ppl 72.245 +| epoch 1 step 5000 | 5000 batches | lr 0.000999 | ms/batch 693.81 | loss 4.31 | ppl 74.614 +| epoch 1 step 5200 | 5200 batches | lr 0.000999 | ms/batch 712.14 | loss 4.25 | ppl 70.189 +| epoch 1 step 5400 | 5400 batches | lr 0.000998 | ms/batch 744.54 | loss 4.20 | ppl 66.510 +| epoch 1 step 5600 | 5600 batches | lr 0.000998 | ms/batch 686.33 | loss 4.22 | ppl 67.986 +| epoch 1 step 5800 | 5800 batches | lr 0.000998 | ms/batch 757.67 | loss 4.21 | ppl 67.454 +| epoch 1 step 6000 | 6000 batches | lr 0.000998 | ms/batch 743.34 | loss 4.17 | ppl 64.554 +| epoch 1 step 6200 | 6200 batches | lr 0.000998 | ms/batch 715.31 | loss 4.14 | ppl 62.901 +| epoch 1 step 6400 | 6400 batches | lr 0.000998 | ms/batch 726.38 | loss 4.17 | ppl 64.900 +| epoch 1 step 6600 | 6600 batches | lr 0.000998 | ms/batch 708.39 | loss 4.11 | ppl 60.722 +| epoch 1 step 6800 | 6800 batches | lr 0.000997 | ms/batch 681.98 | loss 4.10 | ppl 60.559 +| epoch 1 step 7000 | 7000 batches | lr 0.000997 | ms/batch 726.10 | loss 4.11 | ppl 60.652 +| epoch 1 step 7200 | 7200 batches | lr 0.000997 | ms/batch 714.34 | loss 4.06 | ppl 57.786 +| epoch 1 step 7400 | 7400 batches | lr 0.000997 | ms/batch 696.85 | loss 4.05 | ppl 57.517 +| epoch 1 step 7600 | 7600 batches | lr 0.000997 | ms/batch 720.62 | loss 4.03 | ppl 56.394 +| epoch 1 step 7800 | 7800 batches | lr 0.000996 | ms/batch 712.74 | loss 4.05 | ppl 57.635 +| epoch 1 step 8000 | 8000 batches | lr 0.000996 | ms/batch 695.84 | loss 4.05 | ppl 57.298 +---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 2868.86s | valid loss 3.94 | valid ppl 51.178 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.000996 | ms/batch 738.23 | loss 4.02 | ppl 55.917 +| epoch 1 step 8400 | 8400 batches | lr 0.000996 | ms/batch 734.08 | loss 4.03 | ppl 56.542 +| epoch 1 step 8600 | 8600 batches | lr 0.000996 | ms/batch 707.68 | loss 4.01 | ppl 55.411 +| epoch 1 step 8800 | 
8800 batches | lr 0.000995 | ms/batch 729.09 | loss 4.02 | ppl 55.927 +| epoch 1 step 9000 | 9000 batches | lr 0.000995 | ms/batch 686.10 | loss 3.99 | ppl 54.282 +| epoch 1 step 9200 | 9200 batches | lr 0.000995 | ms/batch 692.20 | loss 3.98 | ppl 53.707 +| epoch 1 step 9400 | 9400 batches | lr 0.000995 | ms/batch 735.51 | loss 3.99 | ppl 53.919 +| epoch 1 step 9600 | 9600 batches | lr 0.000995 | ms/batch 749.40 | loss 4.00 | ppl 54.757 +| epoch 1 step 9800 | 9800 batches | lr 0.000994 | ms/batch 704.19 | loss 3.96 | ppl 52.375 +| epoch 1 step 10000 | 10000 batches | lr 0.000994 | ms/batch 703.88 | loss 3.97 | ppl 53.129 +| epoch 1 step 10200 | 10200 batches | lr 0.000994 | ms/batch 727.49 | loss 3.94 | ppl 51.329 +| epoch 1 step 10400 | 10400 batches | lr 0.000994 | ms/batch 692.36 | loss 3.94 | ppl 51.268 +| epoch 1 step 10600 | 10600 batches | lr 0.000993 | ms/batch 694.79 | loss 3.96 | ppl 52.487 +| epoch 1 step 10800 | 10800 batches | lr 0.000993 | ms/batch 718.57 | loss 3.92 | ppl 50.269 +| epoch 1 step 11000 | 11000 batches | lr 0.000993 | ms/batch 698.89 | loss 3.96 | ppl 52.263 +| epoch 1 step 11200 | 11200 batches | lr 0.000993 | ms/batch 704.48 | loss 3.93 | ppl 51.073 +| epoch 1 step 11400 | 11400 batches | lr 0.000992 | ms/batch 705.65 | loss 3.93 | ppl 50.985 +| epoch 2 step 11600 | 130 batches | lr 0.000992 | ms/batch 691.91 | loss 3.90 | ppl 49.549 +| epoch 2 step 11800 | 330 batches | lr 0.000992 | ms/batch 692.51 | loss 3.88 | ppl 48.290 +| epoch 2 step 12000 | 530 batches | lr 0.000991 | ms/batch 705.18 | loss 3.90 | ppl 49.346 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 2838.27s | valid loss 3.79 | valid ppl 44.041 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 12200 | 730 batches | lr 0.000991 | ms/batch 759.90 | loss 3.87 | ppl 47.958 +| epoch 2 step 12400 | 930 batches | lr 0.000991 | ms/batch 714.42 | loss 3.87 | ppl 48.080 +| epoch 2 step 12600 | 1130 batches | lr 0.00099 | ms/batch 699.20 | loss 3.90 | ppl 49.413 +| epoch 2 step 12800 | 1330 batches | lr 0.00099 | ms/batch 708.63 | loss 3.87 | ppl 47.722 +| epoch 2 step 13000 | 1530 batches | lr 0.00099 | ms/batch 714.74 | loss 3.86 | ppl 47.251 +| epoch 2 step 13200 | 1730 batches | lr 0.00099 | ms/batch 684.72 | loss 3.85 | ppl 46.990 +| epoch 2 step 13400 | 1930 batches | lr 0.000989 | ms/batch 751.38 | loss 3.85 | ppl 47.227 +| epoch 2 step 13600 | 2130 batches | lr 0.000989 | ms/batch 715.16 | loss 3.87 | ppl 48.126 +| epoch 2 step 13800 | 2330 batches | lr 0.000989 | ms/batch 699.09 | loss 3.85 | ppl 46.907 +| epoch 2 step 14000 | 2530 batches | lr 0.000988 | ms/batch 711.72 | loss 3.83 | ppl 46.153 +| epoch 2 step 14200 | 2730 batches | lr 0.000988 | ms/batch 682.58 | loss 3.81 | ppl 45.173 +| epoch 2 step 14400 | 2930 batches | lr 0.000987 | ms/batch 719.64 | loss 3.79 | ppl 44.409 +| epoch 2 step 14600 | 3130 batches | lr 0.000987 | ms/batch 719.75 | loss 3.80 | ppl 44.802 +| epoch 2 step 14800 | 3330 batches | lr 0.000987 | ms/batch 715.90 | loss 3.81 | ppl 44.978 +| epoch 2 step 15000 | 3530 batches | lr 0.000986 | ms/batch 701.70 | loss 3.77 | ppl 43.266 +| epoch 2 step 15200 | 3730 batches | lr 0.000986 | ms/batch 731.21 | loss 3.80 | ppl 44.576 +| epoch 2 step 15400 | 3930 batches | lr 0.000986 | ms/batch 685.54 | loss 3.79 | ppl 44.202 +| epoch 2 step 15600 | 4130 batches | lr 0.000985 | ms/batch 715.92 | loss 3.78 | ppl 43.802 +| epoch 2 
step 15800 | 4330 batches | lr 0.000985 | ms/batch 709.67 | loss 3.79 | ppl 44.150 +| epoch 2 step 16000 | 4530 batches | lr 0.000985 | ms/batch 698.36 | loss 3.79 | ppl 44.245 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 2843.67s | valid loss 3.69 | valid ppl 40.088 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.000984 | ms/batch 794.03 | loss 3.75 | ppl 42.359 +| epoch 2 step 16400 | 4930 batches | lr 0.000984 | ms/batch 719.73 | loss 3.77 | ppl 43.208 +| epoch 2 step 16600 | 5130 batches | lr 0.000983 | ms/batch 687.12 | loss 3.76 | ppl 42.866 +| epoch 2 step 16800 | 5330 batches | lr 0.000983 | ms/batch 714.50 | loss 3.75 | ppl 42.520 +| epoch 2 step 17000 | 5530 batches | lr 0.000982 | ms/batch 740.55 | loss 3.74 | ppl 41.965 +| epoch 2 step 17200 | 5730 batches | lr 0.000982 | ms/batch 686.23 | loss 3.76 | ppl 42.748 +| epoch 2 step 17400 | 5930 batches | lr 0.000982 | ms/batch 714.69 | loss 3.74 | ppl 42.066 +| epoch 2 step 17600 | 6130 batches | lr 0.000981 | ms/batch 716.37 | loss 3.73 | ppl 41.737 +| epoch 2 step 17800 | 6330 batches | lr 0.000981 | ms/batch 709.37 | loss 3.76 | ppl 42.999 +| epoch 2 step 18000 | 6530 batches | lr 0.00098 | ms/batch 707.37 | loss 3.70 | ppl 40.547 +| epoch 2 step 18200 | 6730 batches | lr 0.00098 | ms/batch 740.15 | loss 3.71 | ppl 40.752 +| epoch 2 step 18400 | 6930 batches | lr 0.000979 | ms/batch 700.09 | loss 3.72 | ppl 41.308 +| epoch 2 step 18600 | 7130 batches | lr 0.000979 | ms/batch 692.00 | loss 3.70 | ppl 40.409 +| epoch 2 step 18800 | 7330 batches | lr 0.000979 | ms/batch 703.47 | loss 3.68 | ppl 39.589 +| epoch 2 step 19000 | 7530 batches | lr 0.000978 | ms/batch 688.29 | loss 3.70 | ppl 40.570 +| epoch 2 step 19200 | 7730 batches | lr 0.000978 | ms/batch 682.44 | loss 3.70 | ppl 40.581 +| epoch 2 step 19400 | 7930 batches | lr 0.000977 | ms/batch 728.02 | loss 3.70 | ppl 40.350 +| epoch 2 step 19600 | 8130 batches | lr 0.000977 | ms/batch 685.89 | loss 3.71 | ppl 40.839 +| epoch 2 step 19800 | 8330 batches | lr 0.000976 | ms/batch 750.43 | loss 3.70 | ppl 40.432 +| epoch 2 step 20000 | 8530 batches | lr 0.000976 | ms/batch 684.49 | loss 3.69 | ppl 40.035 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 | time: 2844.94s | valid loss 3.61 | valid ppl 36.930 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000975 | ms/batch 792.71 | loss 3.71 | ppl 40.665 +| epoch 2 step 20400 | 8930 batches | lr 0.000975 | ms/batch 724.20 | loss 3.70 | ppl 40.601 +| epoch 2 step 20600 | 9130 batches | lr 0.000974 | ms/batch 703.31 | loss 3.70 | ppl 40.266 +| epoch 2 step 20800 | 9330 batches | lr 0.000974 | ms/batch 712.60 | loss 3.68 | ppl 39.824 +| epoch 2 step 21000 | 9530 batches | lr 0.000973 | ms/batch 707.33 | loss 3.73 | ppl 41.620 +| epoch 2 step 21200 | 9730 batches | lr 0.000973 | ms/batch 732.18 | loss 3.68 | ppl 39.564 +| epoch 2 step 21400 | 9930 batches | lr 0.000972 | ms/batch 739.74 | loss 3.69 | ppl 39.997 +| epoch 2 step 21600 | 10130 batches | lr 0.000972 | ms/batch 721.44 | loss 3.67 | ppl 39.422 +| epoch 2 step 21800 | 10330 batches | lr 0.000971 | ms/batch 724.90 | loss 3.68 | ppl 39.825 +| epoch 2 step 22000 | 10530 batches | lr 0.000971 | ms/batch 700.39 | loss 3.70 | ppl 
40.466 +| epoch 2 step 22200 | 10730 batches | lr 0.00097 | ms/batch 697.06 | loss 3.67 | ppl 39.058 +| epoch 2 step 22400 | 10930 batches | lr 0.00097 | ms/batch 698.49 | loss 3.66 | ppl 39.010 +| epoch 2 step 22600 | 11130 batches | lr 0.000969 | ms/batch 735.66 | loss 3.71 | ppl 40.749 +| epoch 2 step 22800 | 11330 batches | lr 0.000968 | ms/batch 694.62 | loss 3.68 | ppl 39.480 +| epoch 3 step 23000 | 60 batches | lr 0.000968 | ms/batch 702.47 | loss 3.68 | ppl 39.624 +| epoch 3 step 23200 | 260 batches | lr 0.000967 | ms/batch 735.52 | loss 3.64 | ppl 37.917 +| epoch 3 step 23400 | 460 batches | lr 0.000967 | ms/batch 714.13 | loss 3.68 | ppl 39.527 +| epoch 3 step 23600 | 660 batches | lr 0.000966 | ms/batch 688.65 | loss 3.64 | ppl 38.062 +| epoch 3 step 23800 | 860 batches | lr 0.000966 | ms/batch 729.42 | loss 3.67 | ppl 39.410 +| epoch 3 step 24000 | 1060 batches | lr 0.000965 | ms/batch 720.33 | loss 3.66 | ppl 38.919 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 2870.93s | valid loss 3.57 | valid ppl 35.685 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000965 | ms/batch 762.39 | loss 3.65 | ppl 38.550 +| epoch 3 step 24400 | 1460 batches | lr 0.000964 | ms/batch 704.86 | loss 3.65 | ppl 38.452 +| epoch 3 step 24600 | 1660 batches | lr 0.000963 | ms/batch 712.42 | loss 3.64 | ppl 38.214 +| epoch 3 step 24800 | 1860 batches | lr 0.000963 | ms/batch 692.60 | loss 3.65 | ppl 38.427 +| epoch 3 step 25000 | 2060 batches | lr 0.000962 | ms/batch 712.66 | loss 3.69 | ppl 39.912 +| epoch 3 step 25200 | 2260 batches | lr 0.000962 | ms/batch 713.12 | loss 3.66 | ppl 38.905 +| epoch 3 step 25400 | 2460 batches | lr 0.000961 | ms/batch 746.11 | loss 3.65 | ppl 38.302 +| epoch 3 step 25600 | 2660 batches | lr 0.00096 | ms/batch 715.35 | loss 3.65 | ppl 38.395 +| epoch 3 step 25800 | 2860 batches | lr 0.00096 | ms/batch 709.29 | loss 3.59 | ppl 36.239 +| epoch 3 step 26000 | 3060 batches | lr 0.000959 | ms/batch 724.27 | loss 3.64 | ppl 38.109 +| epoch 3 step 26200 | 3260 batches | lr 0.000958 | ms/batch 684.82 | loss 3.64 | ppl 37.948 +| epoch 3 step 26400 | 3460 batches | lr 0.000958 | ms/batch 703.25 | loss 3.60 | ppl 36.652 +| epoch 3 step 26600 | 3660 batches | lr 0.000957 | ms/batch 697.91 | loss 3.62 | ppl 37.174 +| epoch 3 step 26800 | 3860 batches | lr 0.000957 | ms/batch 723.58 | loss 3.62 | ppl 37.381 +| epoch 3 step 27000 | 4060 batches | lr 0.000956 | ms/batch 720.99 | loss 3.63 | ppl 37.721 +| epoch 3 step 27200 | 4260 batches | lr 0.000955 | ms/batch 717.62 | loss 3.62 | ppl 37.339 +| epoch 3 step 27400 | 4460 batches | lr 0.000955 | ms/batch 722.90 | loss 3.62 | ppl 37.489 +| epoch 3 step 27600 | 4660 batches | lr 0.000954 | ms/batch 743.44 | loss 3.61 | ppl 37.092 +| epoch 3 step 27800 | 4860 batches | lr 0.000953 | ms/batch 696.12 | loss 3.60 | ppl 36.720 +| epoch 3 step 28000 | 5060 batches | lr 0.000953 | ms/batch 723.37 | loss 3.62 | ppl 37.226 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 2861.34s | valid loss 3.55 | valid ppl 34.679 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000952 | ms/batch 784.09 | loss 3.60 | ppl 36.586 +| epoch 3 step 28400 | 5460 batches | lr 0.000951 | ms/batch 697.94 | loss 
3.58 | ppl 35.797 +| epoch 3 step 28600 | 5660 batches | lr 0.000951 | ms/batch 696.51 | loss 3.63 | ppl 37.613 +| epoch 3 step 28800 | 5860 batches | lr 0.00095 | ms/batch 709.45 | loss 3.60 | ppl 36.645 +| epoch 3 step 29000 | 6060 batches | lr 0.000949 | ms/batch 726.06 | loss 3.60 | ppl 36.438 +| epoch 3 step 29200 | 6260 batches | lr 0.000949 | ms/batch 713.31 | loss 3.60 | ppl 36.437 +| epoch 3 step 29400 | 6460 batches | lr 0.000948 | ms/batch 711.05 | loss 3.60 | ppl 36.736 +| epoch 3 step 29600 | 6660 batches | lr 0.000947 | ms/batch 718.44 | loss 3.55 | ppl 34.875 +| epoch 3 step 29800 | 6860 batches | lr 0.000946 | ms/batch 702.59 | loss 3.58 | ppl 35.994 +| epoch 3 step 30000 | 7060 batches | lr 0.000946 | ms/batch 707.51 | loss 3.58 | ppl 35.706 +| epoch 3 step 30200 | 7260 batches | lr 0.000945 | ms/batch 721.07 | loss 3.55 | ppl 34.761 +| epoch 3 step 30400 | 7460 batches | lr 0.000944 | ms/batch 709.39 | loss 3.57 | ppl 35.623 +| epoch 3 step 30600 | 7660 batches | lr 0.000944 | ms/batch 744.37 | loss 3.56 | ppl 35.102 +| epoch 3 step 30800 | 7860 batches | lr 0.000943 | ms/batch 734.93 | loss 3.57 | ppl 35.533 +| epoch 3 step 31000 | 8060 batches | lr 0.000942 | ms/batch 726.62 | loss 3.58 | ppl 35.834 +| epoch 3 step 31200 | 8260 batches | lr 0.000941 | ms/batch 720.25 | loss 3.57 | ppl 35.399 +| epoch 3 step 31400 | 8460 batches | lr 0.000941 | ms/batch 718.52 | loss 3.58 | ppl 35.858 +| epoch 3 step 31600 | 8660 batches | lr 0.00094 | ms/batch 739.97 | loss 3.57 | ppl 35.692 +| epoch 3 step 31800 | 8860 batches | lr 0.000939 | ms/batch 718.51 | loss 3.58 | ppl 35.785 +| epoch 3 step 32000 | 9060 batches | lr 0.000938 | ms/batch 707.81 | loss 3.58 | ppl 35.812 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 2877.68s | valid loss 3.50 | valid ppl 33.030 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000938 | ms/batch 794.55 | loss 3.56 | ppl 35.300 +| epoch 3 step 32400 | 9460 batches | lr 0.000937 | ms/batch 707.68 | loss 3.59 | ppl 36.119 +| epoch 3 step 32600 | 9660 batches | lr 0.000936 | ms/batch 743.86 | loss 3.59 | ppl 36.164 +| epoch 3 step 32800 | 9860 batches | lr 0.000935 | ms/batch 695.30 | loss 3.54 | ppl 34.575 +| epoch 3 step 33000 | 10060 batches | lr 0.000935 | ms/batch 692.14 | loss 3.59 | ppl 36.388 +| epoch 3 step 33200 | 10260 batches | lr 0.000934 | ms/batch 715.57 | loss 3.54 | ppl 34.497 +| epoch 3 step 33400 | 10460 batches | lr 0.000933 | ms/batch 716.72 | loss 3.58 | ppl 35.765 +| epoch 3 step 33600 | 10660 batches | lr 0.000932 | ms/batch 731.54 | loss 3.58 | ppl 36.053 +| epoch 3 step 33800 | 10860 batches | lr 0.000931 | ms/batch 681.57 | loss 3.54 | ppl 34.340 +| epoch 3 step 34000 | 11060 batches | lr 0.000931 | ms/batch 703.97 | loss 3.58 | ppl 35.930 +| epoch 3 step 34200 | 11260 batches | lr 0.00093 | ms/batch 701.49 | loss 3.59 | ppl 36.200 +| epoch 3 step 34400 | 11460 batches | lr 0.000929 | ms/batch 733.09 | loss 3.56 | ppl 35.206 +| epoch 4 step 34600 | 190 batches | lr 0.000928 | ms/batch 756.94 | loss 3.54 | ppl 34.517 +| epoch 4 step 34800 | 390 batches | lr 0.000927 | ms/batch 720.83 | loss 3.55 | ppl 34.839 +| epoch 4 step 35000 | 590 batches | lr 0.000927 | ms/batch 720.58 | loss 3.54 | ppl 34.625 +| epoch 4 step 35200 | 790 batches | lr 0.000926 | ms/batch 697.74 | loss 3.56 | ppl 35.160 +| epoch 4 step 35400 | 990 batches | lr 0.000925 | 
ms/batch 699.80 | loss 3.54 | ppl 34.435 +| epoch 4 step 35600 | 1190 batches | lr 0.000924 | ms/batch 714.28 | loss 3.56 | ppl 35.131 +| epoch 4 step 35800 | 1390 batches | lr 0.000923 | ms/batch 756.65 | loss 3.55 | ppl 34.742 +| epoch 4 step 36000 | 1590 batches | lr 0.000922 | ms/batch 709.40 | loss 3.54 | ppl 34.353 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 2874.62s | valid loss 3.49 | valid ppl 32.646 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.000922 | ms/batch 803.92 | loss 3.55 | ppl 34.710 +| epoch 4 step 36400 | 1990 batches | lr 0.000921 | ms/batch 728.02 | loss 3.57 | ppl 35.683 +| epoch 4 step 36600 | 2190 batches | lr 0.00092 | ms/batch 688.41 | loss 3.56 | ppl 35.170 +| epoch 4 step 36800 | 2390 batches | lr 0.000919 | ms/batch 762.72 | loss 3.56 | ppl 35.152 +| epoch 4 step 37000 | 2590 batches | lr 0.000918 | ms/batch 713.16 | loss 3.54 | ppl 34.340 +| epoch 4 step 37200 | 2790 batches | lr 0.000917 | ms/batch 707.43 | loss 3.52 | ppl 33.736 +| epoch 4 step 37400 | 2990 batches | lr 0.000916 | ms/batch 740.26 | loss 3.54 | ppl 34.315 +| epoch 4 step 37600 | 3190 batches | lr 0.000916 | ms/batch 717.95 | loss 3.53 | ppl 34.261 +| epoch 4 step 37800 | 3390 batches | lr 0.000915 | ms/batch 709.80 | loss 3.53 | ppl 34.276 +| epoch 4 step 38000 | 3590 batches | lr 0.000914 | ms/batch 733.53 | loss 3.51 | ppl 33.321 +| epoch 4 step 38200 | 3790 batches | lr 0.000913 | ms/batch 758.57 | loss 3.53 | ppl 34.107 +| epoch 4 step 38400 | 3990 batches | lr 0.000912 | ms/batch 718.85 | loss 3.54 | ppl 34.534 +| epoch 4 step 38600 | 4190 batches | lr 0.000911 | ms/batch 739.54 | loss 3.52 | ppl 33.947 +| epoch 4 step 38800 | 4390 batches | lr 0.00091 | ms/batch 687.41 | loss 3.53 | ppl 34.144 +| epoch 4 step 39000 | 4590 batches | lr 0.000909 | ms/batch 738.74 | loss 3.54 | ppl 34.622 +| epoch 4 step 39200 | 4790 batches | lr 0.000908 | ms/batch 698.45 | loss 3.50 | ppl 33.113 +| epoch 4 step 39400 | 4990 batches | lr 0.000907 | ms/batch 693.14 | loss 3.55 | ppl 34.783 +| epoch 4 step 39600 | 5190 batches | lr 0.000907 | ms/batch 712.17 | loss 3.51 | ppl 33.354 +| epoch 4 step 39800 | 5390 batches | lr 0.000906 | ms/batch 703.60 | loss 3.49 | ppl 32.707 +| epoch 4 step 40000 | 5590 batches | lr 0.000905 | ms/batch 736.01 | loss 3.51 | ppl 33.575 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 2894.08s | valid loss 3.46 | valid ppl 31.859 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000904 | ms/batch 783.88 | loss 3.53 | ppl 34.189 +| epoch 4 step 40400 | 5990 batches | lr 0.000903 | ms/batch 727.73 | loss 3.51 | ppl 33.317 +| epoch 4 step 40600 | 6190 batches | lr 0.000902 | ms/batch 746.60 | loss 3.51 | ppl 33.287 +| epoch 4 step 40800 | 6390 batches | lr 0.000901 | ms/batch 716.44 | loss 3.53 | ppl 34.260 +| epoch 4 step 41000 | 6590 batches | lr 0.0009 | ms/batch 720.41 | loss 3.47 | ppl 32.119 +| epoch 4 step 41200 | 6790 batches | lr 0.000899 | ms/batch 717.76 | loss 3.49 | ppl 32.904 +| epoch 4 step 41400 | 6990 batches | lr 0.000898 | ms/batch 722.41 | loss 3.51 | ppl 33.437 +| epoch 4 step 41600 | 7190 batches | lr 0.000897 | ms/batch 691.50 | loss 3.46 | ppl 31.813 +| epoch 4 step 41800 | 7390 batches | 
lr 0.000896 | ms/batch 718.66 | loss 3.49 | ppl 32.731 +| epoch 4 step 42000 | 7590 batches | lr 0.000895 | ms/batch 704.21 | loss 3.47 | ppl 31.977 +| epoch 4 step 42200 | 7790 batches | lr 0.000894 | ms/batch 716.09 | loss 3.50 | ppl 32.973 +| epoch 4 step 42400 | 7990 batches | lr 0.000893 | ms/batch 716.72 | loss 3.49 | ppl 32.928 +| epoch 4 step 42600 | 8190 batches | lr 0.000892 | ms/batch 769.51 | loss 3.48 | ppl 32.525 +| epoch 4 step 42800 | 8390 batches | lr 0.000891 | ms/batch 721.86 | loss 3.51 | ppl 33.503 +| epoch 4 step 43000 | 8590 batches | lr 0.00089 | ms/batch 693.31 | loss 3.49 | ppl 32.709 +| epoch 4 step 43200 | 8790 batches | lr 0.000889 | ms/batch 716.81 | loss 3.51 | ppl 33.341 +| epoch 4 step 43400 | 8990 batches | lr 0.000888 | ms/batch 724.20 | loss 3.49 | ppl 32.874 +| epoch 4 step 43600 | 9190 batches | lr 0.000887 | ms/batch 743.40 | loss 3.48 | ppl 32.617 +| epoch 4 step 43800 | 9390 batches | lr 0.000886 | ms/batch 731.34 | loss 3.49 | ppl 32.906 +| epoch 4 step 44000 | 9590 batches | lr 0.000885 | ms/batch 707.15 | loss 3.51 | ppl 33.593 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 2893.83s | valid loss 3.44 | valid ppl 31.142 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 0.000884 | ms/batch 788.65 | loss 3.49 | ppl 32.688 +| epoch 4 step 44400 | 9990 batches | lr 0.000883 | ms/batch 722.71 | loss 3.49 | ppl 32.749 +| epoch 4 step 44600 | 10190 batches | lr 0.000882 | ms/batch 731.49 | loss 3.48 | ppl 32.440 +| epoch 4 step 44800 | 10390 batches | lr 0.000881 | ms/batch 722.01 | loss 3.48 | ppl 32.562 +| epoch 4 step 45000 | 10590 batches | lr 0.00088 | ms/batch 707.83 | loss 3.51 | ppl 33.595 +| epoch 4 step 45200 | 10790 batches | lr 0.000879 | ms/batch 721.94 | loss 3.47 | ppl 31.984 +| epoch 4 step 45400 | 10990 batches | lr 0.000878 | ms/batch 702.94 | loss 3.50 | ppl 33.148 +| epoch 4 step 45600 | 11190 batches | lr 0.000877 | ms/batch 731.15 | loss 3.51 | ppl 33.303 +| epoch 4 step 45800 | 11390 batches | lr 0.000876 | ms/batch 744.59 | loss 3.50 | ppl 33.078 +| epoch 5 step 46000 | 120 batches | lr 0.000875 | ms/batch 718.10 | loss 3.48 | ppl 32.481 +| epoch 5 step 46200 | 320 batches | lr 0.000874 | ms/batch 718.77 | loss 3.47 | ppl 31.988 +| epoch 5 step 46400 | 520 batches | lr 0.000873 | ms/batch 707.60 | loss 3.50 | ppl 33.036 +| epoch 5 step 46600 | 720 batches | lr 0.000872 | ms/batch 736.58 | loss 3.46 | ppl 31.813 +| epoch 5 step 46800 | 920 batches | lr 0.000871 | ms/batch 740.84 | loss 3.47 | ppl 31.987 +| epoch 5 step 47000 | 1120 batches | lr 0.00087 | ms/batch 697.11 | loss 3.50 | ppl 33.275 +| epoch 5 step 47200 | 1320 batches | lr 0.000869 | ms/batch 708.82 | loss 3.47 | ppl 32.018 +| epoch 5 step 47400 | 1520 batches | lr 0.000868 | ms/batch 730.85 | loss 3.47 | ppl 32.114 +| epoch 5 step 47600 | 1720 batches | lr 0.000867 | ms/batch 731.39 | loss 3.46 | ppl 31.886 +| epoch 5 step 47800 | 1920 batches | lr 0.000866 | ms/batch 733.07 | loss 3.49 | ppl 32.773 +| epoch 5 step 48000 | 2120 batches | lr 0.000865 | ms/batch 713.54 | loss 3.51 | ppl 33.315 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 2897.76s | valid loss 3.42 | valid ppl 30.472 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 
48200 | 2320 batches | lr 0.000864 | ms/batch 788.00 | loss 3.49 | ppl 32.699 +| epoch 5 step 48400 | 2520 batches | lr 0.000863 | ms/batch 762.17 | loss 3.47 | ppl 32.162 +| epoch 5 step 48600 | 2720 batches | lr 0.000861 | ms/batch 722.27 | loss 3.46 | ppl 31.777 +| epoch 5 step 48800 | 2920 batches | lr 0.00086 | ms/batch 724.85 | loss 3.45 | ppl 31.489 +| epoch 5 step 49000 | 3120 batches | lr 0.000859 | ms/batch 710.81 | loss 3.47 | ppl 32.099 +| epoch 5 step 49200 | 3320 batches | lr 0.000858 | ms/batch 706.84 | loss 3.48 | ppl 32.407 +| epoch 5 step 49400 | 3520 batches | lr 0.000857 | ms/batch 707.39 | loss 3.44 | ppl 31.235 +| epoch 5 step 49600 | 3720 batches | lr 0.000856 | ms/batch 716.47 | loss 3.47 | ppl 32.056 +| epoch 5 step 49800 | 3920 batches | lr 0.000855 | ms/batch 721.75 | loss 3.46 | ppl 31.917 +| epoch 5 step 50000 | 4120 batches | lr 0.000854 | ms/batch 701.48 | loss 3.46 | ppl 31.968 +| epoch 5 step 50200 | 4320 batches | lr 0.000853 | ms/batch 733.62 | loss 3.47 | ppl 32.081 +| epoch 5 step 50400 | 4520 batches | lr 0.000852 | ms/batch 707.41 | loss 3.48 | ppl 32.529 +| epoch 5 step 50600 | 4720 batches | lr 0.00085 | ms/batch 733.10 | loss 3.44 | ppl 31.243 +| epoch 5 step 50800 | 4920 batches | lr 0.000849 | ms/batch 439.30 | loss 3.46 | ppl 31.752 +| epoch 5 step 51000 | 5120 batches | lr 0.000848 | ms/batch 428.23 | loss 3.45 | ppl 31.582 +| epoch 5 step 51200 | 5320 batches | lr 0.000847 | ms/batch 428.16 | loss 3.45 | ppl 31.426 +| epoch 5 step 51400 | 5520 batches | lr 0.000846 | ms/batch 428.00 | loss 3.44 | ppl 31.258 +| epoch 5 step 51600 | 5720 batches | lr 0.000845 | ms/batch 428.31 | loss 3.46 | ppl 31.686 +| epoch 5 step 51800 | 5920 batches | lr 0.000844 | ms/batch 428.68 | loss 3.45 | ppl 31.622 +| epoch 5 step 52000 | 6120 batches | lr 0.000842 | ms/batch 428.13 | loss 3.45 | ppl 31.374 +---------------------------------------------------------------------------------------------------- +| Eval 13 at step 52000 | time: 2482.68s | valid loss 3.41 | valid ppl 30.380 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 52200 | 6320 batches | lr 0.000841 | ms/batch 479.93 | loss 3.47 | ppl 32.078 +| epoch 5 step 52400 | 6520 batches | lr 0.00084 | ms/batch 428.34 | loss 3.41 | ppl 30.391 +| epoch 5 step 52600 | 6720 batches | lr 0.000839 | ms/batch 428.29 | loss 3.42 | ppl 30.557 +| epoch 5 step 52800 | 6920 batches | lr 0.000838 | ms/batch 428.06 | loss 3.44 | ppl 31.190 +| epoch 5 step 53000 | 7120 batches | lr 0.000837 | ms/batch 427.79 | loss 3.43 | ppl 30.785 +| epoch 5 step 53200 | 7320 batches | lr 0.000836 | ms/batch 428.04 | loss 3.40 | ppl 29.880 +| epoch 5 step 53400 | 7520 batches | lr 0.000834 | ms/batch 427.78 | loss 3.43 | ppl 30.849 +| epoch 5 step 53600 | 7720 batches | lr 0.000833 | ms/batch 428.29 | loss 3.42 | ppl 30.652 +| epoch 5 step 53800 | 7920 batches | lr 0.000832 | ms/batch 430.31 | loss 3.42 | ppl 30.697 +| epoch 5 step 54000 | 8120 batches | lr 0.000831 | ms/batch 428.09 | loss 3.44 | ppl 31.114 +| epoch 5 step 54200 | 8320 batches | lr 0.00083 | ms/batch 428.52 | loss 3.43 | ppl 30.845 +| epoch 5 step 54400 | 8520 batches | lr 0.000828 | ms/batch 428.56 | loss 3.42 | ppl 30.624 +| epoch 5 step 54600 | 8720 batches | lr 0.000827 | ms/batch 428.02 | loss 3.44 | ppl 31.145 +| epoch 5 step 54800 | 8920 batches | lr 0.000826 | ms/batch 428.01 | loss 3.44 | ppl 31.221 +| epoch 5 step 55000 | 9120 batches | lr 0.000825 | ms/batch 427.99 | loss 3.43 | ppl 30.961 
+| epoch 5 step 55200 | 9320 batches | lr 0.000824 | ms/batch 428.43 | loss 3.42 | ppl 30.708 +| epoch 5 step 55400 | 9520 batches | lr 0.000823 | ms/batch 428.12 | loss 3.46 | ppl 31.685 +| epoch 5 step 55600 | 9720 batches | lr 0.000821 | ms/batch 427.89 | loss 3.43 | ppl 30.732 +| epoch 5 step 55800 | 9920 batches | lr 0.00082 | ms/batch 428.47 | loss 3.43 | ppl 30.858 +| epoch 5 step 56000 | 10120 batches | lr 0.000819 | ms/batch 428.88 | loss 3.43 | ppl 30.769 +---------------------------------------------------------------------------------------------------- +| Eval 14 at step 56000 | time: 1719.48s | valid loss 3.39 | valid ppl 29.702 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 56200 | 10320 batches | lr 0.000818 | ms/batch 481.91 | loss 3.43 | ppl 30.830 +| epoch 5 step 56400 | 10520 batches | lr 0.000816 | ms/batch 428.55 | loss 3.45 | ppl 31.519 +| epoch 5 step 56600 | 10720 batches | lr 0.000815 | ms/batch 428.19 | loss 3.42 | ppl 30.448 +| epoch 5 step 56800 | 10920 batches | lr 0.000814 | ms/batch 428.24 | loss 3.41 | ppl 30.308 +| epoch 5 step 57000 | 11120 batches | lr 0.000813 | ms/batch 428.07 | loss 3.47 | ppl 32.121 +| epoch 5 step 57200 | 11320 batches | lr 0.000812 | ms/batch 428.22 | loss 3.42 | ppl 30.698 +| epoch 6 step 57400 | 50 batches | lr 0.00081 | ms/batch 427.60 | loss 3.44 | ppl 31.304 +| epoch 6 step 57600 | 250 batches | lr 0.000809 | ms/batch 428.27 | loss 3.40 | ppl 29.816 +| epoch 6 step 57800 | 450 batches | lr 0.000808 | ms/batch 428.43 | loss 3.43 | ppl 31.010 +| epoch 6 step 58000 | 650 batches | lr 0.000807 | ms/batch 428.85 | loss 3.40 | ppl 29.986 +| epoch 6 step 58200 | 850 batches | lr 0.000805 | ms/batch 428.36 | loss 3.44 | ppl 31.179 +| epoch 6 step 58400 | 1050 batches | lr 0.000804 | ms/batch 428.27 | loss 3.42 | ppl 30.427 +| epoch 6 step 58600 | 1250 batches | lr 0.000803 | ms/batch 427.88 | loss 3.42 | ppl 30.439 +| epoch 6 step 58800 | 1450 batches | lr 0.000802 | ms/batch 428.26 | loss 3.42 | ppl 30.628 +| epoch 6 step 59000 | 1650 batches | lr 0.0008 | ms/batch 428.41 | loss 3.40 | ppl 29.997 +| epoch 6 step 59200 | 1850 batches | lr 0.000799 | ms/batch 428.81 | loss 3.42 | ppl 30.513 +| epoch 6 step 59400 | 2050 batches | lr 0.000798 | ms/batch 427.82 | loss 3.46 | ppl 31.775 +| epoch 6 step 59600 | 2250 batches | lr 0.000797 | ms/batch 428.09 | loss 3.43 | ppl 30.763 +| epoch 6 step 59800 | 2450 batches | lr 0.000795 | ms/batch 428.44 | loss 3.42 | ppl 30.721 +| epoch 6 step 60000 | 2650 batches | lr 0.000794 | ms/batch 428.03 | loss 3.42 | ppl 30.694 +---------------------------------------------------------------------------------------------------- +| Eval 15 at step 60000 | time: 1719.35s | valid loss 3.38 | valid ppl 29.457 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 60200 | 2850 batches | lr 0.000793 | ms/batch 481.37 | loss 3.37 | ppl 29.154 +| epoch 6 step 60400 | 3050 batches | lr 0.000792 | ms/batch 428.38 | loss 3.42 | ppl 30.655 +| epoch 6 step 60600 | 3250 batches | lr 0.00079 | ms/batch 428.15 | loss 3.41 | ppl 30.363 +| epoch 6 step 60800 | 3450 batches | lr 0.000789 | ms/batch 428.57 | loss 3.40 | ppl 29.835 +| epoch 6 step 61000 | 3650 batches | lr 0.000788 | ms/batch 428.17 | loss 3.40 | ppl 29.899 +| epoch 6 step 61200 | 3850 batches | lr 0.000786 | ms/batch 428.39 | loss 3.41 | ppl 30.122 +| epoch 6 step 61400 | 4050 batches | lr 0.000785 | ms/batch 428.27 | loss 
3.42 | ppl 30.664 +| epoch 6 step 61600 | 4250 batches | lr 0.000784 | ms/batch 428.29 | loss 3.41 | ppl 30.120 +| epoch 6 step 61800 | 4450 batches | lr 0.000783 | ms/batch 427.99 | loss 3.41 | ppl 30.317 +| epoch 6 step 62000 | 4650 batches | lr 0.000781 | ms/batch 428.43 | loss 3.41 | ppl 30.140 +| epoch 6 step 62200 | 4850 batches | lr 0.00078 | ms/batch 428.23 | loss 3.40 | ppl 29.843 +| epoch 6 step 62400 | 5050 batches | lr 0.000779 | ms/batch 428.52 | loss 3.41 | ppl 30.256 +| epoch 6 step 62600 | 5250 batches | lr 0.000777 | ms/batch 428.32 | loss 3.40 | ppl 29.897 +| epoch 6 step 62800 | 5450 batches | lr 0.000776 | ms/batch 428.15 | loss 3.37 | ppl 29.184 +| epoch 6 step 63000 | 5650 batches | lr 0.000775 | ms/batch 428.74 | loss 3.42 | ppl 30.596 +| epoch 6 step 63200 | 5850 batches | lr 0.000773 | ms/batch 428.17 | loss 3.40 | ppl 29.873 +| epoch 6 step 63400 | 6050 batches | lr 0.000772 | ms/batch 431.10 | loss 3.39 | ppl 29.602 +| epoch 6 step 63600 | 6250 batches | lr 0.000771 | ms/batch 428.80 | loss 3.40 | ppl 29.894 +| epoch 6 step 63800 | 6450 batches | lr 0.000769 | ms/batch 428.27 | loss 3.40 | ppl 30.015 +| epoch 6 step 64000 | 6650 batches | lr 0.000768 | ms/batch 427.89 | loss 3.35 | ppl 28.502 +---------------------------------------------------------------------------------------------------- +| Eval 16 at step 64000 | time: 1720.26s | valid loss 3.37 | valid ppl 29.191 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 64200 | 6850 batches | lr 0.000767 | ms/batch 480.29 | loss 3.38 | ppl 29.424 +| epoch 6 step 64400 | 7050 batches | lr 0.000765 | ms/batch 428.06 | loss 3.38 | ppl 29.457 +| epoch 6 step 64600 | 7250 batches | lr 0.000764 | ms/batch 428.26 | loss 3.35 | ppl 28.404 +| epoch 6 step 64800 | 7450 batches | lr 0.000763 | ms/batch 427.97 | loss 3.37 | ppl 29.176 +| epoch 6 step 65000 | 7650 batches | lr 0.000761 | ms/batch 427.80 | loss 3.36 | ppl 28.687 +| epoch 6 step 65200 | 7850 batches | lr 0.00076 | ms/batch 427.94 | loss 3.38 | ppl 29.239 +| epoch 6 step 65400 | 8050 batches | lr 0.000759 | ms/batch 428.21 | loss 3.38 | ppl 29.423 +| epoch 6 step 65600 | 8250 batches | lr 0.000757 | ms/batch 428.24 | loss 3.37 | ppl 29.027 +| epoch 6 step 65800 | 8450 batches | lr 0.000756 | ms/batch 428.08 | loss 3.39 | ppl 29.561 +| epoch 6 step 66000 | 8650 batches | lr 0.000755 | ms/batch 428.12 | loss 3.37 | ppl 29.182 +| epoch 6 step 66200 | 8850 batches | lr 0.000753 | ms/batch 427.80 | loss 3.39 | ppl 29.755 +| epoch 6 step 66400 | 9050 batches | lr 0.000752 | ms/batch 427.84 | loss 3.38 | ppl 29.461 +| epoch 6 step 66600 | 9250 batches | lr 0.000751 | ms/batch 428.23 | loss 3.37 | ppl 29.042 +| epoch 6 step 66800 | 9450 batches | lr 0.000749 | ms/batch 428.13 | loss 3.39 | ppl 29.675 +| epoch 6 step 67000 | 9650 batches | lr 0.000748 | ms/batch 428.30 | loss 3.40 | ppl 29.988 +| epoch 6 step 67200 | 9850 batches | lr 0.000747 | ms/batch 427.99 | loss 3.35 | ppl 28.570 +| epoch 6 step 67400 | 10050 batches | lr 0.000745 | ms/batch 427.95 | loss 3.40 | ppl 29.984 +| epoch 6 step 67600 | 10250 batches | lr 0.000744 | ms/batch 428.03 | loss 3.35 | ppl 28.630 +| epoch 6 step 67800 | 10450 batches | lr 0.000742 | ms/batch 430.31 | loss 3.39 | ppl 29.531 +| epoch 6 step 68000 | 10650 batches | lr 0.000741 | ms/batch 427.87 | loss 3.40 | ppl 29.901 +---------------------------------------------------------------------------------------------------- +| Eval 17 at step 68000 | time: 1719.02s | 
valid loss 3.36 | valid ppl 28.688 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 68200 | 10850 batches | lr 0.00074 | ms/batch 480.96 | loss 3.35 | ppl 28.405 +| epoch 6 step 68400 | 11050 batches | lr 0.000738 | ms/batch 427.96 | loss 3.39 | ppl 29.811 +| epoch 6 step 68600 | 11250 batches | lr 0.000737 | ms/batch 428.15 | loss 3.41 | ppl 30.203 +| epoch 6 step 68800 | 11450 batches | lr 0.000736 | ms/batch 428.01 | loss 3.37 | ppl 29.109 +| epoch 7 step 69000 | 180 batches | lr 0.000734 | ms/batch 426.98 | loss 3.36 | ppl 28.847 +| epoch 7 step 69200 | 380 batches | lr 0.000733 | ms/batch 427.99 | loss 3.36 | ppl 28.907 +| epoch 7 step 69400 | 580 batches | lr 0.000731 | ms/batch 428.36 | loss 3.37 | ppl 28.943 +| epoch 7 step 69600 | 780 batches | lr 0.00073 | ms/batch 428.04 | loss 3.37 | ppl 29.147 +| epoch 7 step 69800 | 980 batches | lr 0.000729 | ms/batch 428.00 | loss 3.35 | ppl 28.565 +| epoch 7 step 70000 | 1180 batches | lr 0.000727 | ms/batch 428.01 | loss 3.38 | ppl 29.455 +| epoch 7 step 70200 | 1380 batches | lr 0.000726 | ms/batch 428.23 | loss 3.36 | ppl 28.842 +| epoch 7 step 70400 | 1580 batches | lr 0.000724 | ms/batch 428.06 | loss 3.36 | ppl 28.832 +| epoch 7 step 70600 | 1780 batches | lr 0.000723 | ms/batch 428.43 | loss 3.36 | ppl 28.804 +| epoch 7 step 70800 | 1980 batches | lr 0.000722 | ms/batch 428.28 | loss 3.39 | ppl 29.744 +| epoch 7 step 71000 | 2180 batches | lr 0.00072 | ms/batch 428.36 | loss 3.38 | ppl 29.446 +| epoch 7 step 71200 | 2380 batches | lr 0.000719 | ms/batch 428.04 | loss 3.38 | ppl 29.368 +| epoch 7 step 71400 | 2580 batches | lr 0.000717 | ms/batch 428.28 | loss 3.36 | ppl 28.901 +| epoch 7 step 71600 | 2780 batches | lr 0.000716 | ms/batch 428.22 | loss 3.34 | ppl 28.336 +| epoch 7 step 71800 | 2980 batches | lr 0.000714 | ms/batch 427.98 | loss 3.36 | ppl 28.688 +| epoch 7 step 72000 | 3180 batches | lr 0.000713 | ms/batch 428.29 | loss 3.37 | ppl 29.018 +---------------------------------------------------------------------------------------------------- +| Eval 18 at step 72000 | time: 1718.69s | valid loss 3.34 | valid ppl 28.340 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 72200 | 3380 batches | lr 0.000712 | ms/batch 480.57 | loss 3.36 | ppl 28.833 +| epoch 7 step 72400 | 3580 batches | lr 0.00071 | ms/batch 428.02 | loss 3.34 | ppl 28.200 +| epoch 7 step 72600 | 3780 batches | lr 0.000709 | ms/batch 428.30 | loss 3.36 | ppl 28.651 +| epoch 7 step 72800 | 3980 batches | lr 0.000707 | ms/batch 428.18 | loss 3.36 | ppl 28.922 +| epoch 7 step 73000 | 4180 batches | lr 0.000706 | ms/batch 428.44 | loss 3.36 | ppl 28.777 +| epoch 7 step 73200 | 4380 batches | lr 0.000704 | ms/batch 428.60 | loss 3.36 | ppl 28.768 +| epoch 7 step 73400 | 4580 batches | lr 0.000703 | ms/batch 427.98 | loss 3.38 | ppl 29.301 +| epoch 7 step 73600 | 4780 batches | lr 0.000702 | ms/batch 427.88 | loss 3.33 | ppl 28.012 +| epoch 7 step 73800 | 4980 batches | lr 0.0007 | ms/batch 428.03 | loss 3.37 | ppl 29.179 +| epoch 7 step 74000 | 5180 batches | lr 0.000699 | ms/batch 428.27 | loss 3.34 | ppl 28.334 +| epoch 7 step 74200 | 5380 batches | lr 0.000697 | ms/batch 428.23 | loss 3.32 | ppl 27.662 +| epoch 7 step 74400 | 5580 batches | lr 0.000696 | ms/batch 428.04 | loss 3.35 | ppl 28.373 +| epoch 7 step 74600 | 5780 batches | lr 0.000694 | ms/batch 428.14 | loss 3.37 | ppl 28.974 +| epoch 7 step 74800 | 5980 batches | 
lr 0.000693 | ms/batch 428.03 | loss 3.34 | ppl 28.198 +| epoch 7 step 75000 | 6180 batches | lr 0.000691 | ms/batch 428.09 | loss 3.34 | ppl 28.141 +| epoch 7 step 75200 | 6380 batches | lr 0.00069 | ms/batch 428.46 | loss 3.37 | ppl 29.134 +| epoch 7 step 75400 | 6580 batches | lr 0.000689 | ms/batch 428.24 | loss 3.30 | ppl 27.073 +| epoch 7 step 75600 | 6780 batches | lr 0.000687 | ms/batch 428.32 | loss 3.33 | ppl 27.915 +| epoch 7 step 75800 | 6980 batches | lr 0.000686 | ms/batch 428.01 | loss 3.34 | ppl 28.342 +| epoch 7 step 76000 | 7180 batches | lr 0.000684 | ms/batch 428.26 | loss 3.30 | ppl 27.012 +---------------------------------------------------------------------------------------------------- +| Eval 19 at step 76000 | time: 1719.03s | valid loss 3.34 | valid ppl 28.085 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 76200 | 7380 batches | lr 0.000683 | ms/batch 480.62 | loss 3.32 | ppl 27.748 +| epoch 7 step 76400 | 7580 batches | lr 0.000681 | ms/batch 428.12 | loss 3.30 | ppl 27.084 +| epoch 7 step 76600 | 7780 batches | lr 0.00068 | ms/batch 428.01 | loss 3.33 | ppl 28.010 +| epoch 7 step 76800 | 7980 batches | lr 0.000678 | ms/batch 428.40 | loss 3.33 | ppl 27.921 +| epoch 7 step 77000 | 8180 batches | lr 0.000677 | ms/batch 428.37 | loss 3.31 | ppl 27.488 +| epoch 7 step 77200 | 8380 batches | lr 0.000675 | ms/batch 428.44 | loss 3.35 | ppl 28.428 +| epoch 7 step 77400 | 8580 batches | lr 0.000674 | ms/batch 428.56 | loss 3.32 | ppl 27.769 +| epoch 7 step 77600 | 8780 batches | lr 0.000672 | ms/batch 428.27 | loss 3.34 | ppl 28.127 +| epoch 7 step 77800 | 8980 batches | lr 0.000671 | ms/batch 428.11 | loss 3.34 | ppl 28.080 +| epoch 7 step 78000 | 9180 batches | lr 0.00067 | ms/batch 428.36 | loss 3.32 | ppl 27.589 +| epoch 7 step 78200 | 9380 batches | lr 0.000668 | ms/batch 428.37 | loss 3.33 | ppl 28.024 +| epoch 7 step 78400 | 9580 batches | lr 0.000667 | ms/batch 428.24 | loss 3.35 | ppl 28.582 +| epoch 7 step 78600 | 9780 batches | lr 0.000665 | ms/batch 428.30 | loss 3.32 | ppl 27.792 +| epoch 7 step 78800 | 9980 batches | lr 0.000664 | ms/batch 428.32 | loss 3.33 | ppl 27.822 +| epoch 7 step 79000 | 10180 batches | lr 0.000662 | ms/batch 428.43 | loss 3.31 | ppl 27.507 +| epoch 7 step 79200 | 10380 batches | lr 0.000661 | ms/batch 428.67 | loss 3.33 | ppl 27.883 +| epoch 7 step 79400 | 10580 batches | lr 0.000659 | ms/batch 428.45 | loss 3.35 | ppl 28.534 +| epoch 7 step 79600 | 10780 batches | lr 0.000658 | ms/batch 428.45 | loss 3.31 | ppl 27.300 +| epoch 7 step 79800 | 10980 batches | lr 0.000656 | ms/batch 428.51 | loss 3.33 | ppl 28.003 +| epoch 7 step 80000 | 11180 batches | lr 0.000655 | ms/batch 428.08 | loss 3.35 | ppl 28.570 +---------------------------------------------------------------------------------------------------- +| Eval 20 at step 80000 | time: 1719.62s | valid loss 3.33 | valid ppl 27.910 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 80200 | 11380 batches | lr 0.000653 | ms/batch 481.33 | loss 3.34 | ppl 28.104 +| epoch 8 step 80400 | 110 batches | lr 0.000652 | ms/batch 427.32 | loss 3.32 | ppl 27.722 +| epoch 8 step 80600 | 310 batches | lr 0.00065 | ms/batch 428.44 | loss 3.31 | ppl 27.342 +| epoch 8 step 80800 | 510 batches | lr 0.000649 | ms/batch 428.57 | loss 3.34 | ppl 28.236 +| epoch 8 step 81000 | 710 batches | lr 0.000647 | ms/batch 428.00 | loss 3.30 | ppl 27.046 +| epoch 8 step 
81200 | 910 batches | lr 0.000646 | ms/batch 428.73 | loss 3.31 | ppl 27.389 +| epoch 8 step 81400 | 1110 batches | lr 0.000644 | ms/batch 428.04 | loss 3.34 | ppl 28.203 +| epoch 8 step 81600 | 1310 batches | lr 0.000643 | ms/batch 428.37 | loss 3.31 | ppl 27.453 +| epoch 8 step 81800 | 1510 batches | lr 0.000641 | ms/batch 428.54 | loss 3.31 | ppl 27.477 +| epoch 8 step 82000 | 1710 batches | lr 0.00064 | ms/batch 428.08 | loss 3.30 | ppl 27.048 +| epoch 8 step 82200 | 1910 batches | lr 0.000638 | ms/batch 428.45 | loss 3.33 | ppl 28.077 +| epoch 8 step 82400 | 2110 batches | lr 0.000637 | ms/batch 428.41 | loss 3.35 | ppl 28.551 +| epoch 8 step 82600 | 2310 batches | lr 0.000635 | ms/batch 428.17 | loss 3.33 | ppl 27.998 +| epoch 8 step 82800 | 2510 batches | lr 0.000634 | ms/batch 428.32 | loss 3.31 | ppl 27.500 +| epoch 8 step 83000 | 2710 batches | lr 0.000632 | ms/batch 428.30 | loss 3.31 | ppl 27.355 +| epoch 8 step 83200 | 2910 batches | lr 0.000631 | ms/batch 428.26 | loss 3.29 | ppl 26.778 +| epoch 8 step 83400 | 3110 batches | lr 0.000629 | ms/batch 428.27 | loss 3.32 | ppl 27.565 +| epoch 8 step 83600 | 3310 batches | lr 0.000628 | ms/batch 428.68 | loss 3.33 | ppl 27.977 +| epoch 8 step 83800 | 3510 batches | lr 0.000626 | ms/batch 428.36 | loss 3.29 | ppl 26.866 +| epoch 8 step 84000 | 3710 batches | lr 0.000624 | ms/batch 428.21 | loss 3.31 | ppl 27.460 +---------------------------------------------------------------------------------------------------- +| Eval 21 at step 84000 | time: 1719.55s | valid loss 3.31 | valid ppl 27.444 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 84200 | 3910 batches | lr 0.000623 | ms/batch 480.82 | loss 3.30 | ppl 27.247 +| epoch 8 step 84400 | 4110 batches | lr 0.000621 | ms/batch 428.46 | loss 3.32 | ppl 27.559 +| epoch 8 step 84600 | 4310 batches | lr 0.00062 | ms/batch 428.36 | loss 3.31 | ppl 27.483 +| epoch 8 step 84800 | 4510 batches | lr 0.000618 | ms/batch 428.27 | loss 3.33 | ppl 27.937 +| epoch 8 step 85000 | 4710 batches | lr 0.000617 | ms/batch 428.47 | loss 3.29 | ppl 26.787 +| epoch 8 step 85200 | 4910 batches | lr 0.000615 | ms/batch 428.45 | loss 3.30 | ppl 27.248 +| epoch 8 step 85400 | 5110 batches | lr 0.000614 | ms/batch 428.55 | loss 3.30 | ppl 27.202 +| epoch 8 step 85600 | 5310 batches | lr 0.000612 | ms/batch 428.21 | loss 3.29 | ppl 26.922 +| epoch 8 step 85800 | 5510 batches | lr 0.000611 | ms/batch 428.44 | loss 3.30 | ppl 26.991 +| epoch 8 step 86000 | 5710 batches | lr 0.000609 | ms/batch 428.89 | loss 3.30 | ppl 27.137 +| epoch 8 step 86200 | 5910 batches | lr 0.000608 | ms/batch 428.44 | loss 3.31 | ppl 27.249 +| epoch 8 step 86400 | 6110 batches | lr 0.000606 | ms/batch 428.40 | loss 3.30 | ppl 27.105 +| epoch 8 step 86600 | 6310 batches | lr 0.000605 | ms/batch 428.80 | loss 3.31 | ppl 27.474 +| epoch 8 step 86800 | 6510 batches | lr 0.000603 | ms/batch 429.72 | loss 3.26 | ppl 26.174 +| epoch 8 step 87000 | 6710 batches | lr 0.000602 | ms/batch 428.74 | loss 3.27 | ppl 26.276 +| epoch 8 step 87200 | 6910 batches | lr 0.0006 | ms/batch 428.17 | loss 3.29 | ppl 26.765 +| epoch 8 step 87400 | 7110 batches | lr 0.000598 | ms/batch 427.98 | loss 3.28 | ppl 26.610 +| epoch 8 step 87600 | 7310 batches | lr 0.000597 | ms/batch 428.15 | loss 3.25 | ppl 25.667 +| epoch 8 step 87800 | 7510 batches | lr 0.000595 | ms/batch 428.23 | loss 3.28 | ppl 26.612 +| epoch 8 step 88000 | 7710 batches | lr 0.000594 | ms/batch 428.25 | loss 3.27 | ppl 26.351 
+---------------------------------------------------------------------------------------------------- +| Eval 22 at step 88000 | time: 1720.20s | valid loss 3.30 | valid ppl 27.148 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 88200 | 7910 batches | lr 0.000592 | ms/batch 481.35 | loss 3.27 | ppl 26.388 +| epoch 8 step 88400 | 8110 batches | lr 0.000591 | ms/batch 428.47 | loss 3.28 | ppl 26.693 +| epoch 8 step 88600 | 8310 batches | lr 0.000589 | ms/batch 428.66 | loss 3.28 | ppl 26.491 +| epoch 8 step 88800 | 8510 batches | lr 0.000588 | ms/batch 428.62 | loss 3.28 | ppl 26.477 +| epoch 8 step 89000 | 8710 batches | lr 0.000586 | ms/batch 428.72 | loss 3.29 | ppl 26.868 +| epoch 8 step 89200 | 8910 batches | lr 0.000585 | ms/batch 431.39 | loss 3.29 | ppl 26.753 +| epoch 8 step 89400 | 9110 batches | lr 0.000583 | ms/batch 429.99 | loss 3.29 | ppl 26.822 +| epoch 8 step 89600 | 9310 batches | lr 0.000581 | ms/batch 428.65 | loss 3.27 | ppl 26.355 +| epoch 8 step 89800 | 9510 batches | lr 0.00058 | ms/batch 428.13 | loss 3.30 | ppl 27.153 +| epoch 8 step 90000 | 9710 batches | lr 0.000578 | ms/batch 428.01 | loss 3.28 | ppl 26.579 +| epoch 8 step 90200 | 9910 batches | lr 0.000577 | ms/batch 428.22 | loss 3.27 | ppl 26.390 +| epoch 8 step 90400 | 10110 batches | lr 0.000575 | ms/batch 427.84 | loss 3.28 | ppl 26.629 +| epoch 8 step 90600 | 10310 batches | lr 0.000574 | ms/batch 428.60 | loss 3.28 | ppl 26.444 +| epoch 8 step 90800 | 10510 batches | lr 0.000572 | ms/batch 429.39 | loss 3.30 | ppl 27.174 +| epoch 8 step 91000 | 10710 batches | lr 0.000571 | ms/batch 428.29 | loss 3.27 | ppl 26.291 +| epoch 8 step 91200 | 10910 batches | lr 0.000569 | ms/batch 430.09 | loss 3.26 | ppl 26.014 +| epoch 8 step 91400 | 11110 batches | lr 0.000567 | ms/batch 428.66 | loss 3.32 | ppl 27.663 +| epoch 8 step 91600 | 11310 batches | lr 0.000566 | ms/batch 428.81 | loss 3.28 | ppl 26.603 +| epoch 9 step 91800 | 40 batches | lr 0.000564 | ms/batch 426.93 | loss 3.30 | ppl 26.989 +| epoch 9 step 92000 | 240 batches | lr 0.000563 | ms/batch 428.41 | loss 3.25 | ppl 25.705 +---------------------------------------------------------------------------------------------------- +| Eval 23 at step 92000 | time: 1721.26s | valid loss 3.30 | valid ppl 27.072 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 92200 | 440 batches | lr 0.000561 | ms/batch 483.07 | loss 3.29 | ppl 26.728 +| epoch 9 step 92400 | 640 batches | lr 0.00056 | ms/batch 428.39 | loss 3.25 | ppl 25.916 +| epoch 9 step 92600 | 840 batches | lr 0.000558 | ms/batch 428.56 | loss 3.30 | ppl 27.003 +| epoch 9 step 92800 | 1040 batches | lr 0.000557 | ms/batch 428.59 | loss 3.26 | ppl 26.037 +| epoch 9 step 93000 | 1240 batches | lr 0.000555 | ms/batch 427.68 | loss 3.27 | ppl 26.276 +| epoch 9 step 93200 | 1440 batches | lr 0.000553 | ms/batch 430.44 | loss 3.28 | ppl 26.496 +| epoch 9 step 93400 | 1640 batches | lr 0.000552 | ms/batch 429.16 | loss 3.25 | ppl 25.806 +| epoch 9 step 93600 | 1840 batches | lr 0.00055 | ms/batch 428.82 | loss 3.27 | ppl 26.350 +| epoch 9 step 93800 | 2040 batches | lr 0.000549 | ms/batch 430.56 | loss 3.31 | ppl 27.417 +| epoch 9 step 94000 | 2240 batches | lr 0.000547 | ms/batch 428.76 | loss 3.28 | ppl 26.510 +| epoch 9 step 94200 | 2440 batches | lr 0.000546 | ms/batch 428.37 | loss 3.28 | ppl 26.535 +| epoch 9 step 94400 | 2640 batches | lr 0.000544 | ms/batch 429.44 | loss 
3.27 | ppl 26.435 +| epoch 9 step 94600 | 2840 batches | lr 0.000542 | ms/batch 431.05 | loss 3.23 | ppl 25.312 +| epoch 9 step 94800 | 3040 batches | lr 0.000541 | ms/batch 431.02 | loss 3.28 | ppl 26.446 +| epoch 9 step 95000 | 3240 batches | lr 0.000539 | ms/batch 430.52 | loss 3.27 | ppl 26.223 +| epoch 9 step 95200 | 3440 batches | lr 0.000538 | ms/batch 431.61 | loss 3.25 | ppl 25.850 +| epoch 9 step 95400 | 3640 batches | lr 0.000536 | ms/batch 430.76 | loss 3.25 | ppl 25.776 +| epoch 9 step 95600 | 3840 batches | lr 0.000535 | ms/batch 431.52 | loss 3.27 | ppl 26.191 +| epoch 9 step 95800 | 4040 batches | lr 0.000533 | ms/batch 431.13 | loss 3.28 | ppl 26.543 +| epoch 9 step 96000 | 4240 batches | lr 0.000532 | ms/batch 430.68 | loss 3.26 | ppl 26.073 +---------------------------------------------------------------------------------------------------- +| Eval 24 at step 96000 | time: 1725.84s | valid loss 3.29 | valid ppl 26.753 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 96200 | 4440 batches | lr 0.00053 | ms/batch 485.06 | loss 3.26 | ppl 26.156 +| epoch 9 step 96400 | 4640 batches | lr 0.000528 | ms/batch 430.88 | loss 3.26 | ppl 26.108 +| epoch 9 step 96600 | 4840 batches | lr 0.000527 | ms/batch 431.97 | loss 3.25 | ppl 25.737 +| epoch 9 step 96800 | 5040 batches | lr 0.000525 | ms/batch 432.24 | loss 3.27 | ppl 26.276 +| epoch 9 step 97000 | 5240 batches | lr 0.000524 | ms/batch 431.45 | loss 3.26 | ppl 25.981 +| epoch 9 step 97200 | 5440 batches | lr 0.000522 | ms/batch 430.67 | loss 3.23 | ppl 25.161 +| epoch 9 step 97400 | 5640 batches | lr 0.000521 | ms/batch 432.60 | loss 3.27 | ppl 26.376 +| epoch 9 step 97600 | 5840 batches | lr 0.000519 | ms/batch 431.40 | loss 3.26 | ppl 26.045 +| epoch 9 step 97800 | 6040 batches | lr 0.000517 | ms/batch 432.17 | loss 3.24 | ppl 25.492 +| epoch 9 step 98000 | 6240 batches | lr 0.000516 | ms/batch 431.30 | loss 3.25 | ppl 25.846 +| epoch 9 step 98200 | 6440 batches | lr 0.000514 | ms/batch 432.92 | loss 3.26 | ppl 26.078 +| epoch 9 step 98400 | 6640 batches | lr 0.000513 | ms/batch 431.41 | loss 3.21 | ppl 24.699 +| epoch 9 step 98600 | 6840 batches | lr 0.000511 | ms/batch 431.49 | loss 3.24 | ppl 25.454 +| epoch 9 step 98800 | 7040 batches | lr 0.00051 | ms/batch 430.99 | loss 3.24 | ppl 25.585 +| epoch 9 step 99000 | 7240 batches | lr 0.000508 | ms/batch 430.86 | loss 3.21 | ppl 24.714 +| epoch 9 step 99200 | 7440 batches | lr 0.000506 | ms/batch 430.27 | loss 3.23 | ppl 25.190 +| epoch 9 step 99400 | 7640 batches | lr 0.000505 | ms/batch 432.07 | loss 3.21 | ppl 24.787 +| epoch 9 step 99600 | 7840 batches | lr 0.000503 | ms/batch 431.24 | loss 3.24 | ppl 25.439 +| epoch 9 step 99800 | 8040 batches | lr 0.000502 | ms/batch 430.41 | loss 3.24 | ppl 25.411 +| epoch 9 step 100000 | 8240 batches | lr 0.0005 | ms/batch 431.67 | loss 3.22 | ppl 25.115 +---------------------------------------------------------------------------------------------------- +| Eval 25 at step 100000 | time: 1732.27s | valid loss 3.28 | valid ppl 26.518 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 100200 | 8440 batches | lr 0.000499 | ms/batch 484.14 | loss 3.24 | ppl 25.577 +| epoch 9 step 100400 | 8640 batches | lr 0.000497 | ms/batch 431.81 | loss 3.23 | ppl 25.193 +| epoch 9 step 100600 | 8840 batches | lr 0.000495 | ms/batch 431.22 | loss 3.25 | ppl 25.863 +| epoch 9 step 100800 | 9040 batches | lr 0.000494 | 
ms/batch 431.17 | loss 3.24 | ppl 25.506 +| epoch 9 step 101000 | 9240 batches | lr 0.000492 | ms/batch 432.11 | loss 3.22 | ppl 25.014 +| epoch 9 step 101200 | 9440 batches | lr 0.000491 | ms/batch 430.57 | loss 3.24 | ppl 25.629 +| epoch 9 step 101400 | 9640 batches | lr 0.000489 | ms/batch 430.89 | loss 3.26 | ppl 26.022 +| epoch 9 step 101600 | 9840 batches | lr 0.000488 | ms/batch 431.35 | loss 3.21 | ppl 24.780 +| epoch 9 step 101800 | 10040 batches | lr 0.000486 | ms/batch 430.97 | loss 3.25 | ppl 25.722 +| epoch 9 step 102000 | 10240 batches | lr 0.000484 | ms/batch 432.01 | loss 3.22 | ppl 24.964 +| epoch 9 step 102200 | 10440 batches | lr 0.000483 | ms/batch 430.66 | loss 3.24 | ppl 25.515 +| epoch 9 step 102400 | 10640 batches | lr 0.000481 | ms/batch 431.30 | loss 3.26 | ppl 26.013 +| epoch 9 step 102600 | 10840 batches | lr 0.00048 | ms/batch 430.47 | loss 3.20 | ppl 24.498 +| epoch 9 step 102800 | 11040 batches | lr 0.000478 | ms/batch 430.42 | loss 3.26 | ppl 25.984 +| epoch 9 step 103000 | 11240 batches | lr 0.000477 | ms/batch 430.79 | loss 3.26 | ppl 26.065 +| epoch 9 step 103200 | 11440 batches | lr 0.000475 | ms/batch 431.88 | loss 3.23 | ppl 25.322 +| epoch 10 step 103400 | 170 batches | lr 0.000473 | ms/batch 429.77 | loss 3.22 | ppl 25.117 +| epoch 10 step 103600 | 370 batches | lr 0.000472 | ms/batch 431.10 | loss 3.21 | ppl 24.886 +| epoch 10 step 103800 | 570 batches | lr 0.00047 | ms/batch 430.70 | loss 3.23 | ppl 25.215 +| epoch 10 step 104000 | 770 batches | lr 0.000469 | ms/batch 430.67 | loss 3.23 | ppl 25.190 +---------------------------------------------------------------------------------------------------- +| Eval 26 at step 104000 | time: 1730.45s | valid loss 3.26 | valid ppl 26.179 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 104200 | 970 batches | lr 0.000467 | ms/batch 484.27 | loss 3.21 | ppl 24.692 +| epoch 10 step 104400 | 1170 batches | lr 0.000466 | ms/batch 432.12 | loss 3.24 | ppl 25.567 +| epoch 10 step 104600 | 1370 batches | lr 0.000464 | ms/batch 432.32 | loss 3.22 | ppl 24.984 +| epoch 10 step 104800 | 1570 batches | lr 0.000462 | ms/batch 430.59 | loss 3.21 | ppl 24.857 +| epoch 10 step 105000 | 1770 batches | lr 0.000461 | ms/batch 431.50 | loss 3.22 | ppl 24.967 +| epoch 10 step 105200 | 1970 batches | lr 0.000459 | ms/batch 432.34 | loss 3.25 | ppl 25.699 +| epoch 10 step 105400 | 2170 batches | lr 0.000458 | ms/batch 431.17 | loss 3.24 | ppl 25.529 +| epoch 10 step 105600 | 2370 batches | lr 0.000456 | ms/batch 430.79 | loss 3.23 | ppl 25.362 +| epoch 10 step 105800 | 2570 batches | lr 0.000455 | ms/batch 431.08 | loss 3.22 | ppl 25.140 +| epoch 10 step 106000 | 2770 batches | lr 0.000453 | ms/batch 432.28 | loss 3.20 | ppl 24.603 +| epoch 10 step 106200 | 2970 batches | lr 0.000451 | ms/batch 430.58 | loss 3.21 | ppl 24.817 +| epoch 10 step 106400 | 3170 batches | lr 0.00045 | ms/batch 431.15 | loss 3.23 | ppl 25.248 +| epoch 10 step 106600 | 3370 batches | lr 0.000448 | ms/batch 431.26 | loss 3.22 | ppl 25.082 +| epoch 10 step 106800 | 3570 batches | lr 0.000447 | ms/batch 431.44 | loss 3.20 | ppl 24.526 +| epoch 10 step 107000 | 3770 batches | lr 0.000445 | ms/batch 431.31 | loss 3.21 | ppl 24.815 +| epoch 10 step 107200 | 3970 batches | lr 0.000444 | ms/batch 430.57 | loss 3.22 | ppl 25.021 +| epoch 10 step 107400 | 4170 batches | lr 0.000442 | ms/batch 431.10 | loss 3.22 | ppl 24.926 +| epoch 10 step 107600 | 4370 batches | lr 0.000441 | ms/batch 431.03 | 
loss 3.22 | ppl 25.090 +| epoch 10 step 107800 | 4570 batches | lr 0.000439 | ms/batch 431.94 | loss 3.23 | ppl 25.375 +| epoch 10 step 108000 | 4770 batches | lr 0.000437 | ms/batch 431.69 | loss 3.19 | ppl 24.269 +---------------------------------------------------------------------------------------------------- +| Eval 27 at step 108000 | time: 1731.81s | valid loss 3.25 | valid ppl 25.797 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 108200 | 4970 batches | lr 0.000436 | ms/batch 485.38 | loss 3.23 | ppl 25.232 +| epoch 10 step 108400 | 5170 batches | lr 0.000434 | ms/batch 431.08 | loss 3.21 | ppl 24.658 +| epoch 10 step 108600 | 5370 batches | lr 0.000433 | ms/batch 431.32 | loss 3.18 | ppl 24.114 +| epoch 10 step 108800 | 5570 batches | lr 0.000431 | ms/batch 432.75 | loss 3.20 | ppl 24.577 +| epoch 10 step 109000 | 5770 batches | lr 0.00043 | ms/batch 430.87 | loss 3.22 | ppl 25.109 +| epoch 10 step 109200 | 5970 batches | lr 0.000428 | ms/batch 432.85 | loss 3.20 | ppl 24.520 +| epoch 10 step 109400 | 6170 batches | lr 0.000427 | ms/batch 431.12 | loss 3.20 | ppl 24.429 +| epoch 10 step 109600 | 6370 batches | lr 0.000425 | ms/batch 431.69 | loss 3.24 | ppl 25.443 +| epoch 10 step 109800 | 6570 batches | lr 0.000423 | ms/batch 431.06 | loss 3.15 | ppl 23.412 +| epoch 10 step 110000 | 6770 batches | lr 0.000422 | ms/batch 431.66 | loss 3.19 | ppl 24.228 +| epoch 10 step 110200 | 6970 batches | lr 0.00042 | ms/batch 432.02 | loss 3.20 | ppl 24.598 +| epoch 10 step 110400 | 7170 batches | lr 0.000419 | ms/batch 432.58 | loss 3.16 | ppl 23.460 +| epoch 10 step 110600 | 7370 batches | lr 0.000417 | ms/batch 431.44 | loss 3.18 | ppl 24.138 +| epoch 10 step 110800 | 7570 batches | lr 0.000416 | ms/batch 433.20 | loss 3.16 | ppl 23.507 +| epoch 10 step 111000 | 7770 batches | lr 0.000414 | ms/batch 430.91 | loss 3.19 | ppl 24.391 +| epoch 10 step 111200 | 7970 batches | lr 0.000413 | ms/batch 433.04 | loss 3.18 | ppl 24.116 +| epoch 10 step 111400 | 8170 batches | lr 0.000411 | ms/batch 431.97 | loss 3.17 | ppl 23.883 +| epoch 10 step 111600 | 8370 batches | lr 0.000409 | ms/batch 432.20 | loss 3.20 | ppl 24.590 +| epoch 10 step 111800 | 8570 batches | lr 0.000408 | ms/batch 432.86 | loss 3.18 | ppl 24.126 +| epoch 10 step 112000 | 8770 batches | lr 0.000406 | ms/batch 432.45 | loss 3.19 | ppl 24.310 +---------------------------------------------------------------------------------------------------- +| Eval 28 at step 112000 | time: 1734.16s | valid loss 3.24 | valid ppl 25.577 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 112200 | 8970 batches | lr 0.000405 | ms/batch 484.80 | loss 3.20 | ppl 24.473 +| epoch 10 step 112400 | 9170 batches | lr 0.000403 | ms/batch 432.34 | loss 3.18 | ppl 23.977 +| epoch 10 step 112600 | 9370 batches | lr 0.000402 | ms/batch 434.24 | loss 3.19 | ppl 24.270 +| epoch 10 step 112800 | 9570 batches | lr 0.0004 | ms/batch 430.73 | loss 3.21 | ppl 24.773 +| epoch 10 step 113000 | 9770 batches | lr 0.000399 | ms/batch 431.89 | loss 3.19 | ppl 24.185 +| epoch 10 step 113200 | 9970 batches | lr 0.000397 | ms/batch 432.06 | loss 3.19 | ppl 24.191 +| epoch 10 step 113400 | 10170 batches | lr 0.000396 | ms/batch 431.38 | loss 3.16 | ppl 23.627 +| epoch 10 step 113600 | 10370 batches | lr 0.000394 | ms/batch 430.96 | loss 3.19 | ppl 24.257 +| epoch 10 step 113800 | 10570 batches | lr 0.000393 | ms/batch 431.43 | loss 3.21 | 
ppl 24.877 +| epoch 10 step 114000 | 10770 batches | lr 0.000391 | ms/batch 432.73 | loss 3.17 | ppl 23.728 +| epoch 10 step 114200 | 10970 batches | lr 0.000389 | ms/batch 433.81 | loss 3.18 | ppl 24.106 +| epoch 10 step 114400 | 11170 batches | lr 0.000388 | ms/batch 431.64 | loss 3.22 | ppl 24.942 +| epoch 10 step 114600 | 11370 batches | lr 0.000386 | ms/batch 434.07 | loss 3.19 | ppl 24.404 +| epoch 11 step 114800 | 100 batches | lr 0.000385 | ms/batch 430.90 | loss 3.18 | ppl 24.123 +| epoch 11 step 115000 | 300 batches | lr 0.000383 | ms/batch 432.01 | loss 3.16 | ppl 23.679 +| epoch 11 step 115200 | 500 batches | lr 0.000382 | ms/batch 432.69 | loss 3.20 | ppl 24.598 +| epoch 11 step 115400 | 700 batches | lr 0.00038 | ms/batch 433.40 | loss 3.15 | ppl 23.424 +| epoch 11 step 115600 | 900 batches | lr 0.000379 | ms/batch 431.01 | loss 3.17 | ppl 23.860 +| epoch 11 step 115800 | 1100 batches | lr 0.000377 | ms/batch 431.82 | loss 3.19 | ppl 24.356 +| epoch 11 step 116000 | 1300 batches | lr 0.000376 | ms/batch 431.01 | loss 3.17 | ppl 23.859 +---------------------------------------------------------------------------------------------------- +| Eval 29 at step 116000 | time: 1734.75s | valid loss 3.24 | valid ppl 25.504 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 116200 | 1500 batches | lr 0.000374 | ms/batch 484.53 | loss 3.17 | ppl 23.735 +| epoch 11 step 116400 | 1700 batches | lr 0.000373 | ms/batch 431.49 | loss 3.16 | ppl 23.553 +| epoch 11 step 116600 | 1900 batches | lr 0.000371 | ms/batch 431.62 | loss 3.19 | ppl 24.285 +| epoch 11 step 116800 | 2100 batches | lr 0.00037 | ms/batch 431.29 | loss 3.21 | ppl 24.801 +| epoch 11 step 117000 | 2300 batches | lr 0.000368 | ms/batch 431.24 | loss 3.19 | ppl 24.343 +| epoch 11 step 117200 | 2500 batches | lr 0.000367 | ms/batch 431.80 | loss 3.17 | ppl 23.817 +| epoch 11 step 117400 | 2700 batches | lr 0.000365 | ms/batch 431.05 | loss 3.18 | ppl 23.943 +| epoch 11 step 117600 | 2900 batches | lr 0.000364 | ms/batch 431.78 | loss 3.14 | ppl 23.072 +| epoch 11 step 117800 | 3100 batches | lr 0.000362 | ms/batch 433.44 | loss 3.18 | ppl 23.941 +| epoch 11 step 118000 | 3300 batches | lr 0.000361 | ms/batch 431.83 | loss 3.19 | ppl 24.346 +| epoch 11 step 118200 | 3500 batches | lr 0.000359 | ms/batch 430.98 | loss 3.15 | ppl 23.383 +| epoch 11 step 118400 | 3700 batches | lr 0.000358 | ms/batch 431.54 | loss 3.17 | ppl 23.837 +| epoch 11 step 118600 | 3900 batches | lr 0.000356 | ms/batch 430.95 | loss 3.16 | ppl 23.611 +| epoch 11 step 118800 | 4100 batches | lr 0.000355 | ms/batch 432.44 | loss 3.18 | ppl 24.134 +| epoch 11 step 119000 | 4300 batches | lr 0.000353 | ms/batch 431.52 | loss 3.17 | ppl 23.747 +| epoch 11 step 119200 | 4500 batches | lr 0.000352 | ms/batch 432.70 | loss 3.19 | ppl 24.290 +| epoch 11 step 119400 | 4700 batches | lr 0.00035 | ms/batch 432.66 | loss 3.15 | ppl 23.296 +| epoch 11 step 119600 | 4900 batches | lr 0.000349 | ms/batch 432.65 | loss 3.16 | ppl 23.587 +| epoch 11 step 119800 | 5100 batches | lr 0.000347 | ms/batch 432.23 | loss 3.17 | ppl 23.761 +| epoch 11 step 120000 | 5300 batches | lr 0.000346 | ms/batch 432.28 | loss 3.15 | ppl 23.380 +---------------------------------------------------------------------------------------------------- +| Eval 30 at step 120000 | time: 1733.79s | valid loss 3.23 | valid ppl 25.207 +---------------------------------------------------------------------------------------------------- +| 
epoch 11 step 120200 | 5500 batches | lr 0.000344 | ms/batch 485.19 | loss 3.15 | ppl 23.385 +| epoch 11 step 120400 | 5700 batches | lr 0.000343 | ms/batch 431.60 | loss 3.16 | ppl 23.630 +| epoch 11 step 120600 | 5900 batches | lr 0.000341 | ms/batch 432.39 | loss 3.17 | ppl 23.706 +| epoch 11 step 120800 | 6100 batches | lr 0.00034 | ms/batch 431.23 | loss 3.16 | ppl 23.594 +| epoch 11 step 121000 | 6300 batches | lr 0.000338 | ms/batch 432.67 | loss 3.17 | ppl 23.740 +| epoch 11 step 121200 | 6500 batches | lr 0.000337 | ms/batch 431.72 | loss 3.13 | ppl 22.899 +| epoch 11 step 121400 | 6700 batches | lr 0.000335 | ms/batch 432.59 | loss 3.13 | ppl 22.826 +| epoch 11 step 121600 | 6900 batches | lr 0.000334 | ms/batch 431.15 | loss 3.15 | ppl 23.332 +| epoch 11 step 121800 | 7100 batches | lr 0.000332 | ms/batch 430.77 | loss 3.15 | ppl 23.221 +| epoch 11 step 122000 | 7300 batches | lr 0.000331 | ms/batch 429.79 | loss 3.10 | ppl 22.234 +| epoch 11 step 122200 | 7500 batches | lr 0.000329 | ms/batch 432.21 | loss 3.15 | ppl 23.235 +| epoch 11 step 122400 | 7700 batches | lr 0.000328 | ms/batch 432.24 | loss 3.13 | ppl 22.791 +| epoch 11 step 122600 | 7900 batches | lr 0.000326 | ms/batch 433.78 | loss 3.13 | ppl 22.859 +| epoch 11 step 122800 | 8100 batches | lr 0.000325 | ms/batch 433.88 | loss 3.15 | ppl 23.242 +| epoch 11 step 123000 | 8300 batches | lr 0.000323 | ms/batch 433.02 | loss 3.13 | ppl 22.926 +| epoch 11 step 123200 | 8500 batches | lr 0.000322 | ms/batch 431.07 | loss 3.13 | ppl 22.963 +| epoch 11 step 123400 | 8700 batches | lr 0.00032 | ms/batch 432.33 | loss 3.15 | ppl 23.392 +| epoch 11 step 123600 | 8900 batches | lr 0.000319 | ms/batch 429.32 | loss 3.15 | ppl 23.243 +| epoch 11 step 123800 | 9100 batches | lr 0.000317 | ms/batch 432.13 | loss 3.15 | ppl 23.279 +| epoch 11 step 124000 | 9300 batches | lr 0.000316 | ms/batch 431.79 | loss 3.13 | ppl 22.908 +---------------------------------------------------------------------------------------------------- +| Eval 31 at step 124000 | time: 1733.89s | valid loss 3.21 | valid ppl 24.812 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 124200 | 9500 batches | lr 0.000315 | ms/batch 485.31 | loss 3.15 | ppl 23.395 +| epoch 11 step 124400 | 9700 batches | lr 0.000313 | ms/batch 431.01 | loss 3.14 | ppl 23.217 +| epoch 11 step 124600 | 9900 batches | lr 0.000312 | ms/batch 430.95 | loss 3.13 | ppl 22.847 +| epoch 11 step 124800 | 10100 batches | lr 0.00031 | ms/batch 430.50 | loss 3.14 | ppl 23.214 +| epoch 11 step 125000 | 10300 batches | lr 0.000309 | ms/batch 431.25 | loss 3.13 | ppl 22.910 +| epoch 11 step 125200 | 10500 batches | lr 0.000307 | ms/batch 432.16 | loss 3.17 | ppl 23.719 +| epoch 11 step 125400 | 10700 batches | lr 0.000306 | ms/batch 430.75 | loss 3.13 | ppl 22.860 +| epoch 11 step 125600 | 10900 batches | lr 0.000304 | ms/batch 431.47 | loss 3.12 | ppl 22.570 +| epoch 11 step 125800 | 11100 batches | lr 0.000303 | ms/batch 430.65 | loss 3.17 | ppl 23.879 +| epoch 11 step 126000 | 11300 batches | lr 0.000301 | ms/batch 431.81 | loss 3.15 | ppl 23.372 +| epoch 12 step 126200 | 30 batches | lr 0.0003 | ms/batch 429.97 | loss 3.15 | ppl 23.380 +| epoch 12 step 126400 | 230 batches | lr 0.000299 | ms/batch 431.33 | loss 3.11 | ppl 22.355 +| epoch 12 step 126600 | 430 batches | lr 0.000297 | ms/batch 430.87 | loss 3.14 | ppl 23.169 +| epoch 12 step 126800 | 630 batches | lr 0.000296 | ms/batch 432.29 | loss 3.12 | ppl 22.578 +| epoch 12 
step 127000 | 830 batches | lr 0.000294 | ms/batch 432.44 | loss 3.15 | ppl 23.438 +| epoch 12 step 127200 | 1030 batches | lr 0.000293 | ms/batch 431.80 | loss 3.12 | ppl 22.547 +| epoch 12 step 127400 | 1230 batches | lr 0.000291 | ms/batch 431.91 | loss 3.13 | ppl 22.962 +| epoch 12 step 127600 | 1430 batches | lr 0.00029 | ms/batch 432.43 | loss 3.13 | ppl 22.857 +| epoch 12 step 127800 | 1630 batches | lr 0.000289 | ms/batch 431.24 | loss 3.11 | ppl 22.423 +| epoch 12 step 128000 | 1830 batches | lr 0.000287 | ms/batch 431.67 | loss 3.14 | ppl 23.045 +---------------------------------------------------------------------------------------------------- +| Eval 32 at step 128000 | time: 1731.99s | valid loss 3.21 | valid ppl 24.767 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 128200 | 2030 batches | lr 0.000286 | ms/batch 484.47 | loss 3.17 | ppl 23.741 +| epoch 12 step 128400 | 2230 batches | lr 0.000284 | ms/batch 431.11 | loss 3.14 | ppl 23.123 +| epoch 12 step 128600 | 2430 batches | lr 0.000283 | ms/batch 432.77 | loss 3.14 | ppl 23.177 +| epoch 12 step 128800 | 2630 batches | lr 0.000282 | ms/batch 432.06 | loss 3.13 | ppl 22.892 +| epoch 12 step 129000 | 2830 batches | lr 0.00028 | ms/batch 431.54 | loss 3.10 | ppl 22.155 +| epoch 12 step 129200 | 3030 batches | lr 0.000279 | ms/batch 432.06 | loss 3.13 | ppl 22.914 +| epoch 12 step 129400 | 3230 batches | lr 0.000277 | ms/batch 431.25 | loss 3.13 | ppl 22.780 +| epoch 12 step 129600 | 3430 batches | lr 0.000276 | ms/batch 430.82 | loss 3.12 | ppl 22.660 +| epoch 12 step 129800 | 3630 batches | lr 0.000274 | ms/batch 432.19 | loss 3.11 | ppl 22.377 +| epoch 12 step 130000 | 3830 batches | lr 0.000273 | ms/batch 431.91 | loss 3.12 | ppl 22.730 +| epoch 12 step 130200 | 4030 batches | lr 0.000272 | ms/batch 431.49 | loss 3.14 | ppl 23.125 +| epoch 12 step 130400 | 4230 batches | lr 0.00027 | ms/batch 432.13 | loss 3.12 | ppl 22.750 +| epoch 12 step 130600 | 4430 batches | lr 0.000269 | ms/batch 431.86 | loss 3.12 | ppl 22.713 +| epoch 12 step 130800 | 4630 batches | lr 0.000267 | ms/batch 431.34 | loss 3.12 | ppl 22.744 +| epoch 12 step 131000 | 4830 batches | lr 0.000266 | ms/batch 430.75 | loss 3.11 | ppl 22.398 +| epoch 12 step 131200 | 5030 batches | lr 0.000265 | ms/batch 431.12 | loss 3.13 | ppl 22.885 +| epoch 12 step 131400 | 5230 batches | lr 0.000263 | ms/batch 430.46 | loss 3.12 | ppl 22.669 +| epoch 12 step 131600 | 5430 batches | lr 0.000262 | ms/batch 431.34 | loss 3.09 | ppl 21.950 +| epoch 12 step 131800 | 5630 batches | lr 0.000261 | ms/batch 431.72 | loss 3.13 | ppl 22.806 +| epoch 12 step 132000 | 5830 batches | lr 0.000259 | ms/batch 430.10 | loss 3.12 | ppl 22.723 +---------------------------------------------------------------------------------------------------- +| Eval 33 at step 132000 | time: 1732.22s | valid loss 3.20 | valid ppl 24.478 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 132200 | 6030 batches | lr 0.000258 | ms/batch 483.85 | loss 3.10 | ppl 22.208 +| epoch 12 step 132400 | 6230 batches | lr 0.000256 | ms/batch 431.01 | loss 3.11 | ppl 22.454 +| epoch 12 step 132600 | 6430 batches | lr 0.000255 | ms/batch 431.62 | loss 3.13 | ppl 22.788 +| epoch 12 step 132800 | 6630 batches | lr 0.000254 | ms/batch 430.91 | loss 3.07 | ppl 21.552 +| epoch 12 step 133000 | 6830 batches | lr 0.000252 | ms/batch 431.29 | loss 3.10 | ppl 22.161 +| epoch 12 step 133200 | 
7030 batches | lr 0.000251 | ms/batch 432.30 | loss 3.11 | ppl 22.333 +| epoch 12 step 133400 | 7230 batches | lr 0.00025 | ms/batch 430.20 | loss 3.07 | ppl 21.561 +| epoch 12 step 133600 | 7430 batches | lr 0.000248 | ms/batch 430.76 | loss 3.08 | ppl 21.775 +| epoch 12 step 133800 | 7630 batches | lr 0.000247 | ms/batch 431.00 | loss 3.08 | ppl 21.656 +| epoch 12 step 134000 | 7830 batches | lr 0.000246 | ms/batch 431.51 | loss 3.10 | ppl 22.131 +| epoch 12 step 134200 | 8030 batches | lr 0.000244 | ms/batch 430.65 | loss 3.10 | ppl 22.148 +| epoch 12 step 134400 | 8230 batches | lr 0.000243 | ms/batch 431.44 | loss 3.09 | ppl 21.895 +| epoch 12 step 134600 | 8430 batches | lr 0.000241 | ms/batch 431.15 | loss 3.10 | ppl 22.214 +| epoch 12 step 134800 | 8630 batches | lr 0.00024 | ms/batch 431.28 | loss 3.09 | ppl 21.994 +| epoch 12 step 135000 | 8830 batches | lr 0.000239 | ms/batch 430.56 | loss 3.11 | ppl 22.496 +| epoch 12 step 135200 | 9030 batches | lr 0.000237 | ms/batch 431.01 | loss 3.11 | ppl 22.324 +| epoch 12 step 135400 | 9230 batches | lr 0.000236 | ms/batch 430.67 | loss 3.07 | ppl 21.638 +| epoch 12 step 135600 | 9430 batches | lr 0.000235 | ms/batch 431.20 | loss 3.10 | ppl 22.290 +| epoch 12 step 135800 | 9630 batches | lr 0.000233 | ms/batch 431.59 | loss 3.12 | ppl 22.606 +| epoch 12 step 136000 | 9830 batches | lr 0.000232 | ms/batch 431.20 | loss 3.08 | ppl 21.688 +---------------------------------------------------------------------------------------------------- +| Eval 34 at step 136000 | time: 1730.84s | valid loss 3.19 | valid ppl 24.239 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 136200 | 10030 batches | lr 0.000231 | ms/batch 483.47 | loss 3.10 | ppl 22.265 +| epoch 12 step 136400 | 10230 batches | lr 0.000229 | ms/batch 431.69 | loss 3.09 | ppl 21.896 +| epoch 12 step 136600 | 10430 batches | lr 0.000228 | ms/batch 431.61 | loss 3.09 | ppl 22.074 +| epoch 12 step 136800 | 10630 batches | lr 0.000227 | ms/batch 431.64 | loss 3.12 | ppl 22.752 +| epoch 12 step 137000 | 10830 batches | lr 0.000226 | ms/batch 431.16 | loss 3.06 | ppl 21.360 +| epoch 12 step 137200 | 11030 batches | lr 0.000224 | ms/batch 430.85 | loss 3.12 | ppl 22.677 +| epoch 12 step 137400 | 11230 batches | lr 0.000223 | ms/batch 431.55 | loss 3.12 | ppl 22.545 +| epoch 12 step 137600 | 11430 batches | lr 0.000222 | ms/batch 430.96 | loss 3.10 | ppl 22.250 +| epoch 13 step 137800 | 160 batches | lr 0.00022 | ms/batch 430.15 | loss 3.09 | ppl 21.936 +| epoch 13 step 138000 | 360 batches | lr 0.000219 | ms/batch 431.25 | loss 3.08 | ppl 21.697 +| epoch 13 step 138200 | 560 batches | lr 0.000218 | ms/batch 430.49 | loss 3.09 | ppl 22.047 +| epoch 13 step 138400 | 760 batches | lr 0.000216 | ms/batch 431.16 | loss 3.09 | ppl 21.894 +| epoch 13 step 138600 | 960 batches | lr 0.000215 | ms/batch 430.96 | loss 3.07 | ppl 21.542 +| epoch 13 step 138800 | 1160 batches | lr 0.000214 | ms/batch 430.70 | loss 3.10 | ppl 22.305 +| epoch 13 step 139000 | 1360 batches | lr 0.000213 | ms/batch 432.79 | loss 3.08 | ppl 21.774 +| epoch 13 step 139200 | 1560 batches | lr 0.000211 | ms/batch 431.02 | loss 3.08 | ppl 21.693 +| epoch 13 step 139400 | 1760 batches | lr 0.00021 | ms/batch 433.07 | loss 3.08 | ppl 21.695 +| epoch 13 step 139600 | 1960 batches | lr 0.000209 | ms/batch 431.58 | loss 3.11 | ppl 22.326 +| epoch 13 step 139800 | 2160 batches | lr 0.000207 | ms/batch 430.88 | loss 3.11 | ppl 22.432 +| epoch 13 step 140000 | 2360 
batches | lr 0.000206 | ms/batch 430.34 | loss 3.09 | ppl 21.997 +---------------------------------------------------------------------------------------------------- +| Eval 35 at step 140000 | time: 1731.19s | valid loss 3.18 | valid ppl 23.962 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 140200 | 2560 batches | lr 0.000205 | ms/batch 484.26 | loss 3.09 | ppl 22.042 +| epoch 13 step 140400 | 2760 batches | lr 0.000204 | ms/batch 430.93 | loss 3.07 | ppl 21.495 +| epoch 13 step 140600 | 2960 batches | lr 0.000202 | ms/batch 431.04 | loss 3.07 | ppl 21.645 +| epoch 13 step 140800 | 3160 batches | lr 0.000201 | ms/batch 430.73 | loss 3.09 | ppl 21.999 +| epoch 13 step 141000 | 3360 batches | lr 0.0002 | ms/batch 431.31 | loss 3.09 | ppl 21.953 +| epoch 13 step 141200 | 3560 batches | lr 0.000199 | ms/batch 431.24 | loss 3.07 | ppl 21.515 +| epoch 13 step 141400 | 3760 batches | lr 0.000197 | ms/batch 431.92 | loss 3.08 | ppl 21.696 +| epoch 13 step 141600 | 3960 batches | lr 0.000196 | ms/batch 430.43 | loss 3.08 | ppl 21.807 +| epoch 13 step 141800 | 4160 batches | lr 0.000195 | ms/batch 431.24 | loss 3.08 | ppl 21.863 +| epoch 13 step 142000 | 4360 batches | lr 0.000194 | ms/batch 432.55 | loss 3.08 | ppl 21.818 +| epoch 13 step 142200 | 4560 batches | lr 0.000192 | ms/batch 431.39 | loss 3.10 | ppl 22.231 +| epoch 13 step 142400 | 4760 batches | lr 0.000191 | ms/batch 430.91 | loss 3.05 | ppl 21.181 +| epoch 13 step 142600 | 4960 batches | lr 0.00019 | ms/batch 430.37 | loss 3.09 | ppl 21.940 +| epoch 13 step 142800 | 5160 batches | lr 0.000189 | ms/batch 431.21 | loss 3.07 | ppl 21.603 +| epoch 13 step 143000 | 5360 batches | lr 0.000187 | ms/batch 430.65 | loss 3.06 | ppl 21.268 +| epoch 13 step 143200 | 5560 batches | lr 0.000186 | ms/batch 430.50 | loss 3.06 | ppl 21.369 +| epoch 13 step 143400 | 5760 batches | lr 0.000185 | ms/batch 430.32 | loss 3.08 | ppl 21.808 +| epoch 13 step 143600 | 5960 batches | lr 0.000184 | ms/batch 430.46 | loss 3.07 | ppl 21.536 +| epoch 13 step 143800 | 6160 batches | lr 0.000183 | ms/batch 431.46 | loss 3.06 | ppl 21.313 +| epoch 13 step 144000 | 6360 batches | lr 0.000181 | ms/batch 431.41 | loss 3.11 | ppl 22.363 +---------------------------------------------------------------------------------------------------- +| Eval 36 at step 144000 | time: 1730.58s | valid loss 3.18 | valid ppl 24.033 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 144200 | 6560 batches | lr 0.00018 | ms/batch 463.01 | loss 3.02 | ppl 20.408 +| epoch 13 step 144400 | 6760 batches | lr 0.000179 | ms/batch 430.89 | loss 3.05 | ppl 21.202 +| epoch 13 step 144600 | 6960 batches | lr 0.000178 | ms/batch 431.83 | loss 3.07 | ppl 21.498 +| epoch 13 step 144800 | 7160 batches | lr 0.000177 | ms/batch 431.57 | loss 3.02 | ppl 20.567 +| epoch 13 step 145000 | 7360 batches | lr 0.000175 | ms/batch 431.30 | loss 3.05 | ppl 21.061 +| epoch 13 step 145200 | 7560 batches | lr 0.000174 | ms/batch 431.94 | loss 3.03 | ppl 20.732 +| epoch 13 step 145400 | 7760 batches | lr 0.000173 | ms/batch 430.52 | loss 3.06 | ppl 21.330 +| epoch 13 step 145600 | 7960 batches | lr 0.000172 | ms/batch 432.25 | loss 3.04 | ppl 20.941 +| epoch 13 step 145800 | 8160 batches | lr 0.000171 | ms/batch 428.44 | loss 3.04 | ppl 20.953 +| epoch 13 step 146000 | 8360 batches | lr 0.000169 | ms/batch 428.75 | loss 3.07 | ppl 21.486 +| epoch 13 step 146200 | 8560 batches | lr 
0.000168 | ms/batch 428.29 | loss 3.05 | ppl 21.119 +| epoch 13 step 146400 | 8760 batches | lr 0.000167 | ms/batch 429.25 | loss 3.06 | ppl 21.234 +| epoch 13 step 146600 | 8960 batches | lr 0.000166 | ms/batch 428.49 | loss 3.07 | ppl 21.543 +| epoch 13 step 146800 | 9160 batches | lr 0.000165 | ms/batch 431.81 | loss 3.04 | ppl 20.923 +| epoch 13 step 147000 | 9360 batches | lr 0.000164 | ms/batch 428.07 | loss 3.05 | ppl 21.187 +| epoch 13 step 147200 | 9560 batches | lr 0.000162 | ms/batch 428.50 | loss 3.08 | ppl 21.742 +| epoch 13 step 147400 | 9760 batches | lr 0.000161 | ms/batch 428.93 | loss 3.05 | ppl 21.118 +| epoch 13 step 147600 | 9960 batches | lr 0.00016 | ms/batch 429.07 | loss 3.05 | ppl 21.214 +| epoch 13 step 147800 | 10160 batches | lr 0.000159 | ms/batch 428.38 | loss 3.03 | ppl 20.674 +| epoch 13 step 148000 | 10360 batches | lr 0.000158 | ms/batch 429.30 | loss 3.06 | ppl 21.383 +---------------------------------------------------------------------------------------------------- +| Eval 37 at step 148000 | time: 1726.13s | valid loss 3.17 | valid ppl 23.691 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 148200 | 10560 batches | lr 0.000157 | ms/batch 481.88 | loss 3.08 | ppl 21.750 +| epoch 13 step 148400 | 10760 batches | lr 0.000155 | ms/batch 429.14 | loss 3.04 | ppl 20.808 +| epoch 13 step 148600 | 10960 batches | lr 0.000154 | ms/batch 428.38 | loss 3.04 | ppl 20.987 +| epoch 13 step 148800 | 11160 batches | lr 0.000153 | ms/batch 428.50 | loss 3.09 | ppl 22.015 +| epoch 13 step 149000 | 11360 batches | lr 0.000152 | ms/batch 429.49 | loss 3.06 | ppl 21.327 +| epoch 14 step 149200 | 90 batches | lr 0.000151 | ms/batch 428.11 | loss 3.06 | ppl 21.261 +| epoch 14 step 149400 | 290 batches | lr 0.00015 | ms/batch 429.16 | loss 3.03 | ppl 20.713 +| epoch 14 step 149600 | 490 batches | lr 0.000149 | ms/batch 428.77 | loss 3.07 | ppl 21.532 +| epoch 14 step 149800 | 690 batches | lr 0.000148 | ms/batch 429.07 | loss 3.02 | ppl 20.589 +| epoch 14 step 150000 | 890 batches | lr 0.000146 | ms/batch 428.29 | loss 3.05 | ppl 21.031 +| epoch 14 step 150200 | 1090 batches | lr 0.000145 | ms/batch 428.38 | loss 3.06 | ppl 21.266 +| epoch 14 step 150400 | 1290 batches | lr 0.000144 | ms/batch 429.10 | loss 3.04 | ppl 20.860 +| epoch 14 step 150600 | 1490 batches | lr 0.000143 | ms/batch 428.88 | loss 3.04 | ppl 20.851 +| epoch 14 step 150800 | 1690 batches | lr 0.000142 | ms/batch 428.45 | loss 3.04 | ppl 20.828 +| epoch 14 step 151000 | 1890 batches | lr 0.000141 | ms/batch 428.61 | loss 3.05 | ppl 21.108 +| epoch 14 step 151200 | 2090 batches | lr 0.00014 | ms/batch 429.88 | loss 3.09 | ppl 21.960 +| epoch 14 step 151400 | 2290 batches | lr 0.000139 | ms/batch 428.60 | loss 3.06 | ppl 21.348 +| epoch 14 step 151600 | 2490 batches | lr 0.000138 | ms/batch 427.77 | loss 3.04 | ppl 20.892 +| epoch 14 step 151800 | 2690 batches | lr 0.000137 | ms/batch 429.55 | loss 3.05 | ppl 21.183 +| epoch 14 step 152000 | 2890 batches | lr 0.000136 | ms/batch 428.22 | loss 3.00 | ppl 20.146 +---------------------------------------------------------------------------------------------------- +| Eval 38 at step 152000 | time: 1721.33s | valid loss 3.16 | valid ppl 23.586 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 152200 | 3090 batches | lr 0.000134 | ms/batch 483.70 | loss 3.05 | ppl 21.117 +| epoch 14 step 152400 | 3290 batches | lr 0.000133 | 
ms/batch 428.34 | loss 3.06 | ppl 21.403 +| epoch 14 step 152600 | 3490 batches | lr 0.000132 | ms/batch 429.22 | loss 3.03 | ppl 20.632 +| epoch 14 step 152800 | 3690 batches | lr 0.000131 | ms/batch 428.12 | loss 3.04 | ppl 20.924 +| epoch 14 step 153000 | 3890 batches | lr 0.00013 | ms/batch 432.35 | loss 3.03 | ppl 20.735 +| epoch 14 step 153200 | 4090 batches | lr 0.000129 | ms/batch 428.36 | loss 3.06 | ppl 21.290 +| epoch 14 step 153400 | 4290 batches | lr 0.000128 | ms/batch 435.89 | loss 3.04 | ppl 20.850 +| epoch 14 step 153600 | 4490 batches | lr 0.000127 | ms/batch 434.49 | loss 3.06 | ppl 21.298 +| epoch 14 step 153800 | 4690 batches | lr 0.000126 | ms/batch 428.56 | loss 3.02 | ppl 20.588 +| epoch 14 step 154000 | 4890 batches | lr 0.000125 | ms/batch 428.64 | loss 3.03 | ppl 20.689 +| epoch 14 step 154200 | 5090 batches | lr 0.000124 | ms/batch 428.26 | loss 3.04 | ppl 20.997 +| epoch 14 step 154400 | 5290 batches | lr 0.000123 | ms/batch 428.63 | loss 3.03 | ppl 20.656 +| epoch 14 step 154600 | 5490 batches | lr 0.000122 | ms/batch 430.44 | loss 3.02 | ppl 20.492 +| epoch 14 step 154800 | 5690 batches | lr 0.000121 | ms/batch 429.37 | loss 3.04 | ppl 20.889 +| epoch 14 step 155000 | 5890 batches | lr 0.00012 | ms/batch 428.16 | loss 3.04 | ppl 20.854 +| epoch 14 step 155200 | 6090 batches | lr 0.000119 | ms/batch 428.56 | loss 3.04 | ppl 20.856 +| epoch 14 step 155400 | 6290 batches | lr 0.000118 | ms/batch 428.39 | loss 3.04 | ppl 20.911 +| epoch 14 step 155600 | 6490 batches | lr 0.000117 | ms/batch 428.91 | loss 3.01 | ppl 20.322 +| epoch 14 step 155800 | 6690 batches | lr 0.000116 | ms/batch 427.78 | loss 3.00 | ppl 20.057 +| epoch 14 step 156000 | 6890 batches | lr 0.000115 | ms/batch 428.59 | loss 3.03 | ppl 20.600 +---------------------------------------------------------------------------------------------------- +| Eval 39 at step 156000 | time: 1724.70s | valid loss 3.15 | valid ppl 23.443 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 156200 | 7090 batches | lr 0.000114 | ms/batch 483.92 | loss 3.02 | ppl 20.526 +| epoch 14 step 156400 | 7290 batches | lr 0.000113 | ms/batch 428.29 | loss 2.97 | ppl 19.558 +| epoch 14 step 156600 | 7490 batches | lr 0.000112 | ms/batch 428.20 | loss 3.02 | ppl 20.494 +| epoch 14 step 156800 | 7690 batches | lr 0.000111 | ms/batch 428.23 | loss 3.00 | ppl 20.151 +| epoch 14 step 157000 | 7890 batches | lr 0.00011 | ms/batch 431.45 | loss 3.00 | ppl 20.111 +| epoch 14 step 157200 | 8090 batches | lr 0.000109 | ms/batch 431.07 | loss 3.02 | ppl 20.545 +| epoch 14 step 157400 | 8290 batches | lr 0.000108 | ms/batch 429.87 | loss 3.01 | ppl 20.280 +| epoch 14 step 157600 | 8490 batches | lr 0.000107 | ms/batch 429.34 | loss 3.01 | ppl 20.317 +| epoch 14 step 157800 | 8690 batches | lr 0.000106 | ms/batch 429.35 | loss 3.03 | ppl 20.696 +| epoch 14 step 158000 | 8890 batches | lr 0.000105 | ms/batch 430.34 | loss 3.02 | ppl 20.527 +| epoch 14 step 158200 | 9090 batches | lr 0.000104 | ms/batch 429.23 | loss 3.02 | ppl 20.538 +| epoch 14 step 158400 | 9290 batches | lr 0.000103 | ms/batch 429.86 | loss 3.01 | ppl 20.345 +| epoch 14 step 158600 | 9490 batches | lr 0.000102 | ms/batch 430.44 | loss 3.02 | ppl 20.569 +| epoch 14 step 158800 | 9690 batches | lr 0.000101 | ms/batch 429.23 | loss 3.02 | ppl 20.562 +| epoch 14 step 159000 | 9890 batches | lr 0.0001 | ms/batch 429.96 | loss 3.00 | ppl 20.119 +| epoch 14 step 159200 | 10090 batches | lr 9.92e-05 | ms/batch 
431.43 | loss 3.03 | ppl 20.658 +| epoch 14 step 159400 | 10290 batches | lr 9.83e-05 | ms/batch 431.56 | loss 3.00 | ppl 20.177 +| epoch 14 step 159600 | 10490 batches | lr 9.74e-05 | ms/batch 429.18 | loss 3.04 | ppl 21.009 +| epoch 14 step 159800 | 10690 batches | lr 9.64e-05 | ms/batch 429.35 | loss 3.01 | ppl 20.323 +| epoch 14 step 160000 | 10890 batches | lr 9.55e-05 | ms/batch 429.02 | loss 3.00 | ppl 19.986 +---------------------------------------------------------------------------------------------------- +| Eval 40 at step 160000 | time: 1725.57s | valid loss 3.15 | valid ppl 23.322 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 160200 | 11090 batches | lr 9.46e-05 | ms/batch 481.68 | loss 3.04 | ppl 21.005 +| epoch 14 step 160400 | 11290 batches | lr 9.37e-05 | ms/batch 428.54 | loss 3.04 | ppl 20.853 +| epoch 15 step 160600 | 20 batches | lr 9.28e-05 | ms/batch 429.04 | loss 3.03 | ppl 20.670 +| epoch 15 step 160800 | 220 batches | lr 9.19e-05 | ms/batch 428.96 | loss 2.99 | ppl 19.888 +| epoch 15 step 161000 | 420 batches | lr 9.09e-05 | ms/batch 428.59 | loss 3.02 | ppl 20.582 +| epoch 15 step 161200 | 620 batches | lr 9e-05 | ms/batch 429.51 | loss 2.99 | ppl 19.964 +| epoch 15 step 161400 | 820 batches | lr 8.91e-05 | ms/batch 429.16 | loss 3.03 | ppl 20.734 +| epoch 15 step 161600 | 1020 batches | lr 8.83e-05 | ms/batch 428.53 | loss 2.99 | ppl 19.982 +| epoch 15 step 161800 | 1220 batches | lr 8.74e-05 | ms/batch 428.46 | loss 3.02 | ppl 20.448 +| epoch 15 step 162000 | 1420 batches | lr 8.65e-05 | ms/batch 428.75 | loss 3.01 | ppl 20.289 +| epoch 15 step 162200 | 1620 batches | lr 8.56e-05 | ms/batch 428.80 | loss 2.99 | ppl 19.828 +| epoch 15 step 162400 | 1820 batches | lr 8.47e-05 | ms/batch 430.89 | loss 3.02 | ppl 20.551 +| epoch 15 step 162600 | 2020 batches | lr 8.38e-05 | ms/batch 431.71 | loss 3.05 | ppl 21.076 +| epoch 15 step 162800 | 2220 batches | lr 8.3e-05 | ms/batch 429.82 | loss 3.02 | ppl 20.554 +| epoch 15 step 163000 | 2420 batches | lr 8.21e-05 | ms/batch 428.24 | loss 3.02 | ppl 20.554 +| epoch 15 step 163200 | 2620 batches | lr 8.13e-05 | ms/batch 428.88 | loss 3.01 | ppl 20.309 +| epoch 15 step 163400 | 2820 batches | lr 8.04e-05 | ms/batch 429.25 | loss 2.99 | ppl 19.802 +| epoch 15 step 163600 | 3020 batches | lr 7.95e-05 | ms/batch 430.14 | loss 3.01 | ppl 20.356 +| epoch 15 step 163800 | 3220 batches | lr 7.87e-05 | ms/batch 428.14 | loss 3.01 | ppl 20.250 +| epoch 15 step 164000 | 3420 batches | lr 7.79e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.314 +---------------------------------------------------------------------------------------------------- +| Eval 41 at step 164000 | time: 1722.82s | valid loss 3.15 | valid ppl 23.228 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 164200 | 3620 batches | lr 7.7e-05 | ms/batch 481.45 | loss 2.99 | ppl 19.844 +| epoch 15 step 164400 | 3820 batches | lr 7.62e-05 | ms/batch 429.58 | loss 3.01 | ppl 20.294 +| epoch 15 step 164600 | 4020 batches | lr 7.53e-05 | ms/batch 428.34 | loss 3.03 | ppl 20.605 +| epoch 15 step 164800 | 4220 batches | lr 7.45e-05 | ms/batch 432.92 | loss 3.01 | ppl 20.216 +| epoch 15 step 165000 | 4420 batches | lr 7.37e-05 | ms/batch 429.87 | loss 3.01 | ppl 20.269 +| epoch 15 step 165200 | 4620 batches | lr 7.29e-05 | ms/batch 429.01 | loss 3.01 | ppl 20.313 +| epoch 15 step 165400 | 4820 batches | lr 7.21e-05 | ms/batch 428.76 | loss 
3.00 | ppl 19.990 +| epoch 15 step 165600 | 5020 batches | lr 7.13e-05 | ms/batch 428.79 | loss 3.02 | ppl 20.541 +| epoch 15 step 165800 | 5220 batches | lr 7.04e-05 | ms/batch 428.63 | loss 3.00 | ppl 20.101 +| epoch 15 step 166000 | 5420 batches | lr 6.96e-05 | ms/batch 428.36 | loss 2.98 | ppl 19.608 +| epoch 15 step 166200 | 5620 batches | lr 6.88e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.309 +| epoch 15 step 166400 | 5820 batches | lr 6.81e-05 | ms/batch 431.45 | loss 3.01 | ppl 20.265 +| epoch 15 step 166600 | 6020 batches | lr 6.73e-05 | ms/batch 428.47 | loss 2.99 | ppl 19.874 +| epoch 15 step 166800 | 6220 batches | lr 6.65e-05 | ms/batch 428.45 | loss 3.00 | ppl 20.062 +| epoch 15 step 167000 | 6420 batches | lr 6.57e-05 | ms/batch 428.92 | loss 3.01 | ppl 20.380 +| epoch 15 step 167200 | 6620 batches | lr 6.49e-05 | ms/batch 428.16 | loss 2.96 | ppl 19.293 +| epoch 15 step 167400 | 6820 batches | lr 6.42e-05 | ms/batch 430.00 | loss 2.99 | ppl 19.858 +| epoch 15 step 167600 | 7020 batches | lr 6.34e-05 | ms/batch 431.79 | loss 3.00 | ppl 20.049 +| epoch 15 step 167800 | 7220 batches | lr 6.26e-05 | ms/batch 428.44 | loss 2.96 | ppl 19.284 +| epoch 15 step 168000 | 7420 batches | lr 6.19e-05 | ms/batch 431.93 | loss 2.97 | ppl 19.458 +---------------------------------------------------------------------------------------------------- +| Eval 42 at step 168000 | time: 1724.13s | valid loss 3.14 | valid ppl 23.110 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 168200 | 7620 batches | lr 6.11e-05 | ms/batch 481.67 | loss 2.96 | ppl 19.254 +| epoch 15 step 168400 | 7820 batches | lr 6.04e-05 | ms/batch 428.92 | loss 2.99 | ppl 19.864 +| epoch 15 step 168600 | 8020 batches | lr 5.96e-05 | ms/batch 428.32 | loss 2.99 | ppl 19.852 +| epoch 15 step 168800 | 8220 batches | lr 5.89e-05 | ms/batch 428.77 | loss 2.98 | ppl 19.604 +| epoch 15 step 169000 | 8420 batches | lr 5.81e-05 | ms/batch 431.33 | loss 2.99 | ppl 19.895 +| epoch 15 step 169200 | 8620 batches | lr 5.74e-05 | ms/batch 428.35 | loss 2.98 | ppl 19.771 +| epoch 15 step 169400 | 8820 batches | lr 5.67e-05 | ms/batch 429.98 | loss 3.00 | ppl 20.183 +| epoch 15 step 169600 | 9020 batches | lr 5.59e-05 | ms/batch 428.27 | loss 3.00 | ppl 20.035 +| epoch 15 step 169800 | 9220 batches | lr 5.52e-05 | ms/batch 428.16 | loss 2.97 | ppl 19.416 +| epoch 15 step 170000 | 9420 batches | lr 5.45e-05 | ms/batch 428.17 | loss 2.99 | ppl 19.919 +| epoch 15 step 170200 | 9620 batches | lr 5.38e-05 | ms/batch 429.42 | loss 3.01 | ppl 20.260 +| epoch 15 step 170400 | 9820 batches | lr 5.31e-05 | ms/batch 428.41 | loss 2.97 | ppl 19.573 +| epoch 15 step 170600 | 10020 batches | lr 5.24e-05 | ms/batch 428.58 | loss 2.99 | ppl 19.872 +| epoch 15 step 170800 | 10220 batches | lr 5.17e-05 | ms/batch 428.30 | loss 2.98 | ppl 19.782 +| epoch 15 step 171000 | 10420 batches | lr 5.1e-05 | ms/batch 428.42 | loss 2.98 | ppl 19.778 +| epoch 15 step 171200 | 10620 batches | lr 5.03e-05 | ms/batch 428.34 | loss 3.02 | ppl 20.469 +| epoch 15 step 171400 | 10820 batches | lr 4.96e-05 | ms/batch 428.37 | loss 2.96 | ppl 19.309 +| epoch 15 step 171600 | 11020 batches | lr 4.89e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.275 +| epoch 15 step 171800 | 11220 batches | lr 4.83e-05 | ms/batch 430.51 | loss 3.01 | ppl 20.222 +| epoch 15 step 172000 | 11420 batches | lr 4.76e-05 | ms/batch 429.74 | loss 3.01 | ppl 20.201 
+---------------------------------------------------------------------------------------------------- +| Eval 43 at step 172000 | time: 1721.76s | valid loss 3.14 | valid ppl 23.035 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 172200 | 150 batches | lr 4.69e-05 | ms/batch 480.04 | loss 2.99 | ppl 19.801 +| epoch 16 step 172400 | 350 batches | lr 4.63e-05 | ms/batch 428.93 | loss 2.97 | ppl 19.473 +| epoch 16 step 172600 | 550 batches | lr 4.56e-05 | ms/batch 428.42 | loss 2.99 | ppl 19.978 +| epoch 16 step 172800 | 750 batches | lr 4.5e-05 | ms/batch 428.37 | loss 2.98 | ppl 19.650 +| epoch 16 step 173000 | 950 batches | lr 4.43e-05 | ms/batch 428.78 | loss 2.97 | ppl 19.486 +| epoch 16 step 173200 | 1150 batches | lr 4.37e-05 | ms/batch 428.45 | loss 3.00 | ppl 20.096 +| epoch 16 step 173400 | 1350 batches | lr 4.3e-05 | ms/batch 428.00 | loss 2.98 | ppl 19.677 +| epoch 16 step 173600 | 1550 batches | lr 4.24e-05 | ms/batch 428.26 | loss 2.98 | ppl 19.595 +| epoch 16 step 173800 | 1750 batches | lr 4.18e-05 | ms/batch 428.85 | loss 2.97 | ppl 19.502 +| epoch 16 step 174000 | 1950 batches | lr 4.11e-05 | ms/batch 429.02 | loss 3.00 | ppl 20.143 +| epoch 16 step 174200 | 2150 batches | lr 4.05e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.385 +| epoch 16 step 174400 | 2350 batches | lr 3.99e-05 | ms/batch 428.93 | loss 2.99 | ppl 19.878 +| epoch 16 step 174600 | 2550 batches | lr 3.93e-05 | ms/batch 428.57 | loss 2.99 | ppl 19.965 +| epoch 16 step 174800 | 2750 batches | lr 3.87e-05 | ms/batch 428.31 | loss 2.97 | ppl 19.491 +| epoch 16 step 175000 | 2950 batches | lr 3.81e-05 | ms/batch 428.82 | loss 2.97 | ppl 19.544 +| epoch 16 step 175200 | 3150 batches | lr 3.75e-05 | ms/batch 428.52 | loss 2.99 | ppl 19.909 +| epoch 16 step 175400 | 3350 batches | lr 3.69e-05 | ms/batch 431.04 | loss 2.99 | ppl 19.941 +| epoch 16 step 175600 | 3550 batches | lr 3.63e-05 | ms/batch 428.37 | loss 2.97 | ppl 19.533 +| epoch 16 step 175800 | 3750 batches | lr 3.57e-05 | ms/batch 428.73 | loss 2.98 | ppl 19.693 +| epoch 16 step 176000 | 3950 batches | lr 3.51e-05 | ms/batch 429.12 | loss 2.98 | ppl 19.722 +---------------------------------------------------------------------------------------------------- +| Eval 44 at step 176000 | time: 1720.98s | valid loss 3.13 | valid ppl 22.961 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 176200 | 4150 batches | lr 3.45e-05 | ms/batch 481.57 | loss 2.99 | ppl 19.858 +| epoch 16 step 176400 | 4350 batches | lr 3.4e-05 | ms/batch 428.92 | loss 2.99 | ppl 19.850 +| epoch 16 step 176600 | 4550 batches | lr 3.34e-05 | ms/batch 428.40 | loss 3.01 | ppl 20.276 +| epoch 16 step 176800 | 4750 batches | lr 3.28e-05 | ms/batch 432.59 | loss 2.96 | ppl 19.228 +| epoch 16 step 177000 | 4950 batches | lr 3.23e-05 | ms/batch 429.38 | loss 2.99 | ppl 19.854 +| epoch 16 step 177200 | 5150 batches | lr 3.17e-05 | ms/batch 428.90 | loss 2.98 | ppl 19.677 +| epoch 16 step 177400 | 5350 batches | lr 3.12e-05 | ms/batch 428.84 | loss 2.97 | ppl 19.407 +| epoch 16 step 177600 | 5550 batches | lr 3.06e-05 | ms/batch 429.22 | loss 2.97 | ppl 19.489 +| epoch 16 step 177800 | 5750 batches | lr 3.01e-05 | ms/batch 428.66 | loss 2.99 | ppl 19.841 +| epoch 16 step 178000 | 5950 batches | lr 2.96e-05 | ms/batch 428.51 | loss 2.97 | ppl 19.551 +| epoch 16 step 178200 | 6150 batches | lr 2.9e-05 | ms/batch 428.34 | loss 2.97 | ppl 19.513 +| epoch 16 step 
178400 | 6350 batches | lr 2.85e-05 | ms/batch 428.44 | loss 3.01 | ppl 20.244 +| epoch 16 step 178600 | 6550 batches | lr 2.8e-05 | ms/batch 428.77 | loss 2.93 | ppl 18.681 +| epoch 16 step 178800 | 6750 batches | lr 2.75e-05 | ms/batch 428.39 | loss 2.96 | ppl 19.316 +| epoch 16 step 179000 | 6950 batches | lr 2.7e-05 | ms/batch 428.69 | loss 2.97 | ppl 19.587 +| epoch 16 step 179200 | 7150 batches | lr 2.65e-05 | ms/batch 428.29 | loss 2.94 | ppl 18.849 +| epoch 16 step 179400 | 7350 batches | lr 2.6e-05 | ms/batch 428.68 | loss 2.95 | ppl 19.086 +| epoch 16 step 179600 | 7550 batches | lr 2.55e-05 | ms/batch 428.60 | loss 2.95 | ppl 19.086 +| epoch 16 step 179800 | 7750 batches | lr 2.5e-05 | ms/batch 428.68 | loss 2.96 | ppl 19.386 +| epoch 16 step 180000 | 7950 batches | lr 2.45e-05 | ms/batch 428.49 | loss 2.95 | ppl 19.104 +---------------------------------------------------------------------------------------------------- +| Eval 45 at step 180000 | time: 1721.79s | valid loss 3.13 | valid ppl 22.853 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 180200 | 8150 batches | lr 2.4e-05 | ms/batch 481.12 | loss 2.96 | ppl 19.338 +| epoch 16 step 180400 | 8350 batches | lr 2.35e-05 | ms/batch 431.71 | loss 2.97 | ppl 19.506 +| epoch 16 step 180600 | 8550 batches | lr 2.3e-05 | ms/batch 428.61 | loss 2.96 | ppl 19.224 +| epoch 16 step 180800 | 8750 batches | lr 2.26e-05 | ms/batch 428.53 | loss 2.97 | ppl 19.506 +| epoch 16 step 181000 | 8950 batches | lr 2.21e-05 | ms/batch 428.23 | loss 2.98 | ppl 19.751 +| epoch 16 step 181200 | 9150 batches | lr 2.16e-05 | ms/batch 429.02 | loss 2.95 | ppl 19.154 +| epoch 16 step 181400 | 9350 batches | lr 2.12e-05 | ms/batch 430.94 | loss 2.97 | ppl 19.462 +| epoch 16 step 181600 | 9550 batches | lr 2.07e-05 | ms/batch 432.03 | loss 3.00 | ppl 20.034 +| epoch 16 step 181800 | 9750 batches | lr 2.03e-05 | ms/batch 432.56 | loss 2.96 | ppl 19.237 +| epoch 16 step 182000 | 9950 batches | lr 1.99e-05 | ms/batch 433.30 | loss 2.97 | ppl 19.457 +| epoch 16 step 182200 | 10150 batches | lr 1.94e-05 | ms/batch 431.96 | loss 2.95 | ppl 19.045 +| epoch 16 step 182400 | 10350 batches | lr 1.9e-05 | ms/batch 432.55 | loss 2.98 | ppl 19.590 +| epoch 16 step 182600 | 10550 batches | lr 1.86e-05 | ms/batch 432.69 | loss 3.00 | ppl 20.060 +| epoch 16 step 182800 | 10750 batches | lr 1.81e-05 | ms/batch 432.46 | loss 2.94 | ppl 19.004 +| epoch 16 step 183000 | 10950 batches | lr 1.77e-05 | ms/batch 433.87 | loss 2.96 | ppl 19.317 +| epoch 16 step 183200 | 11150 batches | lr 1.73e-05 | ms/batch 430.79 | loss 3.01 | ppl 20.293 +| epoch 16 step 183400 | 11350 batches | lr 1.69e-05 | ms/batch 429.54 | loss 2.97 | ppl 19.576 +| epoch 17 step 183600 | 80 batches | lr 1.65e-05 | ms/batch 428.43 | loss 2.98 | ppl 19.634 +| epoch 17 step 183800 | 280 batches | lr 1.61e-05 | ms/batch 432.08 | loss 2.95 | ppl 19.031 +| epoch 17 step 184000 | 480 batches | lr 1.57e-05 | ms/batch 429.23 | loss 2.99 | ppl 19.851 +---------------------------------------------------------------------------------------------------- +| Eval 46 at step 184000 | time: 1729.72s | valid loss 3.13 | valid ppl 22.820 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 184200 | 680 batches | lr 1.53e-05 | ms/batch 480.81 | loss 2.94 | ppl 19.004 +| epoch 17 step 184400 | 880 batches | lr 1.49e-05 | ms/batch 428.57 | loss 2.97 | ppl 19.496 +| epoch 17 step 184600 | 1080 
batches | lr 1.46e-05 | ms/batch 428.97 | loss 2.97 | ppl 19.571 +| epoch 17 step 184800 | 1280 batches | lr 1.42e-05 | ms/batch 428.24 | loss 2.96 | ppl 19.205 +| epoch 17 step 185000 | 1480 batches | lr 1.38e-05 | ms/batch 429.06 | loss 2.96 | ppl 19.267 +| epoch 17 step 185200 | 1680 batches | lr 1.35e-05 | ms/batch 429.83 | loss 2.96 | ppl 19.297 +| epoch 17 step 185400 | 1880 batches | lr 1.31e-05 | ms/batch 430.28 | loss 2.97 | ppl 19.457 +| epoch 17 step 185600 | 2080 batches | lr 1.27e-05 | ms/batch 428.80 | loss 3.01 | ppl 20.313 +| epoch 17 step 185800 | 2280 batches | lr 1.24e-05 | ms/batch 428.95 | loss 2.99 | ppl 19.825 +| epoch 17 step 186000 | 2480 batches | lr 1.2e-05 | ms/batch 432.86 | loss 2.96 | ppl 19.376 +| epoch 17 step 186200 | 2680 batches | lr 1.17e-05 | ms/batch 429.42 | loss 2.98 | ppl 19.685 +| epoch 17 step 186400 | 2880 batches | lr 1.14e-05 | ms/batch 428.91 | loss 2.93 | ppl 18.645 +| epoch 17 step 186600 | 3080 batches | lr 1.1e-05 | ms/batch 429.49 | loss 2.97 | ppl 19.566 +| epoch 17 step 186800 | 3280 batches | lr 1.07e-05 | ms/batch 431.47 | loss 2.99 | ppl 19.831 +| epoch 17 step 187000 | 3480 batches | lr 1.04e-05 | ms/batch 430.23 | loss 2.95 | ppl 19.146 +| epoch 17 step 187200 | 3680 batches | lr 1.01e-05 | ms/batch 429.15 | loss 2.97 | ppl 19.491 +| epoch 17 step 187400 | 3880 batches | lr 9.76e-06 | ms/batch 431.85 | loss 2.96 | ppl 19.216 +| epoch 17 step 187600 | 4080 batches | lr 9.46e-06 | ms/batch 429.38 | loss 2.98 | ppl 19.778 +| epoch 17 step 187800 | 4280 batches | lr 9.16e-06 | ms/batch 429.06 | loss 2.96 | ppl 19.381 +| epoch 17 step 188000 | 4480 batches | lr 8.86e-06 | ms/batch 432.13 | loss 2.99 | ppl 19.797 +---------------------------------------------------------------------------------------------------- +| Eval 47 at step 188000 | time: 1725.40s | valid loss 3.13 | valid ppl 22.784 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 188200 | 4680 batches | lr 8.57e-06 | ms/batch 482.30 | loss 2.96 | ppl 19.223 +| epoch 17 step 188400 | 4880 batches | lr 8.28e-06 | ms/batch 434.48 | loss 2.96 | ppl 19.235 +| epoch 17 step 188600 | 5080 batches | lr 8e-06 | ms/batch 428.56 | loss 2.98 | ppl 19.594 +| epoch 17 step 188800 | 5280 batches | lr 7.72e-06 | ms/batch 428.74 | loss 2.96 | ppl 19.347 +| epoch 17 step 189000 | 5480 batches | lr 7.45e-06 | ms/batch 432.26 | loss 2.95 | ppl 19.043 +| epoch 17 step 189200 | 5680 batches | lr 7.18e-06 | ms/batch 429.46 | loss 2.98 | ppl 19.617 +| epoch 17 step 189400 | 5880 batches | lr 6.92e-06 | ms/batch 429.20 | loss 2.96 | ppl 19.388 +| epoch 17 step 189600 | 6080 batches | lr 6.66e-06 | ms/batch 430.29 | loss 2.97 | ppl 19.430 +| epoch 17 step 189800 | 6280 batches | lr 6.41e-06 | ms/batch 430.46 | loss 2.97 | ppl 19.575 +| epoch 17 step 190000 | 6480 batches | lr 6.16e-06 | ms/batch 429.53 | loss 2.95 | ppl 19.088 +| epoch 17 step 190200 | 6680 batches | lr 5.91e-06 | ms/batch 430.35 | loss 2.93 | ppl 18.675 +| epoch 17 step 190400 | 6880 batches | lr 5.68e-06 | ms/batch 428.73 | loss 2.96 | ppl 19.301 +| epoch 17 step 190600 | 7080 batches | lr 5.44e-06 | ms/batch 430.43 | loss 2.95 | ppl 19.070 +| epoch 17 step 190800 | 7280 batches | lr 5.21e-06 | ms/batch 430.71 | loss 2.91 | ppl 18.382 +| epoch 17 step 191000 | 7480 batches | lr 4.99e-06 | ms/batch 428.97 | loss 2.95 | ppl 19.146 +| epoch 17 step 191200 | 7680 batches | lr 4.77e-06 | ms/batch 428.68 | loss 2.94 | ppl 18.838 +| epoch 17 step 191400 | 7880 batches | 
lr 4.56e-06 | ms/batch 435.99 | loss 2.94 | ppl 18.890 +| epoch 17 step 191600 | 8080 batches | lr 4.35e-06 | ms/batch 428.95 | loss 2.96 | ppl 19.240 +| epoch 17 step 191800 | 8280 batches | lr 4.14e-06 | ms/batch 431.74 | loss 2.95 | ppl 19.035 +| epoch 17 step 192000 | 8480 batches | lr 3.94e-06 | ms/batch 430.40 | loss 2.95 | ppl 19.092 +---------------------------------------------------------------------------------------------------- +| Eval 48 at step 192000 | time: 1727.76s | valid loss 3.13 | valid ppl 22.769 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 192200 | 8680 batches | lr 3.75e-06 | ms/batch 482.57 | loss 2.96 | ppl 19.349 +| epoch 17 step 192400 | 8880 batches | lr 3.56e-06 | ms/batch 429.22 | loss 2.96 | ppl 19.309 +| epoch 17 step 192600 | 9080 batches | lr 3.37e-06 | ms/batch 429.91 | loss 2.96 | ppl 19.268 +| epoch 17 step 192800 | 9280 batches | lr 3.2e-06 | ms/batch 428.73 | loss 2.95 | ppl 19.147 +| epoch 17 step 193000 | 9480 batches | lr 3.02e-06 | ms/batch 429.72 | loss 2.97 | ppl 19.395 +| epoch 17 step 193200 | 9680 batches | lr 2.85e-06 | ms/batch 428.35 | loss 2.96 | ppl 19.365 +| epoch 17 step 193400 | 9880 batches | lr 2.69e-06 | ms/batch 428.39 | loss 2.94 | ppl 18.828 +| epoch 17 step 193600 | 10080 batches | lr 2.53e-06 | ms/batch 429.53 | loss 2.97 | ppl 19.541 +| epoch 17 step 193800 | 10280 batches | lr 2.37e-06 | ms/batch 431.64 | loss 2.94 | ppl 18.977 +| epoch 17 step 194000 | 10480 batches | lr 2.22e-06 | ms/batch 428.52 | loss 2.98 | ppl 19.732 +| epoch 17 step 194200 | 10680 batches | lr 2.07e-06 | ms/batch 429.27 | loss 2.96 | ppl 19.303 +| epoch 17 step 194400 | 10880 batches | lr 1.93e-06 | ms/batch 428.66 | loss 2.94 | ppl 18.856 +| epoch 17 step 194600 | 11080 batches | lr 1.8e-06 | ms/batch 429.55 | loss 2.98 | ppl 19.745 +| epoch 17 step 194800 | 11280 batches | lr 1.67e-06 | ms/batch 429.71 | loss 2.98 | ppl 19.731 +| epoch 18 step 195000 | 10 batches | lr 1.54e-06 | ms/batch 427.88 | loss 2.97 | ppl 19.547 +| epoch 18 step 195200 | 210 batches | lr 1.42e-06 | ms/batch 428.77 | loss 2.94 | ppl 18.860 +| epoch 18 step 195400 | 410 batches | lr 1.3e-06 | ms/batch 428.59 | loss 2.97 | ppl 19.491 +| epoch 18 step 195600 | 610 batches | lr 1.19e-06 | ms/batch 429.81 | loss 2.94 | ppl 18.910 +| epoch 18 step 195800 | 810 batches | lr 1.09e-06 | ms/batch 430.47 | loss 2.98 | ppl 19.594 +| epoch 18 step 196000 | 1010 batches | lr 9.87e-07 | ms/batch 430.25 | loss 2.94 | ppl 18.915 +---------------------------------------------------------------------------------------------------- +| Eval 49 at step 196000 | time: 1723.60s | valid loss 3.12 | valid ppl 22.721 +---------------------------------------------------------------------------------------------------- +| epoch 18 step 196200 | 1210 batches | lr 8.91e-07 | ms/batch 481.11 | loss 2.97 | ppl 19.444 +| epoch 18 step 196400 | 1410 batches | lr 7.99e-07 | ms/batch 429.35 | loss 2.96 | ppl 19.282 +| epoch 18 step 196600 | 1610 batches | lr 7.13e-07 | ms/batch 430.13 | loss 2.94 | ppl 18.853 +| epoch 18 step 196800 | 1810 batches | lr 6.32e-07 | ms/batch 430.89 | loss 2.97 | ppl 19.428 +| epoch 18 step 197000 | 2010 batches | lr 5.55e-07 | ms/batch 429.33 | loss 2.99 | ppl 19.982 +| epoch 18 step 197200 | 2210 batches | lr 4.84e-07 | ms/batch 434.58 | loss 2.98 | ppl 19.660 +| epoch 18 step 197400 | 2410 batches | lr 4.17e-07 | ms/batch 431.17 | loss 2.97 | ppl 19.544 +| epoch 18 step 197600 | 2610 batches | lr 3.55e-07 
| ms/batch 430.55 | loss 2.96 | ppl 19.355 +| epoch 18 step 197800 | 2810 batches | lr 2.99e-07 | ms/batch 430.41 | loss 2.94 | ppl 18.958 +| epoch 18 step 198000 | 3010 batches | lr 2.47e-07 | ms/batch 429.36 | loss 2.96 | ppl 19.330 +| epoch 18 step 198200 | 3210 batches | lr 2e-07 | ms/batch 430.41 | loss 2.96 | ppl 19.325 +| epoch 18 step 198400 | 3410 batches | lr 1.58e-07 | ms/batch 429.43 | loss 2.97 | ppl 19.499 +| epoch 18 step 198600 | 3610 batches | lr 1.21e-07 | ms/batch 431.50 | loss 2.94 | ppl 18.898 +| epoch 18 step 198800 | 3810 batches | lr 8.88e-08 | ms/batch 429.80 | loss 2.96 | ppl 19.348 +| epoch 18 step 199000 | 4010 batches | lr 6.17e-08 | ms/batch 429.77 | loss 2.98 | ppl 19.655 +| epoch 18 step 199200 | 4210 batches | lr 3.95e-08 | ms/batch 429.61 | loss 2.96 | ppl 19.266 +| epoch 18 step 199400 | 4410 batches | lr 2.22e-08 | ms/batch 430.88 | loss 2.97 | ppl 19.436 +| epoch 18 step 199600 | 4610 batches | lr 9.87e-09 | ms/batch 429.55 | loss 2.97 | ppl 19.504 +| epoch 18 step 199800 | 4810 batches | lr 2.47e-09 | ms/batch 428.95 | loss 2.94 | ppl 19.004 +| epoch 18 step 200000 | 5010 batches | lr 0 | ms/batch 430.23 | loss 2.98 | ppl 19.716 +---------------------------------------------------------------------------------------------------- +| Eval 50 at step 200000 | time: 1727.18s | valid loss 3.12 | valid ppl 22.725 +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.16 | test ppl 23.511 +==================================================================================================== diff --git a/NLP/Transformer-XL/exp_results/log-50k.txt b/NLP/Transformer-XL/exp_results/log-50k.txt new file mode 100644 index 0000000..a69845b --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-50k.txt @@ -0,0 +1,360 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adan + - lr : 0.0015 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 5000 + - decay_rate : 0.5 + - lr_min : 1e-06 + - clip : 0.25 + - clip_nonemb : False + - max_step : 50000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220809-222534 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : [0.9, 0.9, 0.999] + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 
41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 6e-05 | ms/batch 731.01 | loss 8.99 | ppl 7986.754 +| epoch 1 step 400 | 400 batches | lr 0.00012 | ms/batch 671.04 | loss 6.94 | ppl 1033.129 +| epoch 1 step 600 | 600 batches | lr 0.00018 | ms/batch 674.05 | loss 6.40 | ppl 599.798 +| epoch 1 step 800 | 800 batches | lr 0.00024 | ms/batch 672.64 | loss 6.11 | ppl 452.258 +| epoch 1 step 1000 | 1000 batches | lr 0.0003 | ms/batch 672.77 | loss 5.85 | ppl 348.893 +| epoch 1 step 1200 | 1200 batches | lr 0.00036 | ms/batch 673.66 | loss 5.65 | ppl 285.037 +| epoch 1 step 1400 | 1400 batches | lr 0.00042 | ms/batch 674.81 | loss 5.48 | ppl 240.623 +| epoch 1 step 1600 | 1600 batches | lr 0.00048 | ms/batch 671.81 | loss 5.33 | ppl 206.955 +| epoch 1 step 1800 | 1800 batches | lr 0.00054 | ms/batch 673.69 | loss 5.21 | ppl 182.225 +| epoch 1 step 2000 | 2000 batches | lr 0.0006 | ms/batch 670.74 | loss 5.09 | ppl 162.138 +| epoch 1 step 2200 | 2200 batches | lr 0.00066 | ms/batch 672.15 | loss 4.98 | ppl 145.111 +| epoch 1 step 2400 | 2400 batches | lr 0.00072 | ms/batch 670.57 | loss 4.89 | ppl 133.331 +| epoch 1 step 2600 | 2600 batches | lr 0.00078 | ms/batch 672.95 | loss 4.80 | ppl 121.355 +| epoch 1 step 2800 | 2800 batches | lr 0.00084 | ms/batch 671.53 | loss 4.72 | ppl 112.435 +| epoch 1 step 3000 | 3000 batches | lr 0.0009 | ms/batch 667.80 | loss 4.67 | ppl 107.032 +| epoch 1 step 3200 | 3200 batches | lr 0.00096 | ms/batch 670.42 | loss 4.61 | ppl 100.273 +| epoch 1 step 3400 | 3400 batches | lr 0.00102 | ms/batch 673.73 | loss 4.56 | ppl 95.679 +| epoch 1 step 3600 | 3600 batches | lr 0.00108 | ms/batch 670.60 | loss 4.48 | ppl 88.439 +| epoch 1 step 3800 | 3800 batches | lr 0.00114 | ms/batch 672.03 | loss 4.51 | ppl 90.996 +| epoch 1 step 4000 | 4000 batches | lr 0.0012 | ms/batch 660.71 | loss 4.47 | ppl 87.228 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2706.60s | valid loss 4.43 | valid ppl 83.560 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.00126 | ms/batch 741.78 | loss 4.42 | ppl 83.146 +| epoch 1 step 4400 | 4400 batches | lr 0.00132 | ms/batch 671.50 | loss 4.40 | ppl 81.572 +| epoch 1 step 4600 | 4600 batches | lr 0.00138 | ms/batch 669.10 | loss 4.38 | ppl 79.989 +| epoch 1 step 4800 | 4800 batches | lr 0.00144 | ms/batch 671.50 | loss 4.33 | ppl 76.228 +| epoch 1 step 5000 | 5000 batches | lr 0.0015 | ms/batch 669.83 | loss 4.37 | ppl 79.175 +| epoch 1 step 5200 | 5200 batches | lr 0.0015 | ms/batch 669.53 | loss 4.32 | ppl 74.879 +| epoch 1 step 5400 | 5400 batches | lr 0.00149 | ms/batch 668.42 | loss 4.26 | ppl 70.961 +| epoch 1 step 5600 | 5600 batches | lr 0.00149 | ms/batch 669.68 | loss 4.28 | ppl 72.426 +| epoch 1 step 5800 | 5800 batches | lr 0.00149 | ms/batch 668.33 | loss 4.28 | ppl 71.883 +| epoch 1 step 6000 | 6000 batches | lr 0.00148 | ms/batch 669.96 | loss 4.23 | ppl 68.809 +| epoch 1 step 6200 | 6200 batches | lr 0.00148 | ms/batch 671.62 | loss 4.20 | ppl 66.917 +| epoch 1 step 6400 | 6400 batches | lr 0.00148 | ms/batch 670.80 | loss 4.23 | ppl 68.826 +| epoch 1 step 6600 | 6600 batches | lr 0.00147 | ms/batch 671.47 | loss 4.17 | ppl 64.485 +| epoch 1 step 6800 | 6800 batches | lr 0.00147 | ms/batch 671.88 | loss 
4.16 | ppl 64.148 +| epoch 1 step 7000 | 7000 batches | lr 0.00146 | ms/batch 669.08 | loss 4.16 | ppl 64.382 +| epoch 1 step 7200 | 7200 batches | lr 0.00146 | ms/batch 669.37 | loss 4.12 | ppl 61.310 +| epoch 1 step 7400 | 7400 batches | lr 0.00146 | ms/batch 669.99 | loss 4.11 | ppl 61.000 +| epoch 1 step 7600 | 7600 batches | lr 0.00145 | ms/batch 669.12 | loss 4.09 | ppl 59.732 +| epoch 1 step 7800 | 7800 batches | lr 0.00145 | ms/batch 671.55 | loss 4.11 | ppl 60.794 +| epoch 1 step 8000 | 8000 batches | lr 0.00144 | ms/batch 659.11 | loss 4.10 | ppl 60.478 +---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 2687.58s | valid loss 4.01 | valid ppl 55.175 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.00144 | ms/batch 742.68 | loss 4.08 | ppl 58.932 +| epoch 1 step 8400 | 8400 batches | lr 0.00143 | ms/batch 669.52 | loss 4.09 | ppl 59.603 +| epoch 1 step 8600 | 8600 batches | lr 0.00143 | ms/batch 670.69 | loss 4.07 | ppl 58.419 +| epoch 1 step 8800 | 8800 batches | lr 0.00142 | ms/batch 670.29 | loss 4.08 | ppl 58.862 +| epoch 1 step 9000 | 9000 batches | lr 0.00142 | ms/batch 671.07 | loss 4.04 | ppl 57.075 +| epoch 1 step 9200 | 9200 batches | lr 0.00141 | ms/batch 670.31 | loss 4.03 | ppl 56.375 +| epoch 1 step 9400 | 9400 batches | lr 0.00141 | ms/batch 668.76 | loss 4.04 | ppl 56.654 +| epoch 1 step 9600 | 9600 batches | lr 0.0014 | ms/batch 668.70 | loss 4.05 | ppl 57.438 +| epoch 1 step 9800 | 9800 batches | lr 0.0014 | ms/batch 669.90 | loss 4.01 | ppl 54.931 +| epoch 1 step 10000 | 10000 batches | lr 0.00139 | ms/batch 671.54 | loss 4.02 | ppl 55.691 +| epoch 1 step 10200 | 10200 batches | lr 0.00138 | ms/batch 668.10 | loss 3.98 | ppl 53.731 +| epoch 1 step 10400 | 10400 batches | lr 0.00138 | ms/batch 668.55 | loss 3.98 | ppl 53.647 +| epoch 1 step 10600 | 10600 batches | lr 0.00137 | ms/batch 670.24 | loss 4.00 | ppl 54.823 +| epoch 1 step 10800 | 10800 batches | lr 0.00137 | ms/batch 669.67 | loss 3.96 | ppl 52.449 +| epoch 1 step 11000 | 11000 batches | lr 0.00136 | ms/batch 668.12 | loss 4.00 | ppl 54.511 +| epoch 1 step 11200 | 11200 batches | lr 0.00135 | ms/batch 669.36 | loss 3.98 | ppl 53.348 +| epoch 1 step 11400 | 11400 batches | lr 0.00135 | ms/batch 667.23 | loss 3.97 | ppl 53.053 +| epoch 2 step 11600 | 130 batches | lr 0.00134 | ms/batch 671.47 | loss 3.95 | ppl 51.832 +| epoch 2 step 11800 | 330 batches | lr 0.00134 | ms/batch 670.28 | loss 3.92 | ppl 50.430 +| epoch 2 step 12000 | 530 batches | lr 0.00133 | ms/batch 658.97 | loss 3.94 | ppl 51.495 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 2685.36s | valid loss 3.83 | valid ppl 46.199 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 12200 | 730 batches | lr 0.00132 | ms/batch 741.77 | loss 3.91 | ppl 50.018 +| epoch 2 step 12400 | 930 batches | lr 0.00132 | ms/batch 669.29 | loss 3.91 | ppl 50.118 +| epoch 2 step 12600 | 1130 batches | lr 0.00131 | ms/batch 670.23 | loss 3.94 | ppl 51.393 +| epoch 2 step 12800 | 1330 batches | lr 0.0013 | ms/batch 670.21 | loss 3.91 | ppl 49.684 +| epoch 2 step 13000 | 1530 batches | lr 0.00129 | ms/batch 669.82 | loss 3.90 | ppl 49.205 +| epoch 2 step 13200 | 1730 batches | lr 0.00129 | ms/batch 668.80 | loss 3.89 | ppl 48.946 +| epoch 2 
step 13400 | 1930 batches | lr 0.00128 | ms/batch 669.89 | loss 3.90 | ppl 49.160 +| epoch 2 step 13600 | 2130 batches | lr 0.00127 | ms/batch 670.73 | loss 3.91 | ppl 50.134 +| epoch 2 step 13800 | 2330 batches | lr 0.00127 | ms/batch 669.47 | loss 3.89 | ppl 48.907 +| epoch 2 step 14000 | 2530 batches | lr 0.00126 | ms/batch 670.64 | loss 3.88 | ppl 48.187 +| epoch 2 step 14200 | 2730 batches | lr 0.00125 | ms/batch 669.45 | loss 3.85 | ppl 47.194 +| epoch 2 step 14400 | 2930 batches | lr 0.00124 | ms/batch 670.69 | loss 3.84 | ppl 46.316 +| epoch 2 step 14600 | 3130 batches | lr 0.00124 | ms/batch 668.19 | loss 3.84 | ppl 46.742 +| epoch 2 step 14800 | 3330 batches | lr 0.00123 | ms/batch 668.82 | loss 3.85 | ppl 46.832 +| epoch 2 step 15000 | 3530 batches | lr 0.00122 | ms/batch 669.99 | loss 3.81 | ppl 45.024 +| epoch 2 step 15200 | 3730 batches | lr 0.00121 | ms/batch 668.58 | loss 3.83 | ppl 46.255 +| epoch 2 step 15400 | 3930 batches | lr 0.0012 | ms/batch 670.31 | loss 3.82 | ppl 45.787 +| epoch 2 step 15600 | 4130 batches | lr 0.0012 | ms/batch 667.87 | loss 3.81 | ppl 45.203 +| epoch 2 step 15800 | 4330 batches | lr 0.00119 | ms/batch 669.87 | loss 3.82 | ppl 45.456 +| epoch 2 step 16000 | 4530 batches | lr 0.00118 | ms/batch 656.97 | loss 3.82 | ppl 45.455 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 2684.61s | valid loss 3.70 | valid ppl 40.554 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.00117 | ms/batch 743.72 | loss 3.77 | ppl 43.325 +| epoch 2 step 16400 | 4930 batches | lr 0.00116 | ms/batch 669.07 | loss 3.79 | ppl 44.198 +| epoch 2 step 16600 | 5130 batches | lr 0.00116 | ms/batch 670.76 | loss 3.78 | ppl 43.728 +| epoch 2 step 16800 | 5330 batches | lr 0.00115 | ms/batch 673.39 | loss 3.77 | ppl 43.271 +| epoch 2 step 17000 | 5530 batches | lr 0.00114 | ms/batch 668.77 | loss 3.75 | ppl 42.620 +| epoch 2 step 17200 | 5730 batches | lr 0.00113 | ms/batch 668.81 | loss 3.77 | ppl 43.340 +| epoch 2 step 17400 | 5930 batches | lr 0.00112 | ms/batch 671.39 | loss 3.75 | ppl 42.598 +| epoch 2 step 17600 | 6130 batches | lr 0.00111 | ms/batch 670.80 | loss 3.74 | ppl 42.211 +| epoch 2 step 17800 | 6330 batches | lr 0.0011 | ms/batch 670.83 | loss 3.77 | ppl 43.377 +| epoch 2 step 18000 | 6530 batches | lr 0.0011 | ms/batch 670.94 | loss 3.71 | ppl 40.882 +| epoch 2 step 18200 | 6730 batches | lr 0.00109 | ms/batch 671.71 | loss 3.71 | ppl 41.009 +| epoch 2 step 18400 | 6930 batches | lr 0.00108 | ms/batch 671.77 | loss 3.73 | ppl 41.510 +| epoch 2 step 18600 | 7130 batches | lr 0.00107 | ms/batch 672.45 | loss 3.70 | ppl 40.538 +| epoch 2 step 18800 | 7330 batches | lr 0.00106 | ms/batch 676.93 | loss 3.68 | ppl 39.664 +| epoch 2 step 19000 | 7530 batches | lr 0.00105 | ms/batch 673.81 | loss 3.70 | ppl 40.567 +| epoch 2 step 19200 | 7730 batches | lr 0.00104 | ms/batch 673.02 | loss 3.70 | ppl 40.493 +| epoch 2 step 19400 | 7930 batches | lr 0.00103 | ms/batch 671.76 | loss 3.69 | ppl 40.199 +| epoch 2 step 19600 | 8130 batches | lr 0.00102 | ms/batch 672.49 | loss 3.70 | ppl 40.628 +| epoch 2 step 19800 | 8330 batches | lr 0.00102 | ms/batch 675.15 | loss 3.69 | ppl 40.150 +| epoch 2 step 20000 | 8530 batches | lr 0.00101 | ms/batch 662.59 | loss 3.68 | ppl 39.675 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 
| time: 2694.60s | valid loss 3.60 | valid ppl 36.520 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000997 | ms/batch 743.34 | loss 3.70 | ppl 40.281 +| epoch 2 step 20400 | 8930 batches | lr 0.000988 | ms/batch 672.38 | loss 3.69 | ppl 40.101 +| epoch 2 step 20600 | 9130 batches | lr 0.000978 | ms/batch 671.32 | loss 3.68 | ppl 39.723 +| epoch 2 step 20800 | 9330 batches | lr 0.000969 | ms/batch 670.29 | loss 3.67 | ppl 39.195 +| epoch 2 step 21000 | 9530 batches | lr 0.00096 | ms/batch 673.92 | loss 3.71 | ppl 40.874 +| epoch 2 step 21200 | 9730 batches | lr 0.00095 | ms/batch 673.78 | loss 3.66 | ppl 38.777 +| epoch 2 step 21400 | 9930 batches | lr 0.000941 | ms/batch 671.65 | loss 3.67 | ppl 39.193 +| epoch 2 step 21600 | 10130 batches | lr 0.000932 | ms/batch 671.55 | loss 3.65 | ppl 38.482 +| epoch 2 step 21800 | 10330 batches | lr 0.000922 | ms/batch 671.69 | loss 3.66 | ppl 38.807 +| epoch 2 step 22000 | 10530 batches | lr 0.000913 | ms/batch 671.36 | loss 3.67 | ppl 39.367 +| epoch 2 step 22200 | 10730 batches | lr 0.000903 | ms/batch 672.87 | loss 3.63 | ppl 37.849 +| epoch 2 step 22400 | 10930 batches | lr 0.000894 | ms/batch 674.08 | loss 3.63 | ppl 37.837 +| epoch 2 step 22600 | 11130 batches | lr 0.000884 | ms/batch 671.07 | loss 3.68 | ppl 39.497 +| epoch 2 step 22800 | 11330 batches | lr 0.000875 | ms/batch 671.94 | loss 3.64 | ppl 38.144 +| epoch 3 step 23000 | 60 batches | lr 0.000865 | ms/batch 672.34 | loss 3.65 | ppl 38.332 +| epoch 3 step 23200 | 260 batches | lr 0.000855 | ms/batch 674.27 | loss 3.60 | ppl 36.501 +| epoch 3 step 23400 | 460 batches | lr 0.000846 | ms/batch 674.42 | loss 3.64 | ppl 37.995 +| epoch 3 step 23600 | 660 batches | lr 0.000836 | ms/batch 672.56 | loss 3.60 | ppl 36.540 +| epoch 3 step 23800 | 860 batches | lr 0.000827 | ms/batch 673.12 | loss 3.63 | ppl 37.738 +| epoch 3 step 24000 | 1060 batches | lr 0.000817 | ms/batch 664.65 | loss 3.62 | ppl 37.164 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 2697.80s | valid loss 3.52 | valid ppl 33.726 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000807 | ms/batch 740.67 | loss 3.60 | ppl 36.765 +| epoch 3 step 24400 | 1460 batches | lr 0.000798 | ms/batch 674.30 | loss 3.60 | ppl 36.720 +| epoch 3 step 24600 | 1660 batches | lr 0.000788 | ms/batch 672.55 | loss 3.59 | ppl 36.339 +| epoch 3 step 24800 | 1860 batches | lr 0.000778 | ms/batch 671.83 | loss 3.60 | ppl 36.487 +| epoch 3 step 25000 | 2060 batches | lr 0.000769 | ms/batch 671.74 | loss 3.63 | ppl 37.859 +| epoch 3 step 25200 | 2260 batches | lr 0.000759 | ms/batch 672.23 | loss 3.61 | ppl 36.807 +| epoch 3 step 25400 | 2460 batches | lr 0.000749 | ms/batch 671.61 | loss 3.59 | ppl 36.224 +| epoch 3 step 25600 | 2660 batches | lr 0.00074 | ms/batch 674.02 | loss 3.59 | ppl 36.343 +| epoch 3 step 25800 | 2860 batches | lr 0.00073 | ms/batch 671.84 | loss 3.53 | ppl 34.173 +| epoch 3 step 26000 | 3060 batches | lr 0.00072 | ms/batch 672.60 | loss 3.58 | ppl 35.903 +| epoch 3 step 26200 | 3260 batches | lr 0.000711 | ms/batch 673.04 | loss 3.58 | ppl 35.696 +| epoch 3 step 26400 | 3460 batches | lr 0.000701 | ms/batch 673.00 | loss 3.54 | ppl 34.395 +| epoch 3 step 26600 | 3660 batches | lr 0.000692 | ms/batch 673.81 | loss 3.55 | ppl 34.771 +| epoch 3 step 
26800 | 3860 batches | lr 0.000682 | ms/batch 672.00 | loss 3.55 | ppl 34.852 +| epoch 3 step 27000 | 4060 batches | lr 0.000672 | ms/batch 673.44 | loss 3.56 | ppl 35.128 +| epoch 3 step 27200 | 4260 batches | lr 0.000663 | ms/batch 671.63 | loss 3.54 | ppl 34.582 +| epoch 3 step 27400 | 4460 batches | lr 0.000653 | ms/batch 672.23 | loss 3.55 | ppl 34.678 +| epoch 3 step 27600 | 4660 batches | lr 0.000644 | ms/batch 671.70 | loss 3.53 | ppl 34.204 +| epoch 3 step 27800 | 4860 batches | lr 0.000634 | ms/batch 670.97 | loss 3.52 | ppl 33.707 +| epoch 3 step 28000 | 5060 batches | lr 0.000625 | ms/batch 663.55 | loss 3.53 | ppl 34.105 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 2697.22s | valid loss 3.44 | valid ppl 31.229 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000615 | ms/batch 738.31 | loss 3.51 | ppl 33.439 +| epoch 3 step 28400 | 5460 batches | lr 0.000606 | ms/batch 670.03 | loss 3.49 | ppl 32.676 +| epoch 3 step 28600 | 5660 batches | lr 0.000596 | ms/batch 673.65 | loss 3.53 | ppl 34.273 +| epoch 3 step 28800 | 5860 batches | lr 0.000587 | ms/batch 670.70 | loss 3.50 | ppl 33.257 +| epoch 3 step 29000 | 6060 batches | lr 0.000577 | ms/batch 672.88 | loss 3.50 | ppl 33.035 +| epoch 3 step 29200 | 6260 batches | lr 0.000568 | ms/batch 671.74 | loss 3.50 | ppl 33.001 +| epoch 3 step 29400 | 6460 batches | lr 0.000559 | ms/batch 670.97 | loss 3.50 | ppl 33.162 +| epoch 3 step 29600 | 6660 batches | lr 0.00055 | ms/batch 671.14 | loss 3.45 | ppl 31.426 +| epoch 3 step 29800 | 6860 batches | lr 0.00054 | ms/batch 672.59 | loss 3.48 | ppl 32.386 +| epoch 3 step 30000 | 7060 batches | lr 0.000531 | ms/batch 671.72 | loss 3.47 | ppl 32.047 +| epoch 3 step 30200 | 7260 batches | lr 0.000522 | ms/batch 669.64 | loss 3.44 | ppl 31.093 +| epoch 3 step 30400 | 7460 batches | lr 0.000513 | ms/batch 674.88 | loss 3.46 | ppl 31.766 +| epoch 3 step 30600 | 7660 batches | lr 0.000504 | ms/batch 673.98 | loss 3.44 | ppl 31.226 +| epoch 3 step 30800 | 7860 batches | lr 0.000495 | ms/batch 672.05 | loss 3.45 | ppl 31.633 +| epoch 3 step 31000 | 8060 batches | lr 0.000486 | ms/batch 675.06 | loss 3.46 | ppl 31.822 +| epoch 3 step 31200 | 8260 batches | lr 0.000477 | ms/batch 675.76 | loss 3.45 | ppl 31.384 +| epoch 3 step 31400 | 8460 batches | lr 0.000468 | ms/batch 674.16 | loss 3.46 | ppl 31.680 +| epoch 3 step 31600 | 8660 batches | lr 0.000459 | ms/batch 673.56 | loss 3.45 | ppl 31.480 +| epoch 3 step 31800 | 8860 batches | lr 0.00045 | ms/batch 671.05 | loss 3.45 | ppl 31.470 +| epoch 3 step 32000 | 9060 batches | lr 0.000441 | ms/batch 662.55 | loss 3.45 | ppl 31.454 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 2696.71s | valid loss 3.37 | valid ppl 29.048 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000433 | ms/batch 741.24 | loss 3.43 | ppl 30.924 +| epoch 3 step 32400 | 9460 batches | lr 0.000424 | ms/batch 672.63 | loss 3.45 | ppl 31.583 +| epoch 3 step 32600 | 9660 batches | lr 0.000415 | ms/batch 672.60 | loss 3.45 | ppl 31.560 +| epoch 3 step 32800 | 9860 batches | lr 0.000407 | ms/batch 671.88 | loss 3.41 | ppl 30.145 +| epoch 3 step 33000 | 10060 batches | lr 0.000398 | ms/batch 672.49 | loss 3.45 | ppl 31.582 
+| epoch 3 step 33200 | 10260 batches | lr 0.00039 | ms/batch 671.16 | loss 3.40 | ppl 29.971 +| epoch 3 step 33400 | 10460 batches | lr 0.000382 | ms/batch 671.28 | loss 3.43 | ppl 30.997 +| epoch 3 step 33600 | 10660 batches | lr 0.000373 | ms/batch 672.12 | loss 3.44 | ppl 31.166 +| epoch 3 step 33800 | 10860 batches | lr 0.000365 | ms/batch 671.60 | loss 3.39 | ppl 29.578 +| epoch 3 step 34000 | 11060 batches | lr 0.000357 | ms/batch 672.62 | loss 3.43 | ppl 30.954 +| epoch 3 step 34200 | 11260 batches | lr 0.000349 | ms/batch 671.84 | loss 3.44 | ppl 31.123 +| epoch 3 step 34400 | 11460 batches | lr 0.000341 | ms/batch 673.17 | loss 3.41 | ppl 30.185 +| epoch 4 step 34600 | 190 batches | lr 0.000333 | ms/batch 670.84 | loss 3.39 | ppl 29.520 +| epoch 4 step 34800 | 390 batches | lr 0.000325 | ms/batch 673.47 | loss 3.39 | ppl 29.798 +| epoch 4 step 35000 | 590 batches | lr 0.000317 | ms/batch 672.91 | loss 3.38 | ppl 29.482 +| epoch 4 step 35200 | 790 batches | lr 0.000309 | ms/batch 671.06 | loss 3.40 | ppl 29.950 +| epoch 4 step 35400 | 990 batches | lr 0.000301 | ms/batch 673.00 | loss 3.38 | ppl 29.249 +| epoch 4 step 35600 | 1190 batches | lr 0.000294 | ms/batch 673.68 | loss 3.39 | ppl 29.768 +| epoch 4 step 35800 | 1390 batches | lr 0.000286 | ms/batch 671.24 | loss 3.38 | ppl 29.479 +| epoch 4 step 36000 | 1590 batches | lr 0.000279 | ms/batch 660.61 | loss 3.37 | ppl 29.048 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 2695.59s | valid loss 3.32 | valid ppl 27.645 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.000271 | ms/batch 738.61 | loss 3.38 | ppl 29.267 +| epoch 4 step 36400 | 1990 batches | lr 0.000264 | ms/batch 671.84 | loss 3.41 | ppl 30.128 +| epoch 4 step 36600 | 2190 batches | lr 0.000257 | ms/batch 670.16 | loss 3.39 | ppl 29.614 +| epoch 4 step 36800 | 2390 batches | lr 0.00025 | ms/batch 672.50 | loss 3.39 | ppl 29.549 +| epoch 4 step 37000 | 2590 batches | lr 0.000242 | ms/batch 674.54 | loss 3.36 | ppl 28.867 +| epoch 4 step 37200 | 2790 batches | lr 0.000235 | ms/batch 672.19 | loss 3.34 | ppl 28.314 +| epoch 4 step 37400 | 2990 batches | lr 0.000229 | ms/batch 670.71 | loss 3.36 | ppl 28.677 +| epoch 4 step 37600 | 3190 batches | lr 0.000222 | ms/batch 668.95 | loss 3.36 | ppl 28.682 +| epoch 4 step 37800 | 3390 batches | lr 0.000215 | ms/batch 672.94 | loss 3.36 | ppl 28.683 +| epoch 4 step 38000 | 3590 batches | lr 0.000208 | ms/batch 672.33 | loss 3.33 | ppl 27.802 +| epoch 4 step 38200 | 3790 batches | lr 0.000202 | ms/batch 673.11 | loss 3.34 | ppl 28.335 +| epoch 4 step 38400 | 3990 batches | lr 0.000195 | ms/batch 670.77 | loss 3.36 | ppl 28.747 +| epoch 4 step 38600 | 4190 batches | lr 0.000189 | ms/batch 671.42 | loss 3.34 | ppl 28.160 +| epoch 4 step 38800 | 4390 batches | lr 0.000183 | ms/batch 674.42 | loss 3.34 | ppl 28.212 +| epoch 4 step 39000 | 4590 batches | lr 0.000176 | ms/batch 671.51 | loss 3.35 | ppl 28.619 +| epoch 4 step 39200 | 4790 batches | lr 0.00017 | ms/batch 673.38 | loss 3.30 | ppl 27.241 +| epoch 4 step 39400 | 4990 batches | lr 0.000164 | ms/batch 671.09 | loss 3.35 | ppl 28.548 +| epoch 4 step 39600 | 5190 batches | lr 0.000158 | ms/batch 673.71 | loss 3.31 | ppl 27.271 +| epoch 4 step 39800 | 5390 batches | lr 0.000153 | ms/batch 671.79 | loss 3.29 | ppl 26.839 +| epoch 4 step 40000 | 5590 batches | lr 0.000147 | ms/batch 663.99 | loss 
3.31 | ppl 27.419 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 2695.51s | valid loss 3.28 | valid ppl 26.473 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000141 | ms/batch 737.94 | loss 3.33 | ppl 27.939 +| epoch 4 step 40400 | 5990 batches | lr 0.000136 | ms/batch 674.02 | loss 3.30 | ppl 27.155 +| epoch 4 step 40600 | 6190 batches | lr 0.00013 | ms/batch 671.99 | loss 3.30 | ppl 27.222 +| epoch 4 step 40800 | 6390 batches | lr 0.000125 | ms/batch 674.33 | loss 3.33 | ppl 27.819 +| epoch 4 step 41000 | 6590 batches | lr 0.00012 | ms/batch 672.00 | loss 3.26 | ppl 26.092 +| epoch 4 step 41200 | 6790 batches | lr 0.000115 | ms/batch 670.91 | loss 3.29 | ppl 26.772 +| epoch 4 step 41400 | 6990 batches | lr 0.00011 | ms/batch 670.93 | loss 3.30 | ppl 27.098 +| epoch 4 step 41600 | 7190 batches | lr 0.000105 | ms/batch 672.93 | loss 3.25 | ppl 25.775 +| epoch 4 step 41800 | 7390 batches | lr 9.98e-05 | ms/batch 673.77 | loss 3.28 | ppl 26.457 +| epoch 4 step 42000 | 7590 batches | lr 9.51e-05 | ms/batch 672.27 | loss 3.25 | ppl 25.813 +| epoch 4 step 42200 | 7790 batches | lr 9.05e-05 | ms/batch 671.48 | loss 3.28 | ppl 26.654 +| epoch 4 step 42400 | 7990 batches | lr 8.6e-05 | ms/batch 671.27 | loss 3.28 | ppl 26.600 +| epoch 4 step 42600 | 8190 batches | lr 8.16e-05 | ms/batch 673.39 | loss 3.27 | ppl 26.227 +| epoch 4 step 42800 | 8390 batches | lr 7.73e-05 | ms/batch 673.21 | loss 3.29 | ppl 26.959 +| epoch 4 step 43000 | 8590 batches | lr 7.32e-05 | ms/batch 675.70 | loss 3.27 | ppl 26.299 +| epoch 4 step 43200 | 8790 batches | lr 6.91e-05 | ms/batch 673.58 | loss 3.29 | ppl 26.749 +| epoch 4 step 43400 | 8990 batches | lr 6.52e-05 | ms/batch 673.15 | loss 3.28 | ppl 26.451 +| epoch 4 step 43600 | 9190 batches | lr 6.13e-05 | ms/batch 671.88 | loss 3.26 | ppl 26.136 +| epoch 4 step 43800 | 9390 batches | lr 5.76e-05 | ms/batch 673.32 | loss 3.28 | ppl 26.443 +| epoch 4 step 44000 | 9590 batches | lr 5.4e-05 | ms/batch 662.94 | loss 3.29 | ppl 26.910 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 2697.59s | valid loss 3.25 | valid ppl 25.763 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 5.05e-05 | ms/batch 740.81 | loss 3.27 | ppl 26.191 +| epoch 4 step 44400 | 9990 batches | lr 4.71e-05 | ms/batch 672.14 | loss 3.26 | ppl 26.166 +| epoch 4 step 44600 | 10190 batches | lr 4.38e-05 | ms/batch 670.84 | loss 3.26 | ppl 26.037 +| epoch 4 step 44800 | 10390 batches | lr 4.07e-05 | ms/batch 672.90 | loss 3.26 | ppl 26.088 +| epoch 4 step 45000 | 10590 batches | lr 3.76e-05 | ms/batch 673.66 | loss 3.29 | ppl 26.884 +| epoch 4 step 45200 | 10790 batches | lr 3.47e-05 | ms/batch 672.88 | loss 3.24 | ppl 25.586 +| epoch 4 step 45400 | 10990 batches | lr 3.19e-05 | ms/batch 671.20 | loss 3.28 | ppl 26.487 +| epoch 4 step 45600 | 11190 batches | lr 2.92e-05 | ms/batch 674.06 | loss 3.28 | ppl 26.688 +| epoch 4 step 45800 | 11390 batches | lr 2.66e-05 | ms/batch 670.83 | loss 3.28 | ppl 26.449 +| epoch 5 step 46000 | 120 batches | lr 2.41e-05 | ms/batch 671.63 | loss 3.26 | ppl 26.029 +| epoch 5 step 46200 | 320 batches | lr 2.18e-05 | ms/batch 675.05 | loss 3.24 | ppl 25.647 +| epoch 5 step 46400 | 520 batches | lr 1.96e-05 | 
ms/batch 671.64 | loss 3.28 | ppl 26.462 +| epoch 5 step 46600 | 720 batches | lr 1.75e-05 | ms/batch 674.85 | loss 3.24 | ppl 25.535 +| epoch 5 step 46800 | 920 batches | lr 1.55e-05 | ms/batch 672.46 | loss 3.24 | ppl 25.522 +| epoch 5 step 47000 | 1120 batches | lr 1.36e-05 | ms/batch 672.98 | loss 3.28 | ppl 26.567 +| epoch 5 step 47200 | 1320 batches | lr 1.19e-05 | ms/batch 669.86 | loss 3.24 | ppl 25.624 +| epoch 5 step 47400 | 1520 batches | lr 1.02e-05 | ms/batch 673.34 | loss 3.25 | ppl 25.746 +| epoch 5 step 47600 | 1720 batches | lr 8.72e-06 | ms/batch 673.91 | loss 3.24 | ppl 25.514 +| epoch 5 step 47800 | 1920 batches | lr 7.33e-06 | ms/batch 672.36 | loss 3.27 | ppl 26.267 +| epoch 5 step 48000 | 2120 batches | lr 6.06e-06 | ms/batch 663.53 | loss 3.29 | ppl 26.743 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 2697.55s | valid loss 3.24 | valid ppl 25.471 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 48200 | 2320 batches | lr 4.91e-06 | ms/batch 739.34 | loss 3.27 | ppl 26.196 +| epoch 5 step 48400 | 2520 batches | lr 3.88e-06 | ms/batch 674.08 | loss 3.25 | ppl 25.864 +| epoch 5 step 48600 | 2720 batches | lr 2.97e-06 | ms/batch 672.56 | loss 3.24 | ppl 25.526 +| epoch 5 step 48800 | 2920 batches | lr 2.18e-06 | ms/batch 672.85 | loss 3.23 | ppl 25.302 +| epoch 5 step 49000 | 3120 batches | lr 1.52e-06 | ms/batch 673.40 | loss 3.25 | ppl 25.757 +| epoch 5 step 49200 | 3320 batches | lr 9.71e-07 | ms/batch 672.09 | loss 3.27 | ppl 26.197 +| epoch 5 step 49400 | 3520 batches | lr 5.46e-07 | ms/batch 670.25 | loss 3.23 | ppl 25.175 +| epoch 5 step 49600 | 3720 batches | lr 2.43e-07 | ms/batch 673.34 | loss 3.25 | ppl 25.791 +| epoch 5 step 49800 | 3920 batches | lr 6.07e-08 | ms/batch 670.68 | loss 3.25 | ppl 25.720 +| epoch 5 step 50000 | 4120 batches | lr 0 | ms/batch 475.96 | loss 3.25 | ppl 25.749 +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.27 | test ppl 26.217 +==================================================================================================== diff --git a/NLP/Transformer-XL/exp_results/log-adam.txt b/NLP/Transformer-XL/exp_results/log-adam.txt new file mode 100644 index 0000000..34c52c6 --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-adam.txt @@ -0,0 +1,1224 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adam + - lr : 0.00025 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 0 + - decay_rate : 0.5 + - lr_min : 0.0 + - clip : 0.25 + - clip_nonemb : False + - max_step : 200000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - 
eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220810-185417 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : None + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 0.00025 | ms/batch 764.49 | loss 6.97 | ppl 1066.907 +| epoch 1 step 400 | 400 batches | lr 0.00025 | ms/batch 687.98 | loss 6.03 | ppl 417.069 +| epoch 1 step 600 | 600 batches | lr 0.00025 | ms/batch 683.07 | loss 5.69 | ppl 297.083 +| epoch 1 step 800 | 800 batches | lr 0.00025 | ms/batch 723.35 | loss 5.49 | ppl 241.413 +| epoch 1 step 1000 | 1000 batches | lr 0.00025 | ms/batch 694.77 | loss 5.30 | ppl 199.605 +| epoch 1 step 1200 | 1200 batches | lr 0.00025 | ms/batch 677.41 | loss 5.17 | ppl 176.453 +| epoch 1 step 1400 | 1400 batches | lr 0.00025 | ms/batch 677.36 | loss 5.07 | ppl 159.156 +| epoch 1 step 1600 | 1600 batches | lr 0.00025 | ms/batch 638.81 | loss 4.98 | ppl 145.306 +| epoch 1 step 1800 | 1800 batches | lr 0.00025 | ms/batch 383.71 | loss 4.91 | ppl 136.268 +| epoch 1 step 2000 | 2000 batches | lr 0.00025 | ms/batch 382.65 | loss 4.85 | ppl 127.951 +| epoch 1 step 2200 | 2200 batches | lr 0.00025 | ms/batch 382.54 | loss 4.78 | ppl 119.484 +| epoch 1 step 2400 | 2400 batches | lr 0.00025 | ms/batch 382.40 | loss 4.73 | ppl 113.765 +| epoch 1 step 2600 | 2600 batches | lr 0.00025 | ms/batch 384.26 | loss 4.68 | ppl 107.611 +| epoch 1 step 2800 | 2800 batches | lr 0.00025 | ms/batch 382.49 | loss 4.63 | ppl 102.007 +| epoch 1 step 3000 | 3000 batches | lr 0.00025 | ms/batch 383.20 | loss 4.60 | ppl 99.044 +| epoch 1 step 3200 | 3200 batches | lr 0.00025 | ms/batch 382.09 | loss 4.55 | ppl 94.494 +| epoch 1 step 3400 | 3400 batches | lr 0.00025 | ms/batch 382.43 | loss 4.52 | ppl 91.563 +| epoch 1 step 3600 | 3600 batches | lr 0.00025 | ms/batch 382.40 | loss 4.45 | ppl 85.252 +| epoch 1 step 3800 | 3800 batches | lr 0.00025 | ms/batch 382.46 | loss 4.49 | ppl 88.831 +| epoch 1 step 4000 | 4000 batches | lr 0.00025 | ms/batch 382.79 | loss 4.45 | ppl 85.701 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2034.38s | valid loss 4.28 | valid ppl 72.551 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.00025 | ms/batch 425.25 | loss 4.40 | ppl 81.592 +| epoch 1 step 4400 | 4400 batches | lr 0.00025 | ms/batch 382.45 | loss 4.38 | ppl 80.012 +| epoch 1 step 4600 | 4600 batches | lr 0.00025 | ms/batch 381.95 | loss 4.36 | ppl 78.430 +| epoch 1 step 4800 | 4800 batches | lr 0.00025 | ms/batch 383.26 | loss 4.31 | ppl 74.659 +| epoch 1 step 5000 | 5000 batches | lr 0.00025 | ms/batch 382.36 | loss 4.35 | ppl 77.294 +| epoch 1 step 5200 | 5200 batches | lr 0.00025 | ms/batch 383.05 | loss 4.29 | ppl 73.083 +| epoch 1 step 5400 | 5400 batches | lr 0.00025 | ms/batch 382.53 | loss 4.24 | ppl 69.188 +| epoch 1 step 5600 | 5600 batches | lr 0.00025 | ms/batch 382.05 | loss 4.26 | ppl 70.726 +| epoch 
1 step 5800 | 5800 batches | lr 0.000249 | ms/batch 383.48 | loss 4.26 | ppl 70.533 +| epoch 1 step 6000 | 6000 batches | lr 0.000249 | ms/batch 382.63 | loss 4.21 | ppl 67.321 +| epoch 1 step 6200 | 6200 batches | lr 0.000249 | ms/batch 382.38 | loss 4.18 | ppl 65.667 +| epoch 1 step 6400 | 6400 batches | lr 0.000249 | ms/batch 382.63 | loss 4.22 | ppl 68.112 +| epoch 1 step 6600 | 6600 batches | lr 0.000249 | ms/batch 383.94 | loss 4.15 | ppl 63.675 +| epoch 1 step 6800 | 6800 batches | lr 0.000249 | ms/batch 383.22 | loss 4.15 | ppl 63.453 +| epoch 1 step 7000 | 7000 batches | lr 0.000249 | ms/batch 382.85 | loss 4.15 | ppl 63.563 +| epoch 1 step 7200 | 7200 batches | lr 0.000249 | ms/batch 383.21 | loss 4.10 | ppl 60.547 +| epoch 1 step 7400 | 7400 batches | lr 0.000249 | ms/batch 382.26 | loss 4.10 | ppl 60.203 +| epoch 1 step 7600 | 7600 batches | lr 0.000249 | ms/batch 382.51 | loss 4.08 | ppl 58.953 +| epoch 1 step 7800 | 7800 batches | lr 0.000249 | ms/batch 382.04 | loss 4.10 | ppl 60.279 +| epoch 1 step 8000 | 8000 batches | lr 0.000249 | ms/batch 382.26 | loss 4.09 | ppl 59.987 +---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 1537.11s | valid loss 3.92 | valid ppl 50.244 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.000249 | ms/batch 426.91 | loss 4.07 | ppl 58.474 +| epoch 1 step 8400 | 8400 batches | lr 0.000249 | ms/batch 382.09 | loss 4.08 | ppl 58.943 +| epoch 1 step 8600 | 8600 batches | lr 0.000249 | ms/batch 383.51 | loss 4.06 | ppl 57.842 +| epoch 1 step 8800 | 8800 batches | lr 0.000249 | ms/batch 383.16 | loss 4.07 | ppl 58.371 +| epoch 1 step 9000 | 9000 batches | lr 0.000249 | ms/batch 382.59 | loss 4.03 | ppl 56.484 +| epoch 1 step 9200 | 9200 batches | lr 0.000249 | ms/batch 383.24 | loss 4.02 | ppl 55.887 +| epoch 1 step 9400 | 9400 batches | lr 0.000249 | ms/batch 382.44 | loss 4.03 | ppl 56.143 +| epoch 1 step 9600 | 9600 batches | lr 0.000249 | ms/batch 382.34 | loss 4.04 | ppl 56.989 +| epoch 1 step 9800 | 9800 batches | lr 0.000249 | ms/batch 382.46 | loss 4.00 | ppl 54.426 +| epoch 1 step 10000 | 10000 batches | lr 0.000248 | ms/batch 383.27 | loss 4.01 | ppl 55.195 +| epoch 1 step 10200 | 10200 batches | lr 0.000248 | ms/batch 382.34 | loss 3.98 | ppl 53.358 +| epoch 1 step 10400 | 10400 batches | lr 0.000248 | ms/batch 382.68 | loss 3.97 | ppl 53.066 +| epoch 1 step 10600 | 10600 batches | lr 0.000248 | ms/batch 382.80 | loss 3.99 | ppl 54.306 +| epoch 1 step 10800 | 10800 batches | lr 0.000248 | ms/batch 384.05 | loss 3.95 | ppl 51.980 +| epoch 1 step 11000 | 11000 batches | lr 0.000248 | ms/batch 382.48 | loss 3.99 | ppl 54.189 +| epoch 1 step 11200 | 11200 batches | lr 0.000248 | ms/batch 382.43 | loss 3.97 | ppl 52.836 +| epoch 1 step 11400 | 11400 batches | lr 0.000248 | ms/batch 382.62 | loss 3.96 | ppl 52.684 +| epoch 2 step 11600 | 130 batches | lr 0.000248 | ms/batch 384.77 | loss 3.93 | ppl 50.757 +| epoch 2 step 11800 | 330 batches | lr 0.000248 | ms/batch 384.18 | loss 3.89 | ppl 48.921 +| epoch 2 step 12000 | 530 batches | lr 0.000248 | ms/batch 382.18 | loss 3.91 | ppl 49.890 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 1537.95s | valid loss 3.77 | valid ppl 43.379 +---------------------------------------------------------------------------------------------------- +| epoch 
2 step 12200 | 730 batches | lr 0.000248 | ms/batch 426.96 | loss 3.88 | ppl 48.351 +| epoch 2 step 12400 | 930 batches | lr 0.000248 | ms/batch 382.32 | loss 3.88 | ppl 48.358 +| epoch 2 step 12600 | 1130 batches | lr 0.000248 | ms/batch 382.56 | loss 3.90 | ppl 49.504 +| epoch 2 step 12800 | 1330 batches | lr 0.000247 | ms/batch 383.00 | loss 3.87 | ppl 47.881 +| epoch 2 step 13000 | 1530 batches | lr 0.000247 | ms/batch 384.66 | loss 3.86 | ppl 47.436 +| epoch 2 step 13200 | 1730 batches | lr 0.000247 | ms/batch 385.68 | loss 3.85 | ppl 47.200 +| epoch 2 step 13400 | 1930 batches | lr 0.000247 | ms/batch 385.97 | loss 3.86 | ppl 47.400 +| epoch 2 step 13600 | 2130 batches | lr 0.000247 | ms/batch 387.10 | loss 3.88 | ppl 48.414 +| epoch 2 step 13800 | 2330 batches | lr 0.000247 | ms/batch 387.55 | loss 3.85 | ppl 47.186 +| epoch 2 step 14000 | 2530 batches | lr 0.000247 | ms/batch 385.67 | loss 3.84 | ppl 46.648 +| epoch 2 step 14200 | 2730 batches | lr 0.000247 | ms/batch 385.10 | loss 3.82 | ppl 45.693 +| epoch 2 step 14400 | 2930 batches | lr 0.000247 | ms/batch 385.39 | loss 3.81 | ppl 45.134 +| epoch 2 step 14600 | 3130 batches | lr 0.000247 | ms/batch 386.09 | loss 3.82 | ppl 45.500 +| epoch 2 step 14800 | 3330 batches | lr 0.000247 | ms/batch 385.83 | loss 3.82 | ppl 45.721 +| epoch 2 step 15000 | 3530 batches | lr 0.000247 | ms/batch 384.09 | loss 3.78 | ppl 43.946 +| epoch 2 step 15200 | 3730 batches | lr 0.000246 | ms/batch 385.04 | loss 3.81 | ppl 45.324 +| epoch 2 step 15400 | 3930 batches | lr 0.000246 | ms/batch 384.82 | loss 3.81 | ppl 44.927 +| epoch 2 step 15600 | 4130 batches | lr 0.000246 | ms/batch 385.06 | loss 3.79 | ppl 44.331 +| epoch 2 step 15800 | 4330 batches | lr 0.000246 | ms/batch 384.90 | loss 3.80 | ppl 44.771 +| epoch 2 step 16000 | 4530 batches | lr 0.000246 | ms/batch 386.44 | loss 3.80 | ppl 44.784 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 1546.41s | valid loss 3.65 | valid ppl 38.633 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.000246 | ms/batch 429.10 | loss 3.76 | ppl 42.832 +| epoch 2 step 16400 | 4930 batches | lr 0.000246 | ms/batch 386.10 | loss 3.78 | ppl 43.794 +| epoch 2 step 16600 | 5130 batches | lr 0.000246 | ms/batch 386.13 | loss 3.77 | ppl 43.324 +| epoch 2 step 16800 | 5330 batches | lr 0.000246 | ms/batch 385.77 | loss 3.76 | ppl 42.944 +| epoch 2 step 17000 | 5530 batches | lr 0.000246 | ms/batch 384.98 | loss 3.74 | ppl 42.284 +| epoch 2 step 17200 | 5730 batches | lr 0.000245 | ms/batch 384.86 | loss 3.76 | ppl 43.149 +| epoch 2 step 17400 | 5930 batches | lr 0.000245 | ms/batch 385.57 | loss 3.75 | ppl 42.421 +| epoch 2 step 17600 | 6130 batches | lr 0.000245 | ms/batch 385.85 | loss 3.74 | ppl 42.025 +| epoch 2 step 17800 | 6330 batches | lr 0.000245 | ms/batch 386.39 | loss 3.77 | ppl 43.312 +| epoch 2 step 18000 | 6530 batches | lr 0.000245 | ms/batch 386.91 | loss 3.71 | ppl 40.843 +| epoch 2 step 18200 | 6730 batches | lr 0.000245 | ms/batch 385.35 | loss 3.72 | ppl 41.108 +| epoch 2 step 18400 | 6930 batches | lr 0.000245 | ms/batch 383.48 | loss 3.73 | ppl 41.559 +| epoch 2 step 18600 | 7130 batches | lr 0.000245 | ms/batch 383.69 | loss 3.70 | ppl 40.583 +| epoch 2 step 18800 | 7330 batches | lr 0.000245 | ms/batch 382.21 | loss 3.68 | ppl 39.788 +| epoch 2 step 19000 | 7530 batches | lr 0.000244 | ms/batch 382.49 | loss 3.71 | ppl 
40.743 +| epoch 2 step 19200 | 7730 batches | lr 0.000244 | ms/batch 381.98 | loss 3.71 | ppl 40.765 +| epoch 2 step 19400 | 7930 batches | lr 0.000244 | ms/batch 382.74 | loss 3.70 | ppl 40.560 +| epoch 2 step 19600 | 8130 batches | lr 0.000244 | ms/batch 382.31 | loss 3.71 | ppl 41.029 +| epoch 2 step 19800 | 8330 batches | lr 0.000244 | ms/batch 383.90 | loss 3.70 | ppl 40.507 +| epoch 2 step 20000 | 8530 batches | lr 0.000244 | ms/batch 382.56 | loss 3.69 | ppl 40.172 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 | time: 1543.91s | valid loss 3.58 | valid ppl 36.050 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000244 | ms/batch 426.51 | loss 3.71 | ppl 40.844 +| epoch 2 step 20400 | 8930 batches | lr 0.000244 | ms/batch 382.52 | loss 3.71 | ppl 40.678 +| epoch 2 step 20600 | 9130 batches | lr 0.000244 | ms/batch 382.75 | loss 3.70 | ppl 40.294 +| epoch 2 step 20800 | 9330 batches | lr 0.000243 | ms/batch 382.10 | loss 3.69 | ppl 39.944 +| epoch 2 step 21000 | 9530 batches | lr 0.000243 | ms/batch 382.83 | loss 3.73 | ppl 41.725 +| epoch 2 step 21200 | 9730 batches | lr 0.000243 | ms/batch 381.82 | loss 3.68 | ppl 39.593 +| epoch 2 step 21400 | 9930 batches | lr 0.000243 | ms/batch 382.79 | loss 3.69 | ppl 40.048 +| epoch 2 step 21600 | 10130 batches | lr 0.000243 | ms/batch 381.93 | loss 3.68 | ppl 39.454 +| epoch 2 step 21800 | 10330 batches | lr 0.000243 | ms/batch 382.28 | loss 3.68 | ppl 39.787 +| epoch 2 step 22000 | 10530 batches | lr 0.000243 | ms/batch 382.05 | loss 3.70 | ppl 40.356 +| epoch 2 step 22200 | 10730 batches | lr 0.000242 | ms/batch 382.76 | loss 3.66 | ppl 39.021 +| epoch 2 step 22400 | 10930 batches | lr 0.000242 | ms/batch 381.75 | loss 3.66 | ppl 39.049 +| epoch 2 step 22600 | 11130 batches | lr 0.000242 | ms/batch 384.69 | loss 3.71 | ppl 40.838 +| epoch 2 step 22800 | 11330 batches | lr 0.000242 | ms/batch 381.62 | loss 3.67 | ppl 39.428 +| epoch 3 step 23000 | 60 batches | lr 0.000242 | ms/batch 381.30 | loss 3.68 | ppl 39.482 +| epoch 3 step 23200 | 260 batches | lr 0.000242 | ms/batch 382.06 | loss 3.62 | ppl 37.256 +| epoch 3 step 23400 | 460 batches | lr 0.000242 | ms/batch 383.57 | loss 3.66 | ppl 38.850 +| epoch 3 step 23600 | 660 batches | lr 0.000242 | ms/batch 381.67 | loss 3.62 | ppl 37.381 +| epoch 3 step 23800 | 860 batches | lr 0.000241 | ms/batch 383.06 | loss 3.66 | ppl 38.722 +| epoch 3 step 24000 | 1060 batches | lr 0.000241 | ms/batch 382.42 | loss 3.64 | ppl 38.178 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 1535.94s | valid loss 3.54 | valid ppl 34.412 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000241 | ms/batch 426.42 | loss 3.63 | ppl 37.832 +| epoch 3 step 24400 | 1460 batches | lr 0.000241 | ms/batch 383.25 | loss 3.63 | ppl 37.748 +| epoch 3 step 24600 | 1660 batches | lr 0.000241 | ms/batch 382.90 | loss 3.62 | ppl 37.471 +| epoch 3 step 24800 | 1860 batches | lr 0.000241 | ms/batch 382.79 | loss 3.63 | ppl 37.761 +| epoch 3 step 25000 | 2060 batches | lr 0.00024 | ms/batch 383.41 | loss 3.67 | ppl 39.280 +| epoch 3 step 25200 | 2260 batches | lr 0.00024 | ms/batch 382.61 | loss 3.64 | ppl 38.232 +| epoch 3 step 25400 | 2460 batches | lr 0.00024 | ms/batch 382.20 | 
loss 3.63 | ppl 37.701 +| epoch 3 step 25600 | 2660 batches | lr 0.00024 | ms/batch 382.62 | loss 3.63 | ppl 37.828 +| epoch 3 step 25800 | 2860 batches | lr 0.00024 | ms/batch 382.53 | loss 3.58 | ppl 35.716 +| epoch 3 step 26000 | 3060 batches | lr 0.00024 | ms/batch 382.55 | loss 3.63 | ppl 37.634 +| epoch 3 step 26200 | 3260 batches | lr 0.00024 | ms/batch 382.81 | loss 3.62 | ppl 37.520 +| epoch 3 step 26400 | 3460 batches | lr 0.000239 | ms/batch 384.69 | loss 3.59 | ppl 36.219 +| epoch 3 step 26600 | 3660 batches | lr 0.000239 | ms/batch 382.44 | loss 3.60 | ppl 36.700 +| epoch 3 step 26800 | 3860 batches | lr 0.000239 | ms/batch 382.15 | loss 3.61 | ppl 36.900 +| epoch 3 step 27000 | 4060 batches | lr 0.000239 | ms/batch 382.14 | loss 3.62 | ppl 37.292 +| epoch 3 step 27200 | 4260 batches | lr 0.000239 | ms/batch 383.17 | loss 3.61 | ppl 36.796 +| epoch 3 step 27400 | 4460 batches | lr 0.000239 | ms/batch 382.18 | loss 3.61 | ppl 36.903 +| epoch 3 step 27600 | 4660 batches | lr 0.000238 | ms/batch 382.49 | loss 3.60 | ppl 36.548 +| epoch 3 step 27800 | 4860 batches | lr 0.000238 | ms/batch 381.75 | loss 3.59 | ppl 36.199 +| epoch 3 step 28000 | 5060 batches | lr 0.000238 | ms/batch 382.08 | loss 3.60 | ppl 36.657 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 1536.83s | valid loss 3.50 | valid ppl 33.127 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000238 | ms/batch 426.01 | loss 3.58 | ppl 36.005 +| epoch 3 step 28400 | 5460 batches | lr 0.000238 | ms/batch 382.73 | loss 3.56 | ppl 35.230 +| epoch 3 step 28600 | 5660 batches | lr 0.000238 | ms/batch 382.35 | loss 3.61 | ppl 36.999 +| epoch 3 step 28800 | 5860 batches | lr 0.000237 | ms/batch 382.16 | loss 3.58 | ppl 35.999 +| epoch 3 step 29000 | 6060 batches | lr 0.000237 | ms/batch 382.25 | loss 3.58 | ppl 35.815 +| epoch 3 step 29200 | 6260 batches | lr 0.000237 | ms/batch 382.26 | loss 3.58 | ppl 35.851 +| epoch 3 step 29400 | 6460 batches | lr 0.000237 | ms/batch 383.97 | loss 3.59 | ppl 36.178 +| epoch 3 step 29600 | 6660 batches | lr 0.000237 | ms/batch 382.68 | loss 3.54 | ppl 34.313 +| epoch 3 step 29800 | 6860 batches | lr 0.000237 | ms/batch 382.70 | loss 3.57 | ppl 35.428 +| epoch 3 step 30000 | 7060 batches | lr 0.000236 | ms/batch 384.33 | loss 3.56 | ppl 35.112 +| epoch 3 step 30200 | 7260 batches | lr 0.000236 | ms/batch 382.75 | loss 3.53 | ppl 34.109 +| epoch 3 step 30400 | 7460 batches | lr 0.000236 | ms/batch 382.94 | loss 3.55 | ppl 34.943 +| epoch 3 step 30600 | 7660 batches | lr 0.000236 | ms/batch 384.39 | loss 3.54 | ppl 34.438 +| epoch 3 step 30800 | 7860 batches | lr 0.000236 | ms/batch 382.63 | loss 3.55 | ppl 34.942 +| epoch 3 step 31000 | 8060 batches | lr 0.000235 | ms/batch 384.05 | loss 3.56 | ppl 35.184 +| epoch 3 step 31200 | 8260 batches | lr 0.000235 | ms/batch 382.68 | loss 3.55 | ppl 34.799 +| epoch 3 step 31400 | 8460 batches | lr 0.000235 | ms/batch 382.61 | loss 3.56 | ppl 35.170 +| epoch 3 step 31600 | 8660 batches | lr 0.000235 | ms/batch 382.17 | loss 3.56 | ppl 35.065 +| epoch 3 step 31800 | 8860 batches | lr 0.000235 | ms/batch 382.49 | loss 3.56 | ppl 35.131 +| epoch 3 step 32000 | 9060 batches | lr 0.000235 | ms/batch 382.24 | loss 3.56 | ppl 35.142 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 1537.58s | valid 
loss 3.46 | valid ppl 31.818 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000234 | ms/batch 426.15 | loss 3.54 | ppl 34.637 +| epoch 3 step 32400 | 9460 batches | lr 0.000234 | ms/batch 383.26 | loss 3.57 | ppl 35.490 +| epoch 3 step 32600 | 9660 batches | lr 0.000234 | ms/batch 382.25 | loss 3.57 | ppl 35.516 +| epoch 3 step 32800 | 9860 batches | lr 0.000234 | ms/batch 382.36 | loss 3.52 | ppl 33.934 +| epoch 3 step 33000 | 10060 batches | lr 0.000234 | ms/batch 382.17 | loss 3.58 | ppl 35.722 +| epoch 3 step 33200 | 10260 batches | lr 0.000233 | ms/batch 382.47 | loss 3.52 | ppl 33.869 +| epoch 3 step 33400 | 10460 batches | lr 0.000233 | ms/batch 383.24 | loss 3.56 | ppl 35.052 +| epoch 3 step 33600 | 10660 batches | lr 0.000233 | ms/batch 382.21 | loss 3.57 | ppl 35.355 +| epoch 3 step 33800 | 10860 batches | lr 0.000233 | ms/batch 382.50 | loss 3.52 | ppl 33.700 +| epoch 3 step 34000 | 11060 batches | lr 0.000233 | ms/batch 382.55 | loss 3.56 | ppl 35.290 +| epoch 3 step 34200 | 11260 batches | lr 0.000232 | ms/batch 382.62 | loss 3.57 | ppl 35.557 +| epoch 3 step 34400 | 11460 batches | lr 0.000232 | ms/batch 382.65 | loss 3.54 | ppl 34.550 +| epoch 4 step 34600 | 190 batches | lr 0.000232 | ms/batch 381.14 | loss 3.51 | ppl 33.420 +| epoch 4 step 34800 | 390 batches | lr 0.000232 | ms/batch 381.97 | loss 3.52 | ppl 33.787 +| epoch 4 step 35000 | 590 batches | lr 0.000232 | ms/batch 382.60 | loss 3.51 | ppl 33.552 +| epoch 4 step 35200 | 790 batches | lr 0.000231 | ms/batch 385.96 | loss 3.53 | ppl 34.089 +| epoch 4 step 35400 | 990 batches | lr 0.000231 | ms/batch 382.69 | loss 3.51 | ppl 33.374 +| epoch 4 step 35600 | 1190 batches | lr 0.000231 | ms/batch 382.30 | loss 3.53 | ppl 34.051 +| epoch 4 step 35800 | 1390 batches | lr 0.000231 | ms/batch 382.36 | loss 3.52 | ppl 33.694 +| epoch 4 step 36000 | 1590 batches | lr 0.000231 | ms/batch 382.00 | loss 3.51 | ppl 33.320 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 1536.56s | valid loss 3.44 | valid ppl 31.250 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.00023 | ms/batch 426.70 | loss 3.52 | ppl 33.653 +| epoch 4 step 36400 | 1990 batches | lr 0.00023 | ms/batch 382.33 | loss 3.54 | ppl 34.638 +| epoch 4 step 36600 | 2190 batches | lr 0.00023 | ms/batch 383.26 | loss 3.53 | ppl 34.169 +| epoch 4 step 36800 | 2390 batches | lr 0.00023 | ms/batch 382.43 | loss 3.53 | ppl 34.156 +| epoch 4 step 37000 | 2590 batches | lr 0.000229 | ms/batch 383.03 | loss 3.51 | ppl 33.352 +| epoch 4 step 37200 | 2790 batches | lr 0.000229 | ms/batch 382.01 | loss 3.49 | ppl 32.825 +| epoch 4 step 37400 | 2990 batches | lr 0.000229 | ms/batch 382.88 | loss 3.51 | ppl 33.368 +| epoch 4 step 37600 | 3190 batches | lr 0.000229 | ms/batch 382.42 | loss 3.51 | ppl 33.417 +| epoch 4 step 37800 | 3390 batches | lr 0.000229 | ms/batch 382.74 | loss 3.51 | ppl 33.414 +| epoch 4 step 38000 | 3590 batches | lr 0.000228 | ms/batch 381.55 | loss 3.48 | ppl 32.456 +| epoch 4 step 38200 | 3790 batches | lr 0.000228 | ms/batch 386.35 | loss 3.50 | ppl 33.250 +| epoch 4 step 38400 | 3990 batches | lr 0.000228 | ms/batch 382.08 | loss 3.52 | ppl 33.648 +| epoch 4 step 38600 | 4190 batches | lr 0.000228 | ms/batch 382.31 | loss 3.50 | ppl 33.089 +| epoch 4 step 38800 | 4390 batches | 
lr 0.000227 | ms/batch 382.64 | loss 3.50 | ppl 33.248 +| epoch 4 step 39000 | 4590 batches | lr 0.000227 | ms/batch 383.65 | loss 3.52 | ppl 33.624 +| epoch 4 step 39200 | 4790 batches | lr 0.000227 | ms/batch 382.21 | loss 3.47 | ppl 32.242 +| epoch 4 step 39400 | 4990 batches | lr 0.000227 | ms/batch 382.62 | loss 3.52 | ppl 33.868 +| epoch 4 step 39600 | 5190 batches | lr 0.000227 | ms/batch 382.88 | loss 3.48 | ppl 32.418 +| epoch 4 step 39800 | 5390 batches | lr 0.000226 | ms/batch 382.21 | loss 3.46 | ppl 31.803 +| epoch 4 step 40000 | 5590 batches | lr 0.000226 | ms/batch 381.89 | loss 3.48 | ppl 32.611 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 1537.11s | valid loss 3.42 | valid ppl 30.522 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000226 | ms/batch 426.61 | loss 3.50 | ppl 33.271 +| epoch 4 step 40400 | 5990 batches | lr 0.000226 | ms/batch 382.10 | loss 3.48 | ppl 32.384 +| epoch 4 step 40600 | 6190 batches | lr 0.000225 | ms/batch 382.91 | loss 3.48 | ppl 32.374 +| epoch 4 step 40800 | 6390 batches | lr 0.000225 | ms/batch 382.15 | loss 3.51 | ppl 33.374 +| epoch 4 step 41000 | 6590 batches | lr 0.000225 | ms/batch 383.66 | loss 3.44 | ppl 31.217 +| epoch 4 step 41200 | 6790 batches | lr 0.000225 | ms/batch 382.20 | loss 3.47 | ppl 32.031 +| epoch 4 step 41400 | 6990 batches | lr 0.000224 | ms/batch 383.41 | loss 3.48 | ppl 32.533 +| epoch 4 step 41600 | 7190 batches | lr 0.000224 | ms/batch 382.45 | loss 3.43 | ppl 30.920 +| epoch 4 step 41800 | 7390 batches | lr 0.000224 | ms/batch 382.32 | loss 3.46 | ppl 31.829 +| epoch 4 step 42000 | 7590 batches | lr 0.000224 | ms/batch 382.28 | loss 3.44 | ppl 31.101 +| epoch 4 step 42200 | 7790 batches | lr 0.000224 | ms/batch 383.12 | loss 3.47 | ppl 32.066 +| epoch 4 step 42400 | 7990 batches | lr 0.000223 | ms/batch 382.94 | loss 3.47 | ppl 32.038 +| epoch 4 step 42600 | 8190 batches | lr 0.000223 | ms/batch 382.32 | loss 3.45 | ppl 31.633 +| epoch 4 step 42800 | 8390 batches | lr 0.000223 | ms/batch 384.01 | loss 3.48 | ppl 32.533 +| epoch 4 step 43000 | 8590 batches | lr 0.000223 | ms/batch 382.16 | loss 3.46 | ppl 31.763 +| epoch 4 step 43200 | 8790 batches | lr 0.000222 | ms/batch 382.60 | loss 3.48 | ppl 32.401 +| epoch 4 step 43400 | 8990 batches | lr 0.000222 | ms/batch 382.37 | loss 3.47 | ppl 31.981 +| epoch 4 step 43600 | 9190 batches | lr 0.000222 | ms/batch 382.48 | loss 3.46 | ppl 31.690 +| epoch 4 step 43800 | 9390 batches | lr 0.000222 | ms/batch 384.84 | loss 3.47 | ppl 32.016 +| epoch 4 step 44000 | 9590 batches | lr 0.000221 | ms/batch 382.36 | loss 3.49 | ppl 32.684 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 1537.23s | valid loss 3.40 | valid ppl 29.815 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 0.000221 | ms/batch 428.35 | loss 3.46 | ppl 31.782 +| epoch 4 step 44400 | 9990 batches | lr 0.000221 | ms/batch 382.90 | loss 3.46 | ppl 31.814 +| epoch 4 step 44600 | 10190 batches | lr 0.000221 | ms/batch 385.08 | loss 3.45 | ppl 31.522 +| epoch 4 step 44800 | 10390 batches | lr 0.00022 | ms/batch 382.88 | loss 3.45 | ppl 31.641 +| epoch 4 step 45000 | 10590 batches | lr 0.00022 | ms/batch 381.85 | loss 3.49 | ppl 32.665 +| epoch 4 step 
45200 | 10790 batches | lr 0.00022 | ms/batch 382.45 | loss 3.44 | ppl 31.149 +| epoch 4 step 45400 | 10990 batches | lr 0.00022 | ms/batch 382.05 | loss 3.47 | ppl 32.268 +| epoch 4 step 45600 | 11190 batches | lr 0.000219 | ms/batch 382.67 | loss 3.48 | ppl 32.483 +| epoch 4 step 45800 | 11390 batches | lr 0.000219 | ms/batch 383.04 | loss 3.47 | ppl 32.167 +| epoch 5 step 46000 | 120 batches | lr 0.000219 | ms/batch 381.34 | loss 3.45 | ppl 31.375 +| epoch 5 step 46200 | 320 batches | lr 0.000219 | ms/batch 383.01 | loss 3.43 | ppl 30.760 +| epoch 5 step 46400 | 520 batches | lr 0.000218 | ms/batch 382.83 | loss 3.46 | ppl 31.853 +| epoch 5 step 46600 | 720 batches | lr 0.000218 | ms/batch 382.75 | loss 3.42 | ppl 30.716 +| epoch 5 step 46800 | 920 batches | lr 0.000218 | ms/batch 382.52 | loss 3.43 | ppl 30.822 +| epoch 5 step 47000 | 1120 batches | lr 0.000217 | ms/batch 382.63 | loss 3.47 | ppl 32.008 +| epoch 5 step 47200 | 1320 batches | lr 0.000217 | ms/batch 382.45 | loss 3.43 | ppl 30.837 +| epoch 5 step 47400 | 1520 batches | lr 0.000217 | ms/batch 383.05 | loss 3.43 | ppl 31.007 +| epoch 5 step 47600 | 1720 batches | lr 0.000217 | ms/batch 382.51 | loss 3.43 | ppl 30.726 +| epoch 5 step 47800 | 1920 batches | lr 0.000216 | ms/batch 382.05 | loss 3.45 | ppl 31.615 +| epoch 5 step 48000 | 2120 batches | lr 0.000216 | ms/batch 383.67 | loss 3.47 | ppl 32.131 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 1537.36s | valid loss 3.38 | valid ppl 29.286 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 48200 | 2320 batches | lr 0.000216 | ms/batch 426.18 | loss 3.45 | ppl 31.544 +| epoch 5 step 48400 | 2520 batches | lr 0.000216 | ms/batch 382.55 | loss 3.44 | ppl 31.092 +| epoch 5 step 48600 | 2720 batches | lr 0.000215 | ms/batch 383.24 | loss 3.42 | ppl 30.680 +| epoch 5 step 48800 | 2920 batches | lr 0.000215 | ms/batch 382.99 | loss 3.42 | ppl 30.430 +| epoch 5 step 49000 | 3120 batches | lr 0.000215 | ms/batch 382.66 | loss 3.44 | ppl 31.035 +| epoch 5 step 49200 | 3320 batches | lr 0.000214 | ms/batch 383.18 | loss 3.45 | ppl 31.405 +| epoch 5 step 49400 | 3520 batches | lr 0.000214 | ms/batch 382.78 | loss 3.41 | ppl 30.224 +| epoch 5 step 49600 | 3720 batches | lr 0.000214 | ms/batch 382.63 | loss 3.43 | ppl 31.025 +| epoch 5 step 49800 | 3920 batches | lr 0.000214 | ms/batch 382.76 | loss 3.43 | ppl 30.894 +| epoch 5 step 50000 | 4120 batches | lr 0.000213 | ms/batch 382.26 | loss 3.43 | ppl 30.885 +| epoch 5 step 50200 | 4320 batches | lr 0.000213 | ms/batch 382.89 | loss 3.44 | ppl 31.043 +| epoch 5 step 50400 | 4520 batches | lr 0.000213 | ms/batch 384.25 | loss 3.45 | ppl 31.416 +| epoch 5 step 50600 | 4720 batches | lr 0.000213 | ms/batch 382.92 | loss 3.41 | ppl 30.166 +| epoch 5 step 50800 | 4920 batches | lr 0.000212 | ms/batch 382.12 | loss 3.43 | ppl 30.728 +| epoch 5 step 51000 | 5120 batches | lr 0.000212 | ms/batch 382.48 | loss 3.42 | ppl 30.516 +| epoch 5 step 51200 | 5320 batches | lr 0.000212 | ms/batch 382.48 | loss 3.41 | ppl 30.393 +| epoch 5 step 51400 | 5520 batches | lr 0.000211 | ms/batch 383.12 | loss 3.41 | ppl 30.179 +| epoch 5 step 51600 | 5720 batches | lr 0.000211 | ms/batch 382.46 | loss 3.42 | ppl 30.587 +| epoch 5 step 51800 | 5920 batches | lr 0.000211 | ms/batch 382.88 | loss 3.42 | ppl 30.558 +| epoch 5 step 52000 | 6120 batches | lr 0.000211 | ms/batch 382.46 | loss 3.41 | ppl 30.275 
+---------------------------------------------------------------------------------------------------- +| Eval 13 at step 52000 | time: 1537.27s | valid loss 3.37 | valid ppl 29.135 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 52200 | 6320 batches | lr 0.00021 | ms/batch 427.28 | loss 3.44 | ppl 31.060 +| epoch 5 step 52400 | 6520 batches | lr 0.00021 | ms/batch 382.67 | loss 3.38 | ppl 29.347 +| epoch 5 step 52600 | 6720 batches | lr 0.00021 | ms/batch 384.93 | loss 3.39 | ppl 29.540 +| epoch 5 step 52800 | 6920 batches | lr 0.000209 | ms/batch 382.20 | loss 3.41 | ppl 30.174 +| epoch 5 step 53000 | 7120 batches | lr 0.000209 | ms/batch 384.43 | loss 3.40 | ppl 29.817 +| epoch 5 step 53200 | 7320 batches | lr 0.000209 | ms/batch 382.30 | loss 3.36 | ppl 28.910 +| epoch 5 step 53400 | 7520 batches | lr 0.000209 | ms/batch 383.00 | loss 3.39 | ppl 29.792 +| epoch 5 step 53600 | 7720 batches | lr 0.000208 | ms/batch 382.44 | loss 3.39 | ppl 29.660 +| epoch 5 step 53800 | 7920 batches | lr 0.000208 | ms/batch 382.02 | loss 3.39 | ppl 29.703 +| epoch 5 step 54000 | 8120 batches | lr 0.000208 | ms/batch 382.41 | loss 3.40 | ppl 30.079 +| epoch 5 step 54200 | 8320 batches | lr 0.000207 | ms/batch 382.90 | loss 3.40 | ppl 29.826 +| epoch 5 step 54400 | 8520 batches | lr 0.000207 | ms/batch 382.56 | loss 3.39 | ppl 29.573 +| epoch 5 step 54600 | 8720 batches | lr 0.000207 | ms/batch 382.32 | loss 3.40 | ppl 30.113 +| epoch 5 step 54800 | 8920 batches | lr 0.000206 | ms/batch 382.09 | loss 3.41 | ppl 30.261 +| epoch 5 step 55000 | 9120 batches | lr 0.000206 | ms/batch 383.65 | loss 3.40 | ppl 29.949 +| epoch 5 step 55200 | 9320 batches | lr 0.000206 | ms/batch 382.70 | loss 3.39 | ppl 29.722 +| epoch 5 step 55400 | 9520 batches | lr 0.000206 | ms/batch 382.58 | loss 3.42 | ppl 30.640 +| epoch 5 step 55600 | 9720 batches | lr 0.000205 | ms/batch 383.54 | loss 3.39 | ppl 29.772 +| epoch 5 step 55800 | 9920 batches | lr 0.000205 | ms/batch 382.56 | loss 3.40 | ppl 29.829 +| epoch 5 step 56000 | 10120 batches | lr 0.000205 | ms/batch 383.56 | loss 3.39 | ppl 29.737 +---------------------------------------------------------------------------------------------------- +| Eval 14 at step 56000 | time: 1537.89s | valid loss 3.35 | valid ppl 28.430 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 56200 | 10320 batches | lr 0.000204 | ms/batch 429.52 | loss 3.40 | ppl 29.888 +| epoch 5 step 56400 | 10520 batches | lr 0.000204 | ms/batch 383.60 | loss 3.42 | ppl 30.470 +| epoch 5 step 56600 | 10720 batches | lr 0.000204 | ms/batch 382.22 | loss 3.38 | ppl 29.429 +| epoch 5 step 56800 | 10920 batches | lr 0.000203 | ms/batch 383.42 | loss 3.38 | ppl 29.378 +| epoch 5 step 57000 | 11120 batches | lr 0.000203 | ms/batch 382.26 | loss 3.44 | ppl 31.147 +| epoch 5 step 57200 | 11320 batches | lr 0.000203 | ms/batch 382.92 | loss 3.39 | ppl 29.724 +| epoch 6 step 57400 | 50 batches | lr 0.000203 | ms/batch 382.09 | loss 3.41 | ppl 30.289 +| epoch 6 step 57600 | 250 batches | lr 0.000202 | ms/batch 383.62 | loss 3.35 | ppl 28.598 +| epoch 6 step 57800 | 450 batches | lr 0.000202 | ms/batch 382.49 | loss 3.39 | ppl 29.762 +| epoch 6 step 58000 | 650 batches | lr 0.000202 | ms/batch 383.51 | loss 3.36 | ppl 28.802 +| epoch 6 step 58200 | 850 batches | lr 0.000201 | ms/batch 382.50 | loss 3.40 | ppl 29.984 +| epoch 6 step 58400 | 1050 batches | lr 0.000201 | ms/batch 386.57 | loss 
3.37 | ppl 29.208 +| epoch 6 step 58600 | 1250 batches | lr 0.000201 | ms/batch 383.06 | loss 3.37 | ppl 29.214 +| epoch 6 step 58800 | 1450 batches | lr 0.0002 | ms/batch 382.90 | loss 3.38 | ppl 29.414 +| epoch 6 step 59000 | 1650 batches | lr 0.0002 | ms/batch 381.99 | loss 3.36 | ppl 28.865 +| epoch 6 step 59200 | 1850 batches | lr 0.0002 | ms/batch 382.72 | loss 3.38 | ppl 29.336 +| epoch 6 step 59400 | 2050 batches | lr 0.000199 | ms/batch 382.45 | loss 3.42 | ppl 30.590 +| epoch 6 step 59600 | 2250 batches | lr 0.000199 | ms/batch 383.23 | loss 3.39 | ppl 29.581 +| epoch 6 step 59800 | 2450 batches | lr 0.000199 | ms/batch 382.01 | loss 3.39 | ppl 29.554 +| epoch 6 step 60000 | 2650 batches | lr 0.000198 | ms/batch 385.56 | loss 3.39 | ppl 29.556 +---------------------------------------------------------------------------------------------------- +| Eval 15 at step 60000 | time: 1539.02s | valid loss 3.34 | valid ppl 28.124 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 60200 | 2850 batches | lr 0.000198 | ms/batch 427.18 | loss 3.34 | ppl 28.084 +| epoch 6 step 60400 | 3050 batches | lr 0.000198 | ms/batch 382.74 | loss 3.38 | ppl 29.496 +| epoch 6 step 60600 | 3250 batches | lr 0.000198 | ms/batch 382.29 | loss 3.38 | ppl 29.316 +| epoch 6 step 60800 | 3450 batches | lr 0.000197 | ms/batch 383.43 | loss 3.36 | ppl 28.769 +| epoch 6 step 61000 | 3650 batches | lr 0.000197 | ms/batch 382.43 | loss 3.36 | ppl 28.811 +| epoch 6 step 61200 | 3850 batches | lr 0.000197 | ms/batch 383.71 | loss 3.37 | ppl 29.053 +| epoch 6 step 61400 | 4050 batches | lr 0.000196 | ms/batch 383.78 | loss 3.39 | ppl 29.601 +| epoch 6 step 61600 | 4250 batches | lr 0.000196 | ms/batch 382.55 | loss 3.37 | ppl 28.986 +| epoch 6 step 61800 | 4450 batches | lr 0.000196 | ms/batch 384.36 | loss 3.38 | ppl 29.261 +| epoch 6 step 62000 | 4650 batches | lr 0.000195 | ms/batch 382.85 | loss 3.37 | ppl 29.053 +| epoch 6 step 62200 | 4850 batches | lr 0.000195 | ms/batch 382.12 | loss 3.36 | ppl 28.773 +| epoch 6 step 62400 | 5050 batches | lr 0.000195 | ms/batch 382.25 | loss 3.37 | ppl 29.208 +| epoch 6 step 62600 | 5250 batches | lr 0.000194 | ms/batch 382.20 | loss 3.36 | ppl 28.811 +| epoch 6 step 62800 | 5450 batches | lr 0.000194 | ms/batch 383.91 | loss 3.34 | ppl 28.159 +| epoch 6 step 63000 | 5650 batches | lr 0.000194 | ms/batch 385.04 | loss 3.38 | ppl 29.398 +| epoch 6 step 63200 | 5850 batches | lr 0.000193 | ms/batch 381.98 | loss 3.36 | ppl 28.768 +| epoch 6 step 63400 | 6050 batches | lr 0.000193 | ms/batch 383.86 | loss 3.35 | ppl 28.541 +| epoch 6 step 63600 | 6250 batches | lr 0.000193 | ms/batch 383.24 | loss 3.36 | ppl 28.893 +| epoch 6 step 63800 | 6450 batches | lr 0.000192 | ms/batch 384.46 | loss 3.37 | ppl 28.936 +| epoch 6 step 64000 | 6650 batches | lr 0.000192 | ms/batch 383.12 | loss 3.31 | ppl 27.491 +---------------------------------------------------------------------------------------------------- +| Eval 16 at step 64000 | time: 1538.94s | valid loss 3.33 | valid ppl 27.945 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 64200 | 6850 batches | lr 0.000192 | ms/batch 426.87 | loss 3.35 | ppl 28.395 +| epoch 6 step 64400 | 7050 batches | lr 0.000191 | ms/batch 384.04 | loss 3.35 | ppl 28.397 +| epoch 6 step 64600 | 7250 batches | lr 0.000191 | ms/batch 383.26 | loss 3.31 | ppl 27.419 +| epoch 6 step 64800 | 7450 batches | lr 0.000191 | ms/batch 
382.49 | loss 3.34 | ppl 28.186 +| epoch 6 step 65000 | 7650 batches | lr 0.00019 | ms/batch 382.51 | loss 3.32 | ppl 27.650 +| epoch 6 step 65200 | 7850 batches | lr 0.00019 | ms/batch 382.66 | loss 3.34 | ppl 28.265 +| epoch 6 step 65400 | 8050 batches | lr 0.00019 | ms/batch 382.99 | loss 3.35 | ppl 28.415 +| epoch 6 step 65600 | 8250 batches | lr 0.000189 | ms/batch 382.01 | loss 3.33 | ppl 28.063 +| epoch 6 step 65800 | 8450 batches | lr 0.000189 | ms/batch 383.37 | loss 3.35 | ppl 28.493 +| epoch 6 step 66000 | 8650 batches | lr 0.000189 | ms/batch 382.16 | loss 3.34 | ppl 28.161 +| epoch 6 step 66200 | 8850 batches | lr 0.000188 | ms/batch 383.05 | loss 3.36 | ppl 28.722 +| epoch 6 step 66400 | 9050 batches | lr 0.000188 | ms/batch 381.98 | loss 3.35 | ppl 28.462 +| epoch 6 step 66600 | 9250 batches | lr 0.000188 | ms/batch 382.97 | loss 3.33 | ppl 28.032 +| epoch 6 step 66800 | 9450 batches | lr 0.000187 | ms/batch 382.50 | loss 3.35 | ppl 28.632 +| epoch 6 step 67000 | 9650 batches | lr 0.000187 | ms/batch 382.59 | loss 3.37 | ppl 28.996 +| epoch 6 step 67200 | 9850 batches | lr 0.000187 | ms/batch 382.80 | loss 3.32 | ppl 27.543 +| epoch 6 step 67400 | 10050 batches | lr 0.000186 | ms/batch 382.34 | loss 3.36 | ppl 28.905 +| epoch 6 step 67600 | 10250 batches | lr 0.000186 | ms/batch 383.19 | loss 3.32 | ppl 27.730 +| epoch 6 step 67800 | 10450 batches | lr 0.000186 | ms/batch 382.78 | loss 3.35 | ppl 28.489 +| epoch 6 step 68000 | 10650 batches | lr 0.000185 | ms/batch 382.85 | loss 3.37 | ppl 28.941 +---------------------------------------------------------------------------------------------------- +| Eval 17 at step 68000 | time: 1537.35s | valid loss 3.32 | valid ppl 27.546 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 68200 | 10850 batches | lr 0.000185 | ms/batch 426.77 | loss 3.31 | ppl 27.487 +| epoch 6 step 68400 | 11050 batches | lr 0.000185 | ms/batch 382.33 | loss 3.36 | ppl 28.856 +| epoch 6 step 68600 | 11250 batches | lr 0.000184 | ms/batch 383.02 | loss 3.37 | ppl 29.210 +| epoch 6 step 68800 | 11450 batches | lr 0.000184 | ms/batch 382.50 | loss 3.34 | ppl 28.198 +| epoch 7 step 69000 | 180 batches | lr 0.000183 | ms/batch 382.69 | loss 3.32 | ppl 27.723 +| epoch 7 step 69200 | 380 batches | lr 0.000183 | ms/batch 382.53 | loss 3.32 | ppl 27.754 +| epoch 7 step 69400 | 580 batches | lr 0.000183 | ms/batch 383.34 | loss 3.32 | ppl 27.786 +| epoch 7 step 69600 | 780 batches | lr 0.000182 | ms/batch 382.77 | loss 3.33 | ppl 28.006 +| epoch 7 step 69800 | 980 batches | lr 0.000182 | ms/batch 385.85 | loss 3.31 | ppl 27.419 +| epoch 7 step 70000 | 1180 batches | lr 0.000182 | ms/batch 382.26 | loss 3.34 | ppl 28.337 +| epoch 7 step 70200 | 1380 batches | lr 0.000181 | ms/batch 381.99 | loss 3.32 | ppl 27.696 +| epoch 7 step 70400 | 1580 batches | lr 0.000181 | ms/batch 382.65 | loss 3.32 | ppl 27.663 +| epoch 7 step 70600 | 1780 batches | lr 0.000181 | ms/batch 383.32 | loss 3.32 | ppl 27.705 +| epoch 7 step 70800 | 1980 batches | lr 0.00018 | ms/batch 383.40 | loss 3.35 | ppl 28.606 +| epoch 7 step 71000 | 2180 batches | lr 0.00018 | ms/batch 382.11 | loss 3.34 | ppl 28.329 +| epoch 7 step 71200 | 2380 batches | lr 0.00018 | ms/batch 384.90 | loss 3.34 | ppl 28.226 +| epoch 7 step 71400 | 2580 batches | lr 0.000179 | ms/batch 383.90 | loss 3.33 | ppl 27.848 +| epoch 7 step 71600 | 2780 batches | lr 0.000179 | ms/batch 382.26 | loss 3.31 | ppl 27.291 +| epoch 7 step 71800 | 2980 batches | lr 
0.000179 | ms/batch 382.65 | loss 3.32 | ppl 27.616 +| epoch 7 step 72000 | 3180 batches | lr 0.000178 | ms/batch 383.18 | loss 3.33 | ppl 28.000 +---------------------------------------------------------------------------------------------------- +| Eval 18 at step 72000 | time: 1538.28s | valid loss 3.30 | valid ppl 27.248 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 72200 | 3380 batches | lr 0.000178 | ms/batch 425.93 | loss 3.33 | ppl 27.861 +| epoch 7 step 72400 | 3580 batches | lr 0.000178 | ms/batch 382.87 | loss 3.30 | ppl 27.166 +| epoch 7 step 72600 | 3780 batches | lr 0.000177 | ms/batch 382.93 | loss 3.32 | ppl 27.592 +| epoch 7 step 72800 | 3980 batches | lr 0.000177 | ms/batch 383.39 | loss 3.33 | ppl 27.882 +| epoch 7 step 73000 | 4180 batches | lr 0.000176 | ms/batch 382.71 | loss 3.32 | ppl 27.750 +| epoch 7 step 73200 | 4380 batches | lr 0.000176 | ms/batch 382.81 | loss 3.32 | ppl 27.778 +| epoch 7 step 73400 | 4580 batches | lr 0.000176 | ms/batch 383.26 | loss 3.34 | ppl 28.229 +| epoch 7 step 73600 | 4780 batches | lr 0.000175 | ms/batch 382.44 | loss 3.30 | ppl 27.014 +| epoch 7 step 73800 | 4980 batches | lr 0.000175 | ms/batch 382.82 | loss 3.34 | ppl 28.153 +| epoch 7 step 74000 | 5180 batches | lr 0.000175 | ms/batch 384.51 | loss 3.31 | ppl 27.294 +| epoch 7 step 74200 | 5380 batches | lr 0.000174 | ms/batch 382.19 | loss 3.28 | ppl 26.677 +| epoch 7 step 74400 | 5580 batches | lr 0.000174 | ms/batch 382.97 | loss 3.31 | ppl 27.304 +| epoch 7 step 74600 | 5780 batches | lr 0.000174 | ms/batch 382.61 | loss 3.33 | ppl 27.918 +| epoch 7 step 74800 | 5980 batches | lr 0.000173 | ms/batch 384.75 | loss 3.30 | ppl 27.162 +| epoch 7 step 75000 | 6180 batches | lr 0.000173 | ms/batch 382.19 | loss 3.30 | ppl 27.189 +| epoch 7 step 75200 | 6380 batches | lr 0.000172 | ms/batch 382.48 | loss 3.34 | ppl 28.110 +| epoch 7 step 75400 | 6580 batches | lr 0.000172 | ms/batch 384.47 | loss 3.26 | ppl 26.103 +| epoch 7 step 75600 | 6780 batches | lr 0.000172 | ms/batch 382.06 | loss 3.29 | ppl 26.928 +| epoch 7 step 75800 | 6980 batches | lr 0.000171 | ms/batch 382.02 | loss 3.31 | ppl 27.354 +| epoch 7 step 76000 | 7180 batches | lr 0.000171 | ms/batch 382.19 | loss 3.26 | ppl 26.088 +---------------------------------------------------------------------------------------------------- +| Eval 19 at step 76000 | time: 1537.77s | valid loss 3.30 | valid ppl 27.007 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 76200 | 7380 batches | lr 0.000171 | ms/batch 426.04 | loss 3.29 | ppl 26.797 +| epoch 7 step 76400 | 7580 batches | lr 0.00017 | ms/batch 382.30 | loss 3.26 | ppl 26.136 +| epoch 7 step 76600 | 7780 batches | lr 0.00017 | ms/batch 382.02 | loss 3.30 | ppl 27.056 +| epoch 7 step 76800 | 7980 batches | lr 0.00017 | ms/batch 382.37 | loss 3.30 | ppl 27.002 +| epoch 7 step 77000 | 8180 batches | lr 0.000169 | ms/batch 381.93 | loss 3.28 | ppl 26.581 +| epoch 7 step 77200 | 8380 batches | lr 0.000169 | ms/batch 382.07 | loss 3.31 | ppl 27.477 +| epoch 7 step 77400 | 8580 batches | lr 0.000168 | ms/batch 382.05 | loss 3.29 | ppl 26.873 +| epoch 7 step 77600 | 8780 batches | lr 0.000168 | ms/batch 382.22 | loss 3.30 | ppl 27.165 +| epoch 7 step 77800 | 8980 batches | lr 0.000168 | ms/batch 381.94 | loss 3.30 | ppl 27.157 +| epoch 7 step 78000 | 9180 batches | lr 0.000167 | ms/batch 382.32 | loss 3.28 | ppl 26.666 +| epoch 7 step 78200 | 
9380 batches | lr 0.000167 | ms/batch 382.20 | loss 3.30 | ppl 27.120 +| epoch 7 step 78400 | 9580 batches | lr 0.000167 | ms/batch 384.94 | loss 3.32 | ppl 27.624 +| epoch 7 step 78600 | 9780 batches | lr 0.000166 | ms/batch 382.60 | loss 3.29 | ppl 26.882 +| epoch 7 step 78800 | 9980 batches | lr 0.000166 | ms/batch 382.96 | loss 3.29 | ppl 26.881 +| epoch 7 step 79000 | 10180 batches | lr 0.000165 | ms/batch 382.31 | loss 3.28 | ppl 26.599 +| epoch 7 step 79200 | 10380 batches | lr 0.000165 | ms/batch 382.49 | loss 3.30 | ppl 26.981 +| epoch 7 step 79400 | 10580 batches | lr 0.000165 | ms/batch 381.98 | loss 3.32 | ppl 27.616 +| epoch 7 step 79600 | 10780 batches | lr 0.000164 | ms/batch 382.74 | loss 3.28 | ppl 26.452 +| epoch 7 step 79800 | 10980 batches | lr 0.000164 | ms/batch 382.19 | loss 3.30 | ppl 27.073 +| epoch 7 step 80000 | 11180 batches | lr 0.000164 | ms/batch 382.42 | loss 3.32 | ppl 27.720 +---------------------------------------------------------------------------------------------------- +| Eval 20 at step 80000 | time: 1535.91s | valid loss 3.29 | valid ppl 26.801 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 80200 | 11380 batches | lr 0.000163 | ms/batch 426.32 | loss 3.31 | ppl 27.251 +| epoch 8 step 80400 | 110 batches | lr 0.000163 | ms/batch 381.08 | loss 3.29 | ppl 26.710 +| epoch 8 step 80600 | 310 batches | lr 0.000163 | ms/batch 382.69 | loss 3.27 | ppl 26.275 +| epoch 8 step 80800 | 510 batches | lr 0.000162 | ms/batch 382.14 | loss 3.30 | ppl 27.200 +| epoch 8 step 81000 | 710 batches | lr 0.000162 | ms/batch 382.38 | loss 3.26 | ppl 26.123 +| epoch 8 step 81200 | 910 batches | lr 0.000161 | ms/batch 381.93 | loss 3.27 | ppl 26.392 +| epoch 8 step 81400 | 1110 batches | lr 0.000161 | ms/batch 382.53 | loss 3.30 | ppl 27.145 +| epoch 8 step 81600 | 1310 batches | lr 0.000161 | ms/batch 382.13 | loss 3.27 | ppl 26.432 +| epoch 8 step 81800 | 1510 batches | lr 0.00016 | ms/batch 382.22 | loss 3.28 | ppl 26.450 +| epoch 8 step 82000 | 1710 batches | lr 0.00016 | ms/batch 382.63 | loss 3.26 | ppl 26.073 +| epoch 8 step 82200 | 1910 batches | lr 0.000159 | ms/batch 384.42 | loss 3.30 | ppl 27.082 +| epoch 8 step 82400 | 2110 batches | lr 0.000159 | ms/batch 382.36 | loss 3.32 | ppl 27.564 +| epoch 8 step 82600 | 2310 batches | lr 0.000159 | ms/batch 382.85 | loss 3.30 | ppl 26.997 +| epoch 8 step 82800 | 2510 batches | lr 0.000158 | ms/batch 382.56 | loss 3.28 | ppl 26.548 +| epoch 8 step 83000 | 2710 batches | lr 0.000158 | ms/batch 383.18 | loss 3.27 | ppl 26.416 +| epoch 8 step 83200 | 2910 batches | lr 0.000158 | ms/batch 382.57 | loss 3.25 | ppl 25.839 +| epoch 8 step 83400 | 3110 batches | lr 0.000157 | ms/batch 383.07 | loss 3.28 | ppl 26.580 +| epoch 8 step 83600 | 3310 batches | lr 0.000157 | ms/batch 382.96 | loss 3.30 | ppl 27.031 +| epoch 8 step 83800 | 3510 batches | lr 0.000156 | ms/batch 382.14 | loss 3.26 | ppl 25.985 +| epoch 8 step 84000 | 3710 batches | lr 0.000156 | ms/batch 382.44 | loss 3.28 | ppl 26.556 +---------------------------------------------------------------------------------------------------- +| Eval 21 at step 84000 | time: 1536.38s | valid loss 3.28 | valid ppl 26.596 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 84200 | 3910 batches | lr 0.000156 | ms/batch 426.73 | loss 3.27 | ppl 26.340 +| epoch 8 step 84400 | 4110 batches | lr 0.000155 | ms/batch 383.01 | loss 3.28 | ppl 26.661 +| 
epoch 8 step 84600 | 4310 batches | lr 0.000155 | ms/batch 382.82 | loss 3.28 | ppl 26.601 +| epoch 8 step 84800 | 4510 batches | lr 0.000155 | ms/batch 382.43 | loss 3.30 | ppl 27.018 +| epoch 8 step 85000 | 4710 batches | lr 0.000154 | ms/batch 382.14 | loss 3.25 | ppl 25.913 +| epoch 8 step 85200 | 4910 batches | lr 0.000154 | ms/batch 382.26 | loss 3.27 | ppl 26.342 +| epoch 8 step 85400 | 5110 batches | lr 0.000153 | ms/batch 382.40 | loss 3.27 | ppl 26.318 +| epoch 8 step 85600 | 5310 batches | lr 0.000153 | ms/batch 382.15 | loss 3.26 | ppl 26.005 +| epoch 8 step 85800 | 5510 batches | lr 0.000153 | ms/batch 382.10 | loss 3.26 | ppl 26.088 +| epoch 8 step 86000 | 5710 batches | lr 0.000152 | ms/batch 382.38 | loss 3.26 | ppl 26.174 +| epoch 8 step 86200 | 5910 batches | lr 0.000152 | ms/batch 382.06 | loss 3.27 | ppl 26.388 +| epoch 8 step 86400 | 6110 batches | lr 0.000152 | ms/batch 382.67 | loss 3.27 | ppl 26.188 +| epoch 8 step 86600 | 6310 batches | lr 0.000151 | ms/batch 382.05 | loss 3.28 | ppl 26.641 +| epoch 8 step 86800 | 6510 batches | lr 0.000151 | ms/batch 382.46 | loss 3.23 | ppl 25.326 +| epoch 8 step 87000 | 6710 batches | lr 0.00015 | ms/batch 382.15 | loss 3.24 | ppl 25.460 +| epoch 8 step 87200 | 6910 batches | lr 0.00015 | ms/batch 382.31 | loss 3.26 | ppl 25.930 +| epoch 8 step 87400 | 7110 batches | lr 0.00015 | ms/batch 382.02 | loss 3.25 | ppl 25.772 +| epoch 8 step 87600 | 7310 batches | lr 0.000149 | ms/batch 382.39 | loss 3.21 | ppl 24.844 +| epoch 8 step 87800 | 7510 batches | lr 0.000149 | ms/batch 381.83 | loss 3.25 | ppl 25.800 +| epoch 8 step 88000 | 7710 batches | lr 0.000148 | ms/batch 382.25 | loss 3.24 | ppl 25.514 +---------------------------------------------------------------------------------------------------- +| Eval 22 at step 88000 | time: 1535.57s | valid loss 3.27 | valid ppl 26.318 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 88200 | 7910 batches | lr 0.000148 | ms/batch 428.61 | loss 3.24 | ppl 25.613 +| epoch 8 step 88400 | 8110 batches | lr 0.000148 | ms/batch 384.30 | loss 3.25 | ppl 25.863 +| epoch 8 step 88600 | 8310 batches | lr 0.000147 | ms/batch 382.44 | loss 3.25 | ppl 25.698 +| epoch 8 step 88800 | 8510 batches | lr 0.000147 | ms/batch 383.09 | loss 3.24 | ppl 25.631 +| epoch 8 step 89000 | 8710 batches | lr 0.000146 | ms/batch 382.43 | loss 3.26 | ppl 26.027 +| epoch 8 step 89200 | 8910 batches | lr 0.000146 | ms/batch 382.16 | loss 3.26 | ppl 25.968 +| epoch 8 step 89400 | 9110 batches | lr 0.000146 | ms/batch 383.10 | loss 3.26 | ppl 26.008 +| epoch 8 step 89600 | 9310 batches | lr 0.000145 | ms/batch 382.52 | loss 3.24 | ppl 25.563 +| epoch 8 step 89800 | 9510 batches | lr 0.000145 | ms/batch 382.29 | loss 3.27 | ppl 26.341 +| epoch 8 step 90000 | 9710 batches | lr 0.000145 | ms/batch 382.88 | loss 3.25 | ppl 25.798 +| epoch 8 step 90200 | 9910 batches | lr 0.000144 | ms/batch 383.02 | loss 3.24 | ppl 25.588 +| epoch 8 step 90400 | 10110 batches | lr 0.000144 | ms/batch 382.30 | loss 3.25 | ppl 25.882 +| epoch 8 step 90600 | 10310 batches | lr 0.000143 | ms/batch 382.20 | loss 3.25 | ppl 25.703 +| epoch 8 step 90800 | 10510 batches | lr 0.000143 | ms/batch 382.03 | loss 3.27 | ppl 26.421 +| epoch 8 step 91000 | 10710 batches | lr 0.000143 | ms/batch 382.76 | loss 3.24 | ppl 25.531 +| epoch 8 step 91200 | 10910 batches | lr 0.000142 | ms/batch 382.12 | loss 3.23 | ppl 25.348 +| epoch 8 step 91400 | 11110 batches | lr 0.000142 | ms/batch 382.21 | loss 
3.29 | ppl 26.919 +| epoch 8 step 91600 | 11310 batches | lr 0.000141 | ms/batch 382.14 | loss 3.25 | ppl 25.882 +| epoch 9 step 91800 | 40 batches | lr 0.000141 | ms/batch 382.47 | loss 3.27 | ppl 26.230 +| epoch 9 step 92000 | 240 batches | lr 0.000141 | ms/batch 382.51 | loss 3.21 | ppl 24.853 +---------------------------------------------------------------------------------------------------- +| Eval 23 at step 92000 | time: 1536.94s | valid loss 3.27 | valid ppl 26.218 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 92200 | 440 batches | lr 0.00014 | ms/batch 428.15 | loss 3.25 | ppl 25.837 +| epoch 9 step 92400 | 640 batches | lr 0.00014 | ms/batch 382.43 | loss 3.22 | ppl 25.062 +| epoch 9 step 92600 | 840 batches | lr 0.000139 | ms/batch 382.40 | loss 3.26 | ppl 26.170 +| epoch 9 step 92800 | 1040 batches | lr 0.000139 | ms/batch 382.80 | loss 3.23 | ppl 25.183 +| epoch 9 step 93000 | 1240 batches | lr 0.000139 | ms/batch 382.69 | loss 3.24 | ppl 25.433 +| epoch 9 step 93200 | 1440 batches | lr 0.000138 | ms/batch 382.44 | loss 3.25 | ppl 25.668 +| epoch 9 step 93400 | 1640 batches | lr 0.000138 | ms/batch 382.71 | loss 3.22 | ppl 24.999 +| epoch 9 step 93600 | 1840 batches | lr 0.000138 | ms/batch 382.20 | loss 3.24 | ppl 25.529 +| epoch 9 step 93800 | 2040 batches | lr 0.000137 | ms/batch 382.68 | loss 3.28 | ppl 26.591 +| epoch 9 step 94000 | 2240 batches | lr 0.000137 | ms/batch 382.11 | loss 3.25 | ppl 25.717 +| epoch 9 step 94200 | 2440 batches | lr 0.000136 | ms/batch 382.20 | loss 3.25 | ppl 25.779 +| epoch 9 step 94400 | 2640 batches | lr 0.000136 | ms/batch 382.68 | loss 3.24 | ppl 25.650 +| epoch 9 step 94600 | 2840 batches | lr 0.000136 | ms/batch 382.16 | loss 3.20 | ppl 24.565 +| epoch 9 step 94800 | 3040 batches | lr 0.000135 | ms/batch 382.20 | loss 3.25 | ppl 25.666 +| epoch 9 step 95000 | 3240 batches | lr 0.000135 | ms/batch 382.37 | loss 3.24 | ppl 25.475 +| epoch 9 step 95200 | 3440 batches | lr 0.000134 | ms/batch 384.41 | loss 3.23 | ppl 25.172 +| epoch 9 step 95400 | 3640 batches | lr 0.000134 | ms/batch 382.59 | loss 3.22 | ppl 25.074 +| epoch 9 step 95600 | 3840 batches | lr 0.000134 | ms/batch 382.09 | loss 3.24 | ppl 25.433 +| epoch 9 step 95800 | 4040 batches | lr 0.000133 | ms/batch 382.85 | loss 3.25 | ppl 25.792 +| epoch 9 step 96000 | 4240 batches | lr 0.000133 | ms/batch 381.98 | loss 3.23 | ppl 25.300 +---------------------------------------------------------------------------------------------------- +| Eval 24 at step 96000 | time: 1536.74s | valid loss 3.26 | valid ppl 25.985 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 96200 | 4440 batches | lr 0.000132 | ms/batch 426.34 | loss 3.24 | ppl 25.442 +| epoch 9 step 96400 | 4640 batches | lr 0.000132 | ms/batch 384.43 | loss 3.23 | ppl 25.346 +| epoch 9 step 96600 | 4840 batches | lr 0.000132 | ms/batch 382.04 | loss 3.22 | ppl 25.046 +| epoch 9 step 96800 | 5040 batches | lr 0.000131 | ms/batch 383.09 | loss 3.24 | ppl 25.583 +| epoch 9 step 97000 | 5240 batches | lr 0.000131 | ms/batch 382.36 | loss 3.23 | ppl 25.241 +| epoch 9 step 97200 | 5440 batches | lr 0.00013 | ms/batch 382.39 | loss 3.20 | ppl 24.466 +| epoch 9 step 97400 | 5640 batches | lr 0.00013 | ms/batch 382.22 | loss 3.24 | ppl 25.589 +| epoch 9 step 97600 | 5840 batches | lr 0.00013 | ms/batch 384.87 | loss 3.23 | ppl 25.329 +| epoch 9 step 97800 | 6040 batches | lr 0.000129 | ms/batch 
382.09 | loss 3.21 | ppl 24.792 +| epoch 9 step 98000 | 6240 batches | lr 0.000129 | ms/batch 382.24 | loss 3.23 | ppl 25.197 +| epoch 9 step 98200 | 6440 batches | lr 0.000129 | ms/batch 384.08 | loss 3.23 | ppl 25.386 +| epoch 9 step 98400 | 6640 batches | lr 0.000128 | ms/batch 384.03 | loss 3.18 | ppl 24.057 +| epoch 9 step 98600 | 6840 batches | lr 0.000128 | ms/batch 382.74 | loss 3.21 | ppl 24.797 +| epoch 9 step 98800 | 7040 batches | lr 0.000127 | ms/batch 382.19 | loss 3.22 | ppl 24.906 +| epoch 9 step 99000 | 7240 batches | lr 0.000127 | ms/batch 382.54 | loss 3.18 | ppl 24.052 +| epoch 9 step 99200 | 7440 batches | lr 0.000127 | ms/batch 382.03 | loss 3.20 | ppl 24.555 +| epoch 9 step 99400 | 7640 batches | lr 0.000126 | ms/batch 382.21 | loss 3.18 | ppl 24.134 +| epoch 9 step 99600 | 7840 batches | lr 0.000126 | ms/batch 382.21 | loss 3.21 | ppl 24.800 +| epoch 9 step 99800 | 8040 batches | lr 0.000125 | ms/batch 382.39 | loss 3.21 | ppl 24.779 +| epoch 9 step 100000 | 8240 batches | lr 0.000125 | ms/batch 382.26 | loss 3.20 | ppl 24.531 +---------------------------------------------------------------------------------------------------- +| Eval 25 at step 100000 | time: 1537.21s | valid loss 3.25 | valid ppl 25.840 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 100200 | 8440 batches | lr 0.000125 | ms/batch 427.57 | loss 3.22 | ppl 24.958 +| epoch 9 step 100400 | 8640 batches | lr 0.000124 | ms/batch 382.27 | loss 3.20 | ppl 24.578 +| epoch 9 step 100600 | 8840 batches | lr 0.000124 | ms/batch 382.52 | loss 3.23 | ppl 25.217 +| epoch 9 step 100800 | 9040 batches | lr 0.000123 | ms/batch 382.37 | loss 3.22 | ppl 24.969 +| epoch 9 step 101000 | 9240 batches | lr 0.000123 | ms/batch 382.24 | loss 3.20 | ppl 24.417 +| epoch 9 step 101200 | 9440 batches | lr 0.000123 | ms/batch 382.79 | loss 3.22 | ppl 25.039 +| epoch 9 step 101400 | 9640 batches | lr 0.000122 | ms/batch 382.67 | loss 3.24 | ppl 25.415 +| epoch 9 step 101600 | 9840 batches | lr 0.000122 | ms/batch 382.45 | loss 3.19 | ppl 24.174 +| epoch 9 step 101800 | 10040 batches | lr 0.000121 | ms/batch 382.08 | loss 3.22 | ppl 25.102 +| epoch 9 step 102000 | 10240 batches | lr 0.000121 | ms/batch 383.61 | loss 3.20 | ppl 24.451 +| epoch 9 step 102200 | 10440 batches | lr 0.000121 | ms/batch 382.06 | loss 3.22 | ppl 24.923 +| epoch 9 step 102400 | 10640 batches | lr 0.00012 | ms/batch 382.37 | loss 3.24 | ppl 25.448 +| epoch 9 step 102600 | 10840 batches | lr 0.00012 | ms/batch 382.39 | loss 3.18 | ppl 23.979 +| epoch 9 step 102800 | 11040 batches | lr 0.00012 | ms/batch 382.32 | loss 3.24 | ppl 25.423 +| epoch 9 step 103000 | 11240 batches | lr 0.000119 | ms/batch 383.03 | loss 3.24 | ppl 25.534 +| epoch 9 step 103200 | 11440 batches | lr 0.000119 | ms/batch 382.33 | loss 3.21 | ppl 24.815 +| epoch 10 step 103400 | 170 batches | lr 0.000118 | ms/batch 381.61 | loss 3.20 | ppl 24.481 +| epoch 10 step 103600 | 370 batches | lr 0.000118 | ms/batch 383.21 | loss 3.19 | ppl 24.264 +| epoch 10 step 103800 | 570 batches | lr 0.000118 | ms/batch 382.43 | loss 3.20 | ppl 24.604 +| epoch 10 step 104000 | 770 batches | lr 0.000117 | ms/batch 382.42 | loss 3.20 | ppl 24.608 +---------------------------------------------------------------------------------------------------- +| Eval 26 at step 104000 | time: 1536.35s | valid loss 3.24 | valid ppl 25.656 +---------------------------------------------------------------------------------------------------- +| epoch 10 
step 104200 | 970 batches | lr 0.000117 | ms/batch 428.66 | loss 3.18 | ppl 24.059 +| epoch 10 step 104400 | 1170 batches | lr 0.000116 | ms/batch 382.64 | loss 3.22 | ppl 24.956 +| epoch 10 step 104600 | 1370 batches | lr 0.000116 | ms/batch 382.09 | loss 3.19 | ppl 24.344 +| epoch 10 step 104800 | 1570 batches | lr 0.000116 | ms/batch 382.32 | loss 3.19 | ppl 24.285 +| epoch 10 step 105000 | 1770 batches | lr 0.000115 | ms/batch 382.47 | loss 3.19 | ppl 24.407 +| epoch 10 step 105200 | 1970 batches | lr 0.000115 | ms/batch 382.28 | loss 3.22 | ppl 25.101 +| epoch 10 step 105400 | 2170 batches | lr 0.000114 | ms/batch 382.16 | loss 3.22 | ppl 24.958 +| epoch 10 step 105600 | 2370 batches | lr 0.000114 | ms/batch 382.82 | loss 3.21 | ppl 24.760 +| epoch 10 step 105800 | 2570 batches | lr 0.000114 | ms/batch 382.65 | loss 3.20 | ppl 24.606 +| epoch 10 step 106000 | 2770 batches | lr 0.000113 | ms/batch 383.22 | loss 3.18 | ppl 24.045 +| epoch 10 step 106200 | 2970 batches | lr 0.000113 | ms/batch 382.13 | loss 3.19 | ppl 24.269 +| epoch 10 step 106400 | 3170 batches | lr 0.000112 | ms/batch 382.19 | loss 3.21 | ppl 24.703 +| epoch 10 step 106600 | 3370 batches | lr 0.000112 | ms/batch 381.99 | loss 3.20 | ppl 24.587 +| epoch 10 step 106800 | 3570 batches | lr 0.000112 | ms/batch 381.93 | loss 3.18 | ppl 23.994 +| epoch 10 step 107000 | 3770 batches | lr 0.000111 | ms/batch 382.52 | loss 3.19 | ppl 24.305 +| epoch 10 step 107200 | 3970 batches | lr 0.000111 | ms/batch 382.40 | loss 3.20 | ppl 24.528 +| epoch 10 step 107400 | 4170 batches | lr 0.000111 | ms/batch 382.31 | loss 3.19 | ppl 24.408 +| epoch 10 step 107600 | 4370 batches | lr 0.00011 | ms/batch 382.60 | loss 3.20 | ppl 24.599 +| epoch 10 step 107800 | 4570 batches | lr 0.00011 | ms/batch 382.24 | loss 3.21 | ppl 24.863 +| epoch 10 step 108000 | 4770 batches | lr 0.000109 | ms/batch 382.13 | loss 3.17 | ppl 23.782 +---------------------------------------------------------------------------------------------------- +| Eval 27 at step 108000 | time: 1536.23s | valid loss 3.23 | valid ppl 25.255 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 108200 | 4970 batches | lr 0.000109 | ms/batch 426.28 | loss 3.21 | ppl 24.763 +| epoch 10 step 108400 | 5170 batches | lr 0.000109 | ms/batch 382.29 | loss 3.19 | ppl 24.200 +| epoch 10 step 108600 | 5370 batches | lr 0.000108 | ms/batch 382.26 | loss 3.16 | ppl 23.645 +| epoch 10 step 108800 | 5570 batches | lr 0.000108 | ms/batch 382.46 | loss 3.18 | ppl 24.039 +| epoch 10 step 109000 | 5770 batches | lr 0.000107 | ms/batch 383.71 | loss 3.20 | ppl 24.615 +| epoch 10 step 109200 | 5970 batches | lr 0.000107 | ms/batch 382.20 | loss 3.18 | ppl 24.018 +| epoch 10 step 109400 | 6170 batches | lr 0.000107 | ms/batch 382.78 | loss 3.18 | ppl 23.980 +| epoch 10 step 109600 | 6370 batches | lr 0.000106 | ms/batch 382.05 | loss 3.22 | ppl 25.013 +| epoch 10 step 109800 | 6570 batches | lr 0.000106 | ms/batch 382.35 | loss 3.13 | ppl 22.954 +| epoch 10 step 110000 | 6770 batches | lr 0.000105 | ms/batch 382.09 | loss 3.17 | ppl 23.779 +| epoch 10 step 110200 | 6970 batches | lr 0.000105 | ms/batch 382.41 | loss 3.18 | ppl 24.146 +| epoch 10 step 110400 | 7170 batches | lr 0.000105 | ms/batch 381.99 | loss 3.14 | ppl 23.079 +| epoch 10 step 110600 | 7370 batches | lr 0.000104 | ms/batch 382.25 | loss 3.17 | ppl 23.727 +| epoch 10 step 110800 | 7570 batches | lr 0.000104 | ms/batch 381.90 | loss 3.14 | ppl 23.090 +| epoch 10 step 
111000 | 7770 batches | lr 0.000104 | ms/batch 382.75 | loss 3.18 | ppl 24.008 +| epoch 10 step 111200 | 7970 batches | lr 0.000103 | ms/batch 382.33 | loss 3.17 | ppl 23.716 +| epoch 10 step 111400 | 8170 batches | lr 0.000103 | ms/batch 382.39 | loss 3.16 | ppl 23.509 +| epoch 10 step 111600 | 8370 batches | lr 0.000102 | ms/batch 382.05 | loss 3.19 | ppl 24.226 +| epoch 10 step 111800 | 8570 batches | lr 0.000102 | ms/batch 382.85 | loss 3.17 | ppl 23.716 +| epoch 10 step 112000 | 8770 batches | lr 0.000102 | ms/batch 382.42 | loss 3.18 | ppl 23.938 +---------------------------------------------------------------------------------------------------- +| Eval 28 at step 112000 | time: 1535.84s | valid loss 3.23 | valid ppl 25.189 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 112200 | 8970 batches | lr 0.000101 | ms/batch 426.52 | loss 3.18 | ppl 24.127 +| epoch 10 step 112400 | 9170 batches | lr 0.000101 | ms/batch 383.30 | loss 3.16 | ppl 23.619 +| epoch 10 step 112600 | 9370 batches | lr 0.0001 | ms/batch 382.14 | loss 3.18 | ppl 23.950 +| epoch 10 step 112800 | 9570 batches | lr 0.0001 | ms/batch 385.48 | loss 3.20 | ppl 24.423 +| epoch 10 step 113000 | 9770 batches | lr 9.97e-05 | ms/batch 382.69 | loss 3.17 | ppl 23.829 +| epoch 10 step 113200 | 9970 batches | lr 9.93e-05 | ms/batch 382.49 | loss 3.17 | ppl 23.821 +| epoch 10 step 113400 | 10170 batches | lr 9.89e-05 | ms/batch 382.23 | loss 3.15 | ppl 23.294 +| epoch 10 step 113600 | 10370 batches | lr 9.85e-05 | ms/batch 384.21 | loss 3.18 | ppl 23.956 +| epoch 10 step 113800 | 10570 batches | lr 9.81e-05 | ms/batch 382.45 | loss 3.20 | ppl 24.546 +| epoch 10 step 114000 | 10770 batches | lr 9.77e-05 | ms/batch 382.38 | loss 3.15 | ppl 23.438 +| epoch 10 step 114200 | 10970 batches | lr 9.73e-05 | ms/batch 382.18 | loss 3.17 | ppl 23.817 +| epoch 10 step 114400 | 11170 batches | lr 9.7e-05 | ms/batch 381.94 | loss 3.21 | ppl 24.673 +| epoch 10 step 114600 | 11370 batches | lr 9.66e-05 | ms/batch 382.15 | loss 3.18 | ppl 24.121 +| epoch 11 step 114800 | 100 batches | lr 9.62e-05 | ms/batch 381.61 | loss 3.17 | ppl 23.803 +| epoch 11 step 115000 | 300 batches | lr 9.58e-05 | ms/batch 382.75 | loss 3.15 | ppl 23.304 +| epoch 11 step 115200 | 500 batches | lr 9.54e-05 | ms/batch 384.39 | loss 3.19 | ppl 24.227 +| epoch 11 step 115400 | 700 batches | lr 9.51e-05 | ms/batch 385.50 | loss 3.14 | ppl 23.071 +| epoch 11 step 115600 | 900 batches | lr 9.47e-05 | ms/batch 385.56 | loss 3.16 | ppl 23.511 +| epoch 11 step 115800 | 1100 batches | lr 9.43e-05 | ms/batch 382.23 | loss 3.18 | ppl 24.006 +| epoch 11 step 116000 | 1300 batches | lr 9.39e-05 | ms/batch 382.19 | loss 3.16 | ppl 23.507 +---------------------------------------------------------------------------------------------------- +| Eval 29 at step 116000 | time: 1538.31s | valid loss 3.22 | valid ppl 25.114 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 116200 | 1500 batches | lr 9.35e-05 | ms/batch 426.05 | loss 3.15 | ppl 23.409 +| epoch 11 step 116400 | 1700 batches | lr 9.32e-05 | ms/batch 382.79 | loss 3.15 | ppl 23.260 +| epoch 11 step 116600 | 1900 batches | lr 9.28e-05 | ms/batch 382.10 | loss 3.18 | ppl 23.952 +| epoch 11 step 116800 | 2100 batches | lr 9.24e-05 | ms/batch 382.37 | loss 3.20 | ppl 24.514 +| epoch 11 step 117000 | 2300 batches | lr 9.2e-05 | ms/batch 382.47 | loss 3.18 | ppl 24.021 +| epoch 11 step 117200 | 2500 
batches | lr 9.16e-05 | ms/batch 382.34 | loss 3.16 | ppl 23.538 +| epoch 11 step 117400 | 2700 batches | lr 9.13e-05 | ms/batch 382.68 | loss 3.16 | ppl 23.667 +| epoch 11 step 117600 | 2900 batches | lr 9.09e-05 | ms/batch 382.24 | loss 3.13 | ppl 22.809 +| epoch 11 step 117800 | 3100 batches | lr 9.05e-05 | ms/batch 382.77 | loss 3.16 | ppl 23.658 +| epoch 11 step 118000 | 3300 batches | lr 9.01e-05 | ms/batch 382.95 | loss 3.18 | ppl 24.090 +| epoch 11 step 118200 | 3500 batches | lr 8.97e-05 | ms/batch 382.53 | loss 3.14 | ppl 23.180 +| epoch 11 step 118400 | 3700 batches | lr 8.94e-05 | ms/batch 382.45 | loss 3.16 | ppl 23.624 +| epoch 11 step 118600 | 3900 batches | lr 8.9e-05 | ms/batch 382.36 | loss 3.15 | ppl 23.373 +| epoch 11 step 118800 | 4100 batches | lr 8.86e-05 | ms/batch 382.39 | loss 3.17 | ppl 23.850 +| epoch 11 step 119000 | 4300 batches | lr 8.82e-05 | ms/batch 383.06 | loss 3.16 | ppl 23.517 +| epoch 11 step 119200 | 4500 batches | lr 8.79e-05 | ms/batch 382.42 | loss 3.18 | ppl 24.035 +| epoch 11 step 119400 | 4700 batches | lr 8.75e-05 | ms/batch 382.30 | loss 3.14 | ppl 23.085 +| epoch 11 step 119600 | 4900 batches | lr 8.71e-05 | ms/batch 382.34 | loss 3.15 | ppl 23.369 +| epoch 11 step 119800 | 5100 batches | lr 8.67e-05 | ms/batch 382.28 | loss 3.16 | ppl 23.601 +| epoch 11 step 120000 | 5300 batches | lr 8.64e-05 | ms/batch 382.23 | loss 3.14 | ppl 23.129 +---------------------------------------------------------------------------------------------------- +| Eval 30 at step 120000 | time: 1536.06s | valid loss 3.22 | valid ppl 24.910 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 120200 | 5500 batches | lr 8.6e-05 | ms/batch 426.45 | loss 3.14 | ppl 23.184 +| epoch 11 step 120400 | 5700 batches | lr 8.56e-05 | ms/batch 382.84 | loss 3.15 | ppl 23.387 +| epoch 11 step 120600 | 5900 batches | lr 8.53e-05 | ms/batch 382.80 | loss 3.16 | ppl 23.487 +| epoch 11 step 120800 | 6100 batches | lr 8.49e-05 | ms/batch 382.29 | loss 3.15 | ppl 23.382 +| epoch 11 step 121000 | 6300 batches | lr 8.45e-05 | ms/batch 384.19 | loss 3.16 | ppl 23.578 +| epoch 11 step 121200 | 6500 batches | lr 8.41e-05 | ms/batch 382.43 | loss 3.12 | ppl 22.710 +| epoch 11 step 121400 | 6700 batches | lr 8.38e-05 | ms/batch 382.14 | loss 3.12 | ppl 22.638 +| epoch 11 step 121600 | 6900 batches | lr 8.34e-05 | ms/batch 382.48 | loss 3.14 | ppl 23.168 +| epoch 11 step 121800 | 7100 batches | lr 8.3e-05 | ms/batch 383.22 | loss 3.14 | ppl 23.054 +| epoch 11 step 122000 | 7300 batches | lr 8.27e-05 | ms/batch 382.59 | loss 3.09 | ppl 22.058 +| epoch 11 step 122200 | 7500 batches | lr 8.23e-05 | ms/batch 382.23 | loss 3.14 | ppl 23.079 +| epoch 11 step 122400 | 7700 batches | lr 8.19e-05 | ms/batch 382.91 | loss 3.12 | ppl 22.627 +| epoch 11 step 122600 | 7900 batches | lr 8.16e-05 | ms/batch 382.47 | loss 3.13 | ppl 22.780 +| epoch 11 step 122800 | 8100 batches | lr 8.12e-05 | ms/batch 382.22 | loss 3.14 | ppl 23.145 +| epoch 11 step 123000 | 8300 batches | lr 8.08e-05 | ms/batch 382.37 | loss 3.13 | ppl 22.848 +| epoch 11 step 123200 | 8500 batches | lr 8.04e-05 | ms/batch 382.30 | loss 3.13 | ppl 22.881 +| epoch 11 step 123400 | 8700 batches | lr 8.01e-05 | ms/batch 382.49 | loss 3.15 | ppl 23.295 +| epoch 11 step 123600 | 8900 batches | lr 7.97e-05 | ms/batch 382.00 | loss 3.14 | ppl 23.137 +| epoch 11 step 123800 | 9100 batches | lr 7.93e-05 | ms/batch 382.89 | loss 3.14 | ppl 23.205 +| epoch 11 step 124000 | 9300 batches | 
lr 7.9e-05 | ms/batch 382.01 | loss 3.13 | ppl 22.877 +---------------------------------------------------------------------------------------------------- +| Eval 31 at step 124000 | time: 1536.54s | valid loss 3.21 | valid ppl 24.705 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 124200 | 9500 batches | lr 7.86e-05 | ms/batch 426.03 | loss 3.15 | ppl 23.341 +| epoch 11 step 124400 | 9700 batches | lr 7.83e-05 | ms/batch 382.70 | loss 3.14 | ppl 23.144 +| epoch 11 step 124600 | 9900 batches | lr 7.79e-05 | ms/batch 382.71 | loss 3.13 | ppl 22.771 +| epoch 11 step 124800 | 10100 batches | lr 7.75e-05 | ms/batch 382.50 | loss 3.14 | ppl 23.138 +| epoch 11 step 125000 | 10300 batches | lr 7.72e-05 | ms/batch 382.99 | loss 3.13 | ppl 22.907 +| epoch 11 step 125200 | 10500 batches | lr 7.68e-05 | ms/batch 382.03 | loss 3.16 | ppl 23.676 +| epoch 11 step 125400 | 10700 batches | lr 7.64e-05 | ms/batch 382.49 | loss 3.13 | ppl 22.800 +| epoch 11 step 125600 | 10900 batches | lr 7.61e-05 | ms/batch 382.28 | loss 3.12 | ppl 22.598 +| epoch 11 step 125800 | 11100 batches | lr 7.57e-05 | ms/batch 382.13 | loss 3.17 | ppl 23.875 +| epoch 11 step 126000 | 11300 batches | lr 7.54e-05 | ms/batch 383.41 | loss 3.15 | ppl 23.357 +| epoch 12 step 126200 | 30 batches | lr 7.5e-05 | ms/batch 381.25 | loss 3.15 | ppl 23.413 +| epoch 12 step 126400 | 230 batches | lr 7.46e-05 | ms/batch 382.16 | loss 3.10 | ppl 22.274 +| epoch 12 step 126600 | 430 batches | lr 7.43e-05 | ms/batch 383.09 | loss 3.14 | ppl 23.086 +| epoch 12 step 126800 | 630 batches | lr 7.39e-05 | ms/batch 382.18 | loss 3.11 | ppl 22.526 +| epoch 12 step 127000 | 830 batches | lr 7.36e-05 | ms/batch 382.31 | loss 3.15 | ppl 23.399 +| epoch 12 step 127200 | 1030 batches | lr 7.32e-05 | ms/batch 382.19 | loss 3.11 | ppl 22.478 +| epoch 12 step 127400 | 1230 batches | lr 7.28e-05 | ms/batch 383.22 | loss 3.13 | ppl 22.942 +| epoch 12 step 127600 | 1430 batches | lr 7.25e-05 | ms/batch 383.14 | loss 3.13 | ppl 22.840 +| epoch 12 step 127800 | 1630 batches | lr 7.21e-05 | ms/batch 382.25 | loss 3.11 | ppl 22.402 +| epoch 12 step 128000 | 1830 batches | lr 7.18e-05 | ms/batch 382.04 | loss 3.14 | ppl 22.998 +---------------------------------------------------------------------------------------------------- +| Eval 32 at step 128000 | time: 1536.17s | valid loss 3.21 | valid ppl 24.729 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 128200 | 2030 batches | lr 7.14e-05 | ms/batch 413.67 | loss 3.17 | ppl 23.753 +| epoch 12 step 128400 | 2230 batches | lr 7.11e-05 | ms/batch 382.31 | loss 3.14 | ppl 23.103 +| epoch 12 step 128600 | 2430 batches | lr 7.07e-05 | ms/batch 382.32 | loss 3.14 | ppl 23.163 +| epoch 12 step 128800 | 2630 batches | lr 7.04e-05 | ms/batch 383.18 | loss 3.13 | ppl 22.938 +| epoch 12 step 129000 | 2830 batches | lr 7e-05 | ms/batch 382.20 | loss 3.10 | ppl 22.155 +| epoch 12 step 129200 | 3030 batches | lr 6.97e-05 | ms/batch 382.30 | loss 3.13 | ppl 22.918 +| epoch 12 step 129400 | 3230 batches | lr 6.93e-05 | ms/batch 383.25 | loss 3.13 | ppl 22.808 +| epoch 12 step 129600 | 3430 batches | lr 6.9e-05 | ms/batch 382.15 | loss 3.12 | ppl 22.706 +| epoch 12 step 129800 | 3630 batches | lr 6.86e-05 | ms/batch 382.07 | loss 3.11 | ppl 22.435 +| epoch 12 step 130000 | 3830 batches | lr 6.83e-05 | ms/batch 382.59 | loss 3.13 | ppl 22.807 +| epoch 12 step 130200 | 4030 batches | lr 6.79e-05 | 
ms/batch 382.34 | loss 3.14 | ppl 23.171 +| epoch 12 step 130400 | 4230 batches | lr 6.76e-05 | ms/batch 382.69 | loss 3.13 | ppl 22.817 +| epoch 12 step 130600 | 4430 batches | lr 6.72e-05 | ms/batch 382.08 | loss 3.13 | ppl 22.790 +| epoch 12 step 130800 | 4630 batches | lr 6.69e-05 | ms/batch 382.35 | loss 3.13 | ppl 22.794 +| epoch 12 step 131000 | 4830 batches | lr 6.65e-05 | ms/batch 382.00 | loss 3.11 | ppl 22.490 +| epoch 12 step 131200 | 5030 batches | lr 6.62e-05 | ms/batch 382.50 | loss 3.14 | ppl 23.008 +| epoch 12 step 131400 | 5230 batches | lr 6.58e-05 | ms/batch 382.93 | loss 3.12 | ppl 22.728 +| epoch 12 step 131600 | 5430 batches | lr 6.55e-05 | ms/batch 382.13 | loss 3.09 | ppl 22.037 +| epoch 12 step 131800 | 5630 batches | lr 6.51e-05 | ms/batch 382.22 | loss 3.13 | ppl 22.860 +| epoch 12 step 132000 | 5830 batches | lr 6.48e-05 | ms/batch 382.29 | loss 3.13 | ppl 22.808 +---------------------------------------------------------------------------------------------------- +| Eval 33 at step 132000 | time: 1535.91s | valid loss 3.20 | valid ppl 24.508 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 132200 | 6030 batches | lr 6.44e-05 | ms/batch 426.93 | loss 3.10 | ppl 22.292 +| epoch 12 step 132400 | 6230 batches | lr 6.41e-05 | ms/batch 382.45 | loss 3.12 | ppl 22.597 +| epoch 12 step 132600 | 6430 batches | lr 6.38e-05 | ms/batch 382.37 | loss 3.13 | ppl 22.881 +| epoch 12 step 132800 | 6630 batches | lr 6.34e-05 | ms/batch 383.06 | loss 3.08 | ppl 21.695 +| epoch 12 step 133000 | 6830 batches | lr 6.31e-05 | ms/batch 382.08 | loss 3.10 | ppl 22.302 +| epoch 12 step 133200 | 7030 batches | lr 6.27e-05 | ms/batch 382.16 | loss 3.11 | ppl 22.463 +| epoch 12 step 133400 | 7230 batches | lr 6.24e-05 | ms/batch 382.05 | loss 3.08 | ppl 21.683 +| epoch 12 step 133600 | 7430 batches | lr 6.2e-05 | ms/batch 383.07 | loss 3.09 | ppl 21.918 +| epoch 12 step 133800 | 7630 batches | lr 6.17e-05 | ms/batch 381.84 | loss 3.08 | ppl 21.792 +| epoch 12 step 134000 | 7830 batches | lr 6.14e-05 | ms/batch 382.49 | loss 3.10 | ppl 22.284 +| epoch 12 step 134200 | 8030 batches | lr 6.1e-05 | ms/batch 381.94 | loss 3.11 | ppl 22.330 +| epoch 12 step 134400 | 8230 batches | lr 6.07e-05 | ms/batch 382.66 | loss 3.10 | ppl 22.087 +| epoch 12 step 134600 | 8430 batches | lr 6.04e-05 | ms/batch 381.98 | loss 3.11 | ppl 22.433 +| epoch 12 step 134800 | 8630 batches | lr 6e-05 | ms/batch 382.70 | loss 3.10 | ppl 22.219 +| epoch 12 step 135000 | 8830 batches | lr 5.97e-05 | ms/batch 382.07 | loss 3.12 | ppl 22.686 +| epoch 12 step 135200 | 9030 batches | lr 5.94e-05 | ms/batch 382.67 | loss 3.12 | ppl 22.550 +| epoch 12 step 135400 | 9230 batches | lr 5.9e-05 | ms/batch 383.43 | loss 3.09 | ppl 21.869 +| epoch 12 step 135600 | 9430 batches | lr 5.87e-05 | ms/batch 382.29 | loss 3.12 | ppl 22.561 +| epoch 12 step 135800 | 9630 batches | lr 5.84e-05 | ms/batch 383.83 | loss 3.13 | ppl 22.883 +| epoch 12 step 136000 | 9830 batches | lr 5.8e-05 | ms/batch 382.22 | loss 3.09 | ppl 21.958 +---------------------------------------------------------------------------------------------------- +| Eval 34 at step 136000 | time: 1536.34s | valid loss 3.19 | valid ppl 24.347 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 136200 | 10030 batches | lr 5.77e-05 | ms/batch 427.48 | loss 3.11 | ppl 22.491 +| epoch 12 step 136400 | 10230 batches | lr 5.74e-05 | ms/batch 382.28 
| loss 3.10 | ppl 22.171 +| epoch 12 step 136600 | 10430 batches | lr 5.7e-05 | ms/batch 382.31 | loss 3.11 | ppl 22.329 +| epoch 12 step 136800 | 10630 batches | lr 5.67e-05 | ms/batch 382.04 | loss 3.14 | ppl 23.048 +| epoch 12 step 137000 | 10830 batches | lr 5.64e-05 | ms/batch 382.41 | loss 3.08 | ppl 21.659 +| epoch 12 step 137200 | 11030 batches | lr 5.6e-05 | ms/batch 382.01 | loss 3.13 | ppl 22.971 +| epoch 12 step 137400 | 11230 batches | lr 5.57e-05 | ms/batch 382.43 | loss 3.13 | ppl 22.881 +| epoch 12 step 137600 | 11430 batches | lr 5.54e-05 | ms/batch 382.30 | loss 3.12 | ppl 22.562 +| epoch 13 step 137800 | 160 batches | lr 5.51e-05 | ms/batch 381.95 | loss 3.10 | ppl 22.208 +| epoch 13 step 138000 | 360 batches | lr 5.47e-05 | ms/batch 382.40 | loss 3.09 | ppl 21.933 +| epoch 13 step 138200 | 560 batches | lr 5.44e-05 | ms/batch 382.21 | loss 3.11 | ppl 22.330 +| epoch 13 step 138400 | 760 batches | lr 5.41e-05 | ms/batch 382.34 | loss 3.10 | ppl 22.150 +| epoch 13 step 138600 | 960 batches | lr 5.38e-05 | ms/batch 383.74 | loss 3.08 | ppl 21.791 +| epoch 13 step 138800 | 1160 batches | lr 5.34e-05 | ms/batch 382.09 | loss 3.12 | ppl 22.594 +| epoch 13 step 139000 | 1360 batches | lr 5.31e-05 | ms/batch 382.12 | loss 3.09 | ppl 22.023 +| epoch 13 step 139200 | 1560 batches | lr 5.28e-05 | ms/batch 382.56 | loss 3.09 | ppl 21.997 +| epoch 13 step 139400 | 1760 batches | lr 5.25e-05 | ms/batch 382.43 | loss 3.09 | ppl 21.979 +| epoch 13 step 139600 | 1960 batches | lr 5.22e-05 | ms/batch 382.02 | loss 3.12 | ppl 22.641 +| epoch 13 step 139800 | 2160 batches | lr 5.18e-05 | ms/batch 382.36 | loss 3.12 | ppl 22.747 +| epoch 13 step 140000 | 2360 batches | lr 5.15e-05 | ms/batch 382.02 | loss 3.11 | ppl 22.326 +---------------------------------------------------------------------------------------------------- +| Eval 35 at step 140000 | time: 1535.70s | valid loss 3.19 | valid ppl 24.249 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 140200 | 2560 batches | lr 5.12e-05 | ms/batch 426.40 | loss 3.11 | ppl 22.373 +| epoch 13 step 140400 | 2760 batches | lr 5.09e-05 | ms/batch 382.20 | loss 3.08 | ppl 21.833 +| epoch 13 step 140600 | 2960 batches | lr 5.06e-05 | ms/batch 382.51 | loss 3.09 | ppl 21.962 +| epoch 13 step 140800 | 3160 batches | lr 5.03e-05 | ms/batch 382.14 | loss 3.11 | ppl 22.316 +| epoch 13 step 141000 | 3360 batches | lr 4.99e-05 | ms/batch 382.34 | loss 3.10 | ppl 22.298 +| epoch 13 step 141200 | 3560 batches | lr 4.96e-05 | ms/batch 382.74 | loss 3.08 | ppl 21.850 +| epoch 13 step 141400 | 3760 batches | lr 4.93e-05 | ms/batch 382.82 | loss 3.09 | ppl 22.077 +| epoch 13 step 141600 | 3960 batches | lr 4.9e-05 | ms/batch 382.57 | loss 3.10 | ppl 22.157 +| epoch 13 step 141800 | 4160 batches | lr 4.87e-05 | ms/batch 382.56 | loss 3.10 | ppl 22.237 +| epoch 13 step 142000 | 4360 batches | lr 4.84e-05 | ms/batch 382.02 | loss 3.10 | ppl 22.210 +| epoch 13 step 142200 | 4560 batches | lr 4.81e-05 | ms/batch 382.78 | loss 3.12 | ppl 22.641 +| epoch 13 step 142400 | 4760 batches | lr 4.78e-05 | ms/batch 382.16 | loss 3.07 | ppl 21.546 +| epoch 13 step 142600 | 4960 batches | lr 4.75e-05 | ms/batch 382.39 | loss 3.11 | ppl 22.384 +| epoch 13 step 142800 | 5160 batches | lr 4.72e-05 | ms/batch 382.22 | loss 3.09 | ppl 21.989 +| epoch 13 step 143000 | 5360 batches | lr 4.68e-05 | ms/batch 382.58 | loss 3.08 | ppl 21.657 +| epoch 13 step 143200 | 5560 batches | lr 4.65e-05 | ms/batch 382.38 | loss 
3.08 | ppl 21.757 +| epoch 13 step 143400 | 5760 batches | lr 4.62e-05 | ms/batch 383.58 | loss 3.10 | ppl 22.194 +| epoch 13 step 143600 | 5960 batches | lr 4.59e-05 | ms/batch 382.90 | loss 3.09 | ppl 21.933 +| epoch 13 step 143800 | 6160 batches | lr 4.56e-05 | ms/batch 383.73 | loss 3.08 | ppl 21.719 +| epoch 13 step 144000 | 6360 batches | lr 4.53e-05 | ms/batch 382.50 | loss 3.13 | ppl 22.838 +---------------------------------------------------------------------------------------------------- +| Eval 36 at step 144000 | time: 1536.60s | valid loss 3.19 | valid ppl 24.245 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 144200 | 6560 batches | lr 4.5e-05 | ms/batch 428.61 | loss 3.03 | ppl 20.797 +| epoch 13 step 144400 | 6760 batches | lr 4.47e-05 | ms/batch 382.58 | loss 3.07 | ppl 21.616 +| epoch 13 step 144600 | 6960 batches | lr 4.44e-05 | ms/batch 382.81 | loss 3.09 | ppl 21.927 +| epoch 13 step 144800 | 7160 batches | lr 4.41e-05 | ms/batch 382.66 | loss 3.04 | ppl 20.999 +| epoch 13 step 145000 | 7360 batches | lr 4.38e-05 | ms/batch 382.36 | loss 3.07 | ppl 21.506 +| epoch 13 step 145200 | 7560 batches | lr 4.35e-05 | ms/batch 382.21 | loss 3.05 | ppl 21.175 +| epoch 13 step 145400 | 7760 batches | lr 4.32e-05 | ms/batch 382.89 | loss 3.08 | ppl 21.819 +| epoch 13 step 145600 | 7960 batches | lr 4.29e-05 | ms/batch 382.31 | loss 3.06 | ppl 21.426 +| epoch 13 step 145800 | 8160 batches | lr 4.26e-05 | ms/batch 383.34 | loss 3.07 | ppl 21.444 +| epoch 13 step 146000 | 8360 batches | lr 4.23e-05 | ms/batch 382.40 | loss 3.09 | ppl 22.010 +| epoch 13 step 146200 | 8560 batches | lr 4.2e-05 | ms/batch 382.59 | loss 3.07 | ppl 21.595 +| epoch 13 step 146400 | 8760 batches | lr 4.17e-05 | ms/batch 382.41 | loss 3.08 | ppl 21.741 +| epoch 13 step 146600 | 8960 batches | lr 4.15e-05 | ms/batch 382.37 | loss 3.10 | ppl 22.096 +| epoch 13 step 146800 | 9160 batches | lr 4.12e-05 | ms/batch 382.26 | loss 3.07 | ppl 21.442 +| epoch 13 step 147000 | 9360 batches | lr 4.09e-05 | ms/batch 382.91 | loss 3.08 | ppl 21.744 +| epoch 13 step 147200 | 9560 batches | lr 4.06e-05 | ms/batch 385.27 | loss 3.11 | ppl 22.345 +| epoch 13 step 147400 | 9760 batches | lr 4.03e-05 | ms/batch 384.15 | loss 3.08 | ppl 21.665 +| epoch 13 step 147600 | 9960 batches | lr 4e-05 | ms/batch 383.92 | loss 3.08 | ppl 21.738 +| epoch 13 step 147800 | 10160 batches | lr 3.97e-05 | ms/batch 383.83 | loss 3.05 | ppl 21.213 +| epoch 13 step 148000 | 10360 batches | lr 3.94e-05 | ms/batch 384.68 | loss 3.09 | ppl 21.995 +---------------------------------------------------------------------------------------------------- +| Eval 37 at step 148000 | time: 1538.82s | valid loss 3.18 | valid ppl 23.993 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 148200 | 10560 batches | lr 3.91e-05 | ms/batch 468.97 | loss 3.11 | ppl 22.352 +| epoch 13 step 148400 | 10760 batches | lr 3.89e-05 | ms/batch 704.91 | loss 3.06 | ppl 21.398 +| epoch 13 step 148600 | 10960 batches | lr 3.86e-05 | ms/batch 703.78 | loss 3.07 | ppl 21.622 +| epoch 13 step 148800 | 11160 batches | lr 3.83e-05 | ms/batch 671.68 | loss 3.12 | ppl 22.641 +| epoch 13 step 149000 | 11360 batches | lr 3.8e-05 | ms/batch 704.17 | loss 3.09 | ppl 21.935 +| epoch 14 step 149200 | 90 batches | lr 3.77e-05 | ms/batch 707.89 | loss 3.08 | ppl 21.847 +| epoch 14 step 149400 | 290 batches | lr 3.74e-05 | ms/batch 692.06 | loss 3.06 | ppl 
21.250 +| epoch 14 step 149600 | 490 batches | lr 3.72e-05 | ms/batch 698.40 | loss 3.10 | ppl 22.096 +| epoch 14 step 149800 | 690 batches | lr 3.69e-05 | ms/batch 708.46 | loss 3.05 | ppl 21.130 +| epoch 14 step 150000 | 890 batches | lr 3.66e-05 | ms/batch 701.80 | loss 3.07 | ppl 21.611 +| epoch 14 step 150200 | 1090 batches | lr 3.63e-05 | ms/batch 684.70 | loss 3.08 | ppl 21.866 +| epoch 14 step 150400 | 1290 batches | lr 3.61e-05 | ms/batch 680.94 | loss 3.07 | ppl 21.455 +| epoch 14 step 150600 | 1490 batches | lr 3.58e-05 | ms/batch 682.02 | loss 3.07 | ppl 21.451 +| epoch 14 step 150800 | 1690 batches | lr 3.55e-05 | ms/batch 667.16 | loss 3.06 | ppl 21.432 +| epoch 14 step 151000 | 1890 batches | lr 3.52e-05 | ms/batch 687.92 | loss 3.08 | ppl 21.720 +| epoch 14 step 151200 | 2090 batches | lr 3.5e-05 | ms/batch 690.29 | loss 3.12 | ppl 22.629 +| epoch 14 step 151400 | 2290 batches | lr 3.47e-05 | ms/batch 695.24 | loss 3.09 | ppl 21.973 +| epoch 14 step 151600 | 2490 batches | lr 3.44e-05 | ms/batch 690.62 | loss 3.07 | ppl 21.541 +| epoch 14 step 151800 | 2690 batches | lr 3.41e-05 | ms/batch 691.73 | loss 3.08 | ppl 21.853 +| epoch 14 step 152000 | 2890 batches | lr 3.39e-05 | ms/batch 721.76 | loss 3.03 | ppl 20.724 +---------------------------------------------------------------------------------------------------- +| Eval 38 at step 152000 | time: 2730.88s | valid loss 3.17 | valid ppl 23.892 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 152200 | 3090 batches | lr 3.36e-05 | ms/batch 773.37 | loss 3.08 | ppl 21.734 +| epoch 14 step 152400 | 3290 batches | lr 3.33e-05 | ms/batch 682.72 | loss 3.09 | ppl 22.046 +| epoch 14 step 152600 | 3490 batches | lr 3.31e-05 | ms/batch 701.64 | loss 3.06 | ppl 21.282 +| epoch 14 step 152800 | 3690 batches | lr 3.28e-05 | ms/batch 716.98 | loss 3.07 | ppl 21.645 +| epoch 14 step 153000 | 3890 batches | lr 3.25e-05 | ms/batch 702.88 | loss 3.06 | ppl 21.403 +| epoch 14 step 153200 | 4090 batches | lr 3.23e-05 | ms/batch 682.68 | loss 3.09 | ppl 21.972 +| epoch 14 step 153400 | 4290 batches | lr 3.2e-05 | ms/batch 704.02 | loss 3.07 | ppl 21.549 +| epoch 14 step 153600 | 4490 batches | lr 3.18e-05 | ms/batch 703.61 | loss 3.09 | ppl 21.998 +| epoch 14 step 153800 | 4690 batches | lr 3.15e-05 | ms/batch 710.51 | loss 3.06 | ppl 21.290 +| epoch 14 step 154000 | 4890 batches | lr 3.12e-05 | ms/batch 713.73 | loss 3.07 | ppl 21.440 +| epoch 14 step 154200 | 5090 batches | lr 3.1e-05 | ms/batch 737.96 | loss 3.08 | ppl 21.739 +| epoch 14 step 154400 | 5290 batches | lr 3.07e-05 | ms/batch 711.39 | loss 3.06 | ppl 21.344 +| epoch 14 step 154600 | 5490 batches | lr 3.05e-05 | ms/batch 702.95 | loss 3.05 | ppl 21.190 +| epoch 14 step 154800 | 5690 batches | lr 3.02e-05 | ms/batch 719.75 | loss 3.07 | ppl 21.542 +| epoch 14 step 155000 | 5890 batches | lr 2.99e-05 | ms/batch 672.31 | loss 3.07 | ppl 21.580 +| epoch 14 step 155200 | 6090 batches | lr 2.97e-05 | ms/batch 709.44 | loss 3.07 | ppl 21.587 +| epoch 14 step 155400 | 6290 batches | lr 2.94e-05 | ms/batch 709.79 | loss 3.07 | ppl 21.648 +| epoch 14 step 155600 | 6490 batches | lr 2.92e-05 | ms/batch 688.42 | loss 3.05 | ppl 21.036 +| epoch 14 step 155800 | 6690 batches | lr 2.89e-05 | ms/batch 689.25 | loss 3.03 | ppl 20.757 +| epoch 14 step 156000 | 6890 batches | lr 2.87e-05 | ms/batch 721.47 | loss 3.06 | ppl 21.351 +---------------------------------------------------------------------------------------------------- 
+| Eval 39 at step 156000 | time: 2828.47s | valid loss 3.17 | valid ppl 23.854 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 156200 | 7090 batches | lr 2.84e-05 | ms/batch 761.55 | loss 3.06 | ppl 21.267 +| epoch 14 step 156400 | 7290 batches | lr 2.82e-05 | ms/batch 656.50 | loss 3.01 | ppl 20.271 +| epoch 14 step 156600 | 7490 batches | lr 2.79e-05 | ms/batch 694.99 | loss 3.06 | ppl 21.258 +| epoch 14 step 156800 | 7690 batches | lr 2.77e-05 | ms/batch 716.22 | loss 3.04 | ppl 20.894 +| epoch 14 step 157000 | 7890 batches | lr 2.74e-05 | ms/batch 713.94 | loss 3.04 | ppl 20.902 +| epoch 14 step 157200 | 8090 batches | lr 2.72e-05 | ms/batch 687.11 | loss 3.06 | ppl 21.311 +| epoch 14 step 157400 | 8290 batches | lr 2.7e-05 | ms/batch 682.84 | loss 3.05 | ppl 21.037 +| epoch 14 step 157600 | 8490 batches | lr 2.67e-05 | ms/batch 665.10 | loss 3.05 | ppl 21.110 +| epoch 14 step 157800 | 8690 batches | lr 2.65e-05 | ms/batch 742.98 | loss 3.07 | ppl 21.548 +| epoch 14 step 158000 | 8890 batches | lr 2.62e-05 | ms/batch 742.00 | loss 3.06 | ppl 21.303 +| epoch 14 step 158200 | 9090 batches | lr 2.6e-05 | ms/batch 682.98 | loss 3.06 | ppl 21.343 +| epoch 14 step 158400 | 9290 batches | lr 2.58e-05 | ms/batch 707.66 | loss 3.05 | ppl 21.196 +| epoch 14 step 158600 | 9490 batches | lr 2.55e-05 | ms/batch 700.45 | loss 3.06 | ppl 21.433 +| epoch 14 step 158800 | 9690 batches | lr 2.53e-05 | ms/batch 678.26 | loss 3.06 | ppl 21.401 +| epoch 14 step 159000 | 9890 batches | lr 2.5e-05 | ms/batch 678.52 | loss 3.04 | ppl 20.949 +| epoch 14 step 159200 | 10090 batches | lr 2.48e-05 | ms/batch 704.73 | loss 3.07 | ppl 21.508 +| epoch 14 step 159400 | 10290 batches | lr 2.46e-05 | ms/batch 705.36 | loss 3.05 | ppl 21.058 +| epoch 14 step 159600 | 10490 batches | lr 2.43e-05 | ms/batch 690.24 | loss 3.09 | ppl 21.881 +| epoch 14 step 159800 | 10690 batches | lr 2.41e-05 | ms/batch 698.55 | loss 3.05 | ppl 21.185 +| epoch 14 step 160000 | 10890 batches | lr 2.39e-05 | ms/batch 678.42 | loss 3.04 | ppl 20.881 +---------------------------------------------------------------------------------------------------- +| Eval 40 at step 160000 | time: 2795.13s | valid loss 3.17 | valid ppl 23.806 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 160200 | 11090 batches | lr 2.36e-05 | ms/batch 743.16 | loss 3.09 | ppl 21.924 +| epoch 14 step 160400 | 11290 batches | lr 2.34e-05 | ms/batch 670.98 | loss 3.08 | ppl 21.781 +| epoch 15 step 160600 | 20 batches | lr 2.32e-05 | ms/batch 688.74 | loss 3.07 | ppl 21.534 +| epoch 15 step 160800 | 220 batches | lr 2.3e-05 | ms/batch 707.95 | loss 3.03 | ppl 20.736 +| epoch 15 step 161000 | 420 batches | lr 2.27e-05 | ms/batch 685.60 | loss 3.07 | ppl 21.451 +| epoch 15 step 161200 | 620 batches | lr 2.25e-05 | ms/batch 711.76 | loss 3.04 | ppl 20.824 +| epoch 15 step 161400 | 820 batches | lr 2.23e-05 | ms/batch 695.85 | loss 3.07 | ppl 21.648 +| epoch 15 step 161600 | 1020 batches | lr 2.21e-05 | ms/batch 680.45 | loss 3.04 | ppl 20.808 +| epoch 15 step 161800 | 1220 batches | lr 2.18e-05 | ms/batch 733.80 | loss 3.06 | ppl 21.352 +| epoch 15 step 162000 | 1420 batches | lr 2.16e-05 | ms/batch 702.32 | loss 3.05 | ppl 21.184 +| epoch 15 step 162200 | 1620 batches | lr 2.14e-05 | ms/batch 689.95 | loss 3.03 | ppl 20.716 +| epoch 15 step 162400 | 1820 batches | lr 2.12e-05 | ms/batch 700.66 | loss 3.07 | ppl 21.463 +| epoch 15 step 
162600 | 2020 batches | lr 2.1e-05 | ms/batch 673.18 | loss 3.09 | ppl 21.980 +| epoch 15 step 162800 | 2220 batches | lr 2.07e-05 | ms/batch 709.69 | loss 3.07 | ppl 21.463 +| epoch 15 step 163000 | 2420 batches | lr 2.05e-05 | ms/batch 709.74 | loss 3.07 | ppl 21.488 +| epoch 15 step 163200 | 2620 batches | lr 2.03e-05 | ms/batch 702.37 | loss 3.06 | ppl 21.232 +| epoch 15 step 163400 | 2820 batches | lr 2.01e-05 | ms/batch 695.04 | loss 3.03 | ppl 20.696 +| epoch 15 step 163600 | 3020 batches | lr 1.99e-05 | ms/batch 718.85 | loss 3.06 | ppl 21.244 +| epoch 15 step 163800 | 3220 batches | lr 1.97e-05 | ms/batch 674.99 | loss 3.05 | ppl 21.183 +| epoch 15 step 164000 | 3420 batches | lr 1.95e-05 | ms/batch 708.94 | loss 3.06 | ppl 21.252 +---------------------------------------------------------------------------------------------------- +| Eval 41 at step 164000 | time: 2798.25s | valid loss 3.17 | valid ppl 23.747 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 164200 | 3620 batches | lr 1.92e-05 | ms/batch 756.27 | loss 3.03 | ppl 20.794 +| epoch 15 step 164400 | 3820 batches | lr 1.9e-05 | ms/batch 686.46 | loss 3.06 | ppl 21.270 +| epoch 15 step 164600 | 4020 batches | lr 1.88e-05 | ms/batch 695.84 | loss 3.07 | ppl 21.566 +| epoch 15 step 164800 | 4220 batches | lr 1.86e-05 | ms/batch 708.79 | loss 3.05 | ppl 21.174 +| epoch 15 step 165000 | 4420 batches | lr 1.84e-05 | ms/batch 678.67 | loss 3.06 | ppl 21.240 +| epoch 15 step 165200 | 4620 batches | lr 1.82e-05 | ms/batch 696.74 | loss 3.06 | ppl 21.238 +| epoch 15 step 165400 | 4820 batches | lr 1.8e-05 | ms/batch 725.44 | loss 3.04 | ppl 20.967 +| epoch 15 step 165600 | 5020 batches | lr 1.78e-05 | ms/batch 682.40 | loss 3.07 | ppl 21.539 +| epoch 15 step 165800 | 5220 batches | lr 1.76e-05 | ms/batch 686.03 | loss 3.05 | ppl 21.048 +| epoch 15 step 166000 | 5420 batches | lr 1.74e-05 | ms/batch 705.11 | loss 3.02 | ppl 20.520 +| epoch 15 step 166200 | 5620 batches | lr 1.72e-05 | ms/batch 692.95 | loss 3.06 | ppl 21.245 +| epoch 15 step 166400 | 5820 batches | lr 1.7e-05 | ms/batch 680.20 | loss 3.05 | ppl 21.210 +| epoch 15 step 166600 | 6020 batches | lr 1.68e-05 | ms/batch 725.01 | loss 3.04 | ppl 20.885 +| epoch 15 step 166800 | 6220 batches | lr 1.66e-05 | ms/batch 696.24 | loss 3.05 | ppl 21.047 +| epoch 15 step 167000 | 6420 batches | lr 1.64e-05 | ms/batch 679.60 | loss 3.06 | ppl 21.386 +| epoch 15 step 167200 | 6620 batches | lr 1.62e-05 | ms/batch 685.90 | loss 3.01 | ppl 20.239 +| epoch 15 step 167400 | 6820 batches | lr 1.6e-05 | ms/batch 696.26 | loss 3.04 | ppl 20.831 +| epoch 15 step 167600 | 7020 batches | lr 1.58e-05 | ms/batch 667.73 | loss 3.05 | ppl 21.056 +| epoch 15 step 167800 | 7220 batches | lr 1.57e-05 | ms/batch 710.56 | loss 3.01 | ppl 20.250 +| epoch 15 step 168000 | 7420 batches | lr 1.55e-05 | ms/batch 684.67 | loss 3.02 | ppl 20.435 +---------------------------------------------------------------------------------------------------- +| Eval 42 at step 168000 | time: 2785.72s | valid loss 3.16 | valid ppl 23.632 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 168200 | 7620 batches | lr 1.53e-05 | ms/batch 757.05 | loss 3.01 | ppl 20.240 +| epoch 15 step 168400 | 7820 batches | lr 1.51e-05 | ms/batch 723.60 | loss 3.04 | ppl 20.901 +| epoch 15 step 168600 | 8020 batches | lr 1.49e-05 | ms/batch 655.26 | loss 3.04 | ppl 20.915 +| epoch 15 step 168800 | 8220 
batches | lr 1.47e-05 | ms/batch 744.40 | loss 3.03 | ppl 20.637 +| epoch 15 step 169000 | 8420 batches | lr 1.45e-05 | ms/batch 683.70 | loss 3.04 | ppl 20.935 +| epoch 15 step 169200 | 8620 batches | lr 1.43e-05 | ms/batch 706.63 | loss 3.04 | ppl 20.841 +| epoch 15 step 169400 | 8820 batches | lr 1.42e-05 | ms/batch 673.37 | loss 3.06 | ppl 21.253 +| epoch 15 step 169600 | 9020 batches | lr 1.4e-05 | ms/batch 724.83 | loss 3.05 | ppl 21.077 +| epoch 15 step 169800 | 9220 batches | lr 1.38e-05 | ms/batch 710.05 | loss 3.02 | ppl 20.465 +| epoch 15 step 170000 | 9420 batches | lr 1.36e-05 | ms/batch 714.29 | loss 3.05 | ppl 21.075 +| epoch 15 step 170200 | 9620 batches | lr 1.34e-05 | ms/batch 708.96 | loss 3.06 | ppl 21.377 +| epoch 15 step 170400 | 9820 batches | lr 1.33e-05 | ms/batch 709.15 | loss 3.03 | ppl 20.644 +| epoch 15 step 170600 | 10020 batches | lr 1.31e-05 | ms/batch 675.72 | loss 3.04 | ppl 20.958 +| epoch 15 step 170800 | 10220 batches | lr 1.29e-05 | ms/batch 688.52 | loss 3.04 | ppl 20.876 +| epoch 15 step 171000 | 10420 batches | lr 1.27e-05 | ms/batch 685.00 | loss 3.04 | ppl 20.869 +| epoch 15 step 171200 | 10620 batches | lr 1.26e-05 | ms/batch 720.81 | loss 3.07 | ppl 21.626 +| epoch 15 step 171400 | 10820 batches | lr 1.24e-05 | ms/batch 688.74 | loss 3.02 | ppl 20.402 +| epoch 15 step 171600 | 11020 batches | lr 1.22e-05 | ms/batch 688.38 | loss 3.06 | ppl 21.433 +| epoch 15 step 171800 | 11220 batches | lr 1.21e-05 | ms/batch 725.25 | loss 3.06 | ppl 21.409 +| epoch 15 step 172000 | 11420 batches | lr 1.19e-05 | ms/batch 688.06 | loss 3.06 | ppl 21.341 +---------------------------------------------------------------------------------------------------- +| Eval 43 at step 172000 | time: 2811.86s | valid loss 3.16 | valid ppl 23.555 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 172200 | 150 batches | lr 1.17e-05 | ms/batch 733.80 | loss 3.04 | ppl 20.922 +| epoch 16 step 172400 | 350 batches | lr 1.16e-05 | ms/batch 716.14 | loss 3.02 | ppl 20.536 +| epoch 16 step 172600 | 550 batches | lr 1.14e-05 | ms/batch 697.95 | loss 3.05 | ppl 21.120 +| epoch 16 step 172800 | 750 batches | lr 1.12e-05 | ms/batch 677.36 | loss 3.03 | ppl 20.767 +| epoch 16 step 173000 | 950 batches | lr 1.11e-05 | ms/batch 688.14 | loss 3.02 | ppl 20.590 +| epoch 16 step 173200 | 1150 batches | lr 1.09e-05 | ms/batch 694.21 | loss 3.06 | ppl 21.245 +| epoch 16 step 173400 | 1350 batches | lr 1.08e-05 | ms/batch 687.60 | loss 3.04 | ppl 20.835 +| epoch 16 step 173600 | 1550 batches | lr 1.06e-05 | ms/batch 689.94 | loss 3.03 | ppl 20.718 +| epoch 16 step 173800 | 1750 batches | lr 1.04e-05 | ms/batch 701.32 | loss 3.03 | ppl 20.615 +| epoch 16 step 174000 | 1950 batches | lr 1.03e-05 | ms/batch 718.46 | loss 3.06 | ppl 21.302 +| epoch 16 step 174200 | 2150 batches | lr 1.01e-05 | ms/batch 701.55 | loss 3.07 | ppl 21.531 +| epoch 16 step 174400 | 2350 batches | lr 9.97e-06 | ms/batch 714.53 | loss 3.05 | ppl 21.045 +| epoch 16 step 174600 | 2550 batches | lr 9.82e-06 | ms/batch 688.64 | loss 3.05 | ppl 21.136 +| epoch 16 step 174800 | 2750 batches | lr 9.67e-06 | ms/batch 676.25 | loss 3.03 | ppl 20.650 +| epoch 16 step 175000 | 2950 batches | lr 9.52e-06 | ms/batch 672.01 | loss 3.03 | ppl 20.677 +| epoch 16 step 175200 | 3150 batches | lr 9.37e-06 | ms/batch 682.98 | loss 3.05 | ppl 21.058 +| epoch 16 step 175400 | 3350 batches | lr 9.22e-06 | ms/batch 703.95 | loss 3.05 | ppl 21.083 +| epoch 16 step 175600 | 3550 
batches | lr 9.07e-06 | ms/batch 725.15 | loss 3.03 | ppl 20.678 +| epoch 16 step 175800 | 3750 batches | lr 8.92e-06 | ms/batch 697.98 | loss 3.04 | ppl 20.887 +| epoch 16 step 176000 | 3950 batches | lr 8.78e-06 | ms/batch 714.39 | loss 3.04 | ppl 20.890 +---------------------------------------------------------------------------------------------------- +| Eval 44 at step 176000 | time: 2793.96s | valid loss 3.16 | valid ppl 23.555 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 176200 | 4150 batches | lr 8.63e-06 | ms/batch 740.62 | loss 3.05 | ppl 21.035 +| epoch 16 step 176400 | 4350 batches | lr 8.49e-06 | ms/batch 688.27 | loss 3.05 | ppl 21.013 +| epoch 16 step 176600 | 4550 batches | lr 8.35e-06 | ms/batch 709.61 | loss 3.07 | ppl 21.515 +| epoch 16 step 176800 | 4750 batches | lr 8.21e-06 | ms/batch 675.71 | loss 3.01 | ppl 20.389 +| epoch 16 step 177000 | 4950 batches | lr 8.07e-06 | ms/batch 680.17 | loss 3.05 | ppl 21.062 +| epoch 16 step 177200 | 5150 batches | lr 7.93e-06 | ms/batch 701.57 | loss 3.04 | ppl 20.847 +| epoch 16 step 177400 | 5350 batches | lr 7.79e-06 | ms/batch 675.55 | loss 3.02 | ppl 20.562 +| epoch 16 step 177600 | 5550 batches | lr 7.66e-06 | ms/batch 697.09 | loss 3.03 | ppl 20.635 +| epoch 16 step 177800 | 5750 batches | lr 7.52e-06 | ms/batch 694.86 | loss 3.04 | ppl 21.003 +| epoch 16 step 178000 | 5950 batches | lr 7.39e-06 | ms/batch 717.27 | loss 3.03 | ppl 20.709 +| epoch 16 step 178200 | 6150 batches | lr 7.26e-06 | ms/batch 708.80 | loss 3.03 | ppl 20.721 +| epoch 16 step 178400 | 6350 batches | lr 7.13e-06 | ms/batch 680.38 | loss 3.07 | ppl 21.498 +| epoch 16 step 178600 | 6550 batches | lr 7e-06 | ms/batch 690.85 | loss 2.99 | ppl 19.816 +| epoch 16 step 178800 | 6750 batches | lr 6.87e-06 | ms/batch 686.33 | loss 3.02 | ppl 20.487 +| epoch 16 step 179000 | 6950 batches | lr 6.74e-06 | ms/batch 700.78 | loss 3.03 | ppl 20.767 +| epoch 16 step 179200 | 7150 batches | lr 6.61e-06 | ms/batch 699.08 | loss 3.00 | ppl 20.040 +| epoch 16 step 179400 | 7350 batches | lr 6.49e-06 | ms/batch 731.67 | loss 3.01 | ppl 20.243 +| epoch 16 step 179600 | 7550 batches | lr 6.36e-06 | ms/batch 701.46 | loss 3.01 | ppl 20.274 +| epoch 16 step 179800 | 7750 batches | lr 6.24e-06 | ms/batch 708.31 | loss 3.03 | ppl 20.608 +| epoch 16 step 180000 | 7950 batches | lr 6.12e-06 | ms/batch 709.01 | loss 3.01 | ppl 20.331 +---------------------------------------------------------------------------------------------------- +| Eval 45 at step 180000 | time: 2799.41s | valid loss 3.16 | valid ppl 23.509 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 180200 | 8150 batches | lr 6e-06 | ms/batch 762.66 | loss 3.02 | ppl 20.552 +| epoch 16 step 180400 | 8350 batches | lr 5.88e-06 | ms/batch 712.89 | loss 3.03 | ppl 20.748 +| epoch 16 step 180600 | 8550 batches | lr 5.76e-06 | ms/batch 697.51 | loss 3.02 | ppl 20.448 +| epoch 16 step 180800 | 8750 batches | lr 5.64e-06 | ms/batch 692.89 | loss 3.03 | ppl 20.772 +| epoch 16 step 181000 | 8950 batches | lr 5.53e-06 | ms/batch 704.48 | loss 3.04 | ppl 20.993 +| epoch 16 step 181200 | 9150 batches | lr 5.41e-06 | ms/batch 681.81 | loss 3.01 | ppl 20.388 +| epoch 16 step 181400 | 9350 batches | lr 5.3e-06 | ms/batch 739.49 | loss 3.03 | ppl 20.750 +| epoch 16 step 181600 | 9550 batches | lr 5.18e-06 | ms/batch 673.63 | loss 3.06 | ppl 21.365 +| epoch 16 step 181800 | 9750 batches | lr 
5.07e-06 | ms/batch 678.87 | loss 3.02 | ppl 20.486 +| epoch 16 step 182000 | 9950 batches | lr 4.96e-06 | ms/batch 688.93 | loss 3.03 | ppl 20.719 +| epoch 16 step 182200 | 10150 batches | lr 4.85e-06 | ms/batch 700.14 | loss 3.01 | ppl 20.286 +| epoch 16 step 182400 | 10350 batches | lr 4.75e-06 | ms/batch 698.98 | loss 3.04 | ppl 20.915 +| epoch 16 step 182600 | 10550 batches | lr 4.64e-06 | ms/batch 675.18 | loss 3.06 | ppl 21.356 +| epoch 16 step 182800 | 10750 batches | lr 4.53e-06 | ms/batch 675.41 | loss 3.01 | ppl 20.282 +| epoch 16 step 183000 | 10950 batches | lr 4.43e-06 | ms/batch 696.78 | loss 3.03 | ppl 20.604 +| epoch 16 step 183200 | 11150 batches | lr 4.33e-06 | ms/batch 705.01 | loss 3.08 | ppl 21.672 +| epoch 16 step 183400 | 11350 batches | lr 4.23e-06 | ms/batch 724.39 | loss 3.04 | ppl 20.891 +| epoch 17 step 183600 | 80 batches | lr 4.12e-06 | ms/batch 694.42 | loss 3.04 | ppl 20.978 +| epoch 17 step 183800 | 280 batches | lr 4.03e-06 | ms/batch 706.89 | loss 3.01 | ppl 20.311 +| epoch 17 step 184000 | 480 batches | lr 3.93e-06 | ms/batch 697.17 | loss 3.05 | ppl 21.175 +---------------------------------------------------------------------------------------------------- +| Eval 46 at step 184000 | time: 2799.09s | valid loss 3.16 | valid ppl 23.480 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 184200 | 680 batches | lr 3.83e-06 | ms/batch 724.21 | loss 3.01 | ppl 20.267 +| epoch 17 step 184400 | 880 batches | lr 3.73e-06 | ms/batch 717.97 | loss 3.04 | ppl 20.832 +| epoch 17 step 184600 | 1080 batches | lr 3.64e-06 | ms/batch 700.46 | loss 3.04 | ppl 20.875 +| epoch 17 step 184800 | 1280 batches | lr 3.55e-06 | ms/batch 707.72 | loss 3.02 | ppl 20.489 +| epoch 17 step 185000 | 1480 batches | lr 3.45e-06 | ms/batch 667.24 | loss 3.02 | ppl 20.563 +| epoch 17 step 185200 | 1680 batches | lr 3.36e-06 | ms/batch 734.80 | loss 3.02 | ppl 20.586 +| epoch 17 step 185400 | 1880 batches | lr 3.27e-06 | ms/batch 688.00 | loss 3.03 | ppl 20.797 +| epoch 17 step 185600 | 2080 batches | lr 3.18e-06 | ms/batch 689.00 | loss 3.08 | ppl 21.708 +| epoch 17 step 185800 | 2280 batches | lr 3.1e-06 | ms/batch 736.30 | loss 3.05 | ppl 21.169 +| epoch 17 step 186000 | 2480 batches | lr 3.01e-06 | ms/batch 688.24 | loss 3.03 | ppl 20.685 +| epoch 17 step 186200 | 2680 batches | lr 2.93e-06 | ms/batch 682.16 | loss 3.05 | ppl 21.041 +| epoch 17 step 186400 | 2880 batches | lr 2.84e-06 | ms/batch 733.76 | loss 2.99 | ppl 19.908 +| epoch 17 step 186600 | 3080 batches | lr 2.76e-06 | ms/batch 681.75 | loss 3.04 | ppl 20.892 +| epoch 17 step 186800 | 3280 batches | lr 2.68e-06 | ms/batch 694.90 | loss 3.05 | ppl 21.196 +| epoch 17 step 187000 | 3480 batches | lr 2.6e-06 | ms/batch 714.81 | loss 3.02 | ppl 20.444 +| epoch 17 step 187200 | 3680 batches | lr 2.52e-06 | ms/batch 739.94 | loss 3.04 | ppl 20.839 +| epoch 17 step 187400 | 3880 batches | lr 2.44e-06 | ms/batch 696.52 | loss 3.02 | ppl 20.547 +| epoch 17 step 187600 | 4080 batches | lr 2.36e-06 | ms/batch 711.46 | loss 3.05 | ppl 21.143 +| epoch 17 step 187800 | 4280 batches | lr 2.29e-06 | ms/batch 676.34 | loss 3.03 | ppl 20.690 +| epoch 17 step 188000 | 4480 batches | lr 2.21e-06 | ms/batch 721.67 | loss 3.05 | ppl 21.132 +---------------------------------------------------------------------------------------------------- +| Eval 47 at step 188000 | time: 2818.78s | valid loss 3.15 | valid ppl 23.437 
+---------------------------------------------------------------------------------------------------- +| epoch 17 step 188200 | 4680 batches | lr 2.14e-06 | ms/batch 744.57 | loss 3.02 | ppl 20.544 +| epoch 17 step 188400 | 4880 batches | lr 2.07e-06 | ms/batch 679.14 | loss 3.02 | ppl 20.582 +| epoch 17 step 188600 | 5080 batches | lr 2e-06 | ms/batch 683.64 | loss 3.04 | ppl 20.906 +| epoch 17 step 188800 | 5280 batches | lr 1.93e-06 | ms/batch 701.30 | loss 3.03 | ppl 20.615 +| epoch 17 step 189000 | 5480 batches | lr 1.86e-06 | ms/batch 708.69 | loss 3.01 | ppl 20.322 +| epoch 17 step 189200 | 5680 batches | lr 1.79e-06 | ms/batch 672.27 | loss 3.04 | ppl 20.907 +| epoch 17 step 189400 | 5880 batches | lr 1.73e-06 | ms/batch 732.04 | loss 3.03 | ppl 20.725 +| epoch 17 step 189600 | 6080 batches | lr 1.66e-06 | ms/batch 710.39 | loss 3.03 | ppl 20.774 +| epoch 17 step 189800 | 6280 batches | lr 1.6e-06 | ms/batch 692.23 | loss 3.04 | ppl 20.937 +| epoch 17 step 190000 | 6480 batches | lr 1.54e-06 | ms/batch 703.65 | loss 3.02 | ppl 20.415 +| epoch 17 step 190200 | 6680 batches | lr 1.48e-06 | ms/batch 695.33 | loss 2.99 | ppl 19.968 +| epoch 17 step 190400 | 6880 batches | lr 1.42e-06 | ms/batch 698.42 | loss 3.03 | ppl 20.649 +| epoch 17 step 190600 | 7080 batches | lr 1.36e-06 | ms/batch 685.73 | loss 3.02 | ppl 20.404 +| epoch 17 step 190800 | 7280 batches | lr 1.3e-06 | ms/batch 685.45 | loss 2.98 | ppl 19.645 +| epoch 17 step 191000 | 7480 batches | lr 1.25e-06 | ms/batch 684.16 | loss 3.02 | ppl 20.496 +| epoch 17 step 191200 | 7680 batches | lr 1.19e-06 | ms/batch 693.92 | loss 3.00 | ppl 20.163 +| epoch 17 step 191400 | 7880 batches | lr 1.14e-06 | ms/batch 687.54 | loss 3.01 | ppl 20.235 +| epoch 17 step 191600 | 8080 batches | lr 1.09e-06 | ms/batch 705.35 | loss 3.03 | ppl 20.600 +| epoch 17 step 191800 | 8280 batches | lr 1.04e-06 | ms/batch 708.66 | loss 3.01 | ppl 20.376 +| epoch 17 step 192000 | 8480 batches | lr 9.86e-07 | ms/batch 703.61 | loss 3.02 | ppl 20.442 +---------------------------------------------------------------------------------------------------- +| Eval 48 at step 192000 | time: 2792.73s | valid loss 3.15 | valid ppl 23.404 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 192200 | 8680 batches | lr 9.37e-07 | ms/batch 738.99 | loss 3.03 | ppl 20.750 +| epoch 17 step 192400 | 8880 batches | lr 8.9e-07 | ms/batch 684.91 | loss 3.03 | ppl 20.652 +| epoch 17 step 192600 | 9080 batches | lr 8.44e-07 | ms/batch 697.17 | loss 3.03 | ppl 20.656 +| epoch 17 step 192800 | 9280 batches | lr 7.99e-07 | ms/batch 716.20 | loss 3.02 | ppl 20.529 +| epoch 17 step 193000 | 9480 batches | lr 7.55e-07 | ms/batch 708.87 | loss 3.03 | ppl 20.800 +| epoch 17 step 193200 | 9680 batches | lr 7.12e-07 | ms/batch 680.97 | loss 3.03 | ppl 20.765 +| epoch 17 step 193400 | 9880 batches | lr 6.71e-07 | ms/batch 701.09 | loss 3.01 | ppl 20.225 +| epoch 17 step 193600 | 10080 batches | lr 6.31e-07 | ms/batch 697.86 | loss 3.04 | ppl 20.959 +| epoch 17 step 193800 | 10280 batches | lr 5.92e-07 | ms/batch 704.29 | loss 3.01 | ppl 20.360 +| epoch 17 step 194000 | 10480 batches | lr 5.55e-07 | ms/batch 705.22 | loss 3.05 | ppl 21.131 +| epoch 17 step 194200 | 10680 batches | lr 5.18e-07 | ms/batch 690.06 | loss 3.03 | ppl 20.726 +| epoch 17 step 194400 | 10880 batches | lr 4.83e-07 | ms/batch 694.26 | loss 3.01 | ppl 20.253 +| epoch 17 step 194600 | 11080 batches | lr 4.49e-07 | ms/batch 691.17 | loss 3.05 | ppl 21.187 
+| epoch 17 step 194800 | 11280 batches | lr 4.17e-07 | ms/batch 706.39 | loss 3.05 | ppl 21.185 +| epoch 18 step 195000 | 10 batches | lr 3.85e-07 | ms/batch 710.81 | loss 3.04 | ppl 20.965 +| epoch 18 step 195200 | 210 batches | lr 3.55e-07 | ms/batch 698.26 | loss 3.01 | ppl 20.292 +| epoch 18 step 195400 | 410 batches | lr 3.26e-07 | ms/batch 694.39 | loss 3.04 | ppl 20.958 +| epoch 18 step 195600 | 610 batches | lr 2.98e-07 | ms/batch 691.04 | loss 3.01 | ppl 20.287 +| epoch 18 step 195800 | 810 batches | lr 2.72e-07 | ms/batch 701.33 | loss 3.05 | ppl 21.051 +| epoch 18 step 196000 | 1010 batches | lr 2.47e-07 | ms/batch 719.59 | loss 3.01 | ppl 20.240 +---------------------------------------------------------------------------------------------------- +| Eval 49 at step 196000 | time: 2804.08s | valid loss 3.15 | valid ppl 23.395 +---------------------------------------------------------------------------------------------------- +| epoch 18 step 196200 | 1210 batches | lr 2.23e-07 | ms/batch 743.40 | loss 3.04 | ppl 20.868 +| epoch 18 step 196400 | 1410 batches | lr 2e-07 | ms/batch 688.08 | loss 3.03 | ppl 20.707 +| epoch 18 step 196600 | 1610 batches | lr 1.78e-07 | ms/batch 698.43 | loss 3.01 | ppl 20.227 +| epoch 18 step 196800 | 1810 batches | lr 1.58e-07 | ms/batch 698.99 | loss 3.04 | ppl 20.847 +| epoch 18 step 197000 | 2010 batches | lr 1.39e-07 | ms/batch 711.49 | loss 3.06 | ppl 21.434 +| epoch 18 step 197200 | 2210 batches | lr 1.21e-07 | ms/batch 699.04 | loss 3.05 | ppl 21.071 +| epoch 18 step 197400 | 2410 batches | lr 1.04e-07 | ms/batch 678.89 | loss 3.04 | ppl 20.965 +| epoch 18 step 197600 | 2610 batches | lr 8.88e-08 | ms/batch 705.13 | loss 3.03 | ppl 20.720 +| epoch 18 step 197800 | 2810 batches | lr 7.46e-08 | ms/batch 712.00 | loss 3.01 | ppl 20.327 +| epoch 18 step 198000 | 3010 batches | lr 6.17e-08 | ms/batch 711.63 | loss 3.03 | ppl 20.694 +| epoch 18 step 198200 | 3210 batches | lr 5e-08 | ms/batch 692.05 | loss 3.03 | ppl 20.710 +| epoch 18 step 198400 | 3410 batches | lr 3.95e-08 | ms/batch 685.17 | loss 3.04 | ppl 20.895 +| epoch 18 step 198600 | 3610 batches | lr 3.02e-08 | ms/batch 692.91 | loss 3.01 | ppl 20.257 +| epoch 18 step 198800 | 3810 batches | lr 2.22e-08 | ms/batch 685.56 | loss 3.03 | ppl 20.780 +| epoch 18 step 199000 | 4010 batches | lr 1.54e-08 | ms/batch 699.55 | loss 3.05 | ppl 21.096 +| epoch 18 step 199200 | 4210 batches | lr 9.87e-09 | ms/batch 690.53 | loss 3.03 | ppl 20.654 +| epoch 18 step 199400 | 4410 batches | lr 5.55e-09 | ms/batch 688.91 | loss 3.04 | ppl 20.838 +| epoch 18 step 199600 | 4610 batches | lr 2.47e-09 | ms/batch 711.03 | loss 3.04 | ppl 20.891 +| epoch 18 step 199800 | 4810 batches | lr 6.17e-10 | ms/batch 686.10 | loss 3.02 | ppl 20.406 +| epoch 18 step 200000 | 5010 batches | lr 0 | ms/batch 702.14 | loss 3.05 | ppl 21.176 +---------------------------------------------------------------------------------------------------- +| Eval 50 at step 200000 | time: 2793.85s | valid loss 3.15 | valid ppl 23.396 +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.19 | test ppl 24.241 +==================================================================================================== diff --git 
a/NLP/Transformer-XL/mem_transformer.py b/NLP/Transformer-XL/mem_transformer.py new file mode 100644 index 0000000..45147df --- /dev/null +++ b/NLP/Transformer-XL/mem_transformer.py @@ -0,0 +1,812 @@ +import sys +import math +import functools + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.append('utils') +from proj_adaptive_softmax import ProjectedAdaptiveLogSoftmax +from log_uniform_sampler import LogUniformSampler, sample_logits + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = 
self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) + + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:]) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, 
self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -float('inf')).type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -float('inf')).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is 
not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def 
forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed = emb_flat.view(*inp.size(), self.d_proj) + + embed.mul_(self.emb_scale) + + return embed + +class MemTransformerLM(nn.Module): + def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner, + dropout, dropatt, tie_weight=True, d_embed=None, + div_val=1, tie_projs=[False], pre_lnorm=False, + tgt_len=None, ext_len=None, mem_len=None, + cutoffs=[], adapt_inp=False, + same_length=False, attn_type=0, clamp_len=-1, + sample_softmax=-1): + super(MemTransformerLM, self).__init__() + self.n_token = n_token + + d_embed = d_model if d_embed is None else d_embed + self.d_embed = d_embed + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + + self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, + div_val=div_val) + + self.drop = nn.Dropout(dropout) + + self.n_layer = n_layer + + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + self.max_klen = tgt_len + ext_len + mem_len + + self.attn_type = attn_type + + self.layers = nn.ModuleList() + if attn_type == 0: # the default attention + for i in range(n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type == 1: # learnable embeddings + for i in range(n_layer): + self.layers.append( + RelLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type in [2, 3]: # absolute embeddings + for i in range(n_layer): + self.layers.append( + DecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + self.sample_softmax = sample_softmax + # use sampled softmax + if sample_softmax > 0: + self.out_layer = nn.Linear(d_model, n_token) + if tie_weight: + self.out_layer.weight = self.word_emb.weight + self.tie_weight = tie_weight + self.sampler = LogUniformSampler(n_token, sample_softmax) + + # use adaptive softmax (including standard softmax) + else: + self.crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, + cutoffs, div_val=div_val) + + if tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight + + if tie_projs: + for i, tie_proj in enumerate(tie_projs): + if tie_proj and div_val == 1 and d_model != d_embed: + self.crit.out_projs[i] = self.word_emb.emb_projs[0] + elif tie_proj and div_val != 1: + self.crit.out_projs[i] = self.word_emb.emb_projs[i] + + self.same_length = same_length + self.clamp_len = clamp_len + + self._create_params() + + def backward_compatible(self): + self.sample_softmax = -1 + + def _create_params(self): + if self.attn_type == 0: 
# default attention + self.pos_emb = PositionalEmbedding(self.d_model) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer+1): + empty = torch.empty(0, dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. + with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + + word_emb = self.word_emb(dec_inp) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, self.r_w_bias, + self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + hids.append(core_out) + for i, layer in enumerate(self.layers): + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + 
elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, data, target, *mems): + # nn.DataParallel does not allow size(0) tensors to be broadcasted. + # So, have to initialize size(0) mems inside the model forward. + # Moreover, have to return new_mems to allow nn.DataParallel to piece + # them together. + if not mems: mems = self.init_mems() + + tgt_len = target.size(0) + hidden, new_mems = self._forward(data, mems=mems) + + pred_hid = hidden[-tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.tie_weight + logit = sample_logits(self.word_emb, + self.out_layer.bias, target, pred_hid, self.sampler) + loss = -F.log_softmax(logit, -1)[:, :, 0] + else: + loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1)) + loss = loss.view(tgt_len, -1) + + if new_mems is None: + return [loss] + else: + return [loss] + new_mems + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='unit test') + + parser.add_argument('--n_layer', type=int, default=4, help='') + parser.add_argument('--n_rel_layer', type=int, default=4, help='') + parser.add_argument('--n_head', type=int, default=2, help='') + parser.add_argument('--d_head', type=int, default=2, help='') + parser.add_argument('--d_model', type=int, default=200, help='') + parser.add_argument('--d_embed', type=int, default=200, help='') + parser.add_argument('--d_inner', type=int, default=200, help='') + parser.add_argument('--dropout', type=float, default=0.0, help='') + parser.add_argument('--cuda', action='store_true', help='') + parser.add_argument('--seed', type=int, default=1111, help='') + parser.add_argument('--multi_gpu', action='store_true', help='') + + args = parser.parse_args() + + device = torch.device("cuda" if args.cuda else "cpu") + + B = 4 + tgt_len, mem_len, ext_len = 36, 36, 0 + data_len = tgt_len * 20 + args.n_token = 10000 + + import data_utils + + data = torch.LongTensor(data_len*B).random_(0, args.n_token).to(device) + diter = data_utils.LMOrderedIterator(data, B, tgt_len, device=device, ext_len=ext_len) + + cutoffs = [args.n_token // 2] + tie_projs = [False] + [True] * len(cutoffs) + + for div_val in [1, 2]: + for d_embed in [200, 100]: + model = MemTransformerLM(args.n_token, args.n_layer, args.n_head, + 
args.d_model, args.d_head, args.d_inner, args.dropout, + dropatt=args.dropout, tie_weight=True, + d_embed=d_embed, div_val=div_val, + tie_projs=tie_projs, pre_lnorm=True, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + cutoffs=cutoffs, attn_type=0).to(device) + + print(sum(p.numel() for p in model.parameters())) + + mems = tuple() + for idx, (inp, tgt, seqlen) in enumerate(diter): + print('batch {}'.format(idx)) + out = model(inp, tgt, *mems) + mems = out[1:] diff --git a/NLP/Transformer-XL/run_wt103_adan.sh b/NLP/Transformer-XL/run_wt103_adan.sh new file mode 100644 index 0000000..8ea88ff --- /dev/null +++ b/NLP/Transformer-XL/run_wt103_adan.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +if [[ $1 == 'train' ]]; then + echo 'Run training...' + python train.py \ + --cuda \ + --data /root/autodl-tmp/data/wikitext-103/ \ + --dataset wt103 \ + --adaptive \ + --n_layer 16 \ + --d_model 410 \ + --n_head 10 \ + --d_head 41 \ + --d_inner 2100 \ + --dropout 0.1 \ + --dropatt 0.0 \ + --optim adan \ + --wd 0.02 \ + --lr 0.0015 \ + --opt-betas 0.9 0.9 0.999 \ + --clip 0.25 \ + --lr_min 1e-6 \ + --warmup_step 5000 \ + --max_step 200000 \ + --tgt_len 150 \ + --mem_len 150 \ + --eval_tgt_len 150 \ + --batch_size 60 \ + --multi_gpu \ + --gpu0_bsz 4 \ + ${@:2} +elif [[ $1 == 'eval' ]]; then + echo 'Run evaluation...' + python eval.py \ + --cuda \ + --data /root/autodl-tmp/data/wikitext-103/ \ + --dataset wt103 \ + --tgt_len 64 \ + --mem_len 640 \ + --clamp_len 400 \ + --same_length \ + --split test \ + ${@:2} +else + echo 'unknown argment 1' +fi diff --git a/NLP/Transformer-XL/train.py b/NLP/Transformer-XL/train.py new file mode 100644 index 0000000..be07202 --- /dev/null +++ b/NLP/Transformer-XL/train.py @@ -0,0 +1,581 @@ +# coding: utf-8 +import argparse +import time +import math +import os, sys +import itertools + +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +from adan import Adan + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/wikitext-103', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='wt103', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=10, + help='number of heads') +parser.add_argument('--d_head', type=int, default=50, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=500, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=1000, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.0, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + 
help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad', 'adan'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--wd', type=float, default=0.02, + help='weight decayss') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=60, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=70, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=50, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=0, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--cuda', action='store_true', + help='use CUDA') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention 
type. 0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--fp16', action='store_true', + help='Run in pseudo-fp16 mode (fp16 storage fp32 math).') +parser.add_argument('--static-loss-scale', type=float, default=1, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + +# Set the random seed manually for reproducibility. 
+np.random.seed(args.seed) +torch.manual_seed(args.seed) +if torch.cuda.is_available(): + if not args.cuda: + print('WARNING: You have a CUDA device, so you should probably run with --cuda') + else: + torch.cuda.manual_seed_all(args.seed) + +# Validate `--fp16` option +if args.fp16: + if not args.cuda: + print('WARNING: --fp16 requires --cuda, ignoring --fp16 option') + args.fp16 = False + else: + try: + from apex.fp16_utils import FP16_Optimizer + except: + print('WARNING: apex not installed, ignoring --fp16 option') + args.fp16 = False + +device = torch.device('cuda' if args.cuda else 'cpu') + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +eval_batch_size = 10 +tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def 
update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +if args.restart: + with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) + if not args.fp16: + model = model.float() + model.apply(update_dropout) + model.apply(update_dropatt) +else: + model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + +if args.fp16: + model = model.half() + +if args.multi_gpu: + model = model.to(device) + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(device) + else: + para_model = nn.DataParallel(model, dim=1).to(device) +else: + para_model = model.to(device) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = optim.Adam(model.parameters(), lr=args.lr) + +elif args.optim.lower() == 'adan': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = Adan(sparse_params,betas=args.opt_betas, lr=args.lr, weight_decay= args.wd) + optimizer = Adan(dense_params, lr=args.lr,betas=args.opt_betas, weight_decay= args.wd) + else: + optimizer = Adan(model.parameters(), lr=args.lr, betas=args.opt_betas, weight_decay= args.wd) + +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + +#### scheduler +if args.scheduler == 'cosine': + # here we do not set eta_min to lr_min to be backward compatible + # because in previous versions eta_min is default to 0 + # rather than the default value of lr_min 1e-6 + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) # should use eta_min arg + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) # should use eta_min arg +elif args.scheduler == 'inv_sqrt': + # originally used for Transformer (in Attention is all 
you need) + def lr_lambda(step): + # return a multiplier instead of a learning rate + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. / (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) +elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) +elif args.scheduler == 'constant': + pass + +if args.cuda and args.fp16: + # If args.dynamic_loss_scale is False, static_loss_scale will be used. + # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale. + optimizer = FP16_Optimizer(optimizer, + static_loss_scale = args.static_loss_scale, + dynamic_loss_scale = args.dynamic_loss_scale, + dynamic_loss_args = {'init_scale': 2 ** 16}) + +if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + +logging('=' * 100) +for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +############################################################################### +# Training code +############################################################################### + +def evaluate(eval_iter): + # Turn on evaluation mode which disables dropout. + model.eval() + + # If the model does not use memory at all, make the ext_len longer. + # Otherwise, make the mem_len longer and keep the ext_len the same. + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + # Switch back to the training mode + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + + return total_loss / total_len + + +def train(): + # Turn on training mode which enables dropout. 
+ global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + model.train() + if args.batch_chunk > 1: + mems = [tuple() for _ in range(args.batch_chunk)] + else: + mems = tuple() + train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter + for batch, (data, target, seq_len) in enumerate(train_iter): + model.zero_grad() + if args.batch_chunk > 1: + data_chunks = torch.chunk(data, args.batch_chunk, 1) + target_chunks = torch.chunk(target, args.batch_chunk, 1) + for i in range(args.batch_chunk): + data_i = data_chunks[i].contiguous() + target_i = target_chunks[i].contiguous() + ret = para_model(data_i, target_i, *mems[i]) + loss, mems[i] = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) / args.batch_chunk + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + train_loss += loss.float().item() + else: + ret = para_model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + train_loss += loss.float().item() + + if args.fp16: + optimizer.clip_master_grads(args.clip) + else: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) + + optimizer.step() + if args.sample_softmax > 0: + optimizer_sparse.step() + + # step-wise learning rate annealing + train_step += 1 + if args.scheduler in ['cosine', 'constant', 'dev_perf']: + # linear warmup stage + if train_step < args.warmup_step: + curr_lr = args.lr * train_step / args.warmup_step + optimizer.param_groups[0]['lr'] = curr_lr + if args.sample_softmax > 0: + optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 + else: + if args.scheduler == 'cosine': + scheduler.step(train_step) + if args.sample_softmax > 0: + scheduler_sparse.step(train_step) + elif args.scheduler == 'inv_sqrt': + scheduler.step(train_step) + + if train_step % args.log_interval == 0: + cur_loss = train_loss / args.log_interval + elapsed = time.time() - log_start_time + log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ + '| ms/batch {:5.2f} | loss {:5.2f}'.format( + epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, cur_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) + else: + log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) + logging(log_str) + train_loss = 0 + log_start_time = time.time() + + if train_step % args.eval_interval == 0: + val_loss = evaluate(va_iter) + logging('-' * 100) + log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ + '| valid loss {:5.2f}'.format( + train_step // args.eval_interval, train_step, + (time.time() - eval_start_time), val_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) + else: + log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) + logging(log_str) + logging('-' * 100) + # Save the model if the validation loss is the best we've seen so far. 
+ if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f: + torch.save(model, f) + with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + break + +# Loop over epochs. +train_step = 0 +train_loss = 0 +best_val_loss = None + +log_start_time = time.time() +eval_start_time = time.time() + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + break +except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + +# Load the best saved model. +with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) +para_model = model.to(device) + +# Run on test data. +test_loss = evaluate(te_iter) +logging('=' * 100) +if args.dataset in ['enwik8', 'text8']: + logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( + test_loss, test_loss / math.log(2))) +else: + logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( + test_loss, math.exp(test_loss))) +logging('=' * 100) diff --git a/NLP/Transformer-XL/utils/adaptive_softmax.py b/NLP/Transformer-XL/utils/adaptive_softmax.py new file mode 100644 index 0000000..68ae016 --- /dev/null +++ b/NLP/Transformer-XL/utils/adaptive_softmax.py @@ -0,0 +1,90 @@ +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class AdaptiveLogSoftmax(nn.Module): + def __init__(self, in_features, n_classes, cutoffs, keep_order=False): + super(AdaptiveLogSoftmax, self).__init__() + + cutoffs = list(cutoffs) + + if (cutoffs != sorted(cutoffs)) \ + or (min(cutoffs) <= 0) \ + or (max(cutoffs) >= (n_classes - 1)) \ + or (len(set(cutoffs)) != len(cutoffs)) \ + or any([int(c) != c for c in cutoffs]): + + raise ValueError("cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1") + + self.in_features = in_features + self.n_classes = n_classes + self.cutoffs = cutoffs + [n_classes] + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.keep_order = keep_order + + + def forward(self, hidden, target, weight, bias, keep_order=False): + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + head_weight = torch.cat( + [weight[:self.shortlist_size], self.cluster_weight], dim=0) + head_bias = torch.cat( + [bias[:self.shortlist_size], self.cluster_bias], dim=0) + + head_logit = F.linear(hidden, head_weight, bias=head_bias) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, h_idx = 
cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < h_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i = weight[l_idx:h_idx] + bias_i = bias[l_idx:h_idx] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/NLP/Transformer-XL/utils/data_parallel.py b/NLP/Transformer-XL/utils/data_parallel.py new file mode 100644 index 0000000..d7e1811 --- /dev/null +++ b/NLP/Transformer-XL/utils/data_parallel.py @@ -0,0 +1,91 @@ + +from torch.nn.parallel import DataParallel +import torch +from torch.nn.parallel._functions import Scatter +from torch.nn.parallel.parallel_apply import parallel_apply + +def scatter(inputs, target_gpus, chunk_sizes, dim=0): + r""" + Slices tensors into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not tensors. + """ + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + try: + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + except: + print('obj', obj.size()) + print('dim', dim) + print('chunk_sizes', chunk_sizes) + quit() + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict) and len(obj) > 0: + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). 
To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + +def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs + +class BalancedDataParallel(DataParallel): + def __init__(self, gpu0_bsz, *args, **kwargs): + self.gpu0_bsz = gpu0_bsz + super().__init__(*args, **kwargs) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + if self.gpu0_bsz == 0: + device_ids = self.device_ids[1:] + else: + device_ids = self.device_ids + inputs, kwargs = self.scatter(inputs, kwargs, device_ids) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids) + if self.gpu0_bsz == 0: + replicas = replicas[1:] + outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) + return self.gather(outputs, self.output_device) + + def parallel_apply(self, replicas, device_ids, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, device_ids) + + def scatter(self, inputs, kwargs, device_ids): + bsz = inputs[0].size(self.dim) + num_dev = len(self.device_ids) + gpu0_bsz = self.gpu0_bsz + bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) + if gpu0_bsz < bsz_unit: + chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) + delta = bsz - sum(chunk_sizes) + for i in range(delta): + chunk_sizes[i + 1] += 1 + if gpu0_bsz == 0: + chunk_sizes = chunk_sizes[1:] + else: + return super().scatter(inputs, kwargs, device_ids) + return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) + diff --git a/NLP/Transformer-XL/utils/exp_utils.py b/NLP/Transformer-XL/utils/exp_utils.py new file mode 100644 index 0000000..e44f7c2 --- /dev/null +++ b/NLP/Transformer-XL/utils/exp_utils.py @@ -0,0 +1,40 @@ +import functools +import os, shutil + +import numpy as np + +import torch + + +def logging(s, log_path, print_=True, log_=True): + if print_: + print(s) + if log_: + with open(log_path, 'a+') as f_log: + f_log.write(s + '\n') + +def get_logger(log_path, **kwargs): + return functools.partial(logging, log_path=log_path, **kwargs) + +def create_exp_dir(dir_path, scripts_to_save=None, debug=False): + if debug: + print('Debug Mode : no experiment dir created') + return functools.partial(logging, log_path=None, log_=False) + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + print('Experiment dir : {}'.format(dir_path)) + if scripts_to_save is not None: + script_path = os.path.join(dir_path, 'scripts') + if not os.path.exists(script_path): + os.makedirs(script_path) + for script in scripts_to_save: + dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script)) + shutil.copyfile(script, dst_file) + + return get_logger(log_path=os.path.join(dir_path, 'log.txt')) + +def save_checkpoint(model, optimizer, path, epoch): + torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) + torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) diff --git 
a/NLP/Transformer-XL/utils/log_uniform_sampler.py b/NLP/Transformer-XL/utils/log_uniform_sampler.py new file mode 100644 index 0000000..503f635 --- /dev/null +++ b/NLP/Transformer-XL/utils/log_uniform_sampler.py @@ -0,0 +1,147 @@ +import torch +from torch import nn +import numpy as np + +class LogUniformSampler(object): + def __init__(self, range_max, n_sample): + """ + Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py + `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` + + expected count can be approximated by 1 - (1 - p)^n + and we use a numerically stable version -expm1(num_tries * log1p(-p)) + + Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run + """ + with torch.no_grad(): + self.range_max = range_max + log_indices = torch.arange(1., range_max+2., 1.).log_() + self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + # print('P', self.dist.numpy().tolist()[-30:]) + + self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + + self.n_sample = n_sample + + def sample(self, labels): + """ + labels: [b1, b2] + Return + true_log_probs: [b1, b2] + samp_log_probs: [n_sample] + neg_samples: [n_sample] + """ + + # neg_samples = torch.empty(0).long() + n_sample = self.n_sample + n_tries = 2 * n_sample + + with torch.no_grad(): + neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() + device = labels.device + neg_samples = neg_samples.to(device) + true_log_probs = self.log_q[labels].to(device) + samp_log_probs = self.log_q[neg_samples].to(device) + return true_log_probs, samp_log_probs, neg_samples + +def sample_logits(embedding, bias, labels, inputs, sampler): + """ + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, -1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits + + +# class LogUniformSampler(object): +# def __init__(self, range_max, unique=False): +# """ +# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py +# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` +# """ +# self.range_max = range_max +# log_indices = torch.arange(1., range_max+2., 1.).log_() +# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + +# self.unique = unique + +# if self.unique: +# self.exclude_mask = torch.ByteTensor(range_max).fill_(0) + +# def sample(self, n_sample, labels): +# pos_sample, new_labels = labels.unique(return_inverse=True) +# n_pos_sample = pos_sample.size(0) +# n_neg_sample = n_sample - n_pos_sample + +# if self.unique: +# 
self.exclude_mask.index_fill_(0, pos_sample, 1) +# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) +# self.exclude_mask.index_fill_(0, pos_sample, 0) +# else: +# sample_dist = self.dist + +# neg_sample = torch.multinomial(sample_dist, n_neg_sample) + +# sample = torch.cat([pos_sample, neg_sample]) +# sample_prob = self.dist[sample] + +# return new_labels, sample, sample_prob + + +if __name__ == '__main__': + S, B = 3, 4 + n_vocab = 10000 + n_sample = 5 + H = 32 + + labels = torch.LongTensor(S, B).random_(0, n_vocab) + + # sampler = LogUniformSampler(n_vocab, unique=False) + # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) + + sampler = LogUniformSampler(n_vocab, unique=True) + # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) + + # print('true_probs', true_probs.numpy().tolist()) + # print('samp_probs', samp_probs.numpy().tolist()) + # print('neg_samples', neg_samples.numpy().tolist()) + + # print('sum', torch.sum(sampler.dist).item()) + + # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item() + + embedding = nn.Embedding(n_vocab, H) + bias = torch.zeros(n_vocab) + inputs = torch.Tensor(S, B, H).normal_() + + logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) + print('logits', logits.detach().numpy().tolist()) + print('logits shape', logits.size()) + print('out_labels', out_labels.detach().numpy().tolist()) + print('out_labels shape', out_labels.size()) + diff --git a/NLP/Transformer-XL/utils/proj_adaptive_softmax.py b/NLP/Transformer-XL/utils/proj_adaptive_softmax.py new file mode 100644 index 0000000..a0fbfeb --- /dev/null +++ b/NLP/Transformer-XL/utils/proj_adaptive_softmax.py @@ -0,0 +1,151 @@ +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = 
F.linear(proj_hid, weight, bias=bias) + # else: + # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + # if bias is not None: + # logit = logit + bias + + return logit + + def forward(self, hidden, target, keep_order=False): + ''' + hidden :: [len*bsz x d_proj] + target :: [len*bsz] + ''' + + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + nll = -F.log_softmax(logit, dim=-1) \ + .gather(1, target.unsqueeze(1)).squeeze(1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/NLP/Transformer-XL/utils/vocabulary.py b/NLP/Transformer-XL/utils/vocabulary.py new file mode 100644 index 0000000..b6b8249 --- /dev/null +++ b/NLP/Transformer-XL/utils/vocabulary.py @@ -0,0 +1,163 @@ +import os +from collections import Counter, OrderedDict + +import torch + +class Vocab(object): + def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True, + delimiter=None, vocab_file=None): + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + + def tokenize(self, line, add_eos=False, add_double_eos=False): + line = line.strip() + # convert to lower case + if self.lower_case: + line = line.lower() + + # empty delimiter '' will evaluate False + if self.delimiter == '': + symbols = line + else: + symbols = line.split(self.delimiter) + + if add_double_eos: # lm1b + return [''] + 
symbols + [''] + elif add_eos: + return symbols + [''] + else: + return symbols + + def count_file(self, path, verbose=False, add_eos=False): + if verbose: print('counting file {} ...'.format(path)) + assert os.path.exists(path) + + sents = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos) + self.counter.update(symbols) + sents.append(symbols) + + return sents + + def count_sents(self, sents, verbose=False): + """ + sents : a list of sentences, each a list of tokenized symbols + """ + if verbose: print('counting {} sents ...'.format(len(sents))) + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + self.counter.update(symbols) + + def _build_from_file(self, vocab_file): + self.idx2sym = [] + self.sym2idx = OrderedDict() + + with open(vocab_file, 'r', encoding='utf-8') as f: + for line in f: + symb = line.strip().split()[0] + self.add_symbol(symb) + self.unk_idx = self.sym2idx[''] + + def build_vocab(self): + if self.vocab_file: + print('building vocab from {}'.format(self.vocab_file)) + self._build_from_file(self.vocab_file) + print('final vocab size {}'.format(len(self))) + else: + print('building vocab with min_freq={}, max_size={}'.format( + self.min_freq, self.max_size)) + self.idx2sym = [] + self.sym2idx = OrderedDict() + + for sym in self.special: + self.add_special(sym) + + for sym, cnt in self.counter.most_common(self.max_size): + if cnt < self.min_freq: break + self.add_symbol(sym) + + print('final vocab size {} from {} unique tokens'.format( + len(self), len(self.counter))) + + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, + add_double_eos=False): + if verbose: print('encoding file {} ...'.format(path)) + assert os.path.exists(path) + encoded = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos, + add_double_eos=add_double_eos) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def encode_sents(self, sents, ordered=False, verbose=False): + if verbose: print('encoding {} sents ...'.format(len(sents))) + encoded = [] + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def add_special(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym]) + + def add_symbol(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + + def get_sym(self, idx): + assert 0 <= idx < len(self), 'Index {} out of range'.format(idx) + return self.idx2sym[idx] + + def get_idx(self, sym): + if sym in self.sym2idx: + return self.sym2idx[sym] + else: + # print('encounter unk {}'.format(sym)) + assert '' not in sym + assert hasattr(self, 'unk_idx') + return self.sym2idx.get(sym, self.unk_idx) + + def get_symbols(self, indices): + return [self.get_sym(idx) for idx in indices] + + def get_indices(self, symbols): + return [self.get_idx(sym) for sym in symbols] + + def 
convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.get_indices(symbols))
+
+    def convert_to_sent(self, indices, exclude=None):
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1b6a463
--- /dev/null
+++ b/README.md
@@ -0,0 +1,135 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+This is an official PyTorch implementation of **Adan**. See the paper [here](https://arxiv.org/abs/2208.06677). If you find Adan helpful or inspiring for your projects, please cite this paper and also star this repository. Thanks!
+
+```tex
+@article{xie2022adan,
+  title={Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models},
+  author={Xie, Xingyu and Zhou, Pan and Li, Huan and Lin, Zhouchen and Yan, Shuicheng},
+  journal={arXiv preprint arXiv:2208.06677},
+  year={2022}
+}
+```
+
+## Usage
+
+For your convenience, we briefly provide some intuitive usage instructions below, then give some general experimental tips, and finally point to more details (e.g. the specific commands and hyper-parameters) for each experiment in the paper.
+
+#### 1) Two steps to use Adan
+
+**Step 1.** Add the Adan-dependent hyper-parameters to the config:
+
+```python
+parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
+parser.add_argument('--weight-decay', type=float, default=0.02, help='weight decay, similar to the one used in AdamW (default: 0.02)')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use opt default 1e-8 in adan)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use opt default [0.98, 0.92, 0.99] in Adan)')
+parser.add_argument('--no-prox', action='store_true', default=False, help='whether to perform weight decay like AdamW (default: False)')
+```
+`no-prox` determines the update rule for parameters with weight decay. By default, Adan updates the parameters in the way presented in Algorithm 1 of the paper:
+
+$$\boldsymbol{\theta}_{k+1} = ( 1+\lambda \eta)^{-1}\left[\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k)\right],$$
+
+but one can also update the parameters like AdamW:
+
+$$\boldsymbol{\theta}_{k+1} = ( 1-\lambda \eta)\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k).$$
+
+In all experiments in our paper, we set `no-prox=False`.
+
+**Step 2.** Create the Adan optimizer as follows. In this step, we can directly replace the vanilla optimizer with the following command:
+
+```python
+from adan import Adan
+optimizer = Adan(param, lr=args.lr, weight_decay=args.weight_decay, betas=args.opt_betas, eps=args.opt_eps, max_grad_norm=args.max_grad_norm, no_prox=args.no_prox)
+```
+
+#### 2) Tips for Experiments
+
+- To keep Adan simple, we do not use the restart strategy in any experiment except Table 12 in the paper. Table 12 shows that the restart strategy can further slightly improve the performance of Adan.
+- Adan often allows one to use a large peak learning rate at which other optimizers, e.g. Adam and AdamW, often fail. For example, in all experiments except the MAE pre-training and LSTM experiments, the learning rate used by Adan is **5-10 times larger** than that used with Adam/AdamW.
+- Adan seems to prefer a large batch size in large-scale experiments, e.g. a total batch size of 2,048 in our paper.
+- Adan is relatively robust to `beta1`, `beta2`, and `beta3`, especially to `beta2`. If you want better performance, you can first tune `beta3` and then `beta1`.
+- Interestingly, we found that `weight_decay = 0.02` is suitable for all experiments in our paper.
+
+#### 3) More detailed steps to reproduce the experimental results in the paper
+
+Please refer to the following links for detailed steps. In these detailed steps, we even include the **docker images** for reproducibility.
+
+- [Instruction](./CV/timm/) for **ViTs**, **ResNets**, and **ConvNext**.
+- [Instruction](./CV/MAE/) for **MAE**.
+- [Instruction](./NLP/BERT/) for **BERT**.
+- [Instruction](./NLP/Transformer-XL/) for **Transformer-XL**.
+
+## Model Zoo
+
+### Results on vision tasks
+
+For your convenience, we provide the configs and log files for the experiments on ImageNet-1k.
+
+| Model | Epoch | Training Setting | Acc. (%) | Config | Download |
+| ------------- | :-----: | :-----: | :------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| ViT-S | 150 | I | 80.1 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv)/model |
+| ViT-S | 150 | II | 79.6 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_150.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_150.csv)/model |
+| ViT-S | 300 | I | 81.1 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv)/model |
+| ViT-S | 300 | II | 80.7 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_300.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_300.csv)/model |
+| ViT-B | 150 | II | 81.7 | [config](./CV/timm/exp_results/ViT/base/args_vit-B_150.yaml) | [log](./CV/timm/exp_results/ViT/base/summary_vit-B_150.csv)/model |
+| ViT-B | 300 | II | 82.3 | [config](./CV/timm/exp_results/ViT/base/args_vit-B_300.yaml) | [log](./CV/timm/exp_results/ViT/base/summary_vit-B_300.csv)/model |
+| ResNet-50 | 100 | I | 78.1 | [config](./CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml) | [log](./CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv)/model |
+| ResNet-50 | 200 | I | 79.7 | [config](./CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml) | [log](./CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv)/model |
+| ResNet-50 | 300 | I | 80.2 | [config](./CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml) | [log](./CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv)/model |
+| ConvNext-tiny | 150 | II | 81.7 | [config](./CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml) | [log](./CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv)/model |
+| ConvNext-tiny | 300 | II | 82.4 | [config](./CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml) | [log](./CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv)/model |
+| MAE-small | 800+100 | --- | 83.8 | [config](./CV/MAE/README.md) | [log-pretrain](./CV/MAE/exp_results/MAE/base/log_base_pretrain.txt)/[log-finetune](./CV/MAE/exp_results/MAE/base/log_base_ft.txt)/model |
+| MAE-Large | 800+50 | --- | 85.9 | [config](./CV/MAE/README.md) | [log-pretrain](./CV/MAE/exp_results/MAE/large/log_large_pretrain.txt)/[log-finetune](./CV/MAE/exp_results/MAE/large/log_large_ft.txt)/model |
+
+### Results on NLP tasks
+
+#### BERT-base
+
+We give the configs and log files for the BERT-base model pre-trained on the Bookcorpus and Wikipedia datasets and fine-tuned on GLUE tasks. Note that we provide the config, the log file, and a detailed [instruction](./NLP/BERT/README.md) for BERT-base in the folder `./NLP/BERT`.
+
+| Pretraining | Config | Log | Model |
+| --------- | :--------: | :--------- | :--------: |
+| Adan | [config](./NLP/BERT/config/pretraining/bert-adan.yaml) | [log](./NLP/BERT/exp_results/pretrain/hydra_train-adan.log) | model |
+
+| Fine-tuning on GLUE-Task | Metric | Result | Config |
+| -------------- | :--------------------------- | :-------: | :-----------------------------------------------------: |
+| CoLA | Matthew's corr. | 64.6 | [config](./NLP/BERT/config/finetuning/cola-adan.yaml) |
+| SST-2 | Accuracy | 93.2 | [config](./NLP/BERT/config/finetuning/sst_2-adan.yaml) |
+| STS-B | Pearson corr. | 89.3 | [config](./NLP/BERT/config/finetuning/sts_b-adan.yaml) |
+| QQP | Accuracy | 91.2 | [config](./NLP/BERT/config/finetuning/qqp-adan.yaml) |
+| MNLI | Matched acc./Mismatched acc. | 85.7/85.6 | [config](./NLP/BERT/config/finetuning/mnli-adan.yaml) |
+| QNLI | Accuracy | 91.3 | [config](./NLP/BERT/config/finetuning/qnli-adan.yaml) |
+| RTE | Accuracy | 73.3 | [config](./NLP/BERT/config/finetuning/rte-adan.yaml) |
+
+#### Transformer-XL-base
+
+We provide the config and log for Transformer-XL-base trained on the WikiText-103 dataset.
+
+| | Steps | Test PPL | Download |
+| ------------------- | :---: | :------: | :---------------------------------------------------------: |
+| Baseline (Adam) | 200k | 24.2 | [log&config](./NLP/Transformer-XL/exp_results/log-adam.txt) |
+| Transformer-XL-base | 50k | 26.2 | [log&config](./NLP/Transformer-XL/exp_results/log-50k.txt) |
+| Transformer-XL-base | 100k | 24.2 | [log&config](./NLP/Transformer-XL/exp_results/log-100k.txt) |
+| Transformer-XL-base | 200k | 23.5 | [log&config](./NLP/Transformer-XL/exp_results/log-200k.txt) |
diff --git a/adan.py b/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from timm.utils import *
+
+
+class Adan(Optimizer):
+    """
+    Implements a PyTorch variant of Adan.
+
+    Adan was proposed in
+    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
+    https://arxiv.org/abs/2208.06677
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float, float], optional): coefficients used for computing the
+            running averages of the gradient, its difference, and its square. (default: (0.98, 0.92, 0.99))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0)
+        max_grad_norm (float, optional): value used to clip the
+            global grad norm (default: 0.0, no clipping)
+        no_prox (bool): if True, apply the decoupled weight decay multiplicatively as in AdamW;
+            if False, apply it via the proximal (division) update. (default: False)
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
+                 weight_decay=0.0, max_grad_norm=0.0, no_prox=False):
+        if not 0.0 <= max_grad_norm:
+            raise ValueError("Invalid max grad norm: {}".format(max_grad_norm))
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= betas[2] < 1.0:
+            raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2]))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay,
+                        max_grad_norm=max_grad_norm, no_prox=no_prox)
+        super(Adan, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(Adan, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('no_prox', False)
+
+    @torch.no_grad()
+    def restart_opt(self):
+        """Reset the per-parameter optimizer state and the step counter."""
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                if p.requires_grad:
+                    state = self.state[p]
+                    # State initialization
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p)
+                    # Exponential moving average of gradient difference
+                    state['exp_avg_diff'] = torch.zeros_like(p)
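+    # For reference, step() below performs the following update per parameter
+    # (this summary mirrors the code; g_t is the current, possibly clipped, gradient,
+    # m/v/n are exp_avg / exp_avg_diff / exp_avg_sq, and bc_i are the bias corrections):
+    #   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
+    #   v_t = beta2 * v_{t-1} + (1 - beta2) * (g_t - g_{t-1})
+    #   n_t = beta3 * n_{t-1} + (1 - beta3) * (g_t + beta2 * (g_t - g_{t-1}))**2
+    #   p_t = p_{t-1} - lr * (m_t / bc1 + beta2 * v_t / bc2) / (sqrt(n_t / bc3) + eps)
+    # followed by the decoupled weight decay (multiplicative if no_prox=True, proximal otherwise).
+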
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad