From 920e25b282e99d41eb20d3993485b73d6cc428ba Mon Sep 17 00:00:00 2001 From: Xingyu Xie Date: Thu, 1 Sep 2022 18:50:19 +0800 Subject: [PATCH] code submission --- CV/MAE/README.md | 126 ++ CV/MAE/adan.py | 154 +++ CV/MAE/engine_finetune.py | 131 ++ CV/MAE/engine_pretrain.py | 83 ++ CV/MAE/exp_results/MAE/base/log_base_ft.txt | 100 ++ .../MAE/base/log_base_pretrain.txt | 800 +++++++++++ CV/MAE/exp_results/MAE/large/log_large_ft.txt | 50 + .../MAE/large/log_large_pretrain.txt | 801 +++++++++++ CV/MAE/main_finetune.py | 391 ++++++ CV/MAE/main_linprobe.py | 316 +++++ CV/MAE/main_pretrain.py | 277 ++++ CV/MAE/models_mae.py | 250 ++++ CV/MAE/models_vit.py | 74 + CV/MAE/util/crop.py | 42 + CV/MAE/util/datasets.py | 65 + CV/MAE/util/lars.py | 47 + CV/MAE/util/lr_decay.py | 76 + CV/MAE/util/lr_sched.py | 21 + CV/MAE/util/misc.py | 366 +++++ CV/MAE/util/pos_embed.py | 96 ++ CV/timm/README.md | 79 ++ CV/timm/adan.py | 154 +++ .../ConvNext/small/args_cvnext_150.yaml | 111 ++ .../ConvNext/small/args_cvnext_300.yaml | 111 ++ .../ConvNext/small/summary_cvnext_150.csv | 162 +++ .../ConvNext/small/summary_cvnext_300.csv | 311 +++++ .../ResNet/Res50/args_res50_100.yaml | 111 ++ .../ResNet/Res50/args_res50_200.yaml | 111 ++ .../ResNet/Res50/args_res50_300.yaml | 112 ++ .../ResNet/Res50/summary_res50_100.csv | 111 ++ .../ResNet/Res50/summary_res50_200.csv | 211 +++ .../ResNet/Res50/summary_res50_300.csv | 311 +++++ .../exp_results/ViT/base/args_vit-B_150.yaml | 112 ++ .../exp_results/ViT/base/args_vit-B_300.yaml | 112 ++ .../ViT/base/summary_vit-B_150.csv | 161 +++ .../ViT/base/summary_vit-B_300.csv | 311 +++++ .../ViT/small/args_vit-s_150-I.yaml | 113 ++ .../exp_results/ViT/small/args_vit-s_150.yaml | 111 ++ .../ViT/small/args_vit-s_300-I.yaml | 113 ++ .../exp_results/ViT/small/args_vit-s_300.yaml | 111 ++ .../ViT/small/summary_vit-s_150-I.csv | 171 +++ .../ViT/small/summary_vit-s_150.csv | 162 +++ .../ViT/small/summary_vit-s_300-I.csv | 311 +++++ .../ViT/small/summary_vit-s_300.csv | 311 +++++ CV/timm/optim_factory.py | 343 +++++ CV/timm/sam.py | 62 + CV/timm/supervised.md | 168 +++ CV/timm/train.py | 830 +++++++++++ NLP/BERT/README.md | 213 +++ NLP/BERT/adan.py | 231 ++++ NLP/BERT/config/finetuning/acc_test.py | 116 ++ NLP/BERT/config/finetuning/cola-adan.yaml | 59 + NLP/BERT/config/finetuning/cola.yaml | 59 + NLP/BERT/config/finetuning/mnli-adan.yaml | 59 + NLP/BERT/config/finetuning/mnli.yaml | 59 + NLP/BERT/config/finetuning/qnli-adan.yaml | 59 + NLP/BERT/config/finetuning/qnli.yaml | 59 + NLP/BERT/config/finetuning/qqp-adan.yaml | 59 + NLP/BERT/config/finetuning/qqp.yaml | 59 + NLP/BERT/config/finetuning/rte-adan.yaml | 59 + NLP/BERT/config/finetuning/rte.yaml | 59 + NLP/BERT/config/finetuning/sst_2-adan.yaml | 59 + NLP/BERT/config/finetuning/sst_2.yaml | 59 + NLP/BERT/config/finetuning/sts_b-adan.yaml | 58 + NLP/BERT/config/finetuning/sts_b.yaml | 58 + NLP/BERT/config/pretraining/base.yaml | 42 + NLP/BERT/config/pretraining/bert-adan.yaml | 52 + NLP/BERT/config/pretraining/bert-base.yaml | 54 + .../pretrain/full_config-adam.yaml | 376 +++++ .../pretrain/full_config-adan.yaml | 376 +++++ NLP/Transformer-XL/README.md | 92 ++ NLP/Transformer-XL/adan.py | 154 +++ NLP/Transformer-XL/data_utils.py | 273 ++++ NLP/Transformer-XL/eval.py | 122 ++ NLP/Transformer-XL/exp_results/log-100k.txt | 649 +++++++++ NLP/Transformer-XL/exp_results/log-200k.txt | 1224 +++++++++++++++++ NLP/Transformer-XL/exp_results/log-50k.txt | 360 +++++ NLP/Transformer-XL/exp_results/log-adam.txt | 1224 +++++++++++++++++ 
NLP/Transformer-XL/mem_transformer.py | 812 +++++++++++ NLP/Transformer-XL/run_wt103_adan.sh | 46 + NLP/Transformer-XL/train.py | 581 ++++++++ NLP/Transformer-XL/utils/adaptive_softmax.py | 90 ++ NLP/Transformer-XL/utils/data_parallel.py | 91 ++ NLP/Transformer-XL/utils/exp_utils.py | 40 + .../utils/log_uniform_sampler.py | 147 ++ .../utils/proj_adaptive_softmax.py | 151 ++ NLP/Transformer-XL/utils/vocabulary.py | 163 +++ README.md | 135 ++ adan.py | 154 +++ 89 files changed, 18455 insertions(+) create mode 100644 CV/MAE/README.md create mode 100644 CV/MAE/adan.py create mode 100644 CV/MAE/engine_finetune.py create mode 100644 CV/MAE/engine_pretrain.py create mode 100644 CV/MAE/exp_results/MAE/base/log_base_ft.txt create mode 100644 CV/MAE/exp_results/MAE/base/log_base_pretrain.txt create mode 100644 CV/MAE/exp_results/MAE/large/log_large_ft.txt create mode 100644 CV/MAE/exp_results/MAE/large/log_large_pretrain.txt create mode 100644 CV/MAE/main_finetune.py create mode 100644 CV/MAE/main_linprobe.py create mode 100644 CV/MAE/main_pretrain.py create mode 100644 CV/MAE/models_mae.py create mode 100644 CV/MAE/models_vit.py create mode 100644 CV/MAE/util/crop.py create mode 100644 CV/MAE/util/datasets.py create mode 100644 CV/MAE/util/lars.py create mode 100644 CV/MAE/util/lr_decay.py create mode 100644 CV/MAE/util/lr_sched.py create mode 100644 CV/MAE/util/misc.py create mode 100644 CV/MAE/util/pos_embed.py create mode 100644 CV/timm/README.md create mode 100644 CV/timm/adan.py create mode 100644 CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml create mode 100644 CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml create mode 100644 CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv create mode 100644 CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv create mode 100644 CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml create mode 100644 CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml create mode 100644 CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml create mode 100644 CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv create mode 100644 CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv create mode 100644 CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv create mode 100644 CV/timm/exp_results/ViT/base/args_vit-B_150.yaml create mode 100644 CV/timm/exp_results/ViT/base/args_vit-B_300.yaml create mode 100644 CV/timm/exp_results/ViT/base/summary_vit-B_150.csv create mode 100644 CV/timm/exp_results/ViT/base/summary_vit-B_300.csv create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_150.yaml create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml create mode 100644 CV/timm/exp_results/ViT/small/args_vit-s_300.yaml create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_150.csv create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv create mode 100644 CV/timm/exp_results/ViT/small/summary_vit-s_300.csv create mode 100644 CV/timm/optim_factory.py create mode 100644 CV/timm/sam.py create mode 100644 CV/timm/supervised.md create mode 100644 CV/timm/train.py create mode 100644 NLP/BERT/README.md create mode 100644 NLP/BERT/adan.py create mode 100644 NLP/BERT/config/finetuning/acc_test.py create mode 100644 NLP/BERT/config/finetuning/cola-adan.yaml create mode 100644 NLP/BERT/config/finetuning/cola.yaml create mode 100644 NLP/BERT/config/finetuning/mnli-adan.yaml 
create mode 100644 NLP/BERT/config/finetuning/mnli.yaml
create mode 100644 NLP/BERT/config/finetuning/qnli-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/qnli.yaml
create mode 100644 NLP/BERT/config/finetuning/qqp-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/qqp.yaml
create mode 100644 NLP/BERT/config/finetuning/rte-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/rte.yaml
create mode 100644 NLP/BERT/config/finetuning/sst_2-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/sst_2.yaml
create mode 100644 NLP/BERT/config/finetuning/sts_b-adan.yaml
create mode 100644 NLP/BERT/config/finetuning/sts_b.yaml
create mode 100644 NLP/BERT/config/pretraining/base.yaml
create mode 100644 NLP/BERT/config/pretraining/bert-adan.yaml
create mode 100644 NLP/BERT/config/pretraining/bert-base.yaml
create mode 100644 NLP/BERT/exp_results/pretrain/full_config-adam.yaml
create mode 100644 NLP/BERT/exp_results/pretrain/full_config-adan.yaml
create mode 100644 NLP/Transformer-XL/README.md
create mode 100644 NLP/Transformer-XL/adan.py
create mode 100644 NLP/Transformer-XL/data_utils.py
create mode 100644 NLP/Transformer-XL/eval.py
create mode 100644 NLP/Transformer-XL/exp_results/log-100k.txt
create mode 100644 NLP/Transformer-XL/exp_results/log-200k.txt
create mode 100644 NLP/Transformer-XL/exp_results/log-50k.txt
create mode 100644 NLP/Transformer-XL/exp_results/log-adam.txt
create mode 100644 NLP/Transformer-XL/mem_transformer.py
create mode 100644 NLP/Transformer-XL/run_wt103_adan.sh
create mode 100644 NLP/Transformer-XL/train.py
create mode 100644 NLP/Transformer-XL/utils/adaptive_softmax.py
create mode 100644 NLP/Transformer-XL/utils/data_parallel.py
create mode 100644 NLP/Transformer-XL/utils/exp_utils.py
create mode 100644 NLP/Transformer-XL/utils/log_uniform_sampler.py
create mode 100644 NLP/Transformer-XL/utils/proj_adaptive_softmax.py
create mode 100644 NLP/Transformer-XL/utils/vocabulary.py
create mode 100644 README.md
create mode 100644 adan.py

diff --git a/CV/MAE/README.md b/CV/MAE/README.md
new file mode 100644
index 0000000..918c011
--- /dev/null
+++ b/CV/MAE/README.md
@@ -0,0 +1,126 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+We provide instructions for modifying the official training and fine-tuning files used in [MAE](https://github.com/facebookresearch/mae) so that you can use Adan to train MAE. **Please follow the MAE instructions to install the necessary packages.**
+
+
+
+## Environment
+
+Our experiments for this task are based on the following package versions.
+
+```python
+torch.__version__ = '1.7.1+cu110'
+torchvision.__version__ = '0.8.2+cu110'
+timm.__version__ = '0.4.5'
+torchaudio.__version__ = '0.7.2'
+```
+If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:mae](https://hub.docker.com/repository/docker/xyxie/adan-image).
+
+
+
+## Usage of Adan for MAE
+
+### Two steps to use Adan
+
+**Step 1.** Add the following parameters to `main_pretrain.py` and `main_finetune.py`.
+
+```python
+parser.add_argument('--use-adan', action='store_true', default=False, help='whether to use Adan')
+parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm of the gradient is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use opt default 1e-8 in adan)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use opt default [0.98, 0.92, 0.99] in Adan)')
+```
+
+* `use-adan`: whether to use Adan. The default optimizer is AdamW.
+
+* `max-grad-norm`: determines whether to perform gradient clipping.
+
+* `opt-eps`: optimizer epsilon to avoid the bad case where the second-order moment is zero.
+
+* `opt-betas`: optimizer betas for Adan.
+
+
+
+**Step 2.** Create the Adan optimizer as follows. In this step, you can directly replace the vanilla optimizer creation:
+
+```python
+# following timm: set wd as 0 for bias and norm layers
+param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
+if args.use_adan:
+    if args.bias_decay:
+        param = model_without_ddp.parameters()
+    else:
+        param = param_groups
+        args.weight_decay = 0.0
+    optimizer = Adan(param, weight_decay=args.weight_decay,
+                     lr=args.lr, betas=args.opt_betas,
+                     eps=args.opt_eps, max_grad_norm=args.max_grad_norm)
+else:
+    optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+```
+
+
+
+## MAE Pre-training
+
+```python
+python main_pretrain.py \
+    --batch_size 256 --accum_iter 1 \
+    --model ${MODEL_NAME} --norm_pix_loss --mask_ratio 0.75 \
+    --epochs 800 \
+    --lr ${LR} --weight_decay 0.02 --warmup_epochs ${WR_EPOCH} \
+    --min_lr ${MIN_LR} \
+    --opt-betas 0.98 0.92 0.90 --opt-eps 1e-8 --max-grad-norm 10.0 \
+    --use-adan \
+    --data_path ${IMAGENET_DIR} \
+    --output_dir ${OUT_DIR}
+```
+
+- The pre-training file `main_pretrain.py` comes from [MAE](https://github.com/facebookresearch/mae).
+- We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
+- There are some differences between the hyper-parameters for MAE-Base and MAE-Large:
+
+|           |      MODEL_NAME       |   LR   | MIN_LR | WR_EPOCH |
+| :-------: | :-------------------: | :----: | :----: | :------: |
+| MAE-Base  | mae_vit_base_patch16  | 2.0e-3 |  1e-8  |    40    |
+| MAE-Large | mae_vit_large_patch16 | 2.2e-3 |  1e-4  |    80    |
+
+
+## MAE Fine-tuning
+
+```python
+python main_finetune.py \
+    --accum_iter 1 \
+    --batch_size 256 \
+    --model ${MODEL_NAME} \
+    --finetune ${PATH to Pre-trained Model} \
+    --epochs ${EPOCH} \
+    --lr 1.5e-2 --layer_decay ${LAYER_DECAY} \
+    --min-lr ${MIN_LR} \
+    --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 0 \
+    --use-adan --warmup-epochs ${WR_EPOCH} \
+    --weight_decay ${WD} --drop_path ${DROP_PATH} \
+    --mixup 0.8 --cutmix 1.0 --reprob 0.25 \
+    --dist_eval --data_path ${IMAGENET_DIR}
+```
+
+- The fine-tuning file `main_finetune.py` comes from [MAE](https://github.com/facebookresearch/mae).
+- We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
+
+- There are some differences between the hyper-parameters for MAE-Base and MAE-Large:
+
+|           |    MODEL_NAME     | EPOCH | MIN_LR | LAYER_DECAY | WR_EPOCH |  WD  | DROP_PATH |
+| :-------: | :---------------: | :---: | :----: | :---------: | :------: | :--: | :-------: |
+| MAE-Base  | vit_base_patch16  |  100  |  1e-6  |    0.65     |    40    | 5e-3 |    0.1    |
+| MAE-Large | vit_large_patch16 |  50   |  1e-5  |    0.75     |    10    | 1e-3 |    0.2    |
+
+
+
+## Results and Logs
+
+|                | MAE-Base | MAE-Large |
+| :------------: | :------: | :-------: |
+| Top-1 Acc. (%) |   83.8   |   85.9    |
+| download | [log-pretrain](./exp_results/MAE/base/log_base_pretrain.txt)/[log-finetune](./exp_results/MAE/base/log_base_ft.txt)/model | [log-pretrain](./exp_results/MAE/large/log_large_pretrain.txt)/[log-finetune](./exp_results/MAE/large/log_large_ft.txt)/model |
+
diff --git a/CV/MAE/adan.py b/CV/MAE/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/CV/MAE/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from timm.utils import *
+
+
+class Adan(Optimizer):
+    """
+    Implements a PyTorch variant of Adan.
+
+    Adan was proposed in
+    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
+    https://arxiv.org/abs/2208.06677
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float, float], optional): coefficients used for computing
+            running averages of gradient and its norm. (default: (0.98, 0.92, 0.99))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability.
+            (default: 1e-8)
+        weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0)
+        max_grad_norm (float, optional): value used to clip
+            global grad norm (default: 0.0 no clip)
+        no_prox (bool): how to perform the decoupled weight decay (default: False)
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
+                 weight_decay=0.0, max_grad_norm=0.0, no_prox=False):
+        if not 0.0 <= max_grad_norm:
+            raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm))
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= betas[2] < 1.0:
+            raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2]))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay,
+                        max_grad_norm=max_grad_norm, no_prox=no_prox)
+        super(Adan, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(Adan, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('no_prox', False)
+
+    @torch.no_grad()
+    def restart_opt(self):
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                if p.requires_grad:
+                    state = self.state[p]
+                    # State initialization
+
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p)
+                    # Exponential moving average of gradient difference
+                    state['exp_avg_diff'] = torch.zeros_like(p)
+
+    @torch.no_grad()
+    def step(self):
+        """
+        Performs a single optimization step.
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/CV/MAE/engine_finetune.py b/CV/MAE/engine_finetune.py new file mode 100644 index 0000000..3b0fcbd --- /dev/null +++ b/CV/MAE/engine_finetune.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import math +import sys +from typing import Iterable, Optional + +import torch + +from timm.data import Mixup +from timm.utils import accuracy + +import util.misc as misc +import util.lr_sched as lr_sched + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, + mixup_fn: Optional[Mixup] = None, log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + with torch.cuda.amp.autocast(): + outputs = model(samples) + loss = criterion(outputs, targets) + + loss_value = loss.item() + + + + loss /= accum_iter + loss_scaler(loss, optimizer, clip_grad=max_norm, + parameters=model.parameters(), create_graph=False, + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + min_lr = 10. + max_lr = 0. + for group in optimizer.param_groups: + min_lr = min(min_lr, group["lr"]) + max_lr = max(max_lr, group["lr"]) + + metric_logger.update(lr=max_lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if not math.isfinite(loss_value_reduce): + print("Loss is {}, stopping training".format(loss_value_reduce)) + sys.exit(1) + if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. 
+ """ + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', max_lr, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(data_loader, model, device): + criterion = torch.nn.CrossEntropyLoss() + + metric_logger = misc.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + + for batch in metric_logger.log_every(data_loader, 10, header): + images = batch[0] + target = batch[-1] + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + loss = criterion(output, target) + + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + batch_size = images.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} \ No newline at end of file diff --git a/CV/MAE/engine_pretrain.py b/CV/MAE/engine_pretrain.py new file mode 100644 index 0000000..8f41b63 --- /dev/null +++ b/CV/MAE/engine_pretrain.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import math +import sys +from typing import Iterable + +import torch + +import util.misc as misc +import util.lr_sched as lr_sched + + +def train_one_epoch(model: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (samples, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + samples = samples.to(device, non_blocking=True) + + with torch.cuda.amp.autocast(): + loss, _, _ = model(samples, mask_ratio=args.mask_ratio) + + loss_value = loss.item() + + + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if not math.isfinite(loss_value_reduce): + print("Loss is {}, stopping training".format(loss_value_reduce)) + sys.exit(1) + if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. 
+ """ + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} \ No newline at end of file diff --git a/CV/MAE/exp_results/MAE/base/log_base_ft.txt b/CV/MAE/exp_results/MAE/base/log_base_ft.txt new file mode 100644 index 0000000..10d975f --- /dev/null +++ b/CV/MAE/exp_results/MAE/base/log_base_ft.txt @@ -0,0 +1,100 @@ +{"train_lr": 0.00018719999999999997, "train_loss": 6.220443151950836, "test_loss": 3.146200124025345, "test_acc1": 38.718000028839114, "test_acc5": 68.56600002685546, "epoch": 0, "n_parameters": 86567656} +{"train_lr": 0.0005622000000000001, "train_loss": 4.947992331647873, "test_loss": 1.8401640093326568, "test_acc1": 59.50200001586914, "test_acc5": 84.50000004486084, "epoch": 1, "n_parameters": 86567656} +{"train_lr": 0.0009372000000000001, "train_loss": 4.558939405536652, "test_loss": 1.5222917068004609, "test_acc1": 65.57200004241943, "test_acc5": 88.05600001373291, "epoch": 2, "n_parameters": 86567656} +{"train_lr": 0.0013122000000000001, "train_loss": 4.343958680105209, "test_loss": 1.3651889091730118, "test_acc1": 68.48600001678467, "test_acc5": 89.68400004119873, "epoch": 3, "n_parameters": 86567656} +{"train_lr": 0.0016872, "train_loss": 4.213459821510315, "test_loss": 1.3220995950698853, "test_acc1": 70.23400000396728, "test_acc5": 90.71000001220703, "epoch": 4, "n_parameters": 86567656} +{"train_lr": 0.002062199999999999, "train_loss": 4.090291801166535, "test_loss": 1.228445138335228, "test_acc1": 71.64600002532958, "test_acc5": 91.52000002593994, "epoch": 5, "n_parameters": 86567656} +{"train_lr": 0.002437199999999999, "train_loss": 4.008170649766922, "test_loss": 1.185943089723587, "test_acc1": 72.97000003448487, "test_acc5": 91.96400001800536, "epoch": 6, "n_parameters": 86567656} +{"train_lr": 0.002812200000000001, "train_loss": 3.933375036430359, "test_loss": 1.1420067197084427, "test_acc1": 73.51400005615234, "test_acc5": 92.32200002044678, "epoch": 7, "n_parameters": 86567656} +{"train_lr": 0.0031871999999999994, "train_loss": 3.883904767179489, "test_loss": 1.154113737642765, "test_acc1": 74.26400002593994, "test_acc5": 92.74000003173828, "epoch": 8, "n_parameters": 86567656} +{"train_lr": 0.0035621999999999993, "train_loss": 3.82627286157608, "test_loss": 1.1308922132849692, "test_acc1": 74.69400002990723, "test_acc5": 93.01800005096436, "epoch": 9, "n_parameters": 86567656} +{"train_lr": 0.003937199999999998, "train_loss": 3.7571802374839782, "test_loss": 1.0861055210232735, "test_acc1": 75.23200000030518, "test_acc5": 93.20800005615234, "epoch": 10, "n_parameters": 86567656} +{"train_lr": 0.0043122, "train_loss": 3.7400177223205566, "test_loss": 1.0553014129400253, "test_acc1": 75.61800002456665, "test_acc5": 93.4340000265503, "epoch": 11, "n_parameters": 86567656} +{"train_lr": 0.004687200000000001, "train_loss": 3.6843111953258516, "test_loss": 1.0651562806963921, "test_acc1": 76.1400000088501, "test_acc5": 93.58800003448486, "epoch": 12, "n_parameters": 86567656} +{"train_lr": 0.0050622, "train_loss": 3.663368493080139, "test_loss": 1.0271986865997313, "test_acc1": 76.15800001403808, "test_acc5": 93.72600003509521, "epoch": 13, "n_parameters": 86567656} +{"train_lr": 0.0054372, "train_loss": 3.62431772274971, 
"test_loss": 1.0381141716241837, "test_acc1": 76.57800002990723, "test_acc5": 93.96000005645752, "epoch": 14, "n_parameters": 86567656} +{"train_lr": 0.0058122, "train_loss": 3.5911776705265046, "test_loss": 1.025875808596611, "test_acc1": 76.83400003021241, "test_acc5": 93.91400003479004, "epoch": 15, "n_parameters": 86567656} +{"train_lr": 0.0061872, "train_loss": 3.558628352546692, "test_loss": 1.0111908143758774, "test_acc1": 76.96200001129151, "test_acc5": 93.98400003753662, "epoch": 16, "n_parameters": 86567656} +{"train_lr": 0.006562199999999997, "train_loss": 3.543642453479767, "test_loss": 0.9803780218958855, "test_acc1": 77.42599999786377, "test_acc5": 94.18000003753662, "epoch": 17, "n_parameters": 86567656} +{"train_lr": 0.0069372, "train_loss": 3.5387943693637847, "test_loss": 0.9889326795935631, "test_acc1": 77.75000002258301, "test_acc5": 94.21000005096435, "epoch": 18, "n_parameters": 86567656} +{"train_lr": 0.007312200000000001, "train_loss": 3.499820264673233, "test_loss": 0.9561663055419922, "test_acc1": 77.62800001464844, "test_acc5": 94.35400003509521, "epoch": 19, "n_parameters": 86567656} +{"train_lr": 0.007687199999999997, "train_loss": 3.4993985069274904, "test_loss": 0.9616166499257087, "test_acc1": 77.71800002593994, "test_acc5": 94.36600001068115, "epoch": 20, "n_parameters": 86567656} +{"train_lr": 0.0080622, "train_loss": 3.460160034608841, "test_loss": 0.9682401439547539, "test_acc1": 78.13600002288818, "test_acc5": 94.4920000137329, "epoch": 21, "n_parameters": 86567656} +{"train_lr": 0.0084372, "train_loss": 3.4610034842967985, "test_loss": 0.9606137126684189, "test_acc1": 77.98400003875733, "test_acc5": 94.52200002197266, "epoch": 22, "n_parameters": 86567656} +{"train_lr": 0.008812199999999997, "train_loss": 3.436005702972412, "test_loss": 0.9578395664691925, "test_acc1": 78.23200000976563, "test_acc5": 94.63200002716064, "epoch": 23, "n_parameters": 86567656} +{"train_lr": 0.009187199999999998, "train_loss": 3.421230093240738, "test_loss": 0.9625651022791862, "test_acc1": 78.46600002075195, "test_acc5": 94.70200002471924, "epoch": 24, "n_parameters": 86567656} +{"train_lr": 0.009562199999999996, "train_loss": 3.4071589920043945, "test_loss": 0.9450574347376823, "test_acc1": 78.4380000125122, "test_acc5": 94.63400004058838, "epoch": 25, "n_parameters": 86567656} +{"train_lr": 0.009937199999999998, "train_loss": 3.3995270119667054, "test_loss": 0.9474635648727417, "test_acc1": 78.45400002380372, "test_acc5": 94.70600004058838, "epoch": 26, "n_parameters": 86567656} +{"train_lr": 0.010312199999999997, "train_loss": 3.3837591108322145, "test_loss": 0.9371258601546287, "test_acc1": 78.65400002624511, "test_acc5": 94.7400000402832, "epoch": 27, "n_parameters": 86567656} +{"train_lr": 0.010687199999999997, "train_loss": 3.370058294558525, "test_loss": 0.9240103733539581, "test_acc1": 78.72200001831055, "test_acc5": 94.82600001617432, "epoch": 28, "n_parameters": 86567656} +{"train_lr": 0.011062199999999998, "train_loss": 3.3644386556625365, "test_loss": 0.9514913991093635, "test_acc1": 78.53200000701904, "test_acc5": 94.69600005126954, "epoch": 29, "n_parameters": 86567656} +{"train_lr": 0.011437199999999996, "train_loss": 3.352064240050316, "test_loss": 0.9473175323009491, "test_acc1": 78.84600002838135, "test_acc5": 94.88000001922607, "epoch": 30, "n_parameters": 86567656} +{"train_lr": 0.011812200000000004, "train_loss": 3.343575291442871, "test_loss": 0.8956420955061912, "test_acc1": 78.94000000701904, "test_acc5": 95.00600002227783, "epoch": 31, 
"n_parameters": 86567656} +{"train_lr": 0.012187199999999997, "train_loss": 3.3357870742559435, "test_loss": 0.9404549324512481, "test_acc1": 78.7719999963379, "test_acc5": 94.98000003814697, "epoch": 32, "n_parameters": 86567656} +{"train_lr": 0.012562199999999997, "train_loss": 3.319555434727669, "test_loss": 0.9541194596886635, "test_acc1": 78.83600001220704, "test_acc5": 94.94200002471923, "epoch": 33, "n_parameters": 86567656} +{"train_lr": 0.012937200000000005, "train_loss": 3.31564322450161, "test_loss": 0.9260695472359657, "test_acc1": 79.08000001220704, "test_acc5": 94.95400003814697, "epoch": 34, "n_parameters": 86567656} +{"train_lr": 0.013312200000000005, "train_loss": 3.309744071626663, "test_loss": 0.920071712732315, "test_acc1": 79.07800001220703, "test_acc5": 94.88000004058838, "epoch": 35, "n_parameters": 86567656} +{"train_lr": 0.013687199999999998, "train_loss": 3.2949381719112396, "test_loss": 0.8983643808960915, "test_acc1": 79.15200002593994, "test_acc5": 95.06000004058838, "epoch": 36, "n_parameters": 86567656} +{"train_lr": 0.014062200000000004, "train_loss": 3.27849584107399, "test_loss": 0.9205039608478546, "test_acc1": 79.14400001251221, "test_acc5": 94.99400002471924, "epoch": 37, "n_parameters": 86567656} +{"train_lr": 0.014437199999999996, "train_loss": 3.2822228091716767, "test_loss": 0.8969993716478348, "test_acc1": 79.23400001312255, "test_acc5": 95.05600001953125, "epoch": 38, "n_parameters": 86567656} +{"train_lr": 0.0148122, "train_loss": 3.2667752093315126, "test_loss": 0.8877088937163353, "test_acc1": 79.5680000289917, "test_acc5": 94.98600002471923, "epoch": 39, "n_parameters": 86567656} +{"train_lr": 0.014996581744440903, "train_loss": 3.265050085401535, "test_loss": 0.8858077436685562, "test_acc1": 79.5340000366211, "test_acc5": 95.0980000491333, "epoch": 40, "n_parameters": 86567656} +{"train_lr": 0.014976050594237029, "train_loss": 3.2547211683750152, "test_loss": 0.8691368037462235, "test_acc1": 79.7200000289917, "test_acc5": 95.13200002441407, "epoch": 41, "n_parameters": 86567656} +{"train_lr": 0.014935028122712916, "train_loss": 3.241869815301895, "test_loss": 0.8789618426561355, "test_acc1": 79.56200002838135, "test_acc5": 95.1620000302124, "epoch": 42, "n_parameters": 86567656} +{"train_lr": 0.014873626769611584, "train_loss": 3.2315730389356614, "test_loss": 0.8627288854122162, "test_acc1": 79.916000050354, "test_acc5": 95.27200001678467, "epoch": 43, "n_parameters": 86567656} +{"train_lr": 0.014792014831773887, "train_loss": 3.202810504412651, "test_loss": 0.8483913645148278, "test_acc1": 80.12600003234863, "test_acc5": 95.39000004089355, "epoch": 44, "n_parameters": 86567656} +{"train_lr": 0.014690416001848693, "train_loss": 3.1993762041330336, "test_loss": 0.8779441103339195, "test_acc1": 79.98000003997802, "test_acc5": 95.40600001953125, "epoch": 45, "n_parameters": 86567656} +{"train_lr": 0.014569108755166755, "train_loss": 3.187324148273468, "test_loss": 0.8693245351314545, "test_acc1": 80.12200002899169, "test_acc5": 95.39200002746583, "epoch": 46, "n_parameters": 86567656} +{"train_lr": 0.014428425586459301, "train_loss": 3.1755990085601806, "test_loss": 0.8747495183348656, "test_acc1": 80.1980000241089, "test_acc5": 95.35000002197266, "epoch": 47, "n_parameters": 86567656} +{"train_lr": 0.014268752098512963, "train_loss": 3.155262738776207, "test_loss": 0.8533616551756859, "test_acc1": 80.38600003753662, "test_acc5": 95.45800000335693, "epoch": 48, "n_parameters": 86567656} +{"train_lr": 0.014090525945259457, "train_loss": 
3.1335597133159636, "test_loss": 0.8487207201123238, "test_acc1": 80.55600003723144, "test_acc5": 95.59800004394532, "epoch": 49, "n_parameters": 86567656} +{"train_lr": 0.013894235632196493, "train_loss": 3.1262916456222536, "test_loss": 0.8581038397550583, "test_acc1": 80.52800002960205, "test_acc5": 95.61000001708985, "epoch": 50, "n_parameters": 86567656} +{"train_lr": 0.013680419177428186, "train_loss": 3.1185608382940293, "test_loss": 0.8400468546152114, "test_acc1": 80.67600002471924, "test_acc5": 95.60000000610351, "epoch": 51, "n_parameters": 86567656} +{"train_lr": 0.01344966263699487, "train_loss": 3.1101703361988067, "test_loss": 0.8435578069090843, "test_acc1": 80.63000002166748, "test_acc5": 95.73200002227783, "epoch": 52, "n_parameters": 86567656} +{"train_lr": 0.01320259849853417, "train_loss": 3.096173421573639, "test_loss": 0.8371610799431801, "test_acc1": 81.05600002746581, "test_acc5": 95.72600002532958, "epoch": 53, "n_parameters": 86567656} +{"train_lr": 0.012939903947676419, "train_loss": 3.085701132774353, "test_loss": 0.8331937485933304, "test_acc1": 80.97600001831054, "test_acc5": 95.77200003051757, "epoch": 54, "n_parameters": 86567656} +{"train_lr": 0.012662299011925835, "train_loss": 3.0661963938474655, "test_loss": 0.8213660365343094, "test_acc1": 80.88600003204346, "test_acc5": 95.75800004089355, "epoch": 55, "n_parameters": 86567656} +{"train_lr": 0.012370544587115206, "train_loss": 3.057591044449806, "test_loss": 0.8165697306394577, "test_acc1": 81.3160000213623, "test_acc5": 95.83400003814697, "epoch": 56, "n_parameters": 86567656} +{"train_lr": 0.012065440351843355, "train_loss": 3.0463304943323135, "test_loss": 0.8060084617137909, "test_acc1": 81.3940000354004, "test_acc5": 95.85600003845215, "epoch": 57, "n_parameters": 86567656} +{"train_lr": 0.011747822575611631, "train_loss": 3.026911934018135, "test_loss": 0.7906639388203621, "test_acc1": 81.43200004058838, "test_acc5": 95.98400001434327, "epoch": 58, "n_parameters": 86567656} +{"train_lr": 0.011418561826667327, "train_loss": 3.0174565678834915, "test_loss": 0.800553865134716, "test_acc1": 81.60400002746582, "test_acc5": 95.92800002471924, "epoch": 59, "n_parameters": 86567656} +{"train_lr": 0.01107856058583667, "train_loss": 3.008225770974159, "test_loss": 0.7980207592248917, "test_acc1": 81.7600000326538, "test_acc5": 96.03000002502442, "epoch": 60, "n_parameters": 86567656} +{"train_lr": 0.010728750772887586, "train_loss": 2.9927734558582304, "test_loss": 0.8040242698788643, "test_acc1": 81.70800001647949, "test_acc5": 96.02200004943847, "epoch": 61, "n_parameters": 86567656} +{"train_lr": 0.01037009119220237, "train_loss": 2.9911838919639586, "test_loss": 0.7953862142562866, "test_acc1": 81.8280000354004, "test_acc5": 96.05800003082275, "epoch": 62, "n_parameters": 86567656} +{"train_lr": 0.010003564904761585, "train_loss": 2.9719505157470705, "test_loss": 0.7940907120704651, "test_acc1": 81.78600001922608, "test_acc5": 96.01400002502442, "epoch": 63, "n_parameters": 86567656} +{"train_lr": 0.009630176533642159, "train_loss": 2.963402523112297, "test_loss": 0.7831828370690346, "test_acc1": 81.97200001159668, "test_acc5": 96.0800000253296, "epoch": 64, "n_parameters": 86567656} +{"train_lr": 0.009250949510415432, "train_loss": 2.94107414290905, "test_loss": 0.7819095095992088, "test_acc1": 82.04200000030518, "test_acc5": 96.19600002777099, "epoch": 65, "n_parameters": 86567656} +{"train_lr": 0.008866923269992324, "train_loss": 2.9405784098625185, "test_loss": 0.7755044403672219, "test_acc1": 
82.14000001678467, "test_acc5": 96.21200004425049, "epoch": 66, "n_parameters": 86567656} +{"train_lr": 0.00847915040160449, "train_loss": 2.9315583733558657, "test_loss": 0.7770591515302658, "test_acc1": 82.12400002716065, "test_acc5": 96.27200004425049, "epoch": 67, "n_parameters": 86567656} +{"train_lr": 0.008088693763730424, "train_loss": 2.9024018899202346, "test_loss": 0.7656473967432976, "test_acc1": 82.40400001159668, "test_acc5": 96.2580000253296, "epoch": 68, "n_parameters": 86567656} +{"train_lr": 0.0076966235708742035, "train_loss": 2.89498575322628, "test_loss": 0.7626381632685661, "test_acc1": 82.49200003051757, "test_acc5": 96.27400003356934, "epoch": 69, "n_parameters": 86567656} +{"train_lr": 0.0073040144601820185, "train_loss": 2.864926402044296, "test_loss": 0.7629410058259964, "test_acc1": 82.72200000610351, "test_acc5": 96.35200002532959, "epoch": 70, "n_parameters": 86567656} +{"train_lr": 0.006911942545936348, "train_loss": 2.8626769891262054, "test_loss": 0.7633640518784524, "test_acc1": 82.480000027771, "test_acc5": 96.36400005249024, "epoch": 71, "n_parameters": 86567656} +{"train_lr": 0.006521482470001542, "train_loss": 2.8596914556503297, "test_loss": 0.7628568401932716, "test_acc1": 82.56400003570556, "test_acc5": 96.32000005767823, "epoch": 72, "n_parameters": 86567656} +{"train_lr": 0.006133704456305189, "train_loss": 2.8379484293699266, "test_loss": 0.7544870236515999, "test_acc1": 82.728000027771, "test_acc5": 96.4060000415039, "epoch": 73, "n_parameters": 86567656} +{"train_lr": 0.0057496713774287216, "train_loss": 2.825576261472702, "test_loss": 0.7530808946490288, "test_acc1": 82.57800004364013, "test_acc5": 96.29800003082275, "epoch": 74, "n_parameters": 86567656} +{"train_lr": 0.005370435841347517, "train_loss": 2.821367850232124, "test_loss": 0.7521498575806618, "test_acc1": 82.79800002227783, "test_acc5": 96.43200002807617, "epoch": 75, "n_parameters": 86567656} +{"train_lr": 0.004997037306305652, "train_loss": 2.8102516706705094, "test_loss": 0.7544213259220123, "test_acc1": 82.91400002807617, "test_acc5": 96.54000005767823, "epoch": 76, "n_parameters": 86567656} +{"train_lr": 0.004630499231733052, "train_loss": 2.7939453321933745, "test_loss": 0.7465361738204956, "test_acc1": 82.92000003570557, "test_acc5": 96.50400005523682, "epoch": 77, "n_parameters": 86567656} +{"train_lr": 0.004271826273014355, "train_loss": 2.7792876475334167, "test_loss": 0.7468091368675231, "test_acc1": 83.13600001190186, "test_acc5": 96.53200005767822, "epoch": 78, "n_parameters": 86567656} +{"train_lr": 0.003922001527798389, "train_loss": 2.7769425602436066, "test_loss": 0.7462005805969238, "test_acc1": 83.0420000515747, "test_acc5": 96.48400005249023, "epoch": 79, "n_parameters": 86567656} +{"train_lr": 0.0035819838413957477, "train_loss": 2.7580106742620467, "test_loss": 0.7476282814145088, "test_acc1": 83.30400002258301, "test_acc5": 96.52800004699706, "epoch": 80, "n_parameters": 86567656} +{"train_lr": 0.0032527051786505347, "train_loss": 2.755719056749344, "test_loss": 0.7485474190115928, "test_acc1": 83.2220000415039, "test_acc5": 96.54400005249023, "epoch": 81, "n_parameters": 86567656} +{"train_lr": 0.0029350680694894497, "train_loss": 2.733246625614166, "test_loss": 0.7376724031567573, "test_acc1": 83.25400003051757, "test_acc5": 96.54200004974365, "epoch": 82, "n_parameters": 86567656} +{"train_lr": 0.0026299431351500197, "train_loss": 2.725854627633095, "test_loss": 0.7386321437358856, "test_acc1": 83.2880000064087, "test_acc5": 96.5220000390625, "epoch": 83, 
"n_parameters": 86567656} +{"train_lr": 0.0023381667018682875, "train_loss": 2.730336236548424, "test_loss": 0.7432589226961136, "test_acc1": 83.51000001190185, "test_acc5": 96.53000004425049, "epoch": 84, "n_parameters": 86567656} +{"train_lr": 0.0020605385085667566, "train_loss": 2.7197771132946014, "test_loss": 0.7351065069437027, "test_acc1": 83.53600001495361, "test_acc5": 96.56200005523682, "epoch": 85, "n_parameters": 86567656} +{"train_lr": 0.0017978195148255654, "train_loss": 2.703706868839264, "test_loss": 0.7340468415617942, "test_acc1": 83.50800002014161, "test_acc5": 96.56400005523682, "epoch": 86, "n_parameters": 86567656} +{"train_lr": 0.0015507298151451832, "train_loss": 2.7080028397798537, "test_loss": 0.7307805678248406, "test_acc1": 83.53800004180908, "test_acc5": 96.64000005249024, "epoch": 87, "n_parameters": 86567656} +{"train_lr": 0.0013199466652174393, "train_loss": 2.700676307797432, "test_loss": 0.7344949060678482, "test_acc1": 83.53000002807617, "test_acc5": 96.63800004699706, "epoch": 88, "n_parameters": 86567656} +{"train_lr": 0.0011061026256147478, "train_loss": 2.689790417647362, "test_loss": 0.7330510130524636, "test_acc1": 83.54800002838135, "test_acc5": 96.6440000390625, "epoch": 89, "n_parameters": 86567656} +{"train_lr": 0.0009097838279855404, "train_loss": 2.6793027224302293, "test_loss": 0.7295243856310845, "test_acc1": 83.56600002258301, "test_acc5": 96.64400005523682, "epoch": 90, "n_parameters": 86567656} +{"train_lr": 0.0007315283685081682, "train_loss": 2.678568696594238, "test_loss": 0.7311912828683853, "test_acc1": 83.71000003631592, "test_acc5": 96.63600004180908, "epoch": 91, "n_parameters": 86567656} +{"train_lr": 0.0005718248330066727, "train_loss": 2.6540179803133013, "test_loss": 0.7337997442483902, "test_acc1": 83.62800002532958, "test_acc5": 96.64800004974366, "epoch": 92, "n_parameters": 86567656} +{"train_lr": 0.00043111095777100694, "train_loss": 2.6582588331222534, "test_loss": 0.7270043037831784, "test_acc1": 83.62800000091553, "test_acc5": 96.68400004974366, "epoch": 93, "n_parameters": 86567656} +{"train_lr": 0.0003097724297522902, "train_loss": 2.6649514198064805, "test_loss": 0.7313019317388535, "test_acc1": 83.72400000915528, "test_acc5": 96.64400004974365, "epoch": 94, "n_parameters": 86567656} +{"train_lr": 0.0002081418294216848, "train_loss": 2.6643570078611374, "test_loss": 0.7312484115362168, "test_acc1": 83.72200002807617, "test_acc5": 96.63200004974365, "epoch": 95, "n_parameters": 86567656} +{"train_lr": 0.00012649771919044005, "train_loss": 2.6634057205677033, "test_loss": 0.7318920171260834, "test_acc1": 83.75600001708985, "test_acc5": 96.65400004699707, "epoch": 96, "n_parameters": 86567656} +{"train_lr": 6.506387988968701e-05, "train_loss": 2.6628331270217895, "test_loss": 0.7311200454831124, "test_acc1": 83.77000001983643, "test_acc5": 96.64400004699706, "epoch": 97, "n_parameters": 86567656} +{"train_lr": 2.4008697402735765e-05, "train_loss": 2.6537726917743685, "test_loss": 0.7295740690827369, "test_acc1": 83.79800002258301, "test_acc5": 96.67800005249024, "epoch": 98, "n_parameters": 86567656} +{"train_lr": 3.4447011310720205e-06, "train_loss": 2.6564826879262924, "test_loss": 0.7303899875283242, "test_acc1": 83.80000002258301, "test_acc5": 96.79000005249023, "epoch": 99, "n_parameters": 86567656} diff --git a/CV/MAE/exp_results/MAE/base/log_base_pretrain.txt b/CV/MAE/exp_results/MAE/base/log_base_pretrain.txt new file mode 100644 index 0000000..aa05f86 --- /dev/null +++ 
b/CV/MAE/exp_results/MAE/base/log_base_pretrain.txt @@ -0,0 +1,800 @@ +{"train_lr": 2.4920064e-05, "train_loss": 1.029866009569168, "epoch": 0} +{"train_lr": 7.492006399999998e-05, "train_loss": 0.826913864672184, "epoch": 1} +{"train_lr": 0.00012492006400000006, "train_loss": 0.7834360855937004, "epoch": 2} +{"train_lr": 0.00017492006399999992, "train_loss": 0.7394427828550338, "epoch": 3} +{"train_lr": 0.00022492006399999994, "train_loss": 0.7072458884239197, "epoch": 4} +{"train_lr": 0.0002749200639999999, "train_loss": 0.6952311094760895, "epoch": 5} +{"train_lr": 0.00032492006400000004, "train_loss": 0.6914466771841049, "epoch": 6} +{"train_lr": 0.00037492006400000006, "train_loss": 0.6886494241833687, "epoch": 7} +{"train_lr": 0.00042492006399999987, "train_loss": 0.6822491272926331, "epoch": 8} +{"train_lr": 0.00047492006400000044, "train_loss": 0.6565844881653786, "epoch": 9} +{"train_lr": 0.0005249200640000002, "train_loss": 0.6322732063770294, "epoch": 10} +{"train_lr": 0.0005749200639999999, "train_loss": 0.6173936517953873, "epoch": 11} +{"train_lr": 0.0006249200639999998, "train_loss": 0.5982944983005524, "epoch": 12} +{"train_lr": 0.0006749200640000005, "train_loss": 0.5760513068437576, "epoch": 13} +{"train_lr": 0.000724920064, "train_loss": 0.558085383951664, "epoch": 14} +{"train_lr": 0.0007749200639999998, "train_loss": 0.5438177249908447, "epoch": 15} +{"train_lr": 0.0008249200639999997, "train_loss": 0.5321405304253102, "epoch": 16} +{"train_lr": 0.0008749200639999996, "train_loss": 0.5226460194289684, "epoch": 17} +{"train_lr": 0.0009249200639999996, "train_loss": 0.5143405359089375, "epoch": 18} +{"train_lr": 0.0009749200640000003, "train_loss": 0.5078899907290936, "epoch": 19} +{"train_lr": 0.0010249200639999999, "train_loss": 0.5023593332231044, "epoch": 20} +{"train_lr": 0.0010749200640000002, "train_loss": 0.4975539141893387, "epoch": 21} +{"train_lr": 0.0011249200639999997, "train_loss": 0.49327956531047823, "epoch": 22} +{"train_lr": 0.001174920064000001, "train_loss": 0.48933574736714364, "epoch": 23} +{"train_lr": 0.001224920064, "train_loss": 0.4860016059339046, "epoch": 24} +{"train_lr": 0.0012749200639999999, "train_loss": 0.48271030520796776, "epoch": 25} +{"train_lr": 0.0013249200639999996, "train_loss": 0.48013067763447764, "epoch": 26} +{"train_lr": 0.0013749200639999988, "train_loss": 0.47759121404886246, "epoch": 27} +{"train_lr": 0.001424920064, "train_loss": 0.4752511966407299, "epoch": 28} +{"train_lr": 0.0014749200639999995, "train_loss": 0.47316404255628586, "epoch": 29} +{"train_lr": 0.001524920064000001, "train_loss": 0.4711698636054993, "epoch": 30} +{"train_lr": 0.001574920064, "train_loss": 0.4693278255581856, "epoch": 31} +{"train_lr": 0.0016249200640000012, "train_loss": 0.46783283928632735, "epoch": 32} +{"train_lr": 0.0016749200639999998, "train_loss": 0.46624995667934416, "epoch": 33} +{"train_lr": 0.0017249200640000002, "train_loss": 0.46449592822194097, "epoch": 34} +{"train_lr": 0.0017749200639999995, "train_loss": 0.46306066621541975, "epoch": 35} +{"train_lr": 0.0018249200639999994, "train_loss": 0.4618008250772953, "epoch": 36} +{"train_lr": 0.0018749200640000001, "train_loss": 0.46067862812876703, "epoch": 37} +{"train_lr": 0.0019249200639999994, "train_loss": 0.4598328410089016, "epoch": 38} +{"train_lr": 0.001974920064, "train_loss": 0.4584417822599411, "epoch": 39} +{"train_lr": 0.0019999971657724378, "train_loss": 0.4574848892211914, "epoch": 40} +{"train_lr": 0.001999980105982979, "train_loss": 0.45612444841861727, "epoch": 
41} +{"train_lr": 0.0019999459593779763, "train_loss": 0.4549763072431087, "epoch": 42} +{"train_lr": 0.001999894726540914, "train_loss": 0.4538106670200825, "epoch": 43} +{"train_lr": 0.00199982640834721, "train_loss": 0.45279076434373855, "epoch": 44} +{"train_lr": 0.0019997410059642406, "train_loss": 0.45213532138466833, "epoch": 45} +{"train_lr": 0.001999638520851299, "train_loss": 0.45124949448108675, "epoch": 46} +{"train_lr": 0.0019995189547595644, "train_loss": 0.45040599479079246, "epoch": 47} +{"train_lr": 0.0019993823097320955, "train_loss": 0.44934970703125, "epoch": 48} +{"train_lr": 0.00199922858810378, "train_loss": 0.44850586191415787, "epoch": 49} +{"train_lr": 0.001999057792501294, "train_loss": 0.44782372673153875, "epoch": 50} +{"train_lr": 0.0019988699258430644, "train_loss": 0.4471781154215336, "epoch": 51} +{"train_lr": 0.001998664991339216, "train_loss": 0.44638750190138815, "epoch": 52} +{"train_lr": 0.001998442992491514, "train_loss": 0.44575022593736646, "epoch": 53} +{"train_lr": 0.001998203933093299, "train_loss": 0.4452553203165531, "epoch": 54} +{"train_lr": 0.0019979478172294477, "train_loss": 0.4447741946578026, "epoch": 55} +{"train_lr": 0.0019976746492762656, "train_loss": 0.4442489013373852, "epoch": 56} +{"train_lr": 0.001997384433901443, "train_loss": 0.4437332093179226, "epoch": 57} +{"train_lr": 0.0019970771760639545, "train_loss": 0.4431314005434513, "epoch": 58} +{"train_lr": 0.001996752881013995, "train_loss": 0.44259421687722206, "epoch": 59} +{"train_lr": 0.0019964115542928738, "train_loss": 0.4422669967353344, "epoch": 60} +{"train_lr": 0.0019960532017329145, "train_loss": 0.4417998247206211, "epoch": 61} +{"train_lr": 0.0019956778294573777, "train_loss": 0.44136346296072004, "epoch": 62} +{"train_lr": 0.0019952854438803442, "train_loss": 0.44111803106069564, "epoch": 63} +{"train_lr": 0.0019948760517065917, "train_loss": 0.44124629287719724, "epoch": 64} +{"train_lr": 0.001994449659931513, "train_loss": 0.44030142896175384, "epoch": 65} +{"train_lr": 0.001994006275840954, "train_loss": 0.4400160102546215, "epoch": 66} +{"train_lr": 0.001993545907011146, "train_loss": 0.4394983403027058, "epoch": 67} +{"train_lr": 0.0019930685613085016, "train_loss": 0.43934193002581595, "epoch": 68} +{"train_lr": 0.001992574246889553, "train_loss": 0.43939045332074167, "epoch": 69} +{"train_lr": 0.0019920629722007623, "train_loss": 0.43876420689821244, "epoch": 70} +{"train_lr": 0.001991534745978413, "train_loss": 0.4383896221101284, "epoch": 71} +{"train_lr": 0.00199098957724843, "train_loss": 0.43805962885022165, "epoch": 72} +{"train_lr": 0.001990427475326234, "train_loss": 0.4378157087266445, "epoch": 73} +{"train_lr": 0.0019898484498166082, "train_loss": 0.4375223469555378, "epoch": 74} +{"train_lr": 0.001989252510613496, "train_loss": 0.4371592809557915, "epoch": 75} +{"train_lr": 0.0019886396678998547, "train_loss": 0.43709583897590637, "epoch": 76} +{"train_lr": 0.001988009932147472, "train_loss": 0.43670710064172746, "epoch": 77} +{"train_lr": 0.0019873633141167983, "train_loss": 0.4365320971310139, "epoch": 78} +{"train_lr": 0.0019866998248567525, "train_loss": 0.4362802542924881, "epoch": 79} +{"train_lr": 0.0019860194757045316, "train_loss": 0.43601478363275525, "epoch": 80} +{"train_lr": 0.001985322278285424, "train_loss": 0.4358106957912445, "epoch": 81} +{"train_lr": 0.001984608244512617, "train_loss": 0.43634837040901187, "epoch": 82} +{"train_lr": 0.0019838773865869753, "train_loss": 0.4354546032965183, "epoch": 83} +{"train_lr": 
0.001983129716996845, "train_loss": 0.43514904779195784, "epoch": 84} +{"train_lr": 0.0019823652485178316, "train_loss": 0.4349977902054787, "epoch": 85} +{"train_lr": 0.0019815839942125928, "train_loss": 0.4347984156310558, "epoch": 86} +{"train_lr": 0.001980785967430611, "train_loss": 0.4346069442749023, "epoch": 87} +{"train_lr": 0.001979971181807968, "train_loss": 0.43437728793025016, "epoch": 88} +{"train_lr": 0.0019791396512670954, "train_loss": 0.43415831446647646, "epoch": 89} +{"train_lr": 0.0019782913900165513, "train_loss": 0.43396707623004915, "epoch": 90} +{"train_lr": 0.001977426412550794, "train_loss": 0.43384554549455645, "epoch": 91} +{"train_lr": 0.0019765447336498893, "train_loss": 0.43357861334681513, "epoch": 92} +{"train_lr": 0.0019756463683793042, "train_loss": 0.43340606517791747, "epoch": 93} +{"train_lr": 0.0019747313320896127, "train_loss": 0.43321831868886945, "epoch": 94} +{"train_lr": 0.001973799640416274, "train_loss": 0.43303715973496437, "epoch": 95} +{"train_lr": 0.001972851309279318, "train_loss": 0.4329533623635769, "epoch": 96} +{"train_lr": 0.001971886354883114, "train_loss": 0.4327393824696541, "epoch": 97} +{"train_lr": 0.0019709047937160624, "train_loss": 0.4326424191534519, "epoch": 98} +{"train_lr": 0.00196990664255034, "train_loss": 0.432473964703083, "epoch": 99} +{"train_lr": 0.001968891918441605, "train_loss": 0.4323611004710197, "epoch": 100} +{"train_lr": 0.0019678606387286746, "train_loss": 0.43220903632044794, "epoch": 101} +{"train_lr": 0.0019668128210332835, "train_loss": 0.4320597696781158, "epoch": 102} +{"train_lr": 0.001965748483259745, "train_loss": 0.431807940363884, "epoch": 103} +{"train_lr": 0.0019646676435946544, "train_loss": 0.43166343091726306, "epoch": 104} +{"train_lr": 0.0019635703205065857, "train_loss": 0.43165171412825587, "epoch": 105} +{"train_lr": 0.001962456532745752, "train_loss": 0.4314692829966545, "epoch": 106} +{"train_lr": 0.001961326299343718, "train_loss": 0.43120107041597366, "epoch": 107} +{"train_lr": 0.0019601796396130477, "train_loss": 0.43122232078313827, "epoch": 108} +{"train_lr": 0.001959016573147011, "train_loss": 0.43111382276415827, "epoch": 109} +{"train_lr": 0.001957837119819182, "train_loss": 0.43085430263876917, "epoch": 110} +{"train_lr": 0.0019566412997831803, "train_loss": 0.43075852123498914, "epoch": 111} +{"train_lr": 0.0019554291334722604, "train_loss": 0.4307229078769684, "epoch": 112} +{"train_lr": 0.001954200641599004, "train_loss": 0.43064907240271566, "epoch": 113} +{"train_lr": 0.001952955845154955, "train_loss": 0.4303462516546249, "epoch": 114} +{"train_lr": 0.0019516947654102353, "train_loss": 0.430285882461071, "epoch": 115} +{"train_lr": 0.0019504174239132258, "train_loss": 0.4302071396172047, "epoch": 116} +{"train_lr": 0.0019491238424901728, "train_loss": 0.43007939971089365, "epoch": 117} +{"train_lr": 0.0019478140432448074, "train_loss": 0.42995246585607527, "epoch": 118} +{"train_lr": 0.0019464880485579859, "train_loss": 0.4299236563742161, "epoch": 119} +{"train_lr": 0.0019451458810873046, "train_loss": 0.42975575862526894, "epoch": 120} +{"train_lr": 0.0019437875637666987, "train_loss": 0.4296689044058323, "epoch": 121} +{"train_lr": 0.0019424131198060568, "train_loss": 0.4295571488142014, "epoch": 122} +{"train_lr": 0.001941022572690844, "train_loss": 0.4295304506480694, "epoch": 123} +{"train_lr": 0.0019396159461816677, "train_loss": 0.4295160254955292, "epoch": 124} +{"train_lr": 0.0019381932643138978, "train_loss": 0.4293148836731911, "epoch": 125} +{"train_lr": 
0.0019367545513972342, "train_loss": 0.4292679689407349, "epoch": 126} +{"train_lr": 0.0019352998320153279, "train_loss": 0.42902680062055587, "epoch": 127} +{"train_lr": 0.0019338291310252987, "train_loss": 0.4290702934384346, "epoch": 128} +{"train_lr": 0.001932342473557387, "train_loss": 0.4288153325974941, "epoch": 129} +{"train_lr": 0.0019308398850144532, "train_loss": 0.4288082259654999, "epoch": 130} +{"train_lr": 0.0019293213910715973, "train_loss": 0.4287457905650139, "epoch": 131} +{"train_lr": 0.0019277870176756878, "train_loss": 0.42868427852988245, "epoch": 132} +{"train_lr": 0.0019262367910449316, "train_loss": 0.4285197732448578, "epoch": 133} +{"train_lr": 0.0019246707376684355, "train_loss": 0.4284260827243328, "epoch": 134} +{"train_lr": 0.0019230888843057212, "train_loss": 0.42834890897870065, "epoch": 135} +{"train_lr": 0.001921491257986288, "train_loss": 0.42836915620565413, "epoch": 136} +{"train_lr": 0.001919877886009182, "train_loss": 0.4281206827223301, "epoch": 137} +{"train_lr": 0.0019182487959424652, "train_loss": 0.4281239497900009, "epoch": 138} +{"train_lr": 0.0019166040156227992, "train_loss": 0.4281063589513302, "epoch": 139} +{"train_lr": 0.0019149435731549388, "train_loss": 0.42810667996406554, "epoch": 140} +{"train_lr": 0.001913267496911266, "train_loss": 0.42796390196084977, "epoch": 141} +{"train_lr": 0.001911575815531295, "train_loss": 0.4278896269261837, "epoch": 142} +{"train_lr": 0.001909868557921197, "train_loss": 0.42768696791529653, "epoch": 143} +{"train_lr": 0.0019081457532532941, "train_loss": 0.42768659583330154, "epoch": 144} +{"train_lr": 0.0019064074309655585, "train_loss": 0.42750629413723945, "epoch": 145} +{"train_lr": 0.0019046536207611357, "train_loss": 0.42743830469250677, "epoch": 146} +{"train_lr": 0.0019028843526077868, "train_loss": 0.42752327723503114, "epoch": 147} +{"train_lr": 0.0019010996567374366, "train_loss": 0.42736736696958544, "epoch": 148} +{"train_lr": 0.0018992995636456075, "train_loss": 0.4276691595375538, "epoch": 149} +{"train_lr": 0.0018974841040909193, "train_loss": 0.4272169639468193, "epoch": 150} +{"train_lr": 0.0018956533090945722, "train_loss": 0.4271261396050453, "epoch": 151} +{"train_lr": 0.0018938072099398014, "train_loss": 0.42715407326221466, "epoch": 152} +{"train_lr": 0.0018919458381713458, "train_loss": 0.4283799661755562, "epoch": 153} +{"train_lr": 0.0018900692255949196, "train_loss": 0.4275367811322212, "epoch": 154} +{"train_lr": 0.0018881774042766428, "train_loss": 0.42704309683442115, "epoch": 155} +{"train_lr": 0.0018862704065425273, "train_loss": 0.4270126509964466, "epoch": 156} +{"train_lr": 0.0018843482649779047, "train_loss": 0.42692647844552994, "epoch": 157} +{"train_lr": 0.0018824110124268732, "train_loss": 0.42698997198939326, "epoch": 158} +{"train_lr": 0.0018804586819917309, "train_loss": 0.42690860251784324, "epoch": 159} +{"train_lr": 0.0018784913070324137, "train_loss": 0.42671867433190347, "epoch": 160} +{"train_lr": 0.0018765089211659387, "train_loss": 0.4265693518280983, "epoch": 161} +{"train_lr": 0.0018745115582658063, "train_loss": 0.42649883098006247, "epoch": 162} +{"train_lr": 0.0018724992524614336, "train_loss": 0.42644061017632484, "epoch": 163} +{"train_lr": 0.0018704720381375868, "train_loss": 0.42639319202899933, "epoch": 164} +{"train_lr": 0.0018684299499337567, "train_loss": 0.4262290573775768, "epoch": 165} +{"train_lr": 0.0018663730227436023, "train_loss": 0.42622224123477936, "epoch": 166} +{"train_lr": 0.0018643012917143186, "train_loss": 
0.4262301009774208, "epoch": 167} +{"train_lr": 0.0018622147922460915, "train_loss": 0.426052722042799, "epoch": 168} +{"train_lr": 0.0018601135599914326, "train_loss": 0.42615303208231925, "epoch": 169} +{"train_lr": 0.0018579976308546009, "train_loss": 0.4259471821427345, "epoch": 170} +{"train_lr": 0.0018558670409909955, "train_loss": 0.42602315420508385, "epoch": 171} +{"train_lr": 0.001853721826806509, "train_loss": 0.4258790579974651, "epoch": 172} +{"train_lr": 0.001851562024956937, "train_loss": 0.4258593296408653, "epoch": 173} +{"train_lr": 0.0018493876723473352, "train_loss": 0.42584737250208854, "epoch": 174} +{"train_lr": 0.0018471988061313895, "train_loss": 0.4257563955247402, "epoch": 175} +{"train_lr": 0.001844995463710784, "train_loss": 0.42564952899217606, "epoch": 176} +{"train_lr": 0.0018427776827345638, "train_loss": 0.425537062728405, "epoch": 177} +{"train_lr": 0.0018405455010984842, "train_loss": 0.42545911307930945, "epoch": 178} +{"train_lr": 0.0018382989569443692, "train_loss": 0.42556525562405584, "epoch": 179} +{"train_lr": 0.0018360380886594638, "train_loss": 0.4254311235845089, "epoch": 180} +{"train_lr": 0.0018337629348757645, "train_loss": 0.4255017030119896, "epoch": 181} +{"train_lr": 0.0018314735344693734, "train_loss": 0.42531982975006105, "epoch": 182} +{"train_lr": 0.0018291699265598315, "train_loss": 0.4253919682562351, "epoch": 183} +{"train_lr": 0.0018268521505094332, "train_loss": 0.42532408665418625, "epoch": 184} +{"train_lr": 0.0018245202459225905, "train_loss": 0.4252872289419174, "epoch": 185} +{"train_lr": 0.0018221742526451034, "train_loss": 0.42515819770097735, "epoch": 186} +{"train_lr": 0.0018198142107635346, "train_loss": 0.4251448391377926, "epoch": 187} +{"train_lr": 0.001817440160604478, "train_loss": 0.4251269141793251, "epoch": 188} +{"train_lr": 0.001815052142733913, "train_loss": 0.42496180029511454, "epoch": 189} +{"train_lr": 0.001812650197956469, "train_loss": 0.4249068469822407, "epoch": 190} +{"train_lr": 0.001810234367314759, "train_loss": 0.4250109994530678, "epoch": 191} +{"train_lr": 0.001807804692088656, "train_loss": 0.424981004846096, "epoch": 192} +{"train_lr": 0.0018053612137946117, "train_loss": 0.4247819488167763, "epoch": 193} +{"train_lr": 0.00180290397418492, "train_loss": 0.4246981884419918, "epoch": 194} +{"train_lr": 0.0018004330152470427, "train_loss": 0.4247443710744381, "epoch": 195} +{"train_lr": 0.001797948379202839, "train_loss": 0.42464397926330566, "epoch": 196} +{"train_lr": 0.001795450108507886, "train_loss": 0.4248728358566761, "epoch": 197} +{"train_lr": 0.0017929382458507345, "train_loss": 0.42464299993515014, "epoch": 198} +{"train_lr": 0.001790412834152188, "train_loss": 0.42458462185263635, "epoch": 199} +{"train_lr": 0.0017878739165645666, "train_loss": 0.4244833302080631, "epoch": 200} +{"train_lr": 0.0017853215364709624, "train_loss": 0.4244596959531307, "epoch": 201} +{"train_lr": 0.001782755737484517, "train_loss": 0.4243732154786587, "epoch": 202} +{"train_lr": 0.0017801765634476482, "train_loss": 0.42429113371372223, "epoch": 203} +{"train_lr": 0.0017775840584313269, "train_loss": 0.4243124633014202, "epoch": 204} +{"train_lr": 0.0017749782667343087, "train_loss": 0.4242242727458477, "epoch": 205} +{"train_lr": 0.0017723592328823872, "train_loss": 0.42419893629550937, "epoch": 206} +{"train_lr": 0.0017697270016276267, "train_loss": 0.424120762860775, "epoch": 207} +{"train_lr": 0.0017670816179475896, "train_loss": 0.4241485097467899, "epoch": 208} +{"train_lr": 0.0017644231270445914, 
"train_loss": 0.4241050954818726, "epoch": 209} +{"train_lr": 0.0017617515743449002, "train_loss": 0.42401950508356095, "epoch": 210} +{"train_lr": 0.0017590670054979855, "train_loss": 0.4240114216029644, "epoch": 211} +{"train_lr": 0.0017563694663757193, "train_loss": 0.4239511508405209, "epoch": 212} +{"train_lr": 0.001753659003071607, "train_loss": 0.4239001268327236, "epoch": 213} +{"train_lr": 0.0017509356618999798, "train_loss": 0.4238909521043301, "epoch": 214} +{"train_lr": 0.0017481994893952333, "train_loss": 0.42393678986430167, "epoch": 215} +{"train_lr": 0.0017454505323109951, "train_loss": 0.42374238679409026, "epoch": 216} +{"train_lr": 0.0017426888376193663, "train_loss": 0.42381820154190064, "epoch": 217} +{"train_lr": 0.0017399144525100897, "train_loss": 0.423696692097187, "epoch": 218} +{"train_lr": 0.0017371274243897503, "train_loss": 0.4236421609342098, "epoch": 219} +{"train_lr": 0.0017343278008809635, "train_loss": 0.42375850692987443, "epoch": 220} +{"train_lr": 0.0017315156298215765, "train_loss": 0.42361214114427564, "epoch": 221} +{"train_lr": 0.0017286909592638356, "train_loss": 0.423529484629631, "epoch": 222} +{"train_lr": 0.001725853837473557, "train_loss": 0.4241306705236435, "epoch": 223} +{"train_lr": 0.001723004312929336, "train_loss": 0.4236619794726372, "epoch": 224} +{"train_lr": 0.0017201424343216843, "train_loss": 0.423469975990057, "epoch": 225} +{"train_lr": 0.001717268250552199, "train_loss": 0.4235054041683674, "epoch": 226} +{"train_lr": 0.0017143818107327635, "train_loss": 0.4234480388879776, "epoch": 227} +{"train_lr": 0.001711483164184661, "train_loss": 0.42325710557699203, "epoch": 228} +{"train_lr": 0.0017085723604377695, "train_loss": 0.423364332896471, "epoch": 229} +{"train_lr": 0.001705649449229696, "train_loss": 0.4232913333415985, "epoch": 230} +{"train_lr": 0.0017027144805049166, "train_loss": 0.42325079972147944, "epoch": 231} +{"train_lr": 0.0016997675044139638, "train_loss": 0.4232346039891243, "epoch": 232} +{"train_lr": 0.0016968085713125144, "train_loss": 0.42314266840815545, "epoch": 233} +{"train_lr": 0.001693837731760583, "train_loss": 0.4230865432739258, "epoch": 234} +{"train_lr": 0.001690855036521616, "train_loss": 0.42304754146933554, "epoch": 235} +{"train_lr": 0.0016878605365616413, "train_loss": 0.4229908716440201, "epoch": 236} +{"train_lr": 0.0016848542830484078, "train_loss": 0.42301636381149293, "epoch": 237} +{"train_lr": 0.0016818363273504887, "train_loss": 0.4229623642385006, "epoch": 238} +{"train_lr": 0.0016788067210364202, "train_loss": 0.42286261225342753, "epoch": 239} +{"train_lr": 0.0016757655158738203, "train_loss": 0.4231466094911098, "epoch": 240} +{"train_lr": 0.0016727127638284855, "train_loss": 0.4228993058741093, "epoch": 241} +{"train_lr": 0.0016696485170635351, "train_loss": 0.4227703313648701, "epoch": 242} +{"train_lr": 0.001666572827938487, "train_loss": 0.42268667768239976, "epoch": 243} +{"train_lr": 0.0016634857490083828, "train_loss": 0.4226205878555775, "epoch": 244} +{"train_lr": 0.001660387333022884, "train_loss": 0.4228058986365795, "epoch": 245} +{"train_lr": 0.0016572776329253699, "train_loss": 0.4226573086321354, "epoch": 246} +{"train_lr": 0.0016541567018520343, "train_loss": 0.4226382351756096, "epoch": 247} +{"train_lr": 0.0016510245931309836, "train_loss": 0.42262957600951195, "epoch": 248} +{"train_lr": 0.001647881360281309, "train_loss": 0.42256055372953416, "epoch": 249} +{"train_lr": 0.0016447270570121876, "train_loss": 0.42256660661697387, "epoch": 250} +{"train_lr": 
0.0016415617372219618, "train_loss": 0.4224966368377209, "epoch": 251} +{"train_lr": 0.001638385454997211, "train_loss": 0.422463566839695, "epoch": 252} +{"train_lr": 0.00163519826461184, "train_loss": 0.42237583945393564, "epoch": 253} +{"train_lr": 0.0016320002205261264, "train_loss": 0.42226354267001154, "epoch": 254} +{"train_lr": 0.0016287913773858353, "train_loss": 0.4222660710632801, "epoch": 255} +{"train_lr": 0.0016255717900212328, "train_loss": 0.4222601546764374, "epoch": 256} +{"train_lr": 0.0016223415134461888, "train_loss": 0.4221981988191605, "epoch": 257} +{"train_lr": 0.0016191006028572102, "train_loss": 0.4222234422802925, "epoch": 258} +{"train_lr": 0.0016158491136325235, "train_loss": 0.42203234511613846, "epoch": 259} +{"train_lr": 0.0016125871013311073, "train_loss": 0.4220574823975563, "epoch": 260} +{"train_lr": 0.0016093146216917486, "train_loss": 0.42207971769571306, "epoch": 261} +{"train_lr": 0.0016060317306321, "train_loss": 0.42206390278339384, "epoch": 262} +{"train_lr": 0.0016027384842477105, "train_loss": 0.4220549532175064, "epoch": 263} +{"train_lr": 0.0015994349388110693, "train_loss": 0.4220111142575741, "epoch": 264} +{"train_lr": 0.001596121150770662, "train_loss": 0.4219424910187721, "epoch": 265} +{"train_lr": 0.0015927971767499772, "train_loss": 0.4220361890375614, "epoch": 266} +{"train_lr": 0.0015894630735465585, "train_loss": 0.4218697710752487, "epoch": 267} +{"train_lr": 0.001586118898131038, "train_loss": 0.42191931760907175, "epoch": 268} +{"train_lr": 0.0015827647076461402, "train_loss": 0.4217308155596256, "epoch": 269} +{"train_lr": 0.0015794005594057226, "train_loss": 0.4218587208151817, "epoch": 270} +{"train_lr": 0.0015760265108938055, "train_loss": 0.4217528022646904, "epoch": 271} +{"train_lr": 0.001572642619763563, "train_loss": 0.421679973757267, "epoch": 272} +{"train_lr": 0.0015692489438363627, "train_loss": 0.4217708421468735, "epoch": 273} +{"train_lr": 0.001565845541100755, "train_loss": 0.4216467033326626, "epoch": 274} +{"train_lr": 0.001562432469711511, "train_loss": 0.42169866006374357, "epoch": 275} +{"train_lr": 0.0015590097879886, "train_loss": 0.42156042192578314, "epoch": 276} +{"train_lr": 0.001555577554416206, "train_loss": 0.42161925470232964, "epoch": 277} +{"train_lr": 0.0015521358276417347, "train_loss": 0.42153908587694167, "epoch": 278} +{"train_lr": 0.0015486846664748033, "train_loss": 0.4215101927101612, "epoch": 279} +{"train_lr": 0.0015452241298862248, "train_loss": 0.4214610160768032, "epoch": 280} +{"train_lr": 0.0015417542770070323, "train_loss": 0.42141325249671935, "epoch": 281} +{"train_lr": 0.0015382751671274308, "train_loss": 0.42136881043314933, "epoch": 282} +{"train_lr": 0.0015347868596958091, "train_loss": 0.42135062956213953, "epoch": 283} +{"train_lr": 0.0015312894143177202, "train_loss": 0.42139784327149393, "epoch": 284} +{"train_lr": 0.0015277828907548521, "train_loss": 0.4214449079275131, "epoch": 285} +{"train_lr": 0.001524267348924025, "train_loss": 0.4213422214746475, "epoch": 286} +{"train_lr": 0.0015207428488961414, "train_loss": 0.42129106523990634, "epoch": 287} +{"train_lr": 0.0015172094508951826, "train_loss": 0.42131546414494514, "epoch": 288} +{"train_lr": 0.0015136672152971753, "train_loss": 0.4212149278342724, "epoch": 289} +{"train_lr": 0.0015101162026291506, "train_loss": 0.42106798495054243, "epoch": 290} +{"train_lr": 0.001506556473568119, "train_loss": 0.42114609475135806, "epoch": 291} +{"train_lr": 0.0015029880889400262, "train_loss": 0.42112620157003405, "epoch": 
292} +{"train_lr": 0.001499411109718721, "train_loss": 0.4210899014830589, "epoch": 293} +{"train_lr": 0.001495825597024904, "train_loss": 0.42102030997872353, "epoch": 294} +{"train_lr": 0.0014922316121251074, "train_loss": 0.4210644329071045, "epoch": 295} +{"train_lr": 0.0014886292164306054, "train_loss": 0.42094684926271436, "epoch": 296} +{"train_lr": 0.001485018471496406, "train_loss": 0.420904375231266, "epoch": 297} +{"train_lr": 0.001481399439020176, "train_loss": 0.4209073343873024, "epoch": 298} +{"train_lr": 0.0014777721808411927, "train_loss": 0.4208303572535515, "epoch": 299} +{"train_lr": 0.0014741367589392984, "train_loss": 0.420840155172348, "epoch": 300} +{"train_lr": 0.001470493235433814, "train_loss": 0.42079361829161643, "epoch": 301} +{"train_lr": 0.0014668416725825066, "train_loss": 0.420751271378994, "epoch": 302} +{"train_lr": 0.0014631821327805124, "train_loss": 0.42070620072484016, "epoch": 303} +{"train_lr": 0.0014595146785592672, "train_loss": 0.4206903719842434, "epoch": 304} +{"train_lr": 0.00145583937258545, "train_loss": 0.4207553890287876, "epoch": 305} +{"train_lr": 0.001452156277659891, "train_loss": 0.42065541954040525, "epoch": 306} +{"train_lr": 0.0014484654567165239, "train_loss": 0.4206352675974369, "epoch": 307} +{"train_lr": 0.0014447669728213, "train_loss": 0.420621358191967, "epoch": 308} +{"train_lr": 0.0014410608891710992, "train_loss": 0.42055000742673876, "epoch": 309} +{"train_lr": 0.0014373472690926664, "train_loss": 0.4204847206771374, "epoch": 310} +{"train_lr": 0.0014336261760415228, "train_loss": 0.4204664314568043, "epoch": 311} +{"train_lr": 0.0014298976736008813, "train_loss": 0.42043020857572555, "epoch": 312} +{"train_lr": 0.001426161825480565, "train_loss": 0.42040675433278085, "epoch": 313} +{"train_lr": 0.0014224186955159059, "train_loss": 0.42036361072659495, "epoch": 314} +{"train_lr": 0.001418668347666667, "train_loss": 0.42047722015976907, "epoch": 315} +{"train_lr": 0.001414910846015954, "train_loss": 0.4203856301009655, "epoch": 316} +{"train_lr": 0.0014111462547690917, "train_loss": 0.42029333937168123, "epoch": 317} +{"train_lr": 0.0014073746382525652, "train_loss": 0.4202634672820568, "epoch": 318} +{"train_lr": 0.0014035960609128908, "train_loss": 0.4202524435698986, "epoch": 319} +{"train_lr": 0.0013998105873155328, "train_loss": 0.4202196150660515, "epoch": 320} +{"train_lr": 0.0013960182821437879, "train_loss": 0.4202494762778282, "epoch": 321} +{"train_lr": 0.001392219210197692, "train_loss": 0.42015298603773116, "epoch": 322} +{"train_lr": 0.0013884134363928921, "train_loss": 0.4201268202781677, "epoch": 323} +{"train_lr": 0.001384601025759574, "train_loss": 0.419998131608963, "epoch": 324} +{"train_lr": 0.001380782043441313, "train_loss": 0.42012175452113154, "epoch": 325} +{"train_lr": 0.0013769565546939756, "train_loss": 0.4199835945248604, "epoch": 326} +{"train_lr": 0.001373124624884616, "train_loss": 0.4199679668843746, "epoch": 327} +{"train_lr": 0.0013692863194903408, "train_loss": 0.41995242735147476, "epoch": 328} +{"train_lr": 0.0013654417040971938, "train_loss": 0.419914648014307, "epoch": 329} +{"train_lr": 0.0013615908443990496, "train_loss": 0.420291811478138, "epoch": 330} +{"train_lr": 0.0013577338061964764, "train_loss": 0.42001431497335434, "epoch": 331} +{"train_lr": 0.0013538706553956092, "train_loss": 0.41982507169246674, "epoch": 332} +{"train_lr": 0.0013500014580070398, "train_loss": 0.4198868880212307, "epoch": 333} +{"train_lr": 0.0013461262801446774, "train_loss": 0.41981268939375876, 
"epoch": 334} +{"train_lr": 0.0013422451880246203, "train_loss": 0.4197859157443047, "epoch": 335} +{"train_lr": 0.0013383582479640172, "train_loss": 0.4197030574500561, "epoch": 336} +{"train_lr": 0.0013344655263799582, "train_loss": 0.41963064196109773, "epoch": 337} +{"train_lr": 0.0013305670897883135, "train_loss": 0.41961017757058144, "epoch": 338} +{"train_lr": 0.0013266630048026041, "train_loss": 0.41970578683018683, "epoch": 339} +{"train_lr": 0.00132275333813287, "train_loss": 0.4195633431851864, "epoch": 340} +{"train_lr": 0.001318838156584536, "train_loss": 0.41961616225242615, "epoch": 341} +{"train_lr": 0.0013149175270572404, "train_loss": 0.419570646417141, "epoch": 342} +{"train_lr": 0.0013109915165437332, "train_loss": 0.41954742604494094, "epoch": 343} +{"train_lr": 0.0013070601921287021, "train_loss": 0.4194275137424469, "epoch": 344} +{"train_lr": 0.001303123620987628, "train_loss": 0.41942608463168146, "epoch": 345} +{"train_lr": 0.0012991818703856554, "train_loss": 0.4194012299001217, "epoch": 346} +{"train_lr": 0.0012952350076764231, "train_loss": 0.41930887975096703, "epoch": 347} +{"train_lr": 0.00129128310030093, "train_loss": 0.4193776847779751, "epoch": 348} +{"train_lr": 0.0012873262157863646, "train_loss": 0.4193605829834938, "epoch": 349} +{"train_lr": 0.0012833644217449664, "train_loss": 0.4192501567542553, "epoch": 350} +{"train_lr": 0.0012793977858728675, "train_loss": 0.4192127873659134, "epoch": 351} +{"train_lr": 0.00127542637594893, "train_loss": 0.4193183109641075, "epoch": 352} +{"train_lr": 0.0012714502598335897, "train_loss": 0.4197004640817642, "epoch": 353} +{"train_lr": 0.0012674695054677005, "train_loss": 0.41917526848316194, "epoch": 354} +{"train_lr": 0.0012634841808713748, "train_loss": 0.41914406824707984, "epoch": 355} +{"train_lr": 0.0012594943541428109, "train_loss": 0.4190760906219482, "epoch": 356} +{"train_lr": 0.0012555000934571397, "train_loss": 0.419058157235384, "epoch": 357} +{"train_lr": 0.0012515014670652586, "train_loss": 0.419025038343668, "epoch": 358} +{"train_lr": 0.0012474985432926558, "train_loss": 0.4189569546878338, "epoch": 359} +{"train_lr": 0.001243491390538254, "train_loss": 0.4190199301660061, "epoch": 360} +{"train_lr": 0.0012394800772732412, "train_loss": 0.4189968164920807, "epoch": 361} +{"train_lr": 0.0012354646720398926, "train_loss": 0.41894257601499557, "epoch": 362} +{"train_lr": 0.001231445243450402, "train_loss": 0.41891998412013054, "epoch": 363} +{"train_lr": 0.0012274218601857198, "train_loss": 0.41875716477632524, "epoch": 364} +{"train_lr": 0.0012233945909943611, "train_loss": 0.4189456850349903, "epoch": 365} +{"train_lr": 0.001219363504691245, "train_loss": 0.41879626615047455, "epoch": 366} +{"train_lr": 0.0012153286701565129, "train_loss": 0.4187718325734138, "epoch": 367} +{"train_lr": 0.0012112901563343563, "train_loss": 0.4186474060893059, "epoch": 368} +{"train_lr": 0.0012072480322318328, "train_loss": 0.4186492353022099, "epoch": 369} +{"train_lr": 0.0012032023669176915, "train_loss": 0.4186739155292511, "epoch": 370} +{"train_lr": 0.0011991532295211936, "train_loss": 0.41872328140735626, "epoch": 371} +{"train_lr": 0.001195100689230918, "train_loss": 0.4187125334382057, "epoch": 372} +{"train_lr": 0.0011910448152936013, "train_loss": 0.4187649071574211, "epoch": 373} +{"train_lr": 0.00118698567701294, "train_loss": 0.41862554777264593, "epoch": 374} +{"train_lr": 0.001182923343748406, "train_loss": 0.418472838807106, "epoch": 375} +{"train_lr": 0.0011788578849140647, "train_loss": 
0.4184612022995949, "epoch": 376} +{"train_lr": 0.00117478936997739, "train_loss": 0.418489412689209, "epoch": 377} +{"train_lr": 0.001170717868458082, "train_loss": 0.4183967174947262, "epoch": 378} +{"train_lr": 0.001166643449926863, "train_loss": 0.4184104426383972, "epoch": 379} +{"train_lr": 0.0011625661840043084, "train_loss": 0.41830870187282565, "epoch": 380} +{"train_lr": 0.0011584861403596384, "train_loss": 0.41836045224666596, "epoch": 381} +{"train_lr": 0.0011544033887095435, "train_loss": 0.41827855964899063, "epoch": 382} +{"train_lr": 0.0011503179988169893, "train_loss": 0.41831054545640944, "epoch": 383} +{"train_lr": 0.001146230040490009, "train_loss": 0.41834157658815385, "epoch": 384} +{"train_lr": 0.0011421395835805358, "train_loss": 0.41814741303920744, "epoch": 385} +{"train_lr": 0.0011380466979831925, "train_loss": 0.41818422635793684, "epoch": 386} +{"train_lr": 0.0011339514536341003, "train_loss": 0.418172834277153, "epoch": 387} +{"train_lr": 0.001129853920509686, "train_loss": 0.4181679087162018, "epoch": 388} +{"train_lr": 0.0011257541686254895, "train_loss": 0.4180516511440277, "epoch": 389} +{"train_lr": 0.0011216522680349492, "train_loss": 0.41804933690428736, "epoch": 390} +{"train_lr": 0.0011175482888282399, "train_loss": 0.4180680680811405, "epoch": 391} +{"train_lr": 0.0011134423011310347, "train_loss": 0.4179827343761921, "epoch": 392} +{"train_lr": 0.0011093343751033356, "train_loss": 0.4179728990733623, "epoch": 393} +{"train_lr": 0.0011052245809382672, "train_loss": 0.4179292483329773, "epoch": 394} +{"train_lr": 0.0011011129888608734, "train_loss": 0.41794585397839545, "epoch": 395} +{"train_lr": 0.0010969996691269118, "train_loss": 0.41799348885416987, "epoch": 396} +{"train_lr": 0.0010928846920216773, "train_loss": 0.41798636142015455, "epoch": 397} +{"train_lr": 0.0010887681278587693, "train_loss": 0.4178778306603432, "epoch": 398} +{"train_lr": 0.0010846500469789088, "train_loss": 0.417874898070097, "epoch": 399} +{"train_lr": 0.0010805305197487387, "train_loss": 0.4177279465615749, "epoch": 400} +{"train_lr": 0.001076409616559617, "train_loss": 0.4177750180602074, "epoch": 401} +{"train_lr": 0.001072287407826403, "train_loss": 0.41769819692969323, "epoch": 402} +{"train_lr": 0.0010681639639862738, "train_loss": 0.4177047014296055, "epoch": 403} +{"train_lr": 0.0010640393554975105, "train_loss": 0.41777194578647614, "epoch": 404} +{"train_lr": 0.001059913652838287, "train_loss": 0.4176086929380894, "epoch": 405} +{"train_lr": 0.0010557869265054776, "train_loss": 0.41784522614479064, "epoch": 406} +{"train_lr": 0.0010516592470134524, "train_loss": 0.4175245689034462, "epoch": 407} +{"train_lr": 0.0010475306848928647, "train_loss": 0.41752753249406815, "epoch": 408} +{"train_lr": 0.0010434013106894533, "train_loss": 0.41756826764941213, "epoch": 409} +{"train_lr": 0.0010392711949628248, "train_loss": 0.417456934183836, "epoch": 410} +{"train_lr": 0.00103514040828526, "train_loss": 0.4174256393015385, "epoch": 411} +{"train_lr": 0.001031009021240512, "train_loss": 0.4173729620695114, "epoch": 412} +{"train_lr": 0.0010268771044225837, "train_loss": 0.41735645656585696, "epoch": 413} +{"train_lr": 0.0010227447284345357, "train_loss": 0.4173586934030056, "epoch": 414} +{"train_lr": 0.0010186119638872688, "train_loss": 0.4173215918242931, "epoch": 415} +{"train_lr": 0.001014478881398324, "train_loss": 0.4172305813729763, "epoch": 416} +{"train_lr": 0.0010103455515906839, "train_loss": 0.4172926494061947, "epoch": 417} +{"train_lr": 
0.0010062120450915484, "train_loss": 0.41724464458227156, "epoch": 418} +{"train_lr": 0.0010020784325311383, "train_loss": 0.4171691377878189, "epoch": 419} +{"train_lr": 0.0009979447845414845, "train_loss": 0.4171639740407467, "epoch": 420} +{"train_lr": 0.000993811171755231, "train_loss": 0.41711280400156975, "epoch": 421} +{"train_lr": 0.0009896776648044105, "train_loss": 0.4171565491616726, "epoch": 422} +{"train_lr": 0.0009855443343192564, "train_loss": 0.41707179708480835, "epoch": 423} +{"train_lr": 0.0009814112509269812, "train_loss": 0.4170188140451908, "epoch": 424} +{"train_lr": 0.0009772784852505741, "train_loss": 0.4170406273066998, "epoch": 425} +{"train_lr": 0.0009731461079075985, "train_loss": 0.41711612367033957, "epoch": 426} +{"train_lr": 0.0009690141895089831, "train_loss": 0.4170222055196762, "epoch": 427} +{"train_lr": 0.0009648828006578134, "train_loss": 0.41702341947555543, "epoch": 428} +{"train_lr": 0.0009607520119481245, "train_loss": 0.41691789889335634, "epoch": 429} +{"train_lr": 0.0009566218939636999, "train_loss": 0.4168686304748058, "epoch": 430} +{"train_lr": 0.0009524925172768602, "train_loss": 0.41675924023389815, "epoch": 431} +{"train_lr": 0.0009483639524472556, "train_loss": 0.416854871147871, "epoch": 432} +{"train_lr": 0.000944236270020672, "train_loss": 0.4168514198482037, "epoch": 433} +{"train_lr": 0.0009401095405278129, "train_loss": 0.416757233697176, "epoch": 434} +{"train_lr": 0.0009359838344831006, "train_loss": 0.41670299382805825, "epoch": 435} +{"train_lr": 0.0009318592223834629, "train_loss": 0.4166590934753418, "epoch": 436} +{"train_lr": 0.0009277357747071485, "train_loss": 0.41663964110612867, "epoch": 437} +{"train_lr": 0.0009236135619124953, "train_loss": 0.4165965996146202, "epoch": 438} +{"train_lr": 0.0009194926544367453, "train_loss": 0.41651798075437546, "epoch": 439} +{"train_lr": 0.0009153731226948438, "train_loss": 0.4164986294090748, "epoch": 440} +{"train_lr": 0.0009112550370782172, "train_loss": 0.416437136977911, "epoch": 441} +{"train_lr": 0.0009071384679535845, "train_loss": 0.41646568976044657, "epoch": 442} +{"train_lr": 0.0009030234856617595, "train_loss": 0.41643167090415956, "epoch": 443} +{"train_lr": 0.0008989101605164331, "train_loss": 0.41652717319726945, "epoch": 444} +{"train_lr": 0.0008947985628029826, "train_loss": 0.41643864707946776, "epoch": 445} +{"train_lr": 0.000890688762777271, "train_loss": 0.4163997540950775, "epoch": 446} +{"train_lr": 0.000886580830664437, "train_loss": 0.41629073085188867, "epoch": 447} +{"train_lr": 0.000882474836657711, "train_loss": 0.41629329221844674, "epoch": 448} +{"train_lr": 0.0008783708509171996, "train_loss": 0.4162357913553715, "epoch": 449} +{"train_lr": 0.0008742689435686971, "train_loss": 0.41620945250988006, "epoch": 450} +{"train_lr": 0.0008701691847024806, "train_loss": 0.4162765025675297, "epoch": 451} +{"train_lr": 0.0008660716443721178, "train_loss": 0.4162437853038311, "epoch": 452} +{"train_lr": 0.0008619763925932672, "train_loss": 0.4160850357532501, "epoch": 453} +{"train_lr": 0.000857883499342485, "train_loss": 0.41610757068395615, "epoch": 454} +{"train_lr": 0.0008537930345560229, "train_loss": 0.4160928344607353, "epoch": 455} +{"train_lr": 0.0008497050681286344, "train_loss": 0.4160299357116222, "epoch": 456} +{"train_lr": 0.0008456196699123931, "train_loss": 0.4160114522874355, "epoch": 457} +{"train_lr": 0.0008415369097154786, "train_loss": 0.4159763306438923, "epoch": 458} +{"train_lr": 0.0008374568573009967, "train_loss": 0.41603127918839455, 
"epoch": 459} +{"train_lr": 0.0008333795823857903, "train_loss": 0.41598988704681394, "epoch": 460} +{"train_lr": 0.0008293051546392356, "train_loss": 0.4159310473740101, "epoch": 461} +{"train_lr": 0.0008252336436820601, "train_loss": 0.415981581813097, "epoch": 462} +{"train_lr": 0.0008211651190851549, "train_loss": 0.41578987702727316, "epoch": 463} +{"train_lr": 0.000817099650368378, "train_loss": 0.41580206685066223, "epoch": 464} +{"train_lr": 0.0008130373069993725, "train_loss": 0.41571778808832166, "epoch": 465} +{"train_lr": 0.0008089781583923796, "train_loss": 0.41572712430357933, "epoch": 466} +{"train_lr": 0.0008049222739070492, "train_loss": 0.41574336388111116, "epoch": 467} +{"train_lr": 0.0008008697228472562, "train_loss": 0.4156535137236118, "epoch": 468} +{"train_lr": 0.0007968205744599162, "train_loss": 0.4156419990241528, "epoch": 469} +{"train_lr": 0.0007927748979338039, "train_loss": 0.4155884074151516, "epoch": 470} +{"train_lr": 0.0007887327623983688, "train_loss": 0.4156378916442394, "epoch": 471} +{"train_lr": 0.0007846942369225543, "train_loss": 0.4155085301876068, "epoch": 472} +{"train_lr": 0.0007806593905136176, "train_loss": 0.4155448598384857, "epoch": 473} +{"train_lr": 0.0007766282921159524, "train_loss": 0.4154927032291889, "epoch": 474} +{"train_lr": 0.0007726010106099076, "train_loss": 0.41546733177900314, "epoch": 475} +{"train_lr": 0.0007685776148106116, "train_loss": 0.4154239023923874, "epoch": 476} +{"train_lr": 0.0007645581734668001, "train_loss": 0.41537334437966345, "epoch": 477} +{"train_lr": 0.0007605427552596346, "train_loss": 0.4154018564403057, "epoch": 478} +{"train_lr": 0.0007565314288015307, "train_loss": 0.4154400738298893, "epoch": 479} +{"train_lr": 0.0007525242626349935, "train_loss": 0.41526551213860513, "epoch": 480} +{"train_lr": 0.0007485213252314344, "train_loss": 0.41534639605283735, "epoch": 481} +{"train_lr": 0.0007445226849900115, "train_loss": 0.41517547313570974, "epoch": 482} +{"train_lr": 0.0007405284102364519, "train_loss": 0.41520622568130494, "epoch": 483} +{"train_lr": 0.0007365385692218902, "train_loss": 0.4151805117607117, "epoch": 484} +{"train_lr": 0.0007325532301217024, "train_loss": 0.4151923873543739, "epoch": 485} +{"train_lr": 0.0007285724610343378, "train_loss": 0.415164637196064, "epoch": 486} +{"train_lr": 0.0007245963299801566, "train_loss": 0.41513061105012894, "epoch": 487} +{"train_lr": 0.0007206249049002679, "train_loss": 0.41505594806075097, "epoch": 488} +{"train_lr": 0.000716658253655366, "train_loss": 0.414974316573143, "epoch": 489} +{"train_lr": 0.0007126964440245807, "train_loss": 0.4150249040722847, "epoch": 490} +{"train_lr": 0.0007087395437043058, "train_loss": 0.41488441542983057, "epoch": 491} +{"train_lr": 0.00070478762030705, "train_loss": 0.4148910955309868, "epoch": 492} +{"train_lr": 0.0007008407413602802, "train_loss": 0.41482908695936205, "epoch": 493} +{"train_lr": 0.000696898974305269, "train_loss": 0.4148184060752392, "epoch": 494} +{"train_lr": 0.000692962386495939, "train_loss": 0.4147723206758499, "epoch": 495} +{"train_lr": 0.0006890310451977145, "train_loss": 0.4147319468975067, "epoch": 496} +{"train_lr": 0.0006851050175863707, "train_loss": 0.41478028755784035, "epoch": 497} +{"train_lr": 0.0006811843707468876, "train_loss": 0.41468667683005334, "epoch": 498} +{"train_lr": 0.0006772691716723045, "train_loss": 0.4146990427553654, "epoch": 499} +{"train_lr": 0.0006733594872625652, "train_loss": 0.4146124188661575, "epoch": 500} +{"train_lr": 0.0006694553843233956, 
"train_loss": 0.41460838395953176, "epoch": 501} +{"train_lr": 0.000665556929565141, "train_loss": 0.4146430678844452, "epoch": 502} +{"train_lr": 0.0006616641896016334, "train_loss": 0.41454836529493333, "epoch": 503} +{"train_lr": 0.0006577772309490656, "train_loss": 0.4145636016011238, "epoch": 504} +{"train_lr": 0.0006538961200248293, "train_loss": 0.4144807538509369, "epoch": 505} +{"train_lr": 0.0006500209231464063, "train_loss": 0.4144996554195881, "epoch": 506} +{"train_lr": 0.0006461517065302167, "train_loss": 0.41448832686543463, "epoch": 507} +{"train_lr": 0.0006422885362904992, "train_loss": 0.41441225247979163, "epoch": 508} +{"train_lr": 0.0006384314784381729, "train_loss": 0.41435343540906905, "epoch": 509} +{"train_lr": 0.000634580598879715, "train_loss": 0.41432497901916504, "epoch": 510} +{"train_lr": 0.0006307359634160299, "train_loss": 0.4141963863253593, "epoch": 511} +{"train_lr": 0.0006268976377413344, "train_loss": 0.414292369222641, "epoch": 512} +{"train_lr": 0.0006230656874420206, "train_loss": 0.41412822899222373, "epoch": 513} +{"train_lr": 0.000619240177995549, "train_loss": 0.41413087169528007, "epoch": 514} +{"train_lr": 0.0006154211747693183, "train_loss": 0.4142154009103775, "epoch": 515} +{"train_lr": 0.0006116087430195577, "train_loss": 0.4141320895433426, "epoch": 516} +{"train_lr": 0.0006078029478902082, "train_loss": 0.41410443152785303, "epoch": 517} +{"train_lr": 0.0006040038544118062, "train_loss": 0.41408611317276955, "epoch": 518} +{"train_lr": 0.0006002115275003778, "train_loss": 0.41404432806372643, "epoch": 519} +{"train_lr": 0.0005964260319563274, "train_loss": 0.41398654327988627, "epoch": 520} +{"train_lr": 0.0005926474324633267, "train_loss": 0.41391335440278054, "epoch": 521} +{"train_lr": 0.0005888757935872201, "train_loss": 0.41387077738046646, "epoch": 522} +{"train_lr": 0.0005851111797749066, "train_loss": 0.41397186594605445, "epoch": 523} +{"train_lr": 0.0005813536553532483, "train_loss": 0.4139413024187088, "epoch": 524} +{"train_lr": 0.0005776032845279719, "train_loss": 0.4138102644562721, "epoch": 525} +{"train_lr": 0.0005738601313825683, "train_loss": 0.4137563929796219, "epoch": 526} +{"train_lr": 0.0005701242598771955, "train_loss": 0.41368419902324677, "epoch": 527} +{"train_lr": 0.0005663957338475891, "train_loss": 0.4137420842349529, "epoch": 528} +{"train_lr": 0.0005626746170039725, "train_loss": 0.41368860872387886, "epoch": 529} +{"train_lr": 0.0005589609729299664, "train_loss": 0.41366335294246676, "epoch": 530} +{"train_lr": 0.0005552548650815012, "train_loss": 0.4136664641916752, "epoch": 531} +{"train_lr": 0.0005515563567857334, "train_loss": 0.41364744307994844, "epoch": 532} +{"train_lr": 0.0005478655112399664, "train_loss": 0.4135095750927925, "epoch": 533} +{"train_lr": 0.0005441823915105678, "train_loss": 0.4134646384775639, "epoch": 534} +{"train_lr": 0.0005405070605318911, "train_loss": 0.4135267463207245, "epoch": 535} +{"train_lr": 0.0005368395811052013, "train_loss": 0.41346614977121354, "epoch": 536} +{"train_lr": 0.000533180015897602, "train_loss": 0.41341310681700705, "epoch": 537} +{"train_lr": 0.0005295284274409709, "train_loss": 0.41329991322159765, "epoch": 538} +{"train_lr": 0.0005258848781308736, "train_loss": 0.41334705371260644, "epoch": 539} +{"train_lr": 0.0005222494302255165, "train_loss": 0.41332384219169616, "epoch": 540} +{"train_lr": 0.0005186221458446746, "train_loss": 0.41331451881527903, "epoch": 541} +{"train_lr": 0.0005150030869686313, "train_loss": 0.4133096279680729, "epoch": 542} 
+{"train_lr": 0.0005113923154371142, "train_loss": 0.41333488993048667, "epoch": 543} +{"train_lr": 0.0005077898929482494, "train_loss": 0.41325920339226724, "epoch": 544} +{"train_lr": 0.0005041958810574948, "train_loss": 0.41315065550804136, "epoch": 545} +{"train_lr": 0.0005006103411766005, "train_loss": 0.41317506961226463, "epoch": 546} +{"train_lr": 0.0004970333345725481, "train_loss": 0.4131286765635014, "epoch": 547} +{"train_lr": 0.0004934649223665127, "train_loss": 0.41310886276960374, "epoch": 548} +{"train_lr": 0.0004899051655328116, "train_loss": 0.4130334359705448, "epoch": 549} +{"train_lr": 0.0004863541248978668, "train_loss": 0.4128937359213829, "epoch": 550} +{"train_lr": 0.00048281186113916804, "train_loss": 0.413018404263258, "epoch": 551} +{"train_lr": 0.00047927843478422894, "train_loss": 0.4130131136238575, "epoch": 552} +{"train_lr": 0.00047575390620955427, "train_loss": 0.41287013072967527, "epoch": 553} +{"train_lr": 0.00047223833563961505, "train_loss": 0.4128674404680729, "epoch": 554} +{"train_lr": 0.00046873178314581177, "train_loss": 0.41282065522670747, "epoch": 555} +{"train_lr": 0.00046523430864545227, "train_loss": 0.412801239490509, "epoch": 556} +{"train_lr": 0.00046174597190072565, "train_loss": 0.4127468424975872, "epoch": 557} +{"train_lr": 0.0004582668325176823, "train_loss": 0.41264703783988954, "epoch": 558} +{"train_lr": 0.000454796949945214, "train_loss": 0.412723533976078, "epoch": 559} +{"train_lr": 0.0004513363834740404, "train_loss": 0.4126707662463188, "epoch": 560} +{"train_lr": 0.0004478851922356962, "train_loss": 0.4126263898909092, "epoch": 561} +{"train_lr": 0.0004444434352015155, "train_loss": 0.4126412259161472, "epoch": 562} +{"train_lr": 0.0004410111711816321, "train_loss": 0.4125005640268326, "epoch": 563} +{"train_lr": 0.0004375884588239656, "train_loss": 0.41252969363331793, "epoch": 564} +{"train_lr": 0.0004341753566132277, "train_loss": 0.4123950005233288, "epoch": 565} +{"train_lr": 0.0004307719228699184, "train_loss": 0.4124559945344925, "epoch": 566} +{"train_lr": 0.0004273782157493301, "train_loss": 0.41243081186413766, "epoch": 567} +{"train_lr": 0.00042399429324055236, "train_loss": 0.41239919402599334, "epoch": 568} +{"train_lr": 0.0004206202131654863, "train_loss": 0.4123308017849922, "epoch": 569} +{"train_lr": 0.000417256033177851, "train_loss": 0.4123557644248009, "epoch": 570} +{"train_lr": 0.00041390181076219907, "train_loss": 0.41225514442920685, "epoch": 571} +{"train_lr": 0.0004105576032329374, "train_loss": 0.41224638593196866, "epoch": 572} +{"train_lr": 0.0004072234677333462, "train_loss": 0.4121566233634949, "epoch": 573} +{"train_lr": 0.000403899461234601, "train_loss": 0.4122022950172424, "epoch": 574} +{"train_lr": 0.0004005856405348028, "train_loss": 0.4122386267721653, "epoch": 575} +{"train_lr": 0.00039728206225800316, "train_loss": 0.4121166242182255, "epoch": 576} +{"train_lr": 0.0003939887828532405, "train_loss": 0.41211722364425657, "epoch": 577} +{"train_lr": 0.00039070585859357225, "train_loss": 0.41196879163384437, "epoch": 578} +{"train_lr": 0.00038743334557511883, "train_loss": 0.4120268380403519, "epoch": 579} +{"train_lr": 0.00038417129971609465, "train_loss": 0.4120321435570717, "epoch": 580} +{"train_lr": 0.0003809197767558675, "train_loss": 0.4119880166888237, "epoch": 581} +{"train_lr": 0.00037767883225399033, "train_loss": 0.41182354040145874, "epoch": 582} +{"train_lr": 0.00037444852158926347, "train_loss": 0.4119151137650013, "epoch": 583} +{"train_lr": 0.00037122889995878434, 
"train_loss": 0.41178027091026304, "epoch": 584} +{"train_lr": 0.00036802002237700215, "train_loss": 0.41189671708345416, "epoch": 585} +{"train_lr": 0.0003648219436747815, "train_loss": 0.4118088481903076, "epoch": 586} +{"train_lr": 0.00036163471849846445, "train_loss": 0.41158689913749696, "epoch": 587} +{"train_lr": 0.00035845840130893473, "train_loss": 0.4116609573543072, "epoch": 588} +{"train_lr": 0.00035529304638068815, "train_loss": 0.4116432239770889, "epoch": 589} +{"train_lr": 0.0003521387078009091, "train_loss": 0.4116695198178291, "epoch": 590} +{"train_lr": 0.0003489954394685392, "train_loss": 0.4116169459104538, "epoch": 591} +{"train_lr": 0.000345863295093364, "train_loss": 0.4115727410554886, "epoch": 592} +{"train_lr": 0.0003427423281950851, "train_loss": 0.41158620098233223, "epoch": 593} +{"train_lr": 0.00033963259210241883, "train_loss": 0.41150212720036505, "epoch": 594} +{"train_lr": 0.00033653413995217435, "train_loss": 0.41141462765336034, "epoch": 595} +{"train_lr": 0.00033344702468834903, "train_loss": 0.4113722758948803, "epoch": 596} +{"train_lr": 0.00033037129906122623, "train_loss": 0.41129573442935946, "epoch": 597} +{"train_lr": 0.0003273070156264704, "train_loss": 0.41129547247886655, "epoch": 598} +{"train_lr": 0.0003242542267442306, "train_loss": 0.4113450105786324, "epoch": 599} +{"train_lr": 0.0003212129845782456, "train_loss": 0.411285870462656, "epoch": 600} +{"train_lr": 0.0003181833410949536, "train_loss": 0.41130744271874425, "epoch": 601} +{"train_lr": 0.00031516534806260186, "train_loss": 0.4112095928132534, "epoch": 602} +{"train_lr": 0.00031215905705036536, "train_loss": 0.41113772990703584, "epoch": 603} +{"train_lr": 0.0003091645194274621, "train_loss": 0.41113032053112986, "epoch": 604} +{"train_lr": 0.0003061817863622778, "train_loss": 0.4110891651570797, "epoch": 605} +{"train_lr": 0.00030321090882149234, "train_loss": 0.41110460319519043, "epoch": 606} +{"train_lr": 0.0003002519375692042, "train_loss": 0.41107726674675943, "epoch": 607} +{"train_lr": 0.00029730492316606825, "train_loss": 0.4110739596545696, "epoch": 608} +{"train_lr": 0.0002943699159684297, "train_loss": 0.4109533204615116, "epoch": 609} +{"train_lr": 0.00029144696612746454, "train_loss": 0.41088306730389595, "epoch": 610} +{"train_lr": 0.0002885361235883199, "train_loss": 0.41095819348096846, "epoch": 611} +{"train_lr": 0.0002856374380892637, "train_loss": 0.41093446829319, "epoch": 612} +{"train_lr": 0.00028275095916083335, "train_loss": 0.41092277715802195, "epoch": 613} +{"train_lr": 0.00027987673612499026, "train_loss": 0.41091884284615515, "epoch": 614} +{"train_lr": 0.00027701481809427403, "train_loss": 0.41077308706641197, "epoch": 615} +{"train_lr": 0.0002741652539709704, "train_loss": 0.41076149238944054, "epoch": 616} +{"train_lr": 0.0002713280924462657, "train_loss": 0.41067302731275557, "epoch": 617} +{"train_lr": 0.00026850338199942207, "train_loss": 0.4106996956408024, "epoch": 618} +{"train_lr": 0.0002656911708969498, "train_loss": 0.41060551152825353, "epoch": 619} +{"train_lr": 0.0002628915071917763, "train_loss": 0.41057525554299357, "epoch": 620} +{"train_lr": 0.0002601044387224285, "train_loss": 0.4105493293166161, "epoch": 621} +{"train_lr": 0.0002573300131122188, "train_loss": 0.41071004919409754, "epoch": 622} +{"train_lr": 0.00025456827776842376, "train_loss": 0.41045111640691756, "epoch": 623} +{"train_lr": 0.00025181927988148265, "train_loss": 0.410511493909359, "epoch": 624} +{"train_lr": 0.0002490830664241836, "train_loss": 
0.4104461461484432, "epoch": 625} +{"train_lr": 0.0002463596841508659, "train_loss": 0.4104572146654129, "epoch": 626} +{"train_lr": 0.00024364917959661644, "train_loss": 0.41034869700074195, "epoch": 627} +{"train_lr": 0.00024095159907648234, "train_loss": 0.41023331859111783, "epoch": 628} +{"train_lr": 0.0002382669886846699, "train_loss": 0.41035697820782663, "epoch": 629} +{"train_lr": 0.0002355953942937644, "train_loss": 0.4102578080415726, "epoch": 630} +{"train_lr": 0.00023293686155394203, "train_loss": 0.41025401488542557, "epoch": 631} +{"train_lr": 0.00023029143589219285, "train_loss": 0.41027388836741446, "epoch": 632} +{"train_lr": 0.00022765916251154313, "train_loss": 0.4101309650480747, "epoch": 633} +{"train_lr": 0.00022504008639028075, "train_loss": 0.41018197714686394, "epoch": 634} +{"train_lr": 0.00022243425228119063, "train_loss": 0.4102461946487427, "epoch": 635} +{"train_lr": 0.00021984170471078866, "train_loss": 0.41012363595962525, "epoch": 636} +{"train_lr": 0.00021726248797855976, "train_loss": 0.41003916486501696, "epoch": 637} +{"train_lr": 0.0002146966461562013, "train_loss": 0.4100011553347111, "epoch": 638} +{"train_lr": 0.00021214422308687, "train_loss": 0.4099870161771774, "epoch": 639} +{"train_lr": 0.00020960526238443468, "train_loss": 0.409950205296278, "epoch": 640} +{"train_lr": 0.00020707980743272803, "train_loss": 0.40993198407888415, "epoch": 641} +{"train_lr": 0.00020456790138480746, "train_loss": 0.40987456869482997, "epoch": 642} +{"train_lr": 0.00020206958716221631, "train_loss": 0.4099106639921665, "epoch": 643} +{"train_lr": 0.00019958490745425211, "train_loss": 0.40992944944500925, "epoch": 644} +{"train_lr": 0.00019711390471723525, "train_loss": 0.40970903441905976, "epoch": 645} +{"train_lr": 0.00019465662117378513, "train_loss": 0.4097623137831688, "epoch": 646} +{"train_lr": 0.00019221309881209726, "train_loss": 0.4097091728568077, "epoch": 647} +{"train_lr": 0.00018978337938522675, "train_loss": 0.4097723929464817, "epoch": 648} +{"train_lr": 0.00018736750441037523, "train_loss": 0.4096767637908459, "epoch": 649} +{"train_lr": 0.00018496551516817997, "train_loss": 0.4096685712814331, "epoch": 650} +{"train_lr": 0.00018257745270201065, "train_loss": 0.4095007773041725, "epoch": 651} +{"train_lr": 0.00018020335781726479, "train_loss": 0.40950128165483474, "epoch": 652} +{"train_lr": 0.0001778432710806747, "train_loss": 0.4095606074631214, "epoch": 653} +{"train_lr": 0.00017549723281960988, "train_loss": 0.40949765983819963, "epoch": 654} +{"train_lr": 0.00017316528312139175, "train_loss": 0.40952413992881775, "epoch": 655} +{"train_lr": 0.00017084746183260703, "train_loss": 0.4094638512015343, "epoch": 656} +{"train_lr": 0.00016854380855842624, "train_loss": 0.4094694583117962, "epoch": 657} +{"train_lr": 0.00016625436266192763, "train_loss": 0.40931712368130685, "epoch": 658} +{"train_lr": 0.00016397916326342497, "train_loss": 0.4093422090888023, "epoch": 659} +{"train_lr": 0.000161718249239798, "train_loss": 0.4092994294703007, "epoch": 660} +{"train_lr": 0.0001594716592238298, "train_loss": 0.4093663468182087, "epoch": 661} +{"train_lr": 0.00015723943160354516, "train_loss": 0.40929065743684767, "epoch": 662} +{"train_lr": 0.00015502160452155516, "train_loss": 0.4092123525619507, "epoch": 663} +{"train_lr": 0.00015281821587440569, "train_loss": 0.40918805617690085, "epoch": 664} +{"train_lr": 0.00015062930331192866, "train_loss": 0.4091305765867233, "epoch": 665} +{"train_lr": 0.0001484549042366004, "train_loss": 0.40919655148983003, 
"epoch": 666} +{"train_lr": 0.0001462950558029027, "train_loss": 0.40923569843173024, "epoch": 667} +{"train_lr": 0.0001441497949166853, "train_loss": 0.40912016796469686, "epoch": 668} +{"train_lr": 0.00014201915823453798, "train_loss": 0.4091143898308277, "epoch": 669} +{"train_lr": 0.00013990318216316309, "train_loss": 0.4091158373832703, "epoch": 670} +{"train_lr": 0.00013780190285875329, "train_loss": 0.4089883540272713, "epoch": 671} +{"train_lr": 0.0001357153562263738, "train_loss": 0.40893249164819717, "epoch": 672} +{"train_lr": 0.00013364357791935063, "train_loss": 0.409016412883997, "epoch": 673} +{"train_lr": 0.0001315866033386586, "train_loss": 0.40892095088362695, "epoch": 674} +{"train_lr": 0.00012954446763231708, "train_loss": 0.4089177478671074, "epoch": 675} +{"train_lr": 0.00012751720569479193, "train_loss": 0.4089482992887497, "epoch": 676} +{"train_lr": 0.00012550485216639558, "train_loss": 0.40890288605093955, "epoch": 677} +{"train_lr": 0.0001235074414326978, "train_loss": 0.40893578273653985, "epoch": 678} +{"train_lr": 0.00012152500762393668, "train_loss": 0.40879338170886037, "epoch": 679} +{"train_lr": 0.00011955758461443642, "train_loss": 0.40870585800409315, "epoch": 680} +{"train_lr": 0.0001176052060220283, "train_loss": 0.408755088865757, "epoch": 681} +{"train_lr": 0.00011566790520747518, "train_loss": 0.4087392102777958, "epoch": 682} +{"train_lr": 0.00011374571527390314, "train_loss": 0.40866463065743447, "epoch": 683} +{"train_lr": 0.0001118386690662345, "train_loss": 0.4087050619006157, "epoch": 684} +{"train_lr": 0.00010994679917062744, "train_loss": 0.4086720600247383, "epoch": 685} +{"train_lr": 0.000108070137913918, "train_loss": 0.40857414263486863, "epoch": 686} +{"train_lr": 0.00010620871736307003, "train_loss": 0.40863434770703316, "epoch": 687} +{"train_lr": 0.00010436256932462424, "train_loss": 0.40859491340518, "epoch": 688} +{"train_lr": 0.00010253172534415723, "train_loss": 0.40860966989994046, "epoch": 689} +{"train_lr": 0.00010071621670574097, "train_loss": 0.408625454801321, "epoch": 690} +{"train_lr": 9.891607443140929e-05, "train_loss": 0.40844214201569556, "epoch": 691} +{"train_lr": 9.713132928062657e-05, "train_loss": 0.40843296210169794, "epoch": 692} +{"train_lr": 9.536201174976322e-05, "train_loss": 0.40837096125483513, "epoch": 693} +{"train_lr": 9.360815207157413e-05, "train_loss": 0.4083694005072117, "epoch": 694} +{"train_lr": 9.186978021468215e-05, "train_loss": 0.4084002661764622, "epoch": 695} +{"train_lr": 9.014692588306594e-05, "train_loss": 0.40843813487291336, "epoch": 696} +{"train_lr": 8.84396185155527e-05, "train_loss": 0.40834322509765625, "epoch": 697} +{"train_lr": 8.67478872853143e-05, "train_loss": 0.4083211016476154, "epoch": 698} +{"train_lr": 8.507176109937047e-05, "train_loss": 0.4082286029994488, "epoch": 699} +{"train_lr": 8.341126859809256e-05, "train_loss": 0.40823151443004607, "epoch": 700} +{"train_lr": 8.176643815471623e-05, "train_loss": 0.40823154353499413, "epoch": 701} +{"train_lr": 8.013729787485531e-05, "train_loss": 0.40827645783424377, "epoch": 702} +{"train_lr": 7.852387559602257e-05, "train_loss": 0.40825580505132675, "epoch": 703} +{"train_lr": 7.692619888715302e-05, "train_loss": 0.4081780993103981, "epoch": 704} +{"train_lr": 7.534429504813323e-05, "train_loss": 0.4081855354487896, "epoch": 705} +{"train_lr": 7.377819110933544e-05, "train_loss": 0.4082311128556728, "epoch": 706} +{"train_lr": 7.222791383115492e-05, "train_loss": 0.4081001627087593, "epoch": 707} +{"train_lr": 
7.069348970355303e-05, "train_loss": 0.40801326141357425, "epoch": 708} +{"train_lr": 6.917494494560436e-05, "train_loss": 0.40805929116606715, "epoch": 709} +{"train_lr": 6.767230550504895e-05, "train_loss": 0.4080538489818573, "epoch": 710} +{"train_lr": 6.618559705784932e-05, "train_loss": 0.4080111927628517, "epoch": 711} +{"train_lr": 6.471484500775038e-05, "train_loss": 0.40799329899549486, "epoch": 712} +{"train_lr": 6.326007448584706e-05, "train_loss": 0.4080479858994484, "epoch": 713} +{"train_lr": 6.182131035015343e-05, "train_loss": 0.4079994874477387, "epoch": 714} +{"train_lr": 6.0398577185179195e-05, "train_loss": 0.4078952370584011, "epoch": 715} +{"train_lr": 5.8991899301508436e-05, "train_loss": 0.40794192504286764, "epoch": 716} +{"train_lr": 5.7601300735385406e-05, "train_loss": 0.4079172481238842, "epoch": 717} +{"train_lr": 5.62268052483022e-05, "train_loss": 0.4078769870400429, "epoch": 718} +{"train_lr": 5.4868436326594996e-05, "train_loss": 0.40775742872953413, "epoch": 719} +{"train_lr": 5.352621718104013e-05, "train_loss": 0.4078458012342453, "epoch": 720} +{"train_lr": 5.220017074646012e-05, "train_loss": 0.4077809689939022, "epoch": 721} +{"train_lr": 5.089031968132945e-05, "train_loss": 0.40774403147697447, "epoch": 722} +{"train_lr": 4.959668636738903e-05, "train_loss": 0.4077515964627266, "epoch": 723} +{"train_lr": 4.831929290926272e-05, "train_loss": 0.407721921145916, "epoch": 724} +{"train_lr": 4.705816113408049e-05, "train_loss": 0.40768695514798164, "epoch": 725} +{"train_lr": 4.5813312591104704e-05, "train_loss": 0.4076756275653839, "epoch": 726} +{"train_lr": 4.458476855136227e-05, "train_loss": 0.40769834047555925, "epoch": 727} +{"train_lr": 4.3372550007281185e-05, "train_loss": 0.4076857505738735, "epoch": 728} +{"train_lr": 4.217667767233175e-05, "train_loss": 0.4076809181332588, "epoch": 729} +{"train_lr": 4.0997171980672597e-05, "train_loss": 0.4076770887076855, "epoch": 730} +{"train_lr": 3.9834053086801805e-05, "train_loss": 0.4075366601884365, "epoch": 731} +{"train_lr": 3.868734086521197e-05, "train_loss": 0.40765976741313936, "epoch": 732} +{"train_lr": 3.7557054910051054e-05, "train_loss": 0.40767239355444906, "epoch": 733} +{"train_lr": 3.644321453478749e-05, "train_loss": 0.4076313421726227, "epoch": 734} +{"train_lr": 3.5345838771880166e-05, "train_loss": 0.40756957579255104, "epoch": 735} +{"train_lr": 3.4264946372453015e-05, "train_loss": 0.40758756697773935, "epoch": 736} +{"train_lr": 3.3200555805974955e-05, "train_loss": 0.40753637469410897, "epoch": 737} +{"train_lr": 3.215268525994395e-05, "train_loss": 0.40759654030799863, "epoch": 738} +{"train_lr": 3.1121352639576464e-05, "train_loss": 0.4075975024521351, "epoch": 739} +{"train_lr": 3.0106575567501452e-05, "train_loss": 0.4074262948334217, "epoch": 740} +{"train_lr": 2.9108371383459213e-05, "train_loss": 0.4075006844162941, "epoch": 741} +{"train_lr": 2.8126757144005083e-05, "train_loss": 0.4073981125712395, "epoch": 742} +{"train_lr": 2.7161749622217994e-05, "train_loss": 0.40740938003063204, "epoch": 743} +{"train_lr": 2.6213365307414162e-05, "train_loss": 0.4074016982078552, "epoch": 744} +{"train_lr": 2.5281620404864564e-05, "train_loss": 0.4073709517121315, "epoch": 745} +{"train_lr": 2.4366530835519025e-05, "train_loss": 0.40737549446821214, "epoch": 746} +{"train_lr": 2.3468112235733392e-05, "train_loss": 0.4074480685114861, "epoch": 747} +{"train_lr": 2.2586379957002727e-05, "train_loss": 0.407499808126688, "epoch": 748} +{"train_lr": 2.1721349065698846e-05, 
"train_loss": 0.40737112711071966, "epoch": 749} +{"train_lr": 2.087303434281305e-05, "train_loss": 0.40735656403303144, "epoch": 750} +{"train_lr": 2.0041450283703275e-05, "train_loss": 0.40729228178858756, "epoch": 751} +{"train_lr": 1.9226611097846807e-05, "train_loss": 0.40734857454895973, "epoch": 752} +{"train_lr": 1.842853070859705e-05, "train_loss": 0.407330923384428, "epoch": 753} +{"train_lr": 1.7647222752945838e-05, "train_loss": 0.40724869443178174, "epoch": 754} +{"train_lr": 1.688270058129047e-05, "train_loss": 0.4072960561275482, "epoch": 755} +{"train_lr": 1.6134977257205462e-05, "train_loss": 0.407342313015461, "epoch": 756} +{"train_lr": 1.5404065557219386e-05, "train_loss": 0.40731965934634207, "epoch": 757} +{"train_lr": 1.4689977970596522e-05, "train_loss": 0.40725169029831887, "epoch": 758} +{"train_lr": 1.3992726699123512e-05, "train_loss": 0.4072245597243309, "epoch": 759} +{"train_lr": 1.3312323656900852e-05, "train_loss": 0.40719416123628616, "epoch": 760} +{"train_lr": 1.2648780470139173e-05, "train_loss": 0.4072723692417145, "epoch": 761} +{"train_lr": 1.2002108476960741e-05, "train_loss": 0.4072173948287964, "epoch": 762} +{"train_lr": 1.1372318727205755e-05, "train_loss": 0.40724221390485765, "epoch": 763} +{"train_lr": 1.0759421982243326e-05, "train_loss": 0.40718788425326347, "epoch": 764} +{"train_lr": 1.0163428714787861e-05, "train_loss": 0.40724759435653685, "epoch": 765} +{"train_lr": 9.584349108719813e-06, "train_loss": 0.40717979621887207, "epoch": 766} +{"train_lr": 9.022193058912006e-06, "train_loss": 0.4071979228913784, "epoch": 767} +{"train_lr": 8.476970171060192e-06, "train_loss": 0.40717670152187346, "epoch": 768} +{"train_lr": 7.948689761519278e-06, "train_loss": 0.40705007915496827, "epoch": 769} +{"train_lr": 7.437360857143847e-06, "train_loss": 0.4072156092405319, "epoch": 770} +{"train_lr": 6.942992195134097e-06, "train_loss": 0.4070780915558338, "epoch": 771} +{"train_lr": 6.465592222886441e-06, "train_loss": 0.4070723837614059, "epoch": 772} +{"train_lr": 6.0051690978492155e-06, "train_loss": 0.40715753821730616, "epoch": 773} +{"train_lr": 5.561730687383275e-06, "train_loss": 0.40711742687225344, "epoch": 774} +{"train_lr": 5.135284568627556e-06, "train_loss": 0.407139888215065, "epoch": 775} +{"train_lr": 4.725838028369653e-06, "train_loss": 0.4070862729489803, "epoch": 776} +{"train_lr": 4.333398062921207e-06, "train_loss": 0.40716656067967416, "epoch": 777} +{"train_lr": 3.957971377998454e-06, "train_loss": 0.4070811638891697, "epoch": 778} +{"train_lr": 3.599564388607613e-06, "train_loss": 0.40715316613912583, "epoch": 779} +{"train_lr": 3.258183218935257e-06, "train_loss": 0.4070707754790783, "epoch": 780} +{"train_lr": 2.9338337022436484e-06, "train_loss": 0.407094335603714, "epoch": 781} +{"train_lr": 2.626521380771149e-06, "train_loss": 0.4070680266022682, "epoch": 782} +{"train_lr": 2.3362515056374043e-06, "train_loss": 0.4070445769608021, "epoch": 783} +{"train_lr": 2.0630290367537063e-06, "train_loss": 0.407051396137476, "epoch": 784} +{"train_lr": 1.8068586427382016e-06, "train_loss": 0.4070886338174343, "epoch": 785} +{"train_lr": 1.5677447008361348e-06, "train_loss": 0.4070832368195057, "epoch": 786} +{"train_lr": 1.3456912968450236e-06, "train_loss": 0.4070368420124054, "epoch": 787} +{"train_lr": 1.140702225044881e-06, "train_loss": 0.40696477791666985, "epoch": 788} +{"train_lr": 9.527809881333541e-07, "train_loss": 0.40711091704964636, "epoch": 789} +{"train_lr": 7.819307971659009e-07, "train_loss": 0.40703405417203903, 
"epoch": 790} +{"train_lr": 6.281545715008838e-07, "train_loss": 0.4070916808605194, "epoch": 791} +{"train_lr": 4.91454938749716e-07, "train_loss": 0.4070300230205059, "epoch": 792} +{"train_lr": 3.7183423473196524e-07, "train_loss": 0.40707525467276573, "epoch": 793} +{"train_lr": 2.6929450343540397e-07, "train_loss": 0.40699523387551306, "epoch": 794} +{"train_lr": 1.8383749698112992e-07, "train_loss": 0.4070662397742271, "epoch": 795} +{"train_lr": 1.1546467559359906e-07, "train_loss": 0.40707216830849646, "epoch": 796} +{"train_lr": 6.417720757569029e-08, "train_loss": 0.4070766533434391, "epoch": 797} +{"train_lr": 2.9975969288707755e-08, "train_loss": 0.4070338776230812, "epoch": 798} +{"train_lr": 1.2861545137461837e-08, "train_loss": 0.4070114720463753, "epoch": 799} diff --git a/CV/MAE/exp_results/MAE/large/log_large_ft.txt b/CV/MAE/exp_results/MAE/large/log_large_ft.txt new file mode 100644 index 0000000..209bfb4 --- /dev/null +++ b/CV/MAE/exp_results/MAE/large/log_large_ft.txt @@ -0,0 +1,50 @@ +{"train_lr": 0.0007476019200000001, "train_loss": 5.9094133159518245, "test_loss": 1.7714076134562493, "test_acc1": 61.33637235611582, "test_acc5": 84.77687142609177, "epoch": 0, "n_parameters": 304326632} +{"train_lr": 0.0022476019200000003, "train_loss": 4.501337738275528, "test_loss": 1.1959131537377834, "test_acc1": 72.27087332465598, "test_acc5": 91.66066860084875, "epoch": 1, "n_parameters": 304326632} +{"train_lr": 0.0037476019200000004, "train_loss": 4.119643689954281, "test_loss": 1.0854404755681752, "test_acc1": 75.52783110144804, "test_acc5": 93.39011516131733, "epoch": 2, "n_parameters": 304326632} +{"train_lr": 0.005247601920000002, "train_loss": 3.9008864871740343, "test_loss": 1.0289268112555146, "test_acc1": 76.92938261289896, "test_acc5": 94.09788868386092, "epoch": 3, "n_parameters": 304326632} +{"train_lr": 0.006747601919999998, "train_loss": 3.76051225707531, "test_loss": 0.9720380315184594, "test_acc1": 78.21497122713639, "test_acc5": 94.63371721293326, "epoch": 4, "n_parameters": 304326632} +{"train_lr": 0.00824760192, "train_loss": 3.651956864875555, "test_loss": 0.9415295435115695, "test_acc1": 78.97672746285214, "test_acc5": 95.09756876746584, "epoch": 5, "n_parameters": 304326632} +{"train_lr": 0.009747601920000001, "train_loss": 3.5677191224038602, "test_loss": 0.9388785093277693, "test_acc1": 79.57453616627957, "test_acc5": 95.29950415058465, "epoch": 6, "n_parameters": 304326632} +{"train_lr": 0.011247601919999997, "train_loss": 3.507449230492115, "test_loss": 0.9052619117870927, "test_acc1": 80.08437302847818, "test_acc5": 95.49944016815986, "epoch": 7, "n_parameters": 304326632} +{"train_lr": 0.012747601919999994, "train_loss": 3.4423172294437885, "test_loss": 0.8388488055765628, "test_acc1": 80.4342610673575, "test_acc5": 95.76935380052772, "epoch": 8, "n_parameters": 304326632} +{"train_lr": 0.014247601920000002, "train_loss": 3.3948125799477102, "test_loss": 0.8529021150618792, "test_acc1": 80.73616445743343, "test_acc5": 95.86732244598355, "epoch": 9, "n_parameters": 304326632} +{"train_lr": 0.01499233375709719, "train_loss": 3.342990658354759, "test_loss": 0.8151264287903905, "test_acc1": 81.03206976010719, "test_acc5": 95.96529109723585, "epoch": 10, "n_parameters": 304326632} +{"train_lr": 0.014946245730243689, "train_loss": 3.288912183743715, "test_loss": 0.8095201044529676, "test_acc1": 81.51191621381963, "test_acc5": 96.16522712243801, "epoch": 11, "n_parameters": 304326632} +{"train_lr": 0.01485427994899793, "train_loss": 3.238141927015781, 
"test_loss": 0.7871933653950691, "test_acc1": 82.07973450799821, "test_acc5": 96.36716250067556, "epoch": 12, "n_parameters": 304326632} +{"train_lr": 0.014717003412983015, "train_loss": 3.1956452232837678, "test_loss": 0.7688306730240584, "test_acc1": 82.2496801315022, "test_acc5": 96.52111323888074, "epoch": 13, "n_parameters": 304326632} +{"train_lr": 0.014535262477692571, "train_loss": 3.1652532088041307, "test_loss": 0.7522821754962206, "test_acc1": 82.66154833756725, "test_acc5": 96.58309339943104, "epoch": 14, "n_parameters": 304326632} +{"train_lr": 0.014310177636427614, "train_loss": 3.121457608240843, "test_loss": 0.7477796772867441, "test_acc1": 82.73952337029799, "test_acc5": 96.67906269169892, "epoch": 15, "n_parameters": 304326632} +{"train_lr": 0.014043136612082945, "train_loss": 3.0966577651739122, "test_loss": 0.753467806391418, "test_acc1": 82.9974408353359, "test_acc5": 96.78502878132953, "epoch": 16, "n_parameters": 304326632} +{"train_lr": 0.013735785801373714, "train_loss": 3.0689808761537076, "test_loss": 0.7341048694401979, "test_acc1": 83.14339413813727, "test_acc5": 96.79302621802991, "epoch": 17, "n_parameters": 304326632} +{"train_lr": 0.01339002012425247, "train_loss": 3.029768516147137, "test_loss": 0.725501059666276, "test_acc1": 83.34532951271389, "test_acc5": 96.81301982526358, "epoch": 18, "n_parameters": 304326632} +{"train_lr": 0.01300797134109743, "train_loss": 3.0120413874208927, "test_loss": 0.7309531949833036, "test_acc1": 83.50927706414586, "test_acc5": 97.00095968694924, "epoch": 19, "n_parameters": 304326632} +{"train_lr": 0.012591994909700855, "train_loss": 2.9821670488238334, "test_loss": 0.7118158831447363, "test_acc1": 83.61924186945724, "test_acc5": 97.01895393230026, "epoch": 20, "n_parameters": 304326632} +{"train_lr": 0.012144655463088535, "train_loss": 2.962305991309881, "test_loss": 0.7047568802535534, "test_acc1": 83.74520156128774, "test_acc5": 97.07493601513458, "epoch": 21, "n_parameters": 304326632} +{"train_lr": 0.011668710997704269, "train_loss": 2.938569626682997, "test_loss": 0.7103257965296507, "test_acc1": 83.9051503784292, "test_acc5": 97.10092768666078, "epoch": 22, "n_parameters": 304326632} +{"train_lr": 0.01116709586944475, "train_loss": 2.91352473244071, "test_loss": 0.7010805677436293, "test_acc1": 84.26103648877037, "test_acc5": 97.12492002376135, "epoch": 23, "n_parameters": 304326632} +{"train_lr": 0.010642902702379645, "train_loss": 2.8938853970646856, "test_loss": 0.692104572802782, "test_acc1": 84.34101090580701, "test_acc5": 97.2508797091852, "epoch": 24, "n_parameters": 304326632} +{"train_lr": 0.010099363321695844, "train_loss": 2.874984144228697, "test_loss": 0.6802691061235965, "test_acc1": 84.30902114603967, "test_acc5": 97.22488802759142, "epoch": 25, "n_parameters": 304326632} +{"train_lr": 0.009539828828420426, "train_loss": 2.852267661267519, "test_loss": 0.6850866706669331, "test_acc1": 84.41898594143599, "test_acc5": 97.29486562941827, "epoch": 26, "n_parameters": 304326632} +{"train_lr": 0.00896774893876856, "train_loss": 2.837763201504946, "test_loss": 0.6828102863952518, "test_acc1": 84.65091173876118, "test_acc5": 97.3268554027616, "epoch": 27, "n_parameters": 304326632} +{"train_lr": 0.008386650715495802, "train_loss": 2.81947190862298, "test_loss": 0.6762189302407206, "test_acc1": 84.7188899800782, "test_acc5": 97.34884836501368, "epoch": 28, "n_parameters": 304326632} +{"train_lr": 0.00780011682238341, "train_loss": 2.8003201848089696, "test_loss": 0.6725861196033657, "test_acc1": 
84.82285671179217, "test_acc5": 97.32285668235212, "epoch": 29, "n_parameters": 304326632} +{"train_lr": 0.007211763435924688, "train_loss": 2.7866385659873485, "test_loss": 0.671936163790524, "test_acc1": 84.95481448881304, "test_acc5": 97.38883556682028, "epoch": 30, "n_parameters": 304326632} +{"train_lr": 0.006625217950394574, "train_loss": 2.7746526652514936, "test_loss": 0.6678782022558153, "test_acc1": 84.89283432917799, "test_acc5": 97.4168266078561, "epoch": 31, "n_parameters": 304326632} +{"train_lr": 0.006044096613757472, "train_loss": 2.7576689450562, "test_loss": 0.6610171441733838, "test_acc1": 85.12675947130145, "test_acc5": 97.46681061770316, "epoch": 32, "n_parameters": 304326632} +{"train_lr": 0.00547198223229625, "train_loss": 2.7347684874773024, "test_loss": 0.6683760618418455, "test_acc1": 85.13675626942688, "test_acc5": 97.39683300702906, "epoch": 33, "n_parameters": 304326632} +{"train_lr": 0.004912402081419917, "train_loss": 2.723790532976389, "test_loss": 0.6556776543706655, "test_acc1": 85.26271595713884, "test_acc5": 97.47680741643875, "epoch": 34, "n_parameters": 304326632} +{"train_lr": 0.004368806158837928, "train_loss": 2.7088236126720906, "test_loss": 0.654360967874527, "test_acc1": 85.24072299839516, "test_acc5": 97.47280869541913, "epoch": 35, "n_parameters": 304326632} +{"train_lr": 0.003844545914176986, "train_loss": 2.694744017738104, "test_loss": 0.6538684133067727, "test_acc1": 85.33869164430858, "test_acc5": 97.53278950156115, "epoch": 36, "n_parameters": 304326632} +{"train_lr": 0.0033428535861796433, "train_loss": 2.6908254801392557, "test_loss": 0.6542927216365934, "test_acc1": 85.39467373049877, "test_acc5": 97.53878758217536, "epoch": 37, "n_parameters": 304326632} +{"train_lr": 0.002866822274877639, "train_loss": 2.671278304463625, "test_loss": 0.6524978142604232, "test_acc1": 85.49464174439643, "test_acc5": 97.49280229456822, "epoch": 38, "n_parameters": 304326632} +{"train_lr": 0.0024193868716016085, "train_loss": 2.657200170958042, "test_loss": 0.650126696806401, "test_acc1": 85.59660910492285, "test_acc5": 97.52879077825345, "epoch": 39, "n_parameters": 304326632} +{"train_lr": 0.0020033059644001382, "train_loss": 2.652334677708149, "test_loss": 0.6520910476334393, "test_acc1": 85.52263278619495, "test_acc5": 97.50279909849014, "epoch": 40, "n_parameters": 304326632} +{"train_lr": 0.001621144830427048, "train_loss": 2.6431411161601543, "test_loss": 0.647436778191477, "test_acc1": 85.6365963131361, "test_acc5": 97.54478566339972, "epoch": 41, "n_parameters": 304326632} +{"train_lr": 0.0012752596201547688, "train_loss": 2.637372990643978, "test_loss": 0.6462450991012156, "test_acc1": 85.61260398945898, "test_acc5": 97.54678502360446, "epoch": 42, "n_parameters": 304326632} +{"train_lr": 0.0009677828309231273, "train_loss": 2.6305615900933743, "test_loss": 0.6458461854793132, "test_acc1": 85.75455856750352, "test_acc5": 97.53878758278552, "epoch": 43, "n_parameters": 304326632} +{"train_lr": 0.0007006101593841485, "train_loss": 2.627352162593603, "test_loss": 0.6431183713674545, "test_acc1": 85.75455856231719, "test_acc5": 97.5627799058525, "epoch": 44, "n_parameters": 304326632} +{"train_lr": 0.0004753888139017931, "train_loss": 2.6245033386409284, "test_loss": 0.6450332224182784, "test_acc1": 85.80654192580981, "test_acc5": 97.56877798524638, "epoch": 45, "n_parameters": 304326632} +{"train_lr": 0.0002935073589646598, "train_loss": 2.6220774190187455, "test_loss": 0.6432638500258326, "test_acc1": 85.85252721585758, "test_acc5": 
97.56078054442744, "epoch": 46, "n_parameters": 304326632} +{"train_lr": 0.00015608715422415792, "train_loss": 2.611486408829689, "test_loss": 0.6422065225988627, "test_acc1": 85.82453617009305, "test_acc5": 97.57077734545112, "epoch": 47, "n_parameters": 304326632} +{"train_lr": 6.397544093936805e-05, "train_loss": 2.6108330062150955, "test_loss": 0.6433782994002104, "test_acc1": 85.822536808668, "test_acc5": 97.57677542606532, "epoch": 48, "n_parameters": 304326632} +{"train_lr": 1.7740118452942777e-05, "train_loss": 2.6155946560740473, "test_loss": 0.6427758732996881, "test_acc1": 85.822536808668, "test_acc5": 97.5807741464748, "epoch": 49, "n_parameters": 304326632} diff --git a/CV/MAE/exp_results/MAE/large/log_large_pretrain.txt b/CV/MAE/exp_results/MAE/large/log_large_pretrain.txt new file mode 100644 index 0000000..b3f4d30 --- /dev/null +++ b/CV/MAE/exp_results/MAE/large/log_large_pretrain.txt @@ -0,0 +1,801 @@ +{"train_lr": 1.3705929487179487e-05, "train_loss": 1.0373671979237444, "epoch": 0} +{"train_lr": 4.1205929487179494e-05, "train_loss": 0.8163748006873692, "epoch": 1} +{"train_lr": 6.870592948717947e-05, "train_loss": 0.7898846722196023, "epoch": 2} +{"train_lr": 9.62059294871795e-05, "train_loss": 0.7556995776995348, "epoch": 3} +{"train_lr": 0.00012370592948717955, "train_loss": 0.7204586103892862, "epoch": 4} +{"train_lr": 0.00015120592948717948, "train_loss": 0.6970280320025407, "epoch": 5} +{"train_lr": 0.0001787059294871795, "train_loss": 0.6892808590036554, "epoch": 6} +{"train_lr": 0.00020620592948717952, "train_loss": 0.6760739260412848, "epoch": 7} +{"train_lr": 0.0002337059294871796, "train_loss": 0.6467630795549411, "epoch": 8} +{"train_lr": 0.00026120592948717953, "train_loss": 0.6119912476577343, "epoch": 9} +{"train_lr": 0.0002887059294871795, "train_loss": 0.591552123773652, "epoch": 10} +{"train_lr": 0.0003162059294871794, "train_loss": 0.577067206100298, "epoch": 11} +{"train_lr": 0.0003437059294871795, "train_loss": 0.5598926345567004, "epoch": 12} +{"train_lr": 0.0003712059294871795, "train_loss": 0.5453465787502818, "epoch": 13} +{"train_lr": 0.0003987059294871796, "train_loss": 0.5339593999475861, "epoch": 14} +{"train_lr": 0.00042620592948717975, "train_loss": 0.5245551809173029, "epoch": 15} +{"train_lr": 0.0004537059294871794, "train_loss": 0.5173753621116376, "epoch": 16} +{"train_lr": 0.0004812059294871794, "train_loss": 0.5108209133554155, "epoch": 17} +{"train_lr": 0.0005087059294871794, "train_loss": 0.5050460415659472, "epoch": 18} +{"train_lr": 0.0005362059294871794, "train_loss": 0.5001554909842805, "epoch": 19} +{"train_lr": 0.0005637059294871797, "train_loss": 0.4958586446188677, "epoch": 20} +{"train_lr": 0.0005912059294871796, "train_loss": 0.4919821908757186, "epoch": 21} +{"train_lr": 0.0006187059294871795, "train_loss": 0.4885007034903631, "epoch": 22} +{"train_lr": 0.0006462059294871793, "train_loss": 0.48533707800715303, "epoch": 23} +{"train_lr": 0.0006737059294871794, "train_loss": 0.48238299978682053, "epoch": 24} +{"train_lr": 0.0007012059294871796, "train_loss": 0.4795845612280596, "epoch": 25} +{"train_lr": 0.0007287059294871798, "train_loss": 0.47709798404815584, "epoch": 26} +{"train_lr": 0.0007562059294871797, "train_loss": 0.4748512744031942, "epoch": 27} +{"train_lr": 0.0007837059294871795, "train_loss": 0.4727442269362748, "epoch": 28} +{"train_lr": 0.0008112059294871793, "train_loss": 0.4707687391201034, "epoch": 29} +{"train_lr": 0.0008387059294871796, "train_loss": 0.46924415775216544, "epoch": 30} +{"train_lr": 
0.0008662059294871798, "train_loss": 0.46733421087265015, "epoch": 31} +{"train_lr": 0.0008937059294871797, "train_loss": 0.46589025970584214, "epoch": 32} +{"train_lr": 0.0009212059294871793, "train_loss": 0.46424756009871954, "epoch": 33} +{"train_lr": 0.0009487059294871794, "train_loss": 0.46277184823814493, "epoch": 34} +{"train_lr": 0.0009762059294871795, "train_loss": 0.4613388040377639, "epoch": 35} +{"train_lr": 0.0010037059294871799, "train_loss": 0.4599568355494203, "epoch": 36} +{"train_lr": 0.0010312059294871796, "train_loss": 0.4587650656347903, "epoch": 37} +{"train_lr": 0.0010587059294871793, "train_loss": 0.4574355971671116, "epoch": 38} +{"train_lr": 0.0010862059294871797, "train_loss": 0.4563278357904309, "epoch": 39} +{"train_lr": 0.0011137059294871793, "train_loss": 0.45536527213437533, "epoch": 40} +{"train_lr": 0.0011412059294871797, "train_loss": 0.4542963448494004, "epoch": 41} +{"train_lr": 0.00116870592948718, "train_loss": 0.45362750644115013, "epoch": 42} +{"train_lr": 0.0011962059294871796, "train_loss": 0.4524291767201458, "epoch": 43} +{"train_lr": 0.0012237059294871793, "train_loss": 0.4514670613138244, "epoch": 44} +{"train_lr": 0.0012512059294871795, "train_loss": 0.45056493411986875, "epoch": 45} +{"train_lr": 0.0012787059294871797, "train_loss": 0.4497934035622539, "epoch": 46} +{"train_lr": 0.0013062059294871792, "train_loss": 0.448925889061334, "epoch": 47} +{"train_lr": 0.0013337059294871796, "train_loss": 0.4482692235943455, "epoch": 48} +{"train_lr": 0.0013612059294871794, "train_loss": 0.4476346656656227, "epoch": 49} +{"train_lr": 0.0013887059294871796, "train_loss": 0.44682418112643063, "epoch": 50} +{"train_lr": 0.0014162059294871793, "train_loss": 0.44604185191042817, "epoch": 51} +{"train_lr": 0.0014437059294871795, "train_loss": 0.445422636401744, "epoch": 52} +{"train_lr": 0.0014712059294871795, "train_loss": 0.444676601423476, "epoch": 53} +{"train_lr": 0.0014987059294871796, "train_loss": 0.44410661035456145, "epoch": 54} +{"train_lr": 0.0015262059294871796, "train_loss": 0.4435697843780359, "epoch": 55} +{"train_lr": 0.0015537059294871798, "train_loss": 0.4429366707067507, "epoch": 56} +{"train_lr": 0.0015812059294871795, "train_loss": 0.4423868660158358, "epoch": 57} +{"train_lr": 0.0016087059294871797, "train_loss": 0.4419644352179976, "epoch": 58} +{"train_lr": 0.0016362059294871794, "train_loss": 0.4413508808550735, "epoch": 59} +{"train_lr": 0.0016637059294871796, "train_loss": 0.4408118412292634, "epoch": 60} +{"train_lr": 0.0016912059294871796, "train_loss": 0.44038724107369304, "epoch": 61} +{"train_lr": 0.0017187059294871791, "train_loss": 0.43994122875543934, "epoch": 62} +{"train_lr": 0.001746205929487179, "train_loss": 0.4393991921783592, "epoch": 63} +{"train_lr": 0.0017737059294871797, "train_loss": 0.43902113603857845, "epoch": 64} +{"train_lr": 0.0018012059294871797, "train_loss": 0.43856765599384046, "epoch": 65} +{"train_lr": 0.0018287059294871792, "train_loss": 0.43815263809982496, "epoch": 66} +{"train_lr": 0.0018562059294871796, "train_loss": 0.4385444735302232, "epoch": 67} +{"train_lr": 0.00188370592948718, "train_loss": 0.4378033945253358, "epoch": 68} +{"train_lr": 0.0019112059294871802, "train_loss": 0.4373112537342912, "epoch": 69} +{"train_lr": 0.0019387059294871795, "train_loss": 0.4368607692796594, "epoch": 70} +{"train_lr": 0.0019662059294871794, "train_loss": 0.43645675210521007, "epoch": 71} +{"train_lr": 0.0019937059294871796, "train_loss": 0.43612490208126986, "epoch": 72} +{"train_lr": 
0.002021205929487179, "train_loss": 0.4361463249929679, "epoch": 73} +{"train_lr": 0.0020487059294871796, "train_loss": 0.4355239907506471, "epoch": 74} +{"train_lr": 0.0020762059294871793, "train_loss": 0.4350612056381905, "epoch": 75} +{"train_lr": 0.0021037059294871795, "train_loss": 0.4349484308055626, "epoch": 76} +{"train_lr": 0.0021312059294871797, "train_loss": 0.4347404240606687, "epoch": 77} +{"train_lr": 0.002158705929487179, "train_loss": 0.4341731424807595, "epoch": 78} +{"train_lr": 0.002186205929487179, "train_loss": 0.4338370120415512, "epoch": 79} +{"train_lr": 0.002199996684251048, "train_loss": 0.4335061630460983, "epoch": 80} +{"train_lr": 0.002199976725863753, "train_loss": 0.4331624563275956, "epoch": 81} +{"train_lr": 0.0021999367774331083, "train_loss": 0.43274621699208343, "epoch": 82} +{"train_lr": 0.002199876839719668, "train_loss": 0.43239927467985606, "epoch": 83} +{"train_lr": 0.002199796913864568, "train_loss": 0.4320713832753543, "epoch": 84} +{"train_lr": 0.002199697001389479, "train_loss": 0.43173469559779054, "epoch": 85} +{"train_lr": 0.002199577104196586, "train_loss": 0.4313527816631951, "epoch": 86} +{"train_lr": 0.0021994372245685645, "train_loss": 0.4310783812035926, "epoch": 87} +{"train_lr": 0.0021992773651685147, "train_loss": 0.4307854121688228, "epoch": 88} +{"train_lr": 0.002199097529039938, "train_loss": 0.43050752383155316, "epoch": 89} +{"train_lr": 0.002198897719606647, "train_loss": 0.43016116209149075, "epoch": 90} +{"train_lr": 0.0021986779406727294, "train_loss": 0.4297766865093786, "epoch": 91} +{"train_lr": 0.0021984381964224556, "train_loss": 0.4297368807676368, "epoch": 92} +{"train_lr": 0.0021981784914202134, "train_loss": 0.4293370681683509, "epoch": 93} +{"train_lr": 0.0021978988306104136, "train_loss": 0.42910005276700336, "epoch": 94} +{"train_lr": 0.0021975992193173943, "train_loss": 0.42879289225973666, "epoch": 95} +{"train_lr": 0.0021972796632453166, "train_loss": 0.42852139729970634, "epoch": 96} +{"train_lr": 0.0021969401684780723, "train_loss": 0.4283285259287088, "epoch": 97} +{"train_lr": 0.0021965807414791516, "train_loss": 0.42814426130065936, "epoch": 98} +{"train_lr": 0.0021962013890915295, "train_loss": 0.427771221443366, "epoch": 99} +{"train_lr": 0.002195802118537524, "train_loss": 0.42763554915272367, "epoch": 100} +{"train_lr": 0.0021953829374186744, "train_loss": 0.42753871774766594, "epoch": 101} +{"train_lr": 0.002194943853715583, "train_loss": 0.42720191766364646, "epoch": 102} +{"train_lr": 0.002194484875787771, "train_loss": 0.427074841169927, "epoch": 103} +{"train_lr": 0.0021940060123735164, "train_loss": 0.4268490781929965, "epoch": 104} +{"train_lr": 0.0021935072725896877, "train_loss": 0.42664367280518395, "epoch": 105} +{"train_lr": 0.0021929886659315715, "train_loss": 0.4263747133887731, "epoch": 106} +{"train_lr": 0.0021924502022726967, "train_loss": 0.42692178616431564, "epoch": 107} +{"train_lr": 0.0021918918918646256, "train_loss": 0.42619890832186985, "epoch": 108} +{"train_lr": 0.0021913137453367865, "train_loss": 0.4259361302724872, "epoch": 109} +{"train_lr": 0.0021907157736962605, "train_loss": 0.425667358828016, "epoch": 110} +{"train_lr": 0.0021900979883275615, "train_loss": 0.42560386141905415, "epoch": 111} +{"train_lr": 0.0021894604009924366, "train_loss": 0.4254387913253875, "epoch": 112} +{"train_lr": 0.0021888030238296262, "train_loss": 0.4252314920602844, "epoch": 113} +{"train_lr": 0.0021881258693546408, "train_loss": 0.4251124603518595, "epoch": 114} +{"train_lr": 
0.0021874289504595305, "train_loss": 0.4249423658146929, "epoch": 115} +{"train_lr": 0.00218671228041263, "train_loss": 0.42477498546791953, "epoch": 116} +{"train_lr": 0.0021859758728582953, "train_loss": 0.42456992419185835, "epoch": 117} +{"train_lr": 0.0021852197418166675, "train_loss": 0.4244430363805105, "epoch": 118} +{"train_lr": 0.0021844439016833928, "train_loss": 0.4242876783484975, "epoch": 119} +{"train_lr": 0.0021836483672293488, "train_loss": 0.42407468885171395, "epoch": 120} +{"train_lr": 0.0021828331536003654, "train_loss": 0.4240114364790945, "epoch": 121} +{"train_lr": 0.0021819982763169312, "train_loss": 0.423764265790128, "epoch": 122} +{"train_lr": 0.0021811437512739154, "train_loss": 0.42374736052125883, "epoch": 123} +{"train_lr": 0.0021802695947402357, "train_loss": 0.4245943425862023, "epoch": 124} +{"train_lr": 0.0021793758233585704, "train_loss": 0.42387092797658765, "epoch": 125} +{"train_lr": 0.002178462454145044, "train_loss": 0.4235223626288084, "epoch": 126} +{"train_lr": 0.0021775295044888857, "train_loss": 0.4233784673269838, "epoch": 127} +{"train_lr": 0.002176576992152116, "train_loss": 0.4231366920732678, "epoch": 128} +{"train_lr": 0.0021756049352691944, "train_loss": 0.4230839658337526, "epoch": 129} +{"train_lr": 0.002174613352346683, "train_loss": 0.4228322916807464, "epoch": 130} +{"train_lr": 0.002173602262262889, "train_loss": 0.4232935921396487, "epoch": 131} +{"train_lr": 0.0021725716842675145, "train_loss": 0.4229748474720579, "epoch": 132} +{"train_lr": 0.0021715216379812764, "train_loss": 0.42267333947790736, "epoch": 133} +{"train_lr": 0.0021704521433955426, "train_loss": 0.422408729346875, "epoch": 134} +{"train_lr": 0.0021693632208719493, "train_loss": 0.4223583231841286, "epoch": 135} +{"train_lr": 0.002168254891142009, "train_loss": 0.42228768690704155, "epoch": 136} +{"train_lr": 0.002167127175306729, "train_loss": 0.4221280742621718, "epoch": 137} +{"train_lr": 0.002165980094836185, "train_loss": 0.4220910801444776, "epoch": 138} +{"train_lr": 0.002164813671569137, "train_loss": 0.42186729532654565, "epoch": 139} +{"train_lr": 0.002163627927712607, "train_loss": 0.421843397601221, "epoch": 140} +{"train_lr": 0.0021624228858414477, "train_loss": 0.4217382113908967, "epoch": 141} +{"train_lr": 0.0021611985688979166, "train_loss": 0.4215304550732701, "epoch": 142} +{"train_lr": 0.0021599550001912458, "train_loss": 0.42138542972194654, "epoch": 143} +{"train_lr": 0.0021586922033971913, "train_loss": 0.4213122255377806, "epoch": 144} +{"train_lr": 0.002157410202557581, "train_loss": 0.421302269359167, "epoch": 145} +{"train_lr": 0.002156109022079862, "train_loss": 0.42114945856006575, "epoch": 146} +{"train_lr": 0.0021547886867366393, "train_loss": 0.42119269031517875, "epoch": 147} +{"train_lr": 0.0021534492216651966, "train_loss": 0.420911968464796, "epoch": 148} +{"train_lr": 0.0021520906523670095, "train_loss": 0.4209221635097399, "epoch": 149} +{"train_lr": 0.0021507130047072865, "train_loss": 0.420738841407001, "epoch": 150} +{"train_lr": 0.00214931630491445, "train_loss": 0.4208859864932795, "epoch": 151} +{"train_lr": 0.0021479005795796537, "train_loss": 0.42067305678621125, "epoch": 152} +{"train_lr": 0.00214646585565626, "train_loss": 0.42045262417732143, "epoch": 153} +{"train_lr": 0.0021450121604593515, "train_loss": 0.4205632483473239, "epoch": 154} +{"train_lr": 0.002143539521665188, "train_loss": 0.4204747405094214, "epoch": 155} +{"train_lr": 0.002142047967310689, "train_loss": 0.42016615892927617, "epoch": 156} 
+{"train_lr": 0.002140537525792898, "train_loss": 0.420656357312766, "epoch": 157} +{"train_lr": 0.002139008225868444, "train_loss": 0.42010926239741725, "epoch": 158} +{"train_lr": 0.002137460096652994, "train_loss": 0.4202159770215169, "epoch": 159} +{"train_lr": 0.0021358931676206975, "train_loss": 0.42006659239996225, "epoch": 160} +{"train_lr": 0.0021343074686036253, "train_loss": 0.42007251099969906, "epoch": 161} +{"train_lr": 0.002132703029791194, "train_loss": 0.41983775014523417, "epoch": 162} +{"train_lr": 0.0021310798817296174, "train_loss": 0.41982228738458777, "epoch": 163} +{"train_lr": 0.002129438055321287, "train_loss": 0.41973125434611946, "epoch": 164} +{"train_lr": 0.0021277775818242138, "train_loss": 0.4196543410265197, "epoch": 165} +{"train_lr": 0.002126098492851418, "train_loss": 0.419556195089498, "epoch": 166} +{"train_lr": 0.0021244008203703327, "train_loss": 0.41950905472875977, "epoch": 167} +{"train_lr": 0.0021226845967021965, "train_loss": 0.41944392684262055, "epoch": 168} +{"train_lr": 0.0021209498545214367, "train_loss": 0.4193810497541936, "epoch": 169} +{"train_lr": 0.00211919662685504, "train_loss": 0.4192442288831211, "epoch": 170} +{"train_lr": 0.0021174249470819317, "train_loss": 0.4192189235234251, "epoch": 171} +{"train_lr": 0.002115634848932345, "train_loss": 0.4191800984374892, "epoch": 172} +{"train_lr": 0.0021138263664871684, "train_loss": 0.4190960426050692, "epoch": 173} +{"train_lr": 0.0021119995341772973, "train_loss": 0.4190245867611315, "epoch": 174} +{"train_lr": 0.0021101543867829906, "train_loss": 0.4189993502596059, "epoch": 175} +{"train_lr": 0.0021082909594331923, "train_loss": 0.4187265991245229, "epoch": 176} +{"train_lr": 0.0021064092876048723, "train_loss": 0.4187703061824999, "epoch": 177} +{"train_lr": 0.0021045094071223494, "train_loss": 0.4187772857914798, "epoch": 178} +{"train_lr": 0.0021025913541566133, "train_loss": 0.4186587428453211, "epoch": 179} +{"train_lr": 0.0021006551652246208, "train_loss": 0.41868259694176513, "epoch": 180} +{"train_lr": 0.0020987008771886275, "train_loss": 0.41851956458487666, "epoch": 181} +{"train_lr": 0.0020967285272554524, "train_loss": 0.41851956319983286, "epoch": 182} +{"train_lr": 0.0020947381529758, "train_loss": 0.4204516486169245, "epoch": 183} +{"train_lr": 0.002092729792243523, "train_loss": 0.41863377986308664, "epoch": 184} +{"train_lr": 0.0020907034832949195, "train_loss": 0.4185516889451836, "epoch": 185} +{"train_lr": 0.0020886592647079852, "train_loss": 0.41843239219787604, "epoch": 186} +{"train_lr": 0.0020865971754017044, "train_loss": 0.4183382890545405, "epoch": 187} +{"train_lr": 0.002084517254635278, "train_loss": 0.4181868398400883, "epoch": 188} +{"train_lr": 0.0020824195420073976, "train_loss": 0.4181370502934815, "epoch": 189} +{"train_lr": 0.0020803040774554945, "train_loss": 0.418083170022911, "epoch": 190} +{"train_lr": 0.0020781709012549616, "train_loss": 0.4179465582904716, "epoch": 191} +{"train_lr": 0.0020760200540183996, "train_loss": 0.4180231848373436, "epoch": 192} +{"train_lr": 0.0020738515766948354, "train_loss": 0.4179109181247604, "epoch": 193} +{"train_lr": 0.002071665510568953, "train_loss": 0.41795996805199254, "epoch": 194} +{"train_lr": 0.0020694618972603037, "train_loss": 0.4180147959343277, "epoch": 195} +{"train_lr": 0.002067240778722506, "train_loss": 0.4179162073802824, "epoch": 196} +{"train_lr": 0.0020650021972424553, "train_loss": 0.41772405684698755, "epoch": 197} +{"train_lr": 0.002062746195439519, "train_loss": 0.4176427618009396, 
"epoch": 198} +{"train_lr": 0.002060472816264713, "train_loss": 0.4175360346457754, "epoch": 199} +{"train_lr": 0.002058182102999905, "train_loss": 0.4175078834598072, "epoch": 200} +{"train_lr": 0.002055874099256973, "train_loss": 0.4175237088959712, "epoch": 201} +{"train_lr": 0.0020535488489769813, "train_loss": 0.41736443084963143, "epoch": 202} +{"train_lr": 0.0020512063964293406, "train_loss": 0.41743514193401027, "epoch": 203} +{"train_lr": 0.0020488467862109726, "train_loss": 0.41724888548457945, "epoch": 204} +{"train_lr": 0.0020464700632454582, "train_loss": 0.4175108184482759, "epoch": 205} +{"train_lr": 0.0020440762727821694, "train_loss": 0.41730758319728267, "epoch": 206} +{"train_lr": 0.002041665460395431, "train_loss": 0.4171531482014614, "epoch": 207} +{"train_lr": 0.002039237671983636, "train_loss": 0.4171176812856291, "epoch": 208} +{"train_lr": 0.002036792953768375, "train_loss": 0.4171120857533354, "epoch": 209} +{"train_lr": 0.002034331352293559, "train_loss": 0.41690345452680516, "epoch": 210} +{"train_lr": 0.0020318529144245315, "train_loss": 0.4169688862360393, "epoch": 211} +{"train_lr": 0.0020293576873471747, "train_loss": 0.4169356456110015, "epoch": 212} +{"train_lr": 0.0020268457185670195, "train_loss": 0.41681159019935876, "epoch": 213} +{"train_lr": 0.002024317055908329, "train_loss": 0.41887578073566634, "epoch": 214} +{"train_lr": 0.0020217717475131958, "train_loss": 0.41701536552789503, "epoch": 215} +{"train_lr": 0.0020192098418406177, "train_loss": 0.4168762993873455, "epoch": 216} +{"train_lr": 0.0020166313876655924, "train_loss": 0.41685889998319536, "epoch": 217} +{"train_lr": 0.002014036434078168, "train_loss": 0.4166946614113374, "epoch": 218} +{"train_lr": 0.0020114250304825213, "train_loss": 0.41653254462812, "epoch": 219} +{"train_lr": 0.002008797226596011, "train_loss": 0.4166686459360883, "epoch": 220} +{"train_lr": 0.0020061530724482363, "train_loss": 0.4163997006208564, "epoch": 221} +{"train_lr": 0.00200349261838008, "train_loss": 0.4164510855021385, "epoch": 222} +{"train_lr": 0.0020008159150427538, "train_loss": 0.4164321756629178, "epoch": 223} +{"train_lr": 0.0019981230133968306, "train_loss": 0.4163135129253929, "epoch": 224} +{"train_lr": 0.0019954139647112732, "train_loss": 0.41683639441198933, "epoch": 225} +{"train_lr": 0.001992688820562465, "train_loss": 0.4164539910763359, "epoch": 226} +{"train_lr": 0.0019899476328332256, "train_loss": 0.4162949403353895, "epoch": 227} +{"train_lr": 0.001987190453711815, "train_loss": 0.41620151580979997, "epoch": 228} +{"train_lr": 0.0019844173356909473, "train_loss": 0.4163096353907186, "epoch": 229} +{"train_lr": 0.0019816283315667966, "train_loss": 0.4160788968706933, "epoch": 230} +{"train_lr": 0.001978823494437979, "train_loss": 0.4160326524124218, "epoch": 231} +{"train_lr": 0.001976002877704551, "train_loss": 0.4159600309108217, "epoch": 232} +{"train_lr": 0.00197316653506699, "train_loss": 0.415864022105383, "epoch": 233} +{"train_lr": 0.001970314520525169, "train_loss": 0.4159824102579688, "epoch": 234} +{"train_lr": 0.0019674468883773347, "train_loss": 0.4159344154505584, "epoch": 235} +{"train_lr": 0.0019645636932190706, "train_loss": 0.41571531779108906, "epoch": 236} +{"train_lr": 0.0019616649899422568, "train_loss": 0.41572490451523125, "epoch": 237} +{"train_lr": 0.0019587508337340223, "train_loss": 0.41574745898385745, "epoch": 238} +{"train_lr": 0.0019558212800757026, "train_loss": 0.41571679237322545, "epoch": 239} +{"train_lr": 0.0019528763847417802, "train_loss": 
0.41566707138330317, "epoch": 240} +{"train_lr": 0.0019499162037988121, "train_loss": 0.4154396512730716, "epoch": 241} +{"train_lr": 0.001946940793604378, "train_loss": 0.4155334876641297, "epoch": 242} +{"train_lr": 0.0019439502108059982, "train_loss": 0.4154881267683007, "epoch": 243} +{"train_lr": 0.0019409445123400604, "train_loss": 0.4154404973467955, "epoch": 244} +{"train_lr": 0.0019379237554307278, "train_loss": 0.41536261062794483, "epoch": 245} +{"train_lr": 0.001934887997588859, "train_loss": 0.4153592953733049, "epoch": 246} +{"train_lr": 0.0019318372966109106, "train_loss": 0.4152674611586218, "epoch": 247} +{"train_lr": 0.0019287717105778263, "train_loss": 0.4153060437783074, "epoch": 248} +{"train_lr": 0.0019256912978539496, "train_loss": 0.4151951908521975, "epoch": 249} +{"train_lr": 0.0019225961170858967, "train_loss": 0.41521607185355747, "epoch": 250} +{"train_lr": 0.0019194862272014467, "train_loss": 0.4151945383920788, "epoch": 251} +{"train_lr": 0.001916361687408424, "train_loss": 0.415128694429325, "epoch": 252} +{"train_lr": 0.0019132225571935563, "train_loss": 0.4152043274042603, "epoch": 253} +{"train_lr": 0.0019100688963213624, "train_loss": 0.41502150900375384, "epoch": 254} +{"train_lr": 0.0019069007648329988, "train_loss": 0.414975148676417, "epoch": 255} +{"train_lr": 0.0019037182230451216, "train_loss": 0.41492671117437285, "epoch": 256} +{"train_lr": 0.0019005213315487395, "train_loss": 0.414867433364121, "epoch": 257} +{"train_lr": 0.0018973101512080564, "train_loss": 0.41478415592334783, "epoch": 258} +{"train_lr": 0.0018940847431593185, "train_loss": 0.41493314027320594, "epoch": 259} +{"train_lr": 0.0018908451688096474, "train_loss": 0.4148071248932049, "epoch": 260} +{"train_lr": 0.001887591489835866, "train_loss": 0.4147998127692307, "epoch": 261} +{"train_lr": 0.0018843237681833364, "train_loss": 0.4147368201997895, "epoch": 262} +{"train_lr": 0.0018810420660647636, "train_loss": 0.41473585022434306, "epoch": 263} +{"train_lr": 0.0018777464459590254, "train_loss": 0.41463008197322965, "epoch": 264} +{"train_lr": 0.0018744369706099827, "train_loss": 0.4145144334791276, "epoch": 265} +{"train_lr": 0.0018711137030252738, "train_loss": 0.4145721539299792, "epoch": 266} +{"train_lr": 0.0018677767064751189, "train_loss": 0.4146482014688305, "epoch": 267} +{"train_lr": 0.0018644260444911289, "train_loss": 0.4145143874652254, "epoch": 268} +{"train_lr": 0.001861061780865072, "train_loss": 0.41513582520509285, "epoch": 269} +{"train_lr": 0.001857683979647683, "train_loss": 0.4145510424561321, "epoch": 270} +{"train_lr": 0.0018542927051474255, "train_loss": 0.4143607781405967, "epoch": 271} +{"train_lr": 0.0018508880219292774, "train_loss": 0.4143475139519582, "epoch": 272} +{"train_lr": 0.0018474699948134992, "train_loss": 0.4142985286591097, "epoch": 273} +{"train_lr": 0.001844038688874402, "train_loss": 0.4142709164969766, "epoch": 274} +{"train_lr": 0.0018405941694391048, "train_loss": 0.4141980246312391, "epoch": 275} +{"train_lr": 0.0018371365020862912, "train_loss": 0.4141822525670227, "epoch": 276} +{"train_lr": 0.0018336657526449639, "train_loss": 0.4141156807667218, "epoch": 277} +{"train_lr": 0.0018301819871931874, "train_loss": 0.41410747529246295, "epoch": 278} +{"train_lr": 0.0018266852720568382, "train_loss": 0.41405237226699215, "epoch": 279} +{"train_lr": 0.0018231756738083295, "train_loss": 0.41394138641249484, "epoch": 280} +{"train_lr": 0.0018196532592653519, "train_loss": 0.4139768362093048, "epoch": 281} +{"train_lr": 
0.0018161180954896032, "train_loss": 0.41397458218778366, "epoch": 282} +{"train_lr": 0.0018125702497855084, "train_loss": 0.41386487913461256, "epoch": 283} +{"train_lr": 0.0018090097896989272, "train_loss": 0.4139637210054132, "epoch": 284} +{"train_lr": 0.0018054367830158936, "train_loss": 0.41381767440515643, "epoch": 285} +{"train_lr": 0.0018018512977613032, "train_loss": 0.4137953261170202, "epoch": 286} +{"train_lr": 0.0017982534021976266, "train_loss": 0.41396266024094075, "epoch": 287} +{"train_lr": 0.0017946431648236123, "train_loss": 0.4137683888085378, "epoch": 288} +{"train_lr": 0.00179102065437297, "train_loss": 0.41367859775737786, "epoch": 289} +{"train_lr": 0.0017873859398130803, "train_loss": 0.4136714078157615, "epoch": 290} +{"train_lr": 0.0017837390903436671, "train_loss": 0.4135731390247551, "epoch": 291} +{"train_lr": 0.0017800801753954888, "train_loss": 0.4136636662834252, "epoch": 292} +{"train_lr": 0.0017764092646290154, "train_loss": 0.41356596955432534, "epoch": 293} +{"train_lr": 0.0017727264279330912, "train_loss": 0.4135198926714321, "epoch": 294} +{"train_lr": 0.0017690317354236186, "train_loss": 0.4135080915219031, "epoch": 295} +{"train_lr": 0.0017653252574422209, "train_loss": 0.41334341784031725, "epoch": 296} +{"train_lr": 0.001761607064554894, "train_loss": 0.41334728990645647, "epoch": 297} +{"train_lr": 0.0017578772275506705, "train_loss": 0.41330422608193773, "epoch": 298} +{"train_lr": 0.0017541358174402676, "train_loss": 0.41324203011866373, "epoch": 299} +{"train_lr": 0.0017503829054547454, "train_loss": 0.4132311620110741, "epoch": 300} +{"train_lr": 0.0017466185630441384, "train_loss": 0.4132360761197141, "epoch": 301} +{"train_lr": 0.0017428428618760945, "train_loss": 0.4132467000745237, "epoch": 302} +{"train_lr": 0.0017390558738345284, "train_loss": 0.41321784359677577, "epoch": 303} +{"train_lr": 0.00173525767101823, "train_loss": 0.4131651461745302, "epoch": 304} +{"train_lr": 0.001731448325739506, "train_loss": 0.4131434768384609, "epoch": 305} +{"train_lr": 0.0017276279105227959, "train_loss": 0.41309410180801, "epoch": 306} +{"train_lr": 0.0017237964981033048, "train_loss": 0.41297986341986614, "epoch": 307} +{"train_lr": 0.0017199541614255998, "train_loss": 0.4128857530438556, "epoch": 308} +{"train_lr": 0.001716100973642235, "train_loss": 0.413681590425161, "epoch": 309} +{"train_lr": 0.001712237008112346, "train_loss": 0.4130775839275418, "epoch": 310} +{"train_lr": 0.00170836233840027, "train_loss": 0.4129094842564649, "epoch": 311} +{"train_lr": 0.0017044770382741352, "train_loss": 0.4129503610400626, "epoch": 312} +{"train_lr": 0.001700581181704449, "train_loss": 0.41287012439734566, "epoch": 313} +{"train_lr": 0.00169667484286271, "train_loss": 0.41280730115548253, "epoch": 314} +{"train_lr": 0.001692758096119979, "train_loss": 0.41272441369409746, "epoch": 315} +{"train_lr": 0.0016888310160454662, "train_loss": 0.41262197143768364, "epoch": 316} +{"train_lr": 0.0016848936774051166, "train_loss": 0.4126468203926029, "epoch": 317} +{"train_lr": 0.0016809461551601822, "train_loss": 0.41255873617513156, "epoch": 318} +{"train_lr": 0.0016769885244657956, "train_loss": 0.412636836590723, "epoch": 319} +{"train_lr": 0.0016730208606695412, "train_loss": 0.412544735134221, "epoch": 320} +{"train_lr": 0.001669043239310017, "train_loss": 0.41250999425000584, "epoch": 321} +{"train_lr": 0.0016650557361153995, "train_loss": 0.41247300328448033, "epoch": 322} +{"train_lr": 0.0016610584270020066, "train_loss": 0.41246316529129845, "epoch": 
323} +{"train_lr": 0.0016570513880728383, "train_loss": 0.41240897299184537, "epoch": 324} +{"train_lr": 0.0016530346956161383, "train_loss": 0.4123140548934969, "epoch": 325} +{"train_lr": 0.0016490084261039418, "train_loss": 0.4124350955274004, "epoch": 326} +{"train_lr": 0.0016449726561906196, "train_loss": 0.4123261533050726, "epoch": 327} +{"train_lr": 0.0016409274627114101, "train_loss": 0.4122270453775015, "epoch": 328} +{"train_lr": 0.0016368729226809665, "train_loss": 0.4122691912463126, "epoch": 329} +{"train_lr": 0.001632809113291888, "train_loss": 0.41216078330165684, "epoch": 330} +{"train_lr": 0.0016287361119132467, "train_loss": 0.4120873586029913, "epoch": 331} +{"train_lr": 0.0016246539960891194, "train_loss": 0.41207945707122773, "epoch": 332} +{"train_lr": 0.001620562843537104, "train_loss": 0.4120459109634304, "epoch": 333} +{"train_lr": 0.0016164627321468496, "train_loss": 0.41213970735239297, "epoch": 334} +{"train_lr": 0.001612353739978566, "train_loss": 0.41200300023699993, "epoch": 335} +{"train_lr": 0.0016082359452615441, "train_loss": 0.4119489185410576, "epoch": 336} +{"train_lr": 0.0016041094263926547, "train_loss": 0.41217702930458844, "epoch": 337} +{"train_lr": 0.0015999742619348728, "train_loss": 0.41226495031971866, "epoch": 338} +{"train_lr": 0.0015958305306157678, "train_loss": 0.4119313608389348, "epoch": 339} +{"train_lr": 0.001591678311326011, "train_loss": 0.41189253890218264, "epoch": 340} +{"train_lr": 0.0015875176831178716, "train_loss": 0.4119056661386425, "epoch": 341} +{"train_lr": 0.0015833487252037124, "train_loss": 0.4116876139395082, "epoch": 342} +{"train_lr": 0.0015791715169544858, "train_loss": 0.4117441733684152, "epoch": 343} +{"train_lr": 0.0015749861378982126, "train_loss": 0.4116757833727229, "epoch": 344} +{"train_lr": 0.0015707926677184783, "train_loss": 0.4117640209718583, "epoch": 345} +{"train_lr": 0.0015665911862529113, "train_loss": 0.41165625816509605, "epoch": 346} +{"train_lr": 0.001562381773491659, "train_loss": 0.41153176439006645, "epoch": 347} +{"train_lr": 0.0015581645095758788, "train_loss": 0.41154465329451245, "epoch": 348} +{"train_lr": 0.0015539394747961911, "train_loss": 0.4115032056907717, "epoch": 349} +{"train_lr": 0.0015497067495911672, "train_loss": 0.41149046644568443, "epoch": 350} +{"train_lr": 0.0015454664145457997, "train_loss": 0.4113970931279115, "epoch": 351} +{"train_lr": 0.0015412185503899496, "train_loss": 0.41137130759381807, "epoch": 352} +{"train_lr": 0.0015369632379968283, "train_loss": 0.41134841808189565, "epoch": 353} +{"train_lr": 0.0015327005583814536, "train_loss": 0.41121035462352806, "epoch": 354} +{"train_lr": 0.0015284305926990987, "train_loss": 0.4112332000886687, "epoch": 355} +{"train_lr": 0.0015241534222437516, "train_loss": 0.4111234651108344, "epoch": 356} +{"train_lr": 0.0015198691284465764, "train_loss": 0.41117768977971697, "epoch": 357} +{"train_lr": 0.0015155777928743523, "train_loss": 0.41120130031739766, "epoch": 358} +{"train_lr": 0.0015112794972279191, "train_loss": 0.4111418041120021, "epoch": 359} +{"train_lr": 0.0015069743233406332, "train_loss": 0.41106258124674266, "epoch": 360} +{"train_lr": 0.0015026623531767976, "train_loss": 0.411072268562678, "epoch": 361} +{"train_lr": 0.0014983436688301081, "train_loss": 0.41097060625608534, "epoch": 362} +{"train_lr": 0.001494018352522093, "train_loss": 0.4110512023206609, "epoch": 363} +{"train_lr": 0.001489686486600536, "train_loss": 0.4109220164260851, "epoch": 364} +{"train_lr": 0.001485348153537923, "train_loss": 
0.4108758811832955, "epoch": 365} +{"train_lr": 0.0014810034359298602, "train_loss": 0.41088385198217553, "epoch": 366} +{"train_lr": 0.001476652416493508, "train_loss": 0.4108179884109025, "epoch": 367} +{"train_lr": 0.0014722951780660042, "train_loss": 0.4108165610724917, "epoch": 368} +{"train_lr": 0.0014679318036028908, "train_loss": 0.41075373621872413, "epoch": 369} +{"train_lr": 0.001463562376176525, "train_loss": 0.410711961481959, "epoch": 370} +{"train_lr": 0.0014591869789745055, "train_loss": 0.41074939695700335, "epoch": 371} +{"train_lr": 0.0014548056952980906, "train_loss": 0.41064891304808837, "epoch": 372} +{"train_lr": 0.0014504186085606062, "train_loss": 0.41055555843437713, "epoch": 373} +{"train_lr": 0.001446025802285859, "train_loss": 0.4104764893865929, "epoch": 374} +{"train_lr": 0.0014416273601065466, "train_loss": 0.4106733149025016, "epoch": 375} +{"train_lr": 0.0014372233657626709, "train_loss": 0.410539361090065, "epoch": 376} +{"train_lr": 0.0014328139030999325, "train_loss": 0.41042309052024323, "epoch": 377} +{"train_lr": 0.00142839905606815, "train_loss": 0.4104917037497776, "epoch": 378} +{"train_lr": 0.0014239789087196419, "train_loss": 0.4103732809621411, "epoch": 379} +{"train_lr": 0.0014195535452076445, "train_loss": 0.41036086723518866, "epoch": 380} +{"train_lr": 0.0014151230497846973, "train_loss": 0.41025182133456933, "epoch": 381} +{"train_lr": 0.0014106875068010517, "train_loss": 0.4103283556297613, "epoch": 382} +{"train_lr": 0.0014062470007030464, "train_loss": 0.41025463273175633, "epoch": 383} +{"train_lr": 0.001401801616031522, "train_loss": 0.41028842999814796, "epoch": 384} +{"train_lr": 0.0013973514374201934, "train_loss": 0.4101260701636187, "epoch": 385} +{"train_lr": 0.0013928965495940433, "train_loss": 0.4101290691148442, "epoch": 386} +{"train_lr": 0.001388437037367717, "train_loss": 0.4100582876977009, "epoch": 387} +{"train_lr": 0.0013839729856439005, "train_loss": 0.4100109167003001, "epoch": 388} +{"train_lr": 0.0013795044794117017, "train_loss": 0.41002871754030007, "epoch": 389} +{"train_lr": 0.0013750316037450382, "train_loss": 0.4100142954609906, "epoch": 390} +{"train_lr": 0.0013705544438010152, "train_loss": 0.4099733572101029, "epoch": 391} +{"train_lr": 0.0013660730848183047, "train_loss": 0.40993832916212386, "epoch": 392} +{"train_lr": 0.001361587612115522, "train_loss": 0.40984915221074164, "epoch": 393} +{"train_lr": 0.0013570981110896019, "train_loss": 0.40986981642289233, "epoch": 394} +{"train_lr": 0.0013526046672141716, "train_loss": 0.4098925541471451, "epoch": 395} +{"train_lr": 0.0013481073660379268, "train_loss": 0.4097335473455202, "epoch": 396} +{"train_lr": 0.0013436062931829961, "train_loss": 0.4097142314914471, "epoch": 397} +{"train_lr": 0.0013391015343433242, "train_loss": 0.40965941716104937, "epoch": 398} +{"train_lr": 0.0013345931752830203, "train_loss": 0.40957096070110893, "epoch": 399} +{"train_lr": 0.0013300813018347428, "train_loss": 0.40958923276048154, "epoch": 400} +{"train_lr": 0.0013255659998980631, "train_loss": 0.40965150356985247, "epoch": 401} +{"train_lr": 0.001321047355437815, "train_loss": 0.4096374787443962, "epoch": 402} +{"train_lr": 0.0013165254544824816, "train_loss": 0.4095211476236821, "epoch": 403} +{"train_lr": 0.0013120003831225341, "train_loss": 0.4094003884551617, "epoch": 404} +{"train_lr": 0.0013074722275088128, "train_loss": 0.4093821646842676, "epoch": 405} +{"train_lr": 0.0013029410738508687, "train_loss": 0.40934318331225467, "epoch": 406} +{"train_lr": 
0.0012984070084153404, "train_loss": 0.40935628237023663, "epoch": 407} +{"train_lr": 0.001293870117524294, "train_loss": 0.4092580980549638, "epoch": 408} +{"train_lr": 0.0012893304875535958, "train_loss": 0.40928569577861196, "epoch": 409} +{"train_lr": 0.001284788204931254, "train_loss": 0.40930103614197993, "epoch": 410} +{"train_lr": 0.0012802433561357833, "train_loss": 0.4091632225676082, "epoch": 411} +{"train_lr": 0.0012756960276945543, "train_loss": 0.40913745740321106, "epoch": 412} +{"train_lr": 0.0012711463061821455, "train_loss": 0.4091320280642368, "epoch": 413} +{"train_lr": 0.0012665942782186948, "train_loss": 0.4090705737053679, "epoch": 414} +{"train_lr": 0.0012620400304682543, "train_loss": 0.40911481409476924, "epoch": 415} +{"train_lr": 0.0012574836496371338, "train_loss": 0.40898072150631404, "epoch": 416} +{"train_lr": 0.001252925222472262, "train_loss": 0.4089827490463041, "epoch": 417} +{"train_lr": 0.0012483648357595157, "train_loss": 0.4089191631408026, "epoch": 418} +{"train_lr": 0.0012438025763220866, "train_loss": 0.4089194155799655, "epoch": 419} +{"train_lr": 0.0012392385310188183, "train_loss": 0.4088360767107075, "epoch": 420} +{"train_lr": 0.0012346727867425544, "train_loss": 0.40889794748718256, "epoch": 421} +{"train_lr": 0.0012301054304184812, "train_loss": 0.4087364257384951, "epoch": 422} +{"train_lr": 0.0012255365490024856, "train_loss": 0.4087256219787284, "epoch": 423} +{"train_lr": 0.0012209662294794788, "train_loss": 0.4086236726068772, "epoch": 424} +{"train_lr": 0.0012163945588617594, "train_loss": 0.40868301555299413, "epoch": 425} +{"train_lr": 0.0012118216241873432, "train_loss": 0.40864022688355106, "epoch": 426} +{"train_lr": 0.0012072475125183195, "train_loss": 0.408639843435361, "epoch": 427} +{"train_lr": 0.0012026723109391762, "train_loss": 0.4084960335704426, "epoch": 428} +{"train_lr": 0.0011980961065551578, "train_loss": 0.40850723951828116, "epoch": 429} +{"train_lr": 0.0011935189864905992, "train_loss": 0.4084427037741989, "epoch": 430} +{"train_lr": 0.0011889410378872717, "train_loss": 0.408406377546131, "epoch": 431} +{"train_lr": 0.0011843623479027132, "train_loss": 0.4084319865319114, "epoch": 432} +{"train_lr": 0.0011797830037085834, "train_loss": 0.4082929293404166, "epoch": 433} +{"train_lr": 0.0011752030924889923, "train_loss": 0.4083061993856413, "epoch": 434} +{"train_lr": 0.001170622701438853, "train_loss": 0.40826553131979054, "epoch": 435} +{"train_lr": 0.0011660419177622026, "train_loss": 0.40831295249219507, "epoch": 436} +{"train_lr": 0.0011614608286705634, "train_loss": 0.408208545213804, "epoch": 437} +{"train_lr": 0.001156879521381265, "train_loss": 0.40810751629312736, "epoch": 438} +{"train_lr": 0.0011522980831157985, "train_loss": 0.4089228991920558, "epoch": 439} +{"train_lr": 0.0011477166010981405, "train_loss": 0.40827475297145355, "epoch": 440} +{"train_lr": 0.0011431351625531072, "train_loss": 0.40814876497890323, "epoch": 441} +{"train_lr": 0.001138553854704682, "train_loss": 0.4081494444772267, "epoch": 442} +{"train_lr": 0.0011339727647743652, "train_loss": 0.4080424102423235, "epoch": 443} +{"train_lr": 0.0011293919799795042, "train_loss": 0.4079775022557722, "epoch": 444} +{"train_lr": 0.0011248115875316382, "train_loss": 0.40800062047305685, "epoch": 445} +{"train_lr": 0.0011202316746348369, "train_loss": 0.4078693060676018, "epoch": 446} +{"train_lr": 0.0011156523284840427, "train_loss": 0.40788094251930046, "epoch": 447} +{"train_lr": 0.0011110736362634, "train_loss": 0.4078162476921884, 
"epoch": 448} +{"train_lr": 0.0011064956851446132, "train_loss": 0.4077284878376537, "epoch": 449} +{"train_lr": 0.0011019185622852719, "train_loss": 0.4077657826322441, "epoch": 450} +{"train_lr": 0.001097342354827195, "train_loss": 0.4077323899896911, "epoch": 451} +{"train_lr": 0.0010927671498947784, "train_loss": 0.40761750450250334, "epoch": 452} +{"train_lr": 0.001088193034593329, "train_loss": 0.40754589111580014, "epoch": 453} +{"train_lr": 0.0010836200960074077, "train_loss": 0.4076227524962563, "epoch": 454} +{"train_lr": 0.001079048421199174, "train_loss": 0.40763028256463796, "epoch": 455} +{"train_lr": 0.0010744780972067251, "train_loss": 0.4075469539548533, "epoch": 456} +{"train_lr": 0.0010699092110424448, "train_loss": 0.4075042612325305, "epoch": 457} +{"train_lr": 0.0010653418496913364, "train_loss": 0.40739803072900915, "epoch": 458} +{"train_lr": 0.0010607761001093785, "train_loss": 0.40746555620064145, "epoch": 459} +{"train_lr": 0.0010562120492218607, "train_loss": 0.4072514477645596, "epoch": 460} +{"train_lr": 0.0010516497839217333, "train_loss": 0.40732319879596335, "epoch": 461} +{"train_lr": 0.0010470893910679514, "train_loss": 0.40721929031543624, "epoch": 462} +{"train_lr": 0.0010425309574838217, "train_loss": 0.4074741626988189, "epoch": 463} +{"train_lr": 0.0010379745699553473, "train_loss": 0.40718286079115784, "epoch": 464} +{"train_lr": 0.0010334203152295809, "train_loss": 0.4070995915442323, "epoch": 465} +{"train_lr": 0.001028868280012966, "train_loss": 0.4070817599228273, "epoch": 466} +{"train_lr": 0.0010243185509696913, "train_loss": 0.407057137398694, "epoch": 467} +{"train_lr": 0.001019771214720041, "train_loss": 0.40704172161610747, "epoch": 468} +{"train_lr": 0.0010152263578387406, "train_loss": 0.40708873046036714, "epoch": 469} +{"train_lr": 0.0010106840668533167, "train_loss": 0.40687227406753945, "epoch": 470} +{"train_lr": 0.0010061444282424387, "train_loss": 0.40689455137814945, "epoch": 471} +{"train_lr": 0.001001607528434284, "train_loss": 0.4067392822337122, "epoch": 472} +{"train_lr": 0.0009970734538048858, "train_loss": 0.4067725698159148, "epoch": 473} +{"train_lr": 0.0009925422906764867, "train_loss": 0.4067414591673953, "epoch": 474} +{"train_lr": 0.000988014125315904, "train_loss": 0.40677410726531005, "epoch": 475} +{"train_lr": 0.000983489043932877, "train_loss": 0.40671809860027563, "epoch": 476} +{"train_lr": 0.0009789671326784328, "train_loss": 0.4066885843520793, "epoch": 477} +{"train_lr": 0.0009744484776432449, "train_loss": 0.40656357107815355, "epoch": 478} +{"train_lr": 0.0009699331648559909, "train_loss": 0.40661479317499566, "epoch": 479} +{"train_lr": 0.0009654212802817167, "train_loss": 0.4065199022372373, "epoch": 480} +{"train_lr": 0.0009609129098201999, "train_loss": 0.40652067499915856, "epoch": 481} +{"train_lr": 0.0009564081393043194, "train_loss": 0.40650237360909486, "epoch": 482} +{"train_lr": 0.0009519070544984084, "train_loss": 0.4064606928911347, "epoch": 483} +{"train_lr": 0.0009474097410966353, "train_loss": 0.406358272028275, "epoch": 484} +{"train_lr": 0.0009429162847213638, "train_loss": 0.4062820214581174, "epoch": 485} +{"train_lr": 0.0009384267709215272, "train_loss": 0.4062754138229558, "epoch": 486} +{"train_lr": 0.0009339412851709953, "train_loss": 0.40622046690147656, "epoch": 487} +{"train_lr": 0.0009294599128669512, "train_loss": 0.4062196460212223, "epoch": 488} +{"train_lr": 0.0009249827393282664, "train_loss": 0.40624284703666586, "epoch": 489} +{"train_lr": 0.000920509849793868, 
"train_loss": 0.4063993097575477, "epoch": 490} +{"train_lr": 0.0009160413294211269, "train_loss": 0.4061269706401687, "epoch": 491} +{"train_lr": 0.0009115772632842303, "train_loss": 0.40604672604538977, "epoch": 492} +{"train_lr": 0.0009071177363725607, "train_loss": 0.4059948067312153, "epoch": 493} +{"train_lr": 0.0009026628335890832, "train_loss": 0.4060036432541286, "epoch": 494} +{"train_lr": 0.0008982126397487258, "train_loss": 0.4059653549413316, "epoch": 495} +{"train_lr": 0.0008937672395767638, "train_loss": 0.4058932272018865, "epoch": 496} +{"train_lr": 0.0008893267177072082, "train_loss": 0.40586398070893037, "epoch": 497} +{"train_lr": 0.0008848911586811962, "train_loss": 0.40570423821620166, "epoch": 498} +{"train_lr": 0.0008804606469453758, "train_loss": 0.4057079857477966, "epoch": 499} +{"train_lr": 0.0008760352668503046, "train_loss": 0.4057402282427901, "epoch": 500} +{"train_lr": 0.0008716151026488423, "train_loss": 0.40573455984345996, "epoch": 501} +{"train_lr": 0.0008672002384945409, "train_loss": 0.4056789746675163, "epoch": 502} +{"train_lr": 0.0008627907584400527, "train_loss": 0.40564147108652365, "epoch": 503} +{"train_lr": 0.0008583867464355204, "train_loss": 0.40551826204113567, "epoch": 504} +{"train_lr": 0.0008539882863269843, "train_loss": 0.40549777155049527, "epoch": 505} +{"train_lr": 0.0008495954618547843, "train_loss": 0.40548197221822846, "epoch": 506} +{"train_lr": 0.0008452083566519666, "train_loss": 0.4053351033795386, "epoch": 507} +{"train_lr": 0.0008408270542426921, "train_loss": 0.40544413105966765, "epoch": 508} +{"train_lr": 0.0008364516380406403, "train_loss": 0.4054094929522715, "epoch": 509} +{"train_lr": 0.0008320821913474302, "train_loss": 0.40523585497449416, "epoch": 510} +{"train_lr": 0.000827718797351028, "train_loss": 0.4052047186847537, "epoch": 511} +{"train_lr": 0.0008233615391241664, "train_loss": 0.4051514037568361, "epoch": 512} +{"train_lr": 0.0008190104996227606, "train_loss": 0.405203840847557, "epoch": 513} +{"train_lr": 0.0008146657616843306, "train_loss": 0.40509661564675087, "epoch": 514} +{"train_lr": 0.0008103274080264235, "train_loss": 0.4051370158767662, "epoch": 515} +{"train_lr": 0.0008059955212450415, "train_loss": 0.40509992099713343, "epoch": 516} +{"train_lr": 0.0008016701838130633, "train_loss": 0.4050084149841076, "epoch": 517} +{"train_lr": 0.0007973514780786778, "train_loss": 0.40498947312470335, "epoch": 518} +{"train_lr": 0.0007930394862638177, "train_loss": 0.4048468506005473, "epoch": 519} +{"train_lr": 0.0007887342904625922, "train_loss": 0.40491496573369473, "epoch": 520} +{"train_lr": 0.0007844359726397224, "train_loss": 0.40479989495510477, "epoch": 521} +{"train_lr": 0.0007801446146289847, "train_loss": 0.4047400512440035, "epoch": 522} +{"train_lr": 0.0007758602981316503, "train_loss": 0.40473769646054375, "epoch": 523} +{"train_lr": 0.0007715831047149271, "train_loss": 0.40474676929462033, "epoch": 524} +{"train_lr": 0.0007673131158104147, "train_loss": 0.40466389988955015, "epoch": 525} +{"train_lr": 0.0007630504127125459, "train_loss": 0.40458019682242036, "epoch": 526} +{"train_lr": 0.0007587950765770436, "train_loss": 0.4046392260053649, "epoch": 527} +{"train_lr": 0.0007545471884193728, "train_loss": 0.4045530325965956, "epoch": 528} +{"train_lr": 0.0007503068291132018, "train_loss": 0.40456746879200906, "epoch": 529} +{"train_lr": 0.0007460740793888594, "train_loss": 0.40441401888771605, "epoch": 530} +{"train_lr": 0.0007418490198317987, "train_loss": 0.4043100023170551, "epoch": 531} 
+{"train_lr": 0.0007376317308810632, "train_loss": 0.4043588912896573, "epoch": 532} +{"train_lr": 0.0007334222928277559, "train_loss": 0.4043358557362062, "epoch": 533} +{"train_lr": 0.0007292207858135094, "train_loss": 0.40429400393548304, "epoch": 534} +{"train_lr": 0.0007250272898289608, "train_loss": 0.40428032110540724, "epoch": 535} +{"train_lr": 0.0007208418847122287, "train_loss": 0.40419535100674975, "epoch": 536} +{"train_lr": 0.0007166646501473936, "train_loss": 0.4041401115932669, "epoch": 537} +{"train_lr": 0.0007124956656629803, "train_loss": 0.4041405795709206, "epoch": 538} +{"train_lr": 0.0007083350106304438, "train_loss": 0.4040476569237235, "epoch": 539} +{"train_lr": 0.0007041827642626584, "train_loss": 0.4039582103007258, "epoch": 540} +{"train_lr": 0.0007000390056124096, "train_loss": 0.40393333622696215, "epoch": 541} +{"train_lr": 0.0006959038135708897, "train_loss": 0.4039792067204148, "epoch": 542} +{"train_lr": 0.0006917772668661943, "train_loss": 0.40400743056446886, "epoch": 543} +{"train_lr": 0.0006876594440618228, "train_loss": 0.40387270907656503, "epoch": 544} +{"train_lr": 0.0006835504235551869, "train_loss": 0.40385030512996495, "epoch": 545} +{"train_lr": 0.0006794502835761145, "train_loss": 0.40380048998094237, "epoch": 546} +{"train_lr": 0.0006753591021853594, "train_loss": 0.40375170164490837, "epoch": 547} +{"train_lr": 0.0006712769572731192, "train_loss": 0.40375785952887666, "epoch": 548} +{"train_lr": 0.0006672039265575479, "train_loss": 0.40368185934121126, "epoch": 549} +{"train_lr": 0.0006631400875832792, "train_loss": 0.40351521058115536, "epoch": 550} +{"train_lr": 0.0006590855177199493, "train_loss": 0.40354250425783295, "epoch": 551} +{"train_lr": 0.0006550402941607243, "train_loss": 0.4034807008589642, "epoch": 552} +{"train_lr": 0.0006510044939208292, "train_loss": 0.40340881089632136, "epoch": 553} +{"train_lr": 0.0006469781938360838, "train_loss": 0.40350077035598075, "epoch": 554} +{"train_lr": 0.0006429614705614375, "train_loss": 0.40342369057739585, "epoch": 555} +{"train_lr": 0.0006389544005695102, "train_loss": 0.4034082487506123, "epoch": 556} +{"train_lr": 0.0006349570601491407, "train_loss": 0.40329981361253137, "epoch": 557} +{"train_lr": 0.0006309695254039274, "train_loss": 0.40330997984617567, "epoch": 558} +{"train_lr": 0.0006269918722507841, "train_loss": 0.4032074278519035, "epoch": 559} +{"train_lr": 0.0006230241764184931, "train_loss": 0.40314884478142726, "epoch": 560} +{"train_lr": 0.0006190665134462633, "train_loss": 0.40318546131348765, "epoch": 561} +{"train_lr": 0.0006151189586822944, "train_loss": 0.40302897839902496, "epoch": 562} +{"train_lr": 0.0006111815872823375, "train_loss": 0.40307248276598656, "epoch": 563} +{"train_lr": 0.0006072544742082678, "train_loss": 0.40295639399212235, "epoch": 564} +{"train_lr": 0.0006033376942266588, "train_loss": 0.4029202505480498, "epoch": 565} +{"train_lr": 0.0005994313219073551, "train_loss": 0.40293739083557367, "epoch": 566} +{"train_lr": 0.0005955354316220552, "train_loss": 0.40285234694452715, "epoch": 567} +{"train_lr": 0.0005916500975428925, "train_loss": 0.4027972887628354, "epoch": 568} +{"train_lr": 0.00058777539364103, "train_loss": 0.40278459172576475, "epoch": 569} +{"train_lr": 0.0005839113936852423, "train_loss": 0.4027620741996007, "epoch": 570} +{"train_lr": 0.0005800581712405198, "train_loss": 0.40272315271879333, "epoch": 571} +{"train_lr": 0.0005762157996666634, "train_loss": 0.4025773255405231, "epoch": 572} +{"train_lr": 0.000572384352116889, 
"train_loss": 0.40262824610078657, "epoch": 573} +{"train_lr": 0.0005685639015364357, "train_loss": 0.4025623084893689, "epoch": 574} +{"train_lr": 0.000564754520661175, "train_loss": 0.4025112026944183, "epoch": 575} +{"train_lr": 0.0005609562820162276, "train_loss": 0.4024656550743832, "epoch": 576} +{"train_lr": 0.0005571692579145825, "train_loss": 0.40243863600353974, "epoch": 577} +{"train_lr": 0.000553393520455719, "train_loss": 0.4023528701106373, "epoch": 578} +{"train_lr": 0.0005496291415242374, "train_loss": 0.40238917958683884, "epoch": 579} +{"train_lr": 0.0005458761927884844, "train_loss": 0.40230784869979686, "epoch": 580} +{"train_lr": 0.0005421347456991955, "train_loss": 0.40238204727080673, "epoch": 581} +{"train_lr": 0.0005384048714881292, "train_loss": 0.40223376255613774, "epoch": 582} +{"train_lr": 0.0005346866411667144, "train_loss": 0.4021565411777164, "epoch": 583} +{"train_lr": 0.0005309801255246968, "train_loss": 0.4021812020204006, "epoch": 584} +{"train_lr": 0.0005272853951287912, "train_loss": 0.4020746909440137, "epoch": 585} +{"train_lr": 0.0005236025203213388, "train_loss": 0.40203427048459744, "epoch": 586} +{"train_lr": 0.0005199315712189664, "train_loss": 0.40197034500455725, "epoch": 587} +{"train_lr": 0.0005162726177112542, "train_loss": 0.40197522756464493, "epoch": 588} +{"train_lr": 0.0005126257294594024, "train_loss": 0.4018897909987479, "epoch": 589} +{"train_lr": 0.000508990975894907, "train_loss": 0.4019340915271105, "epoch": 590} +{"train_lr": 0.0005053684262182351, "train_loss": 0.4018403323343358, "epoch": 591} +{"train_lr": 0.000501758149397512, "train_loss": 0.40179195118327743, "epoch": 592} +{"train_lr": 0.000498160214167204, "train_loss": 0.40182342882065153, "epoch": 593} +{"train_lr": 0.0004945746890268112, "train_loss": 0.40156844701283634, "epoch": 594} +{"train_lr": 0.0004910016422395642, "train_loss": 0.40166771351001584, "epoch": 595} +{"train_lr": 0.0004874411418311232, "train_loss": 0.4015691514986639, "epoch": 596} +{"train_lr": 0.0004838932555882831, "train_loss": 0.40157083718953895, "epoch": 597} +{"train_lr": 0.0004803580510576859, "train_loss": 0.40150746444347674, "epoch": 598} +{"train_lr": 0.0004768355955445271, "train_loss": 0.40146687768626577, "epoch": 599} +{"train_lr": 0.0004733259561112843, "train_loss": 0.4013656457104028, "epoch": 600} +{"train_lr": 0.00046982919957643183, "train_loss": 0.4014010253177287, "epoch": 601} +{"train_lr": 0.0004663453925131751, "train_loss": 0.401316442541802, "epoch": 602} +{"train_lr": 0.0004628746012481774, "train_loss": 0.40135061838186514, "epoch": 603} +{"train_lr": 0.00045941689186030244, "train_loss": 0.4013017872258878, "epoch": 604} +{"train_lr": 0.00045597233017935225, "train_loss": 0.4012205607514494, "epoch": 605} +{"train_lr": 0.0004525409817848158, "train_loss": 0.4010732845307734, "epoch": 606} +{"train_lr": 0.0004491229120046211, "train_loss": 0.40112392098989147, "epoch": 607} +{"train_lr": 0.0004457181859138885, "train_loss": 0.40095535995295417, "epoch": 608} +{"train_lr": 0.0004423268683336966, "train_loss": 0.40107519237491757, "epoch": 609} +{"train_lr": 0.0004389490238298424, "train_loss": 0.40089200041555345, "epoch": 610} +{"train_lr": 0.0004355847167116164, "train_loss": 0.4009603075026415, "epoch": 611} +{"train_lr": 0.0004322340110305767, "train_loss": 0.4007888589806568, "epoch": 612} +{"train_lr": 0.0004288969705793297, "train_loss": 0.40085823017542654, "epoch": 613} +{"train_lr": 0.00042557365889031546, "train_loss": 0.4008245808311189, "epoch": 614} 
+{"train_lr": 0.00042226413923459786, "train_loss": 0.40080666332804143, "epoch": 615} +{"train_lr": 0.00041896847462066024, "train_loss": 0.40067557709960216, "epoch": 616} +{"train_lr": 0.0004156867277932069, "train_loss": 0.4006406409122671, "epoch": 617} +{"train_lr": 0.0004124189612319663, "train_loss": 0.40067053648034257, "epoch": 618} +{"train_lr": 0.000409165237150504, "train_loss": 0.4006024836652124, "epoch": 619} +{"train_lr": 0.00040592561749503553, "train_loss": 0.40058071185082483, "epoch": 620} +{"train_lr": 0.0004027001639432505, "train_loss": 0.40051370228413874, "epoch": 621} +{"train_lr": 0.0003994889379031339, "train_loss": 0.40047323810330665, "epoch": 622} +{"train_lr": 0.0003962920005118015, "train_loss": 0.40047235801242864, "epoch": 623} +{"train_lr": 0.0003931094126343328, "train_loss": 0.40038400971426225, "epoch": 624} +{"train_lr": 0.00038994123486261274, "train_loss": 0.4003337500109457, "epoch": 625} +{"train_lr": 0.0003867875275141802, "train_loss": 0.4003118855269769, "epoch": 626} +{"train_lr": 0.0003836483506310766, "train_loss": 0.4002860259079637, "epoch": 627} +{"train_lr": 0.0003805237639787045, "train_loss": 0.40020315256268263, "epoch": 628} +{"train_lr": 0.00037741382704469054, "train_loss": 0.4001016740937932, "epoch": 629} +{"train_lr": 0.00037431859903775094, "train_loss": 0.40014096555443335, "epoch": 630} +{"train_lr": 0.0003712381388865644, "train_loss": 0.40007645417398846, "epoch": 631} +{"train_lr": 0.00036817250523865294, "train_loss": 0.4001130870818041, "epoch": 632} +{"train_lr": 0.000365121756459261, "train_loss": 0.3999421138059682, "epoch": 633} +{"train_lr": 0.000362085950630249, "train_loss": 0.39993893454830426, "epoch": 634} +{"train_lr": 0.00035906514554898285, "train_loss": 0.3999607272577496, "epoch": 635} +{"train_lr": 0.00035605939872723774, "train_loss": 0.39988730190536725, "epoch": 636} +{"train_lr": 0.00035306876739010003, "train_loss": 0.39983918937221646, "epoch": 637} +{"train_lr": 0.0003500933084748797, "train_loss": 0.39978164734426314, "epoch": 638} +{"train_lr": 0.00034713307863002557, "train_loss": 0.39978201805542296, "epoch": 639} +{"train_lr": 0.0003441881342140461, "train_loss": 0.3997511507597012, "epoch": 640} +{"train_lr": 0.00034125853129443856, "train_loss": 0.399678559621605, "epoch": 641} +{"train_lr": 0.0003383443256466194, "train_loss": 0.3995659738414897, "epoch": 642} +{"train_lr": 0.00033544557275286366, "train_loss": 0.39954135730826795, "epoch": 643} +{"train_lr": 0.00033256232780124785, "train_loss": 0.39953629148061365, "epoch": 644} +{"train_lr": 0.00032969464568459927, "train_loss": 0.39953539276925415, "epoch": 645} +{"train_lr": 0.000326842580999452, "train_loss": 0.3994370355951385, "epoch": 646} +{"train_lr": 0.00032400618804500746, "train_loss": 0.39941193949944603, "epoch": 647} +{"train_lr": 0.0003211855208220971, "train_loss": 0.39936334144873303, "epoch": 648} +{"train_lr": 0.0003183806330321605, "train_loss": 0.39932101693911815, "epoch": 649} +{"train_lr": 0.0003155915780762176, "train_loss": 0.39929802753986454, "epoch": 650} +{"train_lr": 0.000312818409053854, "train_loss": 0.3992980943324092, "epoch": 651} +{"train_lr": 0.0003100611787622107, "train_loss": 0.3992614940686438, "epoch": 652} +{"train_lr": 0.0003073199396949779, "train_loss": 0.3991799572637926, "epoch": 653} +{"train_lr": 0.0003045947440413965, "train_loss": 0.39913937841685343, "epoch": 654} +{"train_lr": 0.00030188564368526324, "train_loss": 0.39915586874271053, "epoch": 655} +{"train_lr": 
0.00029919269020394336, "train_loss": 0.398969745153012, "epoch": 656} +{"train_lr": 0.00029651593486738974, "train_loss": 0.3989916533315315, "epoch": 657} +{"train_lr": 0.0002938554286371653, "train_loss": 0.398980718314982, "epoch": 658} +{"train_lr": 0.0002912112221654737, "train_loss": 0.3989473644626112, "epoch": 659} +{"train_lr": 0.00028858336579419536, "train_loss": 0.39888661709697676, "epoch": 660} +{"train_lr": 0.00028597190955392625, "train_loss": 0.39888876083438307, "epoch": 661} +{"train_lr": 0.00028337690316303, "train_loss": 0.3988901605346025, "epoch": 662} +{"train_lr": 0.0002807983960266869, "train_loss": 0.3988267953406112, "epoch": 663} +{"train_lr": 0.00027823643723595644, "train_loss": 0.3987751852434415, "epoch": 664} +{"train_lr": 0.0002756910755668407, "train_loss": 0.3987223649672113, "epoch": 665} +{"train_lr": 0.0002731623594793579, "train_loss": 0.3986610113046108, "epoch": 666} +{"train_lr": 0.0002706503371166151, "train_loss": 0.39863646618771154, "epoch": 667} +{"train_lr": 0.0002681550563038991, "train_loss": 0.398625210312625, "epoch": 668} +{"train_lr": 0.0002656765645477588, "train_loss": 0.39857109432789284, "epoch": 669} +{"train_lr": 0.00026321490903510463, "train_loss": 0.3984749709578374, "epoch": 670} +{"train_lr": 0.0002607701366323092, "train_loss": 0.3985634923297673, "epoch": 671} +{"train_lr": 0.0002607701366323092, "train_loss": 0.39847029329468614, "epoch": 671} +{"train_lr": 0.00025834229388431527, "train_loss": 0.3984167390019418, "epoch": 672} +{"train_lr": 0.00025593142701374873, "train_loss": 0.3984153713219059, "epoch": 673} +{"train_lr": 0.0002535375819200397, "train_loss": 0.39833352874474937, "epoch": 674} +{"train_lr": 0.0002511608041785483, "train_loss": 0.39829322964084357, "epoch": 675} +{"train_lr": 0.0002488011390396965, "train_loss": 0.3982728718785951, "epoch": 676} +{"train_lr": 0.000246458631428107, "train_loss": 0.3982159129093186, "epoch": 677} +{"train_lr": 0.00024413332594174845, "train_loss": 0.3982398451025335, "epoch": 678} +{"train_lr": 0.0002418252668510853, "train_loss": 0.39817184433699226, "epoch": 679} +{"train_lr": 0.00023953449809823558, "train_loss": 0.39808873023885566, "epoch": 680} +{"train_lr": 0.0002372610632961341, "train_loss": 0.3980418292369741, "epoch": 681} +{"train_lr": 0.00023500500572770275, "train_loss": 0.39808892941651625, "epoch": 682} +{"train_lr": 0.00023276636834502533, "train_loss": 0.39800545562488526, "epoch": 683} +{"train_lr": 0.00023054519376853095, "train_loss": 0.39798971789423376, "epoch": 684} +{"train_lr": 0.000228341524286182, "train_loss": 0.39801577686403805, "epoch": 685} +{"train_lr": 0.00022615540185266786, "train_loss": 0.39793494881059116, "epoch": 686} +{"train_lr": 0.00022398686808860945, "train_loss": 0.3979601376266099, "epoch": 687} +{"train_lr": 0.00022183596427976347, "train_loss": 0.3978403454903179, "epoch": 688} +{"train_lr": 0.0002197027313762382, "train_loss": 0.39773378970149237, "epoch": 689} +{"train_lr": 0.00021758720999171227, "train_loss": 0.39777832595894164, "epoch": 690} +{"train_lr": 0.00021548944040266456, "train_loss": 0.39771761881330836, "epoch": 691} +{"train_lr": 0.0002134094625476033, "train_loss": 0.3976772154848545, "epoch": 692} +{"train_lr": 0.0002113473160263091, "train_loss": 0.3975796118014468, "epoch": 693} +{"train_lr": 0.000209303040099079, "train_loss": 0.3975841266085179, "epoch": 694} +{"train_lr": 0.00020727667368597986, "train_loss": 0.3975522827225713, "epoch": 695} +{"train_lr": 0.00020526825536610726, "train_loss": 
0.39762424131353885, "epoch": 696} +{"train_lr": 0.00020327782337685118, "train_loss": 0.3976167627783397, "epoch": 697} +{"train_lr": 0.0002013054156131673, "train_loss": 0.3974667288130149, "epoch": 698} +{"train_lr": 0.00019935106962685635, "train_loss": 0.39743793080859363, "epoch": 699} +{"train_lr": 0.00019741482262584887, "train_loss": 0.3974320717418614, "epoch": 700} +{"train_lr": 0.00019549671147349638, "train_loss": 0.3973614268649656, "epoch": 701} +{"train_lr": 0.00019359677268787083, "train_loss": 0.39733912353702366, "epoch": 702} +{"train_lr": 0.0001917150424410675, "train_loss": 0.39732482824892473, "epoch": 703} +{"train_lr": 0.00018985155655851815, "train_loss": 0.39729455607429814, "epoch": 704} +{"train_lr": 0.00018800635051830793, "train_loss": 0.3972467629937455, "epoch": 705} +{"train_lr": 0.00018617945945049967, "train_loss": 0.39721219141322833, "epoch": 706} +{"train_lr": 0.00018437091813646575, "train_loss": 0.3971436861514424, "epoch": 707} +{"train_lr": 0.00018258076100822665, "train_loss": 0.3972315847103556, "epoch": 708} +{"train_lr": 0.000180809022147793, "train_loss": 0.3970368156263318, "epoch": 709} +{"train_lr": 0.00017905573528651913, "train_loss": 0.39711282955399024, "epoch": 710} +{"train_lr": 0.000177320933804459, "train_loss": 0.3970147223952107, "epoch": 711} +{"train_lr": 0.00017560465072973276, "train_loss": 0.3970198181236927, "epoch": 712} +{"train_lr": 0.00017390691873789602, "train_loss": 0.3969494656044751, "epoch": 713} +{"train_lr": 0.0001722277701513185, "train_loss": 0.39695064426184845, "epoch": 714} +{"train_lr": 0.0001705672369385691, "train_loss": 0.39693007428640836, "epoch": 715} +{"train_lr": 0.00016892535071380598, "train_loss": 0.39690777982692593, "epoch": 716} +{"train_lr": 0.00016730214273617719, "train_loss": 0.3968320999813911, "epoch": 717} +{"train_lr": 0.00016569764390922197, "train_loss": 0.3967816275723565, "epoch": 718} +{"train_lr": 0.0001641118847802857, "train_loss": 0.39692675177222836, "epoch": 719} +{"train_lr": 0.00016254489553993575, "train_loss": 0.3967170494889172, "epoch": 720} +{"train_lr": 0.00016099670602138892, "train_loss": 0.39678743287909013, "epoch": 721} +{"train_lr": 0.000159467345699942, "train_loss": 0.3967261496746244, "epoch": 722} +{"train_lr": 0.00015795684369241075, "train_loss": 0.3967392017873816, "epoch": 723} +{"train_lr": 0.00015646522875657626, "train_loss": 0.3966885056310835, "epoch": 724} +{"train_lr": 0.0001549925292906367, "train_loss": 0.3965789718248953, "epoch": 725} +{"train_lr": 0.00015353877333266702, "train_loss": 0.39661051045195794, "epoch": 726} +{"train_lr": 0.00015210398856008514, "train_loss": 0.3965992741495705, "epoch": 727} +{"train_lr": 0.00015068820228912496, "train_loss": 0.39652254913921636, "epoch": 728} +{"train_lr": 0.00014929144147431605, "train_loss": 0.39651879141978824, "epoch": 729} +{"train_lr": 0.0001479137327079715, "train_loss": 0.39652866908074474, "epoch": 730} +{"train_lr": 0.0001465551022196798, "train_loss": 0.39642498921602964, "epoch": 731} +{"train_lr": 0.0001452155758758071, "train_loss": 0.3964142541741379, "epoch": 732} +{"train_lr": 0.00014389517917900418, "train_loss": 0.3964455056613168, "epoch": 733} +{"train_lr": 0.00014259393726772084, "train_loss": 0.39640393293498516, "epoch": 734} +{"train_lr": 0.00014131187491572722, "train_loss": 0.39643978867799234, "epoch": 735} +{"train_lr": 0.00014004901653164286, "train_loss": 0.3962538736776855, "epoch": 736} +{"train_lr": 0.00013880538615847047, "train_loss": 0.3962314602148791, 
"epoch": 737} +{"train_lr": 0.00013758100747314012, "train_loss": 0.39621207925777596, "epoch": 738} +{"train_lr": 0.00013637590378605678, "train_loss": 0.39627872894589716, "epoch": 739} +{"train_lr": 0.00013519009804065788, "train_loss": 0.39609781629704416, "epoch": 740} +{"train_lr": 0.000134023612812975, "train_loss": 0.3961053301717561, "epoch": 741} +{"train_lr": 0.00013287647031120598, "train_loss": 0.39621051645372063, "epoch": 742} +{"train_lr": 0.00013174869237529024, "train_loss": 0.39617534446565866, "epoch": 743} +{"train_lr": 0.00013064030047649377, "train_loss": 0.39614509605169773, "epoch": 744} +{"train_lr": 0.00012955131571700112, "train_loss": 0.3961230623297011, "epoch": 745} +{"train_lr": 0.00012848175882951195, "train_loss": 0.396136003274781, "epoch": 746} +{"train_lr": 0.00012743165017684786, "train_loss": 0.3959994736360386, "epoch": 747} +{"train_lr": 0.00012640100975156387, "train_loss": 0.3958685474846369, "epoch": 748} +{"train_lr": 0.00012538985717556808, "train_loss": 0.39600011887542236, "epoch": 749} +{"train_lr": 0.00012439821169974797, "train_loss": 0.39602591505405516, "epoch": 750} +{"train_lr": 0.00012342609220360385, "train_loss": 0.39595969730856806, "epoch": 751} +{"train_lr": 0.00012247351719488973, "train_loss": 0.39595074674042946, "epoch": 752} +{"train_lr": 0.00012154050480926074, "train_loss": 0.3958544527592424, "epoch": 753} +{"train_lr": 0.0001206270728099278, "train_loss": 0.39586364505334926, "epoch": 754} +{"train_lr": 0.0001197332385873192, "train_loss": 0.395963006802142, "epoch": 755} +{"train_lr": 0.00011885901915875058, "train_loss": 0.3957122047342217, "epoch": 756} +{"train_lr": 0.00011800443116809937, "train_loss": 0.39583009899032706, "epoch": 757} +{"train_lr": 0.00011716949088548901, "train_loss": 0.3957149375600215, "epoch": 758} +{"train_lr": 0.00011635421420697925, "train_loss": 0.3958443301753738, "epoch": 759} +{"train_lr": 0.00011555861665426263, "train_loss": 0.3956724483346662, "epoch": 760} +{"train_lr": 0.00011478271337436975, "train_loss": 0.3958091593037049, "epoch": 761} +{"train_lr": 0.00011402651913938054, "train_loss": 0.39575345273046064, "epoch": 762} +{"train_lr": 0.0001132900483461433, "train_loss": 0.39577234872222805, "epoch": 763} +{"train_lr": 0.00011257331501600035, "train_loss": 0.3956845926377588, "epoch": 764} +{"train_lr": 0.00011187633279452117, "train_loss": 0.3957371470810941, "epoch": 765} +{"train_lr": 0.00011119911495124251, "train_loss": 0.39570510602639747, "epoch": 766} +{"train_lr": 0.00011054167437941602, "train_loss": 0.3956630747460832, "epoch": 767} +{"train_lr": 0.00010990402359576233, "train_loss": 0.39567503699352247, "epoch": 768} +{"train_lr": 0.00010928617474023332, "train_loss": 0.39571998205680686, "epoch": 769} +{"train_lr": 0.00010868813957578054, "train_loss": 0.3956101418216116, "epoch": 770} +{"train_lr": 0.00010810992948813149, "train_loss": 0.3955000247179459, "epoch": 771} +{"train_lr": 0.00010755155548557293, "train_loss": 0.39557026598590594, "epoch": 772} +{"train_lr": 0.00010701302819874079, "train_loss": 0.3955783303707647, "epoch": 773} +{"train_lr": 0.00010649435788041832, "train_loss": 0.3955380746629089, "epoch": 774} +{"train_lr": 0.00010599555440534079, "train_loss": 0.3954764198249158, "epoch": 775} +{"train_lr": 0.00010551662727000747, "train_loss": 0.39549409868852353, "epoch": 776} +{"train_lr": 0.00010505758559250056, "train_loss": 0.39542940607031757, "epoch": 777} +{"train_lr": 0.00010461843811231193, "train_loss": 0.3954961236046914, "epoch": 778} 
+{"train_lr": 0.00010419919319017639, "train_loss": 0.39550039982303786, "epoch": 779} +{"train_lr": 0.00010379985880791331, "train_loss": 0.39544806897771567, "epoch": 780} +{"train_lr": 0.00010342044256827326, "train_loss": 0.3954047037479587, "epoch": 781} +{"train_lr": 0.00010306095169479492, "train_loss": 0.3954766464855474, "epoch": 782} +{"train_lr": 0.00010272139303166615, "train_loss": 0.3953952655727521, "epoch": 783} +{"train_lr": 0.00010240177304359433, "train_loss": 0.39547297515822816, "epoch": 784} +{"train_lr": 0.0001021020978156836, "train_loss": 0.39534901476536805, "epoch": 785} +{"train_lr": 0.00010182237305331808, "train_loss": 0.39531362823282296, "epoch": 786} +{"train_lr": 0.00010156260408205405, "train_loss": 0.3954241632883891, "epoch": 787} +{"train_lr": 0.00010132279584751836, "train_loss": 0.3952771816163873, "epoch": 788} +{"train_lr": 0.0001011029529153142, "train_loss": 0.3954083356791391, "epoch": 789} +{"train_lr": 0.00010090307947093394, "train_loss": 0.39527252094987303, "epoch": 790} +{"train_lr": 0.00010072317931967978, "train_loss": 0.3953342796780933, "epoch": 791} +{"train_lr": 0.00010056325588659148, "train_loss": 0.39531643394058424, "epoch": 792} +{"train_lr": 0.00010042331221638053, "train_loss": 0.3953475776182201, "epoch": 793} +{"train_lr": 0.00010030335097337291, "train_loss": 0.3953579601873524, "epoch": 794} +{"train_lr": 0.00010020337444145742, "train_loss": 0.39527962923061866, "epoch": 795} +{"train_lr": 0.00010012338452404336, "train_loss": 0.39528626886805374, "epoch": 796} +{"train_lr": 0.00010006338274402353, "train_loss": 0.3952554755826266, "epoch": 797} +{"train_lr": 0.0001000233702437451, "train_loss": 0.39523102716805464, "epoch": 798} +{"train_lr": 0.00010000334778498856, "train_loss": 0.39522883877193987, "epoch": 799} diff --git a/CV/MAE/main_finetune.py b/CV/MAE/main_finetune.py new file mode 100644 index 0000000..2f30421 --- /dev/null +++ b/CV/MAE/main_finetune.py @@ -0,0 +1,391 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import argparse +from ast import arg +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +from timm.models.helpers import load_state_dict + +import timm + +#assert timm.__version__ == "0.3.2" # version check +from timm.models.layers import trunc_normal_ +from timm.data.mixup import Mixup +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from adan import Adan + +import util.lr_decay as lrd +import util.misc as misc +from util.datasets import build_dataset +from util.pos_embed import interpolate_pos_embed +from util.misc import NativeScalerWithGradNormCount as NativeScaler + +import models_vit + +from engine_finetune import train_one_epoch, evaluate + +def get_args_parser(): + parser = argparse.ArgumentParser('MAE fine-tuning for image classification', add_help=False) + parser.add_argument('--batch_size', default=64, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=50, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--input_size', default=224, type=int, + help='images input size') + + parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT', + help='Drop path rate (default: 0.1)') + + # Optimizer parameters + parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--layer_decay', type=float, default=0.75, + help='layer-wise lr decay from ELECTRA/BEiT') + + parser.add_argument('--min-lr', type=float, default=1e-6, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup-epochs', type=int, default=5, metavar='N', + help='epochs to warmup LR') + + + parser.add_argument('--use-adan', action='store_true', default=True, + help='whether to use Adan') + parser.add_argument('--max-grad-norm', type=float, default=0.0, + help='max grad norm (default: 0.0 for no clip)') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--bias-decay', action='store_true', default=False, + help='whether to decay bias term') + + # Augmentation parameters + parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT', + 
help='Color jitter factor (enabled only when not using Auto/RandAug)') + parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original" (default: rand-m9-mstd0.5-inc1)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + + # * Random Erase params + parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') + parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + + # * Mixup params + parser.add_argument('--mixup', type=float, default=0, + help='mixup alpha, mixup enabled if > 0.') + parser.add_argument('--cutmix', type=float, default=0, + help='cutmix alpha, cutmix enabled if > 0.') + parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup_prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup_switch_prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup_mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + + # * Finetuning params + parser.add_argument('--finetune', default='', + help='finetune from checkpoint') + parser.add_argument('--global_pool', action='store_true') + parser.set_defaults(global_pool=True) + parser.add_argument('--cls_token', action='store_false', dest='global_pool', + help='Use class token instead of global pool for classification') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--nb_classes', default=1000, type=int, + help='number of the classification types') + + parser.add_argument('--output_dir', default=None, + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='./output_dir/', + help='path where to tensorboard log') + parser.add_argument('--device', default='cuda:0', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', + help='resume from checkpoint') + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true', + help='Perform evaluation only') + parser.add_argument('--dist_eval', action='store_true', default=False, + help='Enabling distributed evaluation (recommended during training for faster monitoring)') + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank',
default=0, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + + return parser + + +def main(args): + args.device = 'cuda:0' + args.world_size = 1 + args.rank = 0 # global rank + args.gpu = 0 + #misc.init_distributed_mode(args) + misc.init_distributed_ddpjob(args) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + dataset_train = build_dataset(is_train=True, args=args) + dataset_val = build_dataset(is_train=False, args=args) + + if True: # args.distributed: + num_tasks = misc.get_world_size() + global_rank = misc.get_rank() + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True) # shuffle=True to reduce monitor bias + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + if misc.is_main_process() and args.log_dir is not None and not args.eval: + TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.datetime.now()) + + args.log_dir = args.log_dir+ 'mae-' + TIMESTAMP + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + print("Mixup is activated!") + mixup_fn = Mixup( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.nb_classes) + + model = models_vit.__dict__[args.model]( + num_classes=args.nb_classes, + drop_path_rate=args.drop_path, + global_pool=args.global_pool, + ) + + if args.finetune and not args.eval: + #checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load pre-trained checkpoint from: %s" % args.finetune) + checkpoint_model = load_state_dict(args.finetune) + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + if args.global_pool: + assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'} + else: + assert set(msg.missing_keys) == {'head.weight', 'head.bias'} + + + # manually initialize fc layer + trunc_normal_(model.head.weight, std=1e-5) + + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params (M): %.2f' % (n_parameters / 1.e6)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + + # build optimizer with layer-wise lr decay (lrd) + + if args.use_adan: + param_groups = lrd.param_groups_lrd(model_without_ddp, args.weight_decay, + no_weight_decay_list=[] if args.bias_decay else model_without_ddp.no_weight_decay(), + layer_decay=args.layer_decay + ) + optimizer = Adan(param_groups, weight_decay=args.weight_decay, + lr=args.lr, betas=args.opt_betas, eps=args.opt_eps, max_grad_norm=args.max_grad_norm + ) + else: + param_groups = lrd.param_groups_lrd(model_without_ddp, args.weight_decay, + no_weight_decay_list=model_without_ddp.no_weight_decay(), + layer_decay=args.layer_decay + ) + optimizer = torch.optim.AdamW(param_groups, lr=args.lr) + #print(optimizer) + loss_scaler = NativeScaler() + + if mixup_fn is not None: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif args.smoothing > 0.: + criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + criterion = torch.nn.CrossEntropyLoss() + + print("criterion = %s" % str(criterion)) + + + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if args.eval: + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + exit(0) + + print(f"Start training for {args.epochs}
epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + args.clip_grad, mixup_fn, + log_writer=log_writer, + args=args + ) + if args.output_dir and (epoch+1) % 10 == 0: + misc.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + max_accuracy = max(max_accuracy, test_stats["acc1"]) + print(f'Max accuracy: {max_accuracy:.2f}%') + + if log_writer is not None: + log_writer.add_scalar('perf/test_acc1', test_stats['acc1'], epoch) + log_writer.add_scalar('perf/test_acc5', test_stats['acc5'], epoch) + log_writer.add_scalar('perf/test_loss', test_stats['loss'], epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.log_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.log_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) \ No newline at end of file diff --git a/CV/MAE/main_linprobe.py b/CV/MAE/main_linprobe.py new file mode 100644 index 0000000..2d3f241 --- /dev/null +++ b/CV/MAE/main_linprobe.py @@ -0,0 +1,316 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- + +import argparse +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import timm + +assert timm.__version__ == "0.3.2" # version check +from timm.models.layers import trunc_normal_ + +import util.misc as misc +from util.pos_embed import interpolate_pos_embed +from util.misc import NativeScalerWithGradNormCount as NativeScaler +from util.lars import LARS +from util.crop import RandomResizedCrop + +import models_vit + +from engine_finetune import train_one_epoch, evaluate + + +def get_args_parser(): + parser = argparse.ArgumentParser('MAE linear probing for image classification', add_help=False) + parser.add_argument('--batch_size', default=512, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=90, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + # Optimizer parameters + parser.add_argument('--weight_decay', type=float, default=0, + help='weight decay (default: 0 for linear probe following MoCo v1)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=0.1, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup_epochs', type=int, default=10, metavar='N', + help='epochs to warmup LR') + + # * Finetuning params + parser.add_argument('--finetune', default='', + help='finetune from checkpoint') + parser.add_argument('--global_pool', action='store_true') + parser.set_defaults(global_pool=False) + parser.add_argument('--cls_token', action='store_false', dest='global_pool', + help='Use class token instead of global pool for classification') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--nb_classes', default=1000, type=int, + help='number of the classification types') + + parser.add_argument('--output_dir', default='./output_dir', + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='./output_dir', + help='path where to tensorboard log') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', + help='resume from checkpoint') + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true', + help='Perform evaluation only') + parser.add_argument('--dist_eval', action='store_true', 
default=False, + help='Enabling distributed evaluation (recommended during training for faster monitor') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + + return parser + + +def main(args): + misc.init_distributed_mode(args) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # linear probe: weak augmentation + transform_train = transforms.Compose([ + RandomResizedCrop(224, interpolation=3), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + transform_val = transforms.Compose([ + transforms.Resize(256, interpolation=3), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + dataset_train = datasets.ImageFolder(os.path.join(args.data_path, 'train'), transform=transform_train) + dataset_val = datasets.ImageFolder(os.path.join(args.data_path, 'val'), transform=transform_val) + print(dataset_train) + print(dataset_val) + + if True: # args.distributed: + num_tasks = misc.get_world_size() + global_rank = misc.get_rank() + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. 
' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True) # shuffle=True to reduce monitor bias + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + if global_rank == 0 and args.log_dir is not None and not args.eval: + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + model = models_vit.__dict__[args.model]( + num_classes=args.nb_classes, + global_pool=args.global_pool, + ) + + if args.finetune and not args.eval: + checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load pre-trained checkpoint from: %s" % args.finetune) + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + if args.global_pool: + assert set(msg.missing_keys) == {'head.weight', 'head.bias', 'fc_norm.weight', 'fc_norm.bias'} + else: + assert set(msg.missing_keys) == {'head.weight', 'head.bias'} + + # manually initialize fc layer: following MoCo v3 + trunc_normal_(model.head.weight, std=0.01) + + # for linear prob only + # hack: revise model's head with BN + model.head = torch.nn.Sequential(torch.nn.BatchNorm1d(model.head.in_features, affine=False, eps=1e-6), model.head) + # freeze all but the head + for _, p in model.named_parameters(): + p.requires_grad = False + for _, p in model.head.named_parameters(): + p.requires_grad = True + + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params (M): %.2f' % (n_parameters / 1.e6)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + + optimizer = LARS(model_without_ddp.head.parameters(), lr=args.lr, weight_decay=args.weight_decay) + print(optimizer) + loss_scaler = NativeScaler() + + criterion = torch.nn.CrossEntropyLoss() + + print("criterion = %s" % str(criterion)) + + misc.load_model(args=args, 
model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if args.eval: + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + exit(0) + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + max_norm=None, + log_writer=log_writer, + args=args + ) + if args.output_dir: + misc.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + max_accuracy = max(max_accuracy, test_stats["acc1"]) + print(f'Max accuracy: {max_accuracy:.2f}%') + + if log_writer is not None: + log_writer.add_scalar('perf/test_acc1', test_stats['acc1'], epoch) + log_writer.add_scalar('perf/test_acc5', test_stats['acc5'], epoch) + log_writer.add_scalar('perf/test_loss', test_stats['loss'], epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/CV/MAE/main_pretrain.py b/CV/MAE/main_pretrain.py new file mode 100644 index 0000000..2f134f5 --- /dev/null +++ b/CV/MAE/main_pretrain.py @@ -0,0 +1,277 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import time +from pathlib import Path + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets +from adan import Adan +from timm.models import resume_checkpoint + +import timm + +#assert timm.__version__ == "0.3.2" # version check +import timm.optim.optim_factory as optim_factory +from timm.utils import * + +import util.misc as misc +from util.misc import NativeScalerWithGradNormCount as NativeScaler + +import models_mae + +from engine_pretrain import train_one_epoch + + +def get_args_parser(): + parser = argparse.ArgumentParser('MAE pre-training', add_help=False) + parser.add_argument('--batch_size', default=64, type=int, + help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + parser.add_argument('--epochs', default=400, type=int) + parser.add_argument('--accum_iter', default=1, type=int, + help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + + # Model parameters + parser.add_argument('--model', default='mae_vit_large_patch16', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--input_size', default=224, type=int, + help='images input size') + + parser.add_argument('--mask_ratio', default=0.75, type=float, + help='Masking ratio (percentage of removed patches).') + + parser.add_argument('--norm_pix_loss', action='store_true', + help='Use (per-patch) normalized pixels as targets for computing loss') + parser.set_defaults(norm_pix_loss=False) + + # Optimizer parameters + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + + parser.add_argument('--lr', type=float, default=None, metavar='LR', + help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', + help='epochs to warmup LR') + + + parser.add_argument('--use-adan', action='store_true', default=False, + help='whether to use Adan') + parser.add_argument('--max-grad-norm', type=float, default=0.0, + help='max grad norm (default: 0.0 for no clip)') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--bias-decay', action='store_true', default=False, + help='whether to decay bias term') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + + parser.add_argument('--output_dir', default=None, + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default='./pretrain_dir/', + help='path where to tensorboard log') + 
parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default=None, + help='resume from checkpoint') + parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + + return parser + + +def main(args): + misc.init_distributed_mode(args) + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = torch.device(args.device) + + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # simple augmentation + transform_train = transforms.Compose([ + transforms.RandomResizedCrop(args.input_size, scale=(0.2, 1.0), interpolation=3), # 3 is bicubic + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + dataset_train = datasets.ImageFolder(os.path.join(args.data_path, 'train'), transform=transform_train) + print(dataset_train) + + if True: # args.distributed: + num_tasks = misc.get_world_size() + global_rank = misc.get_rank() + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + + if misc.is_main_process() and args.log_dir is not None: + TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.datetime.now()) + + args.log_dir = args.log_dir+ 'mae-' + TIMESTAMP + os.makedirs(args.log_dir, exist_ok=True) + log_writer = SummaryWriter(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + # define the model + model = models_mae.__dict__[args.model](norm_pix_loss=args.norm_pix_loss) + + model.to(device) + + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], 
find_unused_parameters=True) + model_without_ddp = model.module + + # following timm: set wd as 0 for bias and norm layers + param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay) + if args.use_adan: + if args.bias_decay: + param = model_without_ddp.parameters() + else: + param = param_groups + args.weight_decay = 0.0 + optimizer = Adan(param, weight_decay=args.weight_decay, + lr=args.lr, betas=args.opt_betas, eps = args.opt_eps, max_grad_norm=args.max_grad_norm + ) + else: + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + resume_epoch = None + + if not args.resume: + resume_path = os.path.join(args.output_dir, "last.pth.tar") + print(resume_path, os.path.isfile(resume_path)) + if os.path.isfile(resume_path): args.resume = resume_path + + if args.resume: + resume_epoch = resume_checkpoint( + model_without_ddp, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=misc.is_main_process()) + if resume_epoch is not None: + args.start_epoch = resume_epoch + + #misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + print(f"Start training for {args.epochs} epochs") + saver = None + if misc.is_main_process() and args.output_dir is not None: + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, amp_scaler=loss_scaler, + checkpoint_dir=args.output_dir, recovery_dir=args.output_dir, decreasing=True, max_history=2) + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + train_stats = train_one_epoch( + model, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args + ) + if saver is not None: + # save proper checkpoint with eval metric + + saver.save_checkpoint(epoch, train_stats['loss']) + # if args.output_dir and (epoch % 25 == 0 or epoch + 1 == args.epochs): + # misc.save_model( + # args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + # loss_scaler=loss_scaler, epoch=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/CV/MAE/models_mae.py b/CV/MAE/models_mae.py new file mode 100644 index 0000000..880e28f --- /dev/null +++ b/CV/MAE/models_mae.py @@ -0,0 +1,250 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +from functools import partial + +import torch +import torch.nn as nn + +from timm.models.vision_transformer import PatchEmbed, Block + +from util.pos_embed import get_2d_sincos_pos_embed + + +class MaskedAutoencoderViT(nn.Module): + """ Masked Autoencoder with VisionTransformer backbone + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, + embed_dim=1024, depth=24, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False): + super().__init__() + + # -------------------------------------------------------------------------- + # MAE encoder specifics + self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False) # fixed sin-cos embedding + + self.blocks = nn.ModuleList([ + Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + # -------------------------------------------------------------------------- + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim)) + + self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, decoder_embed_dim), requires_grad=False) # fixed sin-cos embedding + + self.decoder_blocks = nn.ModuleList([ + Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer) + for i in range(decoder_depth)]) + + self.decoder_norm = norm_layer(decoder_embed_dim) + self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size**2 * in_chans, bias=True) # decoder to patch + # -------------------------------------------------------------------------- + + self.norm_pix_loss = norm_pix_loss + + self.initialize_weights() + + def initialize_weights(self): + # initialization + # initialize (and freeze) pos_embed by sin-cos embedding + pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=True) + self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=True) + self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.) 
+ torch.nn.init.normal_(self.cls_token, std=.02) + torch.nn.init.normal_(self.mask_token, std=.02) + + # initialize nn.Linear and nn.LayerNorm + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def patchify(self, imgs): + """ + imgs: (N, 3, H, W) + x: (N, L, patch_size**2 *3) + """ + p = self.patch_embed.patch_size[0] + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 + + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x = torch.einsum('nchpwq->nhwpqc', x) + x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) + return x + + def unpatchify(self, x): + """ + x: (N, L, patch_size**2 *3) + imgs: (N, 3, H, W) + """ + p = self.patch_embed.patch_size[0] + h = w = int(x.shape[1]**.5) + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, 3)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p)) + return imgs + + def random_masking(self, x, mask_ratio): + """ + Perform per-sample random masking by per-sample shuffling. + Per-sample shuffling is done by argsort random noise. + x: [N, L, D], sequence + """ + N, L, D = x.shape # batch, length, dim + len_keep = int(L * (1 - mask_ratio)) + + noise = torch.rand(N, L, device=x.device) # noise in [0, 1] + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) + + # generate the binary mask: 0 is keep, 1 is remove + mask = torch.ones([N, L], device=x.device) + mask[:, :len_keep] = 0 + # unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return x_masked, mask, ids_restore + + def forward_encoder(self, x, mask_ratio): + # embed patches + x = self.patch_embed(x) + + # add pos embed w/o cls token + x = x + self.pos_embed[:, 1:, :] + + # masking: length -> length * mask_ratio + x, mask, ids_restore = self.random_masking(x, mask_ratio) + + # append cls token + cls_token = self.cls_token + self.pos_embed[:, :1, :] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + # apply Transformer blocks + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + + return x, mask, ids_restore + + def forward_decoder(self, x, ids_restore): + # embed tokens + x = self.decoder_embed(x) + + # append mask tokens to sequence + mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1) + x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token + x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])) # unshuffle + x = torch.cat([x[:, :1, :], x_], dim=1) # append cls token + + # add pos embed + x = x + self.decoder_pos_embed + + # apply Transformer blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # predictor projection + x = self.decoder_pred(x) + + # remove cls token + x = x[:, 1:, :] + + return x + + def forward_loss(self, imgs, pred, mask): + """ + imgs: [N, 3, H, W] + pred: [N, L, p*p*3] + 
mask: [N, L], 0 is keep, 1 is remove, + """ + target = self.patchify(imgs) + if self.norm_pix_loss: + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.e-6)**.5 + + loss = (pred - target) ** 2 + loss = loss.mean(dim=-1) # [N, L], mean loss per patch + + loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches + return loss + + def forward(self, imgs, mask_ratio=0.75): + latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio) + pred = self.forward_decoder(latent, ids_restore) # [N, L, p*p*3] + loss = self.forward_loss(imgs, pred, mask) + return loss, pred, mask + + +def mae_vit_base_patch16_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=16, embed_dim=768, depth=12, num_heads=12, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def mae_vit_large_patch16_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def mae_vit_huge_patch14_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, + decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, + mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +# set recommended archs +mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b # decoder: 512 dim, 8 blocks +mae_vit_large_patch16 = mae_vit_large_patch16_dec512d8b # decoder: 512 dim, 8 blocks +mae_vit_huge_patch14 = mae_vit_huge_patch14_dec512d8b # decoder: 512 dim, 8 blocks diff --git a/CV/MAE/models_vit.py b/CV/MAE/models_vit.py new file mode 100644 index 0000000..2244a17 --- /dev/null +++ b/CV/MAE/models_vit.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +from functools import partial + +import torch +import torch.nn as nn + +import timm.models.vision_transformer + + +class VisionTransformer(timm.models.vision_transformer.VisionTransformer): + """ Vision Transformer with support for global average pooling + """ + def __init__(self, global_pool=False, **kwargs): + super(VisionTransformer, self).__init__(**kwargs) + + self.global_pool = global_pool + if self.global_pool: + norm_layer = kwargs['norm_layer'] + embed_dim = kwargs['embed_dim'] + self.fc_norm = norm_layer(embed_dim) + + del self.norm # remove the original norm + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + if self.global_pool: + x = x[:, 1:, :].mean(dim=1) # global pool without cls token + outcome = self.fc_norm(x) + else: + x = self.norm(x) + outcome = x[:, 0] + + return outcome + + +def vit_base_patch16(**kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_large_patch16(**kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_huge_patch14(**kwargs): + model = VisionTransformer( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model \ No newline at end of file diff --git a/CV/MAE/util/crop.py b/CV/MAE/util/crop.py new file mode 100644 index 0000000..fcb2612 --- /dev/null +++ b/CV/MAE/util/crop.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch + +from torchvision import transforms +from torchvision.transforms import functional as F + + +class RandomResizedCrop(transforms.RandomResizedCrop): + """ + RandomResizedCrop for matching TF/TPU implementation: no for-loop is used. + This may lead to results different with torchvision's version. 
+ Following BYOL's TF code: + https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206 + """ + @staticmethod + def get_params(img, scale, ratio): + width, height = F._get_image_size(img) + area = height * width + + target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item() + log_ratio = torch.log(torch.tensor(ratio)) + aspect_ratio = torch.exp( + torch.empty(1).uniform_(log_ratio[0], log_ratio[1]) + ).item() + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + w = min(w, width) + h = min(h, height) + + i = torch.randint(0, height - h + 1, size=(1,)).item() + j = torch.randint(0, width - w + 1, size=(1,)).item() + + return i, j, h, w \ No newline at end of file diff --git a/CV/MAE/util/datasets.py b/CV/MAE/util/datasets.py new file mode 100644 index 0000000..0dde1f4 --- /dev/null +++ b/CV/MAE/util/datasets.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +import os +import PIL + +from torchvision import datasets, transforms + +from timm.data import create_transform +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + +def build_dataset(is_train, args): + transform = build_transform(is_train, args) + + root = os.path.join(args.data_path, 'train' if is_train else 'val') + dataset = datasets.ImageFolder(root, transform=transform) + + print(dataset) + + return dataset + + +def build_transform(is_train, args): + mean = IMAGENET_DEFAULT_MEAN + std = IMAGENET_DEFAULT_STD + # train transform + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation='bicubic', + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + mean=mean, + std=std, + ) + return transform + + # eval transform + t = [] + if args.input_size <= 224: + crop_pct = 224 / 256 + else: + crop_pct = 1.0 + size = int(args.input_size / crop_pct) + t.append( + transforms.Resize(size, interpolation=PIL.Image.BICUBIC), # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(mean, std)) + return transforms.Compose(t) diff --git a/CV/MAE/util/lars.py b/CV/MAE/util/lars.py new file mode 100644 index 0000000..509c5f6 --- /dev/null +++ b/CV/MAE/util/lars.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# LARS optimizer, implementation from MoCo v3: +# https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- + +import torch + + +class LARS(torch.optim.Optimizer): + """ + LARS optimizer, no rate scaling or weight decay for parameters <= 1D. 
+ """ + def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient) + super().__init__(params, defaults) + + @torch.no_grad() + def step(self): + for g in self.param_groups: + for p in g['params']: + dp = p.grad + + if dp is None: + continue + + if p.ndim > 1: # if not normalization gamma/beta or bias + dp = dp.add(p, alpha=g['weight_decay']) + param_norm = torch.norm(p) + update_norm = torch.norm(dp) + one = torch.ones_like(param_norm) + q = torch.where(param_norm > 0., + torch.where(update_norm > 0, + (g['trust_coefficient'] * param_norm / update_norm), one), + one) + dp = dp.mul(q) + + param_state = self.state[p] + if 'mu' not in param_state: + param_state['mu'] = torch.zeros_like(p) + mu = param_state['mu'] + mu.mul_(g['momentum']).add_(dp) + p.add_(mu, alpha=-g['lr']) \ No newline at end of file diff --git a/CV/MAE/util/lr_decay.py b/CV/MAE/util/lr_decay.py new file mode 100644 index 0000000..7fa11f1 --- /dev/null +++ b/CV/MAE/util/lr_decay.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# ELECTRA https://github.com/google-research/electra +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import json + + +def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75): + """ + Parameter groups for layer-wise lr decay + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 + """ + param_group_names = {} + param_groups = {} + + num_layers = len(model.blocks) + 1 + + layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1)) + + for n, p in model.named_parameters(): + if not p.requires_grad: + continue + + # no decay: all 1D parameters and model specific ones + if p.ndim == 1 or n in no_weight_decay_list: + g_decay = "no_decay" + this_decay = 0. + else: + g_decay = "decay" + this_decay = weight_decay + + layer_id = get_layer_id_for_vit(n, num_layers) + group_name = "layer_%d_%s" % (layer_id, g_decay) + + if group_name not in param_group_names: + this_scale = layer_scales[layer_id] + + param_group_names[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + param_groups[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + + param_group_names[group_name]["params"].append(n) + param_groups[group_name]["params"].append(p) + + # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) + + return list(param_groups.values()) + + +def get_layer_id_for_vit(name, num_layers): + """ + Assign a parameter with its layer id + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33 + """ + if name in ['cls_token', 'pos_embed']: + return 0 + elif name.startswith('patch_embed'): + return 0 + elif name.startswith('blocks'): + return int(name.split('.')[1]) + 1 + else: + return num_layers \ No newline at end of file diff --git a/CV/MAE/util/lr_sched.py b/CV/MAE/util/lr_sched.py new file mode 100644 index 0000000..4cb682b --- /dev/null +++ b/CV/MAE/util/lr_sched.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +def adjust_learning_rate(optimizer, epoch, args): + """Decay the learning rate with half-cycle cosine after warmup""" + if epoch < args.warmup_epochs: + lr = args.lr * epoch / args.warmup_epochs + else: + lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ + (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) + for param_group in optimizer.param_groups: + if "lr_scale" in param_group: + param_group["lr"] = lr * param_group["lr_scale"] + else: + param_group["lr"] = lr + return lr diff --git a/CV/MAE/util/misc.py b/CV/MAE/util/misc.py new file mode 100644 index 0000000..b62001a --- /dev/null +++ b/CV/MAE/util/misc.py @@ -0,0 +1,366 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import builtins +import datetime +import os +import time +from collections import defaultdict, deque +from pathlib import Path + +import torch +import torch.distributed as dist +from torch._six import inf + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if v is None: + continue + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if torch.cuda.is_available(): + log_msg.append('max mem: {memory:.0f}') + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + builtin_print = builtins.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + force = force or (get_world_size() > 8) + if is_master or force: + now = datetime.datetime.now().time() + builtin_print('[{}] '.format(now), end='') # print with time 
stamp + builtin_print(*args, **kwargs) + + builtins.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_ddpjob(args=None): + """ + initialize the ddp job + """ + if not dist.is_available() or not dist.is_initialized(): + try: + os.environ['MASTER_PORT'] = '40101' + torch.distributed.init_process_group( + backend='nccl') + except Exception: + world_size, rank = 1, 0 + print('distributed training not available') + print(Exception) + world_size = dist.get_world_size() + rank = dist.get_rank() + assert rank >= 0 + args.gpu = args.rank + args.world_size, args.rank = world_size, rank + + args.distributed = True + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + #torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +def init_distributed_mode(args): + if args.dist_on_itp: + args.rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']) + os.environ['LOCAL_RANK'] = str(args.gpu) + os.environ['RANK'] = str(args.rank) + os.environ['WORLD_SIZE'] = str(args.world_size) + # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] + elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + setup_for_distributed(is_master=True) # hack + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}, gpu {}'.format( + args.rank, args.dist_url, args.gpu), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +class NativeScalerWithGradNormCount: + state_dict_key = "amp_scaler" + + def __init__(self): + self._scaler = torch.cuda.amp.GradScaler() + + def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True): + self._scaler.scale(loss).backward(create_graph=create_graph) + if update_grad: + if clip_grad is not None: + assert parameters is not None + self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place + norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) + else: + self._scaler.unscale_(optimizer) + norm = get_grad_norm_(parameters) + self._scaler.step(optimizer) + self._scaler.update() + else: + norm = None + return norm + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + self._scaler.load_state_dict(state_dict) + + +def get_grad_norm_(parameters, norm_type: 
float = 2.0) -> torch.Tensor: + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + norm_type = float(norm_type) + if len(parameters) == 0: + return torch.tensor(0.) + device = parameters[0].grad.device + if norm_type == inf: + total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) + else: + total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + return total_norm + + +def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler): + output_dir = Path(args.output_dir) + epoch_name = str(epoch) + if loss_scaler is not None: + checkpoint_paths = [output_dir / ('checkpoint-%s.pth' % epoch_name)] + for checkpoint_path in checkpoint_paths: + to_save = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'epoch': epoch, + 'scaler': loss_scaler.state_dict(), + 'args': args, + } + + save_on_master(to_save, checkpoint_path) + else: + client_state = {'epoch': epoch} + model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state) + + +def load_model(args, model_without_ddp, optimizer, loss_scaler): + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + print("Resume checkpoint %s" % args.resume) + if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval): + optimizer.load_state_dict(checkpoint['optimizer']) + args.start_epoch = checkpoint['epoch'] + 1 + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + print("With optim & sched!") + + +def all_reduce_mean(x): + world_size = get_world_size() + if world_size > 1: + x_reduce = torch.tensor(x).cuda() + dist.all_reduce(x_reduce) + x_reduce /= world_size + return x_reduce.item() + else: + return x \ No newline at end of file diff --git a/CV/MAE/util/pos_embed.py b/CV/MAE/util/pos_embed.py new file mode 100644 index 0000000..6acf8bd --- /dev/null +++ b/CV/MAE/util/pos_embed.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# Position embedding utils +# -------------------------------------------------------- + +import numpy as np + +import torch + +# -------------------------------------------------------- +# 2D sine-cosine position embedding +# References: +# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +# -------------------------------------------------------- +# Interpolate position embeddings for high-resolution +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- +def interpolate_pos_embed(model, checkpoint_model): + if 'pos_embed' in checkpoint_model: + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed diff --git a/CV/timm/README.md b/CV/timm/README.md new file mode 100644 index 0000000..3592aba --- /dev/null +++ b/CV/timm/README.md @@ -0,0 +1,79 @@ +# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models + +For vision tasks, our implementation is based on the official [`timm`](https://github.com/rwightman/pytorch-image-models). To reproduce our results, please first refer to [`timm`](https://github.com/rwightman/pytorch-image-models) and install it. Then you can follow the following two steps to reproduce our experiments in paper. + + + +## Environment + +Our experiments for this task are based on the following pkg version. + +```python +torch.__version__ = '1.10.0+cu113' +torchvision.__version__ = '0.11.1+cu113' +timm.__version__ = '0.6.1' +torchaudio.__version__ = '0.10.0+cu113' +``` + +Note that our timm is a developer version. If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:timm](https://hub.docker.com/repository/docker/xyxie/adan-image). 
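+
+As a quick sanity check that the environment and the optimizer work together, the snippet below builds Adan directly from the `adan.py` in this folder and runs a few steps on a toy model. This is only an illustrative sketch (the model, data, and loop are placeholders, not part of the released training scripts); the actual timm integration is described in the next section.
+
+```python
+import torch
+import torch.nn as nn
+
+from adan import Adan  # adan.py from this directory
+
+# Toy model and random data, used only to exercise the optimizer.
+model = nn.Linear(10, 2)
+inputs, targets = torch.randn(8, 10), torch.randint(0, 2, (8,))
+criterion = nn.CrossEntropyLoss()
+
+# Hyper-parameters follow the defaults of the Adan constructor in adan.py;
+# weight_decay=0.02 mirrors the default of the --weight-decay argument below.
+optimizer = Adan(model.parameters(), lr=1e-3, betas=(0.98, 0.92, 0.99),
+                 weight_decay=0.02, max_grad_norm=0.0, no_prox=False)
+
+for _ in range(5):
+    optimizer.zero_grad()
+    loss = criterion(model(inputs), targets)
+    loss.backward()
+    optimizer.step()  # this Adan.step() takes no closure argument
+```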
+
+
+
+## Usage of Adan in timm
+
+### Two steps to use Adan
+
+**Step 1.** Add the Adan-dependent hyper-parameters to `train.py` by adding the following arguments:
+
+```python
+parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clip)')
+parser.add_argument('--weight-decay', type=float, default=0.02, help='weight decay, similar one used in AdamW (default: 0.02)')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where second-order moment is zero (default: None, use opt default 1e-8 in adan)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use opt default [0.98, 0.92, 0.99] in Adan)')
+parser.add_argument('--no-prox', action='store_true', default=False, help='whether to perform weight decay like AdamW (default=False)')
+parser.add_argument('--bias-decay', action='store_true', default=False, help='perform the weight decay on bias term (default=False)')
+
+```
+
+* `bias-decay`: It decides whether or not to perform weight decay on 1) the bias terms, 2) BN parameters, and 3) other 1-d parameters, which are all filtered out by the default setting in timm.
+
+* `no-prox`: It determines the update rule for parameters with weight decay. By default, Adan updates the parameters in the way presented in Algorithm 1 of the paper:
+
+  $$\boldsymbol{\theta}_{k+1} = (1+\lambda \eta)^{-1}\left[\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k)\right],$$
+
+  But one can also update the parameters like AdamW:
+
+  $$\boldsymbol{\theta}_{k+1} = (1-\lambda \eta)\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k).$$
+
+  **In all experiments, we set `no-prox=False` in our paper.**
+
+
+
+**Step 2.** Create the Adan optimizer as follows. In this step, we directly replace the vanilla optimizer creation with the following three substeps.
+
+i) add Adan into `optim_factory.py`:
+  ```python
+  elif opt_lower == 'adan':
+      optimizer = Adan(parameters, **opt_args)
+  ```
+
+ii) import the optimizer creator from `optim_factory` into your training file `train.py`:
+
+  ```python
+  from optim_factory import create_optimizer
+  ```
+
+iii) replace the vanilla creator (`optimizer = create_optimizer(args, model)`) in the training file `train.py` with Adan:
+
+  ```python
+  opt_lower = args.opt.lower()
+  if opt_lower == 'adan':
+      args.opt_args = {'max_grad_norm': args.max_grad_norm, 'no_prox': args.no_prox}
+  optimizer = create_optimizer(args, model, filter_bias_and_bn=not args.bias_decay)
+  ```
+
+
+
+## ImageNet-1K Training Recipe
+
+We provide the specific commands and hyper-parameters for ViTs, ResNets and ConvNexts in this [recipe](./supervised.md).
+
diff --git a/CV/timm/adan.py b/CV/timm/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/CV/timm/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import torch +from torch.optim.optimizer import Optimizer +from timm.utils import * + + +class Adan(Optimizer): + """ + Implements a pytorch variant of Adan + + Adan was proposed in + Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. + https://arxiv.org/abs/2208.06677 + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float, flot], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml b/CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml new file mode 100644 index 0000000..30485ea --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml @@ -0,0 +1,111 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 150 +eval_metric: top1 +experiment: +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 0.0001 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: convnext_tiny_hnf +model_ema: false +model_ema_decay: 
0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/cvnext +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: bicubic +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.04 +workers: 8 diff --git a/CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml b/CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml new file mode 100644 index 0000000..09de86f --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml @@ -0,0 +1,111 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.016 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 0.0001 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: convnext_tiny_hnf +model_ema: true +model_ema_decay: 0.9999 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.9 +opt_eps: 1.0e-08 +output: ./exp_results/cvnext +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 150 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv b/CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv new file mode 100644 index 0000000..eb20334 --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv @@ -0,0 +1,162 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.949120010648455,6.94758,0.098,0.5 +1,6.862144197736468,6.67267,0.7260000094604492,2.8660000064086915 +2,6.758432626724243,6.235245,2.4060000006103515,7.79000000793457 
+3,6.59022855758667,5.71968375,5.324000036621094,14.974000053710938 +4,6.390288966042655,5.161505,9.559999970703124,24.06800000732422 +5,6.191993509020124,4.61118375,15.203999997558594,34.33800004638672 +6,5.93863810811724,4.02059625,22.75800002685547,45.19399994628906 +7,5.739998238427298,3.502533125,29.72400002319336,54.630000031738284 +8,5.5293778691973,3.0821,36.9499999987793,62.65999991210938 +9,5.182862758636475,2.7918725,42.444000041503905,68.35999993652344 +10,5.155720744814191,2.5169634375,46.800000048828124,72.43000000976562 +11,4.918254239218576,2.3485275,50.37200013916016,75.46400005371093 +12,4.932929992675781,2.1857096875,53.25000006347656,77.90200017333984 +13,4.708190441131592,2.069689375,55.221999931640624,79.64600004882813 +14,4.694507598876953,1.96074125,57.45999997558594,80.91000001220704 +15,4.671949318477085,1.8996378125,58.692000075683595,82.26000005859375 +16,4.54720984186445,1.8131734375,59.98400012207031,83.24400005859376 +17,4.458590064729963,1.8123903125,60.53799998535156,83.96999998291015 +18,4.3758848905563354,1.7519709375,61.49800001953125,84.48200008544922 +19,4.340606348855155,1.6863440625,62.391999909667966,85.24799992919922 +20,4.472063779830933,1.68092125,62.82999993164062,85.42400000732422 +21,4.378427471433367,1.679071875,63.790000107421875,86.00400018310548 +22,4.375464507511684,1.648258125,63.99400006347656,86.01799992919922 +23,4.337998969214303,1.62242875,64.24999989746094,86.27799989990234 +24,4.298710312162127,1.59491171875,64.766,86.731999921875 +25,4.22202069418771,1.5892434375,65.10400015625,86.99000014892579 +26,4.316074218068804,1.56527109375,65.22599997558594,87.11799984375 +27,4.191796098436628,1.53780734375,65.55000013183594,87.21200010009765 +28,4.222038558551243,1.575911875,65.37400005859375,87.27199994873047 +29,4.135323320116315,1.5107128125,66.06800002441406,87.29200002441407 +30,4.22194378716605,1.545961875,66.02200002929688,87.62000001953125 +31,4.185418827193124,1.51114390625,66.05799994873047,87.71000005126953 +32,4.096941930907113,1.49620984375,65.96999989990235,87.7519999975586 +33,4.215072478566851,1.4948609375,66.21000018554687,87.73399987060547 +34,4.163071274757385,1.5585396875,66.666,87.91400005126953 +35,4.114607776914324,1.5304534375,66.01000005371094,87.63400002685547 +36,4.118212870189121,1.5224515625,66.20600007080078,87.93999994384765 +37,4.204281193869455,1.5179465625,66.38400010253906,87.89600007080078 +38,4.24107871736799,1.52187875,66.32999999755859,87.8100000732422 +39,4.1643034390040805,1.515391875,66.61999997558594,87.92400002197266 +40,4.070860539163862,1.56338328125,66.33000002685547,87.8499999975586 +41,4.184106230735779,1.514239375,66.48600007568359,87.71399991943359 +42,4.172222001211984,1.50769546875,66.104,87.7760001538086 +43,4.3232389858790805,1.57834875,65.96399992675781,87.65399987060547 +44,4.174148797988892,1.5443553125,65.89199994628906,87.63599997070312 +45,4.1315469571522305,1.51145125,65.72400005615235,87.67199999511719 +46,4.283388665744236,1.565783125,66.0200000805664,87.78800002197265 +47,4.183656964983259,1.525795,65.64399995117188,87.71999999511719 +48,4.234571712357657,1.50795125,65.86199995117188,87.81200017578125 +49,4.198768717902047,1.5564540625,65.36000006591797,87.1520000830078 +50,4.219038724899292,1.5349934375,65.90200008056641,87.5440000756836 +51,4.127701095172337,1.515018125,65.67200011230469,87.53599997070313 +52,4.239894764763968,1.53229078125,65.62400016601562,87.49200007324218 +53,4.229601979255676,1.5197615625,65.15399995605469,87.35199997314453 
+54,4.19003392968859,1.51075296875,65.7220001538086,87.46800004638672 +55,4.192435826574053,1.5914375,65.48400008544922,87.322000078125 +56,4.1021279607500345,1.54005890625,65.38599997802734,87.42000010498047 +57,4.20039519241878,1.5680734375,65.5919998828125,87.57800007568359 +58,4.324156045913696,1.59799640625,64.72200002441406,87.17800012939453 +59,4.239287546702793,1.5902246875,64.84800000732422,86.97599997558594 +60,4.065577728407724,1.44326765625,68.30400000244141,89.15400017333984 +61,4.120057480675833,1.386383125,68.40999994628906,89.24200007324218 +62,4.0325968606131415,1.39331453125,68.53599994628907,89.37199999267578 +63,4.132853150367737,1.42474296875,68.66400004394531,89.28800007080078 +64,4.020219087600708,1.3872065625,68.5939999975586,89.42399996826173 +65,4.013268879481724,1.39158640625,68.34799997070313,89.32800001953125 +66,4.005090730530875,1.3511690625,68.99200005371094,89.52000014648438 +67,3.9683394602366855,1.36299828125,69.29600005126953,89.69600004638671 +68,4.024942125592913,1.340035625,69.46800006835937,89.66600001708984 +69,3.9339629071099416,1.35916578125,69.40000004394531,89.7560000439453 +70,3.9678512130464827,1.35286515625,69.51199994628907,89.79800007324219 +71,4.023373927388873,1.360429375,69.92400012695313,90.07600001953125 +72,3.969754219055176,1.33380875,69.88200002197266,90.17200012451171 +73,3.9590375082833424,1.28158140625,70.1680000756836,90.32199991699218 +epoch,train_loss,eval_loss,eval_top1,eval_top5 +74,3.872572592326573,1.335075,70.19199993652344,90.28600004394531 +75,3.909360408782959,1.312461875,70.52400001708985,90.5280001196289 +76,3.8855138335909163,1.28012890625,70.78799993408204,90.43199994140625 +77,3.919007114001683,1.31800015625,70.6560000390625,90.4380000732422 +78,3.891653691019331,1.336391875,70.90000001708984,90.46399996337891 +79,3.894351737839835,1.28790734375,71.00600001953126,90.64600007080078 +80,3.823104841368539,1.262434375,71.15400001953125,90.76999993896484 +81,3.910448908805847,1.25446109375,71.07600009521484,90.81999999267578 +82,3.924909387316023,1.2769846875,71.60600012207031,90.9519999658203 +83,3.7137380497796193,1.2824403125,71.43800014648437,90.88200001464844 +84,3.8967798948287964,1.24836609375,71.78200009765625,91.15599999267579 +85,3.8117938893181935,1.2596334375,71.88600006347656,91.25200017333984 +86,3.9488723278045654,1.22986734375,72.25800017089844,91.46800001464844 +87,3.7670505387442454,1.22124546875,72.36199996826171,91.43000004394531 +88,3.825251579284668,1.199299375,72.53400001708984,91.48800014892578 +89,3.860643812588283,1.19397125,72.801999921875,91.58000007324219 +90,3.779147114072527,1.20414328125,72.74400001464844,91.69600009277343 +91,3.759223989077977,1.191304375,72.82799999023437,91.7520001196289 +92,3.7066173212868825,1.16685125,73.24599995605469,91.93600001464844 +93,3.715459874698094,1.1712415625,72.98600003417968,91.88200006591796 +94,3.826604655810765,1.16880703125,73.4040000390625,91.98200006591797 +95,3.775054318564279,1.15899234375,73.8859999584961,92.25399999023438 +96,3.782100932938712,1.166160625,73.78200000976562,92.13400014404297 +97,3.743901354925973,1.16621484375,73.6680001171875,92.34799999023437 +98,3.742660846029009,1.1303665625,73.92599998535157,92.48599998779297 +99,3.6611821992056712,1.13907421875,74.32999998291015,92.30999993652344 +100,3.7334849323545183,1.13430890625,74.36000008544922,92.46600009277344 +101,3.634329523359026,1.1071696875,74.45400001220703,92.61800016845703 +102,3.664542010852269,1.114905625,74.62600011230468,92.6480000366211 
+103,3.5832586458751132,1.0819065625,75.06000008789063,92.80000006591797 +104,3.64421922819955,1.0801809375,75.07000005615234,93.16000000976562 +105,3.607390114239284,1.0793553125,75.23599990722656,93.0760001171875 +106,3.5108348301478793,1.07566765625,75.54999998291015,93.04799990966796 +107,3.588543857846941,1.0673165625,75.23600003662109,93.10200000976562 +108,3.5450944219316756,1.05819828125,75.74000008544922,93.2359999609375 +109,3.506815637860979,1.040446875,75.8360001147461,93.39200006591797 +110,3.4770743335996355,1.05782140625,75.7600000415039,93.24000001220703 +111,3.5809445721762523,1.0449415625,75.99400006347656,93.38399998779298 +112,3.569081289427621,1.04761125,76.47800013427734,93.5019999584961 +113,3.497449823788234,1.03080390625,76.34600006347657,93.6019998803711 +114,3.4141811473029002,1.0365765625,76.51400006103516,93.5659999609375 +115,3.488074677331107,1.02879859375,76.75000016601562,93.72600006347656 +116,3.478419746671404,1.00970078125,76.97399990966797,93.84199998535156 +117,3.5675860132489885,1.01771640625,76.95800005859375,93.8620000366211 +118,3.424144216946193,0.9859259375,77.22600010742188,93.91600014404297 +119,3.377763560840062,0.9717196875,77.44999998535157,94.06000008789063 +120,3.500486288751875,0.97851296875,77.43200000488281,94.13199998535156 +121,3.3951018367494856,0.96721609375,77.89600000488281,94.10600008789062 +122,3.4327589103153775,0.972241875,77.86000005615234,94.26000006347657 +123,3.356873188699995,0.95380078125,77.99800005615235,94.25000008789063 +124,3.366360766547067,0.94531078125,78.0940000024414,94.36000000732422 +125,3.2791837453842163,0.93246,78.37000002685546,94.53400008789062 +126,3.3443728174482072,0.944398125,78.48199998046876,94.55600000976563 +127,3.332287073135376,0.93921078125,78.58600000244141,94.56599993164062 +128,3.276544622012547,0.9311209375,78.60599998291016,94.67399998291016 +129,3.270776629447937,0.930044375,78.72600005371093,94.63600008789062 +130,3.1806167364120483,0.91067625,79.11799989990234,94.85600008789062 +131,3.2413502761295865,0.9131765625,79.11400010742187,94.90999993164063 +132,3.3152259417942593,0.91413046875,79.33600010742188,94.94599988037109 +133,3.2410558121544972,0.91534203125,79.33800002685547,94.95599998291016 +134,3.1902000393186296,0.8926434375,79.64600013183593,95.03800008789062 +135,3.2619236537388394,0.87898453125,79.832000078125,95.11199995605469 +136,3.1733152525765553,0.8800503125,79.84800002929687,95.14799993408204 +137,3.2605464458465576,0.87765875,79.98400005859375,95.22400003662109 +138,3.1617833205631802,0.88012453125,80.00399994873047,95.24800003417968 +139,3.149062837873186,0.8756025,80.15600005371094,95.29200005859374 +140,3.11025937965938,0.8646725,80.29399990234376,95.40000006103516 +141,3.0998826708112444,0.8566525,80.40999998046875,95.44000000976563 +142,3.127287915774754,0.8480178125,80.42199994873047,95.53199993164063 +143,3.0388451644352505,0.855489375,80.51200018554688,95.50399995605468 +144,3.076501284326826,0.847876875,80.738000078125,95.59399995605469 +145,3.1298107419695174,0.84905984375,80.82400002929687,95.57399995605469 +146,3.075501118387495,0.84291765625,81.01200005126952,95.6200001123047 +147,3.054636767932347,0.83939078125,80.97999997802734,95.68599998046875 +148,3.1356202363967896,0.8339446875,80.97600008056641,95.58800011230468 +149,2.961950966290065,0.8238534375,81.13400012939454,95.69599995605469 +150,2.983371581350054,0.8275609375,81.26999989746093,95.69399998046875 +151,2.9757753951208934,0.8230825,81.20200000244141,95.65000003417968 
+152,2.9356598343167986,0.82456109375,81.38800002685547,95.78400008544922 +153,2.976205723626273,0.82176859375,81.40400002685547,95.74199998291016 +154,3.0084752525602068,0.82321359375,81.47600010253906,95.78000011230469 +155,2.9824235098702565,0.819495625,81.59399997558593,95.75999998291016 +156,3.010391661099025,0.8145009375,81.54799997558594,95.79999992919922 +157,3.0055614709854126,0.80819,81.62000002685546,95.77600006103516 +158,3.0069884743009294,0.81687484375,81.694000078125,95.80000003417969 +159,2.9622493471418108,0.81747,81.71600012695312,95.86999987792969 diff --git a/CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv b/CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv new file mode 100644 index 0000000..4d3763d --- /dev/null +++ b/CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.94980594090053,6.941925,0.148,0.5639999999332428 +1,6.870656830923898,6.93398,0.14,0.6059999999332428 +2,6.832324981689453,6.9268075,0.14,0.6260000001525879 +3,6.744790928704398,6.85027,0.356,1.423999999732971 +4,6.6263234955923895,6.82559625,0.3519999999332428,1.680000004119873 +5,6.48770945412772,6.8010825,0.445999999294281,1.8840000047302246 +6,6.354598147528512,6.7766625,0.5019999987411499,2.0840000004577637 +7,6.1707538196018765,6.75051,0.5359999960327149,2.357999996337891 +8,6.007028205054147,6.724545,0.5940000102233887,2.5400000085449217 +9,5.772588491439819,6.6968475,0.6180000048828125,2.764000009155273 +10,5.631790944508144,6.66867,0.6499999992370605,2.8380000115966797 +11,5.379027741295951,6.63972,0.7039999964904785,2.983999981994629 +12,5.221321684973581,6.6121825,0.7779999993896485,3.107999990234375 +13,5.053121021815708,6.590085,0.8179999989318848,3.2280000225830077 +14,4.956123658588955,6.572545,0.8080000015258789,3.3900000115966797 +15,4.799027442932129,6.5652675,0.812000001373291,3.498000008239746 +16,4.720616817474365,6.5723075,0.8280000000762939,3.607999997253418 +17,4.573836837496076,6.59066375,0.8199999999237061,3.646000002441406 +18,4.5748593126024515,6.61353875,0.8399999986267089,3.5579999966430664 +19,4.5739074775150845,6.6250425,0.8559999960327148,3.7140000021362303 +20,4.318449326923916,6.61105625,0.937999998626709,3.906000018005371 +21,4.435781615121024,6.56041875,1.032000001525879,4.251999999084473 +22,4.302624361855643,6.4759325,1.2600000071716309,4.878000020751953 +23,4.414106096540179,6.3724225,1.5319999990844726,5.741999996643067 +24,4.211065360477993,6.27572125,1.7319999960327148,6.6620000073242185 +25,4.247744934899466,6.19649875,1.955999998626709,7.483999992370605 +26,4.282609258379255,6.13518,2.046000001068115,8.096000004882812 +27,4.169072525841849,6.0866725,2.1599999997711183,8.647999996643067 +28,4.122129150799343,6.0465125,2.2039999970245363,9.036000012817382 +29,4.049461347716195,6.01146125,2.181999998474121,9.56800002380371 +30,4.055705479213169,5.97383875,2.218000001220703,10.153999978637696 +31,4.163602726800101,5.93298875,2.298000004272461,10.857999990234376 +32,4.081817013876779,5.88474,2.490000000228882,11.744000009765625 +33,4.099612167903355,5.82870625,2.7820000045776365,12.73000000793457 +34,4.05392244883946,5.77154125,3.1359999996948242,13.936000022583007 +35,4.031520996774946,5.70855,3.588000005950928,15.310000037231445 +36,3.9380472217287337,5.63284125,4.223999998474121,16.89000000366211 +37,4.008964129856655,5.55018625,5.134000010375977,18.955999996948243 +38,3.9080281938825334,5.4650375,6.213999992675781,21.212000052490236 
+39,3.9426505735942294,5.37652875,7.4720000238037105,23.502000013427736 +40,3.8746714421680997,5.275275,8.935999995117188,25.958000014648437 +41,3.925470062664577,5.16479625,10.686000018310548,28.565999997558595 +42,3.8827988420213972,5.038985,12.961999990234375,31.956000020751954 +43,3.8133669240134105,4.8973225,15.324,35.33399998168945 +44,3.895782709121704,4.735395,18.004000031738283,39.12999999145508 +45,3.840534363474165,4.5540025,20.978000014648437,43.07599996826172 +46,3.8127967289515903,4.3406675,24.086000004882813,47.30399996459961 +47,3.791675567626953,4.10886875,27.554000001220704,51.619999987792966 +48,3.899762204715184,3.8762425,30.89399997314453,55.756 +49,3.8846767289297923,3.6193125,34.578000009765624,59.768000014648436 +50,3.8196579899106706,3.34842375,38.15000000610352,63.612000004882816 +51,3.7278830664498463,3.084740625,41.78199998046875,67.10199993652344 +52,3.827213168144226,2.84354625,45.26399999267578,70.4120000024414 +53,3.8210743835994174,2.6217921875,48.41599998535156,73.43599991210938 +54,3.938135640961783,2.4278509375,51.35800005126953,75.99800005859375 +55,3.804831658090864,2.259463125,53.92400004394531,78.37399994873047 +56,3.7662359986986433,2.108823125,56.36400003417969,80.34200004638672 +57,3.874816451753889,1.9794275,58.644000029296876,82.09400001708984 +58,3.7855028424944197,1.8671025,60.698000024414064,83.56799993896485 +59,3.8578923600060597,1.772823125,62.44000004638672,84.77200006347657 +60,3.776338151523045,1.688523125,64.03600006591797,85.80600006103515 +61,3.821035214832851,1.613735,65.38999995849609,86.73000008544922 +62,3.7199710437229703,1.547733125,66.50600000976563,87.50199997558593 +63,3.8106864350182668,1.49222375,67.56400003173827,88.19799989746093 +64,3.8047622782843455,1.44100609375,68.55600005615234,88.803999921875 +65,3.781612345150539,1.39522640625,69.3880000024414,89.3560000756836 +66,3.7649945531572615,1.351588125,70.10600010253906,89.83200002197266 +67,3.679371884890965,1.3106146875,70.83200012695312,90.23799999267578 +68,3.7694370406014577,1.27566546875,71.43799999511718,90.60799996582031 +69,3.90033187185015,1.247449375,71.96400009765625,90.90999999023437 +70,3.809638108525957,1.22405953125,72.35200017578126,91.14399999023438 +71,3.7749327761786327,1.20222625,72.7679999951172,91.36999999023438 +72,3.8541347810200284,1.18338953125,73.09400004638672,91.56199999023437 +73,3.7692117520741055,1.16706453125,73.45000001953125,91.69999993896485 +74,3.89384697164808,1.15245609375,73.72600004394532,91.83999993896484 +75,3.818265676498413,1.13920421875,73.88999993896485,91.96599999023438 +76,3.808180570602417,1.12641375,74.09400001464844,92.09400006835938 +77,3.7770608493259976,1.11587984375,74.3200000390625,92.20400006835938 +78,3.7761710030691966,1.10574734375,74.42600009033202,92.34000006835937 +79,3.851380297115871,1.096963125,74.5920001171875,92.43800006835937 +80,3.7436849049159457,1.08872375,74.73000016845702,92.52200006835938 +81,3.7914348670414517,1.0825409375,74.7980001171875,92.58200006835938 +82,3.8620016745158603,1.075705,74.9819999609375,92.6440000415039 +83,3.7995089633124217,1.07101390625,75.0600000390625,92.68200004150391 +84,3.820492676326207,1.067599375,75.14399993652344,92.69400004150391 +85,3.9248864139829363,1.06450140625,75.22599998779297,92.74400011962891 +86,3.7503830705370222,1.06045515625,75.2979999609375,92.8420001196289 +87,3.758145349366324,1.05718203125,75.36800006347656,92.87600011962891 +88,3.7711183173315868,1.0540571875,75.46799998535157,92.89000011962891 
+89,3.7123160702841624,1.05168203125,75.51800006347656,92.94000017089844 +90,3.8062860454831804,1.04932546875,75.58600008789062,92.99200017089844 +91,3.8366364240646362,1.047400625,75.58600008789062,93.0200001196289 +92,3.79878500529698,1.0462640625,75.67000008789063,93.04199999023437 +93,3.8638044936316356,1.04370953125,75.70600008789063,93.08600001708984 +94,3.8578367233276367,1.04192125,75.75400013916015,93.12000001708985 +95,3.7778862374169484,1.040408125,75.79199995849609,93.11000001708985 +96,3.797469445637294,1.03935296875,75.79799995849609,93.14600001708985 +97,3.851992062159947,1.0373315625,75.86000013916015,93.20200006835937 +98,3.881648983274187,1.0358784375,75.89800013916016,93.18800006835937 +99,3.7989996331078664,1.03511015625,75.97000006103515,93.19600001708984 +100,3.832390921456473,1.034321875,76.06399993164062,93.20800001708984 +101,3.8957015786852156,1.0339253125,76.08000003417969,93.22400001708985 +102,3.7361060891832625,1.03406046875,76.1099999560547,93.2260001196289 +103,3.8049339226314,1.0340321875,76.11400003417968,93.24200009277344 +104,3.7306436811174666,1.03438734375,76.11799995605469,93.27800009277344 +105,3.7872143302645003,1.03508734375,76.15199990478516,93.27000009277344 +106,3.8432654482977733,1.03528734375,76.20999995605469,93.28200014404297 +107,3.743649329457964,1.03604734375,76.25600008300782,93.27400006591797 +108,3.827305112566267,1.0367990625,76.22400010986328,93.27800006591796 +109,3.873527799333845,1.0386921875,76.22000005859375,93.29600001464844 +110,3.8782293115343367,1.03986390625,76.21600005859375,93.31000001464844 +111,3.7775590079171315,1.041655625,76.17200005859375,93.33200006591797 +112,3.800403901508876,1.04380390625,76.15000000732422,93.34000006591796 +113,3.9012868915285384,1.04627703125,76.1580000341797,93.33400006591796 +114,3.846208725656782,1.0487984375,76.15199990478516,93.34800006591797 +115,3.8475638798304965,1.0521446875,76.17799998291015,93.33400014404297 +116,3.9795446395874023,1.05490609375,76.14799998291015,93.36200006591797 +117,3.8443203142711093,1.0580040625,76.08199990478515,93.3640001171875 +118,3.8250525849206105,1.06173859375,76.05199998291016,93.3480001171875 +119,3.907123395374843,1.0642565625,76.06800000976563,93.33600006591797 +120,3.8155746970857893,1.06640765625,76.09999998291016,93.32000006591797 +121,3.8923403535570418,1.0678690625,76.08800000976562,93.30000006591797 +122,3.843483175550188,1.069791875,76.04999995849609,93.32200009277344 +123,3.855623040880476,1.0719815625,76.02399990722657,93.31200014404297 +124,3.8870831046785628,1.07447265625,76.03199995849609,93.33200014404296 +125,3.8356190749577115,1.076770625,76.02800008789062,93.28600009277343 +126,3.8387703554970876,1.07683890625,76.02400000976563,93.30600001464843 +127,3.9135857139314925,1.0763871875,76.0479998803711,93.31400001464844 +128,3.792390823364258,1.07619203125,76.06800006103515,93.2779999633789 +129,3.850563202585493,1.07544859375,76.04999998291015,93.2720000415039 +130,3.738796762057713,1.07485546875,76.00599998291015,93.24200009277344 +131,3.8326881442751204,1.0729271875,75.95000006103515,93.22200001464844 +132,3.893867118018014,1.07098234375,75.92200008789062,93.21400001464843 +133,3.903687221663339,1.068804375,75.92000000976563,93.2240000415039 +134,3.8506924084254672,1.0672546875,75.85400000976563,93.20200009277343 +135,3.8837316717420305,1.065745,75.8399999584961,93.20600004150391 +136,3.8928286177771434,1.06427671875,75.77599990722656,93.18800004150391 +137,3.981452601296561,1.06258359375,75.71999998535156,93.18200009277344 
+138,3.909976840019226,1.06103390625,75.68400001220704,93.18800014404297 +139,3.9253520454679216,1.059399375,75.62400006347656,93.16000009277344 +140,3.8020264761788503,1.0577296875,75.60999998535156,93.16800009277344 +141,3.922452688217163,1.05619515625,75.60600006347656,93.15600009277344 +142,3.8550586189542497,1.05484890625,75.63400006347656,93.13400014404297 +143,3.8236265863691057,1.053694375,75.5499999609375,93.16800014404296 +144,3.9549256563186646,1.05310125,75.54199993408203,93.16200009277344 +145,3.936390927859715,1.05281296875,75.5200000390625,93.1260000415039 +146,3.8756578990391324,1.052428125,75.53200001220704,93.0940000415039 +147,3.8966123376573836,1.051775,75.49600014160156,93.11800009277344 +148,3.8537066323416576,1.05180328125,75.51200001220703,93.10600009277344 +149,3.905484369822911,1.05247984375,75.47600001220704,93.06800009277343 +150,3.780659999166216,1.0510046875,75.49200014160157,93.1120001953125 +151,3.650853753089905,1.046048125,75.58200006347656,93.1580001953125 +152,3.614588141441345,1.0403984375,75.72000001220704,93.2160001953125 +153,3.5736235891069685,1.03362390625,75.81200006347656,93.29400014404297 +154,3.616161755153111,1.02613765625,75.98200008789063,93.35400009277343 +155,3.5855596917016164,1.01847828125,76.13000008789062,93.43000006591797 +156,3.59866692338671,1.0102871875,76.26600000976562,93.50000006591797 +157,3.5309522322246005,1.00186296875,76.43600008789062,93.60800001464844 +158,3.574401548930577,0.9941953125,76.5759999584961,93.69000006591797 +159,3.51989597933633,0.98644765625,76.71000000976562,93.73400006591797 +160,3.6196948971067155,0.9791965625,76.82399995849609,93.7780001171875 +161,3.605029055050441,0.9721903125,76.94599995849609,93.8140001171875 +162,3.5420662505286082,0.9652475,77.05399995849609,93.8980001171875 +163,3.5894347429275513,0.958748125,77.22999998291016,93.93400006591797 +164,3.5978579180581227,0.95259359375,77.36000003417969,93.99200006591796 +165,3.6617026329040527,0.9469590625,77.46600008544922,94.02400006591797 +166,3.6058098588671004,0.94118453125,77.55200006103516,94.07200006591796 +167,3.58823892048427,0.9359265625,77.6520000341797,94.11400006591796 +168,3.541081871305193,0.930876875,77.73800003417969,94.16400006591797 +169,3.545100109917777,0.926250625,77.81400008544922,94.22399998779296 +170,3.602257422038487,0.92191265625,77.86400008544922,94.2580000390625 +171,3.507282631737845,0.91811125,77.97200000732421,94.2920001171875 +172,3.5846358367374966,0.91407328125,78.04000005859375,94.31400009033203 +173,3.4164858715874806,0.91026359375,78.15199998046874,94.36200014160156 +174,3.5214629684175764,0.9067821875,78.21600010986329,94.41000009033203 +175,3.723686303411211,0.90326078125,78.24600010986327,94.44000009033203 +176,3.567773597581046,0.9000628125,78.34000005859374,94.45200014160156 +177,3.531922306333269,0.8968096875,78.44199992919921,94.47600014160156 +178,3.445431743349348,0.8934965625,78.48400000732421,94.50800014160156 +179,3.4728594166891917,0.89035171875,78.5360000341797,94.55200014160157 +180,3.494410361562456,0.88745859375,78.59400008544922,94.59200001220704 +181,3.5128171103341237,0.88491375,78.65999992919922,94.65800009033204 +182,3.4463326930999756,0.88252890625,78.74200005859375,94.66800009033203 +183,3.429210696901594,0.8797275,78.83600005859375,94.70400014160157 +184,3.430286169052124,0.87714265625,78.88200008544922,94.76800014160156 +185,3.5126789127077376,0.87460609375,78.96800008544922,94.77200014160157 +186,3.5115831749779836,0.8723978125,78.97000008544921,94.79600014160157 
+187,3.4737915652138844,0.8702578125,79.03600000732422,94.81200014160156 +188,3.536697966711862,0.86798125,79.06000008544922,94.82200006347657 +189,3.5705651896340505,0.865568125,79.10999998046876,94.8680000366211 +190,3.5216511828558787,0.86370328125,79.19199998046875,94.89800008789062 +191,3.343783582959856,0.86171015625,79.27199995361327,94.91200008789062 +192,3.306029898779733,0.8600184375,79.31600003173828,94.95000008789063 +193,3.4795777116503035,0.858201875,79.39200003173828,94.96200013916015 +194,3.360287530081613,0.85623359375,79.46000008300781,94.97000013916016 +195,3.383433989116124,0.85435703125,79.5000000048828,95.02400013916015 +196,3.4374169622148787,0.85294046875,79.56400000488281,95.00400013916015 +197,3.403316753251212,0.8514921875,79.58800000488282,95.02000019042968 +198,3.339807084628514,0.84972390625,79.65599997802734,95.05800019042968 +199,3.453535488673619,0.8484521875,79.69200005615234,95.09800013916016 +200,3.419135877064296,0.8466790625,79.75600005615235,95.11400013916015 +201,3.3291687795094083,0.84469078125,79.77400005615235,95.14400008789063 +202,3.3989278929574147,0.84291421875,79.80600005615234,95.17600008789063 +203,3.4345146928514754,0.84117421875,79.91200013427735,95.22200013916016 +204,3.3367595842906406,0.8393425,79.96200013427735,95.24400013916015 +205,3.405371512685503,0.83769421875,80.00800005615234,95.26000013916016 +206,3.2920469726834978,0.83589765625,80.03600018554687,95.29600013916016 +207,3.39451459475926,0.83424109375,80.09400010742188,95.28600008789063 +208,3.3381849186761037,0.83268109375,80.13600013183594,95.3060000366211 +209,3.3369586978639876,0.8314128125,80.16200013183594,95.3240000366211 +210,3.305583425930568,0.82968453125,80.21800005371094,95.3260000366211 +211,3.383475865636553,0.82795625,80.23200005371093,95.3560000366211 +212,3.3413387877600536,0.82657625,80.2960000805664,95.3620000366211 +213,3.3035824469157626,0.82492453125,80.35200013183594,95.37600008789063 +214,3.2925713743482317,0.82351625,80.39800013183594,95.40600008789062 +215,3.2677473170416698,0.82188453125,80.4540000805664,95.4220000366211 +216,3.29270339012146,0.82025625,80.50200013183594,95.4300001147461 +217,3.3200165714536394,0.8187796875,80.54400013183594,95.4600001147461 +218,3.228957329477583,0.81721140625,80.56200013183594,95.4820001147461 +219,3.257698552949088,0.81583140625,80.62400013183594,95.4980000366211 +220,3.324594702039446,0.814423125,80.67800013183594,95.5219999584961 +221,3.2105934960501537,0.8128865625,80.70600005371094,95.5579999584961 +222,3.2700722898755754,0.81163,80.76599997558594,95.56599995849609 +223,3.184366192136492,0.81002171875,80.80000005371093,95.56199995849609 +224,3.2554157972335815,0.80895,80.83400010498048,95.5759999584961 +225,3.2719230311257497,0.80763,80.91000010498047,95.56599995849609 +226,3.1703891924449374,0.8061734375,80.95800010498047,95.58599995849609 +227,3.182521172932216,0.8049934375,81.01800010498047,95.61599995849609 +228,3.215354255267552,0.80380515625,81.05000002685547,95.63400003662109 +229,3.199547358921596,0.80252515625,81.09199994873048,95.65000000976562 +230,3.2207604135785783,0.80131515625,81.17199994873047,95.64200000976562 +231,3.1240655524390086,0.80026515625,81.21600002685547,95.67800000976563 +232,3.257808906691415,0.799116875,81.25400002685546,95.69599993164063 +233,3.1133551938193187,0.79786859375,81.30399997558594,95.70799993164063 +234,3.1397553852626254,0.79670203125,81.33199997558594,95.71799993164062 +235,3.212665115083967,0.79555375,81.36999997558594,95.72199993164062 
+236,3.134836418288095,0.79439890625,81.40200005371094,95.73199993164063 +237,3.1265358924865723,0.79340890625,81.43400005371093,95.74599993164063 +238,3.2117801904678345,0.79225890625,81.4740000024414,95.76199993164063 +239,3.1193034989493236,0.791180625,81.5239999243164,95.77199998291016 +240,3.1610439334596907,0.79030234375,81.56599987304688,95.76399993164063 +241,3.126009328024728,0.78928234375,81.58000005371093,95.78399993164062 +242,3.0744248969214305,0.7885540625,81.6080000024414,95.79999993164063 +243,3.0661239453724454,0.7877840625,81.6440000024414,95.81200000976563 +244,3.02728225503649,0.78692578125,81.67000000244141,95.80199993164062 +245,3.1005813223975047,0.78626578125,81.6900000805664,95.81999993164062 +246,3.105734280177525,0.78565578125,81.70800005371093,95.78999993164062 +247,3.098195024899074,0.7848375,81.71600005371094,95.79399993164063 +248,3.142967564719064,0.7842675,81.72600000244141,95.79799993164063 +249,3.0174715518951416,0.7835175,81.79399997558593,95.81999993164062 +250,3.0780578000204906,0.78284921875,81.84599997558594,95.83799993164062 +251,3.147279364722116,0.7824209375,81.87599997558594,95.82799993164062 +252,3.034480486597334,0.78215265625,81.90000002685547,95.83799993164062 +253,3.027377588408334,0.781624375,81.91000002685547,95.85399993164063 +254,3.063859905515398,0.7815178125,81.93600002685547,95.84799993164063 +255,3.042998433113098,0.78106953125,81.97199997558593,95.86599998291015 +256,3.0484637873513356,0.78063296875,81.99199997558594,95.90400003417969 +257,3.0794162239347185,0.7802546875,82.00999997558594,95.9200000341797 +258,3.045973709651402,0.78008984375,82.0399999243164,95.9260000341797 +259,2.928838150841849,0.77996984375,82.08999997558594,95.91800003417968 +260,2.9149784360613142,0.7799015625,82.07999997558593,95.93000003417968 +261,2.9356118951525008,0.7798015625,82.1379999243164,95.92800003417969 +262,3.0385569674628123,0.7799215625,82.1639999243164,95.9260000341797 +263,2.981449672154018,0.77984984375,82.19600000244141,95.93599995605469 +264,2.9820726939610074,0.779828125,82.20799997558593,95.94399995605468 +265,2.8757408687046597,0.7799146875,82.21999997558594,95.96799995605468 +266,3.0372165271214078,0.7797746875,82.22999997558594,95.95600003417968 +267,2.8532401663916453,0.77982640625,82.25199992431641,95.96800003417968 +268,2.8727880716323853,0.779848125,82.27599992431641,95.98200003417969 +269,2.9443894113813127,0.780148125,82.28199992431641,96.00000003417969 +270,2.9520383221762523,0.780188125,82.31200000244141,95.99200003417968 +271,2.9796100854873657,0.780598125,82.29799995117187,95.98600003417968 +272,2.91614978654044,0.780838125,82.34400002929688,95.98800003417969 +273,2.9470558166503906,0.78107984375,82.31199997802734,95.98200003417969 +274,2.9149969816207886,0.781378125,82.33599997802735,95.98800003417969 +275,2.8980303491864885,0.781628125,82.33400002929687,95.99400003417969 +276,2.8834817920412337,0.78223640625,82.3740000805664,95.99799995605468 +277,2.893464684486389,0.78245640625,82.42600008056641,95.99799995605468 +278,2.866437315940857,0.78304296875,82.4220000805664,96.00600000732422 +279,2.9361328056880405,0.78362125,82.4180000024414,95.99600005859375 +280,2.8688106536865234,0.7841978125,82.40600005371094,95.99600000732421 +281,2.8958534002304077,0.784674375,82.41600000244141,95.99000000732421 +282,2.9111480712890625,0.78514265625,82.4360000024414,96.00000000732422 +283,2.8539947952542986,0.7859809375,82.42000005371094,95.98400000732421 +284,2.901869671685355,0.78666921875,82.40999997558593,95.96399995605469 
+285,2.789713059152876,0.78732578125,82.4179999243164,95.94800000732423 +286,2.8766787222453525,0.7881540625,82.43599997558594,95.93400000732422 +287,2.9195314816066196,0.78879234375,82.45199997558593,95.91799995605469 +288,2.8812105655670166,0.78950890625,82.43399997558593,95.91400000732422 +289,2.846291661262512,0.79023890625,82.43599997558594,95.90600000732422 +290,2.831061363220215,0.7908171875,82.42200002685547,95.90200000732422 +291,2.8955056497028897,0.79149375,82.43400002685547,95.89600000732422 +292,2.8761453798839023,0.79212203125,82.42600002685546,95.89400000732422 +293,2.8942482812064037,0.7927603125,82.39200002685547,95.88200000732422 +294,2.8691021544592723,0.793396875,82.39000010498047,95.87200000732422 +295,2.7985400472368513,0.79409515625,82.37400015625,95.85000000732421 +296,2.83943886416299,0.7948034375,82.372000234375,95.84200000732422 +297,2.764699237687247,0.79532171875,82.388000234375,95.83800000732423 +298,2.783741457121713,0.79605828125,82.408000234375,95.84000000732422 +299,2.8964389903204784,0.7964965625,82.414000234375,95.83600000732422 +300,2.8764584915978566,0.79715484375,82.418000234375,95.82600005859375 +301,2.8418020009994507,0.79767140625,82.410000234375,95.83000005859375 +302,2.878923177719116,0.79832140625,82.412000234375,95.82400005859375 +303,2.882906266621181,0.79883140625,82.39600015625,95.81200005859375 +304,2.912467394556318,0.7992396875,82.40000015625,95.80600005859375 +305,2.8378712109157016,0.79983625,82.39400020751953,95.81200005859375 +306,2.797353148460388,0.80029625,82.41200020751953,95.80800000732422 +307,2.8914946487971713,0.80090453125,82.44800012939453,95.80200000732422 +308,2.7958712577819824,0.80121453125,82.43600012939453,95.81000000732422 +309,2.9499638080596924,0.80185453125,82.44200012939453,95.81200000732422 diff --git a/CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml b/CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml new file mode 100644 index 0000000..b65af94 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: true +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: 0.95 +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 100 +eval_metric: top1 +experiment: e100-aug0-w60-minlr1e6-wrlr1e9-initRdm-bias-lr3e2 +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.03 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-06 +mixup: 0.1 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: resnet50 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/res50-epoch- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 
+recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.0 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-09 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml b/CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml new file mode 100644 index 0000000..81258c2 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: true +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: 0.95 +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 200 +eval_metric: top1 +experiment: e200-aug0-w60-minlr1e4-wrlr1e9-initRdm-bias +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 0.0001 +mixup: 0.1 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: resnet50 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/res50-epoch- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.0 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-09 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml b/CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml new file mode 100644 index 0000000..2c5fcf3 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml @@ -0,0 +1,112 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: true +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: 0.95 +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: res50-aug0-retrain +gp: null +hflip: 0.5 
+img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.1 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: resnet50 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/res50-epoch- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.0 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: bicubic +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-06 +weight_decay: 0.02 +workers: 8 diff --git a/CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv b/CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv new file mode 100644 index 0000000..ecb81ca --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv @@ -0,0 +1,111 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7045409509113857,6.9416,0.064,0.418 +1,0.058665430905031304,6.89746625,0.3079999999332428,1.2019999998664856 +2,0.007796582133908357,6.2966525,1.7179999993896484,5.899999989013672 +3,0.007212148014722126,5.116435,8.078000043945313,21.984000035400392 +4,0.006597742538100907,4.30874625,16.604000009765624,37.32800003540039 +5,0.006309278409129807,3.7494875,24.503999986572264,48.297999992675784 +6,0.00587210977183921,3.23308,31.903999926757812,57.66999989746094 +7,0.005444032173337681,2.87593875,38.16399994140625,63.99200002685547 +8,0.0054282506462186575,2.59584875,43.517999924316406,69.46200001464844 +9,0.005179691860186202,2.359841875,47.206000029296874,72.58200003417969 +10,0.004889545729383826,2.1719675,50.609999997558596,75.55400000244141 +11,0.00470197234036667,2.1567584375,51.69199992919922,76.44600010253906 +12,0.004586202425083944,1.98930375,54.606000112304685,78.83600004394532 +13,0.004271666053682566,1.8706825,56.328000031738284,80.30800010009766 +14,0.004447908040934375,1.806950625,58.472000075683596,81.5399999633789 +15,0.0041762767692229575,1.7647315625,58.741999968261716,82.09000006103516 +16,0.004471837143812861,1.708065625,60.30200004394531,82.98200011230469 +17,0.004270398956058281,1.67571921875,61.048000041503904,83.32200005859374 +18,0.004100026030625615,1.65201375,61.26000004150391,83.77400021972656 +19,0.0041242205105455855,1.63376078125,61.504000068359375,84.07800001220703 +20,0.004059118734273527,1.67590984375,60.91800009765625,83.5019999584961 +21,0.0041561292850279385,1.63649734375,61.82800004882812,84.22399995361329 +22,0.004249815163867814,1.5946559375,62.68000001220703,84.70800006347656 +23,0.0039470667751239875,1.64520578125,61.93799990234375,84.07400013427734 +24,0.003988273092545569,1.671076875,61.05600004394531,83.42199993164063 +25,0.004096939311628895,1.7034496875,61.12399993652344,83.56399995605469 
+26,0.004087086118358586,1.60285265625,62.73200006347656,84.75999995605468 +27,0.00399751916328179,1.61492046875,62.43800003662109,84.32400010742188 +28,0.003949649166315794,1.701069375,60.77399994628906,83.2460001147461 +29,0.004051400797574648,1.6202353125,62.64599990722656,84.67000006103515 +30,0.004139024115699742,1.6344540625,62.20200006591797,84.12200026855469 +31,0.003921386137205575,1.62690984375,62.05000011474609,84.17200008544921 +32,0.00411509963617261,1.68366421875,61.46400011474609,83.86600005859376 +33,0.003911659786743777,1.67565765625,60.84800007324219,83.32999993408202 +34,0.00395727701418634,1.62554953125,62.0080000390625,84.16199998291016 +35,0.004033969731868378,1.71603296875,60.70599999267578,83.0460000390625 +36,0.004010531336202153,1.6436690625,62.05400001953125,84.18800013916015 +37,0.0039575622982478565,1.67731390625,61.35800016845703,83.65600013671875 +38,0.0039316649615232435,1.61552953125,62.22400010986328,84.39000005126952 +39,0.003873389430477151,1.63947921875,61.81200003662109,84.1440000366211 +40,0.004065845494291612,1.653141875,61.8460001147461,83.82200008789063 +41,0.004109910373309893,1.714169375,60.308000017089846,83.24199985595703 +42,0.003946930452782128,1.94490875,56.48200006103516,79.57000004638672 +43,0.0041138056798705035,1.6267740625,61.803999931640625,84.29799997802735 +44,0.004048073315061629,1.62808609375,62.09799998291015,84.28800000976563 +45,0.0039734537546922055,1.784985625,59.12400004882812,82.1780000390625 +46,0.0038987650768831372,1.713120625,60.78800010498047,83.26599994628906 +47,0.0040997504090358105,1.88673,57.57800005615234,80.4180000415039 +48,0.003935285162047616,1.6685634375,61.34400001220703,83.64799995605469 +49,0.004107319034769067,1.7783765625,59.22000000244141,82.05199999023438 +50,0.00387493397907487,1.6779953125,61.276,83.92200001464843 +51,0.004015801890221026,1.847471875,58.37599998046875,81.37399998779297 +52,0.003935897473378905,1.859410625,58.18199997802734,81.15000001708984 +53,0.004190738429315388,1.821818125,58.34600005615234,81.56200009277343 +54,0.004043174558319151,1.823231875,58.122000075683594,81.2140000390625 +55,0.004158310043359441,1.86400625,57.84399987792969,81.45800022460938 +56,0.003960915591700801,1.7923175,58.804000024414066,81.96200001220703 +57,0.004142970977617162,1.7743928125,59.36600004394531,82.41600017333984 +58,0.004029840646710779,1.7658021875,59.30400007080078,82.15200016845704 +59,0.004218896684635963,1.88195375,56.881999975585934,80.56000011474609 +60,0.0036925061971747448,1.3517940625,67.70000002197266,88.15399987304687 +61,0.0035992927150800824,1.34404765625,68.08800004882812,88.23600020751954 +62,0.003520481986925006,1.283674375,69.1300000805664,88.94400007568359 +63,0.003616590718073504,1.3082865625,68.802000078125,88.65599994384766 +64,0.0036838793894276023,1.27181484375,69.44200001953125,89.27800005126953 +65,0.003572586092299649,1.29942640625,69.78399999267579,89.41400007324219 +66,0.0036129531716661794,1.2370415625,70.27599992431641,89.516 +67,0.0032376082381233573,1.2114928125,70.86000002197265,90.03600010009765 +68,0.0035054978714989765,1.224236875,70.44400004394531,89.89400004394531 +69,0.0034192517466310945,1.23175109375,70.51399994628906,89.67800012451171 +70,0.00328368427498,1.19048328125,71.48400014648438,90.30600015136719 +71,0.00327613196402256,1.16209390625,71.9240000366211,90.69200007080079 +72,0.0030484608806935804,1.16013578125,71.9080000390625,90.63800004394531 +73,0.0034537422138133217,1.1457075,72.4540000390625,90.89400011962891 
+74,0.003460384572723082,1.13635015625,72.41000006835938,90.91400004638672 +75,0.0033204310374068363,1.12647875,72.77400001464844,91.23800009521484 +76,0.0032639388061527696,1.113355625,72.89800006347656,91.27400009521484 +77,0.0032552302914804648,1.1143825,72.92800009033203,91.40200001708985 +78,0.003150941720897598,1.0993584375,73.49799998779297,91.53000014892578 +79,0.0031130987585389186,1.0650625,74.15800011474609,92.03200009521484 +80,0.0032726521603763103,1.0721525,74.11400006591796,91.98600006591796 +81,0.00320629304873624,1.0649465625,74.26599995605469,92.1619999633789 +82,0.0029540062449606402,1.0372840625,74.79800008789063,92.30600001708984 +83,0.003026906833318727,1.0280375,75.05400014160156,92.53800022460938 +84,0.0029979831805186613,1.017864375,75.4720000366211,92.63999999267578 +85,0.00299135923186051,0.99109765625,75.92600000732422,92.9679999633789 +86,0.003011097732399191,0.99155703125,75.93799998291016,92.82800001464844 +87,0.003033405419306031,0.970643125,76.38000008789062,93.0640001171875 +88,0.0028323159287018435,0.9561534375,76.69000000976563,93.1739999633789 +89,0.0030302958163831916,0.9529859375,76.86200008544922,93.20600006591796 +90,0.0030514331634289454,0.9512065625,77.03400000976562,93.26600009033203 +91,0.002754983675133969,0.9374346875,77.17200000488282,93.41999996337891 +92,0.002925087830850056,0.92438484375,77.47400018554687,93.53399991210938 +93,0.002743347780779004,0.9260734375,77.55600011230469,93.63999993652344 +94,0.0028534684097394347,0.95646546875,77.48199992675781,93.63399998779298 +95,0.0028282569421987447,0.91486703125,77.77999995361328,93.66999996337891 +96,0.0026793425869462745,0.90815390625,77.85000003173828,93.7900001171875 +97,0.002686592417636088,0.909225625,78.00000013427734,93.78399993652344 +98,0.002937979913050575,0.90744421875,77.98200003173828,93.79199998779296 +99,0.002853604283050767,0.90461453125,78.05800000488281,93.84199993652344 +100,0.002864615060389042,0.9053496875,78.0300000830078,93.79799998779296 +101,0.002886664870727275,0.9070628125,78.00999995361327,93.78799993652343 +102,0.002906581253877708,0.91363046875,77.93799995361329,93.71799998779296 +103,0.0030246374164042728,0.90368484375,78.14200010986328,93.83399998779296 +104,0.0028219220860462102,0.906053125,78.03600005615235,93.78400006591797 +105,0.002867467302296843,0.90486140625,78.06800013427734,93.80799998779297 +106,0.002776414771298213,0.90622484375,78.1760000830078,93.84400001464844 +107,0.0027404509518029435,0.90221796875,78.09400000488282,93.82399998779297 +108,0.002886704235736813,0.90330140625,78.10999998046876,93.80999993652344 +109,0.0028225835911663516,0.9019365625,78.07000000488281,93.81399993652344 diff --git a/CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv b/CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv new file mode 100644 index 0000000..1da7189 --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv @@ -0,0 +1,211 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7045409551688603,6.9416,0.064,0.418 +1,0.05861631261983088,6.88152125,0.3300000003051758,1.4160000054168702 +2,0.007787905666711075,6.29743625,1.7679999932861328,6.330000021362305 +3,0.007318386575207114,5.39208375,6.4000000115966795,18.032000087890626 +4,0.006826370256021619,4.711741875,12.58400001525879,30.035999998779296 +5,0.006564456370792219,4.215930625,18.40000004638672,39.540000072021485 +6,0.006210722494870424,3.7064575,25.034000032958986,48.896000021972654 +7,0.005833921089236226,3.446033125,29.666000029296875,54.72999997558594 
+8,0.005793575064412185,3.0451746875,35.78199996948242,61.63399999023437 +9,0.005482399137690663,2.92318375,37.8639999609375,62.89000001953125 +10,0.005182175398139017,2.4888303125,44.8599998828125,70.50999987060547 +11,0.004980468157944935,2.3753028125,47.31599997314453,72.85000001708984 +12,0.004848868419815387,2.1763934375,50.68400004394531,75.72199998046875 +13,0.004497935130660023,2.02472625,53.710000114746094,78.10400005126954 +14,0.004636649873905948,1.9788840625,54.92200000244141,78.86800007324219 +15,0.004339669598266482,1.876849375,56.504000158691404,80.39399998779297 +16,0.0046054747654125094,1.8213928125,57.803999970703124,81.25400013671874 +17,0.00438071893794196,1.7728221875,59.07799999511719,82.16399990478516 +18,0.004191712393159313,1.7204253125,59.88599999267578,82.56599991210938 +19,0.004200312824520681,1.6947209375,60.43600009277344,83.2559999609375 +20,0.0041146951095600215,1.64887125,61.5360000390625,83.89399993408203 +21,0.004193632979877293,1.6154840625,62.46800001953125,84.26399980224609 +22,0.0042644668470269865,1.620426875,62.13600014404297,84.38800008300781 +23,0.003946975472250155,1.58272265625,63.15000013427734,84.93799995361329 +24,0.003950901828440172,1.62863140625,62.05200009765625,83.91600006347656 +25,0.004059900596205678,1.64938625,62.29600000732422,84.31999992919921 +26,0.004035455657036177,1.580548125,63.11400011474609,84.89199998291015 +27,0.003935754764825106,1.5290325,64.00800005615234,85.40400005371093 +28,0.0038723718441490617,1.5349790625,63.911999926757815,85.51600012939453 +29,0.003965828401435699,1.6118196875,63.035999931640625,84.76599995605469 +30,0.004045715455764106,1.5477725,63.98800006103516,85.71800008056641 +31,0.003791363294502454,1.50541140625,64.66399998291016,85.84799989990235 +32,0.003994816814416221,1.872206875,59.20399997314453,81.1340001953125 +33,0.003794607290598963,1.4659615625,65.19200009277344,86.57399998046876 +34,0.0038328385313174556,1.4683784375,65.23600006103516,86.53399998291016 +35,0.0038947787834331393,1.5209565625,64.42000000732422,85.83799984863282 +36,0.003859427624515125,1.47193203125,65.54000002685547,86.57600013427735 +37,0.003790942220283406,1.5083471875,64.5499999267578,86.07199995361329 +38,0.0037716488753046307,1.50174578125,64.79599992675782,86.10999997558594 +39,0.003689929044672421,1.510310625,64.64600000488281,86.02999997802735 +40,0.0038831187578450355,1.4518178125,65.59200008544921,86.8760000024414 +41,0.00391619885340333,1.5053915625,64.53599995117187,86.06000013183593 +42,0.003753877860227866,1.4715946875,65.40400002685547,86.4160000805664 +43,0.003907359188555607,1.5655059375,63.231999907226566,85.17599997802735 +44,0.003843014344706067,1.48220671875,65.15600012695313,86.44200002685547 +45,0.003759625999789153,1.49491625,64.71200010986328,86.38999997802735 +46,0.0036789896964494672,1.48738609375,65.48199987548828,86.51200010742187 +47,0.003882888887476708,1.479825,65.29599989990234,86.59600005126953 +48,0.0036989124824426006,1.4772675,65.192000078125,86.40999995117187 +49,0.0038785873080736826,1.504194375,64.83399998535157,85.74599995117188 +50,0.0036308450757392813,1.4936209375,65.20000003662109,86.48599998291016 +51,0.003771561148044254,1.488496875,65.1560000366211,86.57800005371094 +52,0.0036744583963549565,1.5306975,64.27400009521484,85.84400010986329 +53,0.0039386395863922575,1.505415,65.27399990234375,86.40400000488282 +54,0.003786725890157478,1.5121053125,64.41799998779297,86.02000000976562 +55,0.0038960346885557684,1.584905,63.384000056152345,85.33400023681641 
+56,0.003697914753242263,1.4781703125,65.27399997802735,86.6660000805664 +57,0.0038609286670440008,1.48826421875,65.09000000976563,86.31200005615234 +58,0.00374445816435452,1.4656303125,65.4440000341797,86.49399990234375 +59,0.003939676181679326,1.520185625,64.42199998779297,85.91200008544922 +60,0.0036717986554971765,1.4740965625,65.33400005615235,86.43400010498047 +61,0.0036946918782112853,1.4400040625,66.137999921875,86.8360000756836 +62,0.003649507200212351,1.3948309375,66.91000002929688,87.45000002929687 +63,0.003765647050126323,1.419415,66.84600002441407,87.29800000244141 +64,0.00385747043349381,1.38540703125,67.19199996826171,87.59400010009766 +65,0.003746751995225038,1.4373690625,66.85599995117188,87.50800013183594 +66,0.0038159869810832398,1.35951734375,67.73400008300781,87.92200005371093 +67,0.0034571332590920584,1.37397765625,67.47000008300782,87.77199995117188 +68,0.003730148426257074,1.4091596875,66.44200002685547,87.34000018310547 +69,0.003659855990138437,1.3601834375,67.68800010498047,87.981999921875 +70,0.0035387545524697217,1.35465234375,68.02799999511718,88.44 +71,0.003558939788490534,1.4536840625,65.95800008544921,86.66000021240234 +72,0.0033511826103287084,1.39861125,67.14400007324218,87.55200005126953 +73,0.00376773886715195,1.3392646875,68.31200001953125,88.25800012939453 +74,0.0037749758422640817,1.3295978125,68.16599992431641,88.37799981933594 +75,0.0036681361629494597,1.40448625,66.6980000366211,87.508 +76,0.0036327216907271315,1.34860953125,68.05599997070313,88.05400010253906 +77,0.0036319279045398745,1.3575459375,67.70800001953126,88.14200004638671 +78,0.0035495711656819496,1.36476296875,68.11400007080078,88.26000025390626 +79,0.0035365247999184896,1.31430375,68.72399997558594,88.56599997558594 +80,0.003713787134204592,1.3273484375,68.40599997070312,88.72199999511719 +81,0.0036787415821371333,1.30792578125,68.79600008300781,88.896 +82,0.003430017004055636,1.31487703125,68.60600012207031,88.50999997070312 +83,0.0035406785318627954,1.33942859375,68.50599997314453,88.37800015625 +84,0.0035228457834039417,1.33490109375,68.78400005126953,88.79600002685547 +85,0.0035429883615246843,1.33512765625,68.27599989013672,88.48199999755859 +86,0.0035690352420455645,1.2936003125,69.10999991943359,88.992 +87,0.0036141484244061367,1.30547953125,68.96799993896484,88.92200022705079 +88,0.003423843566062195,1.30081328125,68.81600005126953,88.84600005126953 +89,0.003634033741296402,1.3100946875,68.73999997070312,88.90600017822265 +90,0.003681087350871946,1.3103096875,69.52200002685547,88.89200004882812 +91,0.0033962452351780875,1.2537275,70.15800012207032,89.61000002197265 +92,0.0035749949581388918,1.25576875,69.95000001708985,89.52200002197266 +93,0.003407541712346886,1.26281765625,70.04200010009765,89.61800015136718 +94,0.0035377658371414456,1.33862625,69.27399997070313,88.9799999194336 +95,0.003494631831667253,1.2840921875,69.66400005126953,89.08999997314453 +96,0.0033580272824370433,1.243096875,70.43400004394532,89.8300000756836 +97,0.003372354432940483,1.40749484375,66.95800008789062,87.59000015625 +98,0.0036157858557999134,1.25091953125,70.18200007568359,89.58599994384765 +99,0.003527536356289472,1.21663265625,70.91600010009766,89.88200017822265 +100,0.0035386210906186272,1.23658109375,70.51199994384766,89.73999997314453 +101,0.003551432181016675,1.23664953125,70.65799999755859,89.79200008056641 +102,0.0035552934610417913,1.245574375,70.66399996582031,89.824000078125 +103,0.0036766345479658674,1.2083915625,70.92600004882813,90.03399994628906 
+104,0.0034643861831032802,1.24649953125,70.41199998779297,89.57199996582031 +105,0.003494899670061256,1.24356671875,70.44800020263672,89.80400010253906 +106,0.0033947998890653253,1.2432778125,70.20399994140625,89.6799999975586 +107,0.0033498970858220544,1.26618046875,69.61600013183593,89.30800002441406 +108,0.0034940940760342138,1.20514609375,71.09400004394531,90.22199994384765 +109,0.003417208025764142,1.185001875,71.54799999267578,90.50400009765625 +110,0.0032839904306456447,1.211754375,71.20400001708984,90.03000002441406 +111,0.003404544184117445,1.192296875,71.55799993652344,90.51399999267578 +112,0.0032217274752578567,1.216126875,70.7519999609375,90.08199999267578 +113,0.0033759328237335596,1.2004621875,71.39199996826171,90.21800022949219 +114,0.003175128933175334,1.18758890625,71.57000001708984,90.21599994384766 +115,0.0032200828760064076,1.1713734375,71.9300001147461,90.56200012207032 +116,0.0032189975026994944,1.1896378125,72.10400007324219,90.7679999951172 +117,0.0035298727120139767,1.17690640625,71.79800009277344,90.63999996582031 +118,0.0032351285418761627,1.1448571875,72.57400001220704,90.86400012451172 +119,0.0032762797116967185,1.139448125,72.48800014648438,91.06800007080078 +120,0.0032881099863776137,1.16669484375,72.26800007324219,90.61600004638672 +121,0.0034034981758200695,1.160696875,72.14400001464844,90.59199999511719 +122,0.0033585052172254238,1.14411265625,72.40999996582032,90.99000009521484 +123,0.003353612048418394,1.13490546875,72.60200006835937,91.03800004394532 +124,0.003222887670355184,1.1164625,73.02000006347656,91.31800004638671 +125,0.0033358727482014467,1.17029984375,71.98200007324219,90.94200007080079 +126,0.003147848233181451,1.11883234375,72.99600007324219,91.11600004882813 +127,0.00330801319796592,1.1164684375,73.37800008789063,91.31800006835938 +128,0.003165309433825314,1.13151640625,72.79200010009765,91.10200004638672 +129,0.003169606639338391,1.10438015625,73.2600000390625,91.41600007324219 +130,0.003111145625423108,1.1232259375,73.14799991210937,91.18200017333984 +131,0.003257711268296199,1.12542625,73.11000020019532,91.16200017333985 +132,0.0032982677720221026,1.10749890625,73.3720001171875,91.4680000732422 +133,0.003284811640956572,1.08792484375,73.93399993652343,91.64800014892577 +134,0.003277899364807776,1.0648290625,74.2020000390625,92.00999996582031 +135,0.0031747023708053996,1.0842659375,73.8239999609375,91.83600006835937 +136,0.0031947052172784296,1.124199375,73.76800000976563,91.64800001708984 +137,0.003041988188800003,1.0820375,73.96600001708984,91.67200004638671 +138,0.003274818416684866,1.0491440625,74.68800008544922,92.05400009277344 +139,0.0032263360252337797,1.06187390625,74.43999998291015,92.05400014648437 +140,0.003107036247716418,1.06291078125,74.50599998779298,91.89599993896485 +141,0.0031503743957728148,1.0556015625,74.59600016357422,92.1040001196289 +142,0.003153187089732715,1.05592671875,74.43599993408203,91.99400007080078 +143,0.0030785591141985996,1.0333478125,75.0699999584961,92.36800004394532 +144,0.0030362975916692187,1.039278125,74.78800008789062,92.23000006835937 +145,0.0029685184958257844,1.01897828125,75.26999998535156,92.51400012207031 +146,0.0030120556142979433,1.01427953125,75.45599995605468,92.49000001953125 +147,0.0029117654942508253,1.024656875,75.39200024414062,92.52600006835938 +148,0.003095526248216629,0.99868203125,75.65999999023437,92.81400007080079 +149,0.002969694423622319,1.023748125,75.40800006591797,92.5580001196289 +150,0.0030562643826540026,1.00439859375,75.65000000732422,92.63400011962891 
+151,0.003044905440349664,1.0112996875,75.57199993408203,92.63200009521485 +152,0.0029704225016757846,0.99755734375,75.90399993408204,92.74999998779298 +153,0.002950280306062528,1.00189,75.8440000366211,92.81200004150391 +154,0.003015570342540741,0.97426734375,76.23200000732422,92.91200009521485 +155,0.002881033279533897,0.97787875,76.3000000366211,93.01800009521484 +156,0.0029676160608817425,0.997861875,75.86400006103516,92.86600011962891 +157,0.002893779011044119,0.97974515625,76.3899999609375,93.02000014648438 +158,0.0027411910206345575,0.9752015625,76.4300000390625,93.0559999658203 +159,0.0030133193525086555,0.98281015625,76.63000009033203,93.06800014404297 +160,0.0027467000597555724,0.9500196875,76.94399998291016,93.4259999609375 +161,0.002747313435455518,0.97383375,76.54000011230468,93.1820000390625 +162,0.002890611666121653,0.96070921875,76.83600008056641,93.23000014404298 +163,0.002992227241130812,0.9530984375,77.13000003417969,93.44800001220703 +164,0.0028335172184077756,0.9447990625,77.23600008544922,93.41000006591797 +165,0.0027590213243716528,0.95350171875,77.05399995849609,93.4360001147461 +166,0.002805237242552851,0.9368121875,77.47000003417969,93.55600001464843 +167,0.003104101467345442,0.9339853125,77.32400000976563,93.60000006835938 +168,0.0028203485999256372,0.930644375,77.66000008544921,93.59800009033204 +169,0.002985484631998198,0.93263171875,77.48199998291015,93.58799998779297 +170,0.0026641425377290162,0.92875015625,77.77600008544921,93.7500000390625 +171,0.0026267553164091495,0.9258865625,77.95000000488281,93.7359999633789 +172,0.002781675280337887,0.91648203125,77.94000010986328,93.82800001220703 +173,0.0028434929637504475,0.908438125,78.14000002929687,93.97399990966797 +174,0.0027169642936704414,0.90687953125,78.13000010986327,93.9280001147461 +175,0.0026101735087909867,0.9099146875,78.14600000732422,93.9720001171875 +176,0.0026994317138035384,0.90558171875,78.24800000732422,94.04800003662109 +177,0.0027551356802827547,0.9109775,78.37600010986328,94.00599998779298 +178,0.0025562551704102327,0.8944078125,78.60200008544922,94.0939999609375 +179,0.002841701381839812,0.8946471875,78.53800000732421,94.09000006347657 +180,0.0027144267556390594,0.89255328125,78.61000005859376,94.11800009033203 +181,0.0025879032078332137,0.88817375,78.77999995849609,94.24800006835937 +182,0.0025061716358842595,0.87781390625,78.87999997802734,94.26600009033203 +183,0.0027128129066633327,0.88093078125,78.88400003417969,94.29800001220703 +184,0.0026006640899660332,0.87767390625,78.98200013427734,94.2580000366211 +185,0.002634142176248133,0.870630625,79.04800016113282,94.4480000366211 +186,0.002722469574239637,0.8782084375,79.07400005859375,94.34199998535156 +187,0.0027721369572515997,0.881635625,78.97800003417969,94.24000013916016 +188,0.0025335990191836444,0.87690703125,79.06000005615235,94.3679999609375 +189,0.0024866706392328653,0.87624640625,79.25999990234375,94.3780001171875 +190,0.002724298608622381,0.86884984375,79.15400000732421,94.36600014160156 +191,0.0026475561815979226,0.873798125,79.23800013427734,94.39600006347656 +192,0.002496325452479401,0.86303046875,79.34600003173828,94.4899999609375 +193,0.0025580572463305934,0.861071953125,79.37200005615234,94.5380000366211 +194,0.0026442011751766714,0.87250765625,79.26800010742187,94.39000008789063 +195,0.002566711910601173,0.86696109375,79.29600008300781,94.39599998535157 +196,0.002543845430149564,0.864356875,79.45799995361328,94.45399998535156 +197,0.0026796250770400676,0.869255625,79.30799997802734,94.41199998779297 
+198,0.0025170722676973257,0.86292703125,79.37000005615235,94.43599990966797 +199,0.0025656953387494597,0.861311875,79.44200008300781,94.45999990966797 +200,0.0026176332030445337,0.8598421875,79.50999995361327,94.49400001220702 +201,0.0025168933124015374,0.85830453125,79.6120000024414,94.46600001220703 +202,0.002508296282030642,0.86487796875,79.5180002368164,94.44999998535157 +203,0.0024724971070619567,0.86042703125,79.44800000488281,94.47799998535156 +204,0.002350900338829628,0.87450015625,79.59199995117187,94.55400003662109 +205,0.0025996306545234154,0.86581359375,79.53600005859376,94.4879999609375 +206,0.0024244988869343486,0.8577853125,79.56600018798828,94.52199998535156 +207,0.0025348346680402756,0.85924640625,79.54200005859374,94.53600001220703 +208,0.002649968007712492,0.86089328125,79.55800011230468,94.4879998828125 +209,0.002461412771871047,0.85889765625,79.65000003417968,94.51399993408204 diff --git a/CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv b/CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv new file mode 100644 index 0000000..07953cb --- /dev/null +++ b/CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.5879408035959516,6.93980125,0.066,0.406 +1,0.043263234331139495,6.88107375,0.284,1.3320000038146973 +2,0.00782376863727612,6.48283375,1.3799999908447265,4.72400003540039 +3,0.0074164533455457005,5.53255375,5.5160000164794925,15.882000014648437 +4,0.006907936850828784,4.86912625,10.956000001831054,26.94600002441406 +5,0.0066305674346429965,4.3362075,16.86999999267578,37.32200001953125 +6,0.006266916303762368,3.833835,23.227999986572264,46.63400005126953 +7,0.005900568095967174,3.424026875,29.399999946289064,54.26000005371094 +8,0.005858304411438959,3.161556875,33.99200005981445,59.326000024414064 +9,0.005546203548354762,2.7578753125,40.40000000854492,66.20200010742188 +10,0.00525083123440189,2.556459375,43.81000014160156,69.40399999023437 +11,0.005052656168118119,2.41526875,46.781999951171876,72.16199999023438 +12,0.004906764154189399,2.200563125,50.682000043945315,75.4040000024414 +13,0.00454613400091018,2.1156203125,51.83600006103516,76.44200002685547 +14,0.004688152045543704,2.03107625,53.548000114746095,77.93799999511718 +15,0.004370159785529333,1.9354171875,55.46199996826172,79.38000001708984 +16,0.004650838440284133,1.88189921875,56.684000048828125,80.42000008789063 +17,0.0044214187655597925,1.80287,58.39000005126953,81.53800007080078 +18,0.004206506818133805,1.7846809375,58.36399997558594,81.52400004638672 +19,0.004208372756173568,1.7458278125,59.268000092773434,82.38799999023438 +20,0.0041131798976234025,1.67181078125,60.89600001708985,83.4620000390625 +21,0.004204317411806967,1.6355546875,61.79599997070312,84.05200006347657 +22,0.004256514399977667,1.6303496875,61.71199993652344,83.87599995605468 +23,0.0039758168693099704,1.60533265625,62.700000061035155,84.75600000732422 +24,0.003976539592258632,1.6761265625,61.06800004394531,83.21199998291016 +25,0.0040600252936461145,1.589869375,63.34200011474609,85.14399998046875 +26,0.004057688239429679,1.5644690625,63.53000001220703,85.30600000488282 +27,0.003951762303976076,1.6300059375,61.744000146484375,84.00400013671874 +28,0.003902679558710328,1.5681228125,63.68600006835938,85.09000015869141 +29,0.0039712132420390844,1.5614875,63.81000000732422,85.46199989990234 +30,0.004059286156137075,1.6156275,62.969999956054686,84.69800002685547 +31,0.0037791924218514134,1.55970734375,63.461999990234375,85.07399998046876 
+32,0.00400861116525318,1.528990625,65.12599995605468,86.2820000805664 +33,0.003777677742099123,1.5097675,64.3220000390625,85.72199998779297 +34,0.003831957051131342,1.49948203125,64.96999995849609,85.94799997558594 +35,0.003889977266745908,1.4704921875,65.59200006347656,86.37600003173829 +36,0.0038687303396207945,1.505684375,64.81400005859375,85.99400013671875 +37,0.003778126955564533,1.509783125,65.04800008544922,85.91000012939453 +38,0.0037710025374378476,1.514193125,64.50600008544922,85.78200002685547 +39,0.0037076565703111036,1.56284796875,63.95599998535156,85.43200000732422 +40,0.003900448991251843,1.48128640625,65.16400001220703,86.3820000805664 +41,0.003922918129579297,1.55287359375,63.76600001220703,85.54400005859375 +42,0.0037474680658695953,1.55009421875,63.562000139160155,85.33400003417968 +43,0.0039003821023340735,1.4944140625,64.84599987304688,86.15999994873047 +44,0.003832862579396793,1.47437,65.29599987304688,86.49000002929688 +45,0.0037390387317697915,1.50843,64.82800001220703,86.0080000024414 +46,0.0036831728149471538,1.551256875,64.07600013916016,85.61399997558594 +47,0.0038603152248210142,1.4796565625,65.42000011230469,86.53800008300782 +48,0.0037030736483367427,1.4785609375,65.19200016113281,86.40799997558594 +49,0.0038562817332733956,1.452211875,65.87800013183593,86.71200007568359 +50,0.0036413500285042183,1.4864984375,65.09800009277343,86.43200000732422 +51,0.003774097821276103,1.53024046875,64.63200011962891,86.05800010986329 +52,0.0036625074821391274,1.472685625,65.56800011230469,86.5720000024414 +53,0.0039101472523595604,1.5525409375,64.06400000976562,85.66800003173829 +54,0.003761170930894358,1.53140046875,63.95000006835937,85.6660000805664 +55,0.0038942551805770825,1.5284903125,64.98600010742187,86.25600005615235 +56,0.0037146693128826363,1.471130625,65.2040000366211,86.56799989990235 +57,0.003862551646307111,1.566090625,63.550000102539066,85.44199998779297 +58,0.0037471780586721642,1.5615784375,63.66200009277344,85.21000003417969 +59,0.003933556095164802,1.505938125,64.50000008789063,86.06800003173828 +60,0.0037123816353934152,1.51548203125,64.28000006835937,85.95799982666016 +61,0.0037444233894348145,1.4667553125,65.69800008300781,86.56999997558594 +62,0.0037000667569892748,1.5014478125,64.77600014160156,86.08999998046875 +63,0.003817097150853702,1.47620203125,65.53200003173828,86.6259999243164 +64,0.003908832723807011,1.44793703125,65.99399989257813,87.16600000244141 +65,0.003810447491040187,1.50632359375,65.92000006103515,86.72400008056641 +66,0.003877482027746737,1.44935671875,65.74400009033204,86.75600010498047 +67,0.0035001660996515837,1.474410625,65.61999997802734,86.56799994873047 +68,0.0037858569329338415,1.44278734375,65.89400003417968,86.91399995361328 +69,0.0037039401774693814,1.4219121875,66.46199997802735,87.15800005126952 +70,0.003584539318191154,1.4854034375,65.58600012939453,86.43000002441406 +71,0.0036189446691423655,1.408791875,66.81999999511719,87.19399994628907 +72,0.0034194375288539697,1.5136809375,64.56200000488282,85.982 +73,0.0038340060273185372,1.50853046875,64.8480001196289,86.02799995605469 +74,0.0038377702169652495,1.48404703125,65.19000013916016,86.21399998046876 +75,0.0037398780696094036,1.44281453125,66.09400000732421,87.01200010253906 +76,0.0036941264157316516,1.419920625,66.66200010253907,87.30800004638672 +77,0.0036912159329014166,1.39577015625,67.10200002197266,87.50799999267578 +78,0.003628838441467711,1.43978140625,66.44999997558594,87.26600007568359 +79,0.0036196053926167743,1.428256875,66.31400001220703,86.93199995361329 
+80,0.0037884804021034923,1.41467578125,66.88999984375,87.37200010009765 +81,0.0037704520260116886,1.4212840625,66.73599994873047,87.291999921875 +82,0.003521366626955569,1.36810109375,67.35,87.92200017822266 +83,0.0036201796195070657,1.431815,66.478,87.4620000024414 +84,0.0036126957441280994,1.4206084375,67.08600015625,87.38800002685547 +85,0.00364691173724298,1.383635625,67.43399997314454,87.75199992431641 +86,0.003684012784755656,1.42890765625,66.46799995605468,87.32400002929687 +87,0.0037029437787298647,1.39296953125,67.16,87.6620000756836 +88,0.0035368107492104173,1.4194915625,66.504,87.33000005126954 +89,0.003732426424643823,1.389775,67.05400003173828,87.55000002441406 +90,0.003774909635207483,1.46568671875,66.54600000732422,86.95400013427735 +91,0.0035257768052230987,1.38999359375,67.09600002929687,87.77599981933594 +92,0.003705113460975034,1.395430625,67.05200005371094,87.70200002441406 +93,0.003521749783041222,1.326335,68.68200012451172,88.75400012451172 +94,0.003645537537522614,1.43553203125,67.51400010498047,87.94600010009766 +95,0.0036176966968923807,1.3430171875,68.53400003417968,88.60199997802735 +96,0.0035008633276447654,1.4016096875,67.02000000488282,87.51800000244141 +97,0.0034988391811826398,1.3573815625,68.02600003173828,88.43799987304688 +98,0.003750821475737861,1.37476375,67.54199997802735,87.8860000805664 +99,0.003664890703346048,1.35086578125,67.89200000488282,88.11200015625 +100,0.003684502377706979,1.373255,67.54400005615234,88.03999997558594 +101,0.003695012818622802,1.349871875,68.15200005126952,88.49599999511719 +102,0.0037163359811529517,1.3368015625,68.33599986816407,88.5100000024414 +103,0.003837666840159467,1.338334375,68.08000004638671,88.18199999755859 +104,0.0036296833007197294,1.34355421875,68.29200007080078,88.41799999267577 +105,0.0036591265151011093,1.3459440625,68.00600002441406,88.34000020263672 +106,0.0035502492849315915,1.33519578125,67.93999994873047,88.24399994628907 +107,0.0035088361806369255,1.38043453125,67.594,87.79400002441406 +108,0.0036571137108174817,1.3370221875,68.07799997802735,88.24600005615234 +109,0.003598236129619181,1.2989975,69.07599989013671,88.95199996826172 +110,0.0034665650288973537,1.31067546875,68.76000000488281,88.83600010253906 +111,0.003577601580348398,1.32908,68.58800009765625,88.47400002197266 +112,0.003416676722866084,1.29249703125,69.26600007568359,88.92599997070313 +113,0.00355812518059143,1.3232515625,68.66800005615234,88.62000005126953 +114,0.0033563452307134867,1.3571640625,67.93200002929687,87.85999994873048 +115,0.003407049791089126,1.304571875,68.76200012939454,88.95200007324219 +116,0.0034234707543094245,1.36884828125,68.73400020996094,88.44599994873047 +117,0.0037440506940973656,1.3250934375,68.96199994384766,88.82800005126953 +118,0.003433559023376022,1.2825096875,69.25600004638672,89.04400001953125 +119,0.003482125499950988,1.2820234375,69.37399999267578,89.04400001953125 +120,0.0035127438105908887,1.3135090625,68.70999995361328,88.90399997558593 +121,0.003636225060160671,1.28286640625,69.41800004394531,89.19200002197266 +122,0.003607004914166672,1.282149375,69.41400001708985,89.23599999267579 +123,0.0035776219363989575,1.26638484375,69.63000007080078,89.21400012451171 +124,0.0034578298052240697,1.292155625,69.1800000756836,88.85800012695313 +125,0.0035633945371955633,1.32452921875,68.92799995605469,88.77600002441406 +126,0.0034291247803983943,1.314371875,68.6280000439453,88.6499999975586 +127,0.003587623642358397,1.3392128125,68.4520000805664,88.366000078125 
+128,0.00341518730523863,1.32833703125,68.80799996582031,88.56399999511719 +129,0.003439400078994887,1.33987890625,68.01200016601562,88.39799997558593 +130,0.0033969029152233687,1.2661765625,69.92799991699219,89.17799996826172 +131,0.003517004212231508,1.280429375,69.7639999975586,89.32800004638672 +132,0.0035932111287755625,1.268138125,69.91200006835938,89.30200010009766 +133,0.0035588327861790147,1.35987171875,68.31999997802734,88.35400010009765 +134,0.0035531476106760757,1.229270625,70.50999999267579,89.74399997558594 +135,0.0034332048380747437,1.27554859375,69.87000007080078,89.38400004882813 +136,0.0034692909269194517,1.29049078125,70.19400004150391,89.62000009765624 +137,0.003353963855520955,1.2432678125,70.31400006591797,89.86000012207032 +138,0.003599305687073086,1.23844671875,70.32800001953125,89.83000004882813 +139,0.003525197905089174,1.266065625,70.21199999267579,89.37800012695313 +140,0.003439630115670817,1.23350921875,70.19000002685547,89.75200005126953 +141,0.003482519233200167,1.2829453125,69.83200005126953,89.30800020751953 +142,0.003490099500465606,1.2773321875,69.42800010009766,89.03599991943359 +143,0.0034282832805599484,1.230988125,70.60599994140625,89.96600009765625 +144,0.003393945932787444,1.25483703125,70.05200005126953,89.49400004882813 +145,0.003305191340457116,1.23083140625,70.49599996582032,89.67000017578125 +146,0.0033644404861011674,1.24529484375,70.15399996826172,89.82200012939452 +147,0.003268734019781862,1.2247296875,70.85000004394531,89.94799994140625 +148,0.0034619672889156,1.20769796875,70.98200001953126,90.25799994628906 +149,0.003331201466997819,1.26503203125,70.04799997070313,89.41799996826173 +150,0.003441011516510376,1.22800796875,70.52600007324219,89.92000022949219 +151,0.00342923730412232,1.205493125,71.00600012451171,90.21399999267578 +152,0.0033721947111189365,1.22785359375,71.00600011962891,89.97600004394532 +153,0.003331308303001736,1.2461840625,70.60000002197266,89.81000007568359 +154,0.0034347615770197342,1.2066878125,71.1920001196289,90.10000007080077 +155,0.0033012595626392533,1.20977125,71.22600006835937,90.11400007080078 +156,0.0033992239041253924,1.19793546875,71.61599994628907,90.45199983642578 +157,0.003318158394124891,1.21407890625,70.66000012451173,90.06199994628906 +158,0.003163406525605491,1.214715625,70.97999996582031,90.17000004882813 +159,0.0034511078681264606,1.1941971875,71.82799997314453,90.43200017578125 +160,0.003197109093889594,1.14000453125,72.43999991455078,91.09000006835937 +161,0.00321825232822448,1.16305515625,72.20600001708985,90.66999988769531 +162,0.003342022620407598,1.17439734375,71.66399989013672,90.62800009765625 +163,0.003477680164256266,1.1905534375,71.69600007324219,90.53199997070313 +164,0.0032866454649982707,1.1906775,71.70800004394532,90.56200006835938 +165,0.0032659856702334116,1.2040075,70.90800007080078,90.26400004638671 +166,0.0032735190553856747,1.18966578125,71.4379999194336,90.41599999267578 +167,0.003588428072232221,1.16072765625,72.15400001708984,90.81000009765626 +168,0.0033078217280230354,1.18827859375,71.74999993896485,90.47200014892579 +169,0.0034906826580741574,1.1408671875,72.46199991699218,90.97000004638672 +170,0.003191659942136279,1.18764828125,71.94600004150391,90.45000017333984 +171,0.003139835665933788,1.17025375,71.83200004638672,90.51200007324219 +172,0.0032830593242709127,1.1612790625,72.09000010253907,90.82200010009765 +173,0.003370747352684183,1.1359815625,72.7260000390625,91.09599991455079 +174,0.003248639587712075,1.1737246875,71.98399989257813,90.80200009765625 
+175,0.0031608384368675096,1.14259765625,72.85200001464844,91.11199999511719 +176,0.00323448857359056,1.1602953125,72.21999999755859,90.78799991699219 +177,0.003318667661265603,1.17726234375,71.84000002441407,90.60200001953125 +178,0.003132707821870489,1.14985984375,72.55800009521484,91.00800006835938 +179,0.0033881253330036998,1.156435625,72.46000006835938,90.93800001953124 +180,0.003284998570701906,1.13349390625,72.88999998779298,91.17200001464843 +181,0.003157229528629354,1.11932859375,73.05000004394532,91.31800009521484 +182,0.003064800286665559,1.116170625,73.36400009277344,91.28400001708984 +183,0.003283920614714069,1.128514375,72.93400013916016,91.1900001977539 +184,0.0031627657091511147,1.126823125,73.07000009277344,91.20600004394531 +185,0.003216694774372237,1.10957890625,73.2940000366211,91.41999993896485 +186,0.0033008489491684096,1.13548015625,73.0499999609375,91.4460001196289 +187,0.0033268413805801955,1.12621671875,73.36800007080078,91.34600009765624 +188,0.0031068910445485797,1.097263125,73.77199993896484,91.81600006835937 +189,0.003080885707666831,1.15398984375,72.1059999975586,90.68200002197266 +190,0.003289500534135316,1.09940046875,73.4820000390625,91.57600020019531 +191,0.0032341959553637673,1.11165171875,73.72000004638672,91.77999996582031 +192,0.0030639911502865808,1.12018734375,73.13600001464843,91.37000009521485 +193,0.0031219612075281994,1.1085696875,73.60000001464844,91.43200004638672 +194,0.0031889582086088403,1.09505875,73.93799988769531,91.7100001196289 +195,0.003157629132536905,1.08983859375,74.01399996337891,91.66600009521484 +196,0.0031145006152135985,1.07532953125,74.01800006835937,91.70999991455078 +197,0.0032537023736430065,1.0866959375,74.32400001708984,91.9120000415039 +198,0.0030873163071061882,1.11645671875,73.21000017089844,91.39600006835937 +199,0.0031369839229487945,1.09043328125,74.05000006347656,91.89799999023438 +200,0.003181640906924648,1.08010234375,74.06400006591797,91.92799999267578 +201,0.0030805699227909955,1.0735140625,74.08799998535156,91.88400011962891 +202,0.0030592053164062755,1.04242984375,74.6680000415039,92.2660000415039 +203,0.0030181118054315448,1.0612190625,74.59600008789063,92.12800022460938 +204,0.0028991151734122206,1.06336171875,74.3419999609375,92.03400002441406 +205,0.003133522512923394,1.06863453125,74.36200003417969,92.07399998535156 +206,0.0029669138769220027,1.057049375,74.50799993652343,92.20400006591797 +207,0.003045719815418124,1.03255125,75.06799996337891,92.34800001708984 +208,0.0031389754731208086,1.06708421875,74.44199999023438,92.11400006835937 +209,0.002939954483216362,1.0265696875,75.12800008789063,92.5659999633789 +210,0.0030027222487011124,1.0509209375,74.48200006591797,91.98199998779297 +211,0.0030858172768993036,1.04638328125,75.16400000976563,92.2900000415039 +212,0.003049486216955951,1.01229046875,75.49800005859375,92.74400006835937 +213,0.003054107938494001,1.0149840625,75.43199993164062,92.66000001464843 +214,0.0030523278817002264,1.027435,75.20000001464844,92.33399996582031 +215,0.00295353280047753,1.02725765625,75.22200014160157,92.46599999023438 +216,0.002916792208062751,1.03820640625,75.47200008544922,92.53200009277344 +217,0.0030544213950634003,1.00248921875,75.63400006347656,92.6440000415039 +218,0.0031409591070509385,1.003148125,75.68800013671876,92.84999993652343 +219,0.00295333845341312,0.99967875,75.9020000390625,92.76399998779297 +220,0.0028556018535579953,0.997283125,75.80200006347657,92.58000014648438 +221,0.002862908594709422,0.999586875,75.85599988037109,92.67600004638672 
+222,0.003056368823828442,1.00927765625,75.65600011474609,92.75400001464844 +223,0.003060362451443715,1.01704703125,75.67000011230469,92.81000009277344 +224,0.003175580724408584,0.9809190625,76.30400014160156,93.06399993408203 +225,0.002951170567290059,0.97987609375,76.25200006103516,93.04399996582032 +226,0.0028978271542915274,0.9713578125,76.16400000732422,93.23800006591797 +227,0.00277037569321692,0.9814646875,76.33599998291015,93.0980001196289 +228,0.0030606386883716497,0.9775634375,76.3620001147461,93.1340001171875 +229,0.002817842926430915,0.97441875,76.29000006347657,93.1499999633789 +230,0.0028577848404113737,0.98003734375,76.37199998291015,92.96600001464844 +231,0.002933824185415038,0.96455765625,76.66000000976562,93.1620000390625 +232,0.0028675607671695097,0.96486984375,76.67999993164062,93.26999998779297 +233,0.002932505465910903,0.9552828125,76.96200006103516,93.37800014404297 +234,0.0027431468811950515,0.954749375,76.96599998291016,93.40400001464843 +235,0.0027674867971135037,0.95912953125,76.77600008544921,93.30399993652344 +236,0.002751236149509038,0.96058203125,77.07400000976563,93.38400006591797 +237,0.0029287314641156365,0.95341609375,77.00400006103516,93.34000019287109 +238,0.002752234344370663,0.94035875,77.1740000366211,93.60999998779297 +239,0.00272264369829957,0.9491415625,77.13200001220703,93.42399999267577 +240,0.002795018887679492,0.9399025,77.42600003173828,93.4739999609375 +241,0.0028743272414430976,0.96436625,77.27999990234375,93.37800006591797 +242,0.002643940305071218,0.93623484375,77.42999998046875,93.56800009277343 +243,0.00276781537104398,0.93184546875,77.41600011230469,93.58800009277344 +244,0.0026543704360457404,0.9522346875,77.53400003417968,93.65799998779296 +245,0.002833746772791658,0.92415828125,77.69200003173827,93.7400000390625 +246,0.0028131523369146244,0.92019875,77.76400005615234,93.74200011474609 +247,0.002630403365141579,0.930619375,77.6620000366211,93.71200001464844 +248,0.002686031994276813,0.91615609375,77.89200005859375,93.78799998779297 +249,0.0027636841405183077,0.91213,77.9479999584961,93.82599993652343 +250,0.00263610525455858,0.91414609375,77.94400008789063,93.87000001220703 +251,0.0028250382248578326,0.90730046875,78.12200010986328,93.99800014160157 +252,0.0027282563969492912,0.89905328125,78.41600000244141,94.11400009033203 +253,0.0026974499092570375,0.90280859375,78.29199998291016,93.99600009033203 +254,0.0026165787795824663,0.90059046875,78.35999990234374,94.02800016601563 +255,0.0028645797033927272,0.896670625,78.30000006103515,94.0480000390625 +256,0.0027296430697398527,0.89341390625,78.28800003417969,94.12399993408204 +257,0.002628813514352909,0.8924690625,78.43999998046876,94.10399990722657 +258,0.0027438735456338952,0.89788578125,78.59200005859375,94.09800001464843 +259,0.0027608362558696952,0.88773546875,78.65600008544922,94.1600000390625 +260,0.002709025625205998,0.8862396875,78.82799995605468,94.15000008789063 +261,0.002753045010779585,0.885668203125,78.84200003173828,94.15800000976563 +262,0.0027249641716480255,0.87703578125,78.86000005859376,94.31800001220704 +263,0.002667704613746277,0.88469171875,79.01000010986328,94.24399993164063 +264,0.0027875113633594344,0.8815678125,78.92600000732422,94.18200001464844 +265,0.002604053189445819,0.88435046875,78.95800010742188,94.13800014160157 +266,0.0025744007268388358,0.87147875,79.06400008300781,94.3639999609375 +267,0.0025899515354207586,0.8690709375,79.12400013671875,94.34799998535156 +268,0.0025270525864990695,0.8703253125,79.16600013671875,94.38400000976563 
+269,0.0027527200457240853,0.86244484375,79.30800016113281,94.4319998828125 +270,0.002652591543405184,0.86984296875,79.37399995361328,94.38599998779297 +271,0.0025154544107083765,0.878178125,79.30400008300781,94.3060001147461 +272,0.0027743227214419414,0.86343484375,79.41800008544922,94.45400001220703 +273,0.0025572667862953885,0.861257578125,79.62200013671875,94.4799999609375 +274,0.002562160320979144,0.85878140625,79.47599992675781,94.47600001220704 +275,0.0026652730801807983,0.87701125,79.4160001586914,94.44400001220703 +276,0.002546968720188098,0.85214609375,79.69800002929688,94.45800009033204 +277,0.002406195496275489,0.85422359375,79.63799998046875,94.47999993408203 +278,0.0025625270292428987,0.857310859375,79.64600005371094,94.46000001220703 +279,0.002560538156623287,0.852581640625,79.76600005615235,94.55200016601563 +280,0.0023881655069999397,0.85405609375,79.73400010742188,94.55800013916016 +281,0.002368844230659306,0.8560896875,79.77799997802734,94.58600000976563 +282,0.0025179104247529593,0.847653125,79.85799995117188,94.62799993408203 +283,0.002443302289715835,0.854506875,79.83200010742188,94.74800000976562 +284,0.0025897356016295297,0.85438046875,79.94799992675782,94.62800000976563 +285,0.0025441833173057865,0.845479609375,79.88599995361328,94.6980000390625 +286,0.002364877677921738,0.847043125,79.91400003173828,94.65400006347656 +287,0.0024518951873428057,0.8473740625,80.00000008056641,94.66400006347656 +288,0.0025553761183151175,0.8408471875,80.07599995117188,94.74600001220703 +289,0.0024772981996648014,0.84096609375,79.95999997802734,94.7600000366211 +290,0.002522468126179384,0.845085859375,79.92799995361328,94.65000008789063 +291,0.00250109241876219,0.844013984375,80.0679999267578,94.69999993408203 +292,0.0023949523539548473,0.843519921875,80.05000000732421,94.73600003662109 +293,0.0024601881991007496,0.839418828125,80.11200008300781,94.71800003662109 +294,0.002333784642230187,0.83979578125,80.04199995361328,94.71800011474609 +295,0.0023810978995503058,0.84091578125,80.07200015869141,94.74600006347656 +296,0.0023521651829858975,0.838931640625,80.10800002929687,94.75399990722656 +297,0.0024202836211770773,0.8369471875,80.11600005615234,94.73000011474609 +298,0.0024537391810944037,0.838296484375,80.08600002929687,94.72600016601562 +299,0.0024408193421550095,0.837796171875,80.09599992675781,94.72999993408203 +300,0.0024034588714130223,0.84015,80.04599992675782,94.7660000366211 +301,0.002540342717631055,0.83971515625,80.10600002929688,94.7340001147461 +302,0.002468660681708051,0.842360625,80.17199997802734,94.7820000366211 +303,0.0024969897266211255,0.837438515625,80.14799997802734,94.75799998535156 +304,0.0025321109652785318,0.83988890625,80.04600005615234,94.71000000976562 +305,0.002433182222635618,0.838136484375,80.03999997802734,94.72999993408203 +306,0.0024769810760127647,0.83927546875,80.17199997802734,94.78199998535156 +307,0.0026157021389475892,0.84361546875,80.15200000488281,94.73600011474609 +308,0.002351050132087299,0.837729296875,80.07200003173828,94.73600016601563 +309,0.0023475157213397324,0.846922734375,80.21200000488281,94.76799998535157 diff --git a/CV/timm/exp_results/ViT/base/args_vit-B_150.yaml b/CV/timm/exp_results/ViT/base/args_vit-B_150.yaml new file mode 100644 index 0000000..ab05c6d --- /dev/null +++ b/CV/timm/exp_results/ViT/base/args_vit-B_150.yaml @@ -0,0 +1,112 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 3 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: true +bn_eps: null +bn_momentum: null +bn_tf: false 
+channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 150 +eval_metric: top1 +experiment: '' +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-08 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_base_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-base-ori- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-06 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/base/args_vit-B_300.yaml b/CV/timm/exp_results/ViT/base/args_vit-B_300.yaml new file mode 100644 index 0000000..e41f508 --- /dev/null +++ b/CV/timm/exp_results/ViT/base/args_vit-B_300.yaml @@ -0,0 +1,112 @@ +aa: rand-m9-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 3 +aug_splits: 0 +batch_size: 256 +bce_loss: false +bias_decay: true +bn_eps: null +bn_momentum: null +bn_tf: false +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: '' +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 5.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_base_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-base-ori- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false 
+resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/base/summary_vit-B_150.csv b/CV/timm/exp_results/ViT/base/summary_vit-B_150.csv new file mode 100644 index 0000000..da0cd6d --- /dev/null +++ b/CV/timm/exp_results/ViT/base/summary_vit-B_150.csv @@ -0,0 +1,161 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.969629015241351,6.8789,0.35,1.4740000004577636 +1,6.7701307364872525,6.092885,2.923999998779297,9.55200001953125 +2,6.520314659391131,5.52451375,6.120000012207031,17.63199996826172 +3,6.309063332421439,5.1006775,10.359999992675782,25.637999995117188 +4,6.200977563858032,4.6926425,14.590000002441407,33.050000032958984 +5,5.966246536799839,4.211550625,19.92799998046875,41.69199997314453 +6,5.886366980416434,3.823265625,25.436000042724608,48.86799998535156 +7,5.636490276881626,3.476820625,30.502000009765624,55.28800012695312 +8,5.441435030528477,3.2040934375,34.54999999145508,60.13599993408203 +9,5.333667687007359,3.0704365625,36.912000010986326,62.384000053710935 +10,5.33064079284668,2.809416875,40.71199998535156,66.99200001953125 +11,5.050014563969204,2.70141125,43.060000002441406,68.93799999023437 +12,5.04610105923244,2.50888125,46.04800005126953,71.58800000488282 +13,4.8556502887180875,2.483266875,47.44400007080078,72.83800002929688 +14,4.818053586142404,2.33415125,49.5940000366211,75.03200020996094 +15,4.764411279133388,2.301555625,50.29400001464844,75.524000078125 +16,4.74158375603812,2.1787871875,51.97400000244141,76.99400014648438 +17,4.725761686052595,2.1432540625,53.04199998046875,77.7640001196289 +18,4.6711210523332864,2.1266396875,54.010000048828125,78.6920000415039 +19,4.646009683609009,2.0637640625,54.994000053710934,79.66799999267577 +20,4.620888267244611,2.0288465625,55.64800001953125,80.16399999023437 +21,4.584996495928083,1.99706625,56.465999943847656,80.57400009033204 +22,4.563190596444266,1.951900625,57.1580001171875,81.28200008789062 +23,4.4821431296212335,1.8932825,57.884000017089846,81.89999993164062 +24,4.455127239227295,1.87278,58.68000009033203,82.47600006347656 +25,4.406286137444632,1.86544875,58.723999990234375,82.26600011230468 +26,4.310754571642194,1.8313215625,59.26600009033203,82.67000000732422 +27,4.404716219220843,1.7840215625,60.09199994140625,83.1980000366211 +28,4.299009203910828,1.8148903125,59.758000092773436,83.15800011230469 +29,4.417932408196585,1.7890609375,60.447999982910154,83.57200008300781 +30,4.2956497328622,1.75007546875,61.16000005615234,83.83599997802735 +31,4.3164447375706265,1.7511821875,60.878000087890626,83.95799990234374 +32,4.325539588928223,1.7671178125,60.750000036621095,83.98800003662109 +33,4.260358010019575,1.72828484375,61.24400002929688,84.25600013427734 +34,4.185277155467442,1.740508125,61.1720001171875,84.13599995605469 +35,4.1936653682163785,1.7270590625,61.37399996337891,84.41800008789062 +36,4.253177131925311,1.7454315625,61.321999990234374,84.40199992919922 +37,4.308102743966239,1.726250625,61.394000048828126,84.24999998779298 +38,4.234909108706883,1.74166625,61.54800003417969,84.20600000488281 +39,4.306120225361416,1.71573078125,61.391999929199216,84.43199990234375 
+40,4.217206188610622,1.71197203125,61.92000005615235,84.634000078125 +41,4.3120207616261075,1.70625515625,61.70000008544922,84.65799997558594 +42,4.250123279435294,1.690705,61.85000001464844,84.68599995361328 +43,4.315731082643781,1.70799375,61.53600005859375,84.68000005615234 +44,4.222789577075413,1.7375190625,61.842000007324216,84.6680000805664 +45,4.26564964226314,1.714016875,61.938000009765624,84.95400005859375 +46,4.3635857445853095,1.6910959375,61.89200005859375,84.74799995361329 +47,4.227936165673392,1.6967903125,62.11200010986328,85.02400005371094 +48,4.254791617393494,1.7004075,62.08599998779297,84.89600003173828 +49,4.355360167367118,1.6895821875,62.13000006835937,84.76999997802734 +50,4.265195778438023,1.7516853125,61.66599998046875,84.66400010742187 +51,4.264554177011762,1.69786328125,62.00400006103516,84.77399995361328 +52,4.327261243547712,1.7127665625,61.6620001171875,84.65600000488281 +53,4.337813939367022,1.72115828125,61.97000008056641,84.78199995361328 +54,4.316329751695905,1.67919625,61.95400005859375,84.94600008789062 +55,4.249390431812832,1.723519375,61.83200000976562,84.64400006347657 +56,4.298370599746704,1.71378625,61.31800004150391,84.59600013427735 +57,4.244845850127084,1.74009125,61.500000036621095,84.44800005615234 +58,4.326196159635272,1.7223190625,61.487999965820315,84.40999990478515 +59,4.326049634388515,1.79321,60.632000007324216,83.9800001123047 +60,4.1043886968067715,1.54936984375,64.7080000048828,86.68400012939453 +61,4.025504384722028,1.5333053125,65.3160000341797,87.17000005126953 +62,4.114333816937038,1.52166609375,65.672,87.32400002197265 +63,4.03487799848829,1.50415203125,65.83000016113282,87.293999921875 +64,3.998051575251988,1.53302625,65.81399997558594,87.40800002441406 +65,4.01603765147073,1.5050271875,66.26600002441407,87.7180000732422 +66,4.131799561636789,1.49615859375,67.01799997314453,87.83800012695312 +67,4.02579082761492,1.4637478125,66.96800007568359,88.12800001953126 +68,4.021304403032575,1.4824240625,67.07400000244141,88.09800013183593 +69,3.9552708864212036,1.4422015625,67.18400006835938,88.29600010009766 +70,3.9504153047289168,1.460476875,67.14399994628906,88.22599994384765 +71,3.9517369951520647,1.40621890625,67.93799994628907,88.46599989501954 +72,3.9281171900885448,1.44610921875,67.65400004882812,88.40000004882812 +73,3.9567974976130893,1.4171990625,67.68600004882812,88.59200002197265 +74,3.9092021669660295,1.44212796875,68.24400012207032,88.59599991699218 +75,3.908873404775347,1.38805734375,68.40799997070313,89.02599991699219 +76,3.88528687613351,1.405209375,68.75000004638672,88.99200009765624 +77,3.881950242178781,1.40530421875,68.8119999658203,89.16600001953125 +78,3.855154871940613,1.36586625,69.13000004882812,89.17800010498047 +79,3.817075729370117,1.37695109375,69.27999994384766,89.5919999169922 +80,3.7851529121398926,1.3624575,69.51800014648437,89.53199991210937 +81,3.905322245189122,1.3414584375,69.5499999975586,89.51800002685547 +82,3.7586053950445995,1.3092875,69.9420000756836,89.78000007324219 +83,3.751699788229806,1.32302875,69.99599988769532,89.96599999023438 +84,3.8931176321847096,1.32061453125,70.19800007324218,89.98600012451172 +85,3.709507261003767,1.31129953125,70.26800009521484,90.21999994140624 +86,3.7826418536049977,1.2817078125,70.78999999267577,90.25800007080078 +87,3.6400119747434343,1.29241625,70.62000014648437,90.35200001953125 +88,3.758640170097351,1.27716546875,71.0879999633789,90.71399999023437 +89,3.6318452187946866,1.2621228125,71.35400004150391,90.60800001953125 
+90,3.651788149561201,1.252781875,71.63599993652343,90.72799996582032 +91,3.7197152887071883,1.2491325,71.77199999267579,90.88599999023438 +92,3.7757417304175243,1.26371796875,71.76600007080079,90.90400006835938 +93,3.6193600382123674,1.25542765625,71.77400006591797,90.88800001708984 +94,3.6238814422062466,1.2174590625,72.23000012207031,91.18000001708984 +95,3.536820190293448,1.24194640625,72.40800001464844,91.32400009033204 +96,3.5582499844687328,1.2054115625,72.8439999609375,91.49200021972656 +97,3.6898646354675293,1.18531578125,72.8999999584961,91.5640001953125 +98,3.549690229552133,1.18031859375,73.15199987792968,91.80200001464844 +99,3.609755516052246,1.1682584375,73.5279998828125,91.81599998779296 +100,3.657796195575169,1.16523125,73.63399996337891,91.82000017089844 +101,3.569818241255624,1.16046578125,73.75399990722656,91.91600009521484 +102,3.62766364642552,1.15534203125,73.86800000976562,92.15999999023437 +103,3.544077685901097,1.14937140625,73.92800006591797,92.08600009033204 +104,3.5154461520058766,1.11958328125,74.30400001220703,92.44999991210938 +105,3.5504840782710483,1.1220875,74.61600006103515,92.41400009277343 +106,3.4753070218222484,1.1120759375,74.76800016845704,92.60200016601563 +107,3.5267016206468855,1.10029203125,74.79000003417968,92.67399990966797 +108,3.444872396332877,1.11157,75.01600021972656,92.60800017089844 +109,3.4604526417595998,1.10123546875,75.36000003662109,92.75999991210938 +110,3.4083507571901595,1.0809575,75.45199990478515,92.8700001953125 +111,3.3957954985754832,1.065394375,75.60800009033203,92.9800001953125 +112,3.3272638150623868,1.06037859375,75.64400016845703,93.13599996337891 +113,3.4289666414260864,1.0603453125,76.15000003417968,93.15600006347657 +114,3.388340336935861,1.04481484375,76.2539999609375,93.1620001171875 +115,3.3944766351154874,1.0165696875,76.64199998291015,93.47000006347656 +116,3.3446701083864485,1.03141125,76.69600002929687,93.43200009033202 +117,3.3022158316203525,1.0210525,76.63599995361328,93.60599998779297 +118,3.3074265718460083,1.017645625,77.00199995605469,93.68400009277343 +119,3.213198951312474,0.99452734375,77.18400003173828,93.70600011474609 +120,3.2595878498894826,0.9907528125,77.4020000830078,93.94799990722656 +121,3.2362237998417447,0.9886540625,77.52999995605468,93.9280000390625 +122,3.153636063848223,0.97030625,77.73000010742187,94.03999990966797 +123,3.1741700853620256,0.9702671875,77.92399989990234,94.0639999609375 +124,3.184590901647295,0.9583778125,78.30200003173829,94.16600001220704 +125,3.116585901805333,0.94181203125,78.68399998046876,94.4280001147461 +126,3.1041476896830966,0.952405625,78.46399998046876,94.32799998779296 +127,3.1573141642979214,0.94671078125,78.60999987548828,94.4000000415039 +128,3.180657318660191,0.92641671875,78.80400003173828,94.5760001171875 +129,3.101477725165231,0.9277609375,78.93800012939452,94.5620001147461 +130,3.069905706814357,0.92532625,79.19000002929687,94.69400016845704 +131,3.1060594660895213,0.9192690625,79.23400010742188,94.7280000366211 +132,2.992018461227417,0.90406015625,79.7039999975586,94.79200016845704 +133,3.000976528440203,0.90748828125,79.68400000244141,94.87200000976563 +134,2.9952284267970493,0.887784375,79.83800000488282,94.9960001147461 +135,2.9843625681740895,0.8879746875,79.90600010498046,94.98200006347656 +136,2.9764948231833324,0.88236796875,80.11400005371094,95.09400001220703 +137,2.939366579055786,0.88322875,80.27399997802735,95.0480001171875 +138,2.916310088975089,0.8718796875,80.26000010742187,95.16400003662109 
+139,2.8832543236868724,0.8712409375,80.50000010986328,95.16800011474609 +140,2.9356773921421597,0.862911875,80.52800015869141,95.25000008789063 +141,2.8584332977022444,0.851140625,80.74800013183594,95.21599998535156 +142,2.907580545970372,0.85194359375,80.95000003173828,95.31199995849609 +143,2.8931364502225603,0.84861859375,80.87800005615235,95.33600000976563 +144,2.8906786952699934,0.844526875,80.96400008300782,95.37000006103516 +145,2.856494903564453,0.8433371875,80.98600010498046,95.4320000341797 +146,2.8453703948429654,0.84349796875,81.0280000024414,95.42200001220704 +147,2.7883094208581105,0.8313503125,81.18400000244141,95.5220000366211 +148,2.82052743434906,0.8334771875,81.28800010742188,95.45600003662109 +149,2.815722806113107,0.83443375,81.30000005371093,95.50399998535157 +150,2.8363174029759,0.82525375,81.26799997802735,95.56000006347656 +151,2.8583740166255405,0.82702125,81.48200010498047,95.57200013916015 +152,2.857897468975612,0.82399515625,81.5140000024414,95.61800006103516 +153,2.807421122278486,0.8223240625,81.50000000244141,95.59400008789062 +154,2.799610444477626,0.81890265625,81.52800005371094,95.61000013916015 +155,2.7337716477257863,0.82232546875,81.63599997558593,95.62000013916015 +156,2.7986813272748674,0.8176771875,81.53799997802734,95.64000013916015 +157,2.7346041883741106,0.81801578125,81.62399995117187,95.64200013916016 +158,2.7378521987370084,0.8184275,81.6320000024414,95.62400013916016 +159,2.7272439684186662,0.8178109375,81.66199989990234,95.63600013916016 diff --git a/CV/timm/exp_results/ViT/base/summary_vit-B_300.csv b/CV/timm/exp_results/ViT/base/summary_vit-B_300.csv new file mode 100644 index 0000000..b8e7525 --- /dev/null +++ b/CV/timm/exp_results/ViT/base/summary_vit-B_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,6.969629015241351,6.8789,0.35,1.4740000004577636 +1,6.7701307364872525,6.092885,2.923999998779297,9.55200001953125 +2,6.520314659391131,5.52451375,6.120000012207031,17.63199996826172 +3,6.309063332421439,5.1006775,10.359999992675782,25.637999995117188 +4,6.200977563858032,4.6926425,14.590000002441407,33.050000032958984 +5,5.966246536799839,4.211550625,19.92799998046875,41.69199997314453 +6,5.886366980416434,3.823265625,25.436000042724608,48.86799998535156 +7,5.636490276881626,3.476820625,30.502000009765624,55.28800012695312 +8,5.441435030528477,3.2040934375,34.54999999145508,60.13599993408203 +9,5.333667687007359,3.0704365625,36.912000010986326,62.384000053710935 +10,5.33064079284668,2.809416875,40.71199998535156,66.99200001953125 +11,5.050014563969204,2.70141125,43.060000002441406,68.93799999023437 +12,5.04610105923244,2.50888125,46.04800005126953,71.58800000488282 +13,4.8556502887180875,2.483266875,47.44400007080078,72.83800002929688 +14,4.818053586142404,2.33415125,49.5940000366211,75.03200020996094 +15,4.764411279133388,2.301555625,50.29400001464844,75.524000078125 +16,4.74158375603812,2.1787871875,51.97400000244141,76.99400014648438 +17,4.725761686052595,2.1432540625,53.04199998046875,77.7640001196289 +18,4.6711210523332864,2.1266396875,54.010000048828125,78.6920000415039 +19,4.646009683609009,2.0637640625,54.994000053710934,79.66799999267577 +20,4.620888267244611,2.0288465625,55.64800001953125,80.16399999023437 +21,4.584996495928083,1.99706625,56.465999943847656,80.57400009033204 +22,4.563190596444266,1.951900625,57.1580001171875,81.28200008789062 +23,4.4821431296212335,1.8932825,57.884000017089846,81.89999993164062 +24,4.455127239227295,1.87278,58.68000009033203,82.47600006347656 
+25,4.406286137444632,1.86544875,58.723999990234375,82.26600011230468 +26,4.310754571642194,1.8313215625,59.26600009033203,82.67000000732422 +27,4.404716219220843,1.7840215625,60.09199994140625,83.1980000366211 +28,4.299009203910828,1.8148903125,59.758000092773436,83.15800011230469 +29,4.417932408196585,1.7890609375,60.447999982910154,83.57200008300781 +30,4.2956497328622,1.75007546875,61.16000005615234,83.83599997802735 +31,4.3164447375706265,1.7511821875,60.878000087890626,83.95799990234374 +32,4.325539588928223,1.7671178125,60.750000036621095,83.98800003662109 +33,4.260358010019575,1.72828484375,61.24400002929688,84.25600013427734 +34,4.185277155467442,1.740508125,61.1720001171875,84.13599995605469 +35,4.1936653682163785,1.7270590625,61.37399996337891,84.41800008789062 +36,4.253177131925311,1.7454315625,61.321999990234374,84.40199992919922 +37,4.308102743966239,1.726250625,61.394000048828126,84.24999998779298 +38,4.234909108706883,1.74166625,61.54800003417969,84.20600000488281 +39,4.306120225361416,1.71573078125,61.391999929199216,84.43199990234375 +40,4.217206188610622,1.71197203125,61.92000005615235,84.634000078125 +41,4.3120207616261075,1.70625515625,61.70000008544922,84.65799997558594 +42,4.250123279435294,1.690705,61.85000001464844,84.68599995361328 +43,4.315731082643781,1.70799375,61.53600005859375,84.68000005615234 +44,4.222789577075413,1.7375190625,61.842000007324216,84.6680000805664 +45,4.26564964226314,1.714016875,61.938000009765624,84.95400005859375 +46,4.3635857445853095,1.6910959375,61.89200005859375,84.74799995361329 +47,4.227936165673392,1.6967903125,62.11200010986328,85.02400005371094 +48,4.254791617393494,1.7004075,62.08599998779297,84.89600003173828 +49,4.355360167367118,1.6895821875,62.13000006835937,84.76999997802734 +50,4.265195778438023,1.7516853125,61.66599998046875,84.66400010742187 +51,4.264554177011762,1.69786328125,62.00400006103516,84.77399995361328 +52,4.327261243547712,1.7127665625,61.6620001171875,84.65600000488281 +53,4.337813939367022,1.72115828125,61.97000008056641,84.78199995361328 +54,4.316329751695905,1.67919625,61.95400005859375,84.94600008789062 +55,4.249390431812832,1.723519375,61.83200000976562,84.64400006347657 +56,4.298370599746704,1.71378625,61.31800004150391,84.59600013427735 +57,4.244845850127084,1.74009125,61.500000036621095,84.44800005615234 +58,4.326196159635272,1.7223190625,61.487999965820315,84.40999990478515 +59,4.326049634388515,1.79321,60.632000007324216,83.9800001123047 +60,4.213467495782035,1.6962178125,61.84800006347656,84.79200003173828 +61,4.179520555904934,1.67279703125,62.441999934082034,85.09599984863281 +62,4.2883595909391135,1.70774703125,62.03399993164062,85.20599997558594 +63,4.212563753128052,1.6541159375,62.82400009277344,85.45000010986328 +64,4.186365791729519,1.7163590625,62.156000139160156,85.09800008300782 +65,4.211346830640521,1.6630990625,62.92400000976563,85.53600000244141 +66,4.345460380826678,1.706836875,63.13999995605469,85.28199992431641 +67,4.232215166091919,1.6551934375,63.3160000756836,85.56600015136719 +68,4.2405494792120795,1.6708703125,62.974000100097655,85.73600003417968 +69,4.184299758502415,1.6155940625,63.50000006103516,86.09000020507813 +70,4.189510720116751,1.636130625,63.55400000732422,85.91000003173828 +71,4.201522128922599,1.60571,63.98600003662109,86.16800002929688 +72,4.148890921047756,1.64055875,63.548000114746095,86.05599995117187 +73,4.201035022735596,1.61706171875,63.74000010498047,85.9760001586914 +74,4.1622961929866245,1.64356109375,63.83399998779297,85.86400013183594 
+75,4.16710318837847,1.60072328125,64.19599997802734,86.190000078125 +76,4.1475348472595215,1.61889765625,64.25200005615234,86.16199997558594 +77,4.158843449183872,1.63870453125,64.10599987304687,86.37799992431641 +78,4.142371841839382,1.597145625,64.36000003173828,86.40400005615234 +79,4.1166762965066095,1.60502796875,64.42400010009766,86.38800015625 +80,4.078348330089024,1.5798840625,64.87199997314453,86.685999921875 +81,4.230167474065508,1.56397234375,64.74600016357422,86.58000002685547 +82,4.071969883782523,1.53146046875,64.90000001953125,86.85000005126953 +83,4.0647357021059305,1.57282546875,64.98799995361328,86.65799997314453 +84,4.239261967795236,1.576009375,64.71600006347656,86.48800005371093 +85,4.046512143952506,1.56221296875,65.11799990234375,86.87000002685546 +86,4.121064526694162,1.5594540625,65.17199989746094,86.90399991943359 +87,3.9984947443008423,1.5599678125,65.36000000976563,86.9120000805664 +88,4.116381389754159,1.56357,65.24000002685547,87.04400015625 +89,3.9988598312650407,1.525839375,65.47000005615234,87.23199997558594 +90,4.018885595457895,1.51683296875,65.566,86.97400004882813 +91,4.1075364010674615,1.5444475,65.75200002929688,87.06599999755859 +92,4.171327829360962,1.55207921875,65.69600000244141,87.11199997558593 +93,4.023313914026533,1.542966875,65.69600006103515,87.15200008056641 +94,4.040976541382926,1.52646296875,65.62000004882813,87.28999986816406 +95,3.9684804337365285,1.55841890625,65.53800002441406,87.0920001538086 +96,3.9950338431767056,1.53197,66.24799995117188,87.56800012939453 +97,4.128159182412284,1.50979484375,66.18000010009766,87.63800002197266 +98,3.992838842528207,1.47391,66.11200015136718,87.66200010253907 +99,4.059381195477077,1.4893584375,66.54200010253906,87.65599997558594 +100,4.133719955171857,1.4780228125,66.442,87.74200010498046 +101,4.041632890701294,1.51560859375,66.54800013183593,87.67999997314453 +102,4.1323743888310025,1.49358578125,66.4760000756836,87.74200018066406 +103,4.0429258687155585,1.48665953125,66.75,87.9560002319336 +104,4.048352837562561,1.47264984375,66.65400016113281,88.054000078125 +105,4.077164786202567,1.4712521875,66.66599989501952,87.98799999511719 +106,4.014335121427264,1.47394609375,67.20400008300781,88.08599997558593 +107,4.067411524908883,1.488710625,67.01800005371094,88.16200002685547 +108,3.9856631415230885,1.490495625,66.98200008056641,87.98000007568359 +109,4.032512954303196,1.49698,67.07999988769531,88.10799999267579 +110,3.985403231212071,1.43890265625,67.46600001953125,88.34000004638672 +111,3.983332174164908,1.4535665625,67.45200002197265,88.40200007324219 +112,3.9212536300931657,1.459085,67.27200002685547,88.27800002441407 +113,4.041322333472116,1.457323125,67.76400010742188,88.45400004882812 +114,4.005600690841675,1.44920921875,67.25399997802734,88.33800010009766 +115,4.040409803390503,1.41558953125,67.61199997558593,88.486000078125 +116,3.9900557483945573,1.45428828125,68.04399999267578,88.66000004882812 +117,3.9569373641695296,1.427186875,67.8319999975586,88.76600004882812 +118,3.972442524773734,1.43617828125,67.71599994628906,88.476 +119,3.8783712216785977,1.4222303125,68.13200005126953,88.59199994140624 +120,3.945246083395822,1.41395203125,68.50200001953125,88.76200007080078 +121,3.9348261015755788,1.42097109375,68.19000015136719,88.83000002197265 +122,3.8721287761415755,1.4167175,67.98600007324218,88.48600005126953 +123,3.8878585951668874,1.387705625,68.44000002685547,88.82600005371094 +124,3.931880303791591,1.38112578125,69.11600007568359,89.17199997070313 
+125,3.8602528401783536,1.38122140625,69.19399997558594,89.41599999511719 +126,3.847030554498945,1.40038890625,68.70000004394531,89.18599994384766 +127,3.9303902047021047,1.38554421875,68.95999997070312,89.22000004882813 +128,3.979553392955235,1.37263125,68.60000007080077,89.22199994384765 +129,3.9104709114347185,1.39174265625,68.886,89.12000007568359 +130,3.868299501282828,1.37742796875,69.01400005126953,89.38400004638672 +131,3.939802203859602,1.387974375,68.72800014648438,89.17399996826173 +132,3.788008655820574,1.3468928125,69.71600004394531,89.63200001708984 +133,3.8497856003897533,1.3657671875,69.5219999633789,89.34199998779297 +134,3.851942607334682,1.3613771875,69.44600002685547,89.54999994384765 +135,3.8319093329565868,1.35302671875,69.64799996582032,89.56000007080078 +136,3.841527921812875,1.33907203125,69.50600004638672,89.69800004638672 +137,3.8174962997436523,1.36149984375,69.75200015625,89.60599994384765 +138,3.7776987893240794,1.32647453125,69.90999996826172,89.81799994140626 +139,3.779037492615836,1.360001875,69.77799994140625,89.7060000732422 +140,3.858560698372977,1.334925625,69.74599997314454,89.55999997070313 +141,3.7706131083624705,1.3348275,69.85999994384765,89.70000009765624 +142,3.8451303924833025,1.354425,69.88000001220703,89.92000014648437 +143,3.827801857675825,1.31508015625,70.50800001464843,90.02399989013672 +144,3.819279636655535,1.30098140625,70.38399999267578,90.11600007080078 +145,3.793690732547215,1.308443125,70.5079999975586,90.10400004638672 +146,3.79205060005188,1.3126325,70.42800012451171,90.0999999975586 +147,3.7342803989137923,1.2962403125,70.72799999023438,90.37200001953126 +148,3.759069698197501,1.30338671875,70.68000002197266,90.24599991699219 +149,3.773835233279637,1.27972859375,71.06800010009766,90.3099999194336 +150,3.7902458224977766,1.30308015625,70.69000001464843,90.22200004638673 +151,3.833420293671744,1.30644046875,70.62199994140624,90.2620000439453 +152,3.829545021057129,1.2752184375,71.1980001953125,90.56400006591797 +153,3.762980546270098,1.2827546875,71.08000010253906,90.56200002197265 +154,3.765869344983782,1.2928425,70.85000004394531,90.53799999267578 +155,3.6597749335425243,1.253755,71.40400012695312,90.62800012451171 +156,3.7653644255229404,1.2428259375,71.46800002197266,90.85600004394531 +157,3.665971670831953,1.26535609375,71.70599999267579,90.76799986328125 +158,3.662470664296831,1.28085375,71.41400001953124,90.76199999267578 +159,3.660918814795358,1.24413703125,71.95200009521484,90.93999996826172 +160,3.651732785361154,1.23913953125,71.85800001220703,90.84800014648438 +161,3.6541929244995117,1.26079765625,72.04000009765625,90.94600007080078 +162,3.6284380640302385,1.254959375,71.92000004150391,90.97400002197266 +163,3.6653095313480923,1.23689171875,72.06400004394531,91.05199994140625 +164,3.5928574800491333,1.21643140625,72.2220000390625,91.23200014648438 +165,3.684508442878723,1.2473825,72.38599996582032,91.02999993896485 +166,3.6942974669592723,1.204405,72.53200002197265,91.2360000415039 +167,3.737000686781747,1.2211703125,72.65800006591797,91.2500000390625 +168,3.639409899711609,1.2167225,72.39999999023438,91.31800006835938 +169,3.6478631666728427,1.20903890625,72.64000004150391,91.39800001708984 +170,3.5844166789736067,1.2037178125,72.71000006591797,91.33400011962891 +171,3.5722510474068776,1.18691359375,72.85599991699219,91.61200009277344 +172,3.6715655667441234,1.2027809375,72.9860000415039,91.5980001977539 +173,3.5886222294398715,1.20322296875,73.31799988525391,91.71600009521484 
+174,3.651749236243112,1.19743671875,73.04200009033202,91.51000001464844 +175,3.640444346836635,1.1770146875,73.50600003662109,91.7240000415039 +176,3.6747266224452426,1.16991953125,73.7160000805664,91.8840000390625 +177,3.5942376341138567,1.16856515625,73.54999997802734,91.77000009033203 +178,3.6450414998190745,1.17093625,73.51600009033203,91.78600004150391 +179,3.5550800391605923,1.18241265625,73.34600000976563,91.67800014648438 +180,3.5467359849384854,1.16648359375,73.8679999633789,92.04199996337891 +181,3.5346290384020125,1.1711478125,73.9060000390625,92.02400006591797 +182,3.549855317388262,1.16440859375,73.89200006103516,92.00000014404297 +183,3.488757542201451,1.13834375,74.21400000732422,92.16600006835938 +184,3.5486171756471907,1.13467140625,74.07599998535156,92.15600006835938 +185,3.527870978627886,1.14563390625,74.24400008789063,92.31800004150391 +186,3.641656960759844,1.147888125,74.40600000732422,92.22600001464843 +187,3.4327066114970615,1.12195578125,74.64799995605469,92.37800001464844 +188,3.4440344912665233,1.1374628125,74.2739998852539,92.24399996337891 +189,3.5416121823447093,1.128978125,74.56999998779297,92.36800014404297 +190,3.4425646918160573,1.10468796875,74.84600006347657,92.49800009277344 +191,3.3866710492542813,1.12815515625,74.77600001220704,92.4780001928711 +192,3.5243432010923113,1.112049375,74.93199998535157,92.6079999609375 +193,3.458450981548854,1.1140653125,74.88200003417968,92.58999993408203 +194,3.5172935724258423,1.10661859375,74.9979999584961,92.81200004150391 +195,3.3919708728790283,1.0790059375,75.4440000341797,92.87799998779298 +196,3.4189445972442627,1.0813096875,75.53599990234375,93.16600000976563 +197,3.4953403643199374,1.08123953125,75.31799997802734,92.85200001220703 +198,3.4433101756232127,1.0640003125,75.66999990722657,92.9319998828125 +199,3.4136417593274797,1.1084825,75.48000001220703,92.84799998535156 +200,3.368078657558986,1.064356875,75.87599995361329,93.06600014404297 +201,3.3975088596343994,1.0529915625,75.77000008789062,93.12799998779298 +202,3.414073722703116,1.048718125,75.8860000805664,93.09199993164063 +203,3.4181587355477467,1.039999375,76.12000003417968,93.26000008789063 +204,3.3536973680768694,1.05147796875,75.97800000488282,93.1780000366211 +205,3.3996514763150896,1.0671740625,76.00800000732421,93.1180000366211 +206,3.2854509013039723,1.04157546875,76.19400008056641,93.25400000976562 +207,3.3939605951309204,1.0417709375,76.20799992919922,93.39599995849609 +208,3.4585547958101546,1.031876875,76.7360001123047,93.36799991210937 +209,3.267347148486546,1.0176971875,76.61599995849609,93.41000009033203 +210,3.2705468790871755,1.021806875,76.78999998291016,93.56000019287109 +211,3.363382646015712,1.02334453125,76.88600003417969,93.56400001464844 +212,3.28737325327737,1.024301875,76.76,93.71399995849609 +213,3.268887758255005,1.0145903125,76.89600000732422,93.67399993652344 +214,3.246021270751953,1.00449375,76.9939999560547,93.73800000732422 +215,3.3066403525216237,1.0040428125,77.26399998291015,93.73800003662109 +216,3.300972972597395,1.016209375,77.22200002685547,93.71200008544922 +217,3.2904276847839355,0.99096859375,77.3560000366211,93.84399990722656 +218,3.254083718572344,0.9851975,77.50800002929688,93.77400000976563 +219,3.3215164116450717,1.00614453125,77.46800006347657,93.92999993408203 +220,3.217401776994978,1.00295546875,77.57199992919922,93.89199993408204 +221,3.183136684553964,0.9760646875,77.79600018554687,94.05600006347656 +222,3.207250254494803,0.98017984375,77.83199990478515,94.1139999584961 
+223,3.1810088668550764,0.9771621875,78.04999997802734,94.12400011230469 +224,3.192355445453099,0.96112828125,78.28000018554687,94.09199990478515 +225,3.176583766937256,0.96966171875,78.17600005371094,94.17200001220704 +226,3.216641868863787,0.97431265625,78.21800010498048,94.18800008789063 +227,3.2392124107905795,0.9575475,78.15200011230469,94.25200001220703 +228,3.094020298549107,0.96108203125,78.412000078125,94.35999998291015 +229,3.042390619005476,0.94788421875,78.44799997802734,94.39000003417969 +230,3.1469024079186574,0.9359265625,78.59599995361329,94.4300001147461 +231,3.0367832354136874,0.9324596875,78.87600002685546,94.51400006103516 +232,3.1701260123934065,0.9365625,78.87000000976562,94.43600008789062 +233,3.1740969930376326,0.92394921875,78.981999921875,94.62600008789063 +234,3.021683692932129,0.93181640625,78.81200012695312,94.58799998291016 +235,3.066072804587228,0.93264640625,78.95200005371093,94.44799998535156 +236,3.029092584337507,0.9182596875,79.23400010986329,94.81799995849609 +237,3.0436436789376393,0.9156940625,79.14999995117188,94.7540000366211 +238,3.0316148485456194,0.91837046875,79.13999997558594,94.71800021728515 +239,3.1059979370662143,0.91385984375,79.19399998046875,94.80999995849609 +240,3.0071086372647966,0.9125759375,79.28199987060547,94.74799998291016 +241,3.076224752834865,0.89396609375,79.62000010742187,94.85999998291015 +242,2.9855558361325945,0.90599921875,79.66800010742188,94.96199998291016 +243,3.02846850667681,0.906094375,79.56800010253906,94.88000008789062 +244,2.9678858518600464,0.8839753125,79.7680000830078,95.02400000976563 +245,3.019363965306963,0.897726875,79.81999998046875,94.89400000976562 +246,2.9393607548304965,0.894104375,79.84200010742188,94.95000003417968 +247,2.954161967550005,0.89661734375,80.11799997558593,95.02600008544921 +248,2.9839111055646623,0.881688125,80.04600010498046,95.10400005859375 +249,2.8732495989118303,0.8762984375,79.99600015625,95.16200000976562 +250,2.926910638809204,0.8640259375,80.362,95.18600013916016 +251,2.9226092100143433,0.8752515625,80.348000078125,95.15000000976562 +252,2.936304875782558,0.87086,80.36000005371093,95.2120001123047 +253,2.893927880695888,0.862070625,80.55800004882812,95.31000000732422 +254,2.914907455444336,0.86268625,80.44600005615234,95.36200000976562 +255,2.9629796062197005,0.8679853125,80.69200012695312,95.32400003417969 +256,2.9341112545558383,0.85994359375,80.904000078125,95.3640001123047 +257,2.8932479109082903,0.856921875,80.853999921875,95.31600019042969 +258,2.8523381778172086,0.85901671875,80.86600015136719,95.40199992919922 +259,2.840928418295724,0.85219765625,80.96600013183594,95.41000016357422 +260,2.7624418565205167,0.848074375,81.03999997802734,95.40000000732422 +261,2.8604295594351634,0.84466109375,80.91399989746094,95.36400008544922 +262,2.836558222770691,0.848165,80.85000005371094,95.38600005859375 +263,2.789436902318682,0.8437884375,81.21199997070312,95.50800010986327 +264,2.8127035924366544,0.841511875,81.3199999975586,95.51599990234375 +265,2.7396664108548845,0.84411375,81.428,95.51200003417969 +266,2.682282737323216,0.836664375,81.16800010253907,95.45400000732423 +267,2.7652451481137956,0.8375390625,81.48000012939453,95.51200000732422 +268,2.7684823785509383,0.83409765625,81.415999921875,95.53999995605469 +269,2.7430005414145335,0.82644015625,81.54600012939453,95.53000011230469 +270,2.7615814208984375,0.826859296875,81.38000005126953,95.56999992919921 +271,2.730608412197658,0.827575625,81.52599997558593,95.62400008544923 
+272,2.743230836732047,0.8226790625,81.64000013427734,95.68600008789062 +273,2.7087179933275496,0.82627703125,81.59800013183593,95.66400013916015 +274,2.680653316634042,0.81819546875,81.693999921875,95.65600000732422 +275,2.655898758343288,0.82363890625,81.71600010253906,95.69400011230469 +276,2.6966289622443065,0.818948125,81.82200005615235,95.69800013916016 +277,2.657309651374817,0.8161665625,81.8339999243164,95.69400013916015 +278,2.722650715282985,0.816388125,81.85400008056641,95.69000016357423 +279,2.7371155534471785,0.81848859375,81.80799997558594,95.73600016357422 +280,2.762461543083191,0.8157059375,81.89800002929688,95.77000011230469 +281,2.6352199826921736,0.809033125,81.97000005371093,95.80400000976563 +282,2.6447329010282243,0.81242375,81.89800002685547,95.74600013916016 +283,2.7577838727406094,0.810340625,81.9660000805664,95.78400006103516 +284,2.6313655035836354,0.8106365625,81.99800002929688,95.73400019042968 +285,2.695423790386745,0.8138815625,81.975999921875,95.81200008544921 +286,2.639449800763811,0.80932921875,82.01800010498047,95.80600013916016 +287,2.6542039769036427,0.81090203125,82.06199995117187,95.74000016357422 +288,2.5680429254259383,0.81421203125,82.14400005371094,95.75600011230469 +289,2.655463797705514,0.80852859375,82.086000078125,95.79600006103516 +290,2.640289000102452,0.80881171875,82.03200005371093,95.80800013916016 +291,2.6459262541362216,0.807845625,82.07800005371094,95.79800011230469 +292,2.615483454295567,0.8087240625,82.0779999243164,95.79400011230469 +293,2.6161666086741855,0.80778140625,82.14400005371094,95.83800011230468 +294,2.5554496560777937,0.80624515625,82.15400005371093,95.80800011230468 +295,2.562917113304138,0.8091565625,82.12000010498046,95.82000011230468 +296,2.634465183530535,0.80806171875,82.18800002685546,95.81200006103515 +297,2.5788906131471907,0.80650515625,82.12200000244141,95.83600006103515 +298,2.6572596175330028,0.80463859375,82.16400000244141,95.8400001123047 +299,2.7056877442768643,0.8056234375,82.1639999243164,95.82000013916016 +300,2.6104579312460765,0.80488515625,82.25199989746094,95.84400013916015 +301,2.6317106655665805,0.804826875,82.17200000244141,95.81400013916016 +302,2.6293483631951466,0.80684,82.18999992431641,95.82400013916016 +303,2.580767955098833,0.80481828125,82.18399992431641,95.84400021728516 +304,2.5847002608435496,0.804366875,82.2060000024414,95.86200021728516 +305,2.5991972514561246,0.80402203125,82.1739999243164,95.84600019042969 +306,2.622671059199742,0.80541546875,82.18399997558593,95.83600006103515 +307,2.5870354686464583,0.804596875,82.2959999243164,95.85400019042969 +308,2.622898374285017,0.80611625,82.20199992431641,95.84400006103516 +309,2.689241988318307,0.80708,82.17600005371094,95.81600006103515 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml new file mode 100644 index 0000000..5bebb3e --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml @@ -0,0 +1,113 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +bn_tf: false +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 150 
+eval_metric: top1 +experiment: '' +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-08 +mixup: 0.2 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small-bs-test- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 1005 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_150.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_150.yaml new file mode 100644 index 0000000..84b7a54 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_150.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 150 +eval_metric: top1 +experiment: wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08 +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.8 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 
+warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml new file mode 100644 index 0000000..8cf5701 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml @@ -0,0 +1,113 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +bn_tf: false +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.05 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: bs4096 +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.02121 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-08 +mixup: 0.2 +mixup_mode: batch +mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_debug: 5 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small-bs-test- +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.0 +resplit: false +resume: '' +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 1005 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 80 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/args_vit-s_300.yaml b/CV/timm/exp_results/ViT/small/args_vit-s_300.yaml new file mode 100644 index 0000000..6703113 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/args_vit-s_300.yaml @@ -0,0 +1,111 @@ +aa: rand-m7-mstd0.5-inc1 +amp: true +apex_amp: false +aug_repeats: 0 +aug_splits: 0 +batch_size: 256 +bce_loss: true +bias_decay: false +bn_eps: null +bn_momentum: null +channels_last: false +checkpoint_hist: 2 +clip_grad: null +clip_mode: norm +color_jitter: 0.4 +cooldown_epochs: 10 +crop_pct: null +cutmix: 1.0 +cutmix_minmax: null +data_dir: /dataset/common/imagenet-raw +dataset: '' +decay_epochs: 100 +decay_rate: 0.1 +dist_bn: reduce +drop: 0.0 +drop_block: null +drop_connect: null +drop_path: 0.1 +epoch_repeats: 0.0 +epochs: 300 +eval_metric: top1 +experiment: e300-wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08-bce +gp: null +hflip: 0.5 +img_size: null +initial_checkpoint: '' +input_size: null +interpolation: '' +jsd_loss: false +local_rank: 0 +log_interval: 50 +log_wandb: false +lr: 0.015 +lr_cycle_decay: 0.5 +lr_cycle_limit: 1 +lr_cycle_mul: 1.0 +lr_k_decay: 1.0 +lr_noise: null +lr_noise_pct: 0.67 +lr_noise_std: 1.0 +max_grad_norm: 0.0 +mean: null +min_lr: 1.0e-05 +mixup: 0.8 +mixup_mode: batch 
+mixup_off_epoch: 0 +mixup_prob: 1.0 +mixup_switch_prob: 0.5 +model: deit_small_patch16_224 +model_ema: false +model_ema_decay: 0.9998 +model_ema_force_cpu: false +momentum: 0.9 +native_amp: false +no_aug: false +no_prefetcher: false +no_prox: false +no_resume_opt: false +num_classes: null +opt: adan +opt_betas: +- 0.98 +- 0.92 +- 0.99 +opt_eps: 1.0e-08 +output: ./exp_results/deit-small +patience_epochs: 10 +pin_mem: false +pretrained: false +ratio: +- 0.75 +- 1.3333333333333333 +recount: 1 +recovery_interval: 0 +remode: pixel +reprob: 0.25 +resplit: false +resume: null +save_images: false +scale: +- 0.08 +- 1.0 +sched: cosine +seed: 42 +smoothing: 0.1 +split_bn: false +start_epoch: null +std: null +sync_bn: false +torchscript: false +train_interpolation: random +train_split: train +tta: 0 +use_multi_epochs_loader: false +val_split: validation +validation_batch_size: null +vflip: 0.0 +warmup_epochs: 60 +warmup_lr: 1.0e-08 +weight_decay: 0.02 +workers: 10 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv new file mode 100644 index 0000000..f926d13 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv @@ -0,0 +1,171 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7026468387671879,6.984695,0.082,0.46 +1,0.058770897665194104,6.9116125,0.106,0.526 +2,0.007911681064537593,6.91631375,0.1,0.508 +3,0.007928581509206976,6.6319525,0.8200000009155274,3.075999998779297 +4,0.007686727081558534,6.04628,3.808000006713867,11.112000046386719 +5,0.007372890499287418,5.32454875,8.540000014648438,21.948000028076173 +6,0.007114177669531533,4.822300625,13.483999995117188,30.73600007080078 +7,0.006873541445072208,4.345096875,18.62599999267578,38.88200006591797 +8,0.006686848022841981,3.96510875,23.81200007446289,46.102000041503906 +9,0.006508722136329327,3.558395,28.93600006591797,52.72000012207031 +10,0.006406569953209588,3.319361875,32.69800003173828,57.07200003662109 +11,0.006091187142633966,2.980923125,37.60000006103515,62.92200005371094 +12,0.005965045220883829,2.7659815625,41.365999982910154,66.40400014648438 +13,0.005911312930818115,2.6131971875,44.04200001953125,69.21200000732422 +14,0.005739207878442747,2.4368365625,46.747999990234376,71.56399995117188 +15,0.005549260514921376,2.315486875,49.20399995605469,73.99400002685547 +16,0.005579812019797308,2.2251215625,50.674000087890626,75.37799997802735 +17,0.005359911625938756,2.1386896875,52.33400008789062,76.75399994384766 +18,0.0053312217351049185,2.047164375,53.998000007324215,78.22200001953125 +19,0.005294654119227614,1.98766875,55.05999994628906,78.90800022460938 +20,0.005293804021286113,1.928628125,56.43800004882812,79.80200014648437 +21,0.005101200592304979,1.892119375,56.980000048828124,80.62000014892578 +22,0.005065899142729384,1.830405625,58.26800009765625,81.35800006347657 +23,0.005011323107672589,1.7809378125,59.08200006835938,82.07200008789063 +24,0.005041455550651465,1.7671984375,59.573999936523435,82.28799998779297 +25,0.005057569460145065,1.72392078125,60.189999987792966,82.9780000390625 +26,0.005011343031323382,1.697203125,60.678000061035156,83.40599985351562 +27,0.004803977141688977,1.6762446875,61.38600006835937,83.77799998291016 +28,0.004737243688266192,1.6374884375,61.80200000976563,84.30000013427734 +29,0.0048476228756564,1.61066359375,62.402000092773434,84.65000003173829 +30,0.004830248554104141,1.60866765625,62.512000166015625,84.74799990722656 +31,0.004853829142770597,1.6123709375,63.006000034179685,84.83400006103516 
+32,0.004932305309921503,1.5929675,63.099999912109375,85.05400000976563 +33,0.004792891841913972,1.568394375,63.276000119628904,85.31999998291016 +34,0.004692332952150277,1.58934375,63.035999912109375,85.20200000244141 +35,0.004577582768563714,1.54651234375,63.73400003417969,85.60999985351563 +36,0.004556031598310385,1.5503490625,63.59800005371094,85.50600002685547 +37,0.0046822375152260065,1.52474796875,64.22400008789063,85.93000013427735 +38,0.004657128559691566,1.55069484375,63.546000217285155,85.62999992675782 +39,0.004746380395122937,1.52210296875,64.32800003417968,85.91000010742188 +40,0.004707724354895098,1.5206871875,64.42799998291015,86.0240000830078 +41,0.004603428766131401,1.506674375,64.54200005859374,86.05000005371093 +42,0.004603030060284904,1.50159671875,64.60199995361329,86.00200010742188 +43,0.0047432629591120145,1.5179125,64.27400005859376,85.91999986816407 +44,0.004725775448605418,1.51297625,64.40400008789062,86.0840000024414 +45,0.004635986472879138,1.51435421875,64.39799995605469,86.12600018310548 +46,0.004731553606688976,1.51975109375,64.57999998291015,85.91800005859375 +47,0.004743808514571616,1.52170140625,64.19400000732422,86.22800018066407 +48,0.0046771604434720105,1.51519671875,64.61600005615234,86.22799995361328 +49,0.004706535787720766,1.4999521875,64.70200003173828,86.35599994873047 +50,0.004842441262943404,1.52595046875,64.25200006347656,86.07999998046876 +51,0.004725219449028373,1.5043696875,64.45800010986328,86.2180000024414 +52,0.004687858246532934,1.5141671875,64.41200015869141,86.05400005371094 +53,0.004687787432755742,1.548313125,63.66999997802734,85.65200002685548 +54,0.0047174037899822,1.52720578125,64.31400008300781,86.20199995117187 +55,0.004637726915201971,1.52093953125,64.1680000366211,86.0380000390625 +56,0.004833232844248414,1.52520125,64.17599990966796,86.19600010742188 +57,0.004762610686676843,1.52058671875,64.0140000366211,86.07000000488281 +58,0.004648298191438828,1.52946125,64.14999998779297,85.98200005859375 +59,0.0046070771225328955,1.5326153125,64.15800011474609,86.00000005371093 +60,0.004567398789471814,1.38797765625,67.20599997558594,87.92 +61,0.004383251969037312,1.36050390625,67.4280001586914,88.1979999975586 +62,0.004411891967590366,1.35785125,67.754000078125,88.27799996826172 +63,0.004354702425189316,1.377858125,67.62200020019532,88.17399994384766 +64,0.004435187638072031,1.338603125,68.13600005126953,88.3960001538086 +65,0.0044293701316096955,1.36339078125,67.78399989501953,88.31600004882813 +66,0.004406826853352998,1.3349828125,68.20400002441406,88.53000010009765 +67,0.004341115642871175,1.34182421875,68.06400010498047,88.522000078125 +68,0.00440527665029679,1.3345721875,68.53000009765626,88.51799996826172 +69,0.00445441366173327,1.31747984375,68.80199995117188,88.88400004882813 +70,0.0045289295459432265,1.31268578125,69.02600005126953,88.89000002685547 +71,0.004412627180239984,1.31370578125,68.77999997558594,88.77800002197266 +72,0.00447040267421731,1.2932334375,69.15000004638672,89.27399997314453 +73,0.004431776136958173,1.3080253125,68.917999921875,88.95799999755859 +74,0.004366434345554028,1.29256109375,69.2580001196289,89.16399991699218 +75,0.004449943878820964,1.27832703125,69.34399997558593,89.37399991699219 +76,0.004414989829196462,1.27328578125,69.49399997070313,89.35799989013672 +77,0.004385375467661236,1.2588965625,69.97000009765625,89.67000014404297 +78,0.004234651502754007,1.252145625,70.06600004882813,89.61200001953125 +79,0.004134277879659619,1.2481928125,70.17599991699218,89.73199991455078 
+80,0.004376141986410532,1.2435940625,70.29000009521485,89.80399999023437 +81,0.004350113000587693,1.25145546875,70.16599994384765,89.71599991699219 +82,0.0041788803480033365,1.23727890625,70.4640000390625,89.90800012207032 +83,0.004163048130327037,1.22594515625,70.7480000390625,90.1040001196289 +84,0.004177262308076024,1.220285,70.80000002441406,90.10800001953125 +85,0.004356617806479335,1.2096825,71.06799990966798,90.28000017333984 +86,0.004136818195027965,1.20626265625,71.00800006835938,90.45400009277344 +87,0.004320669054452862,1.1960878125,71.44600004394532,90.39599999267578 +88,0.004225688005265381,1.1825890625,71.57000004394531,90.68199999267578 +89,0.004137393403133112,1.18721875,71.5899999633789,90.6060000415039 +90,0.004129843686574272,1.1719803125,71.85000006835938,90.6959999609375 +91,0.004141489758954516,1.1739284375,71.73799994628907,90.95400009521484 +92,0.004072970527756427,1.15733859375,71.9239999584961,90.9779999609375 +93,0.004200898892512279,1.16603765625,71.97399991455079,90.74799993652344 +94,0.004249856541199344,1.1445459375,72.57999996337891,90.96800004150391 +95,0.004225575564695256,1.14039734375,72.46000006835938,91.07400001708984 +96,0.004005532079775419,1.131825,72.8800000390625,91.18599999023438 +97,0.004071374174340495,1.1173815625,73.15000009521485,91.3600000439453 +98,0.004082076717168093,1.122508125,72.95799993164063,91.48199988525391 +99,0.004027015063911676,1.1120584375,73.32999996337891,91.45199999267578 +100,0.004073423183789211,1.1090534375,73.0780001171875,91.6000000390625 +101,0.004202060867100954,1.10801828125,73.33600004394532,91.67000009765626 +102,0.004041028491753552,1.08485265625,73.82200001220703,91.8820000415039 +103,0.004068882670253515,1.08951828125,73.75799998779297,91.70599988769531 +104,0.004002831843016403,1.0773490625,73.9340000390625,91.92600014648437 +105,0.0039905716798135215,1.06997796875,74.22000014160156,91.96800014404297 +106,0.0039710661263338155,1.07303515625,74.20199993164063,92.1240000390625 +107,0.004002945015339979,1.0502475,74.67599990722657,92.20000014404297 +108,0.003870416233049972,1.04936078125,74.8459999584961,92.24200006835937 +109,0.00415139301081321,1.049835,74.65800008544922,92.3300000390625 +110,0.00399666149834437,1.0420678125,75.02599999023437,92.40200004150391 +111,0.004025361367634365,1.03837640625,75.07799998779296,92.33199998779297 +112,0.0038562153931707144,1.03210140625,75.1619999609375,92.61000009277343 +113,0.004027474771386811,1.018221875,75.49000013671875,92.70800006347656 +114,0.003928569860623351,1.010503125,75.47600003417969,92.82799998779296 +115,0.0037325743386255844,1.0082990625,75.82800008789063,92.87200016845703 +116,0.003692587238869497,0.99984109375,75.9919999584961,92.90200006347656 +117,0.003773627004453114,0.993427578125,75.82000000732423,93.03400008789062 +118,0.003733301069587469,0.98609140625,76.09799993164063,93.04599998779297 +119,0.0037581040690253887,0.98603875,76.28400003173829,93.15600011474609 +120,0.003757303347811103,0.9739171875,76.39200014160156,93.22600003662109 +121,0.003823640407063067,0.96441375,76.72800016113281,93.3140001147461 +122,0.0038722677688513485,0.95932984375,76.80000005859375,93.4180000366211 +123,0.0037423527599977596,0.95374203125,76.90800008056641,93.4300001147461 +124,0.003713275771588087,0.95002859375,77.07799997558594,93.65599998535156 +125,0.0037526132738483803,0.94809203125,77.04999998046875,93.59000001220703 +126,0.003595086995379201,0.9410465625,77.21200008789063,93.65400016601562 
+127,0.0037954666851354496,0.93331484375,77.58000000732422,93.78200016601562 +128,0.0036314339875908835,0.927356484375,77.58400000732422,93.86399990722656 +129,0.0036980636484388795,0.92431375,77.68600000488281,93.90999993408204 +130,0.003769875886583967,0.91506,77.9819999560547,94.0260001171875 +131,0.003566112119837531,0.90513046875,78.06400000732422,94.11600006347656 +132,0.003662788059695491,0.910395078125,78.0900000024414,94.13200021728515 +133,0.003594044263341597,0.9083621875,78.20199993164063,94.2340000366211 +134,0.003604894254489669,0.8975578125,78.35000010498047,94.22800000976562 +135,0.003663198523489492,0.892152578125,78.53599992675781,94.25000011230469 +136,0.0036279520552073207,0.89069171875,78.59200018310547,94.34800006103515 +137,0.0035507999127730727,0.885088671875,78.67200000488282,94.42800006103515 +138,0.003624363453127444,0.88099734375,78.83999992675781,94.41600008789062 +139,0.003613233383345817,0.88451125,78.92200000488282,94.43399993408202 +140,0.003535857324355415,0.873215546875,79.06199995117187,94.45999993164062 +141,0.0036269916953252895,0.872527578125,79.16999995117187,94.59400006103516 +142,0.003528143628500402,0.86826640625,79.25200000732421,94.56400006103516 +143,0.0034625070568706307,0.86327984375,79.30600000488282,94.64400013916016 +144,0.003433352885102587,0.85692328125,79.4280000805664,94.72600008789063 +145,0.0035239099857530425,0.86083546875,79.59600013427735,94.71600008789062 +146,0.003434724856301078,0.855509765625,79.66200005371094,94.77000003417969 +147,0.003357212558122618,0.855286328125,79.71399989990235,94.84200006103515 +148,0.003438713012396225,0.8520646875,79.78599995117187,94.8499999584961 +149,0.0034021507848852445,0.8481296875,79.88000002929688,94.8320000366211 +150,0.003489113911720259,0.84722625,79.9060000805664,94.82800006103515 +151,0.003307607523830874,0.843561953125,79.97799987548828,94.86000008789063 +152,0.0034429498482495546,0.843726328125,79.99799997802734,94.9439999584961 +153,0.0033417041413486004,0.842510546875,79.99800005615235,94.85000000976562 +154,0.003366844529019935,0.842011796875,80.04599989990234,94.90800013916015 +155,0.0034030966427443282,0.8417021875,80.05599989990235,94.91600000976563 +156,0.003337076399475336,0.841655,80.16599997802734,94.94600008789062 +157,0.0034864412487617563,0.8409853125,80.12999992675782,94.94600008789062 +158,0.0033682246659217136,0.8408253125,80.10799992675781,94.94800013916016 +159,0.003300395860735859,0.840429453125,80.10799992675781,94.95800013916016 +160,0.0034828968678734134,0.840489453125,80.10999992675781,94.95200013916016 +161,0.0033458996497626814,0.840411171875,80.10999992675781,94.95600013916015 +162,0.0033955154607870747,0.840509453125,80.10799992675781,94.95400013916016 +163,0.0034967419258984072,0.840469453125,80.11199992675782,94.95800013916016 +164,0.003338477507765804,0.840429453125,80.11399992675781,94.95600013916015 +165,0.0033603642701304386,0.840461171875,80.10799992675781,94.95600013916015 +166,0.003346549197366195,0.84041703125,80.11199992675782,94.95800013916016 +167,0.003418706906294184,0.840461171875,80.10799992675781,94.95200013916016 +168,0.0034616739389353563,0.840401171875,80.10999992675781,94.95600013916015 +169,0.003319057735747525,0.840421171875,80.10799992675781,94.95600013916015 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_150.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_150.csv new file mode 100644 index 0000000..cd3c530 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/summary_vit-s_150.csv @@ -0,0 +1,162 @@ 
+epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7088104273591723,6.98578625,0.076,0.4399999984359741 +1,0.0593021409586072,6.9105975,0.1,0.488 +2,0.007911931656833206,6.91798,0.098,0.534 +3,0.007876356664512838,6.6090775,0.7639999987792969,3.085999990234375 +4,0.007697046135685274,6.09368625,3.602,10.651999990844727 +5,0.007457644079944917,5.40907375,8.25999998046875,21.33200000854492 +6,0.007227104323516999,4.87317,13.316000041503907,30.212000043945313 +7,0.007069527537428907,4.469114375,17.83600005859375,37.67800001220703 +8,0.0068350267330450675,4.03336875,22.974000017089843,45.019999990234375 +9,0.006735124126342791,3.7360025,26.911999973144532,50.64200002685547 +10,0.0065133661098246065,3.4029515625,31.460000067138672,56.11799998535156 +11,0.006383622730416911,3.11413875,36.14199999267578,61.25599998046875 +12,0.006272536403100405,2.925640625,39.16599985107422,64.40200005371094 +13,0.006175674231989043,2.7787234375,41.937999921875,67.25399999023438 +14,0.006054158921220473,2.6632184375,44.03199999755859,69.6120000390625 +15,0.005941766081377864,2.490314375,46.57800006835937,71.7799999609375 +16,0.005736711734373655,2.3612234375,48.838000092773434,73.87800000732422 +17,0.005751167856422918,2.2801615625,50.37600013183594,75.05000002685547 +18,0.005688209020133529,2.2064034375,51.42600002685547,76.07400020019531 +19,0.0056528631996895585,2.11119125,53.12000000488281,77.693999921875 +20,0.005559766764885613,2.078486875,54.07400008544922,78.16200002929688 +21,0.0055256913349564585,2.02706875,54.952000029296876,79.01000000976562 +22,0.005470881537933435,1.982676875,55.996,79.79399999023437 +23,0.005492086954680937,1.9340134375,56.625999997558594,80.45800017578125 +24,0.005311453382351569,1.8639765625,57.63000004638672,81.27000006347656 +25,0.005362782394513488,1.8542515625,58.23199999267578,81.61199993896484 +26,0.0051889723898576835,1.82206859375,58.56200001220703,81.97000013916016 +27,0.005191617146400469,1.8077596875,59.058000065917966,82.1180000390625 +28,0.00539108006549733,1.76982375,59.733999992675784,82.49199993652344 +29,0.005350109356056366,1.75589625,59.6940001171875,82.95000000976563 +30,0.0051864461108509985,1.7444865625,60.152000092773434,83.12800008789063 +31,0.005111382908320853,1.7377140625,60.58999988769531,83.37200003662109 +32,0.005090781321216907,1.70070625,60.926000063476565,83.86600009033204 +33,0.0051537183046873125,1.697153125,61.247999968261716,84.04599993652344 +34,0.005152960141588535,1.69067796875,61.30999995849609,84.1339999560547 +35,0.005137387929218156,1.67412921875,61.55600004394531,84.29999990478515 +36,0.005153708858415484,1.6652390625,61.874000063476565,84.37599989990234 +37,0.005256490149934377,1.6635028125,62.1420000390625,84.33800000976562 +epoch,train_loss,eval_loss,eval_top1,eval_top5 +38,0.005088782769494823,1.63648578125,62.50000005859375,84.98400003417969 +39,0.005024506510900599,1.60348984375,62.58800013916016,85.09200006103515 +40,0.005171889306179115,1.62893296875,62.43400001220703,84.78599992675781 +41,0.00501966945427869,1.6393809375,62.182000063476565,84.9300000805664 +42,0.005019068584910461,1.6146625,63.00200009033203,84.96200003173828 +43,0.005028320310105171,1.610436875,62.552000107421875,85.07799995361329 +44,0.005035558216539877,1.603858125,62.870000009765626,85.24400005615234 +45,0.0051212664028363565,1.61266875,62.57800005371094,85.0480000830078 +46,0.005013669574899333,1.58476109375,62.805999936523435,85.38800003173829 +47,0.00513466597268624,1.60628,62.746000041503905,85.16800013427735 
+48,0.005029742705768773,1.60754875,62.69999995605469,85.11400005615235 +49,0.005068301722141249,1.60245984375,63.012000014648436,85.22000013427734 +50,0.005102636824761119,1.59829046875,62.91400001953125,85.22400008789063 +51,0.00508713665684419,1.60935359375,62.944000063476565,85.19000018554688 +52,0.005092570458405784,1.62416609375,63.045999956054686,85.21000006103516 +53,0.005107233600158777,1.60259453125,62.84600011230469,85.3239999267578 +54,0.004963167610445193,1.62584015625,62.63200004150391,85.03599997558594 +55,0.005057706331302013,1.603459375,62.89600008544922,85.3440000024414 +56,0.005091265742001789,1.60293578125,62.752000063476565,85.15000005859375 +57,0.005120393088353532,1.59812140625,63.020000061035155,85.27399992675781 +58,0.005047764762171677,1.62439109375,62.720000036621094,85.08199998046875 +59,0.005100339318492583,1.6105346875,62.749999982910154,84.96400005371093 +60,0.00489781451012407,1.461613125,65.8520000024414,87.048000078125 +61,0.004919247635241065,1.44690765625,66.18199997070313,87.34000010253907 +62,0.00476340061452772,1.41814203125,66.62199994140624,87.67599994628907 +63,0.004819929466715881,1.4123228125,66.75000002441406,87.91599994384765 +64,0.004664965140234146,1.400436875,67.19600012939453,87.9180000756836 +65,0.004728357174566814,1.39647765625,67.11599997558594,87.95000015380859 +66,0.0049002468253352815,1.38950890625,67.18599995117188,88.1239999975586 +67,0.004868564462023122,1.38209890625,67.43199994140625,88.11400010009766 +68,0.00472947655777846,1.37768984375,67.502,88.29799997070313 +69,0.0046727384241031745,1.3800528125,67.54400002197265,88.31799994384765 +70,0.004654625364180122,1.358063125,67.92600010253906,88.59000007568359 +71,0.004676780397338527,1.34764734375,68.39599994384766,88.79199999267578 +72,0.004702951027346509,1.3553028125,68.19200002685547,88.68800010253906 +73,0.0046924852566527465,1.33750328125,68.48599989746094,88.95200001953125 +74,0.004712799996403711,1.32582234375,68.80199999511719,88.89799999511719 +75,0.0048501147289893454,1.327525,68.80200002441406,88.93000002441406 +76,0.0047676527007882085,1.3018096875,69.15399994140625,89.15400007324219 +77,0.00481278362816998,1.3033221875,69.2420000415039,89.29799989013672 +78,0.004725964540349585,1.2860696875,69.49800007080079,89.53000012207032 +79,0.004516901980553355,1.2861878125,69.61000001953126,89.42199997070313 +80,0.004539829911664128,1.2745015625,69.77400001953124,89.53799994140626 +81,0.004735531651281885,1.269680625,69.93200001953124,89.53599989257812 +82,0.004495503480679223,1.2706703125,70.01799997070313,89.58400004882813 +83,0.004645188538623708,1.24389453125,70.34400017089844,89.99799993896484 +84,0.004592442519164511,1.253758125,70.1959999658203,89.82199997314453 +85,0.004540879412421158,1.23293046875,70.69799996582032,90.16799996582031 +86,0.0046499134706599375,1.23806453125,70.8520001171875,90.11600001953126 +87,0.004522715928032994,1.216608125,70.97000004394532,90.38200009521485 +88,0.004548228744949613,1.21265484375,71.22600014160156,90.31400009765625 +89,0.004482994155426111,1.19394625,71.35399998779297,90.5680001977539 +90,0.00460372755437025,1.18898640625,71.69999994140625,90.69400007080078 +91,0.0045170816925487346,1.18815234375,71.54599999023438,90.59600004150391 +92,0.004452806664630771,1.1944325,71.74200006347657,90.64400014404296 +93,0.004470930102148226,1.17952640625,71.97399998046875,90.85000001220703 +94,0.004519084235653281,1.1689840625,72.18800008544922,90.80399998779296 +95,0.004441033882488098,1.1511928125,72.30200016845703,91.02200006835938 
+96,0.004507575084322265,1.13273125,73.06600013671876,91.2280001196289 +97,0.004393214426402535,1.1395334375,72.67200000976563,91.32800006835937 +98,0.0044600961929453274,1.13176390625,72.93400001464843,91.3420001196289 +99,0.004350347677245736,1.12184359375,73.11000014160156,91.57600009277344 +100,0.004354501043313316,1.1171884375,73.28800006103516,91.6200000415039 +101,0.0043542285981987205,1.109035,73.32200009033203,91.71199986328125 +102,0.0044115336744913036,1.10323703125,73.56599995849609,91.8140000415039 +103,0.0043475014556731495,1.09522,73.8040000341797,91.8679999633789 +104,0.004279967563759003,1.07913765625,74.18800013671876,92.07800006591796 +105,0.004298488759169621,1.07189796875,74.11799988037109,92.08200001464844 +106,0.0043106886358665565,1.0691628125,74.25399992919922,92.29599993408203 +107,0.004302483650722674,1.0528375,74.71400006103515,92.38399998779298 +108,0.004286574815133852,1.052930625,74.78800005859375,92.34199998779297 +109,0.004312934420470681,1.039460625,74.91800000488281,92.5140001147461 +110,0.004205309669487178,1.0343,75.10400001220704,92.6020001171875 +111,0.004196903435513377,1.031161875,75.1299999584961,92.58799998535156 +112,0.004206354929400342,1.0310475,75.30400006103515,92.59400006591797 +113,0.004175113913203988,1.01361875,75.68399995361328,92.81600000976563 +114,0.004100877741750862,1.01069421875,75.83800018554687,92.9179999609375 +115,0.0040066726173141175,0.997464375,75.9339998779297,93.0339998828125 +116,0.00419412087649107,0.9867975,76.1880000341797,93.24999985595703 +117,0.004112885084136256,0.986059375,76.17600018798828,93.11800006347656 +118,0.004120496783538589,0.97795734375,76.47800013427734,93.30400000976563 +119,0.004059170971491507,0.969184375,76.74600011230469,93.32400014160156 +120,0.00405424738502396,0.96552375,76.85999995117187,93.3660000390625 +121,0.0040235080689724,0.9608075,77.030000078125,93.51000008789063 +122,0.004061605897732079,0.950822578125,76.9779999243164,93.67200006347656 +123,0.004020468215458095,0.943777890625,77.22800010742188,93.67000000976563 +124,0.004036017577163875,0.9434803125,77.14199994384765,93.77000003417969 +125,0.004021217980022941,0.938650078125,77.34199997558594,93.76799993164063 +126,0.003956006823240646,0.923268828125,77.82200008300781,93.97000008789063 +127,0.00397626292293093,0.9259103125,77.79600018310546,93.9640001147461 +128,0.0039319154041420135,0.91582265625,77.75799990478515,94.0440000366211 +129,0.003862393304838666,0.913869296875,78.03200002441406,94.07600000976562 +130,0.0039001869902546915,0.901537578125,78.20000018066406,94.08600000976563 +131,0.0038433120353147388,0.8982784375,78.3039998461914,94.24600000976562 +132,0.0038704367554081337,0.8959540625,78.38600012939453,94.2480000366211 +133,0.003911821055226028,0.8925590625,78.47600010498047,94.30200008789062 +134,0.0039027773642114232,0.88617625,78.67799997802734,94.44400008789063 +135,0.003936674761851984,0.883285546875,78.73000008544922,94.3879999584961 +136,0.0038679074329723206,0.880095859375,78.98600002685546,94.44399998291016 +137,0.003974030193473611,0.878472109375,78.96200000488281,94.42399998535156 +138,0.003772691740388317,0.873896171875,78.98399997558593,94.58800003417969 +139,0.0039038029998274787,0.87008015625,79.19400025878906,94.5519999584961 +140,0.0037787892256996463,0.865694140625,79.24800005126953,94.65600000976562 +141,0.003886453907138535,0.86606859375,79.28600008056641,94.5779999584961 +142,0.0038702244803841624,0.862626875,79.29200015625,94.63000008789062 
+143,0.0038205020495557357,0.8608634375,79.3959999243164,94.62800000976563 +144,0.0038622134597972035,0.86000546875,79.36399995117188,94.67600008789063 +145,0.0037754822988063097,0.858419609375,79.458000078125,94.64599990722657 +146,0.0038074126600154807,0.85653265625,79.47600002685547,94.67800006103515 +147,0.0037796468074832645,0.85589296875,79.49800018066406,94.7039999584961 +148,0.003756655995467944,0.85510296875,79.54800015625,94.68800000976563 +149,0.0038007416961980717,0.854795390625,79.50400002685546,94.71000000976562 +150,0.0037753373284691145,0.85447609375,79.55800015625,94.66200000976562 +151,0.0038181487803480457,0.854808515625,79.54400002685547,94.69600000976563 +152,0.0038581541240481393,0.85495265625,79.57000012939453,94.69599995849609 +153,0.0038464674559820977,0.85448265625,79.61000015625,94.68400000976563 +154,0.0038058483374438117,0.8545775,79.57600002685547,94.70600000976563 +155,0.003683173995731132,0.854753359375,79.57600018066407,94.69200000976562 +156,0.0037975836977628724,0.85438921875,79.56800015625,94.72000000976563 +157,0.003804615482554904,0.854045078125,79.55200012939453,94.70400000976562 +158,0.0037559159598978503,0.85405609375,79.60600020751953,94.71600000976562 +159,0.003807129604475839,0.854081953125,79.59000015625,94.71599990722656 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv new file mode 100644 index 0000000..434da21 --- /dev/null +++ b/CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7044260501861572,6.985015,0.088,0.452 +1,0.0969994724728167,6.91954625,0.1,0.502 +2,0.007983872550539672,6.909185,0.1,0.542 +3,0.007909782463684678,6.91218625,0.1,0.51 +4,0.007911908673122525,6.90983625,0.164,0.6220000016784668 +5,0.007871961453929543,6.70395875,0.5879999995803833,2.2040000076293946 +6,0.007742398825939745,6.261975,2.1440000048828125,7.538000052490235 +7,0.0075181673164479434,5.76455625,5.369999989013672,14.986000063476563 +8,0.007321218785364181,5.29548125,8.772000046386719,22.36600001098633 +9,0.007111855666153133,4.853555,13.030000030517579,29.66600005493164 +10,0.006843417999334633,4.459345,17.36600002685547,36.45599997558594 +11,0.006791550025809556,4.12680875,21.54399997314453,42.666000051269535 +12,0.006450463959481567,3.7919925,26.05999996459961,48.98600003417969 +13,0.006386950903106481,3.51486375,29.708000098876955,53.302000068359376 +14,0.006161959143355489,3.275209375,33.4859999621582,57.85200018798828 +15,0.005905966216232628,3.0013975,37.48999988769531,62.220000068359376 +16,0.0059155591879971325,2.8378715625,40.1099999584961,65.31599997070313 +17,0.00580364337656647,2.67033625,43.25800007324219,68.38600009033203 +18,0.005784340493846685,2.54994375,45.37000004638672,70.3460000390625 +19,0.005498647165950388,2.3923209375,47.886000122070314,72.7140000048828 +20,0.005400298163294792,2.3392971875,49.22999998779297,73.68799997802735 +21,0.005284862301778048,2.2200078125,51.148000134277346,75.39200002929688 +22,0.0052393467631191015,2.2018809375,51.76199992675781,76.0319999975586 +23,0.005337623995728791,2.06775625,54.09399992431641,77.97599994873048 +24,0.005264734965749085,1.9937253125,55.292,79.01200001708985 +25,0.005198978411499411,1.9435671875,56.252000021972655,79.83599996337891 +26,0.005091208382509649,1.910471875,56.89400001953125,80.4799998828125 +27,0.005193100310862064,1.84141171875,58.048000166015626,81.1960000366211 
+28,0.004887306306045502,1.79133453125,59.0100001171875,81.94200009033203 +29,0.004921669606119394,1.74195515625,59.95999993896484,82.50400003417968 +30,0.00497161119710654,1.7719084375,59.681999990234374,82.49599990722656 +31,0.004979399731382728,1.7319225,60.844000092773435,82.97799988037109 +32,0.004785400058608502,1.64407359375,61.781999877929685,84.00999998535156 +33,0.004833479702938348,1.64234125,62.18799998779297,84.17799998291015 +34,0.004748103907331824,1.62056109375,62.69000000488281,84.48600005859375 +35,0.004687358741648495,1.57935859375,63.360000087890626,85.16400005859376 +36,0.004569473152514547,1.5563034375,63.79000008544922,85.58800013671875 +37,0.004761328222230077,1.54370125,63.81800003417969,85.52800001464844 +38,0.004718770156614482,1.52213484375,64.40800006591797,85.90199997802735 +39,0.004768356215208769,1.51102453125,64.6120000366211,86.04000013427735 +40,0.0047006625682115555,1.50302859375,64.84799997802735,86.34000010742187 +41,0.004745179205201566,1.48255546875,65.20000005859374,86.34799995361328 +42,0.004688572196755558,1.474654375,65.36199998046875,86.4659999243164 +43,0.004641626437660307,1.4526140625,65.774000078125,86.92200002685547 +44,0.004737225652206689,1.44859515625,65.90600008300781,86.77000010253906 +45,0.004549883306026459,1.43138296875,66.2820000805664,87.08999997558594 +46,0.004568657139316201,1.4419103125,66.08,86.97400012939453 +47,0.00449184695025906,1.4311253125,66.21400005859375,87.23800018310547 +48,0.004698772158008069,1.43329,66.484,87.15599995117188 +49,0.004482330055907369,1.416868125,66.71999997802735,87.39999995361327 +50,0.004466343321837485,1.40876625,66.59600003417968,87.35800015869141 +51,0.004509880149271339,1.42795875,66.515999921875,87.28400012939453 +52,0.004421624355018139,1.4103325,66.96200002685546,87.4540000756836 +53,0.004459893825696781,1.40280203125,67.00600000488281,87.694000078125 +54,0.004646702029276639,1.42107328125,66.31000005615235,87.36199984619141 +55,0.004660098522435874,1.40078390625,66.97999994873047,87.64400015869141 +56,0.004636922327335924,1.40150484375,67.00999997314453,87.54800015380859 +57,0.004505249846260995,1.3814259375,67.38599999511719,87.78999989501953 +58,0.004583987290970981,1.39887203125,66.95400008300781,87.64400000488281 +59,0.00445454369764775,1.3692078125,67.46200002441407,88.01799994873046 +60,0.004361059225630015,1.3869778125,67.11800007080078,87.80800005126953 +61,0.004440771474037319,1.3606525,67.41000005126953,88.12400005126953 +62,0.004636758414562792,1.39476765625,67.29600005615234,87.77800003173829 +63,0.004537882923614234,1.38170265625,67.14600000244141,87.98000002685546 +64,0.004445640544872731,1.4132978125,66.766,87.50000010009765 +65,0.004597496357746422,1.3838809375,67.57400002441406,87.97399996826172 +66,0.004552507074549794,1.3813365625,67.304,87.90600010009766 +67,0.004598443454597145,1.404621875,67.20599981445312,87.72800004882812 +68,0.004736300674267113,1.369383125,67.59199991943359,88.01800002197265 +69,0.004423990612849593,1.3800675,67.385999921875,88.02800010253907 +70,0.004376259341370314,1.375864375,67.59799997070313,88.063999921875 +71,0.004542485869023949,1.3974696875,67.25599991699218,87.80600002197265 +72,0.004551234073005617,1.38990765625,67.202,87.82200002441407 +73,0.004510169732384384,1.39546359375,67.02400010009765,87.88600002929688 +74,0.004502174386288971,1.3995421875,66.8800000024414,87.71800010498048 +75,0.004468847764655948,1.38606765625,67.2239999975586,87.90200002197265 +76,0.0044153109774924815,1.38774046875,67.27800010253907,88.01200010253906 
+77,0.0045601483434438705,1.376890625,67.42200004882812,88.01800007568359 +78,0.004543673770967871,1.3869546875,67.29800007568359,87.92600002685546 +79,0.004438304342329502,1.39851296875,67.14399997070312,87.66800002441406 +80,0.004577811749186367,1.31768515625,68.536000078125,88.84199996826172 +81,0.004345477733295411,1.333046875,68.45199994628906,88.6480000756836 +82,0.004302715067751706,1.32025875,68.541999921875,88.93399996826172 +83,0.004509917518589646,1.3180740625,68.74399991699218,88.9799999975586 +84,0.004441541852429509,1.29395828125,69.16400005126953,89.25600002197265 +85,0.0045066027087159455,1.2853759375,69.141999921875,89.24200010009766 +86,0.004326534806750715,1.2793428125,69.49200007324218,89.23199996826172 +87,0.004294669663067907,1.36364703125,67.84400005126953,88.23799997314453 +88,0.004422786645591259,1.2834696875,69.44999997070312,89.29600002441406 +89,0.004403424798510969,1.28864765625,69.30399994140625,89.11400004882813 +90,0.004319502564612776,1.28498984375,69.38199994628906,89.25600002197265 +91,0.004208565398585051,1.2912790625,69.4160000415039,89.10199996826172 +92,0.0044140288373455405,1.30274,69.19799999511719,89.04800001953124 +93,0.004330580122768879,1.31306734375,68.91400002685548,88.88199997314453 +94,0.004385846754303202,1.2693425,69.68000001953125,89.57000007080079 +95,0.0044691963703371584,1.29532671875,69.619999921875,89.28600004638672 +96,0.004273455881047994,1.2526528125,70.02000009277344,89.62800004638672 +97,0.004275580518878996,1.2551025,70.17200007080078,89.62199996582031 +98,0.004168680345173925,1.2476459375,70.2620000439453,89.69599997070313 +99,0.004312307632062584,1.25005265625,70.04200002197265,89.70800001708984 +100,0.004151400818955153,1.2461659375,70.03999999511718,89.72400004394531 +101,0.004309043200919405,1.2391,70.36400012695313,89.92399999511719 +102,0.004157550341915339,1.22961453125,70.57400012207032,89.94399997070313 +103,0.004356609890237451,1.24146640625,70.50399991699219,89.85400001708985 +104,0.004298999905586243,1.227674375,70.76399998779297,90.17600006835937 +105,0.00430909771239385,1.2422475,70.44799996582032,89.9480000756836 +106,0.004445596074219793,1.22154328125,70.83000002441406,90.21400007324219 +107,0.00423419097205624,1.21691765625,70.83599999511719,90.18800004638672 +108,0.004210363578749821,1.213593125,71.11400001708985,90.26200007080078 +109,0.00412930449238047,1.20937359375,71.05799998779297,90.20600001708985 +110,0.0042115405085496604,1.22068828125,70.8900000439453,90.17599996582031 +111,0.00418245664332062,1.2140925,70.8720000439453,90.29600002197266 +112,0.004309441312216222,1.22103734375,71.18000001464844,90.14400006591796 +113,0.0042236754088662565,1.19859234375,71.22199989257813,90.38400007080078 +114,0.004432787478435785,1.23499390625,70.92599997070313,90.13999996826172 +115,0.0041279447614215314,1.1996665625,71.40400001708984,90.31399991699219 +116,0.0042328120325692,1.200145,71.30199989013671,90.44000001708984 +117,0.004356413439381868,1.2043246875,71.31400015625,90.41000007324219 +118,0.004225354467052966,1.212551875,71.19400001464844,90.07199999511718 +119,0.004277279193047434,1.19622109375,71.25399997070312,90.31799999511719 +120,0.004156066454015672,1.18837296875,71.68400017578125,90.45000014648437 +121,0.004125220933929086,1.1742715625,71.88199993896484,90.68599991210938 +122,0.004092424380360171,1.17817953125,71.87799991699218,90.69000006835938 +123,0.004256274143699557,1.18016703125,71.61000009521484,90.67000014648437 +124,0.0042632205004338175,1.1741175,71.82400002197265,90.88000001708984 
+125,0.004216796689433977,1.179620625,71.8520000415039,90.77600012207031 +126,0.004339889041148126,1.173100625,71.86399999267579,90.83200006835938 +127,0.0042471098131500185,1.17969546875,71.81000006835937,90.75400001708984 +128,0.004166796104982495,1.155326875,72.28800001464843,90.86999999267579 +129,0.00414535662275739,1.1584121875,72.21200003662109,90.84199991210937 +130,0.004148415551753715,1.14498046875,72.60400009033204,90.93599993896484 +131,0.0042910316260531545,1.16384421875,72.20800009521484,90.78599996582031 +132,0.0040841237641870975,1.17078765625,72.27400009765626,90.73199996582031 +133,0.004122275277040899,1.15785671875,72.15199996582031,91.04800001953124 +134,0.0042202716576866806,1.139366875,72.58799993896484,91.17399996826173 +135,0.004182497912552208,1.149436875,72.56800004150391,91.06200004150391 +136,0.004343954788055271,1.14087421875,72.87200014160156,91.15600009765625 +137,0.004005039401818067,1.13817484375,72.76200008789063,91.28799993896484 +138,0.004071355739142746,1.13199,72.97399998535157,91.16200009521485 +139,0.00414348577032797,1.132341875,73.01999996337891,91.24399993896485 +140,0.0039712024736218154,1.1301503125,72.95399999511719,91.27600007080078 +141,0.004159792559221387,1.134925625,73.14199998779297,91.28000004150391 +142,0.004030889453133568,1.12694734375,73.06800001464843,91.26800006835937 +143,0.0040782393189147115,1.12359453125,72.86000001708985,91.48399983886719 +144,0.003809310757787898,1.12580953125,73.17400008789062,91.37000007080078 +145,0.003980578243499622,1.1145928125,73.1079999609375,91.3860001196289 +146,0.004046716610901058,1.11903015625,73.20999988525391,91.30800001464844 +147,0.004106871841941029,1.1137225,73.3460000390625,91.53399991455078 +148,0.003906197816831991,1.10643140625,73.63199998535156,91.68200006835937 +149,0.004178202943876386,1.10409578125,73.54999998535156,91.6900001171875 +150,0.00415997754316777,1.09609484375,73.84400001220703,91.62600009521485 +151,0.003989629592979327,1.09465078125,73.63000006835938,91.76400006835938 +152,0.00396822375478223,1.10190734375,73.51400012207031,91.68000016845703 +153,0.003976788342697546,1.09611203125,73.81400002197266,91.72199999023438 +154,0.004158306081080809,1.09639421875,73.66799993652344,91.80200019775391 +155,0.004064109758473933,1.08795421875,74.01999996582032,91.8660000439453 +156,0.004117032280191779,1.0851290625,74.0259998828125,91.8880000415039 +157,0.0039173789555206895,1.08440359375,73.87799995849609,91.80399998779296 +158,0.003975894884206355,1.08112921875,73.8619999609375,91.83800006835938 +159,0.004066657216753811,1.083331875,74.13599999023438,91.88200001708984 +160,0.004081751016201451,1.07107125,74.42000001220703,91.9859999633789 +161,0.003981336456490681,1.07378015625,74.17199985595703,91.9720000415039 +162,0.003877143404679373,1.06280453125,74.5020000366211,92.22400011474609 +163,0.004046246845973656,1.08194265625,74.23800003417969,91.9560000390625 +164,0.003967240249039605,1.06394828125,74.41600006347656,92.2520000415039 +165,0.004025593894766644,1.06165015625,74.64000000732422,92.18999998779297 +166,0.003986912866821513,1.0660328125,74.38600001220703,92.11399996582031 +167,0.003991044999565929,1.063330625,74.50000009033204,92.16400004394531 +168,0.0039660760085098445,1.0574934375,74.60000001708984,92.29800004150391 +169,0.0038451424334198236,1.04937484375,74.78200001464843,92.34000001464844 +170,0.00406502527766861,1.045675,74.56399998535156,92.56799988525391 +171,0.0038427552208304405,1.0538271875,74.69800006347656,92.2440000439453 
+172,0.004004607035312802,1.0508121875,74.89399998779297,92.32400004150391 +173,0.003920168557669967,1.03681453125,75.0600001171875,92.54000009277344 +174,0.0037876375718042254,1.03353546875,75.05800005859375,92.5699999609375 +175,0.0039634802378714085,1.03495625,75.12800010986328,92.4180000415039 +176,0.003979514702223241,1.02824296875,75.19799989990234,92.64400006591796 +177,0.0039475191733799875,1.03462125,75.27599995605469,92.57400004150391 +178,0.003931994579033926,1.02431625,75.34600003417968,92.65600001464844 +179,0.0038644576852675527,1.02364328125,75.30000000732421,92.70000009277344 +180,0.004069518763571978,1.021311875,75.40200000244141,92.7040001147461 +181,0.0038787248195149004,1.0157940625,75.4380000390625,92.80000001708984 +182,0.0038980625104159117,1.01297875,75.60400006103515,92.78600004150391 +183,0.0038413635338656604,1.01237828125,75.65799998046874,92.83199998779297 +184,0.0038715062255505472,1.001800625,75.8019999584961,92.97600024902344 +185,0.0037820974830538034,1.00903609375,75.73799990234374,92.93200011474609 +186,0.0039416955260094255,1.00611484375,75.94000008544921,92.97599998779297 +187,0.004028597992146388,1.00629703125,75.76399995605469,92.94800006591797 +188,0.0040608441340737045,0.9983109375,76.03400006347657,93.05399998779296 +189,0.0038682857411913574,0.99529265625,76.10000005859375,93.01600009033203 +190,0.0038625796150881797,0.9906225,76.3139999584961,93.20400006591797 +191,0.003907167032593861,0.99719609375,76.07200003173828,92.99600016845703 +192,0.003970563324401155,0.98546515625,76.22399998291016,93.12800011962891 +193,0.0037146424292586744,0.98384734375,76.23200005859375,93.1819998852539 +194,0.0038293678080663085,0.980125625,76.29599997558594,93.23399993652343 +195,0.003857848176266998,0.9789878125,76.48399992675782,93.21600014404297 +196,0.003651000588433817,0.9828165625,76.30399995117187,93.18599990966797 +197,0.003931012062821537,0.974150625,76.47400006103516,93.2300001171875 +198,0.0037729314935859293,0.971365625,76.54600000732422,93.2560001147461 +199,0.003835154144326225,0.962321875,76.75400010986328,93.34400006591797 +200,0.0037835679831914604,0.97207484375,76.69200006347656,93.4340000366211 +201,0.0036511396756395698,0.9725315625,76.69000003417969,93.28200001220704 +202,0.0035157224046997726,0.96689546875,76.70999995361328,93.40600013916016 +203,0.0037986902752891183,0.96763328125,76.93200002929687,93.41600006591797 +204,0.0037802516599185765,0.96447734375,76.83799990234375,93.46799998779296 +205,0.00364596422878094,0.95649390625,77.00600002929687,93.54799985839844 +206,0.004014570848084986,0.95046234375,77.07600008544922,93.56600014404297 +207,0.003854787297314033,0.952626875,77.16199995361328,93.54800001464844 +208,0.003810833120951429,0.95615046875,77.11800005371094,93.5760001171875 +209,0.0036461960698943585,0.94509078125,77.31000010742187,93.59200014160156 +210,0.0036390326858963817,0.93478578125,77.606,93.7100000390625 +211,0.003728658310137689,0.940257734375,77.36200002441406,93.70999993408203 +212,0.003695755498483777,0.934088046875,77.38800015869141,93.8620001171875 +213,0.00374451614334248,0.93637125,77.55400008056641,93.7719999609375 +214,0.0037858944560866803,0.9297875,77.57800008056641,93.73599990722656 +215,0.003787133755395189,0.929144453125,77.604000078125,93.8679999584961 +216,0.0037237268406897783,0.92438484375,77.86200003173828,93.7780000390625 +217,0.0038397773751057684,0.9217740625,77.89799988037109,94.01000011474609 +218,0.003692085068905726,0.9237778125,78.078000078125,93.8559999609375 
+219,0.0037746465823147446,0.92401,77.9880001586914,93.95799998535156 +220,0.003497931669699028,0.91463734375,78.17200003417969,94.03000001220703 +221,0.0035741630708798766,0.91683484375,78.10400008300782,94.02600009033203 +222,0.0037327913742046803,0.91270625,78.18000021240235,94.0819999584961 +223,0.003703387745190412,0.907024765625,78.2820000805664,94.11999998779297 +224,0.0035485914850141853,0.90498890625,78.47000013183593,94.14200001220703 +225,0.0035215062380302697,0.90400015625,78.47200010498047,94.1780000390625 +226,0.003617745591327548,0.901161171875,78.52600002685547,94.1599999609375 +227,0.003756721707759425,0.902990625,78.46400000244141,94.20800016601562 +228,0.0035172457282897085,0.898592890625,78.47000010498047,94.24400009033204 +229,0.003436287835938856,0.89840953125,78.6879999243164,94.25600014160156 +230,0.0035909943107981235,0.8956084375,78.79399997558593,94.21800001220703 +231,0.003540566220181063,0.8909171875,78.638000078125,94.29199998535157 +232,0.003629441751400009,0.892078515625,78.7260001586914,94.26399985595702 +233,0.0036266729002818465,0.8912621875,78.62400000488282,94.3320000390625 +234,0.0036107241467107087,0.88978859375,78.85000020751953,94.35600008789062 +235,0.003551934292772785,0.89019984375,79.11600000244141,94.36199990966797 +236,0.003503760090097785,0.88300671875,79.01200000244141,94.46399998535156 +237,0.003436240862356499,0.875820546875,79.03000018066406,94.56399993408203 +238,0.00340579726616852,0.879975703125,79.24400002929687,94.4900000366211 +239,0.0035596858360804617,0.870110234375,79.32800018310547,94.57400006347656 +240,0.0034809598000720143,0.87433234375,79.23400002441406,94.59199998779297 +241,0.0034409927029628307,0.873230625,79.2940000805664,94.57199998535157 +242,0.003514723590342328,0.86742953125,79.39000007568359,94.68400009033203 +243,0.003626737539889291,0.86634640625,79.39000008056641,94.74800014160157 +244,0.0036246690433472395,0.87029609375,79.3940000805664,94.70000008789063 +245,0.003732266020961106,0.87053484375,79.45600008300781,94.74200009033203 +246,0.0034438550064805895,0.86946375,79.6140000024414,94.72600009033204 +247,0.003378850087756291,0.86259046875,79.56600005615235,94.7200001147461 +248,0.0035159428080078214,0.8622246875,79.65200000244141,94.7580000366211 +249,0.0034493720449972898,0.85940671875,79.846,94.71400003662109 +250,0.003627320984378457,0.859804375,79.86799995361328,94.77600013916016 +251,0.003383415110874921,0.85597328125,79.78600005615235,94.87200008789063 +252,0.0033759995421860367,0.861890625,79.68400005615234,94.86600016845703 +253,0.003541645623045042,0.8537678125,79.80800003173829,94.86200009033203 +254,0.0036422949051484466,0.85650078125,79.91000005371093,94.81799993408202 +255,0.003406544477911666,0.854969375,79.98199992919922,94.78199993652343 +256,0.0033522049780003726,0.8533825,80.05600005615234,94.96200001220703 +257,0.0034389470529276878,0.8473271875,80.10600008300781,94.93000014160157 +258,0.0033865342556964606,0.84619328125,80.18000008300781,94.9299999584961 +259,0.003447333292569965,0.84689046875,80.1759999243164,94.96200006347657 +260,0.0032964720739983022,0.85028234375,80.15400003173828,94.96200003662109 +261,0.0034078260068781674,0.84569546875,80.26399992431641,95.0580000366211 +262,0.003493778232950717,0.84492140625,80.29800000244141,95.03200016601562 +263,0.003361152426805347,0.8409953125,80.38600010498047,94.9840000366211 +264,0.003216014476493001,0.8484996875,80.2539999243164,95.02400001220703 +265,0.0033724562090355903,0.84592109375,80.42600002929687,95.06999998535156 
+266,0.003271011490141973,0.845681875,80.36600010986328,95.04200009033202 +267,0.00325453162076883,0.84505921875,80.45000010742187,95.08800006347656 +268,0.003399371402338147,0.84084203125,80.54600021240235,95.13200006347657 +269,0.003290993539849296,0.84360375,80.48200000244141,95.03600011474609 +270,0.003396770596737042,0.843845,80.66800008300781,95.04000009033203 +271,0.0034957354655489326,0.8456625,80.51999995117187,95.11200008789062 +272,0.0033322387316729873,0.8467834375,80.5320001586914,95.06799998535156 +273,0.0032145300647243857,0.843624375,80.62000005371094,95.14200008789062 +274,0.0032572261116001755,0.83734109375,80.67399997802734,95.1919999609375 +275,0.0032410602434538305,0.8366184375,80.61999998046875,95.23799998535156 +276,0.0032682681048754603,0.83492609375,80.75799989990234,95.13600006347656 +277,0.003225842461688444,0.8364571875,80.77199997802734,95.26400014160156 +278,0.003334630251629278,0.83932546875,80.70800020996094,95.24000001220703 +279,0.0031491983390878886,0.8327865625,80.80600005371093,95.22600003662109 +280,0.0032511689059901983,0.8352259375,80.79400003173828,95.24200001220703 +281,0.003383599338121712,0.83481140625,80.80199997802734,95.24200001220703 +282,0.003208352194633335,0.83299375,80.82000005615234,95.28400006347657 +283,0.00338284092140384,0.83310734375,80.90600002929688,95.24800011474609 +284,0.0033502599399071187,0.8340803125,80.87400010742188,95.21000006347656 +285,0.0031701632833573967,0.83368890625,80.93800005615235,95.27200006347657 +286,0.0031970099080353975,0.8329978125,80.93600013427735,95.29400001220704 +287,0.003307257400592789,0.83367359375,81.02200005615235,95.29600006347657 +288,0.0032536371145397425,0.83250046875,80.9960000805664,95.22600006347656 +289,0.0030824737914372236,0.83073984375,81.0500000805664,95.2640001147461 +290,0.003224483778467402,0.83288328125,81.01600010742187,95.24000006347656 +291,0.0033304091775789857,0.8298646875,81.03000013427734,95.31200016601562 +292,0.0032747428049333394,0.83119640625,81.06000018554687,95.2640000366211 +293,0.003150499804178253,0.83247578125,81.04200005615235,95.27000016601562 +294,0.003187613532645628,0.83180609375,81.09000005615235,95.24800011474609 +295,0.0032579210528638214,0.83188265625,81.07800005615235,95.29000011474609 +296,0.003274351533036679,0.83074125,81.12400013427734,95.29000011474609 +297,0.0031184881809167564,0.83056609375,81.10000013427734,95.29200003662109 +298,0.0031399513536598533,0.83104609375,81.07200013427735,95.2900000366211 +299,0.003196138044586405,0.83090609375,81.09400013427734,95.2840000366211 +300,0.003246222360758111,0.83078609375,81.08800013427734,95.28000003662109 +301,0.0032320290338248014,0.83078609375,81.08800013427734,95.28000003662109 +302,0.0031107992690522224,0.83080609375,81.09200013427734,95.2760000366211 +303,0.0032131875632330775,0.83076609375,81.09400013427734,95.2880000366211 +304,0.003161734639434144,0.83076609375,81.09600013427735,95.2840000366211 +305,0.00320960785029456,0.83088609375,81.09200013427734,95.28000003662109 +306,0.0031181383528746665,0.83080609375,81.10000013427734,95.28000003662109 +307,0.0033232175337616354,0.83096609375,81.09600013427735,95.2780000366211 +308,0.0031707440793979913,0.83080609375,81.09400013427734,95.2820000366211 +309,0.0033417781232856214,0.83084609375,81.08800013427734,95.28600003662109 diff --git a/CV/timm/exp_results/ViT/small/summary_vit-s_300.csv b/CV/timm/exp_results/ViT/small/summary_vit-s_300.csv new file mode 100644 index 0000000..de0b724 --- /dev/null +++ 
b/CV/timm/exp_results/ViT/small/summary_vit-s_300.csv @@ -0,0 +1,311 @@ +epoch,train_loss,eval_loss,eval_top1,eval_top5 +0,0.7088104273591723,6.98578625,0.076,0.4399999984359741 +1,0.0593021409586072,6.9105975,0.1,0.488 +2,0.007911931656833206,6.91798,0.098,0.534 +3,0.007876356664512838,6.6090775,0.7639999987792969,3.085999990234375 +4,0.007697046135685274,6.09368625,3.602,10.651999990844727 +5,0.007457644079944917,5.40907375,8.25999998046875,21.33200000854492 +6,0.007227104323516999,4.87317,13.316000041503907,30.212000043945313 +7,0.007069527537428907,4.469114375,17.83600005859375,37.67800001220703 +8,0.0068350267330450675,4.03336875,22.974000017089843,45.019999990234375 +9,0.006735124126342791,3.7360025,26.911999973144532,50.64200002685547 +10,0.0065133661098246065,3.4029515625,31.460000067138672,56.11799998535156 +11,0.006383622730416911,3.11413875,36.14199999267578,61.25599998046875 +12,0.006272536403100405,2.925640625,39.16599985107422,64.40200005371094 +13,0.006175674231989043,2.7787234375,41.937999921875,67.25399999023438 +14,0.006054158921220473,2.6632184375,44.03199999755859,69.6120000390625 +15,0.005941766081377864,2.490314375,46.57800006835937,71.7799999609375 +16,0.005736711734373655,2.3612234375,48.838000092773434,73.87800000732422 +17,0.005751167856422918,2.2801615625,50.37600013183594,75.05000002685547 +18,0.005688209020133529,2.2064034375,51.42600002685547,76.07400020019531 +19,0.0056528631996895585,2.11119125,53.12000000488281,77.693999921875 +20,0.005559766764885613,2.078486875,54.07400008544922,78.16200002929688 +21,0.0055256913349564585,2.02706875,54.952000029296876,79.01000000976562 +22,0.005470881537933435,1.982676875,55.996,79.79399999023437 +23,0.005492086954680937,1.9340134375,56.625999997558594,80.45800017578125 +24,0.005311453382351569,1.8639765625,57.63000004638672,81.27000006347656 +25,0.005362782394513488,1.8542515625,58.23199999267578,81.61199993896484 +26,0.0051889723898576835,1.82206859375,58.56200001220703,81.97000013916016 +27,0.005191617146400469,1.8077596875,59.058000065917966,82.1180000390625 +28,0.00539108006549733,1.76982375,59.733999992675784,82.49199993652344 +29,0.005350109356056366,1.75589625,59.6940001171875,82.95000000976563 +30,0.0051864461108509985,1.7444865625,60.152000092773434,83.12800008789063 +31,0.005111382908320853,1.7377140625,60.58999988769531,83.37200003662109 +32,0.005090781321216907,1.70070625,60.926000063476565,83.86600009033204 +33,0.0051537183046873125,1.697153125,61.247999968261716,84.04599993652344 +34,0.005152960141588535,1.69067796875,61.30999995849609,84.1339999560547 +35,0.005137387929218156,1.67412921875,61.55600004394531,84.29999990478515 +36,0.005153708858415484,1.6652390625,61.874000063476565,84.37599989990234 +37,0.005256490149934377,1.6635028125,62.1420000390625,84.33800000976562 +38,0.005222479480185679,1.65173234375,62.20800006103516,84.648 +39,0.005251926835626364,1.63313015625,62.53800000244141,84.9480001586914 +40,0.005172699357249907,1.62525328125,62.58600004150391,84.86999993164062 +41,0.004970315178590161,1.6189990625,62.66999993652344,84.85200000488281 +42,0.004989458713680506,1.6085684375,63.08999998535156,85.12400005615234 +43,0.0052113881268139395,1.61122203125,62.83200006103515,85.20600000488281 +44,0.004982101365125605,1.61612,62.63399995605469,85.08999997802735 +45,0.0051351250149309635,1.59424671875,62.9319999609375,85.26199997558594 +46,0.005090858754036682,1.615645625,62.298000012207034,85.04000009033203 +47,0.005041462934709021,1.6103778125,62.868000163574216,85.03800008300782 
+48,0.005173007891114269,1.62044953125,62.80199998046875,85.2100000024414 +49,0.005058950777830822,1.595099375,62.86000011474609,85.27200005859375 +50,0.005100807940055217,1.600473125,62.80199993408203,85.1840000805664 +51,0.005033610355375069,1.58446359375,63.045999907226566,85.38199997558594 +52,0.005180339362206203,1.6024678125,62.946000009765626,85.21599995361328 +53,0.005104479579521077,1.58289453125,62.96600009033203,85.42200013916016 +54,0.005021374972004976,1.6025959375,62.80600016601562,85.30599995361328 +55,0.005073556743030038,1.60291203125,62.93400006347656,85.36799998291016 +56,0.005158487640853439,1.6101865625,62.609999965820315,85.11199992431641 +57,0.005101768133629646,1.6161715625,62.609999963378904,85.00000008300782 +58,0.005178365357486265,1.603305,62.91400003173828,85.12200016113282 +59,0.0050634183654827735,1.625264375,62.71600003173828,85.022000078125 +60,0.005082057423091361,1.56725828125,63.44800000732422,85.69399989746094 +61,0.0050182118679263765,1.56564359375,63.832000056152346,85.810000078125 +62,0.00500678809891854,1.5584078125,63.872000053710934,85.83000008300782 +63,0.004991384588980249,1.54848390625,63.90199997802734,85.92000010742187 +64,0.005057041134153094,1.54323890625,64.27800003173829,86.02400008544922 +65,0.005009956025917616,1.570284375,63.900000036621094,85.86200023925781 +66,0.004924139373802713,1.529154375,64.41200008544922,86.16600010498047 +67,0.004968045678521905,1.52668625,64.45400008300781,86.314000078125 +68,0.00501549882548196,1.54637953125,64.18199994628907,86.06799994628906 +69,0.005025971581095031,1.51549640625,64.45600005126953,86.40200005126952 +70,0.005019565312457936,1.5235915625,64.53800000244141,86.47599987060546 +71,0.005023190851456353,1.5237465625,64.57600013183594,86.47199987304687 +72,0.00493933616339096,1.5230834375,64.73199994873048,86.44000002685547 +73,0.004933323544849243,1.5054390625,64.89200008300782,86.49800016113281 +74,0.0049628756075565305,1.52728390625,64.57800010742187,86.49999987304687 +75,0.004948072434802141,1.51232859375,64.85399995117187,86.543999921875 +76,0.0048303686281932256,1.52575109375,64.89600000244141,86.41599994873047 +77,0.004780515329912305,1.47931625,65.19199997802734,86.89199994873047 +78,0.004977641993069223,1.48489359375,65.35600005615234,86.79800002929687 +79,0.004913635186052748,1.49372359375,65.28599992919922,86.68200005615235 +80,0.004917722561263612,1.4772965625,65.56600000976563,86.98600002685546 +81,0.004869485540049416,1.462256875,65.76,87.00200013183594 +82,0.004857529919328434,1.4718278125,65.48000002685546,87.06199989746094 +83,0.004841296434668558,1.4812390625,65.68200012695313,87.07199997314453 +84,0.004901787831581065,1.46519921875,65.88999995117187,87.14199989990234 +85,0.0048849687445908785,1.45171453125,66.12199997314453,87.18400002685547 +86,0.004908482444339565,1.46072125,65.798,87.24200007568359 +87,0.004881637370479959,1.44158515625,66.44599997558593,87.32600015625 +88,0.004839210604716625,1.44029125,66.1840001586914,87.31400005126953 +89,0.004845336212643555,1.43849734375,66.24600002685547,87.5639999975586 +90,0.004818427609279752,1.4360671875,66.23799998046874,87.4899999975586 +91,0.004752129615683641,1.43512328125,66.30399999511718,87.3679999975586 +92,0.004836552809657795,1.41703296875,66.68800015380859,87.75199981445313 +93,0.00478907478308039,1.4329465625,66.48200004882813,87.63600004638671 +94,0.004824953453083124,1.42722390625,66.40000004882812,87.70200009765625 +95,0.004850512669820871,1.42813828125,66.79999997802734,87.7480000024414 
+96,0.004876219467925174,1.43117953125,66.73400005126953,87.61599989257813 +97,0.004927711095660925,1.41865015625,66.74200005126953,87.89799994873047 +98,0.004851346236786672,1.40063,66.94000010742188,87.91400010498047 +99,0.004951916402205825,1.4060365625,66.91800007324218,87.90400015136719 +100,0.0047612665221095085,1.4090628125,66.87200010253906,87.78399997314453 +101,0.004887008507336889,1.401068125,67.07000002441406,88.0020000756836 +102,0.0047703505759792665,1.39104484375,67.23199989746094,88.02400012695313 +103,0.00487110427846866,1.3881403125,67.48200005371093,88.22999989501953 +104,0.004896886247609343,1.4029371875,67.3679999975586,88.10799997070312 +105,0.0048172368468450645,1.3930609375,67.28200001464843,88.07999999267578 +106,0.004833530435072524,1.38458515625,67.55199989746093,88.27199989501953 +107,0.0047719960233994895,1.394876875,67.36400005126953,88.13600001953125 +108,0.004808439041620919,1.3727871875,67.73999994140625,88.23200007324219 +109,0.004759473027661443,1.36038953125,67.80200002441406,88.47999994140625 +110,0.004733743013015815,1.37357859375,67.75000002685547,88.37600007568359 +111,0.004777830792590976,1.35628453125,67.966,88.52199996826172 +112,0.004750983955870781,1.36778015625,67.9559999194336,88.42800004638671 +113,0.004782901012471744,1.351753125,67.78600002685548,88.489999921875 +114,0.004859128035604954,1.368960625,67.9739999975586,88.39600005126952 +115,0.004820822006357568,1.348521875,68.20200010253906,88.6520001538086 +116,0.004778401726590735,1.345480625,68.45600007080078,88.76800004882813 +117,0.004646568293018001,1.340895,68.28199997314454,88.82000004638672 +118,0.004756226736520018,1.34797265625,68.43599994140625,88.7539999194336 +119,0.004734682850539684,1.34323171875,68.42199996826172,88.77000009765625 +120,0.0047223693358578855,1.3437465625,68.40399999511719,88.8380001538086 +121,0.00478786273327257,1.3476125,68.59599994628907,88.65000007324218 +122,0.004672181892341801,1.3225603125,68.87000006591796,88.85399996582031 +123,0.004665981911654983,1.3343040625,68.63799999511718,88.82599997070312 +124,0.0047178248475704876,1.33354453125,68.7540000415039,89.07199994140625 +125,0.004726118374882,1.32187890625,68.65200004638672,88.8800000756836 +126,0.004768286133185029,1.319091875,68.96800012451172,89.16199996826172 +127,0.004736629148413028,1.31102640625,68.91800007324218,89.17600004882813 +128,0.004740432536761675,1.30669734375,69.19400009765624,89.07200002197266 +129,0.004662513433556471,1.30038671875,69.16200002197266,89.32599999267578 +130,0.004642459863264646,1.3016015625,69.35200004882813,89.14799997070313 +131,0.004651597135567239,1.29885984375,69.18000006835938,89.18999981445313 +132,0.004526541002893022,1.28210890625,69.70399996582032,89.41400004882813 +133,0.004782311518543533,1.30472515625,69.34399999511719,89.38200009765625 +134,0.004684346761288387,1.2839378125,69.53999986328125,89.42399996826173 +135,0.004732772075970258,1.2874309375,69.61800012207031,89.59200004638672 +136,0.004705433168315462,1.28186265625,69.52399994140625,89.41600004638671 +137,0.004564720771408507,1.28739578125,69.6279999609375,89.51400010009766 +138,0.004698288693491902,1.26457640625,69.82800004150391,89.66599996826172 +139,0.004667031146319849,1.26491140625,70.13000006103516,89.74200014648437 +140,0.004615324449592403,1.26782265625,69.94600012207032,89.6339999951172 +141,0.004736929267112698,1.2716990625,69.92200002197265,89.66000007080078 +142,0.004645303856315357,1.26636546875,70.05000012207032,89.79600007080079 
+143,0.004589632619172335,1.27597921875,69.87999999267578,89.6260001513672 +144,0.004599113316674318,1.26832953125,70.03800010009766,89.70800017333984 +145,0.004497614356556109,1.24645171875,70.2940000390625,89.89200001953125 +146,0.004668201053781169,1.25213734375,70.25399991699219,89.90399996826172 +147,0.004643831202494246,1.2573528125,70.34800009277343,89.83199994140625 +148,0.004657145689374634,1.246579375,70.32000004638672,89.87399997314454 +149,0.004672455501609615,1.24891890625,70.46399994140624,89.97999988769531 +150,0.004561962260465536,1.2371015625,70.81199999267578,90.01200011962891 +151,0.004665072109284145,1.2461521875,70.58600006835937,89.99000004638673 +152,0.004581982303144676,1.24205828125,70.82200009277344,90.14599993896485 +153,0.004534704111782568,1.2320478125,70.91400018066406,90.11200009765625 +154,0.00448989266130541,1.219623125,71.05400009521485,90.25200001953125 +155,0.004539375891909003,1.216745,71.27199994140625,90.33400009521485 +156,0.004434717386694891,1.220150625,71.1020000415039,90.27199999267579 +157,0.004551438348633903,1.2224921875,70.9960001171875,90.40999991699219 +158,0.004494417591818741,1.2208475,71.21400014892578,90.29600012451172 +159,0.004556146516863789,1.21477640625,71.20599999511718,90.46600004638672 +160,0.0044869347808084315,1.209465625,71.1119999633789,90.34599983886719 +161,0.004461011549990092,1.19804203125,71.48400001953125,90.57199991699218 +162,0.004472280120743173,1.19293375,71.64600006835937,90.66400014648437 +163,0.0044560003693082505,1.18684765625,71.78400012207031,90.65399986083985 +164,0.004553499465276088,1.2062878125,71.5279999633789,90.49000004638673 +165,0.004505423562867301,1.18359390625,71.8559999633789,90.67399988769532 +166,0.004468879961807813,1.17607625,71.90200014160156,90.79999986083985 +167,0.004504834667646459,1.17856859375,72.16599996582032,90.99200007080078 +168,0.004461629715348993,1.170638125,72.18999996337891,90.82400007080078 +169,0.004403821259204831,1.17690703125,72.00999999511718,90.80199997070312 +170,0.004433724197692105,1.1584646875,72.29599998779297,91.04799996582031 +171,0.004493325383269361,1.17607875,72.1560000366211,90.89199991210937 +172,0.004585765568273408,1.176185625,72.15800007324219,91.04000004638672 +173,0.004505930235609412,1.16091203125,72.40199990478516,91.09399996582032 +174,0.0044098537348742995,1.154465,72.56800004394532,91.14999996826172 +175,0.004415984298767788,1.1441278125,72.65400006835938,91.23800001953126 +176,0.004470436135306954,1.14189859375,72.6019999609375,91.17000002197265 +177,0.004430129724953856,1.1509359375,72.72800001708984,91.16800004394531 +178,0.004526908675740872,1.1395303125,72.98599999023438,91.24800009521485 +179,0.004396590969658324,1.14365375,72.67000001220703,91.35200004394531 +180,0.004393110816766109,1.1443346875,72.87400000976562,91.38399980957031 +181,0.004384525552658098,1.12447671875,73.29599999023438,91.42199996826172 +182,0.004510728775390557,1.120158125,73.32200006835937,91.57599993896484 +183,0.004279247085962977,1.1233425,73.32999998535156,91.6160001171875 +184,0.004418941780126521,1.12287375,73.25199999023438,91.62199991210937 +185,0.0043414472934923,1.1175003125,73.23800009277343,91.62400004394532 +186,0.00444089132361114,1.1186775,73.1880000390625,91.63200001464844 +187,0.004242212129091578,1.1076759375,73.49599999023438,91.6920001171875 +188,0.004308835071112428,1.11382453125,73.61799998779297,91.66400009277343 +189,0.004377027907009635,1.09399609375,73.75999990722656,91.7500001953125 
+190,0.004387934026973588,1.09843328125,73.76799990722657,91.8679999609375 +191,0.004388023932863559,1.09843125,73.90999998779297,91.86400007080078 +192,0.004349396996466177,1.0851678125,74.0220000415039,91.89800009277344 +193,0.004335067600810102,1.089355625,73.96399995849609,91.92600001464844 +194,0.004343908280134201,1.0814921875,74.13400001464844,92.05199991210938 +195,0.004251544397058231,1.0836525,74.10800008789063,92.0840000390625 +196,0.004303004742333931,1.079254375,74.2260001147461,92.07999998779297 +197,0.004285441528606627,1.06234,74.47600001464843,92.23600001464844 +198,0.004263307001175625,1.0597365625,74.48800000976563,92.2820001171875 +199,0.004302812540637595,1.07631171875,74.16999998779296,92.24600006591797 +200,0.004331470600196293,1.068165625,74.74800008789063,92.23600006347657 +201,0.004283389409205743,1.0668040625,74.44800000732423,92.1400000390625 +202,0.004252366804783898,1.05651734375,74.55799998291016,92.20200001464843 +203,0.0042564044041293,1.05011671875,74.94400014160156,92.37599998779297 +204,0.004381564678624272,1.05434125,74.53800008544921,92.4000000390625 +205,0.004182761285587081,1.0424640625,75.0139999584961,92.45999990966797 +206,0.0041452854805226836,1.04821296875,74.94200003417969,92.46599993652343 +207,0.004292250378057361,1.03995625,75.07400000732422,92.5780001171875 +208,0.0043207124108448625,1.0382290625,75.07200006347657,92.5040000390625 +209,0.004227171286142298,1.03806,75.21599998291016,92.53199990966797 +210,0.0041973576382068655,1.0244478125,75.29000013671875,92.69000001464843 +211,0.00421309527674956,1.02601953125,75.39600011230469,92.7799999609375 +212,0.004208202340773174,1.02174296875,75.61000000732422,92.70400006591797 +213,0.0041728746977501684,1.01000328125,75.73200010742187,92.8560000366211 +214,0.004136975771481437,1.01413421875,75.54799995605468,92.78600009277343 +215,0.004179607378318906,1.002690625,75.89000006103515,92.90000001220703 +216,0.00414369604550302,1.0029115625,75.88200013427735,92.89199998535156 +217,0.004148620879277587,1.00303890625,76.00599998291015,92.96800001220703 +218,0.004154796828515828,0.99850484375,76.10200006347657,93.0359999609375 +219,0.004153257684915194,0.98883125,76.31000005371094,93.15800008789063 +220,0.004109900195284614,0.98340609375,76.32799989990234,93.21600006347656 +221,0.004117257976239281,0.9853646875,76.24999998046874,93.25400011474609 +222,0.00406772896115269,0.9858534375,76.40599997802734,93.22599993652344 +223,0.004105778061784804,0.98020984375,76.40200008544922,93.31200001220704 +224,0.004056159257223564,0.97794484375,76.45200003173828,93.29400006103516 +225,0.0040910857164167935,0.97563390625,76.60599998046875,93.2780000390625 +226,0.0040714993707037395,0.96293984375,76.86800010742188,93.49800008789063 +227,0.004126675110975546,0.96799328125,76.75399998046875,93.48799990722657 +228,0.00401083128859422,0.9581128125,76.94400013427735,93.48600008544922 +229,0.004122888103925756,0.95719859375,77.01600005859375,93.54200001220703 +230,0.004067675509889211,0.95604203125,77.01200008544922,93.5659999609375 +231,0.004003024942773793,0.95360828125,77.04200000488281,93.52799998291016 +232,0.004023495795471328,0.94518703125,77.21000000488282,93.67000014160156 +233,0.004062957230157086,0.95060203125,77.31199994873047,93.54399998535156 +234,0.003994461913992252,0.941239140625,77.35600005371094,93.75000000976563 +235,0.004072328746717956,0.9401640625,77.37200008300782,93.79200001220703 +236,0.0040370105499667784,0.93594515625,77.57600006103516,93.83400006347657 
+237,0.004102625684546573,0.925669375,77.69400013183594,93.85800008789063 +238,0.00400754701279636,0.92673109375,77.79000003173829,93.92000001220703 +239,0.003931260301864573,0.91868796875,77.82599993408203,93.91800019287109 +240,0.0040078962587618405,0.9184509375,77.88200008300781,94.08200009033203 +241,0.003989631010751639,0.9131821875,77.93200005859374,94.0940000390625 +242,0.0039024739027289407,0.910870078125,78.14800002929688,94.0840000390625 +243,0.003919692660149719,0.90910875,78.17200005615234,94.09400001220703 +244,0.003983062409263637,0.90479765625,78.35199989990234,94.1960000366211 +245,0.0039435730515314,0.90500796875,78.24200010253907,94.14600000976563 +246,0.003882105811499059,0.8973740625,78.44200002929688,94.28399992919923 +247,0.003944338681841535,0.8931828125,78.48599995361329,94.34200003662109 +248,0.0038875251609299865,0.89339703125,78.55400003173828,94.31999998535156 +249,0.003919344611598977,0.8847475,78.82599997802734,94.48600011474609 +250,0.003974659834057093,0.88667890625,78.75400015625,94.32399998291015 +251,0.0038648887337850673,0.88792609375,78.61599985107422,94.39000009033204 +252,0.003882348088414541,0.883708671875,78.84999997314453,94.4559999584961 +253,0.003899832156353763,0.879748046875,78.92999987548828,94.49600006103516 +254,0.003878143765697522,0.88182046875,78.83600008056641,94.39199998535156 +255,0.003834759500542922,0.87118734375,78.98400013427734,94.57800013916015 +256,0.003816165545556162,0.870868359375,79.09200005371093,94.67200011230469 +257,0.0038604919432795475,0.865966484375,79.19800002685547,94.68200011474609 +258,0.0037719939303185257,0.86741421875,79.24200005371094,94.66000011230469 +259,0.0038805989482040915,0.85983828125,79.34400005615234,94.6920000366211 +260,0.0038489396683871746,0.86072609375,79.45199995117187,94.76999998291015 +261,0.0037831025464194162,0.8598553125,79.38800010498046,94.7120001123047 +262,0.003775066928938031,0.850538515625,79.54200013427734,94.90200011230469 +263,0.00376021843736193,0.852878203125,79.61400018310547,94.87000019042969 +264,0.003743130630547447,0.850947734375,79.70000005371094,94.85000006103516 +265,0.0037950902645077023,0.848290625,79.73400005126953,94.89799998291015 +266,0.003691152353504939,0.842548671875,79.798000078125,94.94399993164062 +267,0.003776852207790528,0.845507109375,79.84200010498047,94.96200000976563 +268,0.0036705253878608346,0.84085453125,79.87599989746094,95.00599995849609 +269,0.0037077388260513544,0.838556796875,79.99599997314454,94.95600000976563 +270,0.003797434370166489,0.833672109375,80.11799997558593,95.03399998291016 +271,0.0037534147767083986,0.841295234375,79.98400010498047,95.01400008544923 +272,0.0037938125730891314,0.833252109375,80.12200005371093,95.09199998291015 +273,0.003618617625241833,0.833057890625,80.27400005371094,95.01599993164062 +274,0.0036791543077145305,0.830380390625,80.18800008056641,95.1000001147461 +275,0.0036934222360806806,0.827929140625,80.2340000024414,95.06199995849609 +276,0.0037091131920793225,0.824863046875,80.30000010498047,95.14200003417969 +277,0.0037969240552878807,0.822872421875,80.36600005371093,95.18600006103516 +278,0.0036822593870705794,0.821608828125,80.4,95.14200003417969 +279,0.0037219872132741977,0.8220225,80.388,95.21999998291015 +280,0.0036821293511560987,0.815946875,80.54399997314454,95.25400006103516 +281,0.003686774555327637,0.817045625,80.58000005126954,95.26800006103515 +282,0.003641096143318074,0.817729375,80.548,95.22400003417968 +283,0.0036904641560145785,0.816061484375,80.64400005126953,95.22400000732422 
+284,0.0035623855323397686,0.81314640625,80.63799994873047,95.33999998291016 +285,0.003658884570800832,0.812442265625,80.61200005126953,95.29000000976562 +286,0.003644167977784361,0.813734765625,80.68999994873047,95.24999998291015 +287,0.0036552690102585723,0.81131984375,80.77999999755859,95.32599998291016 +288,0.0036323205567896366,0.80879640625,80.77200002441407,95.32799998291016 +289,0.003605903275976224,0.809427109375,80.7480000024414,95.32399998291015 +290,0.0037104166944378187,0.810876796875,80.73000002685546,95.35599998291016 +291,0.0036719500785693526,0.80932296875,80.79399997314454,95.36000000976563 +292,0.0036519923014566302,0.807780859375,80.81200005126954,95.37199998291015 +293,0.0036492896199758562,0.809131171875,80.795999921875,95.35800003417968 +294,0.003632878784888557,0.80717015625,80.81000005126953,95.38600011230469 +295,0.003646290262362787,0.80859359375,80.8500001538086,95.36600006103515 +296,0.0035626571126548307,0.80803984375,80.84199997558594,95.36199998291016 +297,0.0036516755519966993,0.806596015625,80.88400005126954,95.38399998291015 +298,0.003551991772837937,0.807148125,80.84000002685546,95.37600003417968 +299,0.0035453016503847073,0.80710328125,80.86600005126954,95.33199998291016 +300,0.003604882995464972,0.807660546875,80.826000078125,95.37800003417969 +301,0.003637207838307534,0.807074296875,80.830000078125,95.36200003417969 +302,0.003711703832128218,0.807204296875,80.91999997314453,95.37599995605468 +303,0.003739510030884828,0.806617734375,80.88800002441407,95.38000003417969 +304,0.00353996387483286,0.807367421875,80.876,95.37800003417969 +305,0.0035504487376394017,0.807177421875,80.884000078125,95.37399998291015 +306,0.00364223014496799,0.80659125,80.90399994873047,95.35600003417969 +307,0.00370254267805389,0.8077678125,80.814000078125,95.36400003417968 +308,0.003639972086862794,0.806408125,80.862000078125,95.38200003417968 +309,0.0036769274322848234,0.80716296875,80.87200002685547,95.35400003417969 diff --git a/CV/timm/optim_factory.py b/CV/timm/optim_factory.py new file mode 100644 index 0000000..b0b9ae6 --- /dev/null +++ b/CV/timm/optim_factory.py @@ -0,0 +1,343 @@ +""" Optimizer Factory w/ Custom Weight Decay +Hacked together by / Copyright 2021 Ross Wightman +""" +import json +from itertools import islice +from typing import Optional, Callable, Tuple + +import torch +import torch.nn as nn +import torch.optim as optim + +from timm.models.helpers import group_parameters + +from timm.optim.adabelief import AdaBelief +from timm.optim.adafactor import Adafactor +from timm.optim.adahessian import Adahessian +from timm.optim.adamp import AdamP +from timm.optim.lamb import Lamb +from timm.optim.lars import Lars +from timm.optim.lookahead import Lookahead +from timm.optim.madgrad import MADGRAD +from timm.optim.nadam import Nadam +from timm.optim.nvnovograd import NvNovoGrad +from timm.optim.radam import RAdam +from timm.optim.rmsprop_tf import RMSpropTF +from timm.optim.sgdp import SGDP +from adan import Adan +from sam import SAM + +try: + from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD + has_apex = True +except ImportError: + has_apex = False + + +def param_groups_weight_decay( + model: nn.Module, + weight_decay=1e-5, + no_weight_decay_list=() +): + no_weight_decay_list = set(no_weight_decay_list) + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if param.ndim <= 1 or name.endswith(".bias") or name in no_weight_decay_list: + no_decay.append(param) + else: + 
decay.append(param) + + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + + +def _group(it, size): + it = iter(it) + return iter(lambda: tuple(islice(it, size)), ()) + + +def _layer_map(model, layers_per_group=12, num_groups=None): + def _in_head(n, hp): + if not hp: + return True + elif isinstance(hp, (tuple, list)): + return any([n.startswith(hpi) for hpi in hp]) + else: + return n.startswith(hp) + + head_prefix = getattr(model, 'pretrained_cfg', {}).get('classifier', None) + names_trunk = [] + names_head = [] + for n, _ in model.named_parameters(): + names_head.append(n) if _in_head(n, head_prefix) else names_trunk.append(n) + + # group non-head layers + num_trunk_layers = len(names_trunk) + if num_groups is not None: + layers_per_group = -(num_trunk_layers // -num_groups) + names_trunk = list(_group(names_trunk, layers_per_group)) + + num_trunk_groups = len(names_trunk) + layer_map = {n: i for i, l in enumerate(names_trunk) for n in l} + layer_map.update({n: num_trunk_groups for n in names_head}) + return layer_map + + +def param_groups_layer_decay( + model: nn.Module, + weight_decay: float = 0.05, + no_weight_decay_list: Tuple[str] = (), + layer_decay: float = .75, + end_layer_decay: Optional[float] = None, +): + """ + Parameter groups for layer-wise lr decay & weight decay + Based on BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 + """ + no_weight_decay_list = set(no_weight_decay_list) + param_group_names = {} # NOTE for debugging + param_groups = {} + + if hasattr(model, 'group_matcher'): + # FIXME interface needs more work + layer_map = group_parameters(model, model.group_matcher(coarse=False), reverse=True) + else: + # fallback + layer_map = _layer_map(model) + num_layers = max(layer_map.values()) + 1 + layer_max = num_layers - 1 + layer_scales = list(layer_decay ** (layer_max - i) for i in range(num_layers)) + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + # no decay: all 1D parameters and model specific ones + if param.ndim == 1 or name in no_weight_decay_list: + g_decay = "no_decay" + this_decay = 0. + else: + g_decay = "decay" + this_decay = weight_decay + + layer_id = layer_map.get(name, layer_max) + group_name = "layer_%d_%s" % (layer_id, g_decay) + + if group_name not in param_groups: + this_scale = layer_scales[layer_id] + param_group_names[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "param_names": [], + } + param_groups[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + + param_group_names[group_name]["param_names"].append(name) + param_groups[group_name]["params"].append(param) + + # FIXME temporary output to debug new feature + print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) + + return list(param_groups.values()) + + +def optimizer_kwargs(cfg): + """ cfg/argparse to kwargs helper + Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn. 
+ """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'layer_decay', None) is not None: + kwargs['layer_decay'] = cfg.layer_decay + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + + +def create_optimizer(args, model, filter_bias_and_bn=True): + """ Legacy optimizer factory for backwards compatibility. + NOTE: Use create_optimizer_v2 for new code. + """ + return create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=filter_bias_and_bn, + ) + + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + layer_decay: Optional[float] = None, + param_group_fn: Optional[Callable] = None, + **kwargs): + """ Create an optimizer. + + TODO currently the model is passed in and all parameters are selected for optimization. + For more general use an interface that allows selection of parameters to optimize and lr groups, one of: + * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion + * expose the parameters interface and leave it up to caller + + Args: + model_or_params (nn.Module): model containing parameters to optimize + opt: name of optimizer to create + lr: initial learning rate + weight_decay: weight decay to apply in optimizer + momentum: momentum for momentum based optimizers (others may use betas via kwargs) + filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay + **kwargs: extra optimizer specific kwargs to pass through + + Returns: + Optimizer + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + no_weight_decay = {} + if hasattr(model_or_params, 'no_weight_decay'): + no_weight_decay = model_or_params.no_weight_decay() + + if param_group_fn: + parameters = param_group_fn(model_or_params) + elif layer_decay is not None: + parameters = param_groups_layer_decay( + model_or_params, + weight_decay=weight_decay, + layer_decay=layer_decay, + no_weight_decay_list=no_weight_decay) + weight_decay = 0. + elif weight_decay and filter_bias_and_bn: + parameters = param_groups_weight_decay(model_or_params, weight_decay, no_weight_decay) + weight_decay = 0. 
+ else: + parameters = model_or_params.parameters() + else: + # iterable of parameters or param groups passed in + parameters = model_or_params + + opt_lower = opt.lower() + opt_split = opt_lower.split('_') + opt_lower = opt_split[-1] + if 'fused' in opt_lower: + assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' + + opt_args = dict(weight_decay=weight_decay, **kwargs) + if lr is not None: + opt_args.setdefault('lr', lr) + + # basic SGD & related + if opt_lower == 'sgd' or opt_lower == 'nesterov': + # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'sam': + opt_args.pop('eps', None) + optimizer = SAM(parameters, optim.SGD, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'adan': + optimizer = Adan(parameters, **opt_args) + elif opt_lower == 'momentum': + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'sgdp': + optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args) + + # adaptive + elif opt_lower == 'adam': + optimizer = optim.Adam(parameters, **opt_args) + elif opt_lower == 'adamw': + optimizer = optim.AdamW(parameters, **opt_args) + elif opt_lower == 'adamp': + optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) + elif opt_lower == 'nadam': + try: + # NOTE PyTorch >= 1.10 should have native NAdam + optimizer = optim.Nadam(parameters, **opt_args) + except AttributeError: + optimizer = Nadam(parameters, **opt_args) + elif opt_lower == 'radam': + optimizer = RAdam(parameters, **opt_args) + elif opt_lower == 'adamax': + optimizer = optim.Adamax(parameters, **opt_args) + elif opt_lower == 'adabelief': + optimizer = AdaBelief(parameters, rectify=False, **opt_args) + elif opt_lower == 'radabelief': + optimizer = AdaBelief(parameters, rectify=True, **opt_args) + elif opt_lower == 'adadelta': + optimizer = optim.Adadelta(parameters, **opt_args) + elif opt_lower == 'adagrad': + opt_args.setdefault('eps', 1e-8) + optimizer = optim.Adagrad(parameters, **opt_args) + elif opt_lower == 'adafactor': + optimizer = Adafactor(parameters, **opt_args) + elif opt_lower == 'lamb': + optimizer = Lamb(parameters, **opt_args) + elif opt_lower == 'lambc': + optimizer = Lamb(parameters, trust_clip=True, **opt_args) + elif opt_lower == 'larc': + optimizer = Lars(parameters, momentum=momentum, trust_clip=True, **opt_args) + elif opt_lower == 'lars': + optimizer = Lars(parameters, momentum=momentum, **opt_args) + elif opt_lower == 'nlarc': + optimizer = Lars(parameters, momentum=momentum, trust_clip=True, nesterov=True, **opt_args) + elif opt_lower == 'nlars': + optimizer = Lars(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'madgrad': + optimizer = MADGRAD(parameters, momentum=momentum, **opt_args) + elif opt_lower == 'madgradw': + optimizer = MADGRAD(parameters, momentum=momentum, decoupled_decay=True, **opt_args) + elif opt_lower == 'novograd' or opt_lower == 'nvnovograd': + optimizer = NvNovoGrad(parameters, **opt_args) + elif opt_lower == 'rmsprop': + optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args) + elif opt_lower == 'rmsproptf': + optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args) + + # second order + elif opt_lower == 'adahessian': + optimizer = Adahessian(parameters, **opt_args) + + # 
NVIDIA fused optimizers, require APEX to be installed + elif opt_lower == 'fusedsgd': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'fusedmomentum': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'fusedadam': + optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) + elif opt_lower == 'fusedadamw': + optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) + elif opt_lower == 'fusedlamb': + optimizer = FusedLAMB(parameters, **opt_args) + elif opt_lower == 'fusednovograd': + opt_args.setdefault('betas', (0.95, 0.98)) + optimizer = FusedNovoGrad(parameters, **opt_args) + + else: + assert False and "Invalid optimizer" + raise ValueError + + if len(opt_split) > 1: + if opt_split[0] == 'lookahead': + optimizer = Lookahead(optimizer) + + return optimizer diff --git a/CV/timm/sam.py b/CV/timm/sam.py new file mode 100644 index 0000000..61ae5c8 --- /dev/null +++ b/CV/timm/sam.py @@ -0,0 +1,62 @@ +import torch + + +class SAM(torch.optim.Optimizer): + def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs): + assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}" + + defaults = dict(rho=rho, adaptive=adaptive, **kwargs) + super(SAM, self).__init__(params, defaults) + + self.base_optimizer = base_optimizer(self.param_groups, **kwargs) + self.param_groups = self.base_optimizer.param_groups + + @torch.no_grad() + def first_step(self, zero_grad=False): + grad_norm = self._grad_norm() + for group in self.param_groups: + scale = group["rho"] / (grad_norm + 1e-12) + + for p in group["params"]: + if p.grad is None: continue + self.state[p]["old_p"] = p.data.clone() + e_w = (torch.pow(p, 2) if group["adaptive"] else 1.0) * p.grad * scale.to(p) + p.add_(e_w) # climb to the local maximum "w + e(w)" + + if zero_grad: self.zero_grad() + + @torch.no_grad() + def second_step(self, zero_grad=False): + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: continue + p.data = self.state[p]["old_p"] # get back to "w" from "w + e(w)" + + self.base_optimizer.step() # do the actual "sharpness-aware" update + + if zero_grad: self.zero_grad() + + @torch.no_grad() + def step(self, closure=None): + assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided" + closure = torch.enable_grad()(closure) # the closure should do a full forward-backward pass + + self.first_step(zero_grad=True) + closure() + self.second_step() + + def _grad_norm(self): + shared_device = self.param_groups[0]["params"][0].device # put everything on the same device, in case of model parallelism + norm = torch.norm( + torch.stack([ + ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device) + for group in self.param_groups for p in group["params"] + if p.grad is not None + ]), + p=2 + ) + return norm + + def load_state_dict(self, state_dict): + super().load_state_dict(state_dict) + self.base_optimizer.param_groups = self.param_groups \ No newline at end of file diff --git a/CV/timm/supervised.md b/CV/timm/supervised.md new file mode 100644 index 0000000..e266905 --- /dev/null +++ b/CV/timm/supervised.md @@ -0,0 +1,168 @@ +# Training recipes + +We provide the specific commonds and hyper-parameters for ViTs, ResNets and ConvNexts in this recipe. 
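+
+All of the recipes below select Adan through `optim_factory.py` (`--opt Adan` together with `--opt-betas`, `--opt-eps`, and `--max-grad-norm`). As a quick reference, the following is a minimal sketch, not part of the recipes themselves, of how the same optimizer can be built programmatically with `create_optimizer_v2`; it assumes the files in `CV/timm` are importable and `timm` is installed, the tiny placeholder model and the hyper-parameter values (mirroring the ViT commands below) are for illustration only, and extra keyword arguments are simply forwarded to the `Adan` constructor in `adan.py`.
+
+```python
+# Minimal sketch: build Adan through the factory in optim_factory.py.
+# The placeholder model and values are illustrative, not a recommended recipe.
+import torch.nn as nn
+
+from optim_factory import create_optimizer_v2
+
+model = nn.Linear(10, 2)                # placeholder; any nn.Module works
+optimizer = create_optimizer_v2(
+    model,
+    opt='adan',                         # dispatches to Adan(parameters, **opt_args)
+    lr=1.5e-2,
+    weight_decay=0.02,
+    filter_bias_and_bn=True,            # 1-D params (bias/norm) get weight_decay=0
+    betas=(0.98, 0.92, 0.99),           # forwarded to Adan via **kwargs
+    eps=1e-8,
+    max_grad_norm=5.0,                  # 0.0 disables Adan's built-in clipping
+    no_prox=False,                      # True applies the weight decay like AdamW
+)
+```
+
+`train.py` takes the same route: `--bias-decay` toggles `filter_bias_and_bn`, and `--max-grad-norm` / `--no-prox` are collected into `opt_args` before the factory is called.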
+
+
+
+## Training of ViT
+
+### 1) Training with Setting I
+
+This is a prevalent setting for training [ResNets](https://arxiv.org/abs/2110.00476). To train ViT-Small, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model deit_small_patch16_224 \
+    --sched cosine -j 10 \
+    --epochs ${EPOCH} --weight-decay 0.02 \
+    --opt Adan \
+    --lr 1.5e-2 --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 0.0 \
+    --warmup-lr 1e-8 --min-lr 1.0e-08 \
+    -b 256 --amp \
+    --aug-repeats 0 \
+    --warmup-epochs 60 \
+    --aa rand-m7-mstd0.5-inc1 \
+    --smoothing 0.1 \
+    --remode pixel \
+    --reprob 0.0 \
+    --bce \
+    --drop 0.0 --drop-path 0.05 \
+    --mixup 0.2 --cutmix 1.0 \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+
+After training, this command should give the following results. Note that this setting does not seem to improve the results of ViT-Base beyond those obtained with training Setting II (see below).
+
+|           | 150 Epoch | 300 Epoch |
+| :-------: | :-------: | :-------: |
+| ViT-Small |   80.1    |   81.1    |
+| download  | [config](./exp_results/ViT/small/args_vit-s_150-I.yaml)/[log](./exp_results/ViT/small/summary_vit-s_150-I.csv)/model | [config](./exp_results/ViT/small/args_vit-s_300-I.yaml)/[log](./exp_results/ViT/small/summary_vit-s_300-I.csv)/model |
+
+
+
+
+
+### 2) Training with Setting II
+
+This is the official setting used in [DeiT](https://github.com/facebookresearch/deit). Note that, without distillation, DeiTs and ViTs are the same models. To train ViT-Small, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model ${MODEL_NAME} \
+    --sched cosine -j 10 \
+    --epochs ${EPOCH} --weight-decay .02 \
+    --opt Adan \
+    --lr 1.5e-2 --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 5.0 \
+    --warmup-lr 1e-8 --min-lr 1e-5 \
+    -b 256 --amp \
+    --aug-repeats ${REP} \
+    --warmup-epochs 60 \
+    --aa ${AUG} \
+    --smoothing 0.1 \
+    --remode pixel \
+    --reprob 0.25 \
+    --drop 0.0 --drop-path 0.1 \
+    --mixup 0.8 --cutmix 1.0 \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+There are some differences between the hyper-parameters for ViT-Base and ViT-Small. `--bce` means using the binary cross-entropy loss.
+
+ | | MODEL_NAME | REP | AUG | BCE | Bias-Decay |
+ | --------- | :--------------------: | :--: | :------------------: | :---: | :--------: |
+ | ViT-Small | deit_small_patch16_224 | 0 | rand-m7-mstd0.5-inc1 | True | False |
+ | ViT-Base | deit_base_patch16_224 | 3 | rand-m9-mstd0.5-inc1 | False | True |
+
+After training, you should expect the following results. The results are sensitive to `warmup-lr` and `min-lr`.
+
+|           | 150 Epoch | 300 Epoch |
+| :-------: | :-------: | :-------: |
+| ViT-Small |   79.6    |   80.9    |
+| download  | [config](./exp_results/ViT/small/args_vit-s_150.yaml)/[log](./exp_results/ViT/small/summary_vit-s_150.csv)/model | [config](./exp_results/ViT/small/args_vit-s_300.yaml)/[log](./exp_results/ViT/small/summary_vit-s_300.csv)/model |
+| ViT-Base  |   81.7    |   82.3    |
+| download  | [config](./exp_results/ViT/base/args_vit-B_150.yaml)/[log](./exp_results/ViT/base/summary_vit-B_150.csv)/model | [config](./exp_results/ViT/base/args_vit-B_300.yaml)/[log](./exp_results/ViT/base/summary_vit-B_300.csv)/model |
+
+
+
+## ResNet
+This is the default setting used to train [ResNets](https://arxiv.org/abs/2110.00476). To train ResNet-50, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model resnet50 \
+    --sched cosine -j 8 \
+    --epochs ${EPOCH} --weight-decay .02 \
+    --opt Adan \
+    --lr ${LR} --opt-betas 0.98 0.92 0.99 \
+    --opt-eps 1e-8 --max-grad-norm 5.0 \
+    --warmup-lr 1e-9 --min-lr 1.0e-05 --bias-decay \
+    -b 256 --amp \
+    --aug-repeats 0 \
+    --warmup-epochs 60 \
+    --aa rand-m7-mstd0.5-inc1 \
+    --smoothing 0.0 \
+    --remode pixel \
+    --crop-pct 0.95 \
+    --reprob 0.0 \
+    --bce \
+    --drop 0.0 --drop-path 0.05 \
+    --mixup 0.1 --cutmix 1.0 \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+
+When training for different numbers of epochs, we use slightly different learning rates, namely `LR = 3e-2` for `EPOCH = 100` and `LR = 1.5e-2` for `EPOCH = 200` and `300`. After training, you should get the following results:
+
+|           | 100 Epoch | 200 Epoch | 300 Epoch |
+| :-------: | :-------: | :-------: | :-------: |
+| ResNet-50 |   78.1    |   79.7    |   80.2    |
+| download  | [config](./exp_results/ResNet/Res50/args_res50_100.yaml)/[log](./exp_results/ResNet/Res50/summary_res50_100.csv)/model | [config](./exp_results/ResNet/Res50/args_res50_200.yaml)/[log](./exp_results/ResNet/Res50/summary_res50_200.csv)/model | [config](./exp_results/ResNet/Res50/args_res50_300.yaml)/[log](./exp_results/ResNet/Res50/summary_res50_300.csv)/model |
+
+
+
+## ConvNext
+
+This is the default setting for ConvNext-Tiny. To train it, you can use the following command.
+
+```python
+python -m torch.distributed.launch --nproc_per_node=8 ./train.py
+    --data-dir ${IMAGENET_DIR} \
+    --model convnext_tiny_hnf \
+    --sched cosine -j 8 \
+    --epochs ${EPOCH} --weight-decay .02 \
+    --opt Adan \
+    --lr 1.6e-2 --opt-betas 0.98 0.92 0.90 \
+    --opt-eps 1e-8 --max-grad-norm 0.0 \
+    --warmup-lr 1e-9 --min-lr 1.0e-05 --bias-decay \
+    -b 256 --amp \
+    --aug-repeats 0 \
+    --warmup-epochs 150 \
+    --aa rand-m7-mstd0.5-inc1 \
+    --smoothing 0.1 \
+    --remode pixel \
+    --reprob 0.25 \
+    --drop 0.0 --drop-path 0.1 \
+    --mixup 0.8 --cutmix 1.0 \
+    --model-ema \
+    --train-interpolation random \
+    --output ${OUT_DIR} \
+    --experiment ${EXP_DIR}
+```
+
+For this training, the performance is NOT sensitive to some hyper-parameters, such as `warmup-epochs` and `lr`, but whether `model-ema` is used plays a key role.
+
+You can use the config linked below to train ConvNext-Tiny for 150 epochs, in which we do not use `model-ema` (a short sketch of what this flag does follows).
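+
+The sketch below, with a placeholder model and loop and assuming timm's `ModelEmaV2`, shows the mechanism behind `--model-ema` in `train.py`: an EMA copy of the weights is updated after every optimizer step, and validation is run on that copy, which is why the flag matters so much here.
+
+```python
+# Minimal sketch of the --model-ema mechanism (placeholder model and data).
+import torch
+import torch.nn as nn
+from timm.utils import ModelEmaV2
+
+model = nn.Linear(10, 2)
+ema = ModelEmaV2(model, decay=0.9998)       # matches the --model-ema-decay default
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+for _ in range(3):                          # stand-in for the training loop
+    loss = model(torch.randn(4, 10)).sum()
+    loss.backward()
+    optimizer.step()
+    optimizer.zero_grad()
+    ema.update(model)                       # EMA weights track the live weights
+
+eval_model = ema.module                     # validation is run on the EMA copy
+```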
+ +This results should be: + +| | 150 Epoch | 300 Epoch | +| :-----------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| ConvNext-tiny | 81.7 | 82.4 | +| download | [config](./exp_results/ConvNext/small/args_cvnext_150.yaml)/[log](./exp_results/ConvNext/small/summary_cvnext_150.csv)/model | [config](./exp_results/ConvNext/small/args_cvnext_300.yaml)/[log](./exp_results/ConvNext/small/summary_cvnext_300.csv)/model | + diff --git a/CV/timm/train.py b/CV/timm/train.py new file mode 100644 index 0000000..975f284 --- /dev/null +++ b/CV/timm/train.py @@ -0,0 +1,830 @@ +#!/usr/bin/env python3 +""" ImageNet Training Script + +This is intended to be a lean and easily modifiable ImageNet training script that reproduces ImageNet +training results with some of the latest networks and training techniques. It favours canonical PyTorch +and standard Python style over trying to be able to 'do it all.' That said, it offers quite a few speed +and training result improvements over the usual PyTorch example scripts. Repurpose as you see fit. + +This script was started from an early version of the PyTorch ImageNet example +(https://github.com/pytorch/examples/tree/master/imagenet) + +NVIDIA CUDA specific speedups adopted from NVIDIA Apex examples +(https://github.com/NVIDIA/apex/tree/master/examples/imagenet) + +Hacked together by / Copyright 2020 Ross Wightman (https://github.com/rwightman) +""" +import argparse +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime + +import torch +import torch.nn as nn +import torchvision.utils +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from timm.data import create_dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import create_model, safe_model_name, resume_checkpoint, load_checkpoint,\ + convert_splitbn_model, model_parameters +from timm.utils import * +from timm.loss import * +#from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from optim_factory import create_optimizer +from timm.utils import ApexScaler, NativeScaler +#import timm.optim.optim_factory as optim_factory + +try: + from apex import amp + from apex.parallel import DistributedDataParallel as ApexDDP + from apex.parallel import convert_syncbn_model + has_apex = True +except ImportError: + has_apex = False + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +try: + import wandb + has_wandb = True +except ImportError: + has_wandb = False + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') + +# Dataset / Model parameters +parser.add_argument('--data-dir', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if 
empty)') +parser.add_argument('--train-split', metavar='NAME', default='train', + help='dataset train split (default: train)') +parser.add_argument('--val-split', metavar='NAME', default='validation', + help='dataset validation split (default: validation)') +parser.add_argument('--model', default='resnet50', type=str, metavar='MODEL', + help='Name of model to train (default: "resnet50"') +parser.add_argument('--pretrained', action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') +parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH', + help='Initialize model from this checkpoint (default: none)') +parser.add_argument('--resume', default=None, type=str, metavar='PATH', + help='Resume full model and optimizer state from checkpoint (default: none)') +parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') +parser.add_argument('--num-classes', type=int, default=None, metavar='N', + help='number of label classes (Model default if None)') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') +parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. --input-size 3 224 224), uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('-b', '--batch-size', type=int, default=128, metavar='N', + help='input batch size for training (default: 128)') +parser.add_argument('-vb', '--validation-batch-size', type=int, default=None, metavar='N', + help='validation batch size override (default: None)') + +# Optimizer parameters +parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') +parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight-decay', type=float, default=2e-5, + help='weight decay (default: 2e-5)') +parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') +parser.add_argument('--clip-mode', type=str, default='norm', + help='Gradient clipping mode. 
One of ("norm", "value", "agc")') +parser.add_argument('--max-grad-norm', type=float, default=0.0, + help='Max grad norm (same as clip gradient norm, default: 0.0, no clipping)') +parser.add_argument('--bias-decay', action='store_true', default=False, + help='Perform the weight decay on bias term (default=False)') +parser.add_argument('--no-prox', action='store_true', default=False, + help='Perform the weight decay update like AdamW (default=False)') + + +# Learning rate schedule parameters +parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') +parser.add_argument('--lr', type=float, default=0.05, metavar='LR', + help='learning rate (default: 0.05)') +parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') +parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') +parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') +parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') +parser.add_argument('--lr-cycle-decay', type=float, default=0.5, metavar='MULT', + help='amount to decay each learning rate cycle (default: 0.5)') +parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit, cycles enabled if > 1') +parser.add_argument('--lr-k-decay', type=float, default=1.0, + help='learning rate k-decay for cosine/poly (default: 1.0)') +parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') +parser.add_argument('--min-lr', type=float, default=1e-6, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') +parser.add_argument('--epochs', type=int, default=300, metavar='N', + help='number of epochs to train (default: 300)') +parser.add_argument('--epoch-repeats', type=float, default=0., metavar='N', + help='epoch repeat multiplier (number of times to repeat dataset epoch per train epoch).') +parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('--decay-epochs', type=float, default=100, metavar='N', + help='epoch interval to decay LR') +parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') +parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') +parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') +parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + +# Augmentation & regularization parameters +parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') +parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') +parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') +parser.add_argument('--hflip', 
type=float, default=0.5, + help='Horizontal flip training aug probability') +parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') +parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') +parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), +parser.add_argument('--aug-repeats', type=int, default=0, + help='Number of augmentation repetitions (distributed training only) (default: 0)') +parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') +parser.add_argument('--jsd-loss', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') +parser.add_argument('--bce-loss', action='store_true', default=False, + help='Enable BCE loss w/ Mixup/CutMix use.') +parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') +parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') +parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') +parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') +parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') +parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') +parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') +parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='reduce', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.') +parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + +# Misc +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=50, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--recovery-interval', type=int, default=0, metavar='N', + help='how many batches to wait before writing recovery checkpoint') +parser.add_argument('--checkpoint-hist', type=int, default=2, metavar='N', + help='number of checkpoints to keep (default: 10)') +parser.add_argument('-j', '--workers', type=int, default=4, metavar='N', + help='how many training processes to use (default: 4)') +parser.add_argument('--save-images', action='store_true', default=False, + help='save images of input bathes every log interval for debugging') +parser.add_argument('--amp', action='store_true', default=False, + help='use NVIDIA Apex AMP or Native AMP for mixed precision training') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--output', default='', type=str, metavar='PATH', + help='path to output folder (default: none, current dir)') +parser.add_argument('--experiment', default='', type=str, metavar='NAME', + help='name of train experiment, name of sub-folder for output') +parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC', + help='Best metric (default: "top1"') +parser.add_argument('--tta', type=int, default=0, metavar='N', + help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)') +parser.add_argument("--local_rank", default=0, type=int) +parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False, + help='use the multi-epochs-loader to save time at the beginning of every epoch') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') +parser.add_argument('--log-wandb', action='store_true', default=False, + help='log training and validation metrics to wandb') + + +def _parse_args(): + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, 'r') as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. 
+ args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + return args, args_text + + +def main(): + setup_default_logging() + args, args_text = _parse_args() + + if args.log_wandb: + if has_wandb: + wandb.init(project=args.experiment, config=args) + else: + _logger.warning("You've requested to log metrics to wandb but package not found. " + "Metrics not being logged to wandb, try `pip install wandb`") + + args.prefetcher = not args.no_prefetcher + args.distributed = False + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) > 1 + args.device = 'cuda:0' + args.world_size = 1 + args.rank = 0 # global rank + if args.distributed: + args.device = 'cuda:%d' % args.local_rank + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + args.world_size = torch.distributed.get_world_size() + args.rank = torch.distributed.get_rank() + _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' + % (args.rank, args.world_size)) + else: + _logger.info('Training with a single process on 1 GPUs.') + assert args.rank >= 0 + + # resolve AMP arguments based on PyTorch / Apex availability + use_amp = None + if args.amp: + # `--amp` chooses native amp before apex (APEX ver not actively maintained) + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + if args.apex_amp and has_apex: + use_amp = 'apex' + elif args.native_amp and has_native_amp: + use_amp = 'native' + elif args.apex_amp or args.native_amp: + _logger.warning("Neither APEX or native Torch AMP is available, using float32. " + "Install NVIDA apex or upgrade to PyTorch 1.6") + + random_seed(args.seed, args.rank) + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + scriptable=args.torchscript, + checkpoint_path=args.initial_checkpoint) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
+ args.num_classes = model.num_classes # FIXME handle model default vs config num_classes more elegantly + + if args.local_rank == 0: + _logger.info( + f'Model {safe_model_name(args.model)} created, param count:{sum([m.numel() for m in model.parameters()])}') + + data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # enable split bn (separate bn stats per batch-portion) + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + # move model to GPU, enable channels last layout if set + model.cuda() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + # setup synchronized BatchNorm for distributed training + if args.distributed and args.sync_bn: + assert not args.split_bn + if has_apex and use_amp == 'apex': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + + if args.torchscript: + assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model' + assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model' + model = torch.jit.script(model) + + opt_lower = args.opt.lower() + if opt_lower == 'adan': + args.opt_args = {'max_grad_norm': args.max_grad_norm, 'no_prox': args.no_prox} + optimizer = create_optimizer(args, model, filter_bias_and_bn = not args.bias_decay) + print(optimizer) + + + # setup automatic mixed-precision (AMP) loss scaling and op casting + amp_autocast = suppress # do nothing + loss_scaler = None + if use_amp == 'apex': + model, optimizer = amp.initialize(model, optimizer, opt_level='O1') + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + elif use_amp == 'native': + amp_autocast = torch.cuda.amp.autocast + loss_scaler = NativeScaler() + if args.local_rank == 0: + _logger.info('Using native Torch AMP. Training in mixed precision.') + else: + if args.local_rank == 0: + _logger.info('AMP not enabled. 
Training in float32.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.experiment: + output_dir = get_outdir(args.output if args.output else './output/train', args.experiment) + resume_path = os.path.join(output_dir, "last.pth.tar") + print(resume_path, os.path.exists(resume_path)) + if os.path.exists(resume_path) and not args.resume: args.resume = resume_path + + + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + # setup exponential moving average of model weights, SWA could be used here too + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEmaV2( + model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else None) + if args.resume: + load_checkpoint(model_ema.module, args.resume, use_ema=True) + + # setup distributed training + if args.distributed: + if has_apex and use_amp == 'apex': + # Apex DDP preferred unless native amp is activated + if args.local_rank == 0: + _logger.info("Using NVIDIA APEX DistributedDataParallel.") + model = ApexDDP(model, delay_allreduce=True) + else: + if args.local_rank == 0: + _logger.info("Using native Torch DistributedDataParallel.") + model = NativeDDP(model, device_ids=[args.local_rank]) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + # setup learning rate schedule and starting epoch + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + # create the train and eval datasets + dataset_train = create_dataset( + args.dataset, + root=args.data_dir, split=args.train_split, is_training=True, + batch_size=args.batch_size, repeats=args.epoch_repeats) + dataset_eval = create_dataset( + args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size) + + # setup mixup / cutmix + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + # wrap dataset in AugMix helper + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + # create data loaders w/ augmentation pipeiine + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_repeats=args.aug_repeats, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + loader_eval = create_loader( + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size or args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + # setup loss function + if args.jsd_loss: + assert num_aug_splits > 1 # JSD only valid with aug splits set + train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing) + elif mixup_active: + # smoothing is handled with mixup target transform which outputs sparse, soft targets + if args.bce_loss: + train_loss_fn = nn.BCEWithLogitsLoss() + else: + train_loss_fn = SoftTargetCrossEntropy() + elif args.smoothing: + if args.bce_loss: + train_loss_fn = BinaryCrossEntropy(smoothing=args.smoothing) + else: + train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + train_loss_fn = nn.CrossEntropyLoss() + train_loss_fn = train_loss_fn.cuda() + validate_loss_fn = nn.CrossEntropyLoss().cuda() + + # setup checkpoint saver and eval metric tracking + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + saver = None + output_dir = None + if args.rank == 0: + if args.experiment: + exp_name = args.experiment + else: + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + safe_model_name(args.model), + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(args.output if args.output else './output/train', exp_name) + decreasing = True if eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist) + with 
open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: + for epoch in range(start_epoch, num_epochs): + if args.distributed and hasattr(loader_train.sampler, 'set_epoch'): + loader_train.sampler.set_epoch(epoch) + + train_metrics = train_one_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.module, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + if output_dir is not None: + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None, log_wandb=args.log_wandb and has_wandb) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_one_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir=None, amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * len(loader) + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.cuda(), target.cuda() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, + clip_grad=args.clip_grad, clip_mode=args.clip_mode, + parameters=model_parameters(model, exclude_head='agc' in args.clip_mode), + create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + dispatch_clip_grad( + model_parameters(model, exclude_head='agc' in args.clip_mode), + value=args.clip_grad, mode=args.clip_mode) + optimizer.step() + + if 
model_ema is not None: + model_ema.update(model) + + torch.cuda.synchronize() + num_updates += 1 + batch_time_m.update(time.time() - end) + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:#.4g} ({loss.avg:#.3g}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. * batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.cuda() + target = target.cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.cuda.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if args.local_rank == 0 and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', 
top5_m.avg)])
+
+    return metrics
+
+
+if __name__ == '__main__':
+    main()
diff --git a/NLP/BERT/README.md b/NLP/BERT/README.md
new file mode 100644
index 0000000..7e8d3d9
--- /dev/null
+++ b/NLP/BERT/README.md
@@ -0,0 +1,213 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+
+
+## Installation of Fairseq
+
+Our experiments are based on the repo [Fairseq](https://github.com/facebookresearch/fairseq). For the requirements and installation of [Fairseq](https://github.com/facebookresearch/fairseq) and Apex, please refer to that repo.
+
+
+
+## Environment
+
+Our experiments for this task are based on the following package versions.
+
+```python
+torch.__version__ = '1.10.1+cu111'
+torchvision.__version__ = '0.11.2+cu111'
+torchaudio.__version__ = '0.10.1+cu111'
+fairseq.__version__ = '0.12.2'
+```
+
+If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:fairseq](https://hub.docker.com/repository/docker/xyxie/adan-image).
+
+
+
+## Usage of Adan in Fairseq
+
+### One step to use Adan
+
+Please first put the file [`adan.py`](./adan.py) into the directory `path/to/fairseq/fairseq/optim`. Then you can choose Adan as the optimizer in the config file. See the following example for pre-training:
+
+```yaml
+optimizer:
+  _name: adan
+  weight_decay: 0.02
+  adan_betas: (0.98,0.92,0.99)
+  adan_eps: 1e-08
+```
+
+
+
+## Pretraining
+
+The following steps are modified from [Fairseq-Roberta](https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.pretraining.md). For completeness, we list the key steps here.
+
+
+### 1) Preprocess the data
+
+Data should be preprocessed following the [language modeling format](https://github.com/facebookresearch/fairseq/tree/main/examples/language_model). That is, each document should be separated by an empty line (only needed with `--sample-break-mode complete_doc`), and all lines should be concatenated as a 1D text stream during training.
+
+
+
+In the following steps, we use the [Bookcorpus dataset](https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz) and [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:Database_download) to demonstrate how to preprocess raw text data with the GPT-2 BPE.
+
+#### i) Download the dataset:
+
+```bash
+wget https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz
+tar -zxvf books1.tar.gz -C ./bert-corpus/
+```
+
+```python
+# pip install datasets
+from datasets import load_dataset
+
+dataset = load_dataset("wikipedia", "20220301.en")
+```
+
+#### ii) Generate raw data:
+
+ - For the Wikipedia dataset, read each line of the JSON-lines file, replace the `\n` in the text field with a space, and write the line (with a trailing `\n`) to a new file `all_data.raw`.
+
+ - For the BookCorpus dataset, read the contents of each book, replace each `\n` with a space, and write the whole book as a single line in `all_data.raw`, terminated by `\n`.
+
+ - Split `all_data.raw` into `wiki.train.raw` and `wiki.valid.raw` with a 99:1 ratio, and set `wiki.test.raw = wiki.valid.raw` for compatibility with fairseq; these names match the encoding step in iii). A minimal sketch of these steps follows below.
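+
+Below is a minimal sketch of step ii) (our illustration, not part of the original pipeline): the BookCorpus glob path and the uniform shuffle before the 99:1 split are assumptions, and the outputs use the `train`/`valid`/`test` naming expected by step iii).
+
+```python
+import glob
+import random
+
+from datasets import load_dataset
+
+out_dir = "bert-corpus"
+wiki = load_dataset("wikipedia", "20220301.en", split="train")
+with open(f"{out_dir}/all_data.raw", "w") as out:
+    # Wikipedia: one article per line, newlines replaced by spaces.
+    for article in wiki:
+        out.write(article["text"].replace("\n", " ") + "\n")
+    # BookCorpus: one book per line (adjust the glob to wherever books1.tar.gz unpacked).
+    for book in sorted(glob.glob(f"{out_dir}/books1/epubtxt/*.txt")):
+        with open(book, errors="ignore") as f:
+            out.write(f.read().replace("\n", " ") + "\n")
+
+# 99:1 train/valid split; the test split simply reuses the valid split.
+with open(f"{out_dir}/all_data.raw") as f:
+    lines = f.readlines()
+random.seed(0)
+random.shuffle(lines)
+cut = int(len(lines) * 0.99)
+for name, part in {"train": lines[:cut], "valid": lines[cut:], "test": lines[cut:]}.items():
+    with open(f"{out_dir}/wiki.{name}.raw", "w") as f:
+        f.writelines(part)
+```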
+
+
+
+#### iii) Encode data with the GPT-2 BPE:
+
+```bash
+mkdir -p gpt2_bpe
+wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
+wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
+for SPLIT in train valid test; do \
+    python -m examples.roberta.multiprocessing_bpe_encoder \
+        --encoder-json gpt2_bpe/encoder.json \
+        --vocab-bpe gpt2_bpe/vocab.bpe \
+        --inputs bert-corpus/wiki.${SPLIT}.raw \
+        --outputs bert-corpus/wiki.${SPLIT}.bpe \
+        --keep-empty \
+        --workers 60; \
+done
+```
+
+
+
+#### iv) Binarize the data using the GPT-2 fairseq dictionary:
+
+```bash
+wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
+fairseq-preprocess \
+    --only-source \
+    --srcdict gpt2_bpe/dict.txt \
+    --trainpref bert-corpus/wiki.train.bpe \
+    --validpref bert-corpus/wiki.valid.bpe \
+    --testpref bert-corpus/wiki.test.bpe \
+    --destdir data-bin/bert-corpus \
+    --workers 60
+```
+
+
+
+### 2) Train BERT base
+
+Put the provided [config files](./config/pretraining) into the directory `path/to/fairseq/examples/roberta/config/pretraining`.
+
+```bash
+DATA_DIR=/path/to/fairseq/bert-corpus
+
+fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \
+--config-name ${NAME} task.data=$DATA_DIR \
+checkpoint.save_dir=/path/to/save_dir/
+
+```
+
+We can optionally resume training of the released BERT-base model by adding `checkpoint.restore_file=/path/to/model.pt`. Note that in our experiments, we use Adan to train BERT-base from scratch. You can use the following config files to train BERT-base with Adam or Adan:
+
+ | NAME | Optimizer | Config | Download |
+ | :-------: | :-------: | :----------------------------------------------------: | :------------------------------------------------------: |
+ | bert-base | Adam | [config](./exp_results/pretrain/full_config-adam.yaml) | [log](./exp_results/pretrain/hydra_train-adam.log)/model |
+ | bert-adan | Adan | [config](./exp_results/pretrain/full_config-adan.yaml) | [log](./exp_results/pretrain/hydra_train-adan.log)/model |
+
+The above command assumes training on 8x40GB A100 GPUs. Each GPU uses a batch size of 32 sequences (`dataset.batch_size`). If you have fewer GPUs or GPUs with less memory, you may need to reduce `dataset.batch_size` and increase `dataset.update_freq` to compensate. Alternatively, if you have more GPUs, you can decrease `dataset.update_freq` accordingly to speed up training.
+
+
+## Finetuning BERT-base on GLUE tasks
+
+### 1) Download the data from the [GLUE website](https://gluebenchmark.com/tasks) using the following commands:
+```bash
+wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
+python download_glue_data.py --data_dir glue_data --tasks all
+```
+There are some problems downloading `MRPC` and `MNLI`, so we skip the `MRPC` task and download the `MNLI` data from unofficial sources.
+
+
+
+### 2) Preprocess GLUE task data:
+
+```bash
+./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
+```
+- `glue_task_name` is one of the following: `{ALL, QQP, MNLI, QNLI, RTE, STS-B, SST-2, CoLA}`. Use `ALL` to preprocess all the GLUE tasks; see the example below.
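+
+For example (our illustration; run from the fairseq root after the download step above), to preprocess only the `RTE` task:
+
+```bash
+./examples/roberta/preprocess_GLUE_tasks.sh glue_data RTE
+```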
+
+
+
+### 3) Fine-tuning on GLUE tasks:
+
+Example fine-tuning command for the `RTE` task:
+```bash
+TASK=RTE;
+
+python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --avg_num 1 \
+--data_path /path/to/fairseq/GLUE/glue_data/$TASK \
+--bin_path /path/to/fairseq/GLUE/$TASK-bin \
+--pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \
+--finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \
+--task rte-adan
+```
+
+- `avg_num`: number of repetitions.
+
+- `data_path`: path to the data of the GLUE task, e.g., CoLA, MNLI, etc.
+
+- `bin_path`: similar to `data_path`, but the path to the binarized data after preprocessing.
+
+- `pre_path`: path to the pre-trained model.
+
+- `finetune_path`: path to save/load the fine-tuned model.
+
+- `task`: config name; please refer to the [fine-tuning](./config/finetuning) directory for the additional config files for each of the GLUE tasks.
+
+- These command-line arguments and hyperparameters were tested on one NVIDIA `A100` GPU with `40GB` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
+
+
+
+### 4) Inference on GLUE task
+After training the model in the previous step, we can run inference with the checkpoints in the `finetune_path` directory using the following command:
+
+```bash
+TASK=RTE;
+
+python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --inference \
+--data_path /path/to/fairseq/GLUE/glue_data/$TASK \
+--bin_path /path/to/fairseq/GLUE/$TASK-bin \
+--pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \
+--finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \
+--task rte-adan
+
+```
+
+This should give:
+
+| GLUE-Task | Metric                       |  Result   |                    Config                     |
+| --------- | :--------------------------- | :-------: | :-------------------------------------------: |
+| CoLA      | Matthews corr.               |   64.6    | [config](./config/finetuning/cola-adan.yaml)  |
+| SST-2     | Accuracy                     |   93.2    | [config](./config/finetuning/sst_2-adan.yaml) |
+| STS-B     | Pearson corr.                |   89.3    | [config](./config/finetuning/sts_b-adan.yaml) |
+| QQP       | Accuracy                     |   91.2    | [config](./config/finetuning/qqp-adan.yaml)   |
+| MNLI      | Matched acc./Mismatched acc. | 85.7/85.6 | [config](./config/finetuning/mnli-adan.yaml)  |
+| QNLI      | Accuracy                     |   91.3    | [config](./config/finetuning/qnli-adan.yaml)  |
+| RTE       | Accuracy                     |   73.3    | [config](./config/finetuning/rte-adan.yaml)   |
+
diff --git a/NLP/BERT/adan.py b/NLP/BERT/adan.py
new file mode 100644
index 0000000..65326ce
--- /dev/null
+++ b/NLP/BERT/adan.py
@@ -0,0 +1,231 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
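+
+# Summary of the update implemented in Adan.step() below (our annotation of the
+# code; g_t is the current gradient, g_{t-1} the previous one, t the step count):
+#   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t                                  (exp_avg)
+#   d_t = beta2 * d_{t-1} + (1 - beta2) * (g_t - g_{t-1})                      (exp_avg_diff)
+#   v_t = beta3 * v_{t-1} + (1 - beta3) * (g_t + beta2 * (g_t - g_{t-1}))**2   (exp_avg_sq)
+#   eta_t = lr / (sqrt(v_t / (1 - beta3**t)) + eps)
+#   step  = m_t / (1 - beta1**t) + beta2 * d_t / (1 - beta2**t)
+#   no_prox=True : p <- (1 - lr * weight_decay) * p - eta_t * step
+#   no_prox=False: p <- (p - eta_t * step) / (1 + lr * weight_decay)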
+ + +import logging +import math +from collections.abc import Collection +from dataclasses import dataclass, field +from typing import Any, List + +import torch +import torch.distributed as dist +import torch.optim +from fairseq.dataclass import FairseqDataclass +from fairseq.optim import FairseqOptimizer, register_optimizer +from omegaconf import II, OmegaConf + + +logger = logging.getLogger(__name__) + + +@dataclass +class FairseqAdanConfig(FairseqDataclass): + adan_betas: Any = field( + default=(0.98, 0.92, 0.99), metadata={"help": "betas for Adan optimizer"} + ) + adan_eps: float = field( + default=1e-8, metadata={"help": "epsilon for Adam optimizer"} + ) + weight_decay: float = field(default=0.0, metadata={"help": "weight decay"}) + + no_prox: bool = field( + default=False, metadata={"help": "wether to perform prox operator"} + ) + fp16_adan_stats: bool = field( + default=False, metadata={"help": "use FP16 stats (with automatic scaling)"} + ) + # TODO common vars below in parent + tpu: bool = II("common.tpu") + lr: List[float] = II("optimization.lr") + + +@register_optimizer("adan", dataclass=FairseqAdanConfig) +class FairseqAdan(FairseqOptimizer): + """ + Adan optimizer for fairseq. + """ + + def __init__(self, cfg: FairseqAdanConfig, params): + super().__init__(cfg) + fused_adan_cls = None + use_fused_adan = ( + fused_adan_cls is not None + and torch.cuda.is_available() + ) + if getattr(cfg, "tpu", False): + if self.cfg.fp16_adan_stats: + raise NotImplementedError("--fp16-adam-stats is only supported on GPU") + # on TPUs we use the Adam defined here, since it + # automatically casts gradients to FP32 + self._optimizer = Adan(params, **self.optimizer_config) + elif use_fused_adan: + raise NotImplementedError("--fp16-adam-stats is only supported on GPU") + else: + if self.cfg.fp16_adan_stats: + raise NotImplementedError( + "--fp16-adam-stats is only supported with FusedAdanV1" + ) + self._optimizer = Adan(params, **self.optimizer_config) + + @property + def optimizer_config(self): + """ + Return a kwarg dictionary that will be used to override optimizer + args stored in checkpoints. This allows us to load a checkpoint and + resume training using a different set of optimizer args, e.g., with a + different learning rate. + """ + return { + "lr": self.cfg.lr[0] + if isinstance(self.cfg.lr, Collection) + else self.cfg.lr, + "betas": eval(self.cfg.adan_betas) + if isinstance(self.cfg.adan_betas, str) + else OmegaConf.to_container(self.cfg.adan_betas), + "eps": self.cfg.adan_eps, + "weight_decay": self.cfg.weight_decay, + } + + def average_params(self): + """Reduce Params is only used during BMUF distributed training.""" + state_dict = self.optimizer.state_dict() + total_gpus = float(dist.get_world_size()) + + for _, value in state_dict["state"].items(): + value["exp_avg"] /= total_gpus + value["exp_avg_sq"] /= total_gpus + value['exp_avg_diff'] /= total_gpus + dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM) + dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM) + dist.all_reduce(value["exp_avg_diff"], op=dist.ReduceOp.SUM) + + +class Adan(torch.optim.Optimizer): + r"""Implements Adan algorithm. 
+ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.98, 0.92, 0.99)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + """ + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, no_prox = False): + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, no_prox = no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @property + def supports_memory_efficient_fp16(self): + return True + + @property + def supports_flat_params(self): + return True + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + p_data_fp32 = p.data + if p.data.dtype in {torch.float16, torch.bfloat16}: + p_data_fp32 = p_data_fp32.float() + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + state['exp_avg_diff'] = torch.zeros_like(p_data_fp32) + else: + state["exp_avg"] = state["exp_avg"].to(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) + state['exp_avg_diff'] = state['exp_avg_diff'].to(p_data_fp32) + + + grad = p.grad.data + if grad.dtype in {torch.float16, torch.bfloat16}: + grad = grad.float() + if grad.is_sparse: + raise RuntimeError( + "Adan does not support sparse gradients, please consider SparseAdam instead" + ) + + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + + copy_grad = grad.clone() + + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + + update = grad+beta2*diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # v_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg/bias_correction1+beta2*exp_avg_diff/bias_correction2) ).div_(denom) + + if group['no_prox']: + p_data_fp32.mul_(1 - group['lr'] * group['weight_decay']) + p_data_fp32.add_(update, alpha=-group['lr']) + else: + p_data_fp32.add_(update, alpha=-group['lr']) + p_data_fp32.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad + + if p.data.dtype in {torch.float16, torch.bfloat16}: + p.data.copy_(p_data_fp32) + return loss 
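+
+
+if __name__ == "__main__":
+    # Illustrative sanity check only (our addition, not used by the fairseq
+    # integration above): fit a toy linear regression with the plain Adan class.
+    torch.manual_seed(0)
+    model = torch.nn.Linear(10, 1)
+    target_w = torch.randn(10, 1)
+    optimizer = Adan(model.parameters(), lr=1e-3,
+                     betas=(0.98, 0.92, 0.99), weight_decay=0.02)
+    for _ in range(200):
+        x = torch.randn(64, 10)
+        loss = (model(x) - x @ target_w).pow(2).mean()
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    print(f"final toy loss: {loss.item():.4f}")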
diff --git a/NLP/BERT/config/finetuning/acc_test.py b/NLP/BERT/config/finetuning/acc_test.py new file mode 100644 index 0000000..efd67df --- /dev/null +++ b/NLP/BERT/config/finetuning/acc_test.py @@ -0,0 +1,116 @@ +import os +from fairseq.models.roberta import RobertaModel +import argparse +from scipy.stats import pearsonr +from sklearn.metrics import matthews_corrcoef + + +def get_acc(model_path, data_path, bin_path, task='rte'): + acc_list = [] + gold, pred = [], [] + roberta = RobertaModel.from_pretrained( + model_path, + checkpoint_file='checkpoint_best.pt', + data_name_or_path=bin_path#'RTE-bin' + ) + + label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] + ) + ncorrect, nsamples = 0, 0 + roberta.cuda() + roberta.eval() + if 'mnli' not in task: + dev_files = ['dev.tsv'] + else: dev_files = ['dev_mismatched.tsv', 'dev_matched.tsv'] + for dev_file in dev_files: + with open(os.path.join(data_path, dev_file)) as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + if 'rte' in task or 'qnli' in task: + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = roberta.encode(sent1, sent2) + elif 'qqp' in task: + sent1, sent2, target = tokens[3], tokens[4], tokens[5] + tokens = roberta.encode(sent1, sent2) + elif 'mnli' in task: + sent1, sent2, target = tokens[8], tokens[9], tokens[11] + tokens = roberta.encode(sent1, sent2) + elif 'mrpc' in task: + sent1, sent2, target = tokens[3], tokens[4], tokens[0] + tokens = roberta.encode(sent1, sent2) + elif 'sts_b' in task: + sent1, sent2, target = tokens[7], tokens[8], float(tokens[9]) + tokens = roberta.encode(sent1, sent2) + elif 'sst_2' in task: + sent, target = tokens[0], tokens[1] + tokens = roberta.encode(sent) + + elif 'cola' in task: + sent, target = tokens[3], tokens[1] + tokens = roberta.encode(sent) + if 'sts_b' not in task: + prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + + nsamples += 1 + if 'cola' in task: + target = int(target) + prediction_label = int(prediction_label) + pred.append(prediction_label) + gold.append(target) + + else: + features = roberta.extract_features(tokens) + predictions = 5.0 * roberta.model.classification_heads['sentence_classification_head'](features) + gold.append(target) + pred.append(predictions.item()) + if 'cola' in task: + out = matthews_corrcoef(gold, pred) + elif 'sts_b' in task: + out = pearsonr(gold, pred)[0] + else: out = float(ncorrect)/float(nsamples) + + acc_list.append(out) + return acc_list + + +parser = argparse.ArgumentParser(description='GLUE test for acc') +parser.add_argument('--avg_num', type=int, default=1, + help='number of try') +parser.add_argument('--pre_path', type=str, default='./baseline/checkpoint_20_1000000.pt', + help='path to pre-trained model') +parser.add_argument('--data_path', type=str, default='./GLUE/glue_data/STS-B', + help='path to data') +parser.add_argument('--bin_path', type=str, default='./GLUE/STS-B-bin', + help='path to -bin data') +parser.add_argument('--finetune_path', type=str, default='./bert-fintune/adam/STS-B/', + help='path to finetuned model') +parser.add_argument('--task', type=str, default='sts_b', + help='task of finetune') +parser.add_argument('--inference', action='store_true', default=False, + help='inference only') +args = parser.parse_args() + + +acc_avg = 0.0 +acc_avg2 = 0.0 +for _ in range(args.avg_num): + if 
not args.inference: + val = os.system(' fairseq-hydra-train --config-dir ./fairseq/examples/roberta/config/finetuning \ + --config-name {} \ + task.data={} checkpoint.restore_file={} \ + checkpoint.save_dir={}'.format(args.task, args.bin_path, args.pre_path, args.finetune_path)) + all_acc = get_acc(args.finetune_path, args.data_path, args.bin_path, args.task) + acc_avg+=all_acc[0] + if len(all_acc)>1: + acc_avg2+=all_acc[1] + +if acc_avg2>0: + print('Mismatched Accuracy1:{}, Matched Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num), float(acc_avg2)/float(args.avg_num))) +else: + print('AVG Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num))) + + \ No newline at end of file diff --git a/NLP/BERT/config/finetuning/cola-adan.yaml b/NLP/BERT/config/finetuning/cola-adan.yaml new file mode 100644 index 0000000..cddfbfe --- /dev/null +++ b/NLP/BERT/config/finetuning/cola-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/cola.yaml b/NLP/BERT/config/finetuning/cola.yaml new file mode 100644 index 0000000..ac76611 --- /dev/null +++ b/NLP/BERT/config/finetuning/cola.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/mnli-adan.yaml b/NLP/BERT/config/finetuning/mnli-adan.yaml new file mode 100644 index 0000000..8edf286 --- /dev/null +++ b/NLP/BERT/config/finetuning/mnli-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.92,0.999) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 1.0 + lr: [2.0e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/mnli.yaml b/NLP/BERT/config/finetuning/mnli.yaml new file mode 100644 index 0000000..5be10c3 --- /dev/null +++ b/NLP/BERT/config/finetuning/mnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qnli-adan.yaml b/NLP/BERT/config/finetuning/qnli-adan.yaml new file mode 100644 index 0000000..36f1bce --- /dev/null +++ b/NLP/BERT/config/finetuning/qnli-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.001 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qnli.yaml b/NLP/BERT/config/finetuning/qnli.yaml new file mode 100644 index 0000000..b4595b0 --- /dev/null +++ b/NLP/BERT/config/finetuning/qnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qqp-adan.yaml b/NLP/BERT/config/finetuning/qqp-adan.yaml new file mode 100644 index 0000000..df48414 --- /dev/null +++ b/NLP/BERT/config/finetuning/qqp-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.001 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/qqp.yaml b/NLP/BERT/config/finetuning/qqp.yaml new file mode 100644 index 0000000..5a2b2ed --- /dev/null +++ b/NLP/BERT/config/finetuning/qqp.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/rte-adan.yaml b/NLP/BERT/config/finetuning/rte-adan.yaml new file mode 100644 index 0000000..c43f6e2 --- /dev/null +++ b/NLP/BERT/config/finetuning/rte-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/rte.yaml b/NLP/BERT/config/finetuning/rte.yaml new file mode 100644 index 0000000..7318465 --- /dev/null +++ b/NLP/BERT/config/finetuning/rte.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sst_2-adan.yaml b/NLP/BERT/config/finetuning/sst_2-adan.yaml new file mode 100644 index 0000000..ed79f63 --- /dev/null +++ b/NLP/BERT/config/finetuning/sst_2-adan.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.92,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sst_2.yaml b/NLP/BERT/config/finetuning/sst_2.yaml new file mode 100644 index 0000000..a93ad2f --- /dev/null +++ b/NLP/BERT/config/finetuning/sst_2.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sts_b-adan.yaml b/NLP/BERT/config/finetuning/sts_b-adan.yaml new file mode 100644 index 0000000..6c4069f --- /dev/null +++ b/NLP/BERT/config/finetuning/sts_b-adan.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + regression_target: true + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adan + weight_decay: 0.01 + adan_betas: (0.98,0.99,0.99) + adan_eps: 1e-8 + +lr_scheduler: + _name: cosine + warmup_updates: 214 + +optimization: + clip_norm: 0.5 + lr: [4e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/finetuning/sts_b.yaml b/NLP/BERT/config/finetuning/sts_b.yaml new file mode 100644 index 0000000..2d49522 --- /dev/null +++ b/NLP/BERT/config/finetuning/sts_b.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + regression_target: true + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 214 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/pretraining/base.yaml b/NLP/BERT/config/pretraining/base.yaml new file mode 100644 index 0000000..9782990 --- /dev/null +++ b/NLP/BERT/config/pretraining/base.yaml @@ -0,0 +1,42 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + +dataset: + batch_size: 16 + ignore_unused_valid_subsets: true + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 0 + lr: [0.0005] + max_update: 125000 + update_freq: [16] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 diff --git a/NLP/BERT/config/pretraining/bert-adan.yaml b/NLP/BERT/config/pretraining/bert-adan.yaml new file mode 100644 index 0000000..b0e3ebb --- /dev/null +++ b/NLP/BERT/config/pretraining/bert-adan.yaml @@ -0,0 +1,52 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + save_interval: 5 + save_interval_updates: 50000 + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + + + +optimizer: + _name: adan + weight_decay: 0.02 + adan_betas: (0.98,0.92,0.99) + adan_eps: 1e-08 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 5.0 + lr: [0.001] + max_update: 1000000 + update_freq: [1] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 + +distributed_training: + ddp_backend: no_c10d + +dataset: + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 50000 + batch_size: 32 + ignore_unused_valid_subsets: true diff --git a/NLP/BERT/config/pretraining/bert-base.yaml b/NLP/BERT/config/pretraining/bert-base.yaml new file mode 100644 index 0000000..f8ae660 --- /dev/null +++ b/NLP/BERT/config/pretraining/bert-base.yaml @@ -0,0 +1,54 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_dir: 'bert/baseline/' + no_epoch_checkpoints: true + save_interval: 5 + save_interval_updates: 50000 + +task: + _name: masked_lm + data: ??? 
+ sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + + + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 0 + lr: [0.0001] + max_update: 1000000 + update_freq: [1] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 + +distributed_training: + ddp_backend: no_c10d + +dataset: + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 50000 + batch_size: 32 + ignore_unused_valid_subsets: true + diff --git a/NLP/BERT/exp_results/pretrain/full_config-adam.yaml b/NLP/BERT/exp_results/pretrain/full_config-adam.yaml new file mode 100644 index 0000000..5a35e9b --- /dev/null +++ b/NLP/BERT/exp_results/pretrain/full_config-adam.yaml @@ -0,0 +1,376 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + hydra_help: ??? + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + output_subdir: .hydra + overrides: + hydra: [] + task: + - task.data=/dataset/common/bert-corpus-0729/ + job: + name: hydra_train + override_dirname: task.data=/dataset/common/bert-corpus-0729/ + id: ??? + num: ??? 
+ config_name: bert-base + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.0.7 + cwd: /vit-opt/fairseq + verbose: false +_name: null +common: + _name: null + no_progress_bar: false + log_interval: 200 + log_format: json + log_file: null + aim_repo: null + aim_run_hash: null + tensorboard_logdir: null + wandb_project: null + azureml_logging: false + seed: 1 + cpu: false + tpu: false + bf16: false + memory_efficient_bf16: false + fp16: true + memory_efficient_fp16: false + fp16_no_flatten_grads: false + fp16_init_scale: 128 + fp16_scale_window: null + fp16_scale_tolerance: 0.0 + on_cpu_convert_precision: false + min_loss_scale: 0.0001 + threshold_loss_scale: null + amp: false + amp_batch_retries: 2 + amp_init_scale: 128 + amp_scale_window: null + user_dir: null + empty_cache_freq: 0 + all_gather_list_size: 16384 + model_parallel_size: 1 + quantization_config_path: null + profile: false + reset_logging: false + suppress_crashes: false + use_plasma_view: false + plasma_path: /tmp/plasma +common_eval: + _name: null + path: null + post_process: null + quiet: false + model_overrides: '{}' + results_path: null +distributed_training: + _name: null + distributed_world_size: 8 + distributed_num_procs: 8 + distributed_rank: 0 + distributed_backend: nccl + distributed_init_method: null + distributed_port: -1 + device_id: 0 + distributed_no_spawn: false + ddp_backend: no_c10d + ddp_comm_hook: none + bucket_cap_mb: 25 + fix_batches_to_gpus: false + find_unused_parameters: false + gradient_as_bucket_view: false + fast_stat_sync: false + heartbeat_timeout: -1 + broadcast_buffers: false + slowmo_momentum: null + slowmo_base_algorithm: localsgd + localsgd_frequency: 3 + nprocs_per_node: 8 + pipeline_model_parallel: false + pipeline_balance: null + pipeline_devices: null + pipeline_chunks: 0 + pipeline_encoder_balance: null + pipeline_encoder_devices: null + pipeline_decoder_balance: null + pipeline_decoder_devices: null + pipeline_checkpoint: never + zero_sharding: none + fp16: ${common.fp16} + memory_efficient_fp16: ${common.memory_efficient_fp16} + tpu: ${common.tpu} + no_reshard_after_forward: false + fp32_reduce_scatter: false + cpu_offload: false + use_sharded_state: false + not_fsdp_flatten_parameters: false +dataset: + _name: null + num_workers: 1 + skip_invalid_size_inputs_valid_test: true + max_tokens: null + batch_size: 32 + required_batch_size_multiple: 8 + required_seq_len_multiple: 1 + dataset_impl: null + data_buffer_size: 10 + train_subset: train + valid_subset: valid + combine_valid_subsets: null + ignore_unused_valid_subsets: true + validate_interval: 5 + validate_interval_updates: 50000 + validate_after_updates: 0 + fixed_validation_seed: null + disable_validation: false + max_tokens_valid: ${dataset.max_tokens} + batch_size_valid: ${dataset.batch_size} + max_valid_steps: null + curriculum: 0 + gen_subset: test + num_shards: 1 + shard_id: 0 + grouped_shuffling: false + update_epoch_batch_itr: ${dataset.grouped_shuffling} + update_ordered_indices_seed: false +optimization: + _name: null + max_epoch: 0 + max_update: 1000000 + stop_time_hours: 0.0 + clip_norm: 0.0 + sentence_avg: false + update_freq: + - 1 + lr: + - 0.0001 + stop_min_lr: -1.0 + use_bmuf: false + skip_remainder_batch: false +checkpoint: + _name: null + save_dir: bert/baseline/ + restore_file: checkpoint_last.pt + continue_once: null + finetune_from_model: null + reset_dataloader: false + reset_lr_scheduler: false + reset_meters: false + 
reset_optimizer: false + optimizer_overrides: '{}' + save_interval: 5 + save_interval_updates: 50000 + keep_interval_updates: -1 + keep_interval_updates_pattern: -1 + keep_last_epochs: -1 + keep_best_checkpoints: -1 + no_save: false + no_epoch_checkpoints: true + no_last_checkpoints: false + no_save_optimizer_state: false + best_checkpoint_metric: loss + maximize_best_checkpoint_metric: false + patience: -1 + checkpoint_suffix: '' + checkpoint_shard_count: 1 + load_checkpoint_on_all_dp_ranks: false + write_checkpoints_asynchronously: false + model_parallel_size: ${common.model_parallel_size} +bmuf: + _name: null + block_lr: 1.0 + block_momentum: 0.875 + global_sync_iter: 50 + warmup_iterations: 500 + use_nbm: false + average_sync: false + distributed_world_size: ${distributed_training.distributed_world_size} +generation: + _name: null + beam: 5 + nbest: 1 + max_len_a: 0.0 + max_len_b: 200 + min_len: 1 + match_source_len: false + unnormalized: false + no_early_stop: false + no_beamable_mm: false + lenpen: 1.0 + unkpen: 0.0 + replace_unk: null + sacrebleu: false + score_reference: false + prefix_size: 0 + no_repeat_ngram_size: 0 + sampling: false + sampling_topk: -1 + sampling_topp: -1.0 + constraints: null + temperature: 1.0 + diverse_beam_groups: -1 + diverse_beam_strength: 0.5 + diversity_rate: -1.0 + print_alignment: null + print_step: false + lm_path: null + lm_weight: 0.0 + iter_decode_eos_penalty: 0.0 + iter_decode_max_iter: 10 + iter_decode_force_max_iter: false + iter_decode_with_beam: 1 + iter_decode_with_external_reranker: false + retain_iter_history: false + retain_dropout: false + retain_dropout_modules: null + decoding_format: null + no_seed_provided: false + eos_token: null +eval_lm: + _name: null + output_word_probs: false + output_word_stats: false + context_window: 0 + softmax_batch: 9223372036854775807 +interactive: + _name: null + buffer_size: 0 + input: '-' +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 +task: + _name: masked_lm + data: /dataset/common/bert-corpus-0729/ + sample_break_mode: complete + tokens_per_sample: 512 +criterion: masked_lm +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1.0e-06 +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 +scoring: null +bpe: null +tokenizer: null +ema: + _name: null + store_ema: false + ema_decay: 0.9999 + ema_start_update: 0 + ema_seed_model: null + ema_update_freq: 1 + ema_fp32: false diff --git a/NLP/BERT/exp_results/pretrain/full_config-adan.yaml b/NLP/BERT/exp_results/pretrain/full_config-adan.yaml new file mode 100644 index 0000000..7ec930a --- /dev/null +++ b/NLP/BERT/exp_results/pretrain/full_config-adan.yaml @@ -0,0 +1,376 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.job.name}.log + root: + level: INFO + handlers: + - 
console + - file + disable_existing_loggers: false + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + hydra_help: ??? + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + output_subdir: .hydra + overrides: + hydra: [] + task: + - task.data=/dataset/common/bert-corpus-0729/ + job: + name: hydra_train + override_dirname: task.data=/dataset/common/bert-corpus-0729/ + id: ??? + num: ??? + config_name: bert-adan2 + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.0.7 + cwd: /vit-opt/fairseq + verbose: false +_name: null +common: + _name: null + no_progress_bar: false + log_interval: 200 + log_format: json + log_file: null + aim_repo: null + aim_run_hash: null + tensorboard_logdir: null + wandb_project: null + azureml_logging: false + seed: 1 + cpu: false + tpu: false + bf16: false + memory_efficient_bf16: false + fp16: true + memory_efficient_fp16: false + fp16_no_flatten_grads: false + fp16_init_scale: 128 + fp16_scale_window: null + fp16_scale_tolerance: 0.0 + on_cpu_convert_precision: false + min_loss_scale: 0.0001 + threshold_loss_scale: null + amp: false + amp_batch_retries: 2 + amp_init_scale: 128 + amp_scale_window: null + user_dir: null + empty_cache_freq: 0 + all_gather_list_size: 16384 + model_parallel_size: 1 + quantization_config_path: null + profile: false + reset_logging: false + suppress_crashes: false + use_plasma_view: false + plasma_path: /tmp/plasma +common_eval: + _name: null + path: null + post_process: null + quiet: false + model_overrides: '{}' + results_path: null +distributed_training: + _name: null + distributed_world_size: 8 + distributed_num_procs: 8 + distributed_rank: 0 + distributed_backend: nccl + distributed_init_method: null + distributed_port: -1 + device_id: 0 + distributed_no_spawn: false + ddp_backend: no_c10d + ddp_comm_hook: none + bucket_cap_mb: 25 + fix_batches_to_gpus: false + find_unused_parameters: false + gradient_as_bucket_view: false + fast_stat_sync: false + heartbeat_timeout: -1 + broadcast_buffers: false + slowmo_momentum: null + slowmo_base_algorithm: localsgd + localsgd_frequency: 3 + nprocs_per_node: 8 + pipeline_model_parallel: false + pipeline_balance: null + pipeline_devices: null + pipeline_chunks: 0 + pipeline_encoder_balance: null + pipeline_encoder_devices: null + pipeline_decoder_balance: null + pipeline_decoder_devices: null + pipeline_checkpoint: never + zero_sharding: none + fp16: ${common.fp16} + memory_efficient_fp16: ${common.memory_efficient_fp16} + tpu: ${common.tpu} + no_reshard_after_forward: false + fp32_reduce_scatter: false + 
cpu_offload: false + use_sharded_state: false + not_fsdp_flatten_parameters: false +dataset: + _name: null + num_workers: 1 + skip_invalid_size_inputs_valid_test: true + max_tokens: null + batch_size: 32 + required_batch_size_multiple: 8 + required_seq_len_multiple: 1 + dataset_impl: null + data_buffer_size: 10 + train_subset: train + valid_subset: valid + combine_valid_subsets: null + ignore_unused_valid_subsets: true + validate_interval: 5 + validate_interval_updates: 50000 + validate_after_updates: 0 + fixed_validation_seed: null + disable_validation: false + max_tokens_valid: ${dataset.max_tokens} + batch_size_valid: ${dataset.batch_size} + max_valid_steps: null + curriculum: 0 + gen_subset: test + num_shards: 1 + shard_id: 0 + grouped_shuffling: false + update_epoch_batch_itr: ${dataset.grouped_shuffling} + update_ordered_indices_seed: false +optimization: + _name: null + max_epoch: 0 + max_update: 1000000 + stop_time_hours: 0.0 + clip_norm: 5.0 + sentence_avg: false + update_freq: + - 1 + lr: + - 0.001 + stop_min_lr: -1.0 + use_bmuf: false + skip_remainder_batch: false +checkpoint: + _name: null + save_dir: bert/adan2/ + restore_file: checkpoint_last.pt + continue_once: null + finetune_from_model: null + reset_dataloader: false + reset_lr_scheduler: false + reset_meters: false + reset_optimizer: false + optimizer_overrides: '{}' + save_interval: 5 + save_interval_updates: 50000 + keep_interval_updates: -1 + keep_interval_updates_pattern: -1 + keep_last_epochs: -1 + keep_best_checkpoints: -1 + no_save: false + no_epoch_checkpoints: true + no_last_checkpoints: false + no_save_optimizer_state: false + best_checkpoint_metric: loss + maximize_best_checkpoint_metric: false + patience: -1 + checkpoint_suffix: '' + checkpoint_shard_count: 1 + load_checkpoint_on_all_dp_ranks: false + write_checkpoints_asynchronously: false + model_parallel_size: ${common.model_parallel_size} +bmuf: + _name: null + block_lr: 1.0 + block_momentum: 0.875 + global_sync_iter: 50 + warmup_iterations: 500 + use_nbm: false + average_sync: false + distributed_world_size: ${distributed_training.distributed_world_size} +generation: + _name: null + beam: 5 + nbest: 1 + max_len_a: 0.0 + max_len_b: 200 + min_len: 1 + match_source_len: false + unnormalized: false + no_early_stop: false + no_beamable_mm: false + lenpen: 1.0 + unkpen: 0.0 + replace_unk: null + sacrebleu: false + score_reference: false + prefix_size: 0 + no_repeat_ngram_size: 0 + sampling: false + sampling_topk: -1 + sampling_topp: -1.0 + constraints: null + temperature: 1.0 + diverse_beam_groups: -1 + diverse_beam_strength: 0.5 + diversity_rate: -1.0 + print_alignment: null + print_step: false + lm_path: null + lm_weight: 0.0 + iter_decode_eos_penalty: 0.0 + iter_decode_max_iter: 10 + iter_decode_force_max_iter: false + iter_decode_with_beam: 1 + iter_decode_with_external_reranker: false + retain_iter_history: false + retain_dropout: false + retain_dropout_modules: null + decoding_format: null + no_seed_provided: false + eos_token: null +eval_lm: + _name: null + output_word_probs: false + output_word_stats: false + context_window: 0 + softmax_batch: 9223372036854775807 +interactive: + _name: null + buffer_size: 0 + input: '-' +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 +task: + _name: masked_lm + data: /dataset/common/bert-corpus-0729/ + sample_break_mode: complete + tokens_per_sample: 512 +criterion: masked_lm +optimizer: + _name: adan + weight_decay: 0.02 + adan_betas: (0.98,0.92,0.99) + adan_eps: 1.0e-08 
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 10000
+scoring: null
+bpe: null
+tokenizer: null
+ema:
+  _name: null
+  store_ema: false
+  ema_decay: 0.9999
+  ema_start_update: 0
+  ema_seed_model: null
+  ema_update_freq: 1
+  ema_fp32: false
diff --git a/NLP/Transformer-XL/README.md b/NLP/Transformer-XL/README.md
new file mode 100644
index 0000000..27aff32
--- /dev/null
+++ b/NLP/Transformer-XL/README.md
@@ -0,0 +1,92 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+We first provide instructions for modifying the official training files from [Transformer-XL](https://github.com/kimiyoung/transformer-xl) to support Adan. **For data preparation, please follow that repo.**
+
+
+
+## Environment
+
+As recommended by the official [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo, our experiments for this task are based on the following package version.
+
+```python
+torch.__version__ = '1.1.0'
+```
+
+
+
+## Usage of Adan for Transformer-XL
+
+### Two steps to use Adan
+
+**Step 1.** Add the following parameters to the file `train.py`.
+
+```python
+parser.add_argument('--optim', default='adam', type=str, choices=['adam', 'sgd', 'adagrad', 'adan'], help='optimizer to use.')
+parser.add_argument('--wd', type=float, default=0.02, help='weight decay (default: 0.02)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='Optimizer Betas (default: None, use opt default)')
+```
+
+* `optim`: the choice of optimizer. We add Adan to the choices.
+
+* `wd`: decoupled weight decay.
+
+* `opt-betas`: optimizer betas for Adan.
+
+
+
+**Step 2.** Replace the original optimizer creation with the following (the `elif` branch joins the existing optimizer-selection chain in `train.py`):
+
+```python
+from adan import Adan
+
+elif args.optim.lower() == 'adan':
+    if args.sample_softmax > 0:
+        dense_params, sparse_params = [], []
+        for param in model.parameters():
+            if param.size() == model.word_emb.weight.size():
+                sparse_params.append(param)
+            else:
+                dense_params.append(param)
+        optimizer_sparse = Adan(sparse_params, betas=args.opt_betas, lr=args.lr, weight_decay=args.wd)
+        optimizer = Adan(dense_params, lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
+    else:
+        optimizer = Adan(model.parameters(), lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
+
+```
+
+
+
+## Data Preparation
+
+See `bash getdata.sh` in the [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo.
+
+
+
+## Training and Evaluation
+
+- #### Training
+
+  `bash run_wt103_adan.sh train --work_dir PATH_TO_WORK_DIR`
+
+- #### Evaluation
+
+  `bash run_wt103_adan.sh eval --work_dir PATH_TO_WORK_DIR`
+
+
+
+- #### Tips for Experiments
+
+  - For Adan, we set `args.wd = 0.02` for all steps, which is consistent with the other experiments; the sketch after this list shows how these settings enter the optimizer.
+  - For the experiment using `steps = 50k`, we choose a slightly larger `LR`.
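+
+For reference, below is a minimal, stand-alone sketch (it is **not** part of `train.py`) of the Adan call that results from these settings. The `torch.nn.Linear` model is only a placeholder, and `lr`, `betas`, and `weight_decay` follow the 100k-step configuration recorded in [log-100k.txt](./exp_results/log-100k.txt); run it from `NLP/Transformer-XL/` so that `adan.py` is importable.
+
+```python
+import torch
+from adan import Adan
+
+model = torch.nn.Linear(410, 410)                 # placeholder for MemTransformerLM
+optimizer = Adan(model.parameters(),
+                 lr=1e-3,                         # lr of the 100k-step run
+                 betas=(0.9, 0.9, 0.999),         # opt_betas from log-100k.txt
+                 weight_decay=0.02)               # args.wd = 0.02 for all runs
+
+loss = model(torch.randn(8, 410)).pow(2).mean()   # dummy objective, one update
+loss.backward()
+optimizer.step()                                  # this repo's Adan.step() takes no closure
+optimizer.zero_grad()
+```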
+
+## Results and Logs
+
+ With different settings for `lr` and `max_step` in `run_wt103_adan.sh`, we have the following results:
+
+ | | LR | Steps | Test PPL | Download |
+ | ------------------- | :----: | :---: | :------: | :--------------------------------------: |
+ | Baseline (Adam) | 2.5e-4 | 200k | 24.2 | [log&config](./exp_results/log-adam.txt) |
+ | Transformer-XL-base | 1.5e-3 | 50k | 26.2 | [log&config](./exp_results/log-50k.txt) |
+ | Transformer-XL-base | 1e-3 | 100k | 24.2 | [log&config](./exp_results/log-100k.txt) |
+ | Transformer-XL-base | 1e-3 | 200k | 23.5 | [log&config](./exp_results/log-200k.txt) |
+
diff --git a/NLP/Transformer-XL/adan.py b/NLP/Transformer-XL/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/NLP/Transformer-XL/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from timm.utils import *
+
+
+class Adan(Optimizer):
+    """
+    Implements a PyTorch variant of Adan.
+
+    Adan was proposed in
+    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
+    https://arxiv.org/abs/2208.06677
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float, float], optional): coefficients used for computing
+            running averages of gradient and its norm. (default: (0.98, 0.92, 0.99))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability.
(default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/NLP/Transformer-XL/data_utils.py b/NLP/Transformer-XL/data_utils.py new file mode 100644 index 0000000..df762a7 --- /dev/null +++ b/NLP/Transformer-XL/data_utils.py @@ -0,0 +1,273 @@ +import os, sys +import glob + +from collections import Counter, OrderedDict +import numpy as np +import torch + +from utils.vocabulary import Vocab + +class LMOrderedIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None): + """ + data -- LongTensor -- the LongTensor is strictly ordered + """ + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + + # Work out how cleanly we can divide the dataset into bsz parts. + self.n_step = data.size(0) // bsz + + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, self.n_step * bsz) + + # Evenly divide the data across the bsz batches. 
+ self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + return data, target, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data = data.to(self.device) + target = target.to(self.device) + + yield data, target, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return 
sent_stream
+
+    def __iter__(self):
+        if self.shuffle:
+            np.random.shuffle(self.paths)
+
+        for path in self.paths:
+            # sent_stream is an iterator
+            sent_stream = self.get_sent_stream(path)
+            for batch in self.stream_iterator(sent_stream):
+                yield batch
+
+
+class Corpus(object):
+    def __init__(self, path, dataset, *args, **kwargs):
+        self.dataset = dataset
+        self.vocab = Vocab(*args, **kwargs)
+
+        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
+            self.vocab.count_file(os.path.join(path, 'train.txt'))
+            self.vocab.count_file(os.path.join(path, 'valid.txt'))
+            self.vocab.count_file(os.path.join(path, 'test.txt'))
+        elif self.dataset == 'wt103':
+            self.vocab.count_file(os.path.join(path, 'train.txt'))
+        elif self.dataset == 'lm1b':
+            train_path_pattern = os.path.join(
+                path, '1-billion-word-language-modeling-benchmark-r13output',
+                'training-monolingual.tokenized.shuffled', 'news.en-*')
+            train_paths = glob.glob(train_path_pattern)
+            # the vocab will load from file when build_vocab() is called
+
+        self.vocab.build_vocab()
+
+        if self.dataset in ['ptb', 'wt2', 'wt103']:
+            self.train = self.vocab.encode_file(
+                os.path.join(path, 'train.txt'), ordered=True)
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=True)
+            self.test = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=True)
+        elif self.dataset in ['enwik8', 'text8']:
+            self.train = self.vocab.encode_file(
+                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
+            self.test = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
+        elif self.dataset == 'lm1b':
+            self.train = train_paths
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
+            self.test = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
+
+    def get_iterator(self, split, *args, **kwargs):
+        if split == 'train':
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                kwargs['shuffle'] = True
+                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
+        elif split in ['valid', 'test']:
+            data = self.valid if split == 'valid' else self.test
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(data, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                data_iter = LMShuffledIterator(data, *args, **kwargs)
+
+        return data_iter
+
+
+def get_lm_corpus(datadir, dataset):
+    fn = os.path.join(datadir, 'cache.pt')
+    if os.path.exists(fn):
+        print('Loading cached dataset...')
+        corpus = torch.load(fn)
+    else:
+        print('Producing dataset {}...'.format(dataset))
+        kwargs = {}
+        if dataset in ['wt103', 'wt2']:
+            kwargs['special'] = ['<eos>']
+            kwargs['lower_case'] = False
+        elif dataset == 'ptb':
+            kwargs['special'] = ['<unk>']
+            kwargs['lower_case'] = True
+        elif dataset == 'lm1b':
+            kwargs['special'] = []
+            kwargs['lower_case'] = False
+            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
+        elif dataset in ['enwik8', 'text8']:
+            pass
+
+        corpus = Corpus(datadir, dataset, **kwargs)
+        torch.save(corpus, fn)
+
+    return corpus
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description='unit test')
+    parser.add_argument('--datadir', type=str, default='../data/text8',
+                        help='location of the data corpus')
+
parser.add_argument('--dataset', type=str, default='text8', + choices=['ptb', 'wt2', 'wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') + args = parser.parse_args() + + corpus = get_lm_corpus(args.datadir, args.dataset) + print('Vocab size : {}'.format(len(corpus.vocab.idx2sym))) diff --git a/NLP/Transformer-XL/eval.py b/NLP/Transformer-XL/eval.py new file mode 100644 index 0000000..eff3618 --- /dev/null +++ b/NLP/Transformer-XL/eval.py @@ -0,0 +1,122 @@ +# coding: utf-8 +import argparse +import time +import math +import os, sys + +import torch + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import get_logger + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/wikitext-103', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='wt103', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--split', type=str, default='all', + choices=['all', 'valid', 'test'], + help='which split to evaluate') +parser.add_argument('--batch_size', type=int, default=10, + help='batch size') +parser.add_argument('--tgt_len', type=int, default=5, + help='number of tokens to predict') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=0, + help='length of the retained previous heads') +parser.add_argument('--clamp_len', type=int, default=-1, + help='max positional embedding index') +parser.add_argument('--cuda', action='store_true', + help='use CUDA') +parser.add_argument('--work_dir', type=str, required=True, + help='path to the work_dir') +parser.add_argument('--no_log', action='store_true', + help='do not log the eval result') +parser.add_argument('--same_length', action='store_true', + help='set same length attention with masking') +args = parser.parse_args() +assert args.ext_len >= 0, 'extended context length must be non-negative' + +device = torch.device("cuda" if args.cuda else "cpu") + +# Get logger +logging = get_logger(os.path.join(args.work_dir, 'log.txt'), + log_=not args.no_log) + +# Load dataset +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) + +va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) + +# Load the best saved model. +with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) +model.backward_compatible() +model = model.to(device) + +logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( + args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + +model.reset_length(args.tgt_len, args.ext_len, args.mem_len) +if args.clamp_len > 0: + model.clamp_len = args.clamp_len +if args.same_length: + model.same_length = True + +############################################################################### +# Evaluation code +############################################################################### +def evaluate(eval_iter): + # Turn on evaluation mode which disables dropout. + model.eval() + total_len, total_loss = 0, 0. 
+ start_time = time.time() + with torch.no_grad(): + mems = tuple() + for idx, (data, target, seq_len) in enumerate(eval_iter): + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.item() + total_len += seq_len + total_time = time.time() - start_time + logging('Time : {:.2f}s, {:.2f}ms/segment'.format( + total_time, 1000 * total_time / (idx+1))) + return total_loss / total_len + +# Run on test data. +if args.split == 'all': + test_loss = evaluate(te_iter) + valid_loss = evaluate(va_iter) +elif args.split == 'valid': + valid_loss = evaluate(va_iter) + test_loss = None +elif args.split == 'test': + test_loss = evaluate(te_iter) + valid_loss = None + +def format_log(loss, split): + if args.dataset in ['enwik8', 'text8']: + log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( + split, loss, loss / math.log(2)) + else: + log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( + split, loss, math.exp(loss)) + return log_str + +log_str = '' +if valid_loss is not None: + log_str += format_log(valid_loss, 'valid') +if test_loss is not None: + log_str += format_log(test_loss, 'test') + +logging('=' * 100) +logging(log_str) +logging('=' * 100) diff --git a/NLP/Transformer-XL/exp_results/log-100k.txt b/NLP/Transformer-XL/exp_results/log-100k.txt new file mode 100644 index 0000000..00c50df --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-100k.txt @@ -0,0 +1,649 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adan + - lr : 0.001 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 3000 + - decay_rate : 0.5 + - lr_min : 1e-06 + - clip : 0.25 + - clip_nonemb : False + - max_step : 100000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220810-001355 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : [0.9, 0.9, 0.999] + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 6.67e-05 | ms/batch 742.71 | loss 8.90 | ppl 7366.806 +| epoch 1 step 400 | 400 batches | lr 0.000133 | ms/batch 761.92 | loss 6.85 | ppl 942.451 +| epoch 1 step 600 | 600 batches | lr 0.0002 | ms/batch 704.16 | loss 6.34 | ppl 567.781 +| epoch 1 step 800 | 800 batches | lr 0.000267 | ms/batch 669.19 | loss 6.06 | ppl 428.925 +| epoch 1 step 1000 | 1000 batches | lr 0.000333 | ms/batch 697.67 | loss 5.80 | ppl 330.968 
+| epoch 1 step 1200 | 1200 batches | lr 0.0004 | ms/batch 710.36 | loss 5.60 | ppl 270.691 +| epoch 1 step 1400 | 1400 batches | lr 0.000467 | ms/batch 726.18 | loss 5.43 | ppl 228.271 +| epoch 1 step 1600 | 1600 batches | lr 0.000533 | ms/batch 712.97 | loss 5.28 | ppl 196.416 +| epoch 1 step 1800 | 1800 batches | lr 0.0006 | ms/batch 695.31 | loss 5.15 | ppl 173.240 +| epoch 1 step 2000 | 2000 batches | lr 0.000667 | ms/batch 700.07 | loss 5.04 | ppl 154.584 +| epoch 1 step 2200 | 2200 batches | lr 0.000733 | ms/batch 681.35 | loss 4.93 | ppl 138.813 +| epoch 1 step 2400 | 2400 batches | lr 0.0008 | ms/batch 680.03 | loss 4.85 | ppl 128.135 +| epoch 1 step 2600 | 2600 batches | lr 0.000867 | ms/batch 672.90 | loss 4.76 | ppl 116.945 +| epoch 1 step 2800 | 2800 batches | lr 0.000933 | ms/batch 674.70 | loss 4.69 | ppl 108.587 +| epoch 1 step 3000 | 3000 batches | lr 0.001 | ms/batch 681.39 | loss 4.64 | ppl 103.975 +| epoch 1 step 3200 | 3200 batches | lr 0.000999 | ms/batch 693.50 | loss 4.58 | ppl 97.506 +| epoch 1 step 3400 | 3400 batches | lr 0.000999 | ms/batch 674.28 | loss 4.53 | ppl 93.139 +| epoch 1 step 3600 | 3600 batches | lr 0.000999 | ms/batch 693.74 | loss 4.45 | ppl 85.849 +| epoch 1 step 3800 | 3800 batches | lr 0.000998 | ms/batch 674.43 | loss 4.48 | ppl 88.153 +| epoch 1 step 4000 | 4000 batches | lr 0.000998 | ms/batch 672.46 | loss 4.43 | ppl 84.328 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2792.28s | valid loss 4.37 | valid ppl 78.835 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.000998 | ms/batch 736.53 | loss 4.38 | ppl 79.983 +| epoch 1 step 4400 | 4400 batches | lr 0.000997 | ms/batch 707.78 | loss 4.36 | ppl 78.055 +| epoch 1 step 4600 | 4600 batches | lr 0.000997 | ms/batch 716.77 | loss 4.34 | ppl 76.331 +| epoch 1 step 4800 | 4800 batches | lr 0.000996 | ms/batch 690.44 | loss 4.28 | ppl 72.184 +| epoch 1 step 5000 | 5000 batches | lr 0.000996 | ms/batch 673.77 | loss 4.31 | ppl 74.590 +| epoch 1 step 5200 | 5200 batches | lr 0.000995 | ms/batch 678.84 | loss 4.25 | ppl 70.193 +| epoch 1 step 5400 | 5400 batches | lr 0.000995 | ms/batch 677.47 | loss 4.20 | ppl 66.462 +| epoch 1 step 5600 | 5600 batches | lr 0.000994 | ms/batch 671.76 | loss 4.22 | ppl 67.988 +| epoch 1 step 5800 | 5800 batches | lr 0.000994 | ms/batch 690.14 | loss 4.21 | ppl 67.462 +| epoch 1 step 6000 | 6000 batches | lr 0.000993 | ms/batch 704.75 | loss 4.17 | ppl 64.509 +| epoch 1 step 6200 | 6200 batches | lr 0.000992 | ms/batch 714.31 | loss 4.14 | ppl 62.962 +| epoch 1 step 6400 | 6400 batches | lr 0.000992 | ms/batch 691.45 | loss 4.17 | ppl 64.894 +| epoch 1 step 6600 | 6600 batches | lr 0.000991 | ms/batch 713.05 | loss 4.11 | ppl 60.698 +| epoch 1 step 6800 | 6800 batches | lr 0.000991 | ms/batch 685.79 | loss 4.10 | ppl 60.561 +| epoch 1 step 7000 | 7000 batches | lr 0.00099 | ms/batch 700.60 | loss 4.11 | ppl 60.660 +| epoch 1 step 7200 | 7200 batches | lr 0.000989 | ms/batch 675.17 | loss 4.06 | ppl 57.759 +| epoch 1 step 7400 | 7400 batches | lr 0.000988 | ms/batch 702.69 | loss 4.05 | ppl 57.520 +| epoch 1 step 7600 | 7600 batches | lr 0.000988 | ms/batch 691.46 | loss 4.03 | ppl 56.370 +| epoch 1 step 7800 | 7800 batches | lr 0.000987 | ms/batch 677.30 | loss 4.05 | ppl 57.587 +| epoch 1 step 8000 | 8000 batches | lr 0.000986 | ms/batch 692.82 | loss 4.05 | ppl 57.212 
+---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 2775.07s | valid loss 3.93 | valid ppl 50.908 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.000985 | ms/batch 745.71 | loss 4.02 | ppl 55.804 +| epoch 1 step 8400 | 8400 batches | lr 0.000985 | ms/batch 703.07 | loss 4.03 | ppl 56.420 +| epoch 1 step 8600 | 8600 batches | lr 0.000984 | ms/batch 688.98 | loss 4.01 | ppl 55.313 +| epoch 1 step 8800 | 8800 batches | lr 0.000983 | ms/batch 700.17 | loss 4.02 | ppl 55.826 +| epoch 1 step 9000 | 9000 batches | lr 0.000982 | ms/batch 673.45 | loss 3.99 | ppl 54.215 +| epoch 1 step 9200 | 9200 batches | lr 0.000981 | ms/batch 691.53 | loss 3.98 | ppl 53.544 +| epoch 1 step 9400 | 9400 batches | lr 0.00098 | ms/batch 681.53 | loss 3.99 | ppl 53.802 +| epoch 1 step 9600 | 9600 batches | lr 0.000979 | ms/batch 705.40 | loss 4.00 | ppl 54.643 +| epoch 1 step 9800 | 9800 batches | lr 0.000978 | ms/batch 716.62 | loss 3.96 | ppl 52.276 +| epoch 1 step 10000 | 10000 batches | lr 0.000977 | ms/batch 679.81 | loss 3.97 | ppl 53.073 +| epoch 1 step 10200 | 10200 batches | lr 0.000976 | ms/batch 680.69 | loss 3.94 | ppl 51.218 +| epoch 1 step 10400 | 10400 batches | lr 0.000975 | ms/batch 677.39 | loss 3.93 | ppl 51.130 +| epoch 1 step 10600 | 10600 batches | lr 0.000974 | ms/batch 682.82 | loss 3.96 | ppl 52.328 +| epoch 1 step 10800 | 10800 batches | lr 0.000973 | ms/batch 675.32 | loss 3.92 | ppl 50.152 +| epoch 1 step 11000 | 11000 batches | lr 0.000972 | ms/batch 687.74 | loss 3.95 | ppl 52.112 +| epoch 1 step 11200 | 11200 batches | lr 0.000971 | ms/batch 687.73 | loss 3.93 | ppl 50.965 +| epoch 1 step 11400 | 11400 batches | lr 0.00097 | ms/batch 692.52 | loss 3.93 | ppl 50.818 +| epoch 2 step 11600 | 130 batches | lr 0.000969 | ms/batch 719.64 | loss 3.90 | ppl 49.417 +| epoch 2 step 11800 | 330 batches | lr 0.000968 | ms/batch 690.59 | loss 3.88 | ppl 48.186 +| epoch 2 step 12000 | 530 batches | lr 0.000967 | ms/batch 700.90 | loss 3.90 | ppl 49.205 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 2772.08s | valid loss 3.78 | valid ppl 43.627 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 12200 | 730 batches | lr 0.000966 | ms/batch 772.15 | loss 3.87 | ppl 47.839 +| epoch 2 step 12400 | 930 batches | lr 0.000964 | ms/batch 681.74 | loss 3.87 | ppl 47.878 +| epoch 2 step 12600 | 1130 batches | lr 0.000963 | ms/batch 692.52 | loss 3.90 | ppl 49.212 +| epoch 2 step 12800 | 1330 batches | lr 0.000962 | ms/batch 672.00 | loss 3.86 | ppl 47.513 +| epoch 2 step 13000 | 1530 batches | lr 0.000961 | ms/batch 699.31 | loss 3.85 | ppl 47.004 +| epoch 2 step 13200 | 1730 batches | lr 0.000959 | ms/batch 703.25 | loss 3.84 | ppl 46.727 +| epoch 2 step 13400 | 1930 batches | lr 0.000958 | ms/batch 694.76 | loss 3.85 | ppl 46.999 +| epoch 2 step 13600 | 2130 batches | lr 0.000957 | ms/batch 702.36 | loss 3.87 | ppl 47.877 +| epoch 2 step 13800 | 2330 batches | lr 0.000956 | ms/batch 714.52 | loss 3.84 | ppl 46.684 +| epoch 2 step 14000 | 2530 batches | lr 0.000954 | ms/batch 704.35 | loss 3.83 | ppl 45.921 +| epoch 2 step 14200 | 2730 batches | lr 0.000953 | ms/batch 701.29 | loss 3.80 | ppl 44.917 +| epoch 2 step 14400 | 2930 batches | lr 0.000951 | ms/batch 688.11 | loss 3.79 | 
ppl 44.149 +| epoch 2 step 14600 | 3130 batches | lr 0.00095 | ms/batch 704.84 | loss 3.80 | ppl 44.497 +| epoch 2 step 14800 | 3330 batches | lr 0.000949 | ms/batch 716.44 | loss 3.80 | ppl 44.659 +| epoch 2 step 15000 | 3530 batches | lr 0.000947 | ms/batch 695.23 | loss 3.76 | ppl 42.957 +| epoch 2 step 15200 | 3730 batches | lr 0.000946 | ms/batch 675.92 | loss 3.79 | ppl 44.272 +| epoch 2 step 15400 | 3930 batches | lr 0.000944 | ms/batch 680.85 | loss 3.78 | ppl 43.873 +| epoch 2 step 15600 | 4130 batches | lr 0.000943 | ms/batch 676.88 | loss 3.77 | ppl 43.466 +| epoch 2 step 15800 | 4330 batches | lr 0.000941 | ms/batch 690.26 | loss 3.78 | ppl 43.828 +| epoch 2 step 16000 | 4530 batches | lr 0.00094 | ms/batch 681.76 | loss 3.78 | ppl 43.855 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 2785.52s | valid loss 3.68 | valid ppl 39.575 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.000938 | ms/batch 761.98 | loss 3.74 | ppl 41.963 +| epoch 2 step 16400 | 4930 batches | lr 0.000937 | ms/batch 719.77 | loss 3.76 | ppl 42.816 +| epoch 2 step 16600 | 5130 batches | lr 0.000935 | ms/batch 682.43 | loss 3.75 | ppl 42.488 +| epoch 2 step 16800 | 5330 batches | lr 0.000934 | ms/batch 678.56 | loss 3.74 | ppl 42.072 +| epoch 2 step 17000 | 5530 batches | lr 0.000932 | ms/batch 702.18 | loss 3.73 | ppl 41.580 +| epoch 2 step 17200 | 5730 batches | lr 0.000931 | ms/batch 693.54 | loss 3.75 | ppl 42.350 +| epoch 2 step 17400 | 5930 batches | lr 0.000929 | ms/batch 682.69 | loss 3.73 | ppl 41.637 +| epoch 2 step 17600 | 6130 batches | lr 0.000927 | ms/batch 702.62 | loss 3.72 | ppl 41.292 +| epoch 2 step 17800 | 6330 batches | lr 0.000926 | ms/batch 676.86 | loss 3.75 | ppl 42.496 +| epoch 2 step 18000 | 6530 batches | lr 0.000924 | ms/batch 686.50 | loss 3.69 | ppl 40.096 +| epoch 2 step 18200 | 6730 batches | lr 0.000922 | ms/batch 678.10 | loss 3.70 | ppl 40.308 +| epoch 2 step 18400 | 6930 batches | lr 0.00092 | ms/batch 703.33 | loss 3.71 | ppl 40.840 +| epoch 2 step 18600 | 7130 batches | lr 0.000919 | ms/batch 690.96 | loss 3.69 | ppl 39.977 +| epoch 2 step 18800 | 7330 batches | lr 0.000917 | ms/batch 746.79 | loss 3.67 | ppl 39.106 +| epoch 2 step 19000 | 7530 batches | lr 0.000915 | ms/batch 676.15 | loss 3.69 | ppl 40.078 +| epoch 2 step 19200 | 7730 batches | lr 0.000913 | ms/batch 707.35 | loss 3.69 | ppl 40.034 +| epoch 2 step 19400 | 7930 batches | lr 0.000912 | ms/batch 674.04 | loss 3.68 | ppl 39.801 +| epoch 2 step 19600 | 8130 batches | lr 0.00091 | ms/batch 709.95 | loss 3.70 | ppl 40.300 +| epoch 2 step 19800 | 8330 batches | lr 0.000908 | ms/batch 685.00 | loss 3.69 | ppl 39.868 +| epoch 2 step 20000 | 8530 batches | lr 0.000906 | ms/batch 706.46 | loss 3.67 | ppl 39.391 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 | time: 2788.84s | valid loss 3.60 | valid ppl 36.475 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000904 | ms/batch 752.81 | loss 3.69 | ppl 40.136 +| epoch 2 step 20400 | 8930 batches | lr 0.000902 | ms/batch 688.44 | loss 3.69 | ppl 39.976 +| epoch 2 step 20600 | 9130 batches | lr 0.000901 | ms/batch 690.82 | loss 3.68 | ppl 39.641 +| epoch 2 step 20800 | 9330 batches | lr 0.000899 | ms/batch 698.88 
| loss 3.67 | ppl 39.207 +| epoch 2 step 21000 | 9530 batches | lr 0.000897 | ms/batch 700.37 | loss 3.71 | ppl 40.939 +| epoch 2 step 21200 | 9730 batches | lr 0.000895 | ms/batch 675.10 | loss 3.66 | ppl 38.940 +| epoch 2 step 21400 | 9930 batches | lr 0.000893 | ms/batch 694.48 | loss 3.67 | ppl 39.373 +| epoch 2 step 21600 | 10130 batches | lr 0.000891 | ms/batch 684.69 | loss 3.66 | ppl 38.760 +| epoch 2 step 21800 | 10330 batches | lr 0.000889 | ms/batch 729.00 | loss 3.67 | ppl 39.128 +| epoch 2 step 22000 | 10530 batches | lr 0.000887 | ms/batch 710.08 | loss 3.68 | ppl 39.746 +| epoch 2 step 22200 | 10730 batches | lr 0.000885 | ms/batch 693.05 | loss 3.65 | ppl 38.365 +| epoch 2 step 22400 | 10930 batches | lr 0.000883 | ms/batch 698.33 | loss 3.65 | ppl 38.293 +| epoch 2 step 22600 | 11130 batches | lr 0.000881 | ms/batch 713.05 | loss 3.69 | ppl 40.048 +| epoch 2 step 22800 | 11330 batches | lr 0.000879 | ms/batch 673.93 | loss 3.66 | ppl 38.769 +| epoch 3 step 23000 | 60 batches | lr 0.000877 | ms/batch 695.65 | loss 3.66 | ppl 38.901 +| epoch 3 step 23200 | 260 batches | lr 0.000875 | ms/batch 671.63 | loss 3.62 | ppl 37.173 +| epoch 3 step 23400 | 460 batches | lr 0.000873 | ms/batch 692.68 | loss 3.66 | ppl 38.720 +| epoch 3 step 23600 | 660 batches | lr 0.00087 | ms/batch 696.22 | loss 3.62 | ppl 37.317 +| epoch 3 step 23800 | 860 batches | lr 0.000868 | ms/batch 691.28 | loss 3.65 | ppl 38.609 +| epoch 3 step 24000 | 1060 batches | lr 0.000866 | ms/batch 699.25 | loss 3.64 | ppl 38.097 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 2785.75s | valid loss 3.55 | valid ppl 34.856 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000864 | ms/batch 771.85 | loss 3.63 | ppl 37.667 +| epoch 3 step 24400 | 1460 batches | lr 0.000862 | ms/batch 678.13 | loss 3.63 | ppl 37.615 +| epoch 3 step 24600 | 1660 batches | lr 0.00086 | ms/batch 676.14 | loss 3.62 | ppl 37.282 +| epoch 3 step 24800 | 1860 batches | lr 0.000857 | ms/batch 728.81 | loss 3.62 | ppl 37.511 +| epoch 3 step 25000 | 2060 batches | lr 0.000855 | ms/batch 694.21 | loss 3.66 | ppl 39.016 +| epoch 3 step 25200 | 2260 batches | lr 0.000853 | ms/batch 724.01 | loss 3.64 | ppl 37.938 +| epoch 3 step 25400 | 2460 batches | lr 0.000851 | ms/batch 678.12 | loss 3.62 | ppl 37.370 +| epoch 3 step 25600 | 2660 batches | lr 0.000848 | ms/batch 696.01 | loss 3.62 | ppl 37.468 +| epoch 3 step 25800 | 2860 batches | lr 0.000846 | ms/batch 694.04 | loss 3.56 | ppl 35.299 +| epoch 3 step 26000 | 3060 batches | lr 0.000844 | ms/batch 711.11 | loss 3.61 | ppl 37.126 +| epoch 3 step 26200 | 3260 batches | lr 0.000842 | ms/batch 723.43 | loss 3.61 | ppl 36.969 +| epoch 3 step 26400 | 3460 batches | lr 0.000839 | ms/batch 720.20 | loss 3.57 | ppl 35.667 +| epoch 3 step 26600 | 3660 batches | lr 0.000837 | ms/batch 684.79 | loss 3.59 | ppl 36.147 +| epoch 3 step 26800 | 3860 batches | lr 0.000835 | ms/batch 701.18 | loss 3.59 | ppl 36.331 +| epoch 3 step 27000 | 4060 batches | lr 0.000832 | ms/batch 706.21 | loss 3.60 | ppl 36.676 +| epoch 3 step 27200 | 4260 batches | lr 0.00083 | ms/batch 714.36 | loss 3.59 | ppl 36.233 +| epoch 3 step 27400 | 4460 batches | lr 0.000827 | ms/batch 692.59 | loss 3.59 | ppl 36.376 +| epoch 3 step 27600 | 4660 batches | lr 0.000825 | ms/batch 711.44 | loss 3.58 | ppl 35.999 +| epoch 3 step 27800 | 4860 batches | lr 0.000823 | 
ms/batch 728.11 | loss 3.57 | ppl 35.621 +| epoch 3 step 28000 | 5060 batches | lr 0.00082 | ms/batch 692.62 | loss 3.59 | ppl 36.065 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 2821.18s | valid loss 3.51 | valid ppl 33.444 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000818 | ms/batch 784.83 | loss 3.57 | ppl 35.469 +| epoch 3 step 28400 | 5460 batches | lr 0.000815 | ms/batch 676.58 | loss 3.55 | ppl 34.677 +| epoch 3 step 28600 | 5660 batches | lr 0.000813 | ms/batch 693.09 | loss 3.60 | ppl 36.443 +| epoch 3 step 28800 | 5860 batches | lr 0.00081 | ms/batch 692.23 | loss 3.57 | ppl 35.440 +| epoch 3 step 29000 | 6060 batches | lr 0.000808 | ms/batch 694.47 | loss 3.56 | ppl 35.226 +| epoch 3 step 29200 | 6260 batches | lr 0.000805 | ms/batch 679.24 | loss 3.56 | ppl 35.224 +| epoch 3 step 29400 | 6460 batches | lr 0.000803 | ms/batch 705.43 | loss 3.57 | ppl 35.528 +| epoch 3 step 29600 | 6660 batches | lr 0.0008 | ms/batch 716.64 | loss 3.52 | ppl 33.679 +| epoch 3 step 29800 | 6860 batches | lr 0.000798 | ms/batch 711.33 | loss 3.55 | ppl 34.776 +| epoch 3 step 30000 | 7060 batches | lr 0.000795 | ms/batch 730.14 | loss 3.54 | ppl 34.480 +| epoch 3 step 30200 | 7260 batches | lr 0.000793 | ms/batch 709.85 | loss 3.51 | ppl 33.497 +| epoch 3 step 30400 | 7460 batches | lr 0.00079 | ms/batch 685.34 | loss 3.54 | ppl 34.308 +| epoch 3 step 30600 | 7660 batches | lr 0.000788 | ms/batch 706.36 | loss 3.52 | ppl 33.834 +| epoch 3 step 30800 | 7860 batches | lr 0.000785 | ms/batch 699.03 | loss 3.53 | ppl 34.222 +| epoch 3 step 31000 | 8060 batches | lr 0.000783 | ms/batch 720.24 | loss 3.54 | ppl 34.453 +| epoch 3 step 31200 | 8260 batches | lr 0.00078 | ms/batch 673.26 | loss 3.53 | ppl 34.066 +| epoch 3 step 31400 | 8460 batches | lr 0.000777 | ms/batch 694.72 | loss 3.54 | ppl 34.454 +| epoch 3 step 31600 | 8660 batches | lr 0.000775 | ms/batch 708.28 | loss 3.53 | ppl 34.274 +| epoch 3 step 31800 | 8860 batches | lr 0.000772 | ms/batch 682.86 | loss 3.54 | ppl 34.392 +| epoch 3 step 32000 | 9060 batches | lr 0.000769 | ms/batch 688.85 | loss 3.54 | ppl 34.370 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 2806.41s | valid loss 3.46 | valid ppl 31.891 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000767 | ms/batch 786.16 | loss 3.52 | ppl 33.871 +| epoch 3 step 32400 | 9460 batches | lr 0.000764 | ms/batch 725.79 | loss 3.54 | ppl 34.633 +| epoch 3 step 32600 | 9660 batches | lr 0.000761 | ms/batch 700.74 | loss 3.54 | ppl 34.622 +| epoch 3 step 32800 | 9860 batches | lr 0.000759 | ms/batch 688.71 | loss 3.50 | ppl 33.131 +| epoch 3 step 33000 | 10060 batches | lr 0.000756 | ms/batch 714.76 | loss 3.55 | ppl 34.776 +| epoch 3 step 33200 | 10260 batches | lr 0.000753 | ms/batch 707.51 | loss 3.50 | ppl 32.988 +| epoch 3 step 33400 | 10460 batches | lr 0.000751 | ms/batch 683.71 | loss 3.53 | ppl 34.236 +| epoch 3 step 33600 | 10660 batches | lr 0.000748 | ms/batch 719.18 | loss 3.54 | ppl 34.467 +| epoch 3 step 33800 | 10860 batches | lr 0.000745 | ms/batch 745.78 | loss 3.49 | ppl 32.814 +| epoch 3 step 34000 | 11060 batches | lr 0.000742 | ms/batch 710.58 | loss 3.53 | ppl 34.283 +| epoch 3 step 34200 | 11260 
batches | lr 0.00074 | ms/batch 694.54 | loss 3.54 | ppl 34.583 +| epoch 3 step 34400 | 11460 batches | lr 0.000737 | ms/batch 688.33 | loss 3.51 | ppl 33.583 +| epoch 4 step 34600 | 190 batches | lr 0.000734 | ms/batch 682.61 | loss 3.49 | ppl 32.864 +| epoch 4 step 34800 | 390 batches | lr 0.000731 | ms/batch 713.82 | loss 3.50 | ppl 33.187 +| epoch 4 step 35000 | 590 batches | lr 0.000728 | ms/batch 709.46 | loss 3.49 | ppl 32.943 +| epoch 4 step 35200 | 790 batches | lr 0.000726 | ms/batch 684.47 | loss 3.51 | ppl 33.445 +| epoch 4 step 35400 | 990 batches | lr 0.000723 | ms/batch 721.54 | loss 3.49 | ppl 32.743 +| epoch 4 step 35600 | 1190 batches | lr 0.00072 | ms/batch 705.58 | loss 3.51 | ppl 33.363 +| epoch 4 step 35800 | 1390 batches | lr 0.000717 | ms/batch 715.79 | loss 3.50 | ppl 32.989 +| epoch 4 step 36000 | 1590 batches | lr 0.000714 | ms/batch 707.76 | loss 3.48 | ppl 32.568 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 2837.19s | valid loss 3.44 | valid ppl 31.101 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.000711 | ms/batch 744.09 | loss 3.49 | ppl 32.869 +| epoch 4 step 36400 | 1990 batches | lr 0.000709 | ms/batch 685.71 | loss 3.52 | ppl 33.861 +| epoch 4 step 36600 | 2190 batches | lr 0.000706 | ms/batch 702.84 | loss 3.51 | ppl 33.326 +| epoch 4 step 36800 | 2390 batches | lr 0.000703 | ms/batch 705.87 | loss 3.51 | ppl 33.286 +| epoch 4 step 37000 | 2590 batches | lr 0.0007 | ms/batch 693.72 | loss 3.48 | ppl 32.465 +| epoch 4 step 37200 | 2790 batches | lr 0.000697 | ms/batch 699.40 | loss 3.46 | ppl 31.888 +| epoch 4 step 37400 | 2990 batches | lr 0.000694 | ms/batch 697.96 | loss 3.48 | ppl 32.390 +| epoch 4 step 37600 | 3190 batches | lr 0.000691 | ms/batch 679.96 | loss 3.48 | ppl 32.335 +| epoch 4 step 37800 | 3390 batches | lr 0.000688 | ms/batch 692.96 | loss 3.48 | ppl 32.327 +| epoch 4 step 38000 | 3590 batches | lr 0.000685 | ms/batch 719.86 | loss 3.45 | ppl 31.410 +| epoch 4 step 38200 | 3790 batches | lr 0.000682 | ms/batch 708.23 | loss 3.47 | ppl 32.106 +| epoch 4 step 38400 | 3990 batches | lr 0.000679 | ms/batch 713.26 | loss 3.48 | ppl 32.539 +| epoch 4 step 38600 | 4190 batches | lr 0.000677 | ms/batch 720.48 | loss 3.46 | ppl 31.968 +| epoch 4 step 38800 | 4390 batches | lr 0.000674 | ms/batch 706.09 | loss 3.47 | ppl 32.081 +| epoch 4 step 39000 | 4590 batches | lr 0.000671 | ms/batch 706.32 | loss 3.48 | ppl 32.534 +| epoch 4 step 39200 | 4790 batches | lr 0.000668 | ms/batch 724.90 | loss 3.44 | ppl 31.078 +| epoch 4 step 39400 | 4990 batches | lr 0.000665 | ms/batch 684.94 | loss 3.49 | ppl 32.633 +| epoch 4 step 39600 | 5190 batches | lr 0.000662 | ms/batch 687.24 | loss 3.44 | ppl 31.273 +| epoch 4 step 39800 | 5390 batches | lr 0.000659 | ms/batch 721.71 | loss 3.42 | ppl 30.694 +| epoch 4 step 40000 | 5590 batches | lr 0.000656 | ms/batch 697.69 | loss 3.45 | ppl 31.450 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 2814.33s | valid loss 3.41 | valid ppl 30.132 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000653 | ms/batch 754.92 | loss 3.47 | ppl 32.025 +| epoch 4 step 40400 | 5990 batches | lr 0.00065 | ms/batch 694.46 | loss 3.44 | ppl 31.158 +| epoch 4 step 
40600 | 6190 batches | lr 0.000647 | ms/batch 676.98 | loss 3.44 | ppl 31.171 +| epoch 4 step 40800 | 6390 batches | lr 0.000644 | ms/batch 689.04 | loss 3.47 | ppl 32.015 +| epoch 4 step 41000 | 6590 batches | lr 0.000641 | ms/batch 685.40 | loss 3.40 | ppl 30.022 +| epoch 4 step 41200 | 6790 batches | lr 0.000638 | ms/batch 747.15 | loss 3.43 | ppl 30.725 +| epoch 4 step 41400 | 6990 batches | lr 0.000635 | ms/batch 705.11 | loss 3.44 | ppl 31.182 +| epoch 4 step 41600 | 7190 batches | lr 0.000632 | ms/batch 696.98 | loss 3.39 | ppl 29.650 +| epoch 4 step 41800 | 7390 batches | lr 0.000629 | ms/batch 702.79 | loss 3.42 | ppl 30.476 +| epoch 4 step 42000 | 7590 batches | lr 0.000626 | ms/batch 695.10 | loss 3.39 | ppl 29.763 +| epoch 4 step 42200 | 7790 batches | lr 0.000622 | ms/batch 715.71 | loss 3.42 | ppl 30.681 +| epoch 4 step 42400 | 7990 batches | lr 0.000619 | ms/batch 741.98 | loss 3.42 | ppl 30.604 +| epoch 4 step 42600 | 8190 batches | lr 0.000616 | ms/batch 705.83 | loss 3.41 | ppl 30.193 +| epoch 4 step 42800 | 8390 batches | lr 0.000613 | ms/batch 712.28 | loss 3.44 | ppl 31.079 +| epoch 4 step 43000 | 8590 batches | lr 0.00061 | ms/batch 724.30 | loss 3.41 | ppl 30.299 +| epoch 4 step 43200 | 8790 batches | lr 0.000607 | ms/batch 719.79 | loss 3.43 | ppl 30.914 +| epoch 4 step 43400 | 8990 batches | lr 0.000604 | ms/batch 699.25 | loss 3.42 | ppl 30.455 +| epoch 4 step 43600 | 9190 batches | lr 0.000601 | ms/batch 685.74 | loss 3.41 | ppl 30.187 +| epoch 4 step 43800 | 9390 batches | lr 0.000598 | ms/batch 719.13 | loss 3.42 | ppl 30.441 +| epoch 4 step 44000 | 9590 batches | lr 0.000595 | ms/batch 753.12 | loss 3.44 | ppl 31.043 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 2840.79s | valid loss 3.37 | valid ppl 29.010 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 0.000592 | ms/batch 773.20 | loss 3.41 | ppl 30.168 +| epoch 4 step 44400 | 9990 batches | lr 0.000589 | ms/batch 694.87 | loss 3.41 | ppl 30.196 +| epoch 4 step 44600 | 10190 batches | lr 0.000586 | ms/batch 724.33 | loss 3.40 | ppl 29.936 +| epoch 4 step 44800 | 10390 batches | lr 0.000582 | ms/batch 701.37 | loss 3.40 | ppl 30.038 +| epoch 4 step 45000 | 10590 batches | lr 0.000579 | ms/batch 724.47 | loss 3.43 | ppl 30.942 +| epoch 4 step 45200 | 10790 batches | lr 0.000576 | ms/batch 700.16 | loss 3.38 | ppl 29.477 +| epoch 4 step 45400 | 10990 batches | lr 0.000573 | ms/batch 699.42 | loss 3.42 | ppl 30.491 +| epoch 4 step 45600 | 11190 batches | lr 0.00057 | ms/batch 697.52 | loss 3.42 | ppl 30.633 +| epoch 4 step 45800 | 11390 batches | lr 0.000567 | ms/batch 716.39 | loss 3.41 | ppl 30.406 +| epoch 5 step 46000 | 120 batches | lr 0.000564 | ms/batch 697.18 | loss 3.39 | ppl 29.776 +| epoch 5 step 46200 | 320 batches | lr 0.000561 | ms/batch 688.95 | loss 3.38 | ppl 29.331 +| epoch 5 step 46400 | 520 batches | lr 0.000557 | ms/batch 702.04 | loss 3.41 | ppl 30.334 +| epoch 5 step 46600 | 720 batches | lr 0.000554 | ms/batch 714.74 | loss 3.37 | ppl 29.146 +| epoch 5 step 46800 | 920 batches | lr 0.000551 | ms/batch 694.28 | loss 3.38 | ppl 29.263 +| epoch 5 step 47000 | 1120 batches | lr 0.000548 | ms/batch 691.20 | loss 3.41 | ppl 30.380 +| epoch 5 step 47200 | 1320 batches | lr 0.000545 | ms/batch 709.55 | loss 3.38 | ppl 29.299 +| epoch 5 step 47400 | 1520 batches | lr 0.000542 | ms/batch 715.69 | loss 3.38 | ppl 
29.302 +| epoch 5 step 47600 | 1720 batches | lr 0.000539 | ms/batch 703.59 | loss 3.37 | ppl 29.087 +| epoch 5 step 47800 | 1920 batches | lr 0.000536 | ms/batch 684.68 | loss 3.40 | ppl 29.883 +| epoch 5 step 48000 | 2120 batches | lr 0.000532 | ms/batch 705.81 | loss 3.41 | ppl 30.359 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 2823.57s | valid loss 3.34 | valid ppl 28.152 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 48200 | 2320 batches | lr 0.000529 | ms/batch 771.37 | loss 3.39 | ppl 29.735 +| epoch 5 step 48400 | 2520 batches | lr 0.000526 | ms/batch 724.35 | loss 3.38 | ppl 29.266 +| epoch 5 step 48600 | 2720 batches | lr 0.000523 | ms/batch 709.33 | loss 3.36 | ppl 28.891 +| epoch 5 step 48800 | 2920 batches | lr 0.00052 | ms/batch 716.29 | loss 3.35 | ppl 28.605 +| epoch 5 step 49000 | 3120 batches | lr 0.000517 | ms/batch 701.20 | loss 3.37 | ppl 29.121 +| epoch 5 step 49200 | 3320 batches | lr 0.000514 | ms/batch 717.37 | loss 3.38 | ppl 29.440 +| epoch 5 step 49400 | 3520 batches | lr 0.00051 | ms/batch 687.15 | loss 3.34 | ppl 28.306 +| epoch 5 step 49600 | 3720 batches | lr 0.000507 | ms/batch 706.52 | loss 3.37 | ppl 29.021 +| epoch 5 step 49800 | 3920 batches | lr 0.000504 | ms/batch 722.49 | loss 3.36 | ppl 28.862 +| epoch 5 step 50000 | 4120 batches | lr 0.000501 | ms/batch 714.17 | loss 3.36 | ppl 28.886 +| epoch 5 step 50200 | 4320 batches | lr 0.000498 | ms/batch 685.39 | loss 3.37 | ppl 28.957 +| epoch 5 step 50400 | 4520 batches | lr 0.000495 | ms/batch 715.33 | loss 3.38 | ppl 29.372 +| epoch 5 step 50600 | 4720 batches | lr 0.000492 | ms/batch 718.29 | loss 3.34 | ppl 28.187 +| epoch 5 step 50800 | 4920 batches | lr 0.000488 | ms/batch 717.46 | loss 3.35 | ppl 28.583 +| epoch 5 step 51000 | 5120 batches | lr 0.000485 | ms/batch 722.98 | loss 3.35 | ppl 28.452 +| epoch 5 step 51200 | 5320 batches | lr 0.000482 | ms/batch 730.83 | loss 3.34 | ppl 28.284 +| epoch 5 step 51400 | 5520 batches | lr 0.000479 | ms/batch 705.06 | loss 3.34 | ppl 28.130 +| epoch 5 step 51600 | 5720 batches | lr 0.000476 | ms/batch 736.14 | loss 3.35 | ppl 28.474 +| epoch 5 step 51800 | 5920 batches | lr 0.000473 | ms/batch 709.48 | loss 3.35 | ppl 28.381 +| epoch 5 step 52000 | 6120 batches | lr 0.000469 | ms/batch 719.02 | loss 3.34 | ppl 28.123 +---------------------------------------------------------------------------------------------------- +| Eval 13 at step 52000 | time: 2861.73s | valid loss 3.32 | valid ppl 27.651 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 52200 | 6320 batches | lr 0.000466 | ms/batch 795.83 | loss 3.36 | ppl 28.824 +| epoch 5 step 52400 | 6520 batches | lr 0.000463 | ms/batch 697.32 | loss 3.30 | ppl 27.207 +| epoch 5 step 52600 | 6720 batches | lr 0.00046 | ms/batch 724.64 | loss 3.31 | ppl 27.379 +| epoch 5 step 52800 | 6920 batches | lr 0.000457 | ms/batch 734.21 | loss 3.33 | ppl 27.948 +| epoch 5 step 53000 | 7120 batches | lr 0.000454 | ms/batch 707.81 | loss 3.31 | ppl 27.522 +| epoch 5 step 53200 | 7320 batches | lr 0.000451 | ms/batch 704.60 | loss 3.28 | ppl 26.696 +| epoch 5 step 53400 | 7520 batches | lr 0.000448 | ms/batch 729.67 | loss 3.32 | ppl 27.541 +| epoch 5 step 53600 | 7720 batches | lr 0.000444 | ms/batch 709.88 | loss 3.31 | ppl 27.326 +| epoch 5 step 53800 | 7920 batches | lr 0.000441 | ms/batch 722.95 | 
loss 3.31 | ppl 27.348 +| epoch 5 step 54000 | 8120 batches | lr 0.000438 | ms/batch 728.94 | loss 3.32 | ppl 27.682 +| epoch 5 step 54200 | 8320 batches | lr 0.000435 | ms/batch 706.14 | loss 3.31 | ppl 27.518 +| epoch 5 step 54400 | 8520 batches | lr 0.000432 | ms/batch 723.15 | loss 3.30 | ppl 27.196 +| epoch 5 step 54600 | 8720 batches | lr 0.000429 | ms/batch 759.15 | loss 3.32 | ppl 27.670 +| epoch 5 step 54800 | 8920 batches | lr 0.000426 | ms/batch 692.95 | loss 3.32 | ppl 27.792 +| epoch 5 step 55000 | 9120 batches | lr 0.000423 | ms/batch 736.12 | loss 3.31 | ppl 27.454 +| epoch 5 step 55200 | 9320 batches | lr 0.000419 | ms/batch 709.42 | loss 3.30 | ppl 27.208 +| epoch 5 step 55400 | 9520 batches | lr 0.000416 | ms/batch 707.95 | loss 3.33 | ppl 28.072 +| epoch 5 step 55600 | 9720 batches | lr 0.000413 | ms/batch 691.25 | loss 3.30 | ppl 27.225 +| epoch 5 step 55800 | 9920 batches | lr 0.00041 | ms/batch 685.81 | loss 3.31 | ppl 27.293 +| epoch 5 step 56000 | 10120 batches | lr 0.000407 | ms/batch 709.93 | loss 3.30 | ppl 27.183 +---------------------------------------------------------------------------------------------------- +| Eval 14 at step 56000 | time: 2871.27s | valid loss 3.29 | valid ppl 26.758 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 56200 | 10320 batches | lr 0.000404 | ms/batch 784.81 | loss 3.31 | ppl 27.262 +| epoch 5 step 56400 | 10520 batches | lr 0.000401 | ms/batch 708.23 | loss 3.33 | ppl 27.876 +| epoch 5 step 56600 | 10720 batches | lr 0.000398 | ms/batch 718.78 | loss 3.29 | ppl 26.834 +| epoch 5 step 56800 | 10920 batches | lr 0.000395 | ms/batch 723.00 | loss 3.29 | ppl 26.727 +| epoch 5 step 57000 | 11120 batches | lr 0.000392 | ms/batch 730.49 | loss 3.34 | ppl 28.295 +| epoch 5 step 57200 | 11320 batches | lr 0.000389 | ms/batch 728.66 | loss 3.30 | ppl 27.060 +| epoch 6 step 57400 | 50 batches | lr 0.000386 | ms/batch 693.11 | loss 3.32 | ppl 27.563 +| epoch 6 step 57600 | 250 batches | lr 0.000382 | ms/batch 714.89 | loss 3.27 | ppl 26.241 +| epoch 6 step 57800 | 450 batches | lr 0.000379 | ms/batch 727.56 | loss 3.31 | ppl 27.269 +| epoch 6 step 58000 | 650 batches | lr 0.000376 | ms/batch 714.18 | loss 3.27 | ppl 26.327 +| epoch 6 step 58200 | 850 batches | lr 0.000373 | ms/batch 737.04 | loss 3.31 | ppl 27.365 +| epoch 6 step 58400 | 1050 batches | lr 0.00037 | ms/batch 722.31 | loss 3.28 | ppl 26.671 +| epoch 6 step 58600 | 1250 batches | lr 0.000367 | ms/batch 718.13 | loss 3.28 | ppl 26.642 +| epoch 6 step 58800 | 1450 batches | lr 0.000364 | ms/batch 758.91 | loss 3.29 | ppl 26.793 +| epoch 6 step 59000 | 1650 batches | lr 0.000361 | ms/batch 744.06 | loss 3.27 | ppl 26.246 +| epoch 6 step 59200 | 1850 batches | lr 0.000358 | ms/batch 737.10 | loss 3.28 | ppl 26.644 +| epoch 6 step 59400 | 2050 batches | lr 0.000355 | ms/batch 722.53 | loss 3.32 | ppl 27.782 +| epoch 6 step 59600 | 2250 batches | lr 0.000352 | ms/batch 738.70 | loss 3.29 | ppl 26.834 +| epoch 6 step 59800 | 2450 batches | lr 0.000349 | ms/batch 740.37 | loss 3.29 | ppl 26.765 +| epoch 6 step 60000 | 2650 batches | lr 0.000346 | ms/batch 722.84 | loss 3.29 | ppl 26.752 +---------------------------------------------------------------------------------------------------- +| Eval 15 at step 60000 | time: 2912.80s | valid loss 3.27 | valid ppl 26.281 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 60200 | 2850 batches | lr 0.000343 | 
ms/batch 774.99 | loss 3.23 | ppl 25.400 +| epoch 6 step 60400 | 3050 batches | lr 0.00034 | ms/batch 736.04 | loss 3.28 | ppl 26.615 +| epoch 6 step 60600 | 3250 batches | lr 0.000337 | ms/batch 723.86 | loss 3.27 | ppl 26.433 +| epoch 6 step 60800 | 3450 batches | lr 0.000334 | ms/batch 699.97 | loss 3.26 | ppl 25.944 +| epoch 6 step 61000 | 3650 batches | lr 0.000331 | ms/batch 699.08 | loss 3.26 | ppl 25.978 +| epoch 6 step 61200 | 3850 batches | lr 0.000328 | ms/batch 728.93 | loss 3.26 | ppl 26.106 +| epoch 6 step 61400 | 4050 batches | lr 0.000325 | ms/batch 698.87 | loss 3.28 | ppl 26.608 +| epoch 6 step 61600 | 4250 batches | lr 0.000322 | ms/batch 700.55 | loss 3.26 | ppl 26.047 +| epoch 6 step 61800 | 4450 batches | lr 0.000319 | ms/batch 743.96 | loss 3.27 | ppl 26.276 +| epoch 6 step 62000 | 4650 batches | lr 0.000317 | ms/batch 728.97 | loss 3.26 | ppl 26.099 +| epoch 6 step 62200 | 4850 batches | lr 0.000314 | ms/batch 731.16 | loss 3.25 | ppl 25.752 +| epoch 6 step 62400 | 5050 batches | lr 0.000311 | ms/batch 719.64 | loss 3.26 | ppl 26.134 +| epoch 6 step 62600 | 5250 batches | lr 0.000308 | ms/batch 760.40 | loss 3.25 | ppl 25.803 +| epoch 6 step 62800 | 5450 batches | lr 0.000305 | ms/batch 721.34 | loss 3.23 | ppl 25.210 +| epoch 6 step 63000 | 5650 batches | lr 0.000302 | ms/batch 717.89 | loss 3.27 | ppl 26.336 +| epoch 6 step 63200 | 5850 batches | lr 0.000299 | ms/batch 725.35 | loss 3.25 | ppl 25.735 +| epoch 6 step 63400 | 6050 batches | lr 0.000296 | ms/batch 686.94 | loss 3.24 | ppl 25.469 +| epoch 6 step 63600 | 6250 batches | lr 0.000293 | ms/batch 716.59 | loss 3.25 | ppl 25.788 +| epoch 6 step 63800 | 6450 batches | lr 0.000291 | ms/batch 707.89 | loss 3.25 | ppl 25.795 +| epoch 6 step 64000 | 6650 batches | lr 0.000288 | ms/batch 727.95 | loss 3.20 | ppl 24.511 +---------------------------------------------------------------------------------------------------- +| Eval 16 at step 64000 | time: 2885.83s | valid loss 3.25 | valid ppl 25.737 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 64200 | 6850 batches | lr 0.000285 | ms/batch 779.72 | loss 3.23 | ppl 25.290 +| epoch 6 step 64400 | 7050 batches | lr 0.000282 | ms/batch 687.37 | loss 3.23 | ppl 25.262 +| epoch 6 step 64600 | 7250 batches | lr 0.000279 | ms/batch 746.50 | loss 3.19 | ppl 24.366 +| epoch 6 step 64800 | 7450 batches | lr 0.000276 | ms/batch 718.93 | loss 3.22 | ppl 24.984 +| epoch 6 step 65000 | 7650 batches | lr 0.000274 | ms/batch 726.70 | loss 3.20 | ppl 24.541 +| epoch 6 step 65200 | 7850 batches | lr 0.000271 | ms/batch 719.23 | loss 3.22 | ppl 25.018 +| epoch 6 step 65400 | 8050 batches | lr 0.000268 | ms/batch 711.20 | loss 3.23 | ppl 25.214 +| epoch 6 step 65600 | 8250 batches | lr 0.000265 | ms/batch 717.61 | loss 3.21 | ppl 24.835 +| epoch 6 step 65800 | 8450 batches | lr 0.000262 | ms/batch 728.49 | loss 3.23 | ppl 25.206 +| epoch 6 step 66000 | 8650 batches | lr 0.00026 | ms/batch 730.31 | loss 3.21 | ppl 24.890 +| epoch 6 step 66200 | 8850 batches | lr 0.000257 | ms/batch 692.18 | loss 3.24 | ppl 25.410 +| epoch 6 step 66400 | 9050 batches | lr 0.000254 | ms/batch 735.80 | loss 3.22 | ppl 25.128 +| epoch 6 step 66600 | 9250 batches | lr 0.000251 | ms/batch 726.67 | loss 3.21 | ppl 24.728 +| epoch 6 step 66800 | 9450 batches | lr 0.000249 | ms/batch 691.71 | loss 3.23 | ppl 25.201 +| epoch 6 step 67000 | 9650 batches | lr 0.000246 | ms/batch 716.45 | loss 3.24 | ppl 25.548 +| epoch 6 step 67200 | 9850 batches 
| lr 0.000243 | ms/batch 721.99 | loss 3.19 | ppl 24.247 +| epoch 6 step 67400 | 10050 batches | lr 0.000241 | ms/batch 732.11 | loss 3.24 | ppl 25.416 +| epoch 6 step 67600 | 10250 batches | lr 0.000238 | ms/batch 732.60 | loss 3.19 | ppl 24.382 +| epoch 6 step 67800 | 10450 batches | lr 0.000235 | ms/batch 738.25 | loss 3.22 | ppl 25.058 +| epoch 6 step 68000 | 10650 batches | lr 0.000233 | ms/batch 728.29 | loss 3.23 | ppl 25.388 +---------------------------------------------------------------------------------------------------- +| Eval 17 at step 68000 | time: 2892.01s | valid loss 3.23 | valid ppl 25.318 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 68200 | 10850 batches | lr 0.00023 | ms/batch 761.27 | loss 3.18 | ppl 24.097 +| epoch 6 step 68400 | 11050 batches | lr 0.000227 | ms/batch 706.40 | loss 3.23 | ppl 25.283 +| epoch 6 step 68600 | 11250 batches | lr 0.000225 | ms/batch 763.81 | loss 3.24 | ppl 25.592 +| epoch 6 step 68800 | 11450 batches | lr 0.000222 | ms/batch 724.69 | loss 3.21 | ppl 24.756 +| epoch 7 step 69000 | 180 batches | lr 0.000219 | ms/batch 725.10 | loss 3.19 | ppl 24.390 +| epoch 7 step 69200 | 380 batches | lr 0.000217 | ms/batch 719.68 | loss 3.20 | ppl 24.464 +| epoch 7 step 69400 | 580 batches | lr 0.000214 | ms/batch 712.69 | loss 3.20 | ppl 24.451 +| epoch 7 step 69600 | 780 batches | lr 0.000212 | ms/batch 725.29 | loss 3.20 | ppl 24.622 +| epoch 7 step 69800 | 980 batches | lr 0.000209 | ms/batch 732.38 | loss 3.18 | ppl 24.086 +| epoch 7 step 70000 | 1180 batches | lr 0.000206 | ms/batch 744.68 | loss 3.21 | ppl 24.853 +| epoch 7 step 70200 | 1380 batches | lr 0.000204 | ms/batch 698.30 | loss 3.19 | ppl 24.298 +| epoch 7 step 70400 | 1580 batches | lr 0.000201 | ms/batch 693.41 | loss 3.19 | ppl 24.256 +| epoch 7 step 70600 | 1780 batches | lr 0.000199 | ms/batch 727.91 | loss 3.19 | ppl 24.231 +| epoch 7 step 70800 | 1980 batches | lr 0.000196 | ms/batch 689.58 | loss 3.22 | ppl 25.011 +| epoch 7 step 71000 | 2180 batches | lr 0.000194 | ms/batch 722.72 | loss 3.21 | ppl 24.789 +| epoch 7 step 71200 | 2380 batches | lr 0.000191 | ms/batch 720.35 | loss 3.20 | ppl 24.643 +| epoch 7 step 71400 | 2580 batches | lr 0.000189 | ms/batch 736.56 | loss 3.19 | ppl 24.315 +| epoch 7 step 71600 | 2780 batches | lr 0.000187 | ms/batch 713.16 | loss 3.17 | ppl 23.782 +| epoch 7 step 71800 | 2980 batches | lr 0.000184 | ms/batch 681.34 | loss 3.18 | ppl 24.050 +| epoch 7 step 72000 | 3180 batches | lr 0.000182 | ms/batch 712.65 | loss 3.19 | ppl 24.394 +---------------------------------------------------------------------------------------------------- +| Eval 18 at step 72000 | time: 2878.12s | valid loss 3.21 | valid ppl 24.850 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 72200 | 3380 batches | lr 0.000179 | ms/batch 749.92 | loss 3.19 | ppl 24.229 +| epoch 7 step 72400 | 3580 batches | lr 0.000177 | ms/batch 709.24 | loss 3.16 | ppl 23.648 +| epoch 7 step 72600 | 3780 batches | lr 0.000174 | ms/batch 732.91 | loss 3.18 | ppl 23.938 +| epoch 7 step 72800 | 3980 batches | lr 0.000172 | ms/batch 714.76 | loss 3.19 | ppl 24.213 +| epoch 7 step 73000 | 4180 batches | lr 0.00017 | ms/batch 719.33 | loss 3.18 | ppl 24.092 +| epoch 7 step 73200 | 4380 batches | lr 0.000167 | ms/batch 709.24 | loss 3.18 | ppl 24.057 +| epoch 7 step 73400 | 4580 batches | lr 0.000165 | ms/batch 750.40 | loss 3.20 | ppl 24.511 +| epoch 7 step 
73600 | 4780 batches | lr 0.000163 | ms/batch 732.09 | loss 3.15 | ppl 23.398 +| epoch 7 step 73800 | 4980 batches | lr 0.00016 | ms/batch 749.69 | loss 3.19 | ppl 24.322 +| epoch 7 step 74000 | 5180 batches | lr 0.000158 | ms/batch 732.47 | loss 3.16 | ppl 23.623 +| epoch 7 step 74200 | 5380 batches | lr 0.000156 | ms/batch 734.25 | loss 3.14 | ppl 23.147 +| epoch 7 step 74400 | 5580 batches | lr 0.000153 | ms/batch 705.61 | loss 3.16 | ppl 23.636 +| epoch 7 step 74600 | 5780 batches | lr 0.000151 | ms/batch 718.58 | loss 3.18 | ppl 24.164 +| epoch 7 step 74800 | 5980 batches | lr 0.000149 | ms/batch 718.67 | loss 3.16 | ppl 23.490 +| epoch 7 step 75000 | 6180 batches | lr 0.000147 | ms/batch 710.85 | loss 3.16 | ppl 23.495 +| epoch 7 step 75200 | 6380 batches | lr 0.000145 | ms/batch 724.50 | loss 3.19 | ppl 24.244 +| epoch 7 step 75400 | 6580 batches | lr 0.000142 | ms/batch 740.93 | loss 3.12 | ppl 22.548 +| epoch 7 step 75600 | 6780 batches | lr 0.00014 | ms/batch 745.37 | loss 3.15 | ppl 23.251 +| epoch 7 step 75800 | 6980 batches | lr 0.000138 | ms/batch 713.31 | loss 3.16 | ppl 23.564 +| epoch 7 step 76000 | 7180 batches | lr 0.000136 | ms/batch 720.59 | loss 3.11 | ppl 22.422 +---------------------------------------------------------------------------------------------------- +| Eval 19 at step 76000 | time: 2902.26s | valid loss 3.20 | valid ppl 24.479 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 76200 | 7380 batches | lr 0.000134 | ms/batch 762.44 | loss 3.14 | ppl 23.037 +| epoch 7 step 76400 | 7580 batches | lr 0.000131 | ms/batch 732.61 | loss 3.11 | ppl 22.458 +| epoch 7 step 76600 | 7780 batches | lr 0.000129 | ms/batch 695.86 | loss 3.15 | ppl 23.248 +| epoch 7 step 76800 | 7980 batches | lr 0.000127 | ms/batch 742.29 | loss 3.14 | ppl 23.190 +| epoch 7 step 77000 | 8180 batches | lr 0.000125 | ms/batch 752.96 | loss 3.13 | ppl 22.825 +| epoch 7 step 77200 | 8380 batches | lr 0.000123 | ms/batch 722.77 | loss 3.16 | ppl 23.556 +| epoch 7 step 77400 | 8580 batches | lr 0.000121 | ms/batch 719.94 | loss 3.14 | ppl 23.028 +| epoch 7 step 77600 | 8780 batches | lr 0.000119 | ms/batch 744.23 | loss 3.15 | ppl 23.304 +| epoch 7 step 77800 | 8980 batches | lr 0.000117 | ms/batch 750.43 | loss 3.15 | ppl 23.339 +| epoch 7 step 78000 | 9180 batches | lr 0.000115 | ms/batch 748.00 | loss 3.13 | ppl 22.849 +| epoch 7 step 78200 | 9380 batches | lr 0.000113 | ms/batch 748.11 | loss 3.15 | ppl 23.225 +| epoch 7 step 78400 | 9580 batches | lr 0.000111 | ms/batch 766.61 | loss 3.16 | ppl 23.632 +| epoch 7 step 78600 | 9780 batches | lr 0.000109 | ms/batch 760.63 | loss 3.14 | ppl 23.013 +| epoch 7 step 78800 | 9980 batches | lr 0.000107 | ms/batch 747.21 | loss 3.13 | ppl 22.924 +| epoch 7 step 79000 | 10180 batches | lr 0.000105 | ms/batch 735.24 | loss 3.13 | ppl 22.790 +| epoch 7 step 79200 | 10380 batches | lr 0.000103 | ms/batch 760.44 | loss 3.14 | ppl 23.063 +| epoch 7 step 79400 | 10580 batches | lr 0.000101 | ms/batch 758.52 | loss 3.16 | ppl 23.590 +| epoch 7 step 79600 | 10780 batches | lr 9.94e-05 | ms/batch 750.88 | loss 3.12 | ppl 22.600 +| epoch 7 step 79800 | 10980 batches | lr 9.75e-05 | ms/batch 754.39 | loss 3.14 | ppl 23.110 +| epoch 7 step 80000 | 11180 batches | lr 9.57e-05 | ms/batch 727.37 | loss 3.16 | ppl 23.628 +---------------------------------------------------------------------------------------------------- +| Eval 20 at step 80000 | time: 2972.05s | valid loss 3.18 | valid ppl 24.133 
+---------------------------------------------------------------------------------------------------- +| epoch 7 step 80200 | 11380 batches | lr 9.38e-05 | ms/batch 794.23 | loss 3.15 | ppl 23.294 +| epoch 8 step 80400 | 110 batches | lr 9.2e-05 | ms/batch 734.78 | loss 3.13 | ppl 22.874 +| epoch 8 step 80600 | 310 batches | lr 9.02e-05 | ms/batch 754.47 | loss 3.12 | ppl 22.589 +| epoch 8 step 80800 | 510 batches | lr 8.84e-05 | ms/batch 740.76 | loss 3.15 | ppl 23.330 +| epoch 8 step 81000 | 710 batches | lr 8.66e-05 | ms/batch 735.69 | loss 3.11 | ppl 22.359 +| epoch 8 step 81200 | 910 batches | lr 8.49e-05 | ms/batch 752.15 | loss 3.12 | ppl 22.600 +| epoch 8 step 81400 | 1110 batches | lr 8.31e-05 | ms/batch 742.53 | loss 3.15 | ppl 23.245 +| epoch 8 step 81600 | 1310 batches | lr 8.14e-05 | ms/batch 773.49 | loss 3.12 | ppl 22.646 +| epoch 8 step 81800 | 1510 batches | lr 7.97e-05 | ms/batch 760.43 | loss 3.12 | ppl 22.674 +| epoch 8 step 82000 | 1710 batches | lr 7.8e-05 | ms/batch 737.05 | loss 3.11 | ppl 22.328 +| epoch 8 step 82200 | 1910 batches | lr 7.63e-05 | ms/batch 733.76 | loss 3.14 | ppl 23.159 +| epoch 8 step 82400 | 2110 batches | lr 7.46e-05 | ms/batch 764.27 | loss 3.16 | ppl 23.570 +| epoch 8 step 82600 | 2310 batches | lr 7.3e-05 | ms/batch 772.41 | loss 3.14 | ppl 23.087 +| epoch 8 step 82800 | 2510 batches | lr 7.14e-05 | ms/batch 745.45 | loss 3.12 | ppl 22.685 +| epoch 8 step 83000 | 2710 batches | lr 6.98e-05 | ms/batch 755.61 | loss 3.12 | ppl 22.584 +| epoch 8 step 83200 | 2910 batches | lr 6.82e-05 | ms/batch 750.13 | loss 3.09 | ppl 22.066 +| epoch 8 step 83400 | 3110 batches | lr 6.66e-05 | ms/batch 748.21 | loss 3.12 | ppl 22.669 +| epoch 8 step 83600 | 3310 batches | lr 6.5e-05 | ms/batch 724.78 | loss 3.14 | ppl 23.128 +| epoch 8 step 83800 | 3510 batches | lr 6.35e-05 | ms/batch 740.45 | loss 3.10 | ppl 22.196 +| epoch 8 step 84000 | 3710 batches | lr 6.2e-05 | ms/batch 751.59 | loss 3.12 | ppl 22.623 +---------------------------------------------------------------------------------------------------- +| Eval 21 at step 84000 | time: 2998.13s | valid loss 3.17 | valid ppl 23.903 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 84200 | 3910 batches | lr 6.05e-05 | ms/batch 825.75 | loss 3.11 | ppl 22.467 +| epoch 8 step 84400 | 4110 batches | lr 5.9e-05 | ms/batch 733.29 | loss 3.12 | ppl 22.706 +| epoch 8 step 84600 | 4310 batches | lr 5.75e-05 | ms/batch 742.55 | loss 3.12 | ppl 22.669 +| epoch 8 step 84800 | 4510 batches | lr 5.6e-05 | ms/batch 751.39 | loss 3.14 | ppl 23.073 +| epoch 8 step 85000 | 4710 batches | lr 5.46e-05 | ms/batch 770.53 | loss 3.10 | ppl 22.104 +| epoch 8 step 85200 | 4910 batches | lr 5.32e-05 | ms/batch 739.47 | loss 3.11 | ppl 22.408 +| epoch 8 step 85400 | 5110 batches | lr 5.18e-05 | ms/batch 724.96 | loss 3.11 | ppl 22.412 +| epoch 8 step 85600 | 5310 batches | lr 5.04e-05 | ms/batch 741.18 | loss 3.10 | ppl 22.161 +| epoch 8 step 85800 | 5510 batches | lr 4.9e-05 | ms/batch 752.19 | loss 3.10 | ppl 22.286 +| epoch 8 step 86000 | 5710 batches | lr 4.77e-05 | ms/batch 746.66 | loss 3.11 | ppl 22.364 +| epoch 8 step 86200 | 5910 batches | lr 4.63e-05 | ms/batch 738.32 | loss 3.11 | ppl 22.427 +| epoch 8 step 86400 | 6110 batches | lr 4.5e-05 | ms/batch 759.33 | loss 3.10 | ppl 22.299 +| epoch 8 step 86600 | 6310 batches | lr 4.37e-05 | ms/batch 748.11 | loss 3.12 | ppl 22.675 +| epoch 8 step 86800 | 6510 batches | lr 4.25e-05 | ms/batch 745.24 | loss 3.07 
| ppl 21.580 +| epoch 8 step 87000 | 6710 batches | lr 4.12e-05 | ms/batch 745.61 | loss 3.08 | ppl 21.680 +| epoch 8 step 87200 | 6910 batches | lr 4e-05 | ms/batch 752.93 | loss 3.10 | ppl 22.089 +| epoch 8 step 87400 | 7110 batches | lr 3.87e-05 | ms/batch 604.82 | loss 3.09 | ppl 21.917 +| epoch 8 step 87600 | 7310 batches | lr 3.75e-05 | ms/batch 430.85 | loss 3.05 | ppl 21.129 +| epoch 8 step 87800 | 7510 batches | lr 3.63e-05 | ms/batch 430.44 | loss 3.09 | ppl 21.941 +| epoch 8 step 88000 | 7710 batches | lr 3.52e-05 | ms/batch 432.19 | loss 3.08 | ppl 21.673 +---------------------------------------------------------------------------------------------------- +| Eval 22 at step 88000 | time: 2776.62s | valid loss 3.16 | valid ppl 23.687 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 88200 | 7910 batches | lr 3.4e-05 | ms/batch 488.14 | loss 3.08 | ppl 21.771 +| epoch 8 step 88400 | 8110 batches | lr 3.29e-05 | ms/batch 430.18 | loss 3.09 | ppl 22.011 +| epoch 8 step 88600 | 8310 batches | lr 3.18e-05 | ms/batch 432.60 | loss 3.09 | ppl 21.873 +| epoch 8 step 88800 | 8510 batches | lr 3.07e-05 | ms/batch 432.02 | loss 3.08 | ppl 21.770 +| epoch 8 step 89000 | 8710 batches | lr 2.96e-05 | ms/batch 432.92 | loss 3.10 | ppl 22.144 +| epoch 8 step 89200 | 8910 batches | lr 2.86e-05 | ms/batch 431.36 | loss 3.10 | ppl 22.127 +| epoch 8 step 89400 | 9110 batches | lr 2.75e-05 | ms/batch 431.38 | loss 3.10 | ppl 22.138 +| epoch 8 step 89600 | 9310 batches | lr 2.65e-05 | ms/batch 430.48 | loss 3.08 | ppl 21.755 +| epoch 8 step 89800 | 9510 batches | lr 2.55e-05 | ms/batch 431.16 | loss 3.11 | ppl 22.437 +| epoch 8 step 90000 | 9710 batches | lr 2.45e-05 | ms/batch 429.64 | loss 3.09 | ppl 21.973 +| epoch 8 step 90200 | 9910 batches | lr 2.36e-05 | ms/batch 428.56 | loss 3.08 | ppl 21.767 +| epoch 8 step 90400 | 10110 batches | lr 2.26e-05 | ms/batch 429.16 | loss 3.09 | ppl 22.028 +| epoch 8 step 90600 | 10310 batches | lr 2.17e-05 | ms/batch 431.47 | loss 3.09 | ppl 21.880 +| epoch 8 step 90800 | 10510 batches | lr 2.08e-05 | ms/batch 430.01 | loss 3.11 | ppl 22.506 +| epoch 8 step 91000 | 10710 batches | lr 1.99e-05 | ms/batch 430.75 | loss 3.08 | ppl 21.691 +| epoch 8 step 91200 | 10910 batches | lr 1.9e-05 | ms/batch 431.30 | loss 3.07 | ppl 21.584 +| epoch 8 step 91400 | 11110 batches | lr 1.82e-05 | ms/batch 430.69 | loss 3.13 | ppl 22.905 +| epoch 8 step 91600 | 11310 batches | lr 1.73e-05 | ms/batch 431.02 | loss 3.09 | ppl 22.051 +| epoch 9 step 91800 | 40 batches | lr 1.65e-05 | ms/batch 429.67 | loss 3.11 | ppl 22.378 +| epoch 9 step 92000 | 240 batches | lr 1.57e-05 | ms/batch 430.81 | loss 3.06 | ppl 21.367 +---------------------------------------------------------------------------------------------------- +| Eval 23 at step 92000 | time: 1730.21s | valid loss 3.16 | valid ppl 23.602 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 92200 | 440 batches | lr 1.5e-05 | ms/batch 483.29 | loss 3.10 | ppl 22.199 +| epoch 9 step 92400 | 640 batches | lr 1.42e-05 | ms/batch 434.23 | loss 3.07 | ppl 21.539 +| epoch 9 step 92600 | 840 batches | lr 1.35e-05 | ms/batch 434.24 | loss 3.11 | ppl 22.439 +| epoch 9 step 92800 | 1040 batches | lr 1.28e-05 | ms/batch 432.72 | loss 3.07 | ppl 21.632 +| epoch 9 step 93000 | 1240 batches | lr 1.21e-05 | ms/batch 429.50 | loss 3.08 | ppl 21.800 +| epoch 9 step 93200 | 1440 batches | lr 1.14e-05 | ms/batch 
432.40 | loss 3.09 | ppl 22.049 +| epoch 9 step 93400 | 1640 batches | lr 1.07e-05 | ms/batch 431.08 | loss 3.07 | ppl 21.468 +| epoch 9 step 93600 | 1840 batches | lr 1.01e-05 | ms/batch 430.19 | loss 3.09 | ppl 21.946 +| epoch 9 step 93800 | 2040 batches | lr 9.47e-06 | ms/batch 431.40 | loss 3.13 | ppl 22.849 +| epoch 9 step 94000 | 2240 batches | lr 8.87e-06 | ms/batch 432.65 | loss 3.10 | ppl 22.092 +| epoch 9 step 94200 | 2440 batches | lr 8.29e-06 | ms/batch 429.09 | loss 3.10 | ppl 22.179 +| epoch 9 step 94400 | 2640 batches | lr 7.73e-06 | ms/batch 428.25 | loss 3.10 | ppl 22.114 +| epoch 9 step 94600 | 2840 batches | lr 7.19e-06 | ms/batch 428.08 | loss 3.05 | ppl 21.164 +| epoch 9 step 94800 | 3040 batches | lr 6.67e-06 | ms/batch 428.49 | loss 3.09 | ppl 22.038 +| epoch 9 step 95000 | 3240 batches | lr 6.17e-06 | ms/batch 430.82 | loss 3.09 | ppl 21.949 +| epoch 9 step 95200 | 3440 batches | lr 5.68e-06 | ms/batch 427.08 | loss 3.08 | ppl 21.680 +| epoch 9 step 95400 | 3640 batches | lr 5.22e-06 | ms/batch 428.74 | loss 3.07 | ppl 21.579 +| epoch 9 step 95600 | 3840 batches | lr 4.78e-06 | ms/batch 427.39 | loss 3.09 | ppl 21.879 +| epoch 9 step 95800 | 4040 batches | lr 4.35e-06 | ms/batch 427.67 | loss 3.10 | ppl 22.228 +| epoch 9 step 96000 | 4240 batches | lr 3.95e-06 | ms/batch 427.59 | loss 3.08 | ppl 21.796 +---------------------------------------------------------------------------------------------------- +| Eval 24 at step 96000 | time: 1726.61s | valid loss 3.16 | valid ppl 23.510 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 96200 | 4440 batches | lr 3.57e-06 | ms/batch 481.05 | loss 3.09 | ppl 21.968 +| epoch 9 step 96400 | 4640 batches | lr 3.2e-06 | ms/batch 426.74 | loss 3.09 | ppl 21.871 +| epoch 9 step 96600 | 4840 batches | lr 2.85e-06 | ms/batch 427.07 | loss 3.07 | ppl 21.565 +| epoch 9 step 96800 | 5040 batches | lr 2.53e-06 | ms/batch 436.58 | loss 3.09 | ppl 22.056 +| epoch 9 step 97000 | 5240 batches | lr 2.22e-06 | ms/batch 427.55 | loss 3.08 | ppl 21.784 +| epoch 9 step 97200 | 5440 batches | lr 1.94e-06 | ms/batch 426.99 | loss 3.05 | ppl 21.169 +| epoch 9 step 97400 | 5640 batches | lr 1.67e-06 | ms/batch 427.80 | loss 3.10 | ppl 22.104 +| epoch 9 step 97600 | 5840 batches | lr 1.42e-06 | ms/batch 429.61 | loss 3.09 | ppl 21.891 +| epoch 9 step 97800 | 6040 batches | lr 1.2e-06 | ms/batch 427.90 | loss 3.06 | ppl 21.431 +| epoch 9 step 98000 | 6240 batches | lr 9.88e-07 | ms/batch 431.01 | loss 3.08 | ppl 21.797 +| epoch 9 step 98200 | 6440 batches | lr 8.01e-07 | ms/batch 427.47 | loss 3.09 | ppl 21.956 +| epoch 9 step 98400 | 6640 batches | lr 6.33e-07 | ms/batch 427.01 | loss 3.04 | ppl 20.833 +| epoch 9 step 98600 | 6840 batches | lr 4.84e-07 | ms/batch 573.59 | loss 3.07 | ppl 21.489 +| epoch 9 step 98800 | 7040 batches | lr 3.56e-07 | ms/batch 711.47 | loss 3.07 | ppl 21.563 +| epoch 9 step 99000 | 7240 batches | lr 2.47e-07 | ms/batch 736.74 | loss 3.04 | ppl 20.823 +| epoch 9 step 99200 | 7440 batches | lr 1.58e-07 | ms/batch 708.78 | loss 3.05 | ppl 21.211 +| epoch 9 step 99400 | 7640 batches | lr 8.9e-08 | ms/batch 750.12 | loss 3.04 | ppl 20.909 +| epoch 9 step 99600 | 7840 batches | lr 3.96e-08 | ms/batch 726.05 | loss 3.07 | ppl 21.536 +| epoch 9 step 99800 | 8040 batches | lr 9.89e-09 | ms/batch 691.15 | loss 3.07 | ppl 21.509 +| epoch 9 step 100000 | 8240 batches | lr 0 | ms/batch 704.59 | loss 3.06 | ppl 21.301 
+---------------------------------------------------------------------------------------------------- +| Eval 25 at step 100000 | time: 2157.66s | valid loss 3.16 | valid ppl 23.503 +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.19 | test ppl 24.264 +==================================================================================================== diff --git a/NLP/Transformer-XL/exp_results/log-200k.txt b/NLP/Transformer-XL/exp_results/log-200k.txt new file mode 100644 index 0000000..62efb00 --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-200k.txt @@ -0,0 +1,1224 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adan + - lr : 0.001 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 3000 + - decay_rate : 0.5 + - lr_min : 1e-06 + - clip : 0.25 + - clip_nonemb : False + - max_step : 200000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220811-105308 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : [0.9, 0.9, 0.999] + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 6.67e-05 | ms/batch 776.32 | loss 8.90 | ppl 7366.806 +| epoch 1 step 400 | 400 batches | lr 0.000133 | ms/batch 706.08 | loss 6.85 | ppl 942.451 +| epoch 1 step 600 | 600 batches | lr 0.0002 | ms/batch 682.24 | loss 6.34 | ppl 567.781 +| epoch 1 step 800 | 800 batches | lr 0.000267 | ms/batch 727.20 | loss 6.06 | ppl 428.925 +| epoch 1 step 1000 | 1000 batches | lr 0.000333 | ms/batch 722.60 | loss 5.80 | ppl 330.968 +| epoch 1 step 1200 | 1200 batches | lr 0.0004 | ms/batch 707.72 | loss 5.60 | ppl 270.691 +| epoch 1 step 1400 | 1400 batches | lr 0.000467 | ms/batch 715.23 | loss 5.43 | ppl 228.271 +| epoch 1 step 1600 | 1600 batches | lr 0.000533 | ms/batch 717.15 | loss 5.28 | ppl 196.416 +| epoch 1 step 1800 | 1800 batches | lr 0.0006 | ms/batch 706.30 | loss 5.15 | ppl 173.240 +| epoch 1 step 2000 | 2000 batches | lr 0.000667 | ms/batch 692.22 | loss 5.04 | ppl 154.584 +| epoch 1 step 2200 | 2200 batches | lr 0.000733 | ms/batch 676.79 | loss 4.93 | ppl 138.813 +| 
epoch 1 step 2400 | 2400 batches | lr 0.0008 | ms/batch 692.14 | loss 4.85 | ppl 128.135 +| epoch 1 step 2600 | 2600 batches | lr 0.000867 | ms/batch 670.68 | loss 4.76 | ppl 116.945 +| epoch 1 step 2800 | 2800 batches | lr 0.000933 | ms/batch 709.41 | loss 4.69 | ppl 108.587 +| epoch 1 step 3000 | 3000 batches | lr 0.001 | ms/batch 684.10 | loss 4.64 | ppl 103.975 +| epoch 1 step 3200 | 3200 batches | lr 0.001 | ms/batch 705.82 | loss 4.58 | ppl 97.501 +| epoch 1 step 3400 | 3400 batches | lr 0.001 | ms/batch 696.96 | loss 4.53 | ppl 93.101 +| epoch 1 step 3600 | 3600 batches | lr 0.000999 | ms/batch 698.89 | loss 4.45 | ppl 85.852 +| epoch 1 step 3800 | 3800 batches | lr 0.000999 | ms/batch 728.79 | loss 4.48 | ppl 88.166 +| epoch 1 step 4000 | 4000 batches | lr 0.000999 | ms/batch 728.35 | loss 4.44 | ppl 84.369 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2837.46s | valid loss 4.37 | valid ppl 78.692 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.000999 | ms/batch 775.55 | loss 4.38 | ppl 79.980 +| epoch 1 step 4400 | 4400 batches | lr 0.000999 | ms/batch 703.47 | loss 4.36 | ppl 78.094 +| epoch 1 step 4600 | 4600 batches | lr 0.000999 | ms/batch 740.85 | loss 4.34 | ppl 76.334 +| epoch 1 step 4800 | 4800 batches | lr 0.000999 | ms/batch 705.75 | loss 4.28 | ppl 72.245 +| epoch 1 step 5000 | 5000 batches | lr 0.000999 | ms/batch 693.81 | loss 4.31 | ppl 74.614 +| epoch 1 step 5200 | 5200 batches | lr 0.000999 | ms/batch 712.14 | loss 4.25 | ppl 70.189 +| epoch 1 step 5400 | 5400 batches | lr 0.000998 | ms/batch 744.54 | loss 4.20 | ppl 66.510 +| epoch 1 step 5600 | 5600 batches | lr 0.000998 | ms/batch 686.33 | loss 4.22 | ppl 67.986 +| epoch 1 step 5800 | 5800 batches | lr 0.000998 | ms/batch 757.67 | loss 4.21 | ppl 67.454 +| epoch 1 step 6000 | 6000 batches | lr 0.000998 | ms/batch 743.34 | loss 4.17 | ppl 64.554 +| epoch 1 step 6200 | 6200 batches | lr 0.000998 | ms/batch 715.31 | loss 4.14 | ppl 62.901 +| epoch 1 step 6400 | 6400 batches | lr 0.000998 | ms/batch 726.38 | loss 4.17 | ppl 64.900 +| epoch 1 step 6600 | 6600 batches | lr 0.000998 | ms/batch 708.39 | loss 4.11 | ppl 60.722 +| epoch 1 step 6800 | 6800 batches | lr 0.000997 | ms/batch 681.98 | loss 4.10 | ppl 60.559 +| epoch 1 step 7000 | 7000 batches | lr 0.000997 | ms/batch 726.10 | loss 4.11 | ppl 60.652 +| epoch 1 step 7200 | 7200 batches | lr 0.000997 | ms/batch 714.34 | loss 4.06 | ppl 57.786 +| epoch 1 step 7400 | 7400 batches | lr 0.000997 | ms/batch 696.85 | loss 4.05 | ppl 57.517 +| epoch 1 step 7600 | 7600 batches | lr 0.000997 | ms/batch 720.62 | loss 4.03 | ppl 56.394 +| epoch 1 step 7800 | 7800 batches | lr 0.000996 | ms/batch 712.74 | loss 4.05 | ppl 57.635 +| epoch 1 step 8000 | 8000 batches | lr 0.000996 | ms/batch 695.84 | loss 4.05 | ppl 57.298 +---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 2868.86s | valid loss 3.94 | valid ppl 51.178 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.000996 | ms/batch 738.23 | loss 4.02 | ppl 55.917 +| epoch 1 step 8400 | 8400 batches | lr 0.000996 | ms/batch 734.08 | loss 4.03 | ppl 56.542 +| epoch 1 step 8600 | 8600 batches | lr 0.000996 | ms/batch 707.68 | loss 4.01 | ppl 55.411 +| epoch 1 step 8800 | 
8800 batches | lr 0.000995 | ms/batch 729.09 | loss 4.02 | ppl 55.927 +| epoch 1 step 9000 | 9000 batches | lr 0.000995 | ms/batch 686.10 | loss 3.99 | ppl 54.282 +| epoch 1 step 9200 | 9200 batches | lr 0.000995 | ms/batch 692.20 | loss 3.98 | ppl 53.707 +| epoch 1 step 9400 | 9400 batches | lr 0.000995 | ms/batch 735.51 | loss 3.99 | ppl 53.919 +| epoch 1 step 9600 | 9600 batches | lr 0.000995 | ms/batch 749.40 | loss 4.00 | ppl 54.757 +| epoch 1 step 9800 | 9800 batches | lr 0.000994 | ms/batch 704.19 | loss 3.96 | ppl 52.375 +| epoch 1 step 10000 | 10000 batches | lr 0.000994 | ms/batch 703.88 | loss 3.97 | ppl 53.129 +| epoch 1 step 10200 | 10200 batches | lr 0.000994 | ms/batch 727.49 | loss 3.94 | ppl 51.329 +| epoch 1 step 10400 | 10400 batches | lr 0.000994 | ms/batch 692.36 | loss 3.94 | ppl 51.268 +| epoch 1 step 10600 | 10600 batches | lr 0.000993 | ms/batch 694.79 | loss 3.96 | ppl 52.487 +| epoch 1 step 10800 | 10800 batches | lr 0.000993 | ms/batch 718.57 | loss 3.92 | ppl 50.269 +| epoch 1 step 11000 | 11000 batches | lr 0.000993 | ms/batch 698.89 | loss 3.96 | ppl 52.263 +| epoch 1 step 11200 | 11200 batches | lr 0.000993 | ms/batch 704.48 | loss 3.93 | ppl 51.073 +| epoch 1 step 11400 | 11400 batches | lr 0.000992 | ms/batch 705.65 | loss 3.93 | ppl 50.985 +| epoch 2 step 11600 | 130 batches | lr 0.000992 | ms/batch 691.91 | loss 3.90 | ppl 49.549 +| epoch 2 step 11800 | 330 batches | lr 0.000992 | ms/batch 692.51 | loss 3.88 | ppl 48.290 +| epoch 2 step 12000 | 530 batches | lr 0.000991 | ms/batch 705.18 | loss 3.90 | ppl 49.346 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 2838.27s | valid loss 3.79 | valid ppl 44.041 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 12200 | 730 batches | lr 0.000991 | ms/batch 759.90 | loss 3.87 | ppl 47.958 +| epoch 2 step 12400 | 930 batches | lr 0.000991 | ms/batch 714.42 | loss 3.87 | ppl 48.080 +| epoch 2 step 12600 | 1130 batches | lr 0.00099 | ms/batch 699.20 | loss 3.90 | ppl 49.413 +| epoch 2 step 12800 | 1330 batches | lr 0.00099 | ms/batch 708.63 | loss 3.87 | ppl 47.722 +| epoch 2 step 13000 | 1530 batches | lr 0.00099 | ms/batch 714.74 | loss 3.86 | ppl 47.251 +| epoch 2 step 13200 | 1730 batches | lr 0.00099 | ms/batch 684.72 | loss 3.85 | ppl 46.990 +| epoch 2 step 13400 | 1930 batches | lr 0.000989 | ms/batch 751.38 | loss 3.85 | ppl 47.227 +| epoch 2 step 13600 | 2130 batches | lr 0.000989 | ms/batch 715.16 | loss 3.87 | ppl 48.126 +| epoch 2 step 13800 | 2330 batches | lr 0.000989 | ms/batch 699.09 | loss 3.85 | ppl 46.907 +| epoch 2 step 14000 | 2530 batches | lr 0.000988 | ms/batch 711.72 | loss 3.83 | ppl 46.153 +| epoch 2 step 14200 | 2730 batches | lr 0.000988 | ms/batch 682.58 | loss 3.81 | ppl 45.173 +| epoch 2 step 14400 | 2930 batches | lr 0.000987 | ms/batch 719.64 | loss 3.79 | ppl 44.409 +| epoch 2 step 14600 | 3130 batches | lr 0.000987 | ms/batch 719.75 | loss 3.80 | ppl 44.802 +| epoch 2 step 14800 | 3330 batches | lr 0.000987 | ms/batch 715.90 | loss 3.81 | ppl 44.978 +| epoch 2 step 15000 | 3530 batches | lr 0.000986 | ms/batch 701.70 | loss 3.77 | ppl 43.266 +| epoch 2 step 15200 | 3730 batches | lr 0.000986 | ms/batch 731.21 | loss 3.80 | ppl 44.576 +| epoch 2 step 15400 | 3930 batches | lr 0.000986 | ms/batch 685.54 | loss 3.79 | ppl 44.202 +| epoch 2 step 15600 | 4130 batches | lr 0.000985 | ms/batch 715.92 | loss 3.78 | ppl 43.802 +| epoch 2 
step 15800 | 4330 batches | lr 0.000985 | ms/batch 709.67 | loss 3.79 | ppl 44.150 +| epoch 2 step 16000 | 4530 batches | lr 0.000985 | ms/batch 698.36 | loss 3.79 | ppl 44.245 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 2843.67s | valid loss 3.69 | valid ppl 40.088 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.000984 | ms/batch 794.03 | loss 3.75 | ppl 42.359 +| epoch 2 step 16400 | 4930 batches | lr 0.000984 | ms/batch 719.73 | loss 3.77 | ppl 43.208 +| epoch 2 step 16600 | 5130 batches | lr 0.000983 | ms/batch 687.12 | loss 3.76 | ppl 42.866 +| epoch 2 step 16800 | 5330 batches | lr 0.000983 | ms/batch 714.50 | loss 3.75 | ppl 42.520 +| epoch 2 step 17000 | 5530 batches | lr 0.000982 | ms/batch 740.55 | loss 3.74 | ppl 41.965 +| epoch 2 step 17200 | 5730 batches | lr 0.000982 | ms/batch 686.23 | loss 3.76 | ppl 42.748 +| epoch 2 step 17400 | 5930 batches | lr 0.000982 | ms/batch 714.69 | loss 3.74 | ppl 42.066 +| epoch 2 step 17600 | 6130 batches | lr 0.000981 | ms/batch 716.37 | loss 3.73 | ppl 41.737 +| epoch 2 step 17800 | 6330 batches | lr 0.000981 | ms/batch 709.37 | loss 3.76 | ppl 42.999 +| epoch 2 step 18000 | 6530 batches | lr 0.00098 | ms/batch 707.37 | loss 3.70 | ppl 40.547 +| epoch 2 step 18200 | 6730 batches | lr 0.00098 | ms/batch 740.15 | loss 3.71 | ppl 40.752 +| epoch 2 step 18400 | 6930 batches | lr 0.000979 | ms/batch 700.09 | loss 3.72 | ppl 41.308 +| epoch 2 step 18600 | 7130 batches | lr 0.000979 | ms/batch 692.00 | loss 3.70 | ppl 40.409 +| epoch 2 step 18800 | 7330 batches | lr 0.000979 | ms/batch 703.47 | loss 3.68 | ppl 39.589 +| epoch 2 step 19000 | 7530 batches | lr 0.000978 | ms/batch 688.29 | loss 3.70 | ppl 40.570 +| epoch 2 step 19200 | 7730 batches | lr 0.000978 | ms/batch 682.44 | loss 3.70 | ppl 40.581 +| epoch 2 step 19400 | 7930 batches | lr 0.000977 | ms/batch 728.02 | loss 3.70 | ppl 40.350 +| epoch 2 step 19600 | 8130 batches | lr 0.000977 | ms/batch 685.89 | loss 3.71 | ppl 40.839 +| epoch 2 step 19800 | 8330 batches | lr 0.000976 | ms/batch 750.43 | loss 3.70 | ppl 40.432 +| epoch 2 step 20000 | 8530 batches | lr 0.000976 | ms/batch 684.49 | loss 3.69 | ppl 40.035 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 | time: 2844.94s | valid loss 3.61 | valid ppl 36.930 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000975 | ms/batch 792.71 | loss 3.71 | ppl 40.665 +| epoch 2 step 20400 | 8930 batches | lr 0.000975 | ms/batch 724.20 | loss 3.70 | ppl 40.601 +| epoch 2 step 20600 | 9130 batches | lr 0.000974 | ms/batch 703.31 | loss 3.70 | ppl 40.266 +| epoch 2 step 20800 | 9330 batches | lr 0.000974 | ms/batch 712.60 | loss 3.68 | ppl 39.824 +| epoch 2 step 21000 | 9530 batches | lr 0.000973 | ms/batch 707.33 | loss 3.73 | ppl 41.620 +| epoch 2 step 21200 | 9730 batches | lr 0.000973 | ms/batch 732.18 | loss 3.68 | ppl 39.564 +| epoch 2 step 21400 | 9930 batches | lr 0.000972 | ms/batch 739.74 | loss 3.69 | ppl 39.997 +| epoch 2 step 21600 | 10130 batches | lr 0.000972 | ms/batch 721.44 | loss 3.67 | ppl 39.422 +| epoch 2 step 21800 | 10330 batches | lr 0.000971 | ms/batch 724.90 | loss 3.68 | ppl 39.825 +| epoch 2 step 22000 | 10530 batches | lr 0.000971 | ms/batch 700.39 | loss 3.70 | ppl 
40.466 +| epoch 2 step 22200 | 10730 batches | lr 0.00097 | ms/batch 697.06 | loss 3.67 | ppl 39.058 +| epoch 2 step 22400 | 10930 batches | lr 0.00097 | ms/batch 698.49 | loss 3.66 | ppl 39.010 +| epoch 2 step 22600 | 11130 batches | lr 0.000969 | ms/batch 735.66 | loss 3.71 | ppl 40.749 +| epoch 2 step 22800 | 11330 batches | lr 0.000968 | ms/batch 694.62 | loss 3.68 | ppl 39.480 +| epoch 3 step 23000 | 60 batches | lr 0.000968 | ms/batch 702.47 | loss 3.68 | ppl 39.624 +| epoch 3 step 23200 | 260 batches | lr 0.000967 | ms/batch 735.52 | loss 3.64 | ppl 37.917 +| epoch 3 step 23400 | 460 batches | lr 0.000967 | ms/batch 714.13 | loss 3.68 | ppl 39.527 +| epoch 3 step 23600 | 660 batches | lr 0.000966 | ms/batch 688.65 | loss 3.64 | ppl 38.062 +| epoch 3 step 23800 | 860 batches | lr 0.000966 | ms/batch 729.42 | loss 3.67 | ppl 39.410 +| epoch 3 step 24000 | 1060 batches | lr 0.000965 | ms/batch 720.33 | loss 3.66 | ppl 38.919 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 2870.93s | valid loss 3.57 | valid ppl 35.685 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000965 | ms/batch 762.39 | loss 3.65 | ppl 38.550 +| epoch 3 step 24400 | 1460 batches | lr 0.000964 | ms/batch 704.86 | loss 3.65 | ppl 38.452 +| epoch 3 step 24600 | 1660 batches | lr 0.000963 | ms/batch 712.42 | loss 3.64 | ppl 38.214 +| epoch 3 step 24800 | 1860 batches | lr 0.000963 | ms/batch 692.60 | loss 3.65 | ppl 38.427 +| epoch 3 step 25000 | 2060 batches | lr 0.000962 | ms/batch 712.66 | loss 3.69 | ppl 39.912 +| epoch 3 step 25200 | 2260 batches | lr 0.000962 | ms/batch 713.12 | loss 3.66 | ppl 38.905 +| epoch 3 step 25400 | 2460 batches | lr 0.000961 | ms/batch 746.11 | loss 3.65 | ppl 38.302 +| epoch 3 step 25600 | 2660 batches | lr 0.00096 | ms/batch 715.35 | loss 3.65 | ppl 38.395 +| epoch 3 step 25800 | 2860 batches | lr 0.00096 | ms/batch 709.29 | loss 3.59 | ppl 36.239 +| epoch 3 step 26000 | 3060 batches | lr 0.000959 | ms/batch 724.27 | loss 3.64 | ppl 38.109 +| epoch 3 step 26200 | 3260 batches | lr 0.000958 | ms/batch 684.82 | loss 3.64 | ppl 37.948 +| epoch 3 step 26400 | 3460 batches | lr 0.000958 | ms/batch 703.25 | loss 3.60 | ppl 36.652 +| epoch 3 step 26600 | 3660 batches | lr 0.000957 | ms/batch 697.91 | loss 3.62 | ppl 37.174 +| epoch 3 step 26800 | 3860 batches | lr 0.000957 | ms/batch 723.58 | loss 3.62 | ppl 37.381 +| epoch 3 step 27000 | 4060 batches | lr 0.000956 | ms/batch 720.99 | loss 3.63 | ppl 37.721 +| epoch 3 step 27200 | 4260 batches | lr 0.000955 | ms/batch 717.62 | loss 3.62 | ppl 37.339 +| epoch 3 step 27400 | 4460 batches | lr 0.000955 | ms/batch 722.90 | loss 3.62 | ppl 37.489 +| epoch 3 step 27600 | 4660 batches | lr 0.000954 | ms/batch 743.44 | loss 3.61 | ppl 37.092 +| epoch 3 step 27800 | 4860 batches | lr 0.000953 | ms/batch 696.12 | loss 3.60 | ppl 36.720 +| epoch 3 step 28000 | 5060 batches | lr 0.000953 | ms/batch 723.37 | loss 3.62 | ppl 37.226 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 2861.34s | valid loss 3.55 | valid ppl 34.679 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000952 | ms/batch 784.09 | loss 3.60 | ppl 36.586 +| epoch 3 step 28400 | 5460 batches | lr 0.000951 | ms/batch 697.94 | loss 
3.58 | ppl 35.797 +| epoch 3 step 28600 | 5660 batches | lr 0.000951 | ms/batch 696.51 | loss 3.63 | ppl 37.613 +| epoch 3 step 28800 | 5860 batches | lr 0.00095 | ms/batch 709.45 | loss 3.60 | ppl 36.645 +| epoch 3 step 29000 | 6060 batches | lr 0.000949 | ms/batch 726.06 | loss 3.60 | ppl 36.438 +| epoch 3 step 29200 | 6260 batches | lr 0.000949 | ms/batch 713.31 | loss 3.60 | ppl 36.437 +| epoch 3 step 29400 | 6460 batches | lr 0.000948 | ms/batch 711.05 | loss 3.60 | ppl 36.736 +| epoch 3 step 29600 | 6660 batches | lr 0.000947 | ms/batch 718.44 | loss 3.55 | ppl 34.875 +| epoch 3 step 29800 | 6860 batches | lr 0.000946 | ms/batch 702.59 | loss 3.58 | ppl 35.994 +| epoch 3 step 30000 | 7060 batches | lr 0.000946 | ms/batch 707.51 | loss 3.58 | ppl 35.706 +| epoch 3 step 30200 | 7260 batches | lr 0.000945 | ms/batch 721.07 | loss 3.55 | ppl 34.761 +| epoch 3 step 30400 | 7460 batches | lr 0.000944 | ms/batch 709.39 | loss 3.57 | ppl 35.623 +| epoch 3 step 30600 | 7660 batches | lr 0.000944 | ms/batch 744.37 | loss 3.56 | ppl 35.102 +| epoch 3 step 30800 | 7860 batches | lr 0.000943 | ms/batch 734.93 | loss 3.57 | ppl 35.533 +| epoch 3 step 31000 | 8060 batches | lr 0.000942 | ms/batch 726.62 | loss 3.58 | ppl 35.834 +| epoch 3 step 31200 | 8260 batches | lr 0.000941 | ms/batch 720.25 | loss 3.57 | ppl 35.399 +| epoch 3 step 31400 | 8460 batches | lr 0.000941 | ms/batch 718.52 | loss 3.58 | ppl 35.858 +| epoch 3 step 31600 | 8660 batches | lr 0.00094 | ms/batch 739.97 | loss 3.57 | ppl 35.692 +| epoch 3 step 31800 | 8860 batches | lr 0.000939 | ms/batch 718.51 | loss 3.58 | ppl 35.785 +| epoch 3 step 32000 | 9060 batches | lr 0.000938 | ms/batch 707.81 | loss 3.58 | ppl 35.812 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 2877.68s | valid loss 3.50 | valid ppl 33.030 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000938 | ms/batch 794.55 | loss 3.56 | ppl 35.300 +| epoch 3 step 32400 | 9460 batches | lr 0.000937 | ms/batch 707.68 | loss 3.59 | ppl 36.119 +| epoch 3 step 32600 | 9660 batches | lr 0.000936 | ms/batch 743.86 | loss 3.59 | ppl 36.164 +| epoch 3 step 32800 | 9860 batches | lr 0.000935 | ms/batch 695.30 | loss 3.54 | ppl 34.575 +| epoch 3 step 33000 | 10060 batches | lr 0.000935 | ms/batch 692.14 | loss 3.59 | ppl 36.388 +| epoch 3 step 33200 | 10260 batches | lr 0.000934 | ms/batch 715.57 | loss 3.54 | ppl 34.497 +| epoch 3 step 33400 | 10460 batches | lr 0.000933 | ms/batch 716.72 | loss 3.58 | ppl 35.765 +| epoch 3 step 33600 | 10660 batches | lr 0.000932 | ms/batch 731.54 | loss 3.58 | ppl 36.053 +| epoch 3 step 33800 | 10860 batches | lr 0.000931 | ms/batch 681.57 | loss 3.54 | ppl 34.340 +| epoch 3 step 34000 | 11060 batches | lr 0.000931 | ms/batch 703.97 | loss 3.58 | ppl 35.930 +| epoch 3 step 34200 | 11260 batches | lr 0.00093 | ms/batch 701.49 | loss 3.59 | ppl 36.200 +| epoch 3 step 34400 | 11460 batches | lr 0.000929 | ms/batch 733.09 | loss 3.56 | ppl 35.206 +| epoch 4 step 34600 | 190 batches | lr 0.000928 | ms/batch 756.94 | loss 3.54 | ppl 34.517 +| epoch 4 step 34800 | 390 batches | lr 0.000927 | ms/batch 720.83 | loss 3.55 | ppl 34.839 +| epoch 4 step 35000 | 590 batches | lr 0.000927 | ms/batch 720.58 | loss 3.54 | ppl 34.625 +| epoch 4 step 35200 | 790 batches | lr 0.000926 | ms/batch 697.74 | loss 3.56 | ppl 35.160 +| epoch 4 step 35400 | 990 batches | lr 0.000925 | 
ms/batch 699.80 | loss 3.54 | ppl 34.435 +| epoch 4 step 35600 | 1190 batches | lr 0.000924 | ms/batch 714.28 | loss 3.56 | ppl 35.131 +| epoch 4 step 35800 | 1390 batches | lr 0.000923 | ms/batch 756.65 | loss 3.55 | ppl 34.742 +| epoch 4 step 36000 | 1590 batches | lr 0.000922 | ms/batch 709.40 | loss 3.54 | ppl 34.353 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 2874.62s | valid loss 3.49 | valid ppl 32.646 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.000922 | ms/batch 803.92 | loss 3.55 | ppl 34.710 +| epoch 4 step 36400 | 1990 batches | lr 0.000921 | ms/batch 728.02 | loss 3.57 | ppl 35.683 +| epoch 4 step 36600 | 2190 batches | lr 0.00092 | ms/batch 688.41 | loss 3.56 | ppl 35.170 +| epoch 4 step 36800 | 2390 batches | lr 0.000919 | ms/batch 762.72 | loss 3.56 | ppl 35.152 +| epoch 4 step 37000 | 2590 batches | lr 0.000918 | ms/batch 713.16 | loss 3.54 | ppl 34.340 +| epoch 4 step 37200 | 2790 batches | lr 0.000917 | ms/batch 707.43 | loss 3.52 | ppl 33.736 +| epoch 4 step 37400 | 2990 batches | lr 0.000916 | ms/batch 740.26 | loss 3.54 | ppl 34.315 +| epoch 4 step 37600 | 3190 batches | lr 0.000916 | ms/batch 717.95 | loss 3.53 | ppl 34.261 +| epoch 4 step 37800 | 3390 batches | lr 0.000915 | ms/batch 709.80 | loss 3.53 | ppl 34.276 +| epoch 4 step 38000 | 3590 batches | lr 0.000914 | ms/batch 733.53 | loss 3.51 | ppl 33.321 +| epoch 4 step 38200 | 3790 batches | lr 0.000913 | ms/batch 758.57 | loss 3.53 | ppl 34.107 +| epoch 4 step 38400 | 3990 batches | lr 0.000912 | ms/batch 718.85 | loss 3.54 | ppl 34.534 +| epoch 4 step 38600 | 4190 batches | lr 0.000911 | ms/batch 739.54 | loss 3.52 | ppl 33.947 +| epoch 4 step 38800 | 4390 batches | lr 0.00091 | ms/batch 687.41 | loss 3.53 | ppl 34.144 +| epoch 4 step 39000 | 4590 batches | lr 0.000909 | ms/batch 738.74 | loss 3.54 | ppl 34.622 +| epoch 4 step 39200 | 4790 batches | lr 0.000908 | ms/batch 698.45 | loss 3.50 | ppl 33.113 +| epoch 4 step 39400 | 4990 batches | lr 0.000907 | ms/batch 693.14 | loss 3.55 | ppl 34.783 +| epoch 4 step 39600 | 5190 batches | lr 0.000907 | ms/batch 712.17 | loss 3.51 | ppl 33.354 +| epoch 4 step 39800 | 5390 batches | lr 0.000906 | ms/batch 703.60 | loss 3.49 | ppl 32.707 +| epoch 4 step 40000 | 5590 batches | lr 0.000905 | ms/batch 736.01 | loss 3.51 | ppl 33.575 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 2894.08s | valid loss 3.46 | valid ppl 31.859 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000904 | ms/batch 783.88 | loss 3.53 | ppl 34.189 +| epoch 4 step 40400 | 5990 batches | lr 0.000903 | ms/batch 727.73 | loss 3.51 | ppl 33.317 +| epoch 4 step 40600 | 6190 batches | lr 0.000902 | ms/batch 746.60 | loss 3.51 | ppl 33.287 +| epoch 4 step 40800 | 6390 batches | lr 0.000901 | ms/batch 716.44 | loss 3.53 | ppl 34.260 +| epoch 4 step 41000 | 6590 batches | lr 0.0009 | ms/batch 720.41 | loss 3.47 | ppl 32.119 +| epoch 4 step 41200 | 6790 batches | lr 0.000899 | ms/batch 717.76 | loss 3.49 | ppl 32.904 +| epoch 4 step 41400 | 6990 batches | lr 0.000898 | ms/batch 722.41 | loss 3.51 | ppl 33.437 +| epoch 4 step 41600 | 7190 batches | lr 0.000897 | ms/batch 691.50 | loss 3.46 | ppl 31.813 +| epoch 4 step 41800 | 7390 batches | 
lr 0.000896 | ms/batch 718.66 | loss 3.49 | ppl 32.731 +| epoch 4 step 42000 | 7590 batches | lr 0.000895 | ms/batch 704.21 | loss 3.47 | ppl 31.977 +| epoch 4 step 42200 | 7790 batches | lr 0.000894 | ms/batch 716.09 | loss 3.50 | ppl 32.973 +| epoch 4 step 42400 | 7990 batches | lr 0.000893 | ms/batch 716.72 | loss 3.49 | ppl 32.928 +| epoch 4 step 42600 | 8190 batches | lr 0.000892 | ms/batch 769.51 | loss 3.48 | ppl 32.525 +| epoch 4 step 42800 | 8390 batches | lr 0.000891 | ms/batch 721.86 | loss 3.51 | ppl 33.503 +| epoch 4 step 43000 | 8590 batches | lr 0.00089 | ms/batch 693.31 | loss 3.49 | ppl 32.709 +| epoch 4 step 43200 | 8790 batches | lr 0.000889 | ms/batch 716.81 | loss 3.51 | ppl 33.341 +| epoch 4 step 43400 | 8990 batches | lr 0.000888 | ms/batch 724.20 | loss 3.49 | ppl 32.874 +| epoch 4 step 43600 | 9190 batches | lr 0.000887 | ms/batch 743.40 | loss 3.48 | ppl 32.617 +| epoch 4 step 43800 | 9390 batches | lr 0.000886 | ms/batch 731.34 | loss 3.49 | ppl 32.906 +| epoch 4 step 44000 | 9590 batches | lr 0.000885 | ms/batch 707.15 | loss 3.51 | ppl 33.593 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 2893.83s | valid loss 3.44 | valid ppl 31.142 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 0.000884 | ms/batch 788.65 | loss 3.49 | ppl 32.688 +| epoch 4 step 44400 | 9990 batches | lr 0.000883 | ms/batch 722.71 | loss 3.49 | ppl 32.749 +| epoch 4 step 44600 | 10190 batches | lr 0.000882 | ms/batch 731.49 | loss 3.48 | ppl 32.440 +| epoch 4 step 44800 | 10390 batches | lr 0.000881 | ms/batch 722.01 | loss 3.48 | ppl 32.562 +| epoch 4 step 45000 | 10590 batches | lr 0.00088 | ms/batch 707.83 | loss 3.51 | ppl 33.595 +| epoch 4 step 45200 | 10790 batches | lr 0.000879 | ms/batch 721.94 | loss 3.47 | ppl 31.984 +| epoch 4 step 45400 | 10990 batches | lr 0.000878 | ms/batch 702.94 | loss 3.50 | ppl 33.148 +| epoch 4 step 45600 | 11190 batches | lr 0.000877 | ms/batch 731.15 | loss 3.51 | ppl 33.303 +| epoch 4 step 45800 | 11390 batches | lr 0.000876 | ms/batch 744.59 | loss 3.50 | ppl 33.078 +| epoch 5 step 46000 | 120 batches | lr 0.000875 | ms/batch 718.10 | loss 3.48 | ppl 32.481 +| epoch 5 step 46200 | 320 batches | lr 0.000874 | ms/batch 718.77 | loss 3.47 | ppl 31.988 +| epoch 5 step 46400 | 520 batches | lr 0.000873 | ms/batch 707.60 | loss 3.50 | ppl 33.036 +| epoch 5 step 46600 | 720 batches | lr 0.000872 | ms/batch 736.58 | loss 3.46 | ppl 31.813 +| epoch 5 step 46800 | 920 batches | lr 0.000871 | ms/batch 740.84 | loss 3.47 | ppl 31.987 +| epoch 5 step 47000 | 1120 batches | lr 0.00087 | ms/batch 697.11 | loss 3.50 | ppl 33.275 +| epoch 5 step 47200 | 1320 batches | lr 0.000869 | ms/batch 708.82 | loss 3.47 | ppl 32.018 +| epoch 5 step 47400 | 1520 batches | lr 0.000868 | ms/batch 730.85 | loss 3.47 | ppl 32.114 +| epoch 5 step 47600 | 1720 batches | lr 0.000867 | ms/batch 731.39 | loss 3.46 | ppl 31.886 +| epoch 5 step 47800 | 1920 batches | lr 0.000866 | ms/batch 733.07 | loss 3.49 | ppl 32.773 +| epoch 5 step 48000 | 2120 batches | lr 0.000865 | ms/batch 713.54 | loss 3.51 | ppl 33.315 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 2897.76s | valid loss 3.42 | valid ppl 30.472 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 
48200 | 2320 batches | lr 0.000864 | ms/batch 788.00 | loss 3.49 | ppl 32.699 +| epoch 5 step 48400 | 2520 batches | lr 0.000863 | ms/batch 762.17 | loss 3.47 | ppl 32.162 +| epoch 5 step 48600 | 2720 batches | lr 0.000861 | ms/batch 722.27 | loss 3.46 | ppl 31.777 +| epoch 5 step 48800 | 2920 batches | lr 0.00086 | ms/batch 724.85 | loss 3.45 | ppl 31.489 +| epoch 5 step 49000 | 3120 batches | lr 0.000859 | ms/batch 710.81 | loss 3.47 | ppl 32.099 +| epoch 5 step 49200 | 3320 batches | lr 0.000858 | ms/batch 706.84 | loss 3.48 | ppl 32.407 +| epoch 5 step 49400 | 3520 batches | lr 0.000857 | ms/batch 707.39 | loss 3.44 | ppl 31.235 +| epoch 5 step 49600 | 3720 batches | lr 0.000856 | ms/batch 716.47 | loss 3.47 | ppl 32.056 +| epoch 5 step 49800 | 3920 batches | lr 0.000855 | ms/batch 721.75 | loss 3.46 | ppl 31.917 +| epoch 5 step 50000 | 4120 batches | lr 0.000854 | ms/batch 701.48 | loss 3.46 | ppl 31.968 +| epoch 5 step 50200 | 4320 batches | lr 0.000853 | ms/batch 733.62 | loss 3.47 | ppl 32.081 +| epoch 5 step 50400 | 4520 batches | lr 0.000852 | ms/batch 707.41 | loss 3.48 | ppl 32.529 +| epoch 5 step 50600 | 4720 batches | lr 0.00085 | ms/batch 733.10 | loss 3.44 | ppl 31.243 +| epoch 5 step 50800 | 4920 batches | lr 0.000849 | ms/batch 439.30 | loss 3.46 | ppl 31.752 +| epoch 5 step 51000 | 5120 batches | lr 0.000848 | ms/batch 428.23 | loss 3.45 | ppl 31.582 +| epoch 5 step 51200 | 5320 batches | lr 0.000847 | ms/batch 428.16 | loss 3.45 | ppl 31.426 +| epoch 5 step 51400 | 5520 batches | lr 0.000846 | ms/batch 428.00 | loss 3.44 | ppl 31.258 +| epoch 5 step 51600 | 5720 batches | lr 0.000845 | ms/batch 428.31 | loss 3.46 | ppl 31.686 +| epoch 5 step 51800 | 5920 batches | lr 0.000844 | ms/batch 428.68 | loss 3.45 | ppl 31.622 +| epoch 5 step 52000 | 6120 batches | lr 0.000842 | ms/batch 428.13 | loss 3.45 | ppl 31.374 +---------------------------------------------------------------------------------------------------- +| Eval 13 at step 52000 | time: 2482.68s | valid loss 3.41 | valid ppl 30.380 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 52200 | 6320 batches | lr 0.000841 | ms/batch 479.93 | loss 3.47 | ppl 32.078 +| epoch 5 step 52400 | 6520 batches | lr 0.00084 | ms/batch 428.34 | loss 3.41 | ppl 30.391 +| epoch 5 step 52600 | 6720 batches | lr 0.000839 | ms/batch 428.29 | loss 3.42 | ppl 30.557 +| epoch 5 step 52800 | 6920 batches | lr 0.000838 | ms/batch 428.06 | loss 3.44 | ppl 31.190 +| epoch 5 step 53000 | 7120 batches | lr 0.000837 | ms/batch 427.79 | loss 3.43 | ppl 30.785 +| epoch 5 step 53200 | 7320 batches | lr 0.000836 | ms/batch 428.04 | loss 3.40 | ppl 29.880 +| epoch 5 step 53400 | 7520 batches | lr 0.000834 | ms/batch 427.78 | loss 3.43 | ppl 30.849 +| epoch 5 step 53600 | 7720 batches | lr 0.000833 | ms/batch 428.29 | loss 3.42 | ppl 30.652 +| epoch 5 step 53800 | 7920 batches | lr 0.000832 | ms/batch 430.31 | loss 3.42 | ppl 30.697 +| epoch 5 step 54000 | 8120 batches | lr 0.000831 | ms/batch 428.09 | loss 3.44 | ppl 31.114 +| epoch 5 step 54200 | 8320 batches | lr 0.00083 | ms/batch 428.52 | loss 3.43 | ppl 30.845 +| epoch 5 step 54400 | 8520 batches | lr 0.000828 | ms/batch 428.56 | loss 3.42 | ppl 30.624 +| epoch 5 step 54600 | 8720 batches | lr 0.000827 | ms/batch 428.02 | loss 3.44 | ppl 31.145 +| epoch 5 step 54800 | 8920 batches | lr 0.000826 | ms/batch 428.01 | loss 3.44 | ppl 31.221 +| epoch 5 step 55000 | 9120 batches | lr 0.000825 | ms/batch 427.99 | loss 3.43 | ppl 30.961 
+| epoch 5 step 55200 | 9320 batches | lr 0.000824 | ms/batch 428.43 | loss 3.42 | ppl 30.708 +| epoch 5 step 55400 | 9520 batches | lr 0.000823 | ms/batch 428.12 | loss 3.46 | ppl 31.685 +| epoch 5 step 55600 | 9720 batches | lr 0.000821 | ms/batch 427.89 | loss 3.43 | ppl 30.732 +| epoch 5 step 55800 | 9920 batches | lr 0.00082 | ms/batch 428.47 | loss 3.43 | ppl 30.858 +| epoch 5 step 56000 | 10120 batches | lr 0.000819 | ms/batch 428.88 | loss 3.43 | ppl 30.769 +---------------------------------------------------------------------------------------------------- +| Eval 14 at step 56000 | time: 1719.48s | valid loss 3.39 | valid ppl 29.702 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 56200 | 10320 batches | lr 0.000818 | ms/batch 481.91 | loss 3.43 | ppl 30.830 +| epoch 5 step 56400 | 10520 batches | lr 0.000816 | ms/batch 428.55 | loss 3.45 | ppl 31.519 +| epoch 5 step 56600 | 10720 batches | lr 0.000815 | ms/batch 428.19 | loss 3.42 | ppl 30.448 +| epoch 5 step 56800 | 10920 batches | lr 0.000814 | ms/batch 428.24 | loss 3.41 | ppl 30.308 +| epoch 5 step 57000 | 11120 batches | lr 0.000813 | ms/batch 428.07 | loss 3.47 | ppl 32.121 +| epoch 5 step 57200 | 11320 batches | lr 0.000812 | ms/batch 428.22 | loss 3.42 | ppl 30.698 +| epoch 6 step 57400 | 50 batches | lr 0.00081 | ms/batch 427.60 | loss 3.44 | ppl 31.304 +| epoch 6 step 57600 | 250 batches | lr 0.000809 | ms/batch 428.27 | loss 3.40 | ppl 29.816 +| epoch 6 step 57800 | 450 batches | lr 0.000808 | ms/batch 428.43 | loss 3.43 | ppl 31.010 +| epoch 6 step 58000 | 650 batches | lr 0.000807 | ms/batch 428.85 | loss 3.40 | ppl 29.986 +| epoch 6 step 58200 | 850 batches | lr 0.000805 | ms/batch 428.36 | loss 3.44 | ppl 31.179 +| epoch 6 step 58400 | 1050 batches | lr 0.000804 | ms/batch 428.27 | loss 3.42 | ppl 30.427 +| epoch 6 step 58600 | 1250 batches | lr 0.000803 | ms/batch 427.88 | loss 3.42 | ppl 30.439 +| epoch 6 step 58800 | 1450 batches | lr 0.000802 | ms/batch 428.26 | loss 3.42 | ppl 30.628 +| epoch 6 step 59000 | 1650 batches | lr 0.0008 | ms/batch 428.41 | loss 3.40 | ppl 29.997 +| epoch 6 step 59200 | 1850 batches | lr 0.000799 | ms/batch 428.81 | loss 3.42 | ppl 30.513 +| epoch 6 step 59400 | 2050 batches | lr 0.000798 | ms/batch 427.82 | loss 3.46 | ppl 31.775 +| epoch 6 step 59600 | 2250 batches | lr 0.000797 | ms/batch 428.09 | loss 3.43 | ppl 30.763 +| epoch 6 step 59800 | 2450 batches | lr 0.000795 | ms/batch 428.44 | loss 3.42 | ppl 30.721 +| epoch 6 step 60000 | 2650 batches | lr 0.000794 | ms/batch 428.03 | loss 3.42 | ppl 30.694 +---------------------------------------------------------------------------------------------------- +| Eval 15 at step 60000 | time: 1719.35s | valid loss 3.38 | valid ppl 29.457 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 60200 | 2850 batches | lr 0.000793 | ms/batch 481.37 | loss 3.37 | ppl 29.154 +| epoch 6 step 60400 | 3050 batches | lr 0.000792 | ms/batch 428.38 | loss 3.42 | ppl 30.655 +| epoch 6 step 60600 | 3250 batches | lr 0.00079 | ms/batch 428.15 | loss 3.41 | ppl 30.363 +| epoch 6 step 60800 | 3450 batches | lr 0.000789 | ms/batch 428.57 | loss 3.40 | ppl 29.835 +| epoch 6 step 61000 | 3650 batches | lr 0.000788 | ms/batch 428.17 | loss 3.40 | ppl 29.899 +| epoch 6 step 61200 | 3850 batches | lr 0.000786 | ms/batch 428.39 | loss 3.41 | ppl 30.122 +| epoch 6 step 61400 | 4050 batches | lr 0.000785 | ms/batch 428.27 | loss 
3.42 | ppl 30.664 +| epoch 6 step 61600 | 4250 batches | lr 0.000784 | ms/batch 428.29 | loss 3.41 | ppl 30.120 +| epoch 6 step 61800 | 4450 batches | lr 0.000783 | ms/batch 427.99 | loss 3.41 | ppl 30.317 +| epoch 6 step 62000 | 4650 batches | lr 0.000781 | ms/batch 428.43 | loss 3.41 | ppl 30.140 +| epoch 6 step 62200 | 4850 batches | lr 0.00078 | ms/batch 428.23 | loss 3.40 | ppl 29.843 +| epoch 6 step 62400 | 5050 batches | lr 0.000779 | ms/batch 428.52 | loss 3.41 | ppl 30.256 +| epoch 6 step 62600 | 5250 batches | lr 0.000777 | ms/batch 428.32 | loss 3.40 | ppl 29.897 +| epoch 6 step 62800 | 5450 batches | lr 0.000776 | ms/batch 428.15 | loss 3.37 | ppl 29.184 +| epoch 6 step 63000 | 5650 batches | lr 0.000775 | ms/batch 428.74 | loss 3.42 | ppl 30.596 +| epoch 6 step 63200 | 5850 batches | lr 0.000773 | ms/batch 428.17 | loss 3.40 | ppl 29.873 +| epoch 6 step 63400 | 6050 batches | lr 0.000772 | ms/batch 431.10 | loss 3.39 | ppl 29.602 +| epoch 6 step 63600 | 6250 batches | lr 0.000771 | ms/batch 428.80 | loss 3.40 | ppl 29.894 +| epoch 6 step 63800 | 6450 batches | lr 0.000769 | ms/batch 428.27 | loss 3.40 | ppl 30.015 +| epoch 6 step 64000 | 6650 batches | lr 0.000768 | ms/batch 427.89 | loss 3.35 | ppl 28.502 +---------------------------------------------------------------------------------------------------- +| Eval 16 at step 64000 | time: 1720.26s | valid loss 3.37 | valid ppl 29.191 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 64200 | 6850 batches | lr 0.000767 | ms/batch 480.29 | loss 3.38 | ppl 29.424 +| epoch 6 step 64400 | 7050 batches | lr 0.000765 | ms/batch 428.06 | loss 3.38 | ppl 29.457 +| epoch 6 step 64600 | 7250 batches | lr 0.000764 | ms/batch 428.26 | loss 3.35 | ppl 28.404 +| epoch 6 step 64800 | 7450 batches | lr 0.000763 | ms/batch 427.97 | loss 3.37 | ppl 29.176 +| epoch 6 step 65000 | 7650 batches | lr 0.000761 | ms/batch 427.80 | loss 3.36 | ppl 28.687 +| epoch 6 step 65200 | 7850 batches | lr 0.00076 | ms/batch 427.94 | loss 3.38 | ppl 29.239 +| epoch 6 step 65400 | 8050 batches | lr 0.000759 | ms/batch 428.21 | loss 3.38 | ppl 29.423 +| epoch 6 step 65600 | 8250 batches | lr 0.000757 | ms/batch 428.24 | loss 3.37 | ppl 29.027 +| epoch 6 step 65800 | 8450 batches | lr 0.000756 | ms/batch 428.08 | loss 3.39 | ppl 29.561 +| epoch 6 step 66000 | 8650 batches | lr 0.000755 | ms/batch 428.12 | loss 3.37 | ppl 29.182 +| epoch 6 step 66200 | 8850 batches | lr 0.000753 | ms/batch 427.80 | loss 3.39 | ppl 29.755 +| epoch 6 step 66400 | 9050 batches | lr 0.000752 | ms/batch 427.84 | loss 3.38 | ppl 29.461 +| epoch 6 step 66600 | 9250 batches | lr 0.000751 | ms/batch 428.23 | loss 3.37 | ppl 29.042 +| epoch 6 step 66800 | 9450 batches | lr 0.000749 | ms/batch 428.13 | loss 3.39 | ppl 29.675 +| epoch 6 step 67000 | 9650 batches | lr 0.000748 | ms/batch 428.30 | loss 3.40 | ppl 29.988 +| epoch 6 step 67200 | 9850 batches | lr 0.000747 | ms/batch 427.99 | loss 3.35 | ppl 28.570 +| epoch 6 step 67400 | 10050 batches | lr 0.000745 | ms/batch 427.95 | loss 3.40 | ppl 29.984 +| epoch 6 step 67600 | 10250 batches | lr 0.000744 | ms/batch 428.03 | loss 3.35 | ppl 28.630 +| epoch 6 step 67800 | 10450 batches | lr 0.000742 | ms/batch 430.31 | loss 3.39 | ppl 29.531 +| epoch 6 step 68000 | 10650 batches | lr 0.000741 | ms/batch 427.87 | loss 3.40 | ppl 29.901 +---------------------------------------------------------------------------------------------------- +| Eval 17 at step 68000 | time: 1719.02s | 
valid loss 3.36 | valid ppl 28.688 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 68200 | 10850 batches | lr 0.00074 | ms/batch 480.96 | loss 3.35 | ppl 28.405 +| epoch 6 step 68400 | 11050 batches | lr 0.000738 | ms/batch 427.96 | loss 3.39 | ppl 29.811 +| epoch 6 step 68600 | 11250 batches | lr 0.000737 | ms/batch 428.15 | loss 3.41 | ppl 30.203 +| epoch 6 step 68800 | 11450 batches | lr 0.000736 | ms/batch 428.01 | loss 3.37 | ppl 29.109 +| epoch 7 step 69000 | 180 batches | lr 0.000734 | ms/batch 426.98 | loss 3.36 | ppl 28.847 +| epoch 7 step 69200 | 380 batches | lr 0.000733 | ms/batch 427.99 | loss 3.36 | ppl 28.907 +| epoch 7 step 69400 | 580 batches | lr 0.000731 | ms/batch 428.36 | loss 3.37 | ppl 28.943 +| epoch 7 step 69600 | 780 batches | lr 0.00073 | ms/batch 428.04 | loss 3.37 | ppl 29.147 +| epoch 7 step 69800 | 980 batches | lr 0.000729 | ms/batch 428.00 | loss 3.35 | ppl 28.565 +| epoch 7 step 70000 | 1180 batches | lr 0.000727 | ms/batch 428.01 | loss 3.38 | ppl 29.455 +| epoch 7 step 70200 | 1380 batches | lr 0.000726 | ms/batch 428.23 | loss 3.36 | ppl 28.842 +| epoch 7 step 70400 | 1580 batches | lr 0.000724 | ms/batch 428.06 | loss 3.36 | ppl 28.832 +| epoch 7 step 70600 | 1780 batches | lr 0.000723 | ms/batch 428.43 | loss 3.36 | ppl 28.804 +| epoch 7 step 70800 | 1980 batches | lr 0.000722 | ms/batch 428.28 | loss 3.39 | ppl 29.744 +| epoch 7 step 71000 | 2180 batches | lr 0.00072 | ms/batch 428.36 | loss 3.38 | ppl 29.446 +| epoch 7 step 71200 | 2380 batches | lr 0.000719 | ms/batch 428.04 | loss 3.38 | ppl 29.368 +| epoch 7 step 71400 | 2580 batches | lr 0.000717 | ms/batch 428.28 | loss 3.36 | ppl 28.901 +| epoch 7 step 71600 | 2780 batches | lr 0.000716 | ms/batch 428.22 | loss 3.34 | ppl 28.336 +| epoch 7 step 71800 | 2980 batches | lr 0.000714 | ms/batch 427.98 | loss 3.36 | ppl 28.688 +| epoch 7 step 72000 | 3180 batches | lr 0.000713 | ms/batch 428.29 | loss 3.37 | ppl 29.018 +---------------------------------------------------------------------------------------------------- +| Eval 18 at step 72000 | time: 1718.69s | valid loss 3.34 | valid ppl 28.340 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 72200 | 3380 batches | lr 0.000712 | ms/batch 480.57 | loss 3.36 | ppl 28.833 +| epoch 7 step 72400 | 3580 batches | lr 0.00071 | ms/batch 428.02 | loss 3.34 | ppl 28.200 +| epoch 7 step 72600 | 3780 batches | lr 0.000709 | ms/batch 428.30 | loss 3.36 | ppl 28.651 +| epoch 7 step 72800 | 3980 batches | lr 0.000707 | ms/batch 428.18 | loss 3.36 | ppl 28.922 +| epoch 7 step 73000 | 4180 batches | lr 0.000706 | ms/batch 428.44 | loss 3.36 | ppl 28.777 +| epoch 7 step 73200 | 4380 batches | lr 0.000704 | ms/batch 428.60 | loss 3.36 | ppl 28.768 +| epoch 7 step 73400 | 4580 batches | lr 0.000703 | ms/batch 427.98 | loss 3.38 | ppl 29.301 +| epoch 7 step 73600 | 4780 batches | lr 0.000702 | ms/batch 427.88 | loss 3.33 | ppl 28.012 +| epoch 7 step 73800 | 4980 batches | lr 0.0007 | ms/batch 428.03 | loss 3.37 | ppl 29.179 +| epoch 7 step 74000 | 5180 batches | lr 0.000699 | ms/batch 428.27 | loss 3.34 | ppl 28.334 +| epoch 7 step 74200 | 5380 batches | lr 0.000697 | ms/batch 428.23 | loss 3.32 | ppl 27.662 +| epoch 7 step 74400 | 5580 batches | lr 0.000696 | ms/batch 428.04 | loss 3.35 | ppl 28.373 +| epoch 7 step 74600 | 5780 batches | lr 0.000694 | ms/batch 428.14 | loss 3.37 | ppl 28.974 +| epoch 7 step 74800 | 5980 batches | 
lr 0.000693 | ms/batch 428.03 | loss 3.34 | ppl 28.198 +| epoch 7 step 75000 | 6180 batches | lr 0.000691 | ms/batch 428.09 | loss 3.34 | ppl 28.141 +| epoch 7 step 75200 | 6380 batches | lr 0.00069 | ms/batch 428.46 | loss 3.37 | ppl 29.134 +| epoch 7 step 75400 | 6580 batches | lr 0.000689 | ms/batch 428.24 | loss 3.30 | ppl 27.073 +| epoch 7 step 75600 | 6780 batches | lr 0.000687 | ms/batch 428.32 | loss 3.33 | ppl 27.915 +| epoch 7 step 75800 | 6980 batches | lr 0.000686 | ms/batch 428.01 | loss 3.34 | ppl 28.342 +| epoch 7 step 76000 | 7180 batches | lr 0.000684 | ms/batch 428.26 | loss 3.30 | ppl 27.012 +---------------------------------------------------------------------------------------------------- +| Eval 19 at step 76000 | time: 1719.03s | valid loss 3.34 | valid ppl 28.085 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 76200 | 7380 batches | lr 0.000683 | ms/batch 480.62 | loss 3.32 | ppl 27.748 +| epoch 7 step 76400 | 7580 batches | lr 0.000681 | ms/batch 428.12 | loss 3.30 | ppl 27.084 +| epoch 7 step 76600 | 7780 batches | lr 0.00068 | ms/batch 428.01 | loss 3.33 | ppl 28.010 +| epoch 7 step 76800 | 7980 batches | lr 0.000678 | ms/batch 428.40 | loss 3.33 | ppl 27.921 +| epoch 7 step 77000 | 8180 batches | lr 0.000677 | ms/batch 428.37 | loss 3.31 | ppl 27.488 +| epoch 7 step 77200 | 8380 batches | lr 0.000675 | ms/batch 428.44 | loss 3.35 | ppl 28.428 +| epoch 7 step 77400 | 8580 batches | lr 0.000674 | ms/batch 428.56 | loss 3.32 | ppl 27.769 +| epoch 7 step 77600 | 8780 batches | lr 0.000672 | ms/batch 428.27 | loss 3.34 | ppl 28.127 +| epoch 7 step 77800 | 8980 batches | lr 0.000671 | ms/batch 428.11 | loss 3.34 | ppl 28.080 +| epoch 7 step 78000 | 9180 batches | lr 0.00067 | ms/batch 428.36 | loss 3.32 | ppl 27.589 +| epoch 7 step 78200 | 9380 batches | lr 0.000668 | ms/batch 428.37 | loss 3.33 | ppl 28.024 +| epoch 7 step 78400 | 9580 batches | lr 0.000667 | ms/batch 428.24 | loss 3.35 | ppl 28.582 +| epoch 7 step 78600 | 9780 batches | lr 0.000665 | ms/batch 428.30 | loss 3.32 | ppl 27.792 +| epoch 7 step 78800 | 9980 batches | lr 0.000664 | ms/batch 428.32 | loss 3.33 | ppl 27.822 +| epoch 7 step 79000 | 10180 batches | lr 0.000662 | ms/batch 428.43 | loss 3.31 | ppl 27.507 +| epoch 7 step 79200 | 10380 batches | lr 0.000661 | ms/batch 428.67 | loss 3.33 | ppl 27.883 +| epoch 7 step 79400 | 10580 batches | lr 0.000659 | ms/batch 428.45 | loss 3.35 | ppl 28.534 +| epoch 7 step 79600 | 10780 batches | lr 0.000658 | ms/batch 428.45 | loss 3.31 | ppl 27.300 +| epoch 7 step 79800 | 10980 batches | lr 0.000656 | ms/batch 428.51 | loss 3.33 | ppl 28.003 +| epoch 7 step 80000 | 11180 batches | lr 0.000655 | ms/batch 428.08 | loss 3.35 | ppl 28.570 +---------------------------------------------------------------------------------------------------- +| Eval 20 at step 80000 | time: 1719.62s | valid loss 3.33 | valid ppl 27.910 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 80200 | 11380 batches | lr 0.000653 | ms/batch 481.33 | loss 3.34 | ppl 28.104 +| epoch 8 step 80400 | 110 batches | lr 0.000652 | ms/batch 427.32 | loss 3.32 | ppl 27.722 +| epoch 8 step 80600 | 310 batches | lr 0.00065 | ms/batch 428.44 | loss 3.31 | ppl 27.342 +| epoch 8 step 80800 | 510 batches | lr 0.000649 | ms/batch 428.57 | loss 3.34 | ppl 28.236 +| epoch 8 step 81000 | 710 batches | lr 0.000647 | ms/batch 428.00 | loss 3.30 | ppl 27.046 +| epoch 8 step 
81200 | 910 batches | lr 0.000646 | ms/batch 428.73 | loss 3.31 | ppl 27.389 +| epoch 8 step 81400 | 1110 batches | lr 0.000644 | ms/batch 428.04 | loss 3.34 | ppl 28.203 +| epoch 8 step 81600 | 1310 batches | lr 0.000643 | ms/batch 428.37 | loss 3.31 | ppl 27.453 +| epoch 8 step 81800 | 1510 batches | lr 0.000641 | ms/batch 428.54 | loss 3.31 | ppl 27.477 +| epoch 8 step 82000 | 1710 batches | lr 0.00064 | ms/batch 428.08 | loss 3.30 | ppl 27.048 +| epoch 8 step 82200 | 1910 batches | lr 0.000638 | ms/batch 428.45 | loss 3.33 | ppl 28.077 +| epoch 8 step 82400 | 2110 batches | lr 0.000637 | ms/batch 428.41 | loss 3.35 | ppl 28.551 +| epoch 8 step 82600 | 2310 batches | lr 0.000635 | ms/batch 428.17 | loss 3.33 | ppl 27.998 +| epoch 8 step 82800 | 2510 batches | lr 0.000634 | ms/batch 428.32 | loss 3.31 | ppl 27.500 +| epoch 8 step 83000 | 2710 batches | lr 0.000632 | ms/batch 428.30 | loss 3.31 | ppl 27.355 +| epoch 8 step 83200 | 2910 batches | lr 0.000631 | ms/batch 428.26 | loss 3.29 | ppl 26.778 +| epoch 8 step 83400 | 3110 batches | lr 0.000629 | ms/batch 428.27 | loss 3.32 | ppl 27.565 +| epoch 8 step 83600 | 3310 batches | lr 0.000628 | ms/batch 428.68 | loss 3.33 | ppl 27.977 +| epoch 8 step 83800 | 3510 batches | lr 0.000626 | ms/batch 428.36 | loss 3.29 | ppl 26.866 +| epoch 8 step 84000 | 3710 batches | lr 0.000624 | ms/batch 428.21 | loss 3.31 | ppl 27.460 +---------------------------------------------------------------------------------------------------- +| Eval 21 at step 84000 | time: 1719.55s | valid loss 3.31 | valid ppl 27.444 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 84200 | 3910 batches | lr 0.000623 | ms/batch 480.82 | loss 3.30 | ppl 27.247 +| epoch 8 step 84400 | 4110 batches | lr 0.000621 | ms/batch 428.46 | loss 3.32 | ppl 27.559 +| epoch 8 step 84600 | 4310 batches | lr 0.00062 | ms/batch 428.36 | loss 3.31 | ppl 27.483 +| epoch 8 step 84800 | 4510 batches | lr 0.000618 | ms/batch 428.27 | loss 3.33 | ppl 27.937 +| epoch 8 step 85000 | 4710 batches | lr 0.000617 | ms/batch 428.47 | loss 3.29 | ppl 26.787 +| epoch 8 step 85200 | 4910 batches | lr 0.000615 | ms/batch 428.45 | loss 3.30 | ppl 27.248 +| epoch 8 step 85400 | 5110 batches | lr 0.000614 | ms/batch 428.55 | loss 3.30 | ppl 27.202 +| epoch 8 step 85600 | 5310 batches | lr 0.000612 | ms/batch 428.21 | loss 3.29 | ppl 26.922 +| epoch 8 step 85800 | 5510 batches | lr 0.000611 | ms/batch 428.44 | loss 3.30 | ppl 26.991 +| epoch 8 step 86000 | 5710 batches | lr 0.000609 | ms/batch 428.89 | loss 3.30 | ppl 27.137 +| epoch 8 step 86200 | 5910 batches | lr 0.000608 | ms/batch 428.44 | loss 3.31 | ppl 27.249 +| epoch 8 step 86400 | 6110 batches | lr 0.000606 | ms/batch 428.40 | loss 3.30 | ppl 27.105 +| epoch 8 step 86600 | 6310 batches | lr 0.000605 | ms/batch 428.80 | loss 3.31 | ppl 27.474 +| epoch 8 step 86800 | 6510 batches | lr 0.000603 | ms/batch 429.72 | loss 3.26 | ppl 26.174 +| epoch 8 step 87000 | 6710 batches | lr 0.000602 | ms/batch 428.74 | loss 3.27 | ppl 26.276 +| epoch 8 step 87200 | 6910 batches | lr 0.0006 | ms/batch 428.17 | loss 3.29 | ppl 26.765 +| epoch 8 step 87400 | 7110 batches | lr 0.000598 | ms/batch 427.98 | loss 3.28 | ppl 26.610 +| epoch 8 step 87600 | 7310 batches | lr 0.000597 | ms/batch 428.15 | loss 3.25 | ppl 25.667 +| epoch 8 step 87800 | 7510 batches | lr 0.000595 | ms/batch 428.23 | loss 3.28 | ppl 26.612 +| epoch 8 step 88000 | 7710 batches | lr 0.000594 | ms/batch 428.25 | loss 3.27 | ppl 26.351 
+---------------------------------------------------------------------------------------------------- +| Eval 22 at step 88000 | time: 1720.20s | valid loss 3.30 | valid ppl 27.148 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 88200 | 7910 batches | lr 0.000592 | ms/batch 481.35 | loss 3.27 | ppl 26.388 +| epoch 8 step 88400 | 8110 batches | lr 0.000591 | ms/batch 428.47 | loss 3.28 | ppl 26.693 +| epoch 8 step 88600 | 8310 batches | lr 0.000589 | ms/batch 428.66 | loss 3.28 | ppl 26.491 +| epoch 8 step 88800 | 8510 batches | lr 0.000588 | ms/batch 428.62 | loss 3.28 | ppl 26.477 +| epoch 8 step 89000 | 8710 batches | lr 0.000586 | ms/batch 428.72 | loss 3.29 | ppl 26.868 +| epoch 8 step 89200 | 8910 batches | lr 0.000585 | ms/batch 431.39 | loss 3.29 | ppl 26.753 +| epoch 8 step 89400 | 9110 batches | lr 0.000583 | ms/batch 429.99 | loss 3.29 | ppl 26.822 +| epoch 8 step 89600 | 9310 batches | lr 0.000581 | ms/batch 428.65 | loss 3.27 | ppl 26.355 +| epoch 8 step 89800 | 9510 batches | lr 0.00058 | ms/batch 428.13 | loss 3.30 | ppl 27.153 +| epoch 8 step 90000 | 9710 batches | lr 0.000578 | ms/batch 428.01 | loss 3.28 | ppl 26.579 +| epoch 8 step 90200 | 9910 batches | lr 0.000577 | ms/batch 428.22 | loss 3.27 | ppl 26.390 +| epoch 8 step 90400 | 10110 batches | lr 0.000575 | ms/batch 427.84 | loss 3.28 | ppl 26.629 +| epoch 8 step 90600 | 10310 batches | lr 0.000574 | ms/batch 428.60 | loss 3.28 | ppl 26.444 +| epoch 8 step 90800 | 10510 batches | lr 0.000572 | ms/batch 429.39 | loss 3.30 | ppl 27.174 +| epoch 8 step 91000 | 10710 batches | lr 0.000571 | ms/batch 428.29 | loss 3.27 | ppl 26.291 +| epoch 8 step 91200 | 10910 batches | lr 0.000569 | ms/batch 430.09 | loss 3.26 | ppl 26.014 +| epoch 8 step 91400 | 11110 batches | lr 0.000567 | ms/batch 428.66 | loss 3.32 | ppl 27.663 +| epoch 8 step 91600 | 11310 batches | lr 0.000566 | ms/batch 428.81 | loss 3.28 | ppl 26.603 +| epoch 9 step 91800 | 40 batches | lr 0.000564 | ms/batch 426.93 | loss 3.30 | ppl 26.989 +| epoch 9 step 92000 | 240 batches | lr 0.000563 | ms/batch 428.41 | loss 3.25 | ppl 25.705 +---------------------------------------------------------------------------------------------------- +| Eval 23 at step 92000 | time: 1721.26s | valid loss 3.30 | valid ppl 27.072 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 92200 | 440 batches | lr 0.000561 | ms/batch 483.07 | loss 3.29 | ppl 26.728 +| epoch 9 step 92400 | 640 batches | lr 0.00056 | ms/batch 428.39 | loss 3.25 | ppl 25.916 +| epoch 9 step 92600 | 840 batches | lr 0.000558 | ms/batch 428.56 | loss 3.30 | ppl 27.003 +| epoch 9 step 92800 | 1040 batches | lr 0.000557 | ms/batch 428.59 | loss 3.26 | ppl 26.037 +| epoch 9 step 93000 | 1240 batches | lr 0.000555 | ms/batch 427.68 | loss 3.27 | ppl 26.276 +| epoch 9 step 93200 | 1440 batches | lr 0.000553 | ms/batch 430.44 | loss 3.28 | ppl 26.496 +| epoch 9 step 93400 | 1640 batches | lr 0.000552 | ms/batch 429.16 | loss 3.25 | ppl 25.806 +| epoch 9 step 93600 | 1840 batches | lr 0.00055 | ms/batch 428.82 | loss 3.27 | ppl 26.350 +| epoch 9 step 93800 | 2040 batches | lr 0.000549 | ms/batch 430.56 | loss 3.31 | ppl 27.417 +| epoch 9 step 94000 | 2240 batches | lr 0.000547 | ms/batch 428.76 | loss 3.28 | ppl 26.510 +| epoch 9 step 94200 | 2440 batches | lr 0.000546 | ms/batch 428.37 | loss 3.28 | ppl 26.535 +| epoch 9 step 94400 | 2640 batches | lr 0.000544 | ms/batch 429.44 | loss 
3.27 | ppl 26.435 +| epoch 9 step 94600 | 2840 batches | lr 0.000542 | ms/batch 431.05 | loss 3.23 | ppl 25.312 +| epoch 9 step 94800 | 3040 batches | lr 0.000541 | ms/batch 431.02 | loss 3.28 | ppl 26.446 +| epoch 9 step 95000 | 3240 batches | lr 0.000539 | ms/batch 430.52 | loss 3.27 | ppl 26.223 +| epoch 9 step 95200 | 3440 batches | lr 0.000538 | ms/batch 431.61 | loss 3.25 | ppl 25.850 +| epoch 9 step 95400 | 3640 batches | lr 0.000536 | ms/batch 430.76 | loss 3.25 | ppl 25.776 +| epoch 9 step 95600 | 3840 batches | lr 0.000535 | ms/batch 431.52 | loss 3.27 | ppl 26.191 +| epoch 9 step 95800 | 4040 batches | lr 0.000533 | ms/batch 431.13 | loss 3.28 | ppl 26.543 +| epoch 9 step 96000 | 4240 batches | lr 0.000532 | ms/batch 430.68 | loss 3.26 | ppl 26.073 +---------------------------------------------------------------------------------------------------- +| Eval 24 at step 96000 | time: 1725.84s | valid loss 3.29 | valid ppl 26.753 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 96200 | 4440 batches | lr 0.00053 | ms/batch 485.06 | loss 3.26 | ppl 26.156 +| epoch 9 step 96400 | 4640 batches | lr 0.000528 | ms/batch 430.88 | loss 3.26 | ppl 26.108 +| epoch 9 step 96600 | 4840 batches | lr 0.000527 | ms/batch 431.97 | loss 3.25 | ppl 25.737 +| epoch 9 step 96800 | 5040 batches | lr 0.000525 | ms/batch 432.24 | loss 3.27 | ppl 26.276 +| epoch 9 step 97000 | 5240 batches | lr 0.000524 | ms/batch 431.45 | loss 3.26 | ppl 25.981 +| epoch 9 step 97200 | 5440 batches | lr 0.000522 | ms/batch 430.67 | loss 3.23 | ppl 25.161 +| epoch 9 step 97400 | 5640 batches | lr 0.000521 | ms/batch 432.60 | loss 3.27 | ppl 26.376 +| epoch 9 step 97600 | 5840 batches | lr 0.000519 | ms/batch 431.40 | loss 3.26 | ppl 26.045 +| epoch 9 step 97800 | 6040 batches | lr 0.000517 | ms/batch 432.17 | loss 3.24 | ppl 25.492 +| epoch 9 step 98000 | 6240 batches | lr 0.000516 | ms/batch 431.30 | loss 3.25 | ppl 25.846 +| epoch 9 step 98200 | 6440 batches | lr 0.000514 | ms/batch 432.92 | loss 3.26 | ppl 26.078 +| epoch 9 step 98400 | 6640 batches | lr 0.000513 | ms/batch 431.41 | loss 3.21 | ppl 24.699 +| epoch 9 step 98600 | 6840 batches | lr 0.000511 | ms/batch 431.49 | loss 3.24 | ppl 25.454 +| epoch 9 step 98800 | 7040 batches | lr 0.00051 | ms/batch 430.99 | loss 3.24 | ppl 25.585 +| epoch 9 step 99000 | 7240 batches | lr 0.000508 | ms/batch 430.86 | loss 3.21 | ppl 24.714 +| epoch 9 step 99200 | 7440 batches | lr 0.000506 | ms/batch 430.27 | loss 3.23 | ppl 25.190 +| epoch 9 step 99400 | 7640 batches | lr 0.000505 | ms/batch 432.07 | loss 3.21 | ppl 24.787 +| epoch 9 step 99600 | 7840 batches | lr 0.000503 | ms/batch 431.24 | loss 3.24 | ppl 25.439 +| epoch 9 step 99800 | 8040 batches | lr 0.000502 | ms/batch 430.41 | loss 3.24 | ppl 25.411 +| epoch 9 step 100000 | 8240 batches | lr 0.0005 | ms/batch 431.67 | loss 3.22 | ppl 25.115 +---------------------------------------------------------------------------------------------------- +| Eval 25 at step 100000 | time: 1732.27s | valid loss 3.28 | valid ppl 26.518 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 100200 | 8440 batches | lr 0.000499 | ms/batch 484.14 | loss 3.24 | ppl 25.577 +| epoch 9 step 100400 | 8640 batches | lr 0.000497 | ms/batch 431.81 | loss 3.23 | ppl 25.193 +| epoch 9 step 100600 | 8840 batches | lr 0.000495 | ms/batch 431.22 | loss 3.25 | ppl 25.863 +| epoch 9 step 100800 | 9040 batches | lr 0.000494 | 
ms/batch 431.17 | loss 3.24 | ppl 25.506 +| epoch 9 step 101000 | 9240 batches | lr 0.000492 | ms/batch 432.11 | loss 3.22 | ppl 25.014 +| epoch 9 step 101200 | 9440 batches | lr 0.000491 | ms/batch 430.57 | loss 3.24 | ppl 25.629 +| epoch 9 step 101400 | 9640 batches | lr 0.000489 | ms/batch 430.89 | loss 3.26 | ppl 26.022 +| epoch 9 step 101600 | 9840 batches | lr 0.000488 | ms/batch 431.35 | loss 3.21 | ppl 24.780 +| epoch 9 step 101800 | 10040 batches | lr 0.000486 | ms/batch 430.97 | loss 3.25 | ppl 25.722 +| epoch 9 step 102000 | 10240 batches | lr 0.000484 | ms/batch 432.01 | loss 3.22 | ppl 24.964 +| epoch 9 step 102200 | 10440 batches | lr 0.000483 | ms/batch 430.66 | loss 3.24 | ppl 25.515 +| epoch 9 step 102400 | 10640 batches | lr 0.000481 | ms/batch 431.30 | loss 3.26 | ppl 26.013 +| epoch 9 step 102600 | 10840 batches | lr 0.00048 | ms/batch 430.47 | loss 3.20 | ppl 24.498 +| epoch 9 step 102800 | 11040 batches | lr 0.000478 | ms/batch 430.42 | loss 3.26 | ppl 25.984 +| epoch 9 step 103000 | 11240 batches | lr 0.000477 | ms/batch 430.79 | loss 3.26 | ppl 26.065 +| epoch 9 step 103200 | 11440 batches | lr 0.000475 | ms/batch 431.88 | loss 3.23 | ppl 25.322 +| epoch 10 step 103400 | 170 batches | lr 0.000473 | ms/batch 429.77 | loss 3.22 | ppl 25.117 +| epoch 10 step 103600 | 370 batches | lr 0.000472 | ms/batch 431.10 | loss 3.21 | ppl 24.886 +| epoch 10 step 103800 | 570 batches | lr 0.00047 | ms/batch 430.70 | loss 3.23 | ppl 25.215 +| epoch 10 step 104000 | 770 batches | lr 0.000469 | ms/batch 430.67 | loss 3.23 | ppl 25.190 +---------------------------------------------------------------------------------------------------- +| Eval 26 at step 104000 | time: 1730.45s | valid loss 3.26 | valid ppl 26.179 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 104200 | 970 batches | lr 0.000467 | ms/batch 484.27 | loss 3.21 | ppl 24.692 +| epoch 10 step 104400 | 1170 batches | lr 0.000466 | ms/batch 432.12 | loss 3.24 | ppl 25.567 +| epoch 10 step 104600 | 1370 batches | lr 0.000464 | ms/batch 432.32 | loss 3.22 | ppl 24.984 +| epoch 10 step 104800 | 1570 batches | lr 0.000462 | ms/batch 430.59 | loss 3.21 | ppl 24.857 +| epoch 10 step 105000 | 1770 batches | lr 0.000461 | ms/batch 431.50 | loss 3.22 | ppl 24.967 +| epoch 10 step 105200 | 1970 batches | lr 0.000459 | ms/batch 432.34 | loss 3.25 | ppl 25.699 +| epoch 10 step 105400 | 2170 batches | lr 0.000458 | ms/batch 431.17 | loss 3.24 | ppl 25.529 +| epoch 10 step 105600 | 2370 batches | lr 0.000456 | ms/batch 430.79 | loss 3.23 | ppl 25.362 +| epoch 10 step 105800 | 2570 batches | lr 0.000455 | ms/batch 431.08 | loss 3.22 | ppl 25.140 +| epoch 10 step 106000 | 2770 batches | lr 0.000453 | ms/batch 432.28 | loss 3.20 | ppl 24.603 +| epoch 10 step 106200 | 2970 batches | lr 0.000451 | ms/batch 430.58 | loss 3.21 | ppl 24.817 +| epoch 10 step 106400 | 3170 batches | lr 0.00045 | ms/batch 431.15 | loss 3.23 | ppl 25.248 +| epoch 10 step 106600 | 3370 batches | lr 0.000448 | ms/batch 431.26 | loss 3.22 | ppl 25.082 +| epoch 10 step 106800 | 3570 batches | lr 0.000447 | ms/batch 431.44 | loss 3.20 | ppl 24.526 +| epoch 10 step 107000 | 3770 batches | lr 0.000445 | ms/batch 431.31 | loss 3.21 | ppl 24.815 +| epoch 10 step 107200 | 3970 batches | lr 0.000444 | ms/batch 430.57 | loss 3.22 | ppl 25.021 +| epoch 10 step 107400 | 4170 batches | lr 0.000442 | ms/batch 431.10 | loss 3.22 | ppl 24.926 +| epoch 10 step 107600 | 4370 batches | lr 0.000441 | ms/batch 431.03 | 
loss 3.22 | ppl 25.090 +| epoch 10 step 107800 | 4570 batches | lr 0.000439 | ms/batch 431.94 | loss 3.23 | ppl 25.375 +| epoch 10 step 108000 | 4770 batches | lr 0.000437 | ms/batch 431.69 | loss 3.19 | ppl 24.269 +---------------------------------------------------------------------------------------------------- +| Eval 27 at step 108000 | time: 1731.81s | valid loss 3.25 | valid ppl 25.797 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 108200 | 4970 batches | lr 0.000436 | ms/batch 485.38 | loss 3.23 | ppl 25.232 +| epoch 10 step 108400 | 5170 batches | lr 0.000434 | ms/batch 431.08 | loss 3.21 | ppl 24.658 +| epoch 10 step 108600 | 5370 batches | lr 0.000433 | ms/batch 431.32 | loss 3.18 | ppl 24.114 +| epoch 10 step 108800 | 5570 batches | lr 0.000431 | ms/batch 432.75 | loss 3.20 | ppl 24.577 +| epoch 10 step 109000 | 5770 batches | lr 0.00043 | ms/batch 430.87 | loss 3.22 | ppl 25.109 +| epoch 10 step 109200 | 5970 batches | lr 0.000428 | ms/batch 432.85 | loss 3.20 | ppl 24.520 +| epoch 10 step 109400 | 6170 batches | lr 0.000427 | ms/batch 431.12 | loss 3.20 | ppl 24.429 +| epoch 10 step 109600 | 6370 batches | lr 0.000425 | ms/batch 431.69 | loss 3.24 | ppl 25.443 +| epoch 10 step 109800 | 6570 batches | lr 0.000423 | ms/batch 431.06 | loss 3.15 | ppl 23.412 +| epoch 10 step 110000 | 6770 batches | lr 0.000422 | ms/batch 431.66 | loss 3.19 | ppl 24.228 +| epoch 10 step 110200 | 6970 batches | lr 0.00042 | ms/batch 432.02 | loss 3.20 | ppl 24.598 +| epoch 10 step 110400 | 7170 batches | lr 0.000419 | ms/batch 432.58 | loss 3.16 | ppl 23.460 +| epoch 10 step 110600 | 7370 batches | lr 0.000417 | ms/batch 431.44 | loss 3.18 | ppl 24.138 +| epoch 10 step 110800 | 7570 batches | lr 0.000416 | ms/batch 433.20 | loss 3.16 | ppl 23.507 +| epoch 10 step 111000 | 7770 batches | lr 0.000414 | ms/batch 430.91 | loss 3.19 | ppl 24.391 +| epoch 10 step 111200 | 7970 batches | lr 0.000413 | ms/batch 433.04 | loss 3.18 | ppl 24.116 +| epoch 10 step 111400 | 8170 batches | lr 0.000411 | ms/batch 431.97 | loss 3.17 | ppl 23.883 +| epoch 10 step 111600 | 8370 batches | lr 0.000409 | ms/batch 432.20 | loss 3.20 | ppl 24.590 +| epoch 10 step 111800 | 8570 batches | lr 0.000408 | ms/batch 432.86 | loss 3.18 | ppl 24.126 +| epoch 10 step 112000 | 8770 batches | lr 0.000406 | ms/batch 432.45 | loss 3.19 | ppl 24.310 +---------------------------------------------------------------------------------------------------- +| Eval 28 at step 112000 | time: 1734.16s | valid loss 3.24 | valid ppl 25.577 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 112200 | 8970 batches | lr 0.000405 | ms/batch 484.80 | loss 3.20 | ppl 24.473 +| epoch 10 step 112400 | 9170 batches | lr 0.000403 | ms/batch 432.34 | loss 3.18 | ppl 23.977 +| epoch 10 step 112600 | 9370 batches | lr 0.000402 | ms/batch 434.24 | loss 3.19 | ppl 24.270 +| epoch 10 step 112800 | 9570 batches | lr 0.0004 | ms/batch 430.73 | loss 3.21 | ppl 24.773 +| epoch 10 step 113000 | 9770 batches | lr 0.000399 | ms/batch 431.89 | loss 3.19 | ppl 24.185 +| epoch 10 step 113200 | 9970 batches | lr 0.000397 | ms/batch 432.06 | loss 3.19 | ppl 24.191 +| epoch 10 step 113400 | 10170 batches | lr 0.000396 | ms/batch 431.38 | loss 3.16 | ppl 23.627 +| epoch 10 step 113600 | 10370 batches | lr 0.000394 | ms/batch 430.96 | loss 3.19 | ppl 24.257 +| epoch 10 step 113800 | 10570 batches | lr 0.000393 | ms/batch 431.43 | loss 3.21 | 
ppl 24.877 +| epoch 10 step 114000 | 10770 batches | lr 0.000391 | ms/batch 432.73 | loss 3.17 | ppl 23.728 +| epoch 10 step 114200 | 10970 batches | lr 0.000389 | ms/batch 433.81 | loss 3.18 | ppl 24.106 +| epoch 10 step 114400 | 11170 batches | lr 0.000388 | ms/batch 431.64 | loss 3.22 | ppl 24.942 +| epoch 10 step 114600 | 11370 batches | lr 0.000386 | ms/batch 434.07 | loss 3.19 | ppl 24.404 +| epoch 11 step 114800 | 100 batches | lr 0.000385 | ms/batch 430.90 | loss 3.18 | ppl 24.123 +| epoch 11 step 115000 | 300 batches | lr 0.000383 | ms/batch 432.01 | loss 3.16 | ppl 23.679 +| epoch 11 step 115200 | 500 batches | lr 0.000382 | ms/batch 432.69 | loss 3.20 | ppl 24.598 +| epoch 11 step 115400 | 700 batches | lr 0.00038 | ms/batch 433.40 | loss 3.15 | ppl 23.424 +| epoch 11 step 115600 | 900 batches | lr 0.000379 | ms/batch 431.01 | loss 3.17 | ppl 23.860 +| epoch 11 step 115800 | 1100 batches | lr 0.000377 | ms/batch 431.82 | loss 3.19 | ppl 24.356 +| epoch 11 step 116000 | 1300 batches | lr 0.000376 | ms/batch 431.01 | loss 3.17 | ppl 23.859 +---------------------------------------------------------------------------------------------------- +| Eval 29 at step 116000 | time: 1734.75s | valid loss 3.24 | valid ppl 25.504 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 116200 | 1500 batches | lr 0.000374 | ms/batch 484.53 | loss 3.17 | ppl 23.735 +| epoch 11 step 116400 | 1700 batches | lr 0.000373 | ms/batch 431.49 | loss 3.16 | ppl 23.553 +| epoch 11 step 116600 | 1900 batches | lr 0.000371 | ms/batch 431.62 | loss 3.19 | ppl 24.285 +| epoch 11 step 116800 | 2100 batches | lr 0.00037 | ms/batch 431.29 | loss 3.21 | ppl 24.801 +| epoch 11 step 117000 | 2300 batches | lr 0.000368 | ms/batch 431.24 | loss 3.19 | ppl 24.343 +| epoch 11 step 117200 | 2500 batches | lr 0.000367 | ms/batch 431.80 | loss 3.17 | ppl 23.817 +| epoch 11 step 117400 | 2700 batches | lr 0.000365 | ms/batch 431.05 | loss 3.18 | ppl 23.943 +| epoch 11 step 117600 | 2900 batches | lr 0.000364 | ms/batch 431.78 | loss 3.14 | ppl 23.072 +| epoch 11 step 117800 | 3100 batches | lr 0.000362 | ms/batch 433.44 | loss 3.18 | ppl 23.941 +| epoch 11 step 118000 | 3300 batches | lr 0.000361 | ms/batch 431.83 | loss 3.19 | ppl 24.346 +| epoch 11 step 118200 | 3500 batches | lr 0.000359 | ms/batch 430.98 | loss 3.15 | ppl 23.383 +| epoch 11 step 118400 | 3700 batches | lr 0.000358 | ms/batch 431.54 | loss 3.17 | ppl 23.837 +| epoch 11 step 118600 | 3900 batches | lr 0.000356 | ms/batch 430.95 | loss 3.16 | ppl 23.611 +| epoch 11 step 118800 | 4100 batches | lr 0.000355 | ms/batch 432.44 | loss 3.18 | ppl 24.134 +| epoch 11 step 119000 | 4300 batches | lr 0.000353 | ms/batch 431.52 | loss 3.17 | ppl 23.747 +| epoch 11 step 119200 | 4500 batches | lr 0.000352 | ms/batch 432.70 | loss 3.19 | ppl 24.290 +| epoch 11 step 119400 | 4700 batches | lr 0.00035 | ms/batch 432.66 | loss 3.15 | ppl 23.296 +| epoch 11 step 119600 | 4900 batches | lr 0.000349 | ms/batch 432.65 | loss 3.16 | ppl 23.587 +| epoch 11 step 119800 | 5100 batches | lr 0.000347 | ms/batch 432.23 | loss 3.17 | ppl 23.761 +| epoch 11 step 120000 | 5300 batches | lr 0.000346 | ms/batch 432.28 | loss 3.15 | ppl 23.380 +---------------------------------------------------------------------------------------------------- +| Eval 30 at step 120000 | time: 1733.79s | valid loss 3.23 | valid ppl 25.207 +---------------------------------------------------------------------------------------------------- +| 
epoch 11 step 120200 | 5500 batches | lr 0.000344 | ms/batch 485.19 | loss 3.15 | ppl 23.385 +| epoch 11 step 120400 | 5700 batches | lr 0.000343 | ms/batch 431.60 | loss 3.16 | ppl 23.630 +| epoch 11 step 120600 | 5900 batches | lr 0.000341 | ms/batch 432.39 | loss 3.17 | ppl 23.706 +| epoch 11 step 120800 | 6100 batches | lr 0.00034 | ms/batch 431.23 | loss 3.16 | ppl 23.594 +| epoch 11 step 121000 | 6300 batches | lr 0.000338 | ms/batch 432.67 | loss 3.17 | ppl 23.740 +| epoch 11 step 121200 | 6500 batches | lr 0.000337 | ms/batch 431.72 | loss 3.13 | ppl 22.899 +| epoch 11 step 121400 | 6700 batches | lr 0.000335 | ms/batch 432.59 | loss 3.13 | ppl 22.826 +| epoch 11 step 121600 | 6900 batches | lr 0.000334 | ms/batch 431.15 | loss 3.15 | ppl 23.332 +| epoch 11 step 121800 | 7100 batches | lr 0.000332 | ms/batch 430.77 | loss 3.15 | ppl 23.221 +| epoch 11 step 122000 | 7300 batches | lr 0.000331 | ms/batch 429.79 | loss 3.10 | ppl 22.234 +| epoch 11 step 122200 | 7500 batches | lr 0.000329 | ms/batch 432.21 | loss 3.15 | ppl 23.235 +| epoch 11 step 122400 | 7700 batches | lr 0.000328 | ms/batch 432.24 | loss 3.13 | ppl 22.791 +| epoch 11 step 122600 | 7900 batches | lr 0.000326 | ms/batch 433.78 | loss 3.13 | ppl 22.859 +| epoch 11 step 122800 | 8100 batches | lr 0.000325 | ms/batch 433.88 | loss 3.15 | ppl 23.242 +| epoch 11 step 123000 | 8300 batches | lr 0.000323 | ms/batch 433.02 | loss 3.13 | ppl 22.926 +| epoch 11 step 123200 | 8500 batches | lr 0.000322 | ms/batch 431.07 | loss 3.13 | ppl 22.963 +| epoch 11 step 123400 | 8700 batches | lr 0.00032 | ms/batch 432.33 | loss 3.15 | ppl 23.392 +| epoch 11 step 123600 | 8900 batches | lr 0.000319 | ms/batch 429.32 | loss 3.15 | ppl 23.243 +| epoch 11 step 123800 | 9100 batches | lr 0.000317 | ms/batch 432.13 | loss 3.15 | ppl 23.279 +| epoch 11 step 124000 | 9300 batches | lr 0.000316 | ms/batch 431.79 | loss 3.13 | ppl 22.908 +---------------------------------------------------------------------------------------------------- +| Eval 31 at step 124000 | time: 1733.89s | valid loss 3.21 | valid ppl 24.812 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 124200 | 9500 batches | lr 0.000315 | ms/batch 485.31 | loss 3.15 | ppl 23.395 +| epoch 11 step 124400 | 9700 batches | lr 0.000313 | ms/batch 431.01 | loss 3.14 | ppl 23.217 +| epoch 11 step 124600 | 9900 batches | lr 0.000312 | ms/batch 430.95 | loss 3.13 | ppl 22.847 +| epoch 11 step 124800 | 10100 batches | lr 0.00031 | ms/batch 430.50 | loss 3.14 | ppl 23.214 +| epoch 11 step 125000 | 10300 batches | lr 0.000309 | ms/batch 431.25 | loss 3.13 | ppl 22.910 +| epoch 11 step 125200 | 10500 batches | lr 0.000307 | ms/batch 432.16 | loss 3.17 | ppl 23.719 +| epoch 11 step 125400 | 10700 batches | lr 0.000306 | ms/batch 430.75 | loss 3.13 | ppl 22.860 +| epoch 11 step 125600 | 10900 batches | lr 0.000304 | ms/batch 431.47 | loss 3.12 | ppl 22.570 +| epoch 11 step 125800 | 11100 batches | lr 0.000303 | ms/batch 430.65 | loss 3.17 | ppl 23.879 +| epoch 11 step 126000 | 11300 batches | lr 0.000301 | ms/batch 431.81 | loss 3.15 | ppl 23.372 +| epoch 12 step 126200 | 30 batches | lr 0.0003 | ms/batch 429.97 | loss 3.15 | ppl 23.380 +| epoch 12 step 126400 | 230 batches | lr 0.000299 | ms/batch 431.33 | loss 3.11 | ppl 22.355 +| epoch 12 step 126600 | 430 batches | lr 0.000297 | ms/batch 430.87 | loss 3.14 | ppl 23.169 +| epoch 12 step 126800 | 630 batches | lr 0.000296 | ms/batch 432.29 | loss 3.12 | ppl 22.578 +| epoch 12 
step 127000 | 830 batches | lr 0.000294 | ms/batch 432.44 | loss 3.15 | ppl 23.438 +| epoch 12 step 127200 | 1030 batches | lr 0.000293 | ms/batch 431.80 | loss 3.12 | ppl 22.547 +| epoch 12 step 127400 | 1230 batches | lr 0.000291 | ms/batch 431.91 | loss 3.13 | ppl 22.962 +| epoch 12 step 127600 | 1430 batches | lr 0.00029 | ms/batch 432.43 | loss 3.13 | ppl 22.857 +| epoch 12 step 127800 | 1630 batches | lr 0.000289 | ms/batch 431.24 | loss 3.11 | ppl 22.423 +| epoch 12 step 128000 | 1830 batches | lr 0.000287 | ms/batch 431.67 | loss 3.14 | ppl 23.045 +---------------------------------------------------------------------------------------------------- +| Eval 32 at step 128000 | time: 1731.99s | valid loss 3.21 | valid ppl 24.767 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 128200 | 2030 batches | lr 0.000286 | ms/batch 484.47 | loss 3.17 | ppl 23.741 +| epoch 12 step 128400 | 2230 batches | lr 0.000284 | ms/batch 431.11 | loss 3.14 | ppl 23.123 +| epoch 12 step 128600 | 2430 batches | lr 0.000283 | ms/batch 432.77 | loss 3.14 | ppl 23.177 +| epoch 12 step 128800 | 2630 batches | lr 0.000282 | ms/batch 432.06 | loss 3.13 | ppl 22.892 +| epoch 12 step 129000 | 2830 batches | lr 0.00028 | ms/batch 431.54 | loss 3.10 | ppl 22.155 +| epoch 12 step 129200 | 3030 batches | lr 0.000279 | ms/batch 432.06 | loss 3.13 | ppl 22.914 +| epoch 12 step 129400 | 3230 batches | lr 0.000277 | ms/batch 431.25 | loss 3.13 | ppl 22.780 +| epoch 12 step 129600 | 3430 batches | lr 0.000276 | ms/batch 430.82 | loss 3.12 | ppl 22.660 +| epoch 12 step 129800 | 3630 batches | lr 0.000274 | ms/batch 432.19 | loss 3.11 | ppl 22.377 +| epoch 12 step 130000 | 3830 batches | lr 0.000273 | ms/batch 431.91 | loss 3.12 | ppl 22.730 +| epoch 12 step 130200 | 4030 batches | lr 0.000272 | ms/batch 431.49 | loss 3.14 | ppl 23.125 +| epoch 12 step 130400 | 4230 batches | lr 0.00027 | ms/batch 432.13 | loss 3.12 | ppl 22.750 +| epoch 12 step 130600 | 4430 batches | lr 0.000269 | ms/batch 431.86 | loss 3.12 | ppl 22.713 +| epoch 12 step 130800 | 4630 batches | lr 0.000267 | ms/batch 431.34 | loss 3.12 | ppl 22.744 +| epoch 12 step 131000 | 4830 batches | lr 0.000266 | ms/batch 430.75 | loss 3.11 | ppl 22.398 +| epoch 12 step 131200 | 5030 batches | lr 0.000265 | ms/batch 431.12 | loss 3.13 | ppl 22.885 +| epoch 12 step 131400 | 5230 batches | lr 0.000263 | ms/batch 430.46 | loss 3.12 | ppl 22.669 +| epoch 12 step 131600 | 5430 batches | lr 0.000262 | ms/batch 431.34 | loss 3.09 | ppl 21.950 +| epoch 12 step 131800 | 5630 batches | lr 0.000261 | ms/batch 431.72 | loss 3.13 | ppl 22.806 +| epoch 12 step 132000 | 5830 batches | lr 0.000259 | ms/batch 430.10 | loss 3.12 | ppl 22.723 +---------------------------------------------------------------------------------------------------- +| Eval 33 at step 132000 | time: 1732.22s | valid loss 3.20 | valid ppl 24.478 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 132200 | 6030 batches | lr 0.000258 | ms/batch 483.85 | loss 3.10 | ppl 22.208 +| epoch 12 step 132400 | 6230 batches | lr 0.000256 | ms/batch 431.01 | loss 3.11 | ppl 22.454 +| epoch 12 step 132600 | 6430 batches | lr 0.000255 | ms/batch 431.62 | loss 3.13 | ppl 22.788 +| epoch 12 step 132800 | 6630 batches | lr 0.000254 | ms/batch 430.91 | loss 3.07 | ppl 21.552 +| epoch 12 step 133000 | 6830 batches | lr 0.000252 | ms/batch 431.29 | loss 3.10 | ppl 22.161 +| epoch 12 step 133200 | 
7030 batches | lr 0.000251 | ms/batch 432.30 | loss 3.11 | ppl 22.333 +| epoch 12 step 133400 | 7230 batches | lr 0.00025 | ms/batch 430.20 | loss 3.07 | ppl 21.561 +| epoch 12 step 133600 | 7430 batches | lr 0.000248 | ms/batch 430.76 | loss 3.08 | ppl 21.775 +| epoch 12 step 133800 | 7630 batches | lr 0.000247 | ms/batch 431.00 | loss 3.08 | ppl 21.656 +| epoch 12 step 134000 | 7830 batches | lr 0.000246 | ms/batch 431.51 | loss 3.10 | ppl 22.131 +| epoch 12 step 134200 | 8030 batches | lr 0.000244 | ms/batch 430.65 | loss 3.10 | ppl 22.148 +| epoch 12 step 134400 | 8230 batches | lr 0.000243 | ms/batch 431.44 | loss 3.09 | ppl 21.895 +| epoch 12 step 134600 | 8430 batches | lr 0.000241 | ms/batch 431.15 | loss 3.10 | ppl 22.214 +| epoch 12 step 134800 | 8630 batches | lr 0.00024 | ms/batch 431.28 | loss 3.09 | ppl 21.994 +| epoch 12 step 135000 | 8830 batches | lr 0.000239 | ms/batch 430.56 | loss 3.11 | ppl 22.496 +| epoch 12 step 135200 | 9030 batches | lr 0.000237 | ms/batch 431.01 | loss 3.11 | ppl 22.324 +| epoch 12 step 135400 | 9230 batches | lr 0.000236 | ms/batch 430.67 | loss 3.07 | ppl 21.638 +| epoch 12 step 135600 | 9430 batches | lr 0.000235 | ms/batch 431.20 | loss 3.10 | ppl 22.290 +| epoch 12 step 135800 | 9630 batches | lr 0.000233 | ms/batch 431.59 | loss 3.12 | ppl 22.606 +| epoch 12 step 136000 | 9830 batches | lr 0.000232 | ms/batch 431.20 | loss 3.08 | ppl 21.688 +---------------------------------------------------------------------------------------------------- +| Eval 34 at step 136000 | time: 1730.84s | valid loss 3.19 | valid ppl 24.239 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 136200 | 10030 batches | lr 0.000231 | ms/batch 483.47 | loss 3.10 | ppl 22.265 +| epoch 12 step 136400 | 10230 batches | lr 0.000229 | ms/batch 431.69 | loss 3.09 | ppl 21.896 +| epoch 12 step 136600 | 10430 batches | lr 0.000228 | ms/batch 431.61 | loss 3.09 | ppl 22.074 +| epoch 12 step 136800 | 10630 batches | lr 0.000227 | ms/batch 431.64 | loss 3.12 | ppl 22.752 +| epoch 12 step 137000 | 10830 batches | lr 0.000226 | ms/batch 431.16 | loss 3.06 | ppl 21.360 +| epoch 12 step 137200 | 11030 batches | lr 0.000224 | ms/batch 430.85 | loss 3.12 | ppl 22.677 +| epoch 12 step 137400 | 11230 batches | lr 0.000223 | ms/batch 431.55 | loss 3.12 | ppl 22.545 +| epoch 12 step 137600 | 11430 batches | lr 0.000222 | ms/batch 430.96 | loss 3.10 | ppl 22.250 +| epoch 13 step 137800 | 160 batches | lr 0.00022 | ms/batch 430.15 | loss 3.09 | ppl 21.936 +| epoch 13 step 138000 | 360 batches | lr 0.000219 | ms/batch 431.25 | loss 3.08 | ppl 21.697 +| epoch 13 step 138200 | 560 batches | lr 0.000218 | ms/batch 430.49 | loss 3.09 | ppl 22.047 +| epoch 13 step 138400 | 760 batches | lr 0.000216 | ms/batch 431.16 | loss 3.09 | ppl 21.894 +| epoch 13 step 138600 | 960 batches | lr 0.000215 | ms/batch 430.96 | loss 3.07 | ppl 21.542 +| epoch 13 step 138800 | 1160 batches | lr 0.000214 | ms/batch 430.70 | loss 3.10 | ppl 22.305 +| epoch 13 step 139000 | 1360 batches | lr 0.000213 | ms/batch 432.79 | loss 3.08 | ppl 21.774 +| epoch 13 step 139200 | 1560 batches | lr 0.000211 | ms/batch 431.02 | loss 3.08 | ppl 21.693 +| epoch 13 step 139400 | 1760 batches | lr 0.00021 | ms/batch 433.07 | loss 3.08 | ppl 21.695 +| epoch 13 step 139600 | 1960 batches | lr 0.000209 | ms/batch 431.58 | loss 3.11 | ppl 22.326 +| epoch 13 step 139800 | 2160 batches | lr 0.000207 | ms/batch 430.88 | loss 3.11 | ppl 22.432 +| epoch 13 step 140000 | 2360 
batches | lr 0.000206 | ms/batch 430.34 | loss 3.09 | ppl 21.997 +---------------------------------------------------------------------------------------------------- +| Eval 35 at step 140000 | time: 1731.19s | valid loss 3.18 | valid ppl 23.962 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 140200 | 2560 batches | lr 0.000205 | ms/batch 484.26 | loss 3.09 | ppl 22.042 +| epoch 13 step 140400 | 2760 batches | lr 0.000204 | ms/batch 430.93 | loss 3.07 | ppl 21.495 +| epoch 13 step 140600 | 2960 batches | lr 0.000202 | ms/batch 431.04 | loss 3.07 | ppl 21.645 +| epoch 13 step 140800 | 3160 batches | lr 0.000201 | ms/batch 430.73 | loss 3.09 | ppl 21.999 +| epoch 13 step 141000 | 3360 batches | lr 0.0002 | ms/batch 431.31 | loss 3.09 | ppl 21.953 +| epoch 13 step 141200 | 3560 batches | lr 0.000199 | ms/batch 431.24 | loss 3.07 | ppl 21.515 +| epoch 13 step 141400 | 3760 batches | lr 0.000197 | ms/batch 431.92 | loss 3.08 | ppl 21.696 +| epoch 13 step 141600 | 3960 batches | lr 0.000196 | ms/batch 430.43 | loss 3.08 | ppl 21.807 +| epoch 13 step 141800 | 4160 batches | lr 0.000195 | ms/batch 431.24 | loss 3.08 | ppl 21.863 +| epoch 13 step 142000 | 4360 batches | lr 0.000194 | ms/batch 432.55 | loss 3.08 | ppl 21.818 +| epoch 13 step 142200 | 4560 batches | lr 0.000192 | ms/batch 431.39 | loss 3.10 | ppl 22.231 +| epoch 13 step 142400 | 4760 batches | lr 0.000191 | ms/batch 430.91 | loss 3.05 | ppl 21.181 +| epoch 13 step 142600 | 4960 batches | lr 0.00019 | ms/batch 430.37 | loss 3.09 | ppl 21.940 +| epoch 13 step 142800 | 5160 batches | lr 0.000189 | ms/batch 431.21 | loss 3.07 | ppl 21.603 +| epoch 13 step 143000 | 5360 batches | lr 0.000187 | ms/batch 430.65 | loss 3.06 | ppl 21.268 +| epoch 13 step 143200 | 5560 batches | lr 0.000186 | ms/batch 430.50 | loss 3.06 | ppl 21.369 +| epoch 13 step 143400 | 5760 batches | lr 0.000185 | ms/batch 430.32 | loss 3.08 | ppl 21.808 +| epoch 13 step 143600 | 5960 batches | lr 0.000184 | ms/batch 430.46 | loss 3.07 | ppl 21.536 +| epoch 13 step 143800 | 6160 batches | lr 0.000183 | ms/batch 431.46 | loss 3.06 | ppl 21.313 +| epoch 13 step 144000 | 6360 batches | lr 0.000181 | ms/batch 431.41 | loss 3.11 | ppl 22.363 +---------------------------------------------------------------------------------------------------- +| Eval 36 at step 144000 | time: 1730.58s | valid loss 3.18 | valid ppl 24.033 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 144200 | 6560 batches | lr 0.00018 | ms/batch 463.01 | loss 3.02 | ppl 20.408 +| epoch 13 step 144400 | 6760 batches | lr 0.000179 | ms/batch 430.89 | loss 3.05 | ppl 21.202 +| epoch 13 step 144600 | 6960 batches | lr 0.000178 | ms/batch 431.83 | loss 3.07 | ppl 21.498 +| epoch 13 step 144800 | 7160 batches | lr 0.000177 | ms/batch 431.57 | loss 3.02 | ppl 20.567 +| epoch 13 step 145000 | 7360 batches | lr 0.000175 | ms/batch 431.30 | loss 3.05 | ppl 21.061 +| epoch 13 step 145200 | 7560 batches | lr 0.000174 | ms/batch 431.94 | loss 3.03 | ppl 20.732 +| epoch 13 step 145400 | 7760 batches | lr 0.000173 | ms/batch 430.52 | loss 3.06 | ppl 21.330 +| epoch 13 step 145600 | 7960 batches | lr 0.000172 | ms/batch 432.25 | loss 3.04 | ppl 20.941 +| epoch 13 step 145800 | 8160 batches | lr 0.000171 | ms/batch 428.44 | loss 3.04 | ppl 20.953 +| epoch 13 step 146000 | 8360 batches | lr 0.000169 | ms/batch 428.75 | loss 3.07 | ppl 21.486 +| epoch 13 step 146200 | 8560 batches | lr 
0.000168 | ms/batch 428.29 | loss 3.05 | ppl 21.119 +| epoch 13 step 146400 | 8760 batches | lr 0.000167 | ms/batch 429.25 | loss 3.06 | ppl 21.234 +| epoch 13 step 146600 | 8960 batches | lr 0.000166 | ms/batch 428.49 | loss 3.07 | ppl 21.543 +| epoch 13 step 146800 | 9160 batches | lr 0.000165 | ms/batch 431.81 | loss 3.04 | ppl 20.923 +| epoch 13 step 147000 | 9360 batches | lr 0.000164 | ms/batch 428.07 | loss 3.05 | ppl 21.187 +| epoch 13 step 147200 | 9560 batches | lr 0.000162 | ms/batch 428.50 | loss 3.08 | ppl 21.742 +| epoch 13 step 147400 | 9760 batches | lr 0.000161 | ms/batch 428.93 | loss 3.05 | ppl 21.118 +| epoch 13 step 147600 | 9960 batches | lr 0.00016 | ms/batch 429.07 | loss 3.05 | ppl 21.214 +| epoch 13 step 147800 | 10160 batches | lr 0.000159 | ms/batch 428.38 | loss 3.03 | ppl 20.674 +| epoch 13 step 148000 | 10360 batches | lr 0.000158 | ms/batch 429.30 | loss 3.06 | ppl 21.383 +---------------------------------------------------------------------------------------------------- +| Eval 37 at step 148000 | time: 1726.13s | valid loss 3.17 | valid ppl 23.691 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 148200 | 10560 batches | lr 0.000157 | ms/batch 481.88 | loss 3.08 | ppl 21.750 +| epoch 13 step 148400 | 10760 batches | lr 0.000155 | ms/batch 429.14 | loss 3.04 | ppl 20.808 +| epoch 13 step 148600 | 10960 batches | lr 0.000154 | ms/batch 428.38 | loss 3.04 | ppl 20.987 +| epoch 13 step 148800 | 11160 batches | lr 0.000153 | ms/batch 428.50 | loss 3.09 | ppl 22.015 +| epoch 13 step 149000 | 11360 batches | lr 0.000152 | ms/batch 429.49 | loss 3.06 | ppl 21.327 +| epoch 14 step 149200 | 90 batches | lr 0.000151 | ms/batch 428.11 | loss 3.06 | ppl 21.261 +| epoch 14 step 149400 | 290 batches | lr 0.00015 | ms/batch 429.16 | loss 3.03 | ppl 20.713 +| epoch 14 step 149600 | 490 batches | lr 0.000149 | ms/batch 428.77 | loss 3.07 | ppl 21.532 +| epoch 14 step 149800 | 690 batches | lr 0.000148 | ms/batch 429.07 | loss 3.02 | ppl 20.589 +| epoch 14 step 150000 | 890 batches | lr 0.000146 | ms/batch 428.29 | loss 3.05 | ppl 21.031 +| epoch 14 step 150200 | 1090 batches | lr 0.000145 | ms/batch 428.38 | loss 3.06 | ppl 21.266 +| epoch 14 step 150400 | 1290 batches | lr 0.000144 | ms/batch 429.10 | loss 3.04 | ppl 20.860 +| epoch 14 step 150600 | 1490 batches | lr 0.000143 | ms/batch 428.88 | loss 3.04 | ppl 20.851 +| epoch 14 step 150800 | 1690 batches | lr 0.000142 | ms/batch 428.45 | loss 3.04 | ppl 20.828 +| epoch 14 step 151000 | 1890 batches | lr 0.000141 | ms/batch 428.61 | loss 3.05 | ppl 21.108 +| epoch 14 step 151200 | 2090 batches | lr 0.00014 | ms/batch 429.88 | loss 3.09 | ppl 21.960 +| epoch 14 step 151400 | 2290 batches | lr 0.000139 | ms/batch 428.60 | loss 3.06 | ppl 21.348 +| epoch 14 step 151600 | 2490 batches | lr 0.000138 | ms/batch 427.77 | loss 3.04 | ppl 20.892 +| epoch 14 step 151800 | 2690 batches | lr 0.000137 | ms/batch 429.55 | loss 3.05 | ppl 21.183 +| epoch 14 step 152000 | 2890 batches | lr 0.000136 | ms/batch 428.22 | loss 3.00 | ppl 20.146 +---------------------------------------------------------------------------------------------------- +| Eval 38 at step 152000 | time: 1721.33s | valid loss 3.16 | valid ppl 23.586 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 152200 | 3090 batches | lr 0.000134 | ms/batch 483.70 | loss 3.05 | ppl 21.117 +| epoch 14 step 152400 | 3290 batches | lr 0.000133 | 
ms/batch 428.34 | loss 3.06 | ppl 21.403 +| epoch 14 step 152600 | 3490 batches | lr 0.000132 | ms/batch 429.22 | loss 3.03 | ppl 20.632 +| epoch 14 step 152800 | 3690 batches | lr 0.000131 | ms/batch 428.12 | loss 3.04 | ppl 20.924 +| epoch 14 step 153000 | 3890 batches | lr 0.00013 | ms/batch 432.35 | loss 3.03 | ppl 20.735 +| epoch 14 step 153200 | 4090 batches | lr 0.000129 | ms/batch 428.36 | loss 3.06 | ppl 21.290 +| epoch 14 step 153400 | 4290 batches | lr 0.000128 | ms/batch 435.89 | loss 3.04 | ppl 20.850 +| epoch 14 step 153600 | 4490 batches | lr 0.000127 | ms/batch 434.49 | loss 3.06 | ppl 21.298 +| epoch 14 step 153800 | 4690 batches | lr 0.000126 | ms/batch 428.56 | loss 3.02 | ppl 20.588 +| epoch 14 step 154000 | 4890 batches | lr 0.000125 | ms/batch 428.64 | loss 3.03 | ppl 20.689 +| epoch 14 step 154200 | 5090 batches | lr 0.000124 | ms/batch 428.26 | loss 3.04 | ppl 20.997 +| epoch 14 step 154400 | 5290 batches | lr 0.000123 | ms/batch 428.63 | loss 3.03 | ppl 20.656 +| epoch 14 step 154600 | 5490 batches | lr 0.000122 | ms/batch 430.44 | loss 3.02 | ppl 20.492 +| epoch 14 step 154800 | 5690 batches | lr 0.000121 | ms/batch 429.37 | loss 3.04 | ppl 20.889 +| epoch 14 step 155000 | 5890 batches | lr 0.00012 | ms/batch 428.16 | loss 3.04 | ppl 20.854 +| epoch 14 step 155200 | 6090 batches | lr 0.000119 | ms/batch 428.56 | loss 3.04 | ppl 20.856 +| epoch 14 step 155400 | 6290 batches | lr 0.000118 | ms/batch 428.39 | loss 3.04 | ppl 20.911 +| epoch 14 step 155600 | 6490 batches | lr 0.000117 | ms/batch 428.91 | loss 3.01 | ppl 20.322 +| epoch 14 step 155800 | 6690 batches | lr 0.000116 | ms/batch 427.78 | loss 3.00 | ppl 20.057 +| epoch 14 step 156000 | 6890 batches | lr 0.000115 | ms/batch 428.59 | loss 3.03 | ppl 20.600 +---------------------------------------------------------------------------------------------------- +| Eval 39 at step 156000 | time: 1724.70s | valid loss 3.15 | valid ppl 23.443 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 156200 | 7090 batches | lr 0.000114 | ms/batch 483.92 | loss 3.02 | ppl 20.526 +| epoch 14 step 156400 | 7290 batches | lr 0.000113 | ms/batch 428.29 | loss 2.97 | ppl 19.558 +| epoch 14 step 156600 | 7490 batches | lr 0.000112 | ms/batch 428.20 | loss 3.02 | ppl 20.494 +| epoch 14 step 156800 | 7690 batches | lr 0.000111 | ms/batch 428.23 | loss 3.00 | ppl 20.151 +| epoch 14 step 157000 | 7890 batches | lr 0.00011 | ms/batch 431.45 | loss 3.00 | ppl 20.111 +| epoch 14 step 157200 | 8090 batches | lr 0.000109 | ms/batch 431.07 | loss 3.02 | ppl 20.545 +| epoch 14 step 157400 | 8290 batches | lr 0.000108 | ms/batch 429.87 | loss 3.01 | ppl 20.280 +| epoch 14 step 157600 | 8490 batches | lr 0.000107 | ms/batch 429.34 | loss 3.01 | ppl 20.317 +| epoch 14 step 157800 | 8690 batches | lr 0.000106 | ms/batch 429.35 | loss 3.03 | ppl 20.696 +| epoch 14 step 158000 | 8890 batches | lr 0.000105 | ms/batch 430.34 | loss 3.02 | ppl 20.527 +| epoch 14 step 158200 | 9090 batches | lr 0.000104 | ms/batch 429.23 | loss 3.02 | ppl 20.538 +| epoch 14 step 158400 | 9290 batches | lr 0.000103 | ms/batch 429.86 | loss 3.01 | ppl 20.345 +| epoch 14 step 158600 | 9490 batches | lr 0.000102 | ms/batch 430.44 | loss 3.02 | ppl 20.569 +| epoch 14 step 158800 | 9690 batches | lr 0.000101 | ms/batch 429.23 | loss 3.02 | ppl 20.562 +| epoch 14 step 159000 | 9890 batches | lr 0.0001 | ms/batch 429.96 | loss 3.00 | ppl 20.119 +| epoch 14 step 159200 | 10090 batches | lr 9.92e-05 | ms/batch 
431.43 | loss 3.03 | ppl 20.658 +| epoch 14 step 159400 | 10290 batches | lr 9.83e-05 | ms/batch 431.56 | loss 3.00 | ppl 20.177 +| epoch 14 step 159600 | 10490 batches | lr 9.74e-05 | ms/batch 429.18 | loss 3.04 | ppl 21.009 +| epoch 14 step 159800 | 10690 batches | lr 9.64e-05 | ms/batch 429.35 | loss 3.01 | ppl 20.323 +| epoch 14 step 160000 | 10890 batches | lr 9.55e-05 | ms/batch 429.02 | loss 3.00 | ppl 19.986 +---------------------------------------------------------------------------------------------------- +| Eval 40 at step 160000 | time: 1725.57s | valid loss 3.15 | valid ppl 23.322 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 160200 | 11090 batches | lr 9.46e-05 | ms/batch 481.68 | loss 3.04 | ppl 21.005 +| epoch 14 step 160400 | 11290 batches | lr 9.37e-05 | ms/batch 428.54 | loss 3.04 | ppl 20.853 +| epoch 15 step 160600 | 20 batches | lr 9.28e-05 | ms/batch 429.04 | loss 3.03 | ppl 20.670 +| epoch 15 step 160800 | 220 batches | lr 9.19e-05 | ms/batch 428.96 | loss 2.99 | ppl 19.888 +| epoch 15 step 161000 | 420 batches | lr 9.09e-05 | ms/batch 428.59 | loss 3.02 | ppl 20.582 +| epoch 15 step 161200 | 620 batches | lr 9e-05 | ms/batch 429.51 | loss 2.99 | ppl 19.964 +| epoch 15 step 161400 | 820 batches | lr 8.91e-05 | ms/batch 429.16 | loss 3.03 | ppl 20.734 +| epoch 15 step 161600 | 1020 batches | lr 8.83e-05 | ms/batch 428.53 | loss 2.99 | ppl 19.982 +| epoch 15 step 161800 | 1220 batches | lr 8.74e-05 | ms/batch 428.46 | loss 3.02 | ppl 20.448 +| epoch 15 step 162000 | 1420 batches | lr 8.65e-05 | ms/batch 428.75 | loss 3.01 | ppl 20.289 +| epoch 15 step 162200 | 1620 batches | lr 8.56e-05 | ms/batch 428.80 | loss 2.99 | ppl 19.828 +| epoch 15 step 162400 | 1820 batches | lr 8.47e-05 | ms/batch 430.89 | loss 3.02 | ppl 20.551 +| epoch 15 step 162600 | 2020 batches | lr 8.38e-05 | ms/batch 431.71 | loss 3.05 | ppl 21.076 +| epoch 15 step 162800 | 2220 batches | lr 8.3e-05 | ms/batch 429.82 | loss 3.02 | ppl 20.554 +| epoch 15 step 163000 | 2420 batches | lr 8.21e-05 | ms/batch 428.24 | loss 3.02 | ppl 20.554 +| epoch 15 step 163200 | 2620 batches | lr 8.13e-05 | ms/batch 428.88 | loss 3.01 | ppl 20.309 +| epoch 15 step 163400 | 2820 batches | lr 8.04e-05 | ms/batch 429.25 | loss 2.99 | ppl 19.802 +| epoch 15 step 163600 | 3020 batches | lr 7.95e-05 | ms/batch 430.14 | loss 3.01 | ppl 20.356 +| epoch 15 step 163800 | 3220 batches | lr 7.87e-05 | ms/batch 428.14 | loss 3.01 | ppl 20.250 +| epoch 15 step 164000 | 3420 batches | lr 7.79e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.314 +---------------------------------------------------------------------------------------------------- +| Eval 41 at step 164000 | time: 1722.82s | valid loss 3.15 | valid ppl 23.228 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 164200 | 3620 batches | lr 7.7e-05 | ms/batch 481.45 | loss 2.99 | ppl 19.844 +| epoch 15 step 164400 | 3820 batches | lr 7.62e-05 | ms/batch 429.58 | loss 3.01 | ppl 20.294 +| epoch 15 step 164600 | 4020 batches | lr 7.53e-05 | ms/batch 428.34 | loss 3.03 | ppl 20.605 +| epoch 15 step 164800 | 4220 batches | lr 7.45e-05 | ms/batch 432.92 | loss 3.01 | ppl 20.216 +| epoch 15 step 165000 | 4420 batches | lr 7.37e-05 | ms/batch 429.87 | loss 3.01 | ppl 20.269 +| epoch 15 step 165200 | 4620 batches | lr 7.29e-05 | ms/batch 429.01 | loss 3.01 | ppl 20.313 +| epoch 15 step 165400 | 4820 batches | lr 7.21e-05 | ms/batch 428.76 | loss 
3.00 | ppl 19.990 +| epoch 15 step 165600 | 5020 batches | lr 7.13e-05 | ms/batch 428.79 | loss 3.02 | ppl 20.541 +| epoch 15 step 165800 | 5220 batches | lr 7.04e-05 | ms/batch 428.63 | loss 3.00 | ppl 20.101 +| epoch 15 step 166000 | 5420 batches | lr 6.96e-05 | ms/batch 428.36 | loss 2.98 | ppl 19.608 +| epoch 15 step 166200 | 5620 batches | lr 6.88e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.309 +| epoch 15 step 166400 | 5820 batches | lr 6.81e-05 | ms/batch 431.45 | loss 3.01 | ppl 20.265 +| epoch 15 step 166600 | 6020 batches | lr 6.73e-05 | ms/batch 428.47 | loss 2.99 | ppl 19.874 +| epoch 15 step 166800 | 6220 batches | lr 6.65e-05 | ms/batch 428.45 | loss 3.00 | ppl 20.062 +| epoch 15 step 167000 | 6420 batches | lr 6.57e-05 | ms/batch 428.92 | loss 3.01 | ppl 20.380 +| epoch 15 step 167200 | 6620 batches | lr 6.49e-05 | ms/batch 428.16 | loss 2.96 | ppl 19.293 +| epoch 15 step 167400 | 6820 batches | lr 6.42e-05 | ms/batch 430.00 | loss 2.99 | ppl 19.858 +| epoch 15 step 167600 | 7020 batches | lr 6.34e-05 | ms/batch 431.79 | loss 3.00 | ppl 20.049 +| epoch 15 step 167800 | 7220 batches | lr 6.26e-05 | ms/batch 428.44 | loss 2.96 | ppl 19.284 +| epoch 15 step 168000 | 7420 batches | lr 6.19e-05 | ms/batch 431.93 | loss 2.97 | ppl 19.458 +---------------------------------------------------------------------------------------------------- +| Eval 42 at step 168000 | time: 1724.13s | valid loss 3.14 | valid ppl 23.110 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 168200 | 7620 batches | lr 6.11e-05 | ms/batch 481.67 | loss 2.96 | ppl 19.254 +| epoch 15 step 168400 | 7820 batches | lr 6.04e-05 | ms/batch 428.92 | loss 2.99 | ppl 19.864 +| epoch 15 step 168600 | 8020 batches | lr 5.96e-05 | ms/batch 428.32 | loss 2.99 | ppl 19.852 +| epoch 15 step 168800 | 8220 batches | lr 5.89e-05 | ms/batch 428.77 | loss 2.98 | ppl 19.604 +| epoch 15 step 169000 | 8420 batches | lr 5.81e-05 | ms/batch 431.33 | loss 2.99 | ppl 19.895 +| epoch 15 step 169200 | 8620 batches | lr 5.74e-05 | ms/batch 428.35 | loss 2.98 | ppl 19.771 +| epoch 15 step 169400 | 8820 batches | lr 5.67e-05 | ms/batch 429.98 | loss 3.00 | ppl 20.183 +| epoch 15 step 169600 | 9020 batches | lr 5.59e-05 | ms/batch 428.27 | loss 3.00 | ppl 20.035 +| epoch 15 step 169800 | 9220 batches | lr 5.52e-05 | ms/batch 428.16 | loss 2.97 | ppl 19.416 +| epoch 15 step 170000 | 9420 batches | lr 5.45e-05 | ms/batch 428.17 | loss 2.99 | ppl 19.919 +| epoch 15 step 170200 | 9620 batches | lr 5.38e-05 | ms/batch 429.42 | loss 3.01 | ppl 20.260 +| epoch 15 step 170400 | 9820 batches | lr 5.31e-05 | ms/batch 428.41 | loss 2.97 | ppl 19.573 +| epoch 15 step 170600 | 10020 batches | lr 5.24e-05 | ms/batch 428.58 | loss 2.99 | ppl 19.872 +| epoch 15 step 170800 | 10220 batches | lr 5.17e-05 | ms/batch 428.30 | loss 2.98 | ppl 19.782 +| epoch 15 step 171000 | 10420 batches | lr 5.1e-05 | ms/batch 428.42 | loss 2.98 | ppl 19.778 +| epoch 15 step 171200 | 10620 batches | lr 5.03e-05 | ms/batch 428.34 | loss 3.02 | ppl 20.469 +| epoch 15 step 171400 | 10820 batches | lr 4.96e-05 | ms/batch 428.37 | loss 2.96 | ppl 19.309 +| epoch 15 step 171600 | 11020 batches | lr 4.89e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.275 +| epoch 15 step 171800 | 11220 batches | lr 4.83e-05 | ms/batch 430.51 | loss 3.01 | ppl 20.222 +| epoch 15 step 172000 | 11420 batches | lr 4.76e-05 | ms/batch 429.74 | loss 3.01 | ppl 20.201 
+---------------------------------------------------------------------------------------------------- +| Eval 43 at step 172000 | time: 1721.76s | valid loss 3.14 | valid ppl 23.035 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 172200 | 150 batches | lr 4.69e-05 | ms/batch 480.04 | loss 2.99 | ppl 19.801 +| epoch 16 step 172400 | 350 batches | lr 4.63e-05 | ms/batch 428.93 | loss 2.97 | ppl 19.473 +| epoch 16 step 172600 | 550 batches | lr 4.56e-05 | ms/batch 428.42 | loss 2.99 | ppl 19.978 +| epoch 16 step 172800 | 750 batches | lr 4.5e-05 | ms/batch 428.37 | loss 2.98 | ppl 19.650 +| epoch 16 step 173000 | 950 batches | lr 4.43e-05 | ms/batch 428.78 | loss 2.97 | ppl 19.486 +| epoch 16 step 173200 | 1150 batches | lr 4.37e-05 | ms/batch 428.45 | loss 3.00 | ppl 20.096 +| epoch 16 step 173400 | 1350 batches | lr 4.3e-05 | ms/batch 428.00 | loss 2.98 | ppl 19.677 +| epoch 16 step 173600 | 1550 batches | lr 4.24e-05 | ms/batch 428.26 | loss 2.98 | ppl 19.595 +| epoch 16 step 173800 | 1750 batches | lr 4.18e-05 | ms/batch 428.85 | loss 2.97 | ppl 19.502 +| epoch 16 step 174000 | 1950 batches | lr 4.11e-05 | ms/batch 429.02 | loss 3.00 | ppl 20.143 +| epoch 16 step 174200 | 2150 batches | lr 4.05e-05 | ms/batch 428.57 | loss 3.01 | ppl 20.385 +| epoch 16 step 174400 | 2350 batches | lr 3.99e-05 | ms/batch 428.93 | loss 2.99 | ppl 19.878 +| epoch 16 step 174600 | 2550 batches | lr 3.93e-05 | ms/batch 428.57 | loss 2.99 | ppl 19.965 +| epoch 16 step 174800 | 2750 batches | lr 3.87e-05 | ms/batch 428.31 | loss 2.97 | ppl 19.491 +| epoch 16 step 175000 | 2950 batches | lr 3.81e-05 | ms/batch 428.82 | loss 2.97 | ppl 19.544 +| epoch 16 step 175200 | 3150 batches | lr 3.75e-05 | ms/batch 428.52 | loss 2.99 | ppl 19.909 +| epoch 16 step 175400 | 3350 batches | lr 3.69e-05 | ms/batch 431.04 | loss 2.99 | ppl 19.941 +| epoch 16 step 175600 | 3550 batches | lr 3.63e-05 | ms/batch 428.37 | loss 2.97 | ppl 19.533 +| epoch 16 step 175800 | 3750 batches | lr 3.57e-05 | ms/batch 428.73 | loss 2.98 | ppl 19.693 +| epoch 16 step 176000 | 3950 batches | lr 3.51e-05 | ms/batch 429.12 | loss 2.98 | ppl 19.722 +---------------------------------------------------------------------------------------------------- +| Eval 44 at step 176000 | time: 1720.98s | valid loss 3.13 | valid ppl 22.961 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 176200 | 4150 batches | lr 3.45e-05 | ms/batch 481.57 | loss 2.99 | ppl 19.858 +| epoch 16 step 176400 | 4350 batches | lr 3.4e-05 | ms/batch 428.92 | loss 2.99 | ppl 19.850 +| epoch 16 step 176600 | 4550 batches | lr 3.34e-05 | ms/batch 428.40 | loss 3.01 | ppl 20.276 +| epoch 16 step 176800 | 4750 batches | lr 3.28e-05 | ms/batch 432.59 | loss 2.96 | ppl 19.228 +| epoch 16 step 177000 | 4950 batches | lr 3.23e-05 | ms/batch 429.38 | loss 2.99 | ppl 19.854 +| epoch 16 step 177200 | 5150 batches | lr 3.17e-05 | ms/batch 428.90 | loss 2.98 | ppl 19.677 +| epoch 16 step 177400 | 5350 batches | lr 3.12e-05 | ms/batch 428.84 | loss 2.97 | ppl 19.407 +| epoch 16 step 177600 | 5550 batches | lr 3.06e-05 | ms/batch 429.22 | loss 2.97 | ppl 19.489 +| epoch 16 step 177800 | 5750 batches | lr 3.01e-05 | ms/batch 428.66 | loss 2.99 | ppl 19.841 +| epoch 16 step 178000 | 5950 batches | lr 2.96e-05 | ms/batch 428.51 | loss 2.97 | ppl 19.551 +| epoch 16 step 178200 | 6150 batches | lr 2.9e-05 | ms/batch 428.34 | loss 2.97 | ppl 19.513 +| epoch 16 step 
178400 | 6350 batches | lr 2.85e-05 | ms/batch 428.44 | loss 3.01 | ppl 20.244 +| epoch 16 step 178600 | 6550 batches | lr 2.8e-05 | ms/batch 428.77 | loss 2.93 | ppl 18.681 +| epoch 16 step 178800 | 6750 batches | lr 2.75e-05 | ms/batch 428.39 | loss 2.96 | ppl 19.316 +| epoch 16 step 179000 | 6950 batches | lr 2.7e-05 | ms/batch 428.69 | loss 2.97 | ppl 19.587 +| epoch 16 step 179200 | 7150 batches | lr 2.65e-05 | ms/batch 428.29 | loss 2.94 | ppl 18.849 +| epoch 16 step 179400 | 7350 batches | lr 2.6e-05 | ms/batch 428.68 | loss 2.95 | ppl 19.086 +| epoch 16 step 179600 | 7550 batches | lr 2.55e-05 | ms/batch 428.60 | loss 2.95 | ppl 19.086 +| epoch 16 step 179800 | 7750 batches | lr 2.5e-05 | ms/batch 428.68 | loss 2.96 | ppl 19.386 +| epoch 16 step 180000 | 7950 batches | lr 2.45e-05 | ms/batch 428.49 | loss 2.95 | ppl 19.104 +---------------------------------------------------------------------------------------------------- +| Eval 45 at step 180000 | time: 1721.79s | valid loss 3.13 | valid ppl 22.853 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 180200 | 8150 batches | lr 2.4e-05 | ms/batch 481.12 | loss 2.96 | ppl 19.338 +| epoch 16 step 180400 | 8350 batches | lr 2.35e-05 | ms/batch 431.71 | loss 2.97 | ppl 19.506 +| epoch 16 step 180600 | 8550 batches | lr 2.3e-05 | ms/batch 428.61 | loss 2.96 | ppl 19.224 +| epoch 16 step 180800 | 8750 batches | lr 2.26e-05 | ms/batch 428.53 | loss 2.97 | ppl 19.506 +| epoch 16 step 181000 | 8950 batches | lr 2.21e-05 | ms/batch 428.23 | loss 2.98 | ppl 19.751 +| epoch 16 step 181200 | 9150 batches | lr 2.16e-05 | ms/batch 429.02 | loss 2.95 | ppl 19.154 +| epoch 16 step 181400 | 9350 batches | lr 2.12e-05 | ms/batch 430.94 | loss 2.97 | ppl 19.462 +| epoch 16 step 181600 | 9550 batches | lr 2.07e-05 | ms/batch 432.03 | loss 3.00 | ppl 20.034 +| epoch 16 step 181800 | 9750 batches | lr 2.03e-05 | ms/batch 432.56 | loss 2.96 | ppl 19.237 +| epoch 16 step 182000 | 9950 batches | lr 1.99e-05 | ms/batch 433.30 | loss 2.97 | ppl 19.457 +| epoch 16 step 182200 | 10150 batches | lr 1.94e-05 | ms/batch 431.96 | loss 2.95 | ppl 19.045 +| epoch 16 step 182400 | 10350 batches | lr 1.9e-05 | ms/batch 432.55 | loss 2.98 | ppl 19.590 +| epoch 16 step 182600 | 10550 batches | lr 1.86e-05 | ms/batch 432.69 | loss 3.00 | ppl 20.060 +| epoch 16 step 182800 | 10750 batches | lr 1.81e-05 | ms/batch 432.46 | loss 2.94 | ppl 19.004 +| epoch 16 step 183000 | 10950 batches | lr 1.77e-05 | ms/batch 433.87 | loss 2.96 | ppl 19.317 +| epoch 16 step 183200 | 11150 batches | lr 1.73e-05 | ms/batch 430.79 | loss 3.01 | ppl 20.293 +| epoch 16 step 183400 | 11350 batches | lr 1.69e-05 | ms/batch 429.54 | loss 2.97 | ppl 19.576 +| epoch 17 step 183600 | 80 batches | lr 1.65e-05 | ms/batch 428.43 | loss 2.98 | ppl 19.634 +| epoch 17 step 183800 | 280 batches | lr 1.61e-05 | ms/batch 432.08 | loss 2.95 | ppl 19.031 +| epoch 17 step 184000 | 480 batches | lr 1.57e-05 | ms/batch 429.23 | loss 2.99 | ppl 19.851 +---------------------------------------------------------------------------------------------------- +| Eval 46 at step 184000 | time: 1729.72s | valid loss 3.13 | valid ppl 22.820 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 184200 | 680 batches | lr 1.53e-05 | ms/batch 480.81 | loss 2.94 | ppl 19.004 +| epoch 17 step 184400 | 880 batches | lr 1.49e-05 | ms/batch 428.57 | loss 2.97 | ppl 19.496 +| epoch 17 step 184600 | 1080 
batches | lr 1.46e-05 | ms/batch 428.97 | loss 2.97 | ppl 19.571 +| epoch 17 step 184800 | 1280 batches | lr 1.42e-05 | ms/batch 428.24 | loss 2.96 | ppl 19.205 +| epoch 17 step 185000 | 1480 batches | lr 1.38e-05 | ms/batch 429.06 | loss 2.96 | ppl 19.267 +| epoch 17 step 185200 | 1680 batches | lr 1.35e-05 | ms/batch 429.83 | loss 2.96 | ppl 19.297 +| epoch 17 step 185400 | 1880 batches | lr 1.31e-05 | ms/batch 430.28 | loss 2.97 | ppl 19.457 +| epoch 17 step 185600 | 2080 batches | lr 1.27e-05 | ms/batch 428.80 | loss 3.01 | ppl 20.313 +| epoch 17 step 185800 | 2280 batches | lr 1.24e-05 | ms/batch 428.95 | loss 2.99 | ppl 19.825 +| epoch 17 step 186000 | 2480 batches | lr 1.2e-05 | ms/batch 432.86 | loss 2.96 | ppl 19.376 +| epoch 17 step 186200 | 2680 batches | lr 1.17e-05 | ms/batch 429.42 | loss 2.98 | ppl 19.685 +| epoch 17 step 186400 | 2880 batches | lr 1.14e-05 | ms/batch 428.91 | loss 2.93 | ppl 18.645 +| epoch 17 step 186600 | 3080 batches | lr 1.1e-05 | ms/batch 429.49 | loss 2.97 | ppl 19.566 +| epoch 17 step 186800 | 3280 batches | lr 1.07e-05 | ms/batch 431.47 | loss 2.99 | ppl 19.831 +| epoch 17 step 187000 | 3480 batches | lr 1.04e-05 | ms/batch 430.23 | loss 2.95 | ppl 19.146 +| epoch 17 step 187200 | 3680 batches | lr 1.01e-05 | ms/batch 429.15 | loss 2.97 | ppl 19.491 +| epoch 17 step 187400 | 3880 batches | lr 9.76e-06 | ms/batch 431.85 | loss 2.96 | ppl 19.216 +| epoch 17 step 187600 | 4080 batches | lr 9.46e-06 | ms/batch 429.38 | loss 2.98 | ppl 19.778 +| epoch 17 step 187800 | 4280 batches | lr 9.16e-06 | ms/batch 429.06 | loss 2.96 | ppl 19.381 +| epoch 17 step 188000 | 4480 batches | lr 8.86e-06 | ms/batch 432.13 | loss 2.99 | ppl 19.797 +---------------------------------------------------------------------------------------------------- +| Eval 47 at step 188000 | time: 1725.40s | valid loss 3.13 | valid ppl 22.784 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 188200 | 4680 batches | lr 8.57e-06 | ms/batch 482.30 | loss 2.96 | ppl 19.223 +| epoch 17 step 188400 | 4880 batches | lr 8.28e-06 | ms/batch 434.48 | loss 2.96 | ppl 19.235 +| epoch 17 step 188600 | 5080 batches | lr 8e-06 | ms/batch 428.56 | loss 2.98 | ppl 19.594 +| epoch 17 step 188800 | 5280 batches | lr 7.72e-06 | ms/batch 428.74 | loss 2.96 | ppl 19.347 +| epoch 17 step 189000 | 5480 batches | lr 7.45e-06 | ms/batch 432.26 | loss 2.95 | ppl 19.043 +| epoch 17 step 189200 | 5680 batches | lr 7.18e-06 | ms/batch 429.46 | loss 2.98 | ppl 19.617 +| epoch 17 step 189400 | 5880 batches | lr 6.92e-06 | ms/batch 429.20 | loss 2.96 | ppl 19.388 +| epoch 17 step 189600 | 6080 batches | lr 6.66e-06 | ms/batch 430.29 | loss 2.97 | ppl 19.430 +| epoch 17 step 189800 | 6280 batches | lr 6.41e-06 | ms/batch 430.46 | loss 2.97 | ppl 19.575 +| epoch 17 step 190000 | 6480 batches | lr 6.16e-06 | ms/batch 429.53 | loss 2.95 | ppl 19.088 +| epoch 17 step 190200 | 6680 batches | lr 5.91e-06 | ms/batch 430.35 | loss 2.93 | ppl 18.675 +| epoch 17 step 190400 | 6880 batches | lr 5.68e-06 | ms/batch 428.73 | loss 2.96 | ppl 19.301 +| epoch 17 step 190600 | 7080 batches | lr 5.44e-06 | ms/batch 430.43 | loss 2.95 | ppl 19.070 +| epoch 17 step 190800 | 7280 batches | lr 5.21e-06 | ms/batch 430.71 | loss 2.91 | ppl 18.382 +| epoch 17 step 191000 | 7480 batches | lr 4.99e-06 | ms/batch 428.97 | loss 2.95 | ppl 19.146 +| epoch 17 step 191200 | 7680 batches | lr 4.77e-06 | ms/batch 428.68 | loss 2.94 | ppl 18.838 +| epoch 17 step 191400 | 7880 batches | 
lr 4.56e-06 | ms/batch 435.99 | loss 2.94 | ppl 18.890 +| epoch 17 step 191600 | 8080 batches | lr 4.35e-06 | ms/batch 428.95 | loss 2.96 | ppl 19.240 +| epoch 17 step 191800 | 8280 batches | lr 4.14e-06 | ms/batch 431.74 | loss 2.95 | ppl 19.035 +| epoch 17 step 192000 | 8480 batches | lr 3.94e-06 | ms/batch 430.40 | loss 2.95 | ppl 19.092 +---------------------------------------------------------------------------------------------------- +| Eval 48 at step 192000 | time: 1727.76s | valid loss 3.13 | valid ppl 22.769 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 192200 | 8680 batches | lr 3.75e-06 | ms/batch 482.57 | loss 2.96 | ppl 19.349 +| epoch 17 step 192400 | 8880 batches | lr 3.56e-06 | ms/batch 429.22 | loss 2.96 | ppl 19.309 +| epoch 17 step 192600 | 9080 batches | lr 3.37e-06 | ms/batch 429.91 | loss 2.96 | ppl 19.268 +| epoch 17 step 192800 | 9280 batches | lr 3.2e-06 | ms/batch 428.73 | loss 2.95 | ppl 19.147 +| epoch 17 step 193000 | 9480 batches | lr 3.02e-06 | ms/batch 429.72 | loss 2.97 | ppl 19.395 +| epoch 17 step 193200 | 9680 batches | lr 2.85e-06 | ms/batch 428.35 | loss 2.96 | ppl 19.365 +| epoch 17 step 193400 | 9880 batches | lr 2.69e-06 | ms/batch 428.39 | loss 2.94 | ppl 18.828 +| epoch 17 step 193600 | 10080 batches | lr 2.53e-06 | ms/batch 429.53 | loss 2.97 | ppl 19.541 +| epoch 17 step 193800 | 10280 batches | lr 2.37e-06 | ms/batch 431.64 | loss 2.94 | ppl 18.977 +| epoch 17 step 194000 | 10480 batches | lr 2.22e-06 | ms/batch 428.52 | loss 2.98 | ppl 19.732 +| epoch 17 step 194200 | 10680 batches | lr 2.07e-06 | ms/batch 429.27 | loss 2.96 | ppl 19.303 +| epoch 17 step 194400 | 10880 batches | lr 1.93e-06 | ms/batch 428.66 | loss 2.94 | ppl 18.856 +| epoch 17 step 194600 | 11080 batches | lr 1.8e-06 | ms/batch 429.55 | loss 2.98 | ppl 19.745 +| epoch 17 step 194800 | 11280 batches | lr 1.67e-06 | ms/batch 429.71 | loss 2.98 | ppl 19.731 +| epoch 18 step 195000 | 10 batches | lr 1.54e-06 | ms/batch 427.88 | loss 2.97 | ppl 19.547 +| epoch 18 step 195200 | 210 batches | lr 1.42e-06 | ms/batch 428.77 | loss 2.94 | ppl 18.860 +| epoch 18 step 195400 | 410 batches | lr 1.3e-06 | ms/batch 428.59 | loss 2.97 | ppl 19.491 +| epoch 18 step 195600 | 610 batches | lr 1.19e-06 | ms/batch 429.81 | loss 2.94 | ppl 18.910 +| epoch 18 step 195800 | 810 batches | lr 1.09e-06 | ms/batch 430.47 | loss 2.98 | ppl 19.594 +| epoch 18 step 196000 | 1010 batches | lr 9.87e-07 | ms/batch 430.25 | loss 2.94 | ppl 18.915 +---------------------------------------------------------------------------------------------------- +| Eval 49 at step 196000 | time: 1723.60s | valid loss 3.12 | valid ppl 22.721 +---------------------------------------------------------------------------------------------------- +| epoch 18 step 196200 | 1210 batches | lr 8.91e-07 | ms/batch 481.11 | loss 2.97 | ppl 19.444 +| epoch 18 step 196400 | 1410 batches | lr 7.99e-07 | ms/batch 429.35 | loss 2.96 | ppl 19.282 +| epoch 18 step 196600 | 1610 batches | lr 7.13e-07 | ms/batch 430.13 | loss 2.94 | ppl 18.853 +| epoch 18 step 196800 | 1810 batches | lr 6.32e-07 | ms/batch 430.89 | loss 2.97 | ppl 19.428 +| epoch 18 step 197000 | 2010 batches | lr 5.55e-07 | ms/batch 429.33 | loss 2.99 | ppl 19.982 +| epoch 18 step 197200 | 2210 batches | lr 4.84e-07 | ms/batch 434.58 | loss 2.98 | ppl 19.660 +| epoch 18 step 197400 | 2410 batches | lr 4.17e-07 | ms/batch 431.17 | loss 2.97 | ppl 19.544 +| epoch 18 step 197600 | 2610 batches | lr 3.55e-07 
| ms/batch 430.55 | loss 2.96 | ppl 19.355 +| epoch 18 step 197800 | 2810 batches | lr 2.99e-07 | ms/batch 430.41 | loss 2.94 | ppl 18.958 +| epoch 18 step 198000 | 3010 batches | lr 2.47e-07 | ms/batch 429.36 | loss 2.96 | ppl 19.330 +| epoch 18 step 198200 | 3210 batches | lr 2e-07 | ms/batch 430.41 | loss 2.96 | ppl 19.325 +| epoch 18 step 198400 | 3410 batches | lr 1.58e-07 | ms/batch 429.43 | loss 2.97 | ppl 19.499 +| epoch 18 step 198600 | 3610 batches | lr 1.21e-07 | ms/batch 431.50 | loss 2.94 | ppl 18.898 +| epoch 18 step 198800 | 3810 batches | lr 8.88e-08 | ms/batch 429.80 | loss 2.96 | ppl 19.348 +| epoch 18 step 199000 | 4010 batches | lr 6.17e-08 | ms/batch 429.77 | loss 2.98 | ppl 19.655 +| epoch 18 step 199200 | 4210 batches | lr 3.95e-08 | ms/batch 429.61 | loss 2.96 | ppl 19.266 +| epoch 18 step 199400 | 4410 batches | lr 2.22e-08 | ms/batch 430.88 | loss 2.97 | ppl 19.436 +| epoch 18 step 199600 | 4610 batches | lr 9.87e-09 | ms/batch 429.55 | loss 2.97 | ppl 19.504 +| epoch 18 step 199800 | 4810 batches | lr 2.47e-09 | ms/batch 428.95 | loss 2.94 | ppl 19.004 +| epoch 18 step 200000 | 5010 batches | lr 0 | ms/batch 430.23 | loss 2.98 | ppl 19.716 +---------------------------------------------------------------------------------------------------- +| Eval 50 at step 200000 | time: 1727.18s | valid loss 3.12 | valid ppl 22.725 +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.16 | test ppl 23.511 +==================================================================================================== diff --git a/NLP/Transformer-XL/exp_results/log-50k.txt b/NLP/Transformer-XL/exp_results/log-50k.txt new file mode 100644 index 0000000..a69845b --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-50k.txt @@ -0,0 +1,360 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adan + - lr : 0.0015 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 5000 + - decay_rate : 0.5 + - lr_min : 1e-06 + - clip : 0.25 + - clip_nonemb : False + - max_step : 50000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220809-222534 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : [0.9, 0.9, 0.999] + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 
41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 6e-05 | ms/batch 731.01 | loss 8.99 | ppl 7986.754 +| epoch 1 step 400 | 400 batches | lr 0.00012 | ms/batch 671.04 | loss 6.94 | ppl 1033.129 +| epoch 1 step 600 | 600 batches | lr 0.00018 | ms/batch 674.05 | loss 6.40 | ppl 599.798 +| epoch 1 step 800 | 800 batches | lr 0.00024 | ms/batch 672.64 | loss 6.11 | ppl 452.258 +| epoch 1 step 1000 | 1000 batches | lr 0.0003 | ms/batch 672.77 | loss 5.85 | ppl 348.893 +| epoch 1 step 1200 | 1200 batches | lr 0.00036 | ms/batch 673.66 | loss 5.65 | ppl 285.037 +| epoch 1 step 1400 | 1400 batches | lr 0.00042 | ms/batch 674.81 | loss 5.48 | ppl 240.623 +| epoch 1 step 1600 | 1600 batches | lr 0.00048 | ms/batch 671.81 | loss 5.33 | ppl 206.955 +| epoch 1 step 1800 | 1800 batches | lr 0.00054 | ms/batch 673.69 | loss 5.21 | ppl 182.225 +| epoch 1 step 2000 | 2000 batches | lr 0.0006 | ms/batch 670.74 | loss 5.09 | ppl 162.138 +| epoch 1 step 2200 | 2200 batches | lr 0.00066 | ms/batch 672.15 | loss 4.98 | ppl 145.111 +| epoch 1 step 2400 | 2400 batches | lr 0.00072 | ms/batch 670.57 | loss 4.89 | ppl 133.331 +| epoch 1 step 2600 | 2600 batches | lr 0.00078 | ms/batch 672.95 | loss 4.80 | ppl 121.355 +| epoch 1 step 2800 | 2800 batches | lr 0.00084 | ms/batch 671.53 | loss 4.72 | ppl 112.435 +| epoch 1 step 3000 | 3000 batches | lr 0.0009 | ms/batch 667.80 | loss 4.67 | ppl 107.032 +| epoch 1 step 3200 | 3200 batches | lr 0.00096 | ms/batch 670.42 | loss 4.61 | ppl 100.273 +| epoch 1 step 3400 | 3400 batches | lr 0.00102 | ms/batch 673.73 | loss 4.56 | ppl 95.679 +| epoch 1 step 3600 | 3600 batches | lr 0.00108 | ms/batch 670.60 | loss 4.48 | ppl 88.439 +| epoch 1 step 3800 | 3800 batches | lr 0.00114 | ms/batch 672.03 | loss 4.51 | ppl 90.996 +| epoch 1 step 4000 | 4000 batches | lr 0.0012 | ms/batch 660.71 | loss 4.47 | ppl 87.228 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2706.60s | valid loss 4.43 | valid ppl 83.560 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.00126 | ms/batch 741.78 | loss 4.42 | ppl 83.146 +| epoch 1 step 4400 | 4400 batches | lr 0.00132 | ms/batch 671.50 | loss 4.40 | ppl 81.572 +| epoch 1 step 4600 | 4600 batches | lr 0.00138 | ms/batch 669.10 | loss 4.38 | ppl 79.989 +| epoch 1 step 4800 | 4800 batches | lr 0.00144 | ms/batch 671.50 | loss 4.33 | ppl 76.228 +| epoch 1 step 5000 | 5000 batches | lr 0.0015 | ms/batch 669.83 | loss 4.37 | ppl 79.175 +| epoch 1 step 5200 | 5200 batches | lr 0.0015 | ms/batch 669.53 | loss 4.32 | ppl 74.879 +| epoch 1 step 5400 | 5400 batches | lr 0.00149 | ms/batch 668.42 | loss 4.26 | ppl 70.961 +| epoch 1 step 5600 | 5600 batches | lr 0.00149 | ms/batch 669.68 | loss 4.28 | ppl 72.426 +| epoch 1 step 5800 | 5800 batches | lr 0.00149 | ms/batch 668.33 | loss 4.28 | ppl 71.883 +| epoch 1 step 6000 | 6000 batches | lr 0.00148 | ms/batch 669.96 | loss 4.23 | ppl 68.809 +| epoch 1 step 6200 | 6200 batches | lr 0.00148 | ms/batch 671.62 | loss 4.20 | ppl 66.917 +| epoch 1 step 6400 | 6400 batches | lr 0.00148 | ms/batch 670.80 | loss 4.23 | ppl 68.826 +| epoch 1 step 6600 | 6600 batches | lr 0.00147 | ms/batch 671.47 | loss 4.17 | ppl 64.485 +| epoch 1 step 6800 | 6800 batches | lr 0.00147 | ms/batch 671.88 | loss 
4.16 | ppl 64.148 +| epoch 1 step 7000 | 7000 batches | lr 0.00146 | ms/batch 669.08 | loss 4.16 | ppl 64.382 +| epoch 1 step 7200 | 7200 batches | lr 0.00146 | ms/batch 669.37 | loss 4.12 | ppl 61.310 +| epoch 1 step 7400 | 7400 batches | lr 0.00146 | ms/batch 669.99 | loss 4.11 | ppl 61.000 +| epoch 1 step 7600 | 7600 batches | lr 0.00145 | ms/batch 669.12 | loss 4.09 | ppl 59.732 +| epoch 1 step 7800 | 7800 batches | lr 0.00145 | ms/batch 671.55 | loss 4.11 | ppl 60.794 +| epoch 1 step 8000 | 8000 batches | lr 0.00144 | ms/batch 659.11 | loss 4.10 | ppl 60.478 +---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 2687.58s | valid loss 4.01 | valid ppl 55.175 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.00144 | ms/batch 742.68 | loss 4.08 | ppl 58.932 +| epoch 1 step 8400 | 8400 batches | lr 0.00143 | ms/batch 669.52 | loss 4.09 | ppl 59.603 +| epoch 1 step 8600 | 8600 batches | lr 0.00143 | ms/batch 670.69 | loss 4.07 | ppl 58.419 +| epoch 1 step 8800 | 8800 batches | lr 0.00142 | ms/batch 670.29 | loss 4.08 | ppl 58.862 +| epoch 1 step 9000 | 9000 batches | lr 0.00142 | ms/batch 671.07 | loss 4.04 | ppl 57.075 +| epoch 1 step 9200 | 9200 batches | lr 0.00141 | ms/batch 670.31 | loss 4.03 | ppl 56.375 +| epoch 1 step 9400 | 9400 batches | lr 0.00141 | ms/batch 668.76 | loss 4.04 | ppl 56.654 +| epoch 1 step 9600 | 9600 batches | lr 0.0014 | ms/batch 668.70 | loss 4.05 | ppl 57.438 +| epoch 1 step 9800 | 9800 batches | lr 0.0014 | ms/batch 669.90 | loss 4.01 | ppl 54.931 +| epoch 1 step 10000 | 10000 batches | lr 0.00139 | ms/batch 671.54 | loss 4.02 | ppl 55.691 +| epoch 1 step 10200 | 10200 batches | lr 0.00138 | ms/batch 668.10 | loss 3.98 | ppl 53.731 +| epoch 1 step 10400 | 10400 batches | lr 0.00138 | ms/batch 668.55 | loss 3.98 | ppl 53.647 +| epoch 1 step 10600 | 10600 batches | lr 0.00137 | ms/batch 670.24 | loss 4.00 | ppl 54.823 +| epoch 1 step 10800 | 10800 batches | lr 0.00137 | ms/batch 669.67 | loss 3.96 | ppl 52.449 +| epoch 1 step 11000 | 11000 batches | lr 0.00136 | ms/batch 668.12 | loss 4.00 | ppl 54.511 +| epoch 1 step 11200 | 11200 batches | lr 0.00135 | ms/batch 669.36 | loss 3.98 | ppl 53.348 +| epoch 1 step 11400 | 11400 batches | lr 0.00135 | ms/batch 667.23 | loss 3.97 | ppl 53.053 +| epoch 2 step 11600 | 130 batches | lr 0.00134 | ms/batch 671.47 | loss 3.95 | ppl 51.832 +| epoch 2 step 11800 | 330 batches | lr 0.00134 | ms/batch 670.28 | loss 3.92 | ppl 50.430 +| epoch 2 step 12000 | 530 batches | lr 0.00133 | ms/batch 658.97 | loss 3.94 | ppl 51.495 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 2685.36s | valid loss 3.83 | valid ppl 46.199 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 12200 | 730 batches | lr 0.00132 | ms/batch 741.77 | loss 3.91 | ppl 50.018 +| epoch 2 step 12400 | 930 batches | lr 0.00132 | ms/batch 669.29 | loss 3.91 | ppl 50.118 +| epoch 2 step 12600 | 1130 batches | lr 0.00131 | ms/batch 670.23 | loss 3.94 | ppl 51.393 +| epoch 2 step 12800 | 1330 batches | lr 0.0013 | ms/batch 670.21 | loss 3.91 | ppl 49.684 +| epoch 2 step 13000 | 1530 batches | lr 0.00129 | ms/batch 669.82 | loss 3.90 | ppl 49.205 +| epoch 2 step 13200 | 1730 batches | lr 0.00129 | ms/batch 668.80 | loss 3.89 | ppl 48.946 +| epoch 2 
step 13400 | 1930 batches | lr 0.00128 | ms/batch 669.89 | loss 3.90 | ppl 49.160 +| epoch 2 step 13600 | 2130 batches | lr 0.00127 | ms/batch 670.73 | loss 3.91 | ppl 50.134 +| epoch 2 step 13800 | 2330 batches | lr 0.00127 | ms/batch 669.47 | loss 3.89 | ppl 48.907 +| epoch 2 step 14000 | 2530 batches | lr 0.00126 | ms/batch 670.64 | loss 3.88 | ppl 48.187 +| epoch 2 step 14200 | 2730 batches | lr 0.00125 | ms/batch 669.45 | loss 3.85 | ppl 47.194 +| epoch 2 step 14400 | 2930 batches | lr 0.00124 | ms/batch 670.69 | loss 3.84 | ppl 46.316 +| epoch 2 step 14600 | 3130 batches | lr 0.00124 | ms/batch 668.19 | loss 3.84 | ppl 46.742 +| epoch 2 step 14800 | 3330 batches | lr 0.00123 | ms/batch 668.82 | loss 3.85 | ppl 46.832 +| epoch 2 step 15000 | 3530 batches | lr 0.00122 | ms/batch 669.99 | loss 3.81 | ppl 45.024 +| epoch 2 step 15200 | 3730 batches | lr 0.00121 | ms/batch 668.58 | loss 3.83 | ppl 46.255 +| epoch 2 step 15400 | 3930 batches | lr 0.0012 | ms/batch 670.31 | loss 3.82 | ppl 45.787 +| epoch 2 step 15600 | 4130 batches | lr 0.0012 | ms/batch 667.87 | loss 3.81 | ppl 45.203 +| epoch 2 step 15800 | 4330 batches | lr 0.00119 | ms/batch 669.87 | loss 3.82 | ppl 45.456 +| epoch 2 step 16000 | 4530 batches | lr 0.00118 | ms/batch 656.97 | loss 3.82 | ppl 45.455 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 2684.61s | valid loss 3.70 | valid ppl 40.554 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.00117 | ms/batch 743.72 | loss 3.77 | ppl 43.325 +| epoch 2 step 16400 | 4930 batches | lr 0.00116 | ms/batch 669.07 | loss 3.79 | ppl 44.198 +| epoch 2 step 16600 | 5130 batches | lr 0.00116 | ms/batch 670.76 | loss 3.78 | ppl 43.728 +| epoch 2 step 16800 | 5330 batches | lr 0.00115 | ms/batch 673.39 | loss 3.77 | ppl 43.271 +| epoch 2 step 17000 | 5530 batches | lr 0.00114 | ms/batch 668.77 | loss 3.75 | ppl 42.620 +| epoch 2 step 17200 | 5730 batches | lr 0.00113 | ms/batch 668.81 | loss 3.77 | ppl 43.340 +| epoch 2 step 17400 | 5930 batches | lr 0.00112 | ms/batch 671.39 | loss 3.75 | ppl 42.598 +| epoch 2 step 17600 | 6130 batches | lr 0.00111 | ms/batch 670.80 | loss 3.74 | ppl 42.211 +| epoch 2 step 17800 | 6330 batches | lr 0.0011 | ms/batch 670.83 | loss 3.77 | ppl 43.377 +| epoch 2 step 18000 | 6530 batches | lr 0.0011 | ms/batch 670.94 | loss 3.71 | ppl 40.882 +| epoch 2 step 18200 | 6730 batches | lr 0.00109 | ms/batch 671.71 | loss 3.71 | ppl 41.009 +| epoch 2 step 18400 | 6930 batches | lr 0.00108 | ms/batch 671.77 | loss 3.73 | ppl 41.510 +| epoch 2 step 18600 | 7130 batches | lr 0.00107 | ms/batch 672.45 | loss 3.70 | ppl 40.538 +| epoch 2 step 18800 | 7330 batches | lr 0.00106 | ms/batch 676.93 | loss 3.68 | ppl 39.664 +| epoch 2 step 19000 | 7530 batches | lr 0.00105 | ms/batch 673.81 | loss 3.70 | ppl 40.567 +| epoch 2 step 19200 | 7730 batches | lr 0.00104 | ms/batch 673.02 | loss 3.70 | ppl 40.493 +| epoch 2 step 19400 | 7930 batches | lr 0.00103 | ms/batch 671.76 | loss 3.69 | ppl 40.199 +| epoch 2 step 19600 | 8130 batches | lr 0.00102 | ms/batch 672.49 | loss 3.70 | ppl 40.628 +| epoch 2 step 19800 | 8330 batches | lr 0.00102 | ms/batch 675.15 | loss 3.69 | ppl 40.150 +| epoch 2 step 20000 | 8530 batches | lr 0.00101 | ms/batch 662.59 | loss 3.68 | ppl 39.675 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 
| time: 2694.60s | valid loss 3.60 | valid ppl 36.520 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000997 | ms/batch 743.34 | loss 3.70 | ppl 40.281 +| epoch 2 step 20400 | 8930 batches | lr 0.000988 | ms/batch 672.38 | loss 3.69 | ppl 40.101 +| epoch 2 step 20600 | 9130 batches | lr 0.000978 | ms/batch 671.32 | loss 3.68 | ppl 39.723 +| epoch 2 step 20800 | 9330 batches | lr 0.000969 | ms/batch 670.29 | loss 3.67 | ppl 39.195 +| epoch 2 step 21000 | 9530 batches | lr 0.00096 | ms/batch 673.92 | loss 3.71 | ppl 40.874 +| epoch 2 step 21200 | 9730 batches | lr 0.00095 | ms/batch 673.78 | loss 3.66 | ppl 38.777 +| epoch 2 step 21400 | 9930 batches | lr 0.000941 | ms/batch 671.65 | loss 3.67 | ppl 39.193 +| epoch 2 step 21600 | 10130 batches | lr 0.000932 | ms/batch 671.55 | loss 3.65 | ppl 38.482 +| epoch 2 step 21800 | 10330 batches | lr 0.000922 | ms/batch 671.69 | loss 3.66 | ppl 38.807 +| epoch 2 step 22000 | 10530 batches | lr 0.000913 | ms/batch 671.36 | loss 3.67 | ppl 39.367 +| epoch 2 step 22200 | 10730 batches | lr 0.000903 | ms/batch 672.87 | loss 3.63 | ppl 37.849 +| epoch 2 step 22400 | 10930 batches | lr 0.000894 | ms/batch 674.08 | loss 3.63 | ppl 37.837 +| epoch 2 step 22600 | 11130 batches | lr 0.000884 | ms/batch 671.07 | loss 3.68 | ppl 39.497 +| epoch 2 step 22800 | 11330 batches | lr 0.000875 | ms/batch 671.94 | loss 3.64 | ppl 38.144 +| epoch 3 step 23000 | 60 batches | lr 0.000865 | ms/batch 672.34 | loss 3.65 | ppl 38.332 +| epoch 3 step 23200 | 260 batches | lr 0.000855 | ms/batch 674.27 | loss 3.60 | ppl 36.501 +| epoch 3 step 23400 | 460 batches | lr 0.000846 | ms/batch 674.42 | loss 3.64 | ppl 37.995 +| epoch 3 step 23600 | 660 batches | lr 0.000836 | ms/batch 672.56 | loss 3.60 | ppl 36.540 +| epoch 3 step 23800 | 860 batches | lr 0.000827 | ms/batch 673.12 | loss 3.63 | ppl 37.738 +| epoch 3 step 24000 | 1060 batches | lr 0.000817 | ms/batch 664.65 | loss 3.62 | ppl 37.164 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 2697.80s | valid loss 3.52 | valid ppl 33.726 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000807 | ms/batch 740.67 | loss 3.60 | ppl 36.765 +| epoch 3 step 24400 | 1460 batches | lr 0.000798 | ms/batch 674.30 | loss 3.60 | ppl 36.720 +| epoch 3 step 24600 | 1660 batches | lr 0.000788 | ms/batch 672.55 | loss 3.59 | ppl 36.339 +| epoch 3 step 24800 | 1860 batches | lr 0.000778 | ms/batch 671.83 | loss 3.60 | ppl 36.487 +| epoch 3 step 25000 | 2060 batches | lr 0.000769 | ms/batch 671.74 | loss 3.63 | ppl 37.859 +| epoch 3 step 25200 | 2260 batches | lr 0.000759 | ms/batch 672.23 | loss 3.61 | ppl 36.807 +| epoch 3 step 25400 | 2460 batches | lr 0.000749 | ms/batch 671.61 | loss 3.59 | ppl 36.224 +| epoch 3 step 25600 | 2660 batches | lr 0.00074 | ms/batch 674.02 | loss 3.59 | ppl 36.343 +| epoch 3 step 25800 | 2860 batches | lr 0.00073 | ms/batch 671.84 | loss 3.53 | ppl 34.173 +| epoch 3 step 26000 | 3060 batches | lr 0.00072 | ms/batch 672.60 | loss 3.58 | ppl 35.903 +| epoch 3 step 26200 | 3260 batches | lr 0.000711 | ms/batch 673.04 | loss 3.58 | ppl 35.696 +| epoch 3 step 26400 | 3460 batches | lr 0.000701 | ms/batch 673.00 | loss 3.54 | ppl 34.395 +| epoch 3 step 26600 | 3660 batches | lr 0.000692 | ms/batch 673.81 | loss 3.55 | ppl 34.771 +| epoch 3 step 
26800 | 3860 batches | lr 0.000682 | ms/batch 672.00 | loss 3.55 | ppl 34.852 +| epoch 3 step 27000 | 4060 batches | lr 0.000672 | ms/batch 673.44 | loss 3.56 | ppl 35.128 +| epoch 3 step 27200 | 4260 batches | lr 0.000663 | ms/batch 671.63 | loss 3.54 | ppl 34.582 +| epoch 3 step 27400 | 4460 batches | lr 0.000653 | ms/batch 672.23 | loss 3.55 | ppl 34.678 +| epoch 3 step 27600 | 4660 batches | lr 0.000644 | ms/batch 671.70 | loss 3.53 | ppl 34.204 +| epoch 3 step 27800 | 4860 batches | lr 0.000634 | ms/batch 670.97 | loss 3.52 | ppl 33.707 +| epoch 3 step 28000 | 5060 batches | lr 0.000625 | ms/batch 663.55 | loss 3.53 | ppl 34.105 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 2697.22s | valid loss 3.44 | valid ppl 31.229 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000615 | ms/batch 738.31 | loss 3.51 | ppl 33.439 +| epoch 3 step 28400 | 5460 batches | lr 0.000606 | ms/batch 670.03 | loss 3.49 | ppl 32.676 +| epoch 3 step 28600 | 5660 batches | lr 0.000596 | ms/batch 673.65 | loss 3.53 | ppl 34.273 +| epoch 3 step 28800 | 5860 batches | lr 0.000587 | ms/batch 670.70 | loss 3.50 | ppl 33.257 +| epoch 3 step 29000 | 6060 batches | lr 0.000577 | ms/batch 672.88 | loss 3.50 | ppl 33.035 +| epoch 3 step 29200 | 6260 batches | lr 0.000568 | ms/batch 671.74 | loss 3.50 | ppl 33.001 +| epoch 3 step 29400 | 6460 batches | lr 0.000559 | ms/batch 670.97 | loss 3.50 | ppl 33.162 +| epoch 3 step 29600 | 6660 batches | lr 0.00055 | ms/batch 671.14 | loss 3.45 | ppl 31.426 +| epoch 3 step 29800 | 6860 batches | lr 0.00054 | ms/batch 672.59 | loss 3.48 | ppl 32.386 +| epoch 3 step 30000 | 7060 batches | lr 0.000531 | ms/batch 671.72 | loss 3.47 | ppl 32.047 +| epoch 3 step 30200 | 7260 batches | lr 0.000522 | ms/batch 669.64 | loss 3.44 | ppl 31.093 +| epoch 3 step 30400 | 7460 batches | lr 0.000513 | ms/batch 674.88 | loss 3.46 | ppl 31.766 +| epoch 3 step 30600 | 7660 batches | lr 0.000504 | ms/batch 673.98 | loss 3.44 | ppl 31.226 +| epoch 3 step 30800 | 7860 batches | lr 0.000495 | ms/batch 672.05 | loss 3.45 | ppl 31.633 +| epoch 3 step 31000 | 8060 batches | lr 0.000486 | ms/batch 675.06 | loss 3.46 | ppl 31.822 +| epoch 3 step 31200 | 8260 batches | lr 0.000477 | ms/batch 675.76 | loss 3.45 | ppl 31.384 +| epoch 3 step 31400 | 8460 batches | lr 0.000468 | ms/batch 674.16 | loss 3.46 | ppl 31.680 +| epoch 3 step 31600 | 8660 batches | lr 0.000459 | ms/batch 673.56 | loss 3.45 | ppl 31.480 +| epoch 3 step 31800 | 8860 batches | lr 0.00045 | ms/batch 671.05 | loss 3.45 | ppl 31.470 +| epoch 3 step 32000 | 9060 batches | lr 0.000441 | ms/batch 662.55 | loss 3.45 | ppl 31.454 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 2696.71s | valid loss 3.37 | valid ppl 29.048 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000433 | ms/batch 741.24 | loss 3.43 | ppl 30.924 +| epoch 3 step 32400 | 9460 batches | lr 0.000424 | ms/batch 672.63 | loss 3.45 | ppl 31.583 +| epoch 3 step 32600 | 9660 batches | lr 0.000415 | ms/batch 672.60 | loss 3.45 | ppl 31.560 +| epoch 3 step 32800 | 9860 batches | lr 0.000407 | ms/batch 671.88 | loss 3.41 | ppl 30.145 +| epoch 3 step 33000 | 10060 batches | lr 0.000398 | ms/batch 672.49 | loss 3.45 | ppl 31.582 
+| epoch 3 step 33200 | 10260 batches | lr 0.00039 | ms/batch 671.16 | loss 3.40 | ppl 29.971 +| epoch 3 step 33400 | 10460 batches | lr 0.000382 | ms/batch 671.28 | loss 3.43 | ppl 30.997 +| epoch 3 step 33600 | 10660 batches | lr 0.000373 | ms/batch 672.12 | loss 3.44 | ppl 31.166 +| epoch 3 step 33800 | 10860 batches | lr 0.000365 | ms/batch 671.60 | loss 3.39 | ppl 29.578 +| epoch 3 step 34000 | 11060 batches | lr 0.000357 | ms/batch 672.62 | loss 3.43 | ppl 30.954 +| epoch 3 step 34200 | 11260 batches | lr 0.000349 | ms/batch 671.84 | loss 3.44 | ppl 31.123 +| epoch 3 step 34400 | 11460 batches | lr 0.000341 | ms/batch 673.17 | loss 3.41 | ppl 30.185 +| epoch 4 step 34600 | 190 batches | lr 0.000333 | ms/batch 670.84 | loss 3.39 | ppl 29.520 +| epoch 4 step 34800 | 390 batches | lr 0.000325 | ms/batch 673.47 | loss 3.39 | ppl 29.798 +| epoch 4 step 35000 | 590 batches | lr 0.000317 | ms/batch 672.91 | loss 3.38 | ppl 29.482 +| epoch 4 step 35200 | 790 batches | lr 0.000309 | ms/batch 671.06 | loss 3.40 | ppl 29.950 +| epoch 4 step 35400 | 990 batches | lr 0.000301 | ms/batch 673.00 | loss 3.38 | ppl 29.249 +| epoch 4 step 35600 | 1190 batches | lr 0.000294 | ms/batch 673.68 | loss 3.39 | ppl 29.768 +| epoch 4 step 35800 | 1390 batches | lr 0.000286 | ms/batch 671.24 | loss 3.38 | ppl 29.479 +| epoch 4 step 36000 | 1590 batches | lr 0.000279 | ms/batch 660.61 | loss 3.37 | ppl 29.048 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 2695.59s | valid loss 3.32 | valid ppl 27.645 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.000271 | ms/batch 738.61 | loss 3.38 | ppl 29.267 +| epoch 4 step 36400 | 1990 batches | lr 0.000264 | ms/batch 671.84 | loss 3.41 | ppl 30.128 +| epoch 4 step 36600 | 2190 batches | lr 0.000257 | ms/batch 670.16 | loss 3.39 | ppl 29.614 +| epoch 4 step 36800 | 2390 batches | lr 0.00025 | ms/batch 672.50 | loss 3.39 | ppl 29.549 +| epoch 4 step 37000 | 2590 batches | lr 0.000242 | ms/batch 674.54 | loss 3.36 | ppl 28.867 +| epoch 4 step 37200 | 2790 batches | lr 0.000235 | ms/batch 672.19 | loss 3.34 | ppl 28.314 +| epoch 4 step 37400 | 2990 batches | lr 0.000229 | ms/batch 670.71 | loss 3.36 | ppl 28.677 +| epoch 4 step 37600 | 3190 batches | lr 0.000222 | ms/batch 668.95 | loss 3.36 | ppl 28.682 +| epoch 4 step 37800 | 3390 batches | lr 0.000215 | ms/batch 672.94 | loss 3.36 | ppl 28.683 +| epoch 4 step 38000 | 3590 batches | lr 0.000208 | ms/batch 672.33 | loss 3.33 | ppl 27.802 +| epoch 4 step 38200 | 3790 batches | lr 0.000202 | ms/batch 673.11 | loss 3.34 | ppl 28.335 +| epoch 4 step 38400 | 3990 batches | lr 0.000195 | ms/batch 670.77 | loss 3.36 | ppl 28.747 +| epoch 4 step 38600 | 4190 batches | lr 0.000189 | ms/batch 671.42 | loss 3.34 | ppl 28.160 +| epoch 4 step 38800 | 4390 batches | lr 0.000183 | ms/batch 674.42 | loss 3.34 | ppl 28.212 +| epoch 4 step 39000 | 4590 batches | lr 0.000176 | ms/batch 671.51 | loss 3.35 | ppl 28.619 +| epoch 4 step 39200 | 4790 batches | lr 0.00017 | ms/batch 673.38 | loss 3.30 | ppl 27.241 +| epoch 4 step 39400 | 4990 batches | lr 0.000164 | ms/batch 671.09 | loss 3.35 | ppl 28.548 +| epoch 4 step 39600 | 5190 batches | lr 0.000158 | ms/batch 673.71 | loss 3.31 | ppl 27.271 +| epoch 4 step 39800 | 5390 batches | lr 0.000153 | ms/batch 671.79 | loss 3.29 | ppl 26.839 +| epoch 4 step 40000 | 5590 batches | lr 0.000147 | ms/batch 663.99 | loss 
3.31 | ppl 27.419 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 2695.51s | valid loss 3.28 | valid ppl 26.473 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000141 | ms/batch 737.94 | loss 3.33 | ppl 27.939 +| epoch 4 step 40400 | 5990 batches | lr 0.000136 | ms/batch 674.02 | loss 3.30 | ppl 27.155 +| epoch 4 step 40600 | 6190 batches | lr 0.00013 | ms/batch 671.99 | loss 3.30 | ppl 27.222 +| epoch 4 step 40800 | 6390 batches | lr 0.000125 | ms/batch 674.33 | loss 3.33 | ppl 27.819 +| epoch 4 step 41000 | 6590 batches | lr 0.00012 | ms/batch 672.00 | loss 3.26 | ppl 26.092 +| epoch 4 step 41200 | 6790 batches | lr 0.000115 | ms/batch 670.91 | loss 3.29 | ppl 26.772 +| epoch 4 step 41400 | 6990 batches | lr 0.00011 | ms/batch 670.93 | loss 3.30 | ppl 27.098 +| epoch 4 step 41600 | 7190 batches | lr 0.000105 | ms/batch 672.93 | loss 3.25 | ppl 25.775 +| epoch 4 step 41800 | 7390 batches | lr 9.98e-05 | ms/batch 673.77 | loss 3.28 | ppl 26.457 +| epoch 4 step 42000 | 7590 batches | lr 9.51e-05 | ms/batch 672.27 | loss 3.25 | ppl 25.813 +| epoch 4 step 42200 | 7790 batches | lr 9.05e-05 | ms/batch 671.48 | loss 3.28 | ppl 26.654 +| epoch 4 step 42400 | 7990 batches | lr 8.6e-05 | ms/batch 671.27 | loss 3.28 | ppl 26.600 +| epoch 4 step 42600 | 8190 batches | lr 8.16e-05 | ms/batch 673.39 | loss 3.27 | ppl 26.227 +| epoch 4 step 42800 | 8390 batches | lr 7.73e-05 | ms/batch 673.21 | loss 3.29 | ppl 26.959 +| epoch 4 step 43000 | 8590 batches | lr 7.32e-05 | ms/batch 675.70 | loss 3.27 | ppl 26.299 +| epoch 4 step 43200 | 8790 batches | lr 6.91e-05 | ms/batch 673.58 | loss 3.29 | ppl 26.749 +| epoch 4 step 43400 | 8990 batches | lr 6.52e-05 | ms/batch 673.15 | loss 3.28 | ppl 26.451 +| epoch 4 step 43600 | 9190 batches | lr 6.13e-05 | ms/batch 671.88 | loss 3.26 | ppl 26.136 +| epoch 4 step 43800 | 9390 batches | lr 5.76e-05 | ms/batch 673.32 | loss 3.28 | ppl 26.443 +| epoch 4 step 44000 | 9590 batches | lr 5.4e-05 | ms/batch 662.94 | loss 3.29 | ppl 26.910 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 2697.59s | valid loss 3.25 | valid ppl 25.763 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 5.05e-05 | ms/batch 740.81 | loss 3.27 | ppl 26.191 +| epoch 4 step 44400 | 9990 batches | lr 4.71e-05 | ms/batch 672.14 | loss 3.26 | ppl 26.166 +| epoch 4 step 44600 | 10190 batches | lr 4.38e-05 | ms/batch 670.84 | loss 3.26 | ppl 26.037 +| epoch 4 step 44800 | 10390 batches | lr 4.07e-05 | ms/batch 672.90 | loss 3.26 | ppl 26.088 +| epoch 4 step 45000 | 10590 batches | lr 3.76e-05 | ms/batch 673.66 | loss 3.29 | ppl 26.884 +| epoch 4 step 45200 | 10790 batches | lr 3.47e-05 | ms/batch 672.88 | loss 3.24 | ppl 25.586 +| epoch 4 step 45400 | 10990 batches | lr 3.19e-05 | ms/batch 671.20 | loss 3.28 | ppl 26.487 +| epoch 4 step 45600 | 11190 batches | lr 2.92e-05 | ms/batch 674.06 | loss 3.28 | ppl 26.688 +| epoch 4 step 45800 | 11390 batches | lr 2.66e-05 | ms/batch 670.83 | loss 3.28 | ppl 26.449 +| epoch 5 step 46000 | 120 batches | lr 2.41e-05 | ms/batch 671.63 | loss 3.26 | ppl 26.029 +| epoch 5 step 46200 | 320 batches | lr 2.18e-05 | ms/batch 675.05 | loss 3.24 | ppl 25.647 +| epoch 5 step 46400 | 520 batches | lr 1.96e-05 | 
ms/batch 671.64 | loss 3.28 | ppl 26.462 +| epoch 5 step 46600 | 720 batches | lr 1.75e-05 | ms/batch 674.85 | loss 3.24 | ppl 25.535 +| epoch 5 step 46800 | 920 batches | lr 1.55e-05 | ms/batch 672.46 | loss 3.24 | ppl 25.522 +| epoch 5 step 47000 | 1120 batches | lr 1.36e-05 | ms/batch 672.98 | loss 3.28 | ppl 26.567 +| epoch 5 step 47200 | 1320 batches | lr 1.19e-05 | ms/batch 669.86 | loss 3.24 | ppl 25.624 +| epoch 5 step 47400 | 1520 batches | lr 1.02e-05 | ms/batch 673.34 | loss 3.25 | ppl 25.746 +| epoch 5 step 47600 | 1720 batches | lr 8.72e-06 | ms/batch 673.91 | loss 3.24 | ppl 25.514 +| epoch 5 step 47800 | 1920 batches | lr 7.33e-06 | ms/batch 672.36 | loss 3.27 | ppl 26.267 +| epoch 5 step 48000 | 2120 batches | lr 6.06e-06 | ms/batch 663.53 | loss 3.29 | ppl 26.743 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 2697.55s | valid loss 3.24 | valid ppl 25.471 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 48200 | 2320 batches | lr 4.91e-06 | ms/batch 739.34 | loss 3.27 | ppl 26.196 +| epoch 5 step 48400 | 2520 batches | lr 3.88e-06 | ms/batch 674.08 | loss 3.25 | ppl 25.864 +| epoch 5 step 48600 | 2720 batches | lr 2.97e-06 | ms/batch 672.56 | loss 3.24 | ppl 25.526 +| epoch 5 step 48800 | 2920 batches | lr 2.18e-06 | ms/batch 672.85 | loss 3.23 | ppl 25.302 +| epoch 5 step 49000 | 3120 batches | lr 1.52e-06 | ms/batch 673.40 | loss 3.25 | ppl 25.757 +| epoch 5 step 49200 | 3320 batches | lr 9.71e-07 | ms/batch 672.09 | loss 3.27 | ppl 26.197 +| epoch 5 step 49400 | 3520 batches | lr 5.46e-07 | ms/batch 670.25 | loss 3.23 | ppl 25.175 +| epoch 5 step 49600 | 3720 batches | lr 2.43e-07 | ms/batch 673.34 | loss 3.25 | ppl 25.791 +| epoch 5 step 49800 | 3920 batches | lr 6.07e-08 | ms/batch 670.68 | loss 3.25 | ppl 25.720 +| epoch 5 step 50000 | 4120 batches | lr 0 | ms/batch 475.96 | loss 3.25 | ppl 25.749 +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.27 | test ppl 26.217 +==================================================================================================== diff --git a/NLP/Transformer-XL/exp_results/log-adam.txt b/NLP/Transformer-XL/exp_results/log-adam.txt new file mode 100644 index 0000000..34c52c6 --- /dev/null +++ b/NLP/Transformer-XL/exp_results/log-adam.txt @@ -0,0 +1,1224 @@ +==================================================================================================== + - data : /root/autodl-tmp/data/wikitext-103/ + - dataset : wt103 + - n_layer : 16 + - n_head : 10 + - d_head : 41 + - d_embed : 410 + - d_model : 410 + - d_inner : 2100 + - dropout : 0.1 + - dropatt : 0.0 + - init : normal + - emb_init : normal + - init_range : 0.1 + - emb_init_range : 0.01 + - init_std : 0.02 + - proj_init_std : 0.01 + - optim : adam + - lr : 0.00025 + - wd : 0.02 + - mom : 0.0 + - scheduler : cosine + - warmup_step : 0 + - decay_rate : 0.5 + - lr_min : 0.0 + - clip : 0.25 + - clip_nonemb : False + - max_step : 200000 + - batch_size : 60 + - batch_chunk : 1 + - tgt_len : 150 + - eval_tgt_len : 150 + - ext_len : 0 + - mem_len : 150 + - not_tied : False + - seed : 1111 + - cuda : True + - adaptive : True + - div_val : 1 + - pre_lnorm : False + - varlen : False + - multi_gpu : True + - log_interval : 200 + - 
eval_interval : 4000 + - work_dir : /root/autodl-tmp/-wt103/20220810-185417 + - restart : False + - restart_dir : + - debug : False + - same_length : False + - attn_type : 0 + - clamp_len : -1 + - eta_min : 0.0 + - gpu0_bsz : 4 + - max_eval_steps : -1 + - sample_softmax : -1 + - patience : 0 + - finetune_v2 : False + - finetune_v3 : False + - fp16 : False + - static_loss_scale : 1 + - dynamic_loss_scale : False + - opt_betas : None + - tied : True + - n_token : 267735 + - n_all_param : 151107538 + - n_nonemb_param : 41066400 +==================================================================================================== +#params = 151107538 +#non emb params = 41066400 +| epoch 1 step 200 | 200 batches | lr 0.00025 | ms/batch 764.49 | loss 6.97 | ppl 1066.907 +| epoch 1 step 400 | 400 batches | lr 0.00025 | ms/batch 687.98 | loss 6.03 | ppl 417.069 +| epoch 1 step 600 | 600 batches | lr 0.00025 | ms/batch 683.07 | loss 5.69 | ppl 297.083 +| epoch 1 step 800 | 800 batches | lr 0.00025 | ms/batch 723.35 | loss 5.49 | ppl 241.413 +| epoch 1 step 1000 | 1000 batches | lr 0.00025 | ms/batch 694.77 | loss 5.30 | ppl 199.605 +| epoch 1 step 1200 | 1200 batches | lr 0.00025 | ms/batch 677.41 | loss 5.17 | ppl 176.453 +| epoch 1 step 1400 | 1400 batches | lr 0.00025 | ms/batch 677.36 | loss 5.07 | ppl 159.156 +| epoch 1 step 1600 | 1600 batches | lr 0.00025 | ms/batch 638.81 | loss 4.98 | ppl 145.306 +| epoch 1 step 1800 | 1800 batches | lr 0.00025 | ms/batch 383.71 | loss 4.91 | ppl 136.268 +| epoch 1 step 2000 | 2000 batches | lr 0.00025 | ms/batch 382.65 | loss 4.85 | ppl 127.951 +| epoch 1 step 2200 | 2200 batches | lr 0.00025 | ms/batch 382.54 | loss 4.78 | ppl 119.484 +| epoch 1 step 2400 | 2400 batches | lr 0.00025 | ms/batch 382.40 | loss 4.73 | ppl 113.765 +| epoch 1 step 2600 | 2600 batches | lr 0.00025 | ms/batch 384.26 | loss 4.68 | ppl 107.611 +| epoch 1 step 2800 | 2800 batches | lr 0.00025 | ms/batch 382.49 | loss 4.63 | ppl 102.007 +| epoch 1 step 3000 | 3000 batches | lr 0.00025 | ms/batch 383.20 | loss 4.60 | ppl 99.044 +| epoch 1 step 3200 | 3200 batches | lr 0.00025 | ms/batch 382.09 | loss 4.55 | ppl 94.494 +| epoch 1 step 3400 | 3400 batches | lr 0.00025 | ms/batch 382.43 | loss 4.52 | ppl 91.563 +| epoch 1 step 3600 | 3600 batches | lr 0.00025 | ms/batch 382.40 | loss 4.45 | ppl 85.252 +| epoch 1 step 3800 | 3800 batches | lr 0.00025 | ms/batch 382.46 | loss 4.49 | ppl 88.831 +| epoch 1 step 4000 | 4000 batches | lr 0.00025 | ms/batch 382.79 | loss 4.45 | ppl 85.701 +---------------------------------------------------------------------------------------------------- +| Eval 1 at step 4000 | time: 2034.38s | valid loss 4.28 | valid ppl 72.551 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 4200 | 4200 batches | lr 0.00025 | ms/batch 425.25 | loss 4.40 | ppl 81.592 +| epoch 1 step 4400 | 4400 batches | lr 0.00025 | ms/batch 382.45 | loss 4.38 | ppl 80.012 +| epoch 1 step 4600 | 4600 batches | lr 0.00025 | ms/batch 381.95 | loss 4.36 | ppl 78.430 +| epoch 1 step 4800 | 4800 batches | lr 0.00025 | ms/batch 383.26 | loss 4.31 | ppl 74.659 +| epoch 1 step 5000 | 5000 batches | lr 0.00025 | ms/batch 382.36 | loss 4.35 | ppl 77.294 +| epoch 1 step 5200 | 5200 batches | lr 0.00025 | ms/batch 383.05 | loss 4.29 | ppl 73.083 +| epoch 1 step 5400 | 5400 batches | lr 0.00025 | ms/batch 382.53 | loss 4.24 | ppl 69.188 +| epoch 1 step 5600 | 5600 batches | lr 0.00025 | ms/batch 382.05 | loss 4.26 | ppl 70.726 +| epoch 
1 step 5800 | 5800 batches | lr 0.000249 | ms/batch 383.48 | loss 4.26 | ppl 70.533 +| epoch 1 step 6000 | 6000 batches | lr 0.000249 | ms/batch 382.63 | loss 4.21 | ppl 67.321 +| epoch 1 step 6200 | 6200 batches | lr 0.000249 | ms/batch 382.38 | loss 4.18 | ppl 65.667 +| epoch 1 step 6400 | 6400 batches | lr 0.000249 | ms/batch 382.63 | loss 4.22 | ppl 68.112 +| epoch 1 step 6600 | 6600 batches | lr 0.000249 | ms/batch 383.94 | loss 4.15 | ppl 63.675 +| epoch 1 step 6800 | 6800 batches | lr 0.000249 | ms/batch 383.22 | loss 4.15 | ppl 63.453 +| epoch 1 step 7000 | 7000 batches | lr 0.000249 | ms/batch 382.85 | loss 4.15 | ppl 63.563 +| epoch 1 step 7200 | 7200 batches | lr 0.000249 | ms/batch 383.21 | loss 4.10 | ppl 60.547 +| epoch 1 step 7400 | 7400 batches | lr 0.000249 | ms/batch 382.26 | loss 4.10 | ppl 60.203 +| epoch 1 step 7600 | 7600 batches | lr 0.000249 | ms/batch 382.51 | loss 4.08 | ppl 58.953 +| epoch 1 step 7800 | 7800 batches | lr 0.000249 | ms/batch 382.04 | loss 4.10 | ppl 60.279 +| epoch 1 step 8000 | 8000 batches | lr 0.000249 | ms/batch 382.26 | loss 4.09 | ppl 59.987 +---------------------------------------------------------------------------------------------------- +| Eval 2 at step 8000 | time: 1537.11s | valid loss 3.92 | valid ppl 50.244 +---------------------------------------------------------------------------------------------------- +| epoch 1 step 8200 | 8200 batches | lr 0.000249 | ms/batch 426.91 | loss 4.07 | ppl 58.474 +| epoch 1 step 8400 | 8400 batches | lr 0.000249 | ms/batch 382.09 | loss 4.08 | ppl 58.943 +| epoch 1 step 8600 | 8600 batches | lr 0.000249 | ms/batch 383.51 | loss 4.06 | ppl 57.842 +| epoch 1 step 8800 | 8800 batches | lr 0.000249 | ms/batch 383.16 | loss 4.07 | ppl 58.371 +| epoch 1 step 9000 | 9000 batches | lr 0.000249 | ms/batch 382.59 | loss 4.03 | ppl 56.484 +| epoch 1 step 9200 | 9200 batches | lr 0.000249 | ms/batch 383.24 | loss 4.02 | ppl 55.887 +| epoch 1 step 9400 | 9400 batches | lr 0.000249 | ms/batch 382.44 | loss 4.03 | ppl 56.143 +| epoch 1 step 9600 | 9600 batches | lr 0.000249 | ms/batch 382.34 | loss 4.04 | ppl 56.989 +| epoch 1 step 9800 | 9800 batches | lr 0.000249 | ms/batch 382.46 | loss 4.00 | ppl 54.426 +| epoch 1 step 10000 | 10000 batches | lr 0.000248 | ms/batch 383.27 | loss 4.01 | ppl 55.195 +| epoch 1 step 10200 | 10200 batches | lr 0.000248 | ms/batch 382.34 | loss 3.98 | ppl 53.358 +| epoch 1 step 10400 | 10400 batches | lr 0.000248 | ms/batch 382.68 | loss 3.97 | ppl 53.066 +| epoch 1 step 10600 | 10600 batches | lr 0.000248 | ms/batch 382.80 | loss 3.99 | ppl 54.306 +| epoch 1 step 10800 | 10800 batches | lr 0.000248 | ms/batch 384.05 | loss 3.95 | ppl 51.980 +| epoch 1 step 11000 | 11000 batches | lr 0.000248 | ms/batch 382.48 | loss 3.99 | ppl 54.189 +| epoch 1 step 11200 | 11200 batches | lr 0.000248 | ms/batch 382.43 | loss 3.97 | ppl 52.836 +| epoch 1 step 11400 | 11400 batches | lr 0.000248 | ms/batch 382.62 | loss 3.96 | ppl 52.684 +| epoch 2 step 11600 | 130 batches | lr 0.000248 | ms/batch 384.77 | loss 3.93 | ppl 50.757 +| epoch 2 step 11800 | 330 batches | lr 0.000248 | ms/batch 384.18 | loss 3.89 | ppl 48.921 +| epoch 2 step 12000 | 530 batches | lr 0.000248 | ms/batch 382.18 | loss 3.91 | ppl 49.890 +---------------------------------------------------------------------------------------------------- +| Eval 3 at step 12000 | time: 1537.95s | valid loss 3.77 | valid ppl 43.379 +---------------------------------------------------------------------------------------------------- +| epoch 
2 step 12200 | 730 batches | lr 0.000248 | ms/batch 426.96 | loss 3.88 | ppl 48.351 +| epoch 2 step 12400 | 930 batches | lr 0.000248 | ms/batch 382.32 | loss 3.88 | ppl 48.358 +| epoch 2 step 12600 | 1130 batches | lr 0.000248 | ms/batch 382.56 | loss 3.90 | ppl 49.504 +| epoch 2 step 12800 | 1330 batches | lr 0.000247 | ms/batch 383.00 | loss 3.87 | ppl 47.881 +| epoch 2 step 13000 | 1530 batches | lr 0.000247 | ms/batch 384.66 | loss 3.86 | ppl 47.436 +| epoch 2 step 13200 | 1730 batches | lr 0.000247 | ms/batch 385.68 | loss 3.85 | ppl 47.200 +| epoch 2 step 13400 | 1930 batches | lr 0.000247 | ms/batch 385.97 | loss 3.86 | ppl 47.400 +| epoch 2 step 13600 | 2130 batches | lr 0.000247 | ms/batch 387.10 | loss 3.88 | ppl 48.414 +| epoch 2 step 13800 | 2330 batches | lr 0.000247 | ms/batch 387.55 | loss 3.85 | ppl 47.186 +| epoch 2 step 14000 | 2530 batches | lr 0.000247 | ms/batch 385.67 | loss 3.84 | ppl 46.648 +| epoch 2 step 14200 | 2730 batches | lr 0.000247 | ms/batch 385.10 | loss 3.82 | ppl 45.693 +| epoch 2 step 14400 | 2930 batches | lr 0.000247 | ms/batch 385.39 | loss 3.81 | ppl 45.134 +| epoch 2 step 14600 | 3130 batches | lr 0.000247 | ms/batch 386.09 | loss 3.82 | ppl 45.500 +| epoch 2 step 14800 | 3330 batches | lr 0.000247 | ms/batch 385.83 | loss 3.82 | ppl 45.721 +| epoch 2 step 15000 | 3530 batches | lr 0.000247 | ms/batch 384.09 | loss 3.78 | ppl 43.946 +| epoch 2 step 15200 | 3730 batches | lr 0.000246 | ms/batch 385.04 | loss 3.81 | ppl 45.324 +| epoch 2 step 15400 | 3930 batches | lr 0.000246 | ms/batch 384.82 | loss 3.81 | ppl 44.927 +| epoch 2 step 15600 | 4130 batches | lr 0.000246 | ms/batch 385.06 | loss 3.79 | ppl 44.331 +| epoch 2 step 15800 | 4330 batches | lr 0.000246 | ms/batch 384.90 | loss 3.80 | ppl 44.771 +| epoch 2 step 16000 | 4530 batches | lr 0.000246 | ms/batch 386.44 | loss 3.80 | ppl 44.784 +---------------------------------------------------------------------------------------------------- +| Eval 4 at step 16000 | time: 1546.41s | valid loss 3.65 | valid ppl 38.633 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 16200 | 4730 batches | lr 0.000246 | ms/batch 429.10 | loss 3.76 | ppl 42.832 +| epoch 2 step 16400 | 4930 batches | lr 0.000246 | ms/batch 386.10 | loss 3.78 | ppl 43.794 +| epoch 2 step 16600 | 5130 batches | lr 0.000246 | ms/batch 386.13 | loss 3.77 | ppl 43.324 +| epoch 2 step 16800 | 5330 batches | lr 0.000246 | ms/batch 385.77 | loss 3.76 | ppl 42.944 +| epoch 2 step 17000 | 5530 batches | lr 0.000246 | ms/batch 384.98 | loss 3.74 | ppl 42.284 +| epoch 2 step 17200 | 5730 batches | lr 0.000245 | ms/batch 384.86 | loss 3.76 | ppl 43.149 +| epoch 2 step 17400 | 5930 batches | lr 0.000245 | ms/batch 385.57 | loss 3.75 | ppl 42.421 +| epoch 2 step 17600 | 6130 batches | lr 0.000245 | ms/batch 385.85 | loss 3.74 | ppl 42.025 +| epoch 2 step 17800 | 6330 batches | lr 0.000245 | ms/batch 386.39 | loss 3.77 | ppl 43.312 +| epoch 2 step 18000 | 6530 batches | lr 0.000245 | ms/batch 386.91 | loss 3.71 | ppl 40.843 +| epoch 2 step 18200 | 6730 batches | lr 0.000245 | ms/batch 385.35 | loss 3.72 | ppl 41.108 +| epoch 2 step 18400 | 6930 batches | lr 0.000245 | ms/batch 383.48 | loss 3.73 | ppl 41.559 +| epoch 2 step 18600 | 7130 batches | lr 0.000245 | ms/batch 383.69 | loss 3.70 | ppl 40.583 +| epoch 2 step 18800 | 7330 batches | lr 0.000245 | ms/batch 382.21 | loss 3.68 | ppl 39.788 +| epoch 2 step 19000 | 7530 batches | lr 0.000244 | ms/batch 382.49 | loss 3.71 | ppl 
40.743 +| epoch 2 step 19200 | 7730 batches | lr 0.000244 | ms/batch 381.98 | loss 3.71 | ppl 40.765 +| epoch 2 step 19400 | 7930 batches | lr 0.000244 | ms/batch 382.74 | loss 3.70 | ppl 40.560 +| epoch 2 step 19600 | 8130 batches | lr 0.000244 | ms/batch 382.31 | loss 3.71 | ppl 41.029 +| epoch 2 step 19800 | 8330 batches | lr 0.000244 | ms/batch 383.90 | loss 3.70 | ppl 40.507 +| epoch 2 step 20000 | 8530 batches | lr 0.000244 | ms/batch 382.56 | loss 3.69 | ppl 40.172 +---------------------------------------------------------------------------------------------------- +| Eval 5 at step 20000 | time: 1543.91s | valid loss 3.58 | valid ppl 36.050 +---------------------------------------------------------------------------------------------------- +| epoch 2 step 20200 | 8730 batches | lr 0.000244 | ms/batch 426.51 | loss 3.71 | ppl 40.844 +| epoch 2 step 20400 | 8930 batches | lr 0.000244 | ms/batch 382.52 | loss 3.71 | ppl 40.678 +| epoch 2 step 20600 | 9130 batches | lr 0.000244 | ms/batch 382.75 | loss 3.70 | ppl 40.294 +| epoch 2 step 20800 | 9330 batches | lr 0.000243 | ms/batch 382.10 | loss 3.69 | ppl 39.944 +| epoch 2 step 21000 | 9530 batches | lr 0.000243 | ms/batch 382.83 | loss 3.73 | ppl 41.725 +| epoch 2 step 21200 | 9730 batches | lr 0.000243 | ms/batch 381.82 | loss 3.68 | ppl 39.593 +| epoch 2 step 21400 | 9930 batches | lr 0.000243 | ms/batch 382.79 | loss 3.69 | ppl 40.048 +| epoch 2 step 21600 | 10130 batches | lr 0.000243 | ms/batch 381.93 | loss 3.68 | ppl 39.454 +| epoch 2 step 21800 | 10330 batches | lr 0.000243 | ms/batch 382.28 | loss 3.68 | ppl 39.787 +| epoch 2 step 22000 | 10530 batches | lr 0.000243 | ms/batch 382.05 | loss 3.70 | ppl 40.356 +| epoch 2 step 22200 | 10730 batches | lr 0.000242 | ms/batch 382.76 | loss 3.66 | ppl 39.021 +| epoch 2 step 22400 | 10930 batches | lr 0.000242 | ms/batch 381.75 | loss 3.66 | ppl 39.049 +| epoch 2 step 22600 | 11130 batches | lr 0.000242 | ms/batch 384.69 | loss 3.71 | ppl 40.838 +| epoch 2 step 22800 | 11330 batches | lr 0.000242 | ms/batch 381.62 | loss 3.67 | ppl 39.428 +| epoch 3 step 23000 | 60 batches | lr 0.000242 | ms/batch 381.30 | loss 3.68 | ppl 39.482 +| epoch 3 step 23200 | 260 batches | lr 0.000242 | ms/batch 382.06 | loss 3.62 | ppl 37.256 +| epoch 3 step 23400 | 460 batches | lr 0.000242 | ms/batch 383.57 | loss 3.66 | ppl 38.850 +| epoch 3 step 23600 | 660 batches | lr 0.000242 | ms/batch 381.67 | loss 3.62 | ppl 37.381 +| epoch 3 step 23800 | 860 batches | lr 0.000241 | ms/batch 383.06 | loss 3.66 | ppl 38.722 +| epoch 3 step 24000 | 1060 batches | lr 0.000241 | ms/batch 382.42 | loss 3.64 | ppl 38.178 +---------------------------------------------------------------------------------------------------- +| Eval 6 at step 24000 | time: 1535.94s | valid loss 3.54 | valid ppl 34.412 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 24200 | 1260 batches | lr 0.000241 | ms/batch 426.42 | loss 3.63 | ppl 37.832 +| epoch 3 step 24400 | 1460 batches | lr 0.000241 | ms/batch 383.25 | loss 3.63 | ppl 37.748 +| epoch 3 step 24600 | 1660 batches | lr 0.000241 | ms/batch 382.90 | loss 3.62 | ppl 37.471 +| epoch 3 step 24800 | 1860 batches | lr 0.000241 | ms/batch 382.79 | loss 3.63 | ppl 37.761 +| epoch 3 step 25000 | 2060 batches | lr 0.00024 | ms/batch 383.41 | loss 3.67 | ppl 39.280 +| epoch 3 step 25200 | 2260 batches | lr 0.00024 | ms/batch 382.61 | loss 3.64 | ppl 38.232 +| epoch 3 step 25400 | 2460 batches | lr 0.00024 | ms/batch 382.20 | 
loss 3.63 | ppl 37.701 +| epoch 3 step 25600 | 2660 batches | lr 0.00024 | ms/batch 382.62 | loss 3.63 | ppl 37.828 +| epoch 3 step 25800 | 2860 batches | lr 0.00024 | ms/batch 382.53 | loss 3.58 | ppl 35.716 +| epoch 3 step 26000 | 3060 batches | lr 0.00024 | ms/batch 382.55 | loss 3.63 | ppl 37.634 +| epoch 3 step 26200 | 3260 batches | lr 0.00024 | ms/batch 382.81 | loss 3.62 | ppl 37.520 +| epoch 3 step 26400 | 3460 batches | lr 0.000239 | ms/batch 384.69 | loss 3.59 | ppl 36.219 +| epoch 3 step 26600 | 3660 batches | lr 0.000239 | ms/batch 382.44 | loss 3.60 | ppl 36.700 +| epoch 3 step 26800 | 3860 batches | lr 0.000239 | ms/batch 382.15 | loss 3.61 | ppl 36.900 +| epoch 3 step 27000 | 4060 batches | lr 0.000239 | ms/batch 382.14 | loss 3.62 | ppl 37.292 +| epoch 3 step 27200 | 4260 batches | lr 0.000239 | ms/batch 383.17 | loss 3.61 | ppl 36.796 +| epoch 3 step 27400 | 4460 batches | lr 0.000239 | ms/batch 382.18 | loss 3.61 | ppl 36.903 +| epoch 3 step 27600 | 4660 batches | lr 0.000238 | ms/batch 382.49 | loss 3.60 | ppl 36.548 +| epoch 3 step 27800 | 4860 batches | lr 0.000238 | ms/batch 381.75 | loss 3.59 | ppl 36.199 +| epoch 3 step 28000 | 5060 batches | lr 0.000238 | ms/batch 382.08 | loss 3.60 | ppl 36.657 +---------------------------------------------------------------------------------------------------- +| Eval 7 at step 28000 | time: 1536.83s | valid loss 3.50 | valid ppl 33.127 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 28200 | 5260 batches | lr 0.000238 | ms/batch 426.01 | loss 3.58 | ppl 36.005 +| epoch 3 step 28400 | 5460 batches | lr 0.000238 | ms/batch 382.73 | loss 3.56 | ppl 35.230 +| epoch 3 step 28600 | 5660 batches | lr 0.000238 | ms/batch 382.35 | loss 3.61 | ppl 36.999 +| epoch 3 step 28800 | 5860 batches | lr 0.000237 | ms/batch 382.16 | loss 3.58 | ppl 35.999 +| epoch 3 step 29000 | 6060 batches | lr 0.000237 | ms/batch 382.25 | loss 3.58 | ppl 35.815 +| epoch 3 step 29200 | 6260 batches | lr 0.000237 | ms/batch 382.26 | loss 3.58 | ppl 35.851 +| epoch 3 step 29400 | 6460 batches | lr 0.000237 | ms/batch 383.97 | loss 3.59 | ppl 36.178 +| epoch 3 step 29600 | 6660 batches | lr 0.000237 | ms/batch 382.68 | loss 3.54 | ppl 34.313 +| epoch 3 step 29800 | 6860 batches | lr 0.000237 | ms/batch 382.70 | loss 3.57 | ppl 35.428 +| epoch 3 step 30000 | 7060 batches | lr 0.000236 | ms/batch 384.33 | loss 3.56 | ppl 35.112 +| epoch 3 step 30200 | 7260 batches | lr 0.000236 | ms/batch 382.75 | loss 3.53 | ppl 34.109 +| epoch 3 step 30400 | 7460 batches | lr 0.000236 | ms/batch 382.94 | loss 3.55 | ppl 34.943 +| epoch 3 step 30600 | 7660 batches | lr 0.000236 | ms/batch 384.39 | loss 3.54 | ppl 34.438 +| epoch 3 step 30800 | 7860 batches | lr 0.000236 | ms/batch 382.63 | loss 3.55 | ppl 34.942 +| epoch 3 step 31000 | 8060 batches | lr 0.000235 | ms/batch 384.05 | loss 3.56 | ppl 35.184 +| epoch 3 step 31200 | 8260 batches | lr 0.000235 | ms/batch 382.68 | loss 3.55 | ppl 34.799 +| epoch 3 step 31400 | 8460 batches | lr 0.000235 | ms/batch 382.61 | loss 3.56 | ppl 35.170 +| epoch 3 step 31600 | 8660 batches | lr 0.000235 | ms/batch 382.17 | loss 3.56 | ppl 35.065 +| epoch 3 step 31800 | 8860 batches | lr 0.000235 | ms/batch 382.49 | loss 3.56 | ppl 35.131 +| epoch 3 step 32000 | 9060 batches | lr 0.000235 | ms/batch 382.24 | loss 3.56 | ppl 35.142 +---------------------------------------------------------------------------------------------------- +| Eval 8 at step 32000 | time: 1537.58s | valid 
loss 3.46 | valid ppl 31.818 +---------------------------------------------------------------------------------------------------- +| epoch 3 step 32200 | 9260 batches | lr 0.000234 | ms/batch 426.15 | loss 3.54 | ppl 34.637 +| epoch 3 step 32400 | 9460 batches | lr 0.000234 | ms/batch 383.26 | loss 3.57 | ppl 35.490 +| epoch 3 step 32600 | 9660 batches | lr 0.000234 | ms/batch 382.25 | loss 3.57 | ppl 35.516 +| epoch 3 step 32800 | 9860 batches | lr 0.000234 | ms/batch 382.36 | loss 3.52 | ppl 33.934 +| epoch 3 step 33000 | 10060 batches | lr 0.000234 | ms/batch 382.17 | loss 3.58 | ppl 35.722 +| epoch 3 step 33200 | 10260 batches | lr 0.000233 | ms/batch 382.47 | loss 3.52 | ppl 33.869 +| epoch 3 step 33400 | 10460 batches | lr 0.000233 | ms/batch 383.24 | loss 3.56 | ppl 35.052 +| epoch 3 step 33600 | 10660 batches | lr 0.000233 | ms/batch 382.21 | loss 3.57 | ppl 35.355 +| epoch 3 step 33800 | 10860 batches | lr 0.000233 | ms/batch 382.50 | loss 3.52 | ppl 33.700 +| epoch 3 step 34000 | 11060 batches | lr 0.000233 | ms/batch 382.55 | loss 3.56 | ppl 35.290 +| epoch 3 step 34200 | 11260 batches | lr 0.000232 | ms/batch 382.62 | loss 3.57 | ppl 35.557 +| epoch 3 step 34400 | 11460 batches | lr 0.000232 | ms/batch 382.65 | loss 3.54 | ppl 34.550 +| epoch 4 step 34600 | 190 batches | lr 0.000232 | ms/batch 381.14 | loss 3.51 | ppl 33.420 +| epoch 4 step 34800 | 390 batches | lr 0.000232 | ms/batch 381.97 | loss 3.52 | ppl 33.787 +| epoch 4 step 35000 | 590 batches | lr 0.000232 | ms/batch 382.60 | loss 3.51 | ppl 33.552 +| epoch 4 step 35200 | 790 batches | lr 0.000231 | ms/batch 385.96 | loss 3.53 | ppl 34.089 +| epoch 4 step 35400 | 990 batches | lr 0.000231 | ms/batch 382.69 | loss 3.51 | ppl 33.374 +| epoch 4 step 35600 | 1190 batches | lr 0.000231 | ms/batch 382.30 | loss 3.53 | ppl 34.051 +| epoch 4 step 35800 | 1390 batches | lr 0.000231 | ms/batch 382.36 | loss 3.52 | ppl 33.694 +| epoch 4 step 36000 | 1590 batches | lr 0.000231 | ms/batch 382.00 | loss 3.51 | ppl 33.320 +---------------------------------------------------------------------------------------------------- +| Eval 9 at step 36000 | time: 1536.56s | valid loss 3.44 | valid ppl 31.250 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 36200 | 1790 batches | lr 0.00023 | ms/batch 426.70 | loss 3.52 | ppl 33.653 +| epoch 4 step 36400 | 1990 batches | lr 0.00023 | ms/batch 382.33 | loss 3.54 | ppl 34.638 +| epoch 4 step 36600 | 2190 batches | lr 0.00023 | ms/batch 383.26 | loss 3.53 | ppl 34.169 +| epoch 4 step 36800 | 2390 batches | lr 0.00023 | ms/batch 382.43 | loss 3.53 | ppl 34.156 +| epoch 4 step 37000 | 2590 batches | lr 0.000229 | ms/batch 383.03 | loss 3.51 | ppl 33.352 +| epoch 4 step 37200 | 2790 batches | lr 0.000229 | ms/batch 382.01 | loss 3.49 | ppl 32.825 +| epoch 4 step 37400 | 2990 batches | lr 0.000229 | ms/batch 382.88 | loss 3.51 | ppl 33.368 +| epoch 4 step 37600 | 3190 batches | lr 0.000229 | ms/batch 382.42 | loss 3.51 | ppl 33.417 +| epoch 4 step 37800 | 3390 batches | lr 0.000229 | ms/batch 382.74 | loss 3.51 | ppl 33.414 +| epoch 4 step 38000 | 3590 batches | lr 0.000228 | ms/batch 381.55 | loss 3.48 | ppl 32.456 +| epoch 4 step 38200 | 3790 batches | lr 0.000228 | ms/batch 386.35 | loss 3.50 | ppl 33.250 +| epoch 4 step 38400 | 3990 batches | lr 0.000228 | ms/batch 382.08 | loss 3.52 | ppl 33.648 +| epoch 4 step 38600 | 4190 batches | lr 0.000228 | ms/batch 382.31 | loss 3.50 | ppl 33.089 +| epoch 4 step 38800 | 4390 batches | 
lr 0.000227 | ms/batch 382.64 | loss 3.50 | ppl 33.248 +| epoch 4 step 39000 | 4590 batches | lr 0.000227 | ms/batch 383.65 | loss 3.52 | ppl 33.624 +| epoch 4 step 39200 | 4790 batches | lr 0.000227 | ms/batch 382.21 | loss 3.47 | ppl 32.242 +| epoch 4 step 39400 | 4990 batches | lr 0.000227 | ms/batch 382.62 | loss 3.52 | ppl 33.868 +| epoch 4 step 39600 | 5190 batches | lr 0.000227 | ms/batch 382.88 | loss 3.48 | ppl 32.418 +| epoch 4 step 39800 | 5390 batches | lr 0.000226 | ms/batch 382.21 | loss 3.46 | ppl 31.803 +| epoch 4 step 40000 | 5590 batches | lr 0.000226 | ms/batch 381.89 | loss 3.48 | ppl 32.611 +---------------------------------------------------------------------------------------------------- +| Eval 10 at step 40000 | time: 1537.11s | valid loss 3.42 | valid ppl 30.522 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 40200 | 5790 batches | lr 0.000226 | ms/batch 426.61 | loss 3.50 | ppl 33.271 +| epoch 4 step 40400 | 5990 batches | lr 0.000226 | ms/batch 382.10 | loss 3.48 | ppl 32.384 +| epoch 4 step 40600 | 6190 batches | lr 0.000225 | ms/batch 382.91 | loss 3.48 | ppl 32.374 +| epoch 4 step 40800 | 6390 batches | lr 0.000225 | ms/batch 382.15 | loss 3.51 | ppl 33.374 +| epoch 4 step 41000 | 6590 batches | lr 0.000225 | ms/batch 383.66 | loss 3.44 | ppl 31.217 +| epoch 4 step 41200 | 6790 batches | lr 0.000225 | ms/batch 382.20 | loss 3.47 | ppl 32.031 +| epoch 4 step 41400 | 6990 batches | lr 0.000224 | ms/batch 383.41 | loss 3.48 | ppl 32.533 +| epoch 4 step 41600 | 7190 batches | lr 0.000224 | ms/batch 382.45 | loss 3.43 | ppl 30.920 +| epoch 4 step 41800 | 7390 batches | lr 0.000224 | ms/batch 382.32 | loss 3.46 | ppl 31.829 +| epoch 4 step 42000 | 7590 batches | lr 0.000224 | ms/batch 382.28 | loss 3.44 | ppl 31.101 +| epoch 4 step 42200 | 7790 batches | lr 0.000224 | ms/batch 383.12 | loss 3.47 | ppl 32.066 +| epoch 4 step 42400 | 7990 batches | lr 0.000223 | ms/batch 382.94 | loss 3.47 | ppl 32.038 +| epoch 4 step 42600 | 8190 batches | lr 0.000223 | ms/batch 382.32 | loss 3.45 | ppl 31.633 +| epoch 4 step 42800 | 8390 batches | lr 0.000223 | ms/batch 384.01 | loss 3.48 | ppl 32.533 +| epoch 4 step 43000 | 8590 batches | lr 0.000223 | ms/batch 382.16 | loss 3.46 | ppl 31.763 +| epoch 4 step 43200 | 8790 batches | lr 0.000222 | ms/batch 382.60 | loss 3.48 | ppl 32.401 +| epoch 4 step 43400 | 8990 batches | lr 0.000222 | ms/batch 382.37 | loss 3.47 | ppl 31.981 +| epoch 4 step 43600 | 9190 batches | lr 0.000222 | ms/batch 382.48 | loss 3.46 | ppl 31.690 +| epoch 4 step 43800 | 9390 batches | lr 0.000222 | ms/batch 384.84 | loss 3.47 | ppl 32.016 +| epoch 4 step 44000 | 9590 batches | lr 0.000221 | ms/batch 382.36 | loss 3.49 | ppl 32.684 +---------------------------------------------------------------------------------------------------- +| Eval 11 at step 44000 | time: 1537.23s | valid loss 3.40 | valid ppl 29.815 +---------------------------------------------------------------------------------------------------- +| epoch 4 step 44200 | 9790 batches | lr 0.000221 | ms/batch 428.35 | loss 3.46 | ppl 31.782 +| epoch 4 step 44400 | 9990 batches | lr 0.000221 | ms/batch 382.90 | loss 3.46 | ppl 31.814 +| epoch 4 step 44600 | 10190 batches | lr 0.000221 | ms/batch 385.08 | loss 3.45 | ppl 31.522 +| epoch 4 step 44800 | 10390 batches | lr 0.00022 | ms/batch 382.88 | loss 3.45 | ppl 31.641 +| epoch 4 step 45000 | 10590 batches | lr 0.00022 | ms/batch 381.85 | loss 3.49 | ppl 32.665 +| epoch 4 step 
45200 | 10790 batches | lr 0.00022 | ms/batch 382.45 | loss 3.44 | ppl 31.149 +| epoch 4 step 45400 | 10990 batches | lr 0.00022 | ms/batch 382.05 | loss 3.47 | ppl 32.268 +| epoch 4 step 45600 | 11190 batches | lr 0.000219 | ms/batch 382.67 | loss 3.48 | ppl 32.483 +| epoch 4 step 45800 | 11390 batches | lr 0.000219 | ms/batch 383.04 | loss 3.47 | ppl 32.167 +| epoch 5 step 46000 | 120 batches | lr 0.000219 | ms/batch 381.34 | loss 3.45 | ppl 31.375 +| epoch 5 step 46200 | 320 batches | lr 0.000219 | ms/batch 383.01 | loss 3.43 | ppl 30.760 +| epoch 5 step 46400 | 520 batches | lr 0.000218 | ms/batch 382.83 | loss 3.46 | ppl 31.853 +| epoch 5 step 46600 | 720 batches | lr 0.000218 | ms/batch 382.75 | loss 3.42 | ppl 30.716 +| epoch 5 step 46800 | 920 batches | lr 0.000218 | ms/batch 382.52 | loss 3.43 | ppl 30.822 +| epoch 5 step 47000 | 1120 batches | lr 0.000217 | ms/batch 382.63 | loss 3.47 | ppl 32.008 +| epoch 5 step 47200 | 1320 batches | lr 0.000217 | ms/batch 382.45 | loss 3.43 | ppl 30.837 +| epoch 5 step 47400 | 1520 batches | lr 0.000217 | ms/batch 383.05 | loss 3.43 | ppl 31.007 +| epoch 5 step 47600 | 1720 batches | lr 0.000217 | ms/batch 382.51 | loss 3.43 | ppl 30.726 +| epoch 5 step 47800 | 1920 batches | lr 0.000216 | ms/batch 382.05 | loss 3.45 | ppl 31.615 +| epoch 5 step 48000 | 2120 batches | lr 0.000216 | ms/batch 383.67 | loss 3.47 | ppl 32.131 +---------------------------------------------------------------------------------------------------- +| Eval 12 at step 48000 | time: 1537.36s | valid loss 3.38 | valid ppl 29.286 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 48200 | 2320 batches | lr 0.000216 | ms/batch 426.18 | loss 3.45 | ppl 31.544 +| epoch 5 step 48400 | 2520 batches | lr 0.000216 | ms/batch 382.55 | loss 3.44 | ppl 31.092 +| epoch 5 step 48600 | 2720 batches | lr 0.000215 | ms/batch 383.24 | loss 3.42 | ppl 30.680 +| epoch 5 step 48800 | 2920 batches | lr 0.000215 | ms/batch 382.99 | loss 3.42 | ppl 30.430 +| epoch 5 step 49000 | 3120 batches | lr 0.000215 | ms/batch 382.66 | loss 3.44 | ppl 31.035 +| epoch 5 step 49200 | 3320 batches | lr 0.000214 | ms/batch 383.18 | loss 3.45 | ppl 31.405 +| epoch 5 step 49400 | 3520 batches | lr 0.000214 | ms/batch 382.78 | loss 3.41 | ppl 30.224 +| epoch 5 step 49600 | 3720 batches | lr 0.000214 | ms/batch 382.63 | loss 3.43 | ppl 31.025 +| epoch 5 step 49800 | 3920 batches | lr 0.000214 | ms/batch 382.76 | loss 3.43 | ppl 30.894 +| epoch 5 step 50000 | 4120 batches | lr 0.000213 | ms/batch 382.26 | loss 3.43 | ppl 30.885 +| epoch 5 step 50200 | 4320 batches | lr 0.000213 | ms/batch 382.89 | loss 3.44 | ppl 31.043 +| epoch 5 step 50400 | 4520 batches | lr 0.000213 | ms/batch 384.25 | loss 3.45 | ppl 31.416 +| epoch 5 step 50600 | 4720 batches | lr 0.000213 | ms/batch 382.92 | loss 3.41 | ppl 30.166 +| epoch 5 step 50800 | 4920 batches | lr 0.000212 | ms/batch 382.12 | loss 3.43 | ppl 30.728 +| epoch 5 step 51000 | 5120 batches | lr 0.000212 | ms/batch 382.48 | loss 3.42 | ppl 30.516 +| epoch 5 step 51200 | 5320 batches | lr 0.000212 | ms/batch 382.48 | loss 3.41 | ppl 30.393 +| epoch 5 step 51400 | 5520 batches | lr 0.000211 | ms/batch 383.12 | loss 3.41 | ppl 30.179 +| epoch 5 step 51600 | 5720 batches | lr 0.000211 | ms/batch 382.46 | loss 3.42 | ppl 30.587 +| epoch 5 step 51800 | 5920 batches | lr 0.000211 | ms/batch 382.88 | loss 3.42 | ppl 30.558 +| epoch 5 step 52000 | 6120 batches | lr 0.000211 | ms/batch 382.46 | loss 3.41 | ppl 30.275 
+---------------------------------------------------------------------------------------------------- +| Eval 13 at step 52000 | time: 1537.27s | valid loss 3.37 | valid ppl 29.135 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 52200 | 6320 batches | lr 0.00021 | ms/batch 427.28 | loss 3.44 | ppl 31.060 +| epoch 5 step 52400 | 6520 batches | lr 0.00021 | ms/batch 382.67 | loss 3.38 | ppl 29.347 +| epoch 5 step 52600 | 6720 batches | lr 0.00021 | ms/batch 384.93 | loss 3.39 | ppl 29.540 +| epoch 5 step 52800 | 6920 batches | lr 0.000209 | ms/batch 382.20 | loss 3.41 | ppl 30.174 +| epoch 5 step 53000 | 7120 batches | lr 0.000209 | ms/batch 384.43 | loss 3.40 | ppl 29.817 +| epoch 5 step 53200 | 7320 batches | lr 0.000209 | ms/batch 382.30 | loss 3.36 | ppl 28.910 +| epoch 5 step 53400 | 7520 batches | lr 0.000209 | ms/batch 383.00 | loss 3.39 | ppl 29.792 +| epoch 5 step 53600 | 7720 batches | lr 0.000208 | ms/batch 382.44 | loss 3.39 | ppl 29.660 +| epoch 5 step 53800 | 7920 batches | lr 0.000208 | ms/batch 382.02 | loss 3.39 | ppl 29.703 +| epoch 5 step 54000 | 8120 batches | lr 0.000208 | ms/batch 382.41 | loss 3.40 | ppl 30.079 +| epoch 5 step 54200 | 8320 batches | lr 0.000207 | ms/batch 382.90 | loss 3.40 | ppl 29.826 +| epoch 5 step 54400 | 8520 batches | lr 0.000207 | ms/batch 382.56 | loss 3.39 | ppl 29.573 +| epoch 5 step 54600 | 8720 batches | lr 0.000207 | ms/batch 382.32 | loss 3.40 | ppl 30.113 +| epoch 5 step 54800 | 8920 batches | lr 0.000206 | ms/batch 382.09 | loss 3.41 | ppl 30.261 +| epoch 5 step 55000 | 9120 batches | lr 0.000206 | ms/batch 383.65 | loss 3.40 | ppl 29.949 +| epoch 5 step 55200 | 9320 batches | lr 0.000206 | ms/batch 382.70 | loss 3.39 | ppl 29.722 +| epoch 5 step 55400 | 9520 batches | lr 0.000206 | ms/batch 382.58 | loss 3.42 | ppl 30.640 +| epoch 5 step 55600 | 9720 batches | lr 0.000205 | ms/batch 383.54 | loss 3.39 | ppl 29.772 +| epoch 5 step 55800 | 9920 batches | lr 0.000205 | ms/batch 382.56 | loss 3.40 | ppl 29.829 +| epoch 5 step 56000 | 10120 batches | lr 0.000205 | ms/batch 383.56 | loss 3.39 | ppl 29.737 +---------------------------------------------------------------------------------------------------- +| Eval 14 at step 56000 | time: 1537.89s | valid loss 3.35 | valid ppl 28.430 +---------------------------------------------------------------------------------------------------- +| epoch 5 step 56200 | 10320 batches | lr 0.000204 | ms/batch 429.52 | loss 3.40 | ppl 29.888 +| epoch 5 step 56400 | 10520 batches | lr 0.000204 | ms/batch 383.60 | loss 3.42 | ppl 30.470 +| epoch 5 step 56600 | 10720 batches | lr 0.000204 | ms/batch 382.22 | loss 3.38 | ppl 29.429 +| epoch 5 step 56800 | 10920 batches | lr 0.000203 | ms/batch 383.42 | loss 3.38 | ppl 29.378 +| epoch 5 step 57000 | 11120 batches | lr 0.000203 | ms/batch 382.26 | loss 3.44 | ppl 31.147 +| epoch 5 step 57200 | 11320 batches | lr 0.000203 | ms/batch 382.92 | loss 3.39 | ppl 29.724 +| epoch 6 step 57400 | 50 batches | lr 0.000203 | ms/batch 382.09 | loss 3.41 | ppl 30.289 +| epoch 6 step 57600 | 250 batches | lr 0.000202 | ms/batch 383.62 | loss 3.35 | ppl 28.598 +| epoch 6 step 57800 | 450 batches | lr 0.000202 | ms/batch 382.49 | loss 3.39 | ppl 29.762 +| epoch 6 step 58000 | 650 batches | lr 0.000202 | ms/batch 383.51 | loss 3.36 | ppl 28.802 +| epoch 6 step 58200 | 850 batches | lr 0.000201 | ms/batch 382.50 | loss 3.40 | ppl 29.984 +| epoch 6 step 58400 | 1050 batches | lr 0.000201 | ms/batch 386.57 | loss 
3.37 | ppl 29.208 +| epoch 6 step 58600 | 1250 batches | lr 0.000201 | ms/batch 383.06 | loss 3.37 | ppl 29.214 +| epoch 6 step 58800 | 1450 batches | lr 0.0002 | ms/batch 382.90 | loss 3.38 | ppl 29.414 +| epoch 6 step 59000 | 1650 batches | lr 0.0002 | ms/batch 381.99 | loss 3.36 | ppl 28.865 +| epoch 6 step 59200 | 1850 batches | lr 0.0002 | ms/batch 382.72 | loss 3.38 | ppl 29.336 +| epoch 6 step 59400 | 2050 batches | lr 0.000199 | ms/batch 382.45 | loss 3.42 | ppl 30.590 +| epoch 6 step 59600 | 2250 batches | lr 0.000199 | ms/batch 383.23 | loss 3.39 | ppl 29.581 +| epoch 6 step 59800 | 2450 batches | lr 0.000199 | ms/batch 382.01 | loss 3.39 | ppl 29.554 +| epoch 6 step 60000 | 2650 batches | lr 0.000198 | ms/batch 385.56 | loss 3.39 | ppl 29.556 +---------------------------------------------------------------------------------------------------- +| Eval 15 at step 60000 | time: 1539.02s | valid loss 3.34 | valid ppl 28.124 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 60200 | 2850 batches | lr 0.000198 | ms/batch 427.18 | loss 3.34 | ppl 28.084 +| epoch 6 step 60400 | 3050 batches | lr 0.000198 | ms/batch 382.74 | loss 3.38 | ppl 29.496 +| epoch 6 step 60600 | 3250 batches | lr 0.000198 | ms/batch 382.29 | loss 3.38 | ppl 29.316 +| epoch 6 step 60800 | 3450 batches | lr 0.000197 | ms/batch 383.43 | loss 3.36 | ppl 28.769 +| epoch 6 step 61000 | 3650 batches | lr 0.000197 | ms/batch 382.43 | loss 3.36 | ppl 28.811 +| epoch 6 step 61200 | 3850 batches | lr 0.000197 | ms/batch 383.71 | loss 3.37 | ppl 29.053 +| epoch 6 step 61400 | 4050 batches | lr 0.000196 | ms/batch 383.78 | loss 3.39 | ppl 29.601 +| epoch 6 step 61600 | 4250 batches | lr 0.000196 | ms/batch 382.55 | loss 3.37 | ppl 28.986 +| epoch 6 step 61800 | 4450 batches | lr 0.000196 | ms/batch 384.36 | loss 3.38 | ppl 29.261 +| epoch 6 step 62000 | 4650 batches | lr 0.000195 | ms/batch 382.85 | loss 3.37 | ppl 29.053 +| epoch 6 step 62200 | 4850 batches | lr 0.000195 | ms/batch 382.12 | loss 3.36 | ppl 28.773 +| epoch 6 step 62400 | 5050 batches | lr 0.000195 | ms/batch 382.25 | loss 3.37 | ppl 29.208 +| epoch 6 step 62600 | 5250 batches | lr 0.000194 | ms/batch 382.20 | loss 3.36 | ppl 28.811 +| epoch 6 step 62800 | 5450 batches | lr 0.000194 | ms/batch 383.91 | loss 3.34 | ppl 28.159 +| epoch 6 step 63000 | 5650 batches | lr 0.000194 | ms/batch 385.04 | loss 3.38 | ppl 29.398 +| epoch 6 step 63200 | 5850 batches | lr 0.000193 | ms/batch 381.98 | loss 3.36 | ppl 28.768 +| epoch 6 step 63400 | 6050 batches | lr 0.000193 | ms/batch 383.86 | loss 3.35 | ppl 28.541 +| epoch 6 step 63600 | 6250 batches | lr 0.000193 | ms/batch 383.24 | loss 3.36 | ppl 28.893 +| epoch 6 step 63800 | 6450 batches | lr 0.000192 | ms/batch 384.46 | loss 3.37 | ppl 28.936 +| epoch 6 step 64000 | 6650 batches | lr 0.000192 | ms/batch 383.12 | loss 3.31 | ppl 27.491 +---------------------------------------------------------------------------------------------------- +| Eval 16 at step 64000 | time: 1538.94s | valid loss 3.33 | valid ppl 27.945 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 64200 | 6850 batches | lr 0.000192 | ms/batch 426.87 | loss 3.35 | ppl 28.395 +| epoch 6 step 64400 | 7050 batches | lr 0.000191 | ms/batch 384.04 | loss 3.35 | ppl 28.397 +| epoch 6 step 64600 | 7250 batches | lr 0.000191 | ms/batch 383.26 | loss 3.31 | ppl 27.419 +| epoch 6 step 64800 | 7450 batches | lr 0.000191 | ms/batch 
382.49 | loss 3.34 | ppl 28.186 +| epoch 6 step 65000 | 7650 batches | lr 0.00019 | ms/batch 382.51 | loss 3.32 | ppl 27.650 +| epoch 6 step 65200 | 7850 batches | lr 0.00019 | ms/batch 382.66 | loss 3.34 | ppl 28.265 +| epoch 6 step 65400 | 8050 batches | lr 0.00019 | ms/batch 382.99 | loss 3.35 | ppl 28.415 +| epoch 6 step 65600 | 8250 batches | lr 0.000189 | ms/batch 382.01 | loss 3.33 | ppl 28.063 +| epoch 6 step 65800 | 8450 batches | lr 0.000189 | ms/batch 383.37 | loss 3.35 | ppl 28.493 +| epoch 6 step 66000 | 8650 batches | lr 0.000189 | ms/batch 382.16 | loss 3.34 | ppl 28.161 +| epoch 6 step 66200 | 8850 batches | lr 0.000188 | ms/batch 383.05 | loss 3.36 | ppl 28.722 +| epoch 6 step 66400 | 9050 batches | lr 0.000188 | ms/batch 381.98 | loss 3.35 | ppl 28.462 +| epoch 6 step 66600 | 9250 batches | lr 0.000188 | ms/batch 382.97 | loss 3.33 | ppl 28.032 +| epoch 6 step 66800 | 9450 batches | lr 0.000187 | ms/batch 382.50 | loss 3.35 | ppl 28.632 +| epoch 6 step 67000 | 9650 batches | lr 0.000187 | ms/batch 382.59 | loss 3.37 | ppl 28.996 +| epoch 6 step 67200 | 9850 batches | lr 0.000187 | ms/batch 382.80 | loss 3.32 | ppl 27.543 +| epoch 6 step 67400 | 10050 batches | lr 0.000186 | ms/batch 382.34 | loss 3.36 | ppl 28.905 +| epoch 6 step 67600 | 10250 batches | lr 0.000186 | ms/batch 383.19 | loss 3.32 | ppl 27.730 +| epoch 6 step 67800 | 10450 batches | lr 0.000186 | ms/batch 382.78 | loss 3.35 | ppl 28.489 +| epoch 6 step 68000 | 10650 batches | lr 0.000185 | ms/batch 382.85 | loss 3.37 | ppl 28.941 +---------------------------------------------------------------------------------------------------- +| Eval 17 at step 68000 | time: 1537.35s | valid loss 3.32 | valid ppl 27.546 +---------------------------------------------------------------------------------------------------- +| epoch 6 step 68200 | 10850 batches | lr 0.000185 | ms/batch 426.77 | loss 3.31 | ppl 27.487 +| epoch 6 step 68400 | 11050 batches | lr 0.000185 | ms/batch 382.33 | loss 3.36 | ppl 28.856 +| epoch 6 step 68600 | 11250 batches | lr 0.000184 | ms/batch 383.02 | loss 3.37 | ppl 29.210 +| epoch 6 step 68800 | 11450 batches | lr 0.000184 | ms/batch 382.50 | loss 3.34 | ppl 28.198 +| epoch 7 step 69000 | 180 batches | lr 0.000183 | ms/batch 382.69 | loss 3.32 | ppl 27.723 +| epoch 7 step 69200 | 380 batches | lr 0.000183 | ms/batch 382.53 | loss 3.32 | ppl 27.754 +| epoch 7 step 69400 | 580 batches | lr 0.000183 | ms/batch 383.34 | loss 3.32 | ppl 27.786 +| epoch 7 step 69600 | 780 batches | lr 0.000182 | ms/batch 382.77 | loss 3.33 | ppl 28.006 +| epoch 7 step 69800 | 980 batches | lr 0.000182 | ms/batch 385.85 | loss 3.31 | ppl 27.419 +| epoch 7 step 70000 | 1180 batches | lr 0.000182 | ms/batch 382.26 | loss 3.34 | ppl 28.337 +| epoch 7 step 70200 | 1380 batches | lr 0.000181 | ms/batch 381.99 | loss 3.32 | ppl 27.696 +| epoch 7 step 70400 | 1580 batches | lr 0.000181 | ms/batch 382.65 | loss 3.32 | ppl 27.663 +| epoch 7 step 70600 | 1780 batches | lr 0.000181 | ms/batch 383.32 | loss 3.32 | ppl 27.705 +| epoch 7 step 70800 | 1980 batches | lr 0.00018 | ms/batch 383.40 | loss 3.35 | ppl 28.606 +| epoch 7 step 71000 | 2180 batches | lr 0.00018 | ms/batch 382.11 | loss 3.34 | ppl 28.329 +| epoch 7 step 71200 | 2380 batches | lr 0.00018 | ms/batch 384.90 | loss 3.34 | ppl 28.226 +| epoch 7 step 71400 | 2580 batches | lr 0.000179 | ms/batch 383.90 | loss 3.33 | ppl 27.848 +| epoch 7 step 71600 | 2780 batches | lr 0.000179 | ms/batch 382.26 | loss 3.31 | ppl 27.291 +| epoch 7 step 71800 | 2980 batches | lr 
0.000179 | ms/batch 382.65 | loss 3.32 | ppl 27.616 +| epoch 7 step 72000 | 3180 batches | lr 0.000178 | ms/batch 383.18 | loss 3.33 | ppl 28.000 +---------------------------------------------------------------------------------------------------- +| Eval 18 at step 72000 | time: 1538.28s | valid loss 3.30 | valid ppl 27.248 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 72200 | 3380 batches | lr 0.000178 | ms/batch 425.93 | loss 3.33 | ppl 27.861 +| epoch 7 step 72400 | 3580 batches | lr 0.000178 | ms/batch 382.87 | loss 3.30 | ppl 27.166 +| epoch 7 step 72600 | 3780 batches | lr 0.000177 | ms/batch 382.93 | loss 3.32 | ppl 27.592 +| epoch 7 step 72800 | 3980 batches | lr 0.000177 | ms/batch 383.39 | loss 3.33 | ppl 27.882 +| epoch 7 step 73000 | 4180 batches | lr 0.000176 | ms/batch 382.71 | loss 3.32 | ppl 27.750 +| epoch 7 step 73200 | 4380 batches | lr 0.000176 | ms/batch 382.81 | loss 3.32 | ppl 27.778 +| epoch 7 step 73400 | 4580 batches | lr 0.000176 | ms/batch 383.26 | loss 3.34 | ppl 28.229 +| epoch 7 step 73600 | 4780 batches | lr 0.000175 | ms/batch 382.44 | loss 3.30 | ppl 27.014 +| epoch 7 step 73800 | 4980 batches | lr 0.000175 | ms/batch 382.82 | loss 3.34 | ppl 28.153 +| epoch 7 step 74000 | 5180 batches | lr 0.000175 | ms/batch 384.51 | loss 3.31 | ppl 27.294 +| epoch 7 step 74200 | 5380 batches | lr 0.000174 | ms/batch 382.19 | loss 3.28 | ppl 26.677 +| epoch 7 step 74400 | 5580 batches | lr 0.000174 | ms/batch 382.97 | loss 3.31 | ppl 27.304 +| epoch 7 step 74600 | 5780 batches | lr 0.000174 | ms/batch 382.61 | loss 3.33 | ppl 27.918 +| epoch 7 step 74800 | 5980 batches | lr 0.000173 | ms/batch 384.75 | loss 3.30 | ppl 27.162 +| epoch 7 step 75000 | 6180 batches | lr 0.000173 | ms/batch 382.19 | loss 3.30 | ppl 27.189 +| epoch 7 step 75200 | 6380 batches | lr 0.000172 | ms/batch 382.48 | loss 3.34 | ppl 28.110 +| epoch 7 step 75400 | 6580 batches | lr 0.000172 | ms/batch 384.47 | loss 3.26 | ppl 26.103 +| epoch 7 step 75600 | 6780 batches | lr 0.000172 | ms/batch 382.06 | loss 3.29 | ppl 26.928 +| epoch 7 step 75800 | 6980 batches | lr 0.000171 | ms/batch 382.02 | loss 3.31 | ppl 27.354 +| epoch 7 step 76000 | 7180 batches | lr 0.000171 | ms/batch 382.19 | loss 3.26 | ppl 26.088 +---------------------------------------------------------------------------------------------------- +| Eval 19 at step 76000 | time: 1537.77s | valid loss 3.30 | valid ppl 27.007 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 76200 | 7380 batches | lr 0.000171 | ms/batch 426.04 | loss 3.29 | ppl 26.797 +| epoch 7 step 76400 | 7580 batches | lr 0.00017 | ms/batch 382.30 | loss 3.26 | ppl 26.136 +| epoch 7 step 76600 | 7780 batches | lr 0.00017 | ms/batch 382.02 | loss 3.30 | ppl 27.056 +| epoch 7 step 76800 | 7980 batches | lr 0.00017 | ms/batch 382.37 | loss 3.30 | ppl 27.002 +| epoch 7 step 77000 | 8180 batches | lr 0.000169 | ms/batch 381.93 | loss 3.28 | ppl 26.581 +| epoch 7 step 77200 | 8380 batches | lr 0.000169 | ms/batch 382.07 | loss 3.31 | ppl 27.477 +| epoch 7 step 77400 | 8580 batches | lr 0.000168 | ms/batch 382.05 | loss 3.29 | ppl 26.873 +| epoch 7 step 77600 | 8780 batches | lr 0.000168 | ms/batch 382.22 | loss 3.30 | ppl 27.165 +| epoch 7 step 77800 | 8980 batches | lr 0.000168 | ms/batch 381.94 | loss 3.30 | ppl 27.157 +| epoch 7 step 78000 | 9180 batches | lr 0.000167 | ms/batch 382.32 | loss 3.28 | ppl 26.666 +| epoch 7 step 78200 | 
9380 batches | lr 0.000167 | ms/batch 382.20 | loss 3.30 | ppl 27.120 +| epoch 7 step 78400 | 9580 batches | lr 0.000167 | ms/batch 384.94 | loss 3.32 | ppl 27.624 +| epoch 7 step 78600 | 9780 batches | lr 0.000166 | ms/batch 382.60 | loss 3.29 | ppl 26.882 +| epoch 7 step 78800 | 9980 batches | lr 0.000166 | ms/batch 382.96 | loss 3.29 | ppl 26.881 +| epoch 7 step 79000 | 10180 batches | lr 0.000165 | ms/batch 382.31 | loss 3.28 | ppl 26.599 +| epoch 7 step 79200 | 10380 batches | lr 0.000165 | ms/batch 382.49 | loss 3.30 | ppl 26.981 +| epoch 7 step 79400 | 10580 batches | lr 0.000165 | ms/batch 381.98 | loss 3.32 | ppl 27.616 +| epoch 7 step 79600 | 10780 batches | lr 0.000164 | ms/batch 382.74 | loss 3.28 | ppl 26.452 +| epoch 7 step 79800 | 10980 batches | lr 0.000164 | ms/batch 382.19 | loss 3.30 | ppl 27.073 +| epoch 7 step 80000 | 11180 batches | lr 0.000164 | ms/batch 382.42 | loss 3.32 | ppl 27.720 +---------------------------------------------------------------------------------------------------- +| Eval 20 at step 80000 | time: 1535.91s | valid loss 3.29 | valid ppl 26.801 +---------------------------------------------------------------------------------------------------- +| epoch 7 step 80200 | 11380 batches | lr 0.000163 | ms/batch 426.32 | loss 3.31 | ppl 27.251 +| epoch 8 step 80400 | 110 batches | lr 0.000163 | ms/batch 381.08 | loss 3.29 | ppl 26.710 +| epoch 8 step 80600 | 310 batches | lr 0.000163 | ms/batch 382.69 | loss 3.27 | ppl 26.275 +| epoch 8 step 80800 | 510 batches | lr 0.000162 | ms/batch 382.14 | loss 3.30 | ppl 27.200 +| epoch 8 step 81000 | 710 batches | lr 0.000162 | ms/batch 382.38 | loss 3.26 | ppl 26.123 +| epoch 8 step 81200 | 910 batches | lr 0.000161 | ms/batch 381.93 | loss 3.27 | ppl 26.392 +| epoch 8 step 81400 | 1110 batches | lr 0.000161 | ms/batch 382.53 | loss 3.30 | ppl 27.145 +| epoch 8 step 81600 | 1310 batches | lr 0.000161 | ms/batch 382.13 | loss 3.27 | ppl 26.432 +| epoch 8 step 81800 | 1510 batches | lr 0.00016 | ms/batch 382.22 | loss 3.28 | ppl 26.450 +| epoch 8 step 82000 | 1710 batches | lr 0.00016 | ms/batch 382.63 | loss 3.26 | ppl 26.073 +| epoch 8 step 82200 | 1910 batches | lr 0.000159 | ms/batch 384.42 | loss 3.30 | ppl 27.082 +| epoch 8 step 82400 | 2110 batches | lr 0.000159 | ms/batch 382.36 | loss 3.32 | ppl 27.564 +| epoch 8 step 82600 | 2310 batches | lr 0.000159 | ms/batch 382.85 | loss 3.30 | ppl 26.997 +| epoch 8 step 82800 | 2510 batches | lr 0.000158 | ms/batch 382.56 | loss 3.28 | ppl 26.548 +| epoch 8 step 83000 | 2710 batches | lr 0.000158 | ms/batch 383.18 | loss 3.27 | ppl 26.416 +| epoch 8 step 83200 | 2910 batches | lr 0.000158 | ms/batch 382.57 | loss 3.25 | ppl 25.839 +| epoch 8 step 83400 | 3110 batches | lr 0.000157 | ms/batch 383.07 | loss 3.28 | ppl 26.580 +| epoch 8 step 83600 | 3310 batches | lr 0.000157 | ms/batch 382.96 | loss 3.30 | ppl 27.031 +| epoch 8 step 83800 | 3510 batches | lr 0.000156 | ms/batch 382.14 | loss 3.26 | ppl 25.985 +| epoch 8 step 84000 | 3710 batches | lr 0.000156 | ms/batch 382.44 | loss 3.28 | ppl 26.556 +---------------------------------------------------------------------------------------------------- +| Eval 21 at step 84000 | time: 1536.38s | valid loss 3.28 | valid ppl 26.596 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 84200 | 3910 batches | lr 0.000156 | ms/batch 426.73 | loss 3.27 | ppl 26.340 +| epoch 8 step 84400 | 4110 batches | lr 0.000155 | ms/batch 383.01 | loss 3.28 | ppl 26.661 +| 
epoch 8 step 84600 | 4310 batches | lr 0.000155 | ms/batch 382.82 | loss 3.28 | ppl 26.601 +| epoch 8 step 84800 | 4510 batches | lr 0.000155 | ms/batch 382.43 | loss 3.30 | ppl 27.018 +| epoch 8 step 85000 | 4710 batches | lr 0.000154 | ms/batch 382.14 | loss 3.25 | ppl 25.913 +| epoch 8 step 85200 | 4910 batches | lr 0.000154 | ms/batch 382.26 | loss 3.27 | ppl 26.342 +| epoch 8 step 85400 | 5110 batches | lr 0.000153 | ms/batch 382.40 | loss 3.27 | ppl 26.318 +| epoch 8 step 85600 | 5310 batches | lr 0.000153 | ms/batch 382.15 | loss 3.26 | ppl 26.005 +| epoch 8 step 85800 | 5510 batches | lr 0.000153 | ms/batch 382.10 | loss 3.26 | ppl 26.088 +| epoch 8 step 86000 | 5710 batches | lr 0.000152 | ms/batch 382.38 | loss 3.26 | ppl 26.174 +| epoch 8 step 86200 | 5910 batches | lr 0.000152 | ms/batch 382.06 | loss 3.27 | ppl 26.388 +| epoch 8 step 86400 | 6110 batches | lr 0.000152 | ms/batch 382.67 | loss 3.27 | ppl 26.188 +| epoch 8 step 86600 | 6310 batches | lr 0.000151 | ms/batch 382.05 | loss 3.28 | ppl 26.641 +| epoch 8 step 86800 | 6510 batches | lr 0.000151 | ms/batch 382.46 | loss 3.23 | ppl 25.326 +| epoch 8 step 87000 | 6710 batches | lr 0.00015 | ms/batch 382.15 | loss 3.24 | ppl 25.460 +| epoch 8 step 87200 | 6910 batches | lr 0.00015 | ms/batch 382.31 | loss 3.26 | ppl 25.930 +| epoch 8 step 87400 | 7110 batches | lr 0.00015 | ms/batch 382.02 | loss 3.25 | ppl 25.772 +| epoch 8 step 87600 | 7310 batches | lr 0.000149 | ms/batch 382.39 | loss 3.21 | ppl 24.844 +| epoch 8 step 87800 | 7510 batches | lr 0.000149 | ms/batch 381.83 | loss 3.25 | ppl 25.800 +| epoch 8 step 88000 | 7710 batches | lr 0.000148 | ms/batch 382.25 | loss 3.24 | ppl 25.514 +---------------------------------------------------------------------------------------------------- +| Eval 22 at step 88000 | time: 1535.57s | valid loss 3.27 | valid ppl 26.318 +---------------------------------------------------------------------------------------------------- +| epoch 8 step 88200 | 7910 batches | lr 0.000148 | ms/batch 428.61 | loss 3.24 | ppl 25.613 +| epoch 8 step 88400 | 8110 batches | lr 0.000148 | ms/batch 384.30 | loss 3.25 | ppl 25.863 +| epoch 8 step 88600 | 8310 batches | lr 0.000147 | ms/batch 382.44 | loss 3.25 | ppl 25.698 +| epoch 8 step 88800 | 8510 batches | lr 0.000147 | ms/batch 383.09 | loss 3.24 | ppl 25.631 +| epoch 8 step 89000 | 8710 batches | lr 0.000146 | ms/batch 382.43 | loss 3.26 | ppl 26.027 +| epoch 8 step 89200 | 8910 batches | lr 0.000146 | ms/batch 382.16 | loss 3.26 | ppl 25.968 +| epoch 8 step 89400 | 9110 batches | lr 0.000146 | ms/batch 383.10 | loss 3.26 | ppl 26.008 +| epoch 8 step 89600 | 9310 batches | lr 0.000145 | ms/batch 382.52 | loss 3.24 | ppl 25.563 +| epoch 8 step 89800 | 9510 batches | lr 0.000145 | ms/batch 382.29 | loss 3.27 | ppl 26.341 +| epoch 8 step 90000 | 9710 batches | lr 0.000145 | ms/batch 382.88 | loss 3.25 | ppl 25.798 +| epoch 8 step 90200 | 9910 batches | lr 0.000144 | ms/batch 383.02 | loss 3.24 | ppl 25.588 +| epoch 8 step 90400 | 10110 batches | lr 0.000144 | ms/batch 382.30 | loss 3.25 | ppl 25.882 +| epoch 8 step 90600 | 10310 batches | lr 0.000143 | ms/batch 382.20 | loss 3.25 | ppl 25.703 +| epoch 8 step 90800 | 10510 batches | lr 0.000143 | ms/batch 382.03 | loss 3.27 | ppl 26.421 +| epoch 8 step 91000 | 10710 batches | lr 0.000143 | ms/batch 382.76 | loss 3.24 | ppl 25.531 +| epoch 8 step 91200 | 10910 batches | lr 0.000142 | ms/batch 382.12 | loss 3.23 | ppl 25.348 +| epoch 8 step 91400 | 11110 batches | lr 0.000142 | ms/batch 382.21 | loss 
3.29 | ppl 26.919 +| epoch 8 step 91600 | 11310 batches | lr 0.000141 | ms/batch 382.14 | loss 3.25 | ppl 25.882 +| epoch 9 step 91800 | 40 batches | lr 0.000141 | ms/batch 382.47 | loss 3.27 | ppl 26.230 +| epoch 9 step 92000 | 240 batches | lr 0.000141 | ms/batch 382.51 | loss 3.21 | ppl 24.853 +---------------------------------------------------------------------------------------------------- +| Eval 23 at step 92000 | time: 1536.94s | valid loss 3.27 | valid ppl 26.218 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 92200 | 440 batches | lr 0.00014 | ms/batch 428.15 | loss 3.25 | ppl 25.837 +| epoch 9 step 92400 | 640 batches | lr 0.00014 | ms/batch 382.43 | loss 3.22 | ppl 25.062 +| epoch 9 step 92600 | 840 batches | lr 0.000139 | ms/batch 382.40 | loss 3.26 | ppl 26.170 +| epoch 9 step 92800 | 1040 batches | lr 0.000139 | ms/batch 382.80 | loss 3.23 | ppl 25.183 +| epoch 9 step 93000 | 1240 batches | lr 0.000139 | ms/batch 382.69 | loss 3.24 | ppl 25.433 +| epoch 9 step 93200 | 1440 batches | lr 0.000138 | ms/batch 382.44 | loss 3.25 | ppl 25.668 +| epoch 9 step 93400 | 1640 batches | lr 0.000138 | ms/batch 382.71 | loss 3.22 | ppl 24.999 +| epoch 9 step 93600 | 1840 batches | lr 0.000138 | ms/batch 382.20 | loss 3.24 | ppl 25.529 +| epoch 9 step 93800 | 2040 batches | lr 0.000137 | ms/batch 382.68 | loss 3.28 | ppl 26.591 +| epoch 9 step 94000 | 2240 batches | lr 0.000137 | ms/batch 382.11 | loss 3.25 | ppl 25.717 +| epoch 9 step 94200 | 2440 batches | lr 0.000136 | ms/batch 382.20 | loss 3.25 | ppl 25.779 +| epoch 9 step 94400 | 2640 batches | lr 0.000136 | ms/batch 382.68 | loss 3.24 | ppl 25.650 +| epoch 9 step 94600 | 2840 batches | lr 0.000136 | ms/batch 382.16 | loss 3.20 | ppl 24.565 +| epoch 9 step 94800 | 3040 batches | lr 0.000135 | ms/batch 382.20 | loss 3.25 | ppl 25.666 +| epoch 9 step 95000 | 3240 batches | lr 0.000135 | ms/batch 382.37 | loss 3.24 | ppl 25.475 +| epoch 9 step 95200 | 3440 batches | lr 0.000134 | ms/batch 384.41 | loss 3.23 | ppl 25.172 +| epoch 9 step 95400 | 3640 batches | lr 0.000134 | ms/batch 382.59 | loss 3.22 | ppl 25.074 +| epoch 9 step 95600 | 3840 batches | lr 0.000134 | ms/batch 382.09 | loss 3.24 | ppl 25.433 +| epoch 9 step 95800 | 4040 batches | lr 0.000133 | ms/batch 382.85 | loss 3.25 | ppl 25.792 +| epoch 9 step 96000 | 4240 batches | lr 0.000133 | ms/batch 381.98 | loss 3.23 | ppl 25.300 +---------------------------------------------------------------------------------------------------- +| Eval 24 at step 96000 | time: 1536.74s | valid loss 3.26 | valid ppl 25.985 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 96200 | 4440 batches | lr 0.000132 | ms/batch 426.34 | loss 3.24 | ppl 25.442 +| epoch 9 step 96400 | 4640 batches | lr 0.000132 | ms/batch 384.43 | loss 3.23 | ppl 25.346 +| epoch 9 step 96600 | 4840 batches | lr 0.000132 | ms/batch 382.04 | loss 3.22 | ppl 25.046 +| epoch 9 step 96800 | 5040 batches | lr 0.000131 | ms/batch 383.09 | loss 3.24 | ppl 25.583 +| epoch 9 step 97000 | 5240 batches | lr 0.000131 | ms/batch 382.36 | loss 3.23 | ppl 25.241 +| epoch 9 step 97200 | 5440 batches | lr 0.00013 | ms/batch 382.39 | loss 3.20 | ppl 24.466 +| epoch 9 step 97400 | 5640 batches | lr 0.00013 | ms/batch 382.22 | loss 3.24 | ppl 25.589 +| epoch 9 step 97600 | 5840 batches | lr 0.00013 | ms/batch 384.87 | loss 3.23 | ppl 25.329 +| epoch 9 step 97800 | 6040 batches | lr 0.000129 | ms/batch 
382.09 | loss 3.21 | ppl 24.792 +| epoch 9 step 98000 | 6240 batches | lr 0.000129 | ms/batch 382.24 | loss 3.23 | ppl 25.197 +| epoch 9 step 98200 | 6440 batches | lr 0.000129 | ms/batch 384.08 | loss 3.23 | ppl 25.386 +| epoch 9 step 98400 | 6640 batches | lr 0.000128 | ms/batch 384.03 | loss 3.18 | ppl 24.057 +| epoch 9 step 98600 | 6840 batches | lr 0.000128 | ms/batch 382.74 | loss 3.21 | ppl 24.797 +| epoch 9 step 98800 | 7040 batches | lr 0.000127 | ms/batch 382.19 | loss 3.22 | ppl 24.906 +| epoch 9 step 99000 | 7240 batches | lr 0.000127 | ms/batch 382.54 | loss 3.18 | ppl 24.052 +| epoch 9 step 99200 | 7440 batches | lr 0.000127 | ms/batch 382.03 | loss 3.20 | ppl 24.555 +| epoch 9 step 99400 | 7640 batches | lr 0.000126 | ms/batch 382.21 | loss 3.18 | ppl 24.134 +| epoch 9 step 99600 | 7840 batches | lr 0.000126 | ms/batch 382.21 | loss 3.21 | ppl 24.800 +| epoch 9 step 99800 | 8040 batches | lr 0.000125 | ms/batch 382.39 | loss 3.21 | ppl 24.779 +| epoch 9 step 100000 | 8240 batches | lr 0.000125 | ms/batch 382.26 | loss 3.20 | ppl 24.531 +---------------------------------------------------------------------------------------------------- +| Eval 25 at step 100000 | time: 1537.21s | valid loss 3.25 | valid ppl 25.840 +---------------------------------------------------------------------------------------------------- +| epoch 9 step 100200 | 8440 batches | lr 0.000125 | ms/batch 427.57 | loss 3.22 | ppl 24.958 +| epoch 9 step 100400 | 8640 batches | lr 0.000124 | ms/batch 382.27 | loss 3.20 | ppl 24.578 +| epoch 9 step 100600 | 8840 batches | lr 0.000124 | ms/batch 382.52 | loss 3.23 | ppl 25.217 +| epoch 9 step 100800 | 9040 batches | lr 0.000123 | ms/batch 382.37 | loss 3.22 | ppl 24.969 +| epoch 9 step 101000 | 9240 batches | lr 0.000123 | ms/batch 382.24 | loss 3.20 | ppl 24.417 +| epoch 9 step 101200 | 9440 batches | lr 0.000123 | ms/batch 382.79 | loss 3.22 | ppl 25.039 +| epoch 9 step 101400 | 9640 batches | lr 0.000122 | ms/batch 382.67 | loss 3.24 | ppl 25.415 +| epoch 9 step 101600 | 9840 batches | lr 0.000122 | ms/batch 382.45 | loss 3.19 | ppl 24.174 +| epoch 9 step 101800 | 10040 batches | lr 0.000121 | ms/batch 382.08 | loss 3.22 | ppl 25.102 +| epoch 9 step 102000 | 10240 batches | lr 0.000121 | ms/batch 383.61 | loss 3.20 | ppl 24.451 +| epoch 9 step 102200 | 10440 batches | lr 0.000121 | ms/batch 382.06 | loss 3.22 | ppl 24.923 +| epoch 9 step 102400 | 10640 batches | lr 0.00012 | ms/batch 382.37 | loss 3.24 | ppl 25.448 +| epoch 9 step 102600 | 10840 batches | lr 0.00012 | ms/batch 382.39 | loss 3.18 | ppl 23.979 +| epoch 9 step 102800 | 11040 batches | lr 0.00012 | ms/batch 382.32 | loss 3.24 | ppl 25.423 +| epoch 9 step 103000 | 11240 batches | lr 0.000119 | ms/batch 383.03 | loss 3.24 | ppl 25.534 +| epoch 9 step 103200 | 11440 batches | lr 0.000119 | ms/batch 382.33 | loss 3.21 | ppl 24.815 +| epoch 10 step 103400 | 170 batches | lr 0.000118 | ms/batch 381.61 | loss 3.20 | ppl 24.481 +| epoch 10 step 103600 | 370 batches | lr 0.000118 | ms/batch 383.21 | loss 3.19 | ppl 24.264 +| epoch 10 step 103800 | 570 batches | lr 0.000118 | ms/batch 382.43 | loss 3.20 | ppl 24.604 +| epoch 10 step 104000 | 770 batches | lr 0.000117 | ms/batch 382.42 | loss 3.20 | ppl 24.608 +---------------------------------------------------------------------------------------------------- +| Eval 26 at step 104000 | time: 1536.35s | valid loss 3.24 | valid ppl 25.656 +---------------------------------------------------------------------------------------------------- +| epoch 10 
step 104200 | 970 batches | lr 0.000117 | ms/batch 428.66 | loss 3.18 | ppl 24.059 +| epoch 10 step 104400 | 1170 batches | lr 0.000116 | ms/batch 382.64 | loss 3.22 | ppl 24.956 +| epoch 10 step 104600 | 1370 batches | lr 0.000116 | ms/batch 382.09 | loss 3.19 | ppl 24.344 +| epoch 10 step 104800 | 1570 batches | lr 0.000116 | ms/batch 382.32 | loss 3.19 | ppl 24.285 +| epoch 10 step 105000 | 1770 batches | lr 0.000115 | ms/batch 382.47 | loss 3.19 | ppl 24.407 +| epoch 10 step 105200 | 1970 batches | lr 0.000115 | ms/batch 382.28 | loss 3.22 | ppl 25.101 +| epoch 10 step 105400 | 2170 batches | lr 0.000114 | ms/batch 382.16 | loss 3.22 | ppl 24.958 +| epoch 10 step 105600 | 2370 batches | lr 0.000114 | ms/batch 382.82 | loss 3.21 | ppl 24.760 +| epoch 10 step 105800 | 2570 batches | lr 0.000114 | ms/batch 382.65 | loss 3.20 | ppl 24.606 +| epoch 10 step 106000 | 2770 batches | lr 0.000113 | ms/batch 383.22 | loss 3.18 | ppl 24.045 +| epoch 10 step 106200 | 2970 batches | lr 0.000113 | ms/batch 382.13 | loss 3.19 | ppl 24.269 +| epoch 10 step 106400 | 3170 batches | lr 0.000112 | ms/batch 382.19 | loss 3.21 | ppl 24.703 +| epoch 10 step 106600 | 3370 batches | lr 0.000112 | ms/batch 381.99 | loss 3.20 | ppl 24.587 +| epoch 10 step 106800 | 3570 batches | lr 0.000112 | ms/batch 381.93 | loss 3.18 | ppl 23.994 +| epoch 10 step 107000 | 3770 batches | lr 0.000111 | ms/batch 382.52 | loss 3.19 | ppl 24.305 +| epoch 10 step 107200 | 3970 batches | lr 0.000111 | ms/batch 382.40 | loss 3.20 | ppl 24.528 +| epoch 10 step 107400 | 4170 batches | lr 0.000111 | ms/batch 382.31 | loss 3.19 | ppl 24.408 +| epoch 10 step 107600 | 4370 batches | lr 0.00011 | ms/batch 382.60 | loss 3.20 | ppl 24.599 +| epoch 10 step 107800 | 4570 batches | lr 0.00011 | ms/batch 382.24 | loss 3.21 | ppl 24.863 +| epoch 10 step 108000 | 4770 batches | lr 0.000109 | ms/batch 382.13 | loss 3.17 | ppl 23.782 +---------------------------------------------------------------------------------------------------- +| Eval 27 at step 108000 | time: 1536.23s | valid loss 3.23 | valid ppl 25.255 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 108200 | 4970 batches | lr 0.000109 | ms/batch 426.28 | loss 3.21 | ppl 24.763 +| epoch 10 step 108400 | 5170 batches | lr 0.000109 | ms/batch 382.29 | loss 3.19 | ppl 24.200 +| epoch 10 step 108600 | 5370 batches | lr 0.000108 | ms/batch 382.26 | loss 3.16 | ppl 23.645 +| epoch 10 step 108800 | 5570 batches | lr 0.000108 | ms/batch 382.46 | loss 3.18 | ppl 24.039 +| epoch 10 step 109000 | 5770 batches | lr 0.000107 | ms/batch 383.71 | loss 3.20 | ppl 24.615 +| epoch 10 step 109200 | 5970 batches | lr 0.000107 | ms/batch 382.20 | loss 3.18 | ppl 24.018 +| epoch 10 step 109400 | 6170 batches | lr 0.000107 | ms/batch 382.78 | loss 3.18 | ppl 23.980 +| epoch 10 step 109600 | 6370 batches | lr 0.000106 | ms/batch 382.05 | loss 3.22 | ppl 25.013 +| epoch 10 step 109800 | 6570 batches | lr 0.000106 | ms/batch 382.35 | loss 3.13 | ppl 22.954 +| epoch 10 step 110000 | 6770 batches | lr 0.000105 | ms/batch 382.09 | loss 3.17 | ppl 23.779 +| epoch 10 step 110200 | 6970 batches | lr 0.000105 | ms/batch 382.41 | loss 3.18 | ppl 24.146 +| epoch 10 step 110400 | 7170 batches | lr 0.000105 | ms/batch 381.99 | loss 3.14 | ppl 23.079 +| epoch 10 step 110600 | 7370 batches | lr 0.000104 | ms/batch 382.25 | loss 3.17 | ppl 23.727 +| epoch 10 step 110800 | 7570 batches | lr 0.000104 | ms/batch 381.90 | loss 3.14 | ppl 23.090 +| epoch 10 step 
111000 | 7770 batches | lr 0.000104 | ms/batch 382.75 | loss 3.18 | ppl 24.008 +| epoch 10 step 111200 | 7970 batches | lr 0.000103 | ms/batch 382.33 | loss 3.17 | ppl 23.716 +| epoch 10 step 111400 | 8170 batches | lr 0.000103 | ms/batch 382.39 | loss 3.16 | ppl 23.509 +| epoch 10 step 111600 | 8370 batches | lr 0.000102 | ms/batch 382.05 | loss 3.19 | ppl 24.226 +| epoch 10 step 111800 | 8570 batches | lr 0.000102 | ms/batch 382.85 | loss 3.17 | ppl 23.716 +| epoch 10 step 112000 | 8770 batches | lr 0.000102 | ms/batch 382.42 | loss 3.18 | ppl 23.938 +---------------------------------------------------------------------------------------------------- +| Eval 28 at step 112000 | time: 1535.84s | valid loss 3.23 | valid ppl 25.189 +---------------------------------------------------------------------------------------------------- +| epoch 10 step 112200 | 8970 batches | lr 0.000101 | ms/batch 426.52 | loss 3.18 | ppl 24.127 +| epoch 10 step 112400 | 9170 batches | lr 0.000101 | ms/batch 383.30 | loss 3.16 | ppl 23.619 +| epoch 10 step 112600 | 9370 batches | lr 0.0001 | ms/batch 382.14 | loss 3.18 | ppl 23.950 +| epoch 10 step 112800 | 9570 batches | lr 0.0001 | ms/batch 385.48 | loss 3.20 | ppl 24.423 +| epoch 10 step 113000 | 9770 batches | lr 9.97e-05 | ms/batch 382.69 | loss 3.17 | ppl 23.829 +| epoch 10 step 113200 | 9970 batches | lr 9.93e-05 | ms/batch 382.49 | loss 3.17 | ppl 23.821 +| epoch 10 step 113400 | 10170 batches | lr 9.89e-05 | ms/batch 382.23 | loss 3.15 | ppl 23.294 +| epoch 10 step 113600 | 10370 batches | lr 9.85e-05 | ms/batch 384.21 | loss 3.18 | ppl 23.956 +| epoch 10 step 113800 | 10570 batches | lr 9.81e-05 | ms/batch 382.45 | loss 3.20 | ppl 24.546 +| epoch 10 step 114000 | 10770 batches | lr 9.77e-05 | ms/batch 382.38 | loss 3.15 | ppl 23.438 +| epoch 10 step 114200 | 10970 batches | lr 9.73e-05 | ms/batch 382.18 | loss 3.17 | ppl 23.817 +| epoch 10 step 114400 | 11170 batches | lr 9.7e-05 | ms/batch 381.94 | loss 3.21 | ppl 24.673 +| epoch 10 step 114600 | 11370 batches | lr 9.66e-05 | ms/batch 382.15 | loss 3.18 | ppl 24.121 +| epoch 11 step 114800 | 100 batches | lr 9.62e-05 | ms/batch 381.61 | loss 3.17 | ppl 23.803 +| epoch 11 step 115000 | 300 batches | lr 9.58e-05 | ms/batch 382.75 | loss 3.15 | ppl 23.304 +| epoch 11 step 115200 | 500 batches | lr 9.54e-05 | ms/batch 384.39 | loss 3.19 | ppl 24.227 +| epoch 11 step 115400 | 700 batches | lr 9.51e-05 | ms/batch 385.50 | loss 3.14 | ppl 23.071 +| epoch 11 step 115600 | 900 batches | lr 9.47e-05 | ms/batch 385.56 | loss 3.16 | ppl 23.511 +| epoch 11 step 115800 | 1100 batches | lr 9.43e-05 | ms/batch 382.23 | loss 3.18 | ppl 24.006 +| epoch 11 step 116000 | 1300 batches | lr 9.39e-05 | ms/batch 382.19 | loss 3.16 | ppl 23.507 +---------------------------------------------------------------------------------------------------- +| Eval 29 at step 116000 | time: 1538.31s | valid loss 3.22 | valid ppl 25.114 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 116200 | 1500 batches | lr 9.35e-05 | ms/batch 426.05 | loss 3.15 | ppl 23.409 +| epoch 11 step 116400 | 1700 batches | lr 9.32e-05 | ms/batch 382.79 | loss 3.15 | ppl 23.260 +| epoch 11 step 116600 | 1900 batches | lr 9.28e-05 | ms/batch 382.10 | loss 3.18 | ppl 23.952 +| epoch 11 step 116800 | 2100 batches | lr 9.24e-05 | ms/batch 382.37 | loss 3.20 | ppl 24.514 +| epoch 11 step 117000 | 2300 batches | lr 9.2e-05 | ms/batch 382.47 | loss 3.18 | ppl 24.021 +| epoch 11 step 117200 | 2500 
batches | lr 9.16e-05 | ms/batch 382.34 | loss 3.16 | ppl 23.538 +| epoch 11 step 117400 | 2700 batches | lr 9.13e-05 | ms/batch 382.68 | loss 3.16 | ppl 23.667 +| epoch 11 step 117600 | 2900 batches | lr 9.09e-05 | ms/batch 382.24 | loss 3.13 | ppl 22.809 +| epoch 11 step 117800 | 3100 batches | lr 9.05e-05 | ms/batch 382.77 | loss 3.16 | ppl 23.658 +| epoch 11 step 118000 | 3300 batches | lr 9.01e-05 | ms/batch 382.95 | loss 3.18 | ppl 24.090 +| epoch 11 step 118200 | 3500 batches | lr 8.97e-05 | ms/batch 382.53 | loss 3.14 | ppl 23.180 +| epoch 11 step 118400 | 3700 batches | lr 8.94e-05 | ms/batch 382.45 | loss 3.16 | ppl 23.624 +| epoch 11 step 118600 | 3900 batches | lr 8.9e-05 | ms/batch 382.36 | loss 3.15 | ppl 23.373 +| epoch 11 step 118800 | 4100 batches | lr 8.86e-05 | ms/batch 382.39 | loss 3.17 | ppl 23.850 +| epoch 11 step 119000 | 4300 batches | lr 8.82e-05 | ms/batch 383.06 | loss 3.16 | ppl 23.517 +| epoch 11 step 119200 | 4500 batches | lr 8.79e-05 | ms/batch 382.42 | loss 3.18 | ppl 24.035 +| epoch 11 step 119400 | 4700 batches | lr 8.75e-05 | ms/batch 382.30 | loss 3.14 | ppl 23.085 +| epoch 11 step 119600 | 4900 batches | lr 8.71e-05 | ms/batch 382.34 | loss 3.15 | ppl 23.369 +| epoch 11 step 119800 | 5100 batches | lr 8.67e-05 | ms/batch 382.28 | loss 3.16 | ppl 23.601 +| epoch 11 step 120000 | 5300 batches | lr 8.64e-05 | ms/batch 382.23 | loss 3.14 | ppl 23.129 +---------------------------------------------------------------------------------------------------- +| Eval 30 at step 120000 | time: 1536.06s | valid loss 3.22 | valid ppl 24.910 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 120200 | 5500 batches | lr 8.6e-05 | ms/batch 426.45 | loss 3.14 | ppl 23.184 +| epoch 11 step 120400 | 5700 batches | lr 8.56e-05 | ms/batch 382.84 | loss 3.15 | ppl 23.387 +| epoch 11 step 120600 | 5900 batches | lr 8.53e-05 | ms/batch 382.80 | loss 3.16 | ppl 23.487 +| epoch 11 step 120800 | 6100 batches | lr 8.49e-05 | ms/batch 382.29 | loss 3.15 | ppl 23.382 +| epoch 11 step 121000 | 6300 batches | lr 8.45e-05 | ms/batch 384.19 | loss 3.16 | ppl 23.578 +| epoch 11 step 121200 | 6500 batches | lr 8.41e-05 | ms/batch 382.43 | loss 3.12 | ppl 22.710 +| epoch 11 step 121400 | 6700 batches | lr 8.38e-05 | ms/batch 382.14 | loss 3.12 | ppl 22.638 +| epoch 11 step 121600 | 6900 batches | lr 8.34e-05 | ms/batch 382.48 | loss 3.14 | ppl 23.168 +| epoch 11 step 121800 | 7100 batches | lr 8.3e-05 | ms/batch 383.22 | loss 3.14 | ppl 23.054 +| epoch 11 step 122000 | 7300 batches | lr 8.27e-05 | ms/batch 382.59 | loss 3.09 | ppl 22.058 +| epoch 11 step 122200 | 7500 batches | lr 8.23e-05 | ms/batch 382.23 | loss 3.14 | ppl 23.079 +| epoch 11 step 122400 | 7700 batches | lr 8.19e-05 | ms/batch 382.91 | loss 3.12 | ppl 22.627 +| epoch 11 step 122600 | 7900 batches | lr 8.16e-05 | ms/batch 382.47 | loss 3.13 | ppl 22.780 +| epoch 11 step 122800 | 8100 batches | lr 8.12e-05 | ms/batch 382.22 | loss 3.14 | ppl 23.145 +| epoch 11 step 123000 | 8300 batches | lr 8.08e-05 | ms/batch 382.37 | loss 3.13 | ppl 22.848 +| epoch 11 step 123200 | 8500 batches | lr 8.04e-05 | ms/batch 382.30 | loss 3.13 | ppl 22.881 +| epoch 11 step 123400 | 8700 batches | lr 8.01e-05 | ms/batch 382.49 | loss 3.15 | ppl 23.295 +| epoch 11 step 123600 | 8900 batches | lr 7.97e-05 | ms/batch 382.00 | loss 3.14 | ppl 23.137 +| epoch 11 step 123800 | 9100 batches | lr 7.93e-05 | ms/batch 382.89 | loss 3.14 | ppl 23.205 +| epoch 11 step 124000 | 9300 batches | 
lr 7.9e-05 | ms/batch 382.01 | loss 3.13 | ppl 22.877 +---------------------------------------------------------------------------------------------------- +| Eval 31 at step 124000 | time: 1536.54s | valid loss 3.21 | valid ppl 24.705 +---------------------------------------------------------------------------------------------------- +| epoch 11 step 124200 | 9500 batches | lr 7.86e-05 | ms/batch 426.03 | loss 3.15 | ppl 23.341 +| epoch 11 step 124400 | 9700 batches | lr 7.83e-05 | ms/batch 382.70 | loss 3.14 | ppl 23.144 +| epoch 11 step 124600 | 9900 batches | lr 7.79e-05 | ms/batch 382.71 | loss 3.13 | ppl 22.771 +| epoch 11 step 124800 | 10100 batches | lr 7.75e-05 | ms/batch 382.50 | loss 3.14 | ppl 23.138 +| epoch 11 step 125000 | 10300 batches | lr 7.72e-05 | ms/batch 382.99 | loss 3.13 | ppl 22.907 +| epoch 11 step 125200 | 10500 batches | lr 7.68e-05 | ms/batch 382.03 | loss 3.16 | ppl 23.676 +| epoch 11 step 125400 | 10700 batches | lr 7.64e-05 | ms/batch 382.49 | loss 3.13 | ppl 22.800 +| epoch 11 step 125600 | 10900 batches | lr 7.61e-05 | ms/batch 382.28 | loss 3.12 | ppl 22.598 +| epoch 11 step 125800 | 11100 batches | lr 7.57e-05 | ms/batch 382.13 | loss 3.17 | ppl 23.875 +| epoch 11 step 126000 | 11300 batches | lr 7.54e-05 | ms/batch 383.41 | loss 3.15 | ppl 23.357 +| epoch 12 step 126200 | 30 batches | lr 7.5e-05 | ms/batch 381.25 | loss 3.15 | ppl 23.413 +| epoch 12 step 126400 | 230 batches | lr 7.46e-05 | ms/batch 382.16 | loss 3.10 | ppl 22.274 +| epoch 12 step 126600 | 430 batches | lr 7.43e-05 | ms/batch 383.09 | loss 3.14 | ppl 23.086 +| epoch 12 step 126800 | 630 batches | lr 7.39e-05 | ms/batch 382.18 | loss 3.11 | ppl 22.526 +| epoch 12 step 127000 | 830 batches | lr 7.36e-05 | ms/batch 382.31 | loss 3.15 | ppl 23.399 +| epoch 12 step 127200 | 1030 batches | lr 7.32e-05 | ms/batch 382.19 | loss 3.11 | ppl 22.478 +| epoch 12 step 127400 | 1230 batches | lr 7.28e-05 | ms/batch 383.22 | loss 3.13 | ppl 22.942 +| epoch 12 step 127600 | 1430 batches | lr 7.25e-05 | ms/batch 383.14 | loss 3.13 | ppl 22.840 +| epoch 12 step 127800 | 1630 batches | lr 7.21e-05 | ms/batch 382.25 | loss 3.11 | ppl 22.402 +| epoch 12 step 128000 | 1830 batches | lr 7.18e-05 | ms/batch 382.04 | loss 3.14 | ppl 22.998 +---------------------------------------------------------------------------------------------------- +| Eval 32 at step 128000 | time: 1536.17s | valid loss 3.21 | valid ppl 24.729 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 128200 | 2030 batches | lr 7.14e-05 | ms/batch 413.67 | loss 3.17 | ppl 23.753 +| epoch 12 step 128400 | 2230 batches | lr 7.11e-05 | ms/batch 382.31 | loss 3.14 | ppl 23.103 +| epoch 12 step 128600 | 2430 batches | lr 7.07e-05 | ms/batch 382.32 | loss 3.14 | ppl 23.163 +| epoch 12 step 128800 | 2630 batches | lr 7.04e-05 | ms/batch 383.18 | loss 3.13 | ppl 22.938 +| epoch 12 step 129000 | 2830 batches | lr 7e-05 | ms/batch 382.20 | loss 3.10 | ppl 22.155 +| epoch 12 step 129200 | 3030 batches | lr 6.97e-05 | ms/batch 382.30 | loss 3.13 | ppl 22.918 +| epoch 12 step 129400 | 3230 batches | lr 6.93e-05 | ms/batch 383.25 | loss 3.13 | ppl 22.808 +| epoch 12 step 129600 | 3430 batches | lr 6.9e-05 | ms/batch 382.15 | loss 3.12 | ppl 22.706 +| epoch 12 step 129800 | 3630 batches | lr 6.86e-05 | ms/batch 382.07 | loss 3.11 | ppl 22.435 +| epoch 12 step 130000 | 3830 batches | lr 6.83e-05 | ms/batch 382.59 | loss 3.13 | ppl 22.807 +| epoch 12 step 130200 | 4030 batches | lr 6.79e-05 | 
ms/batch 382.34 | loss 3.14 | ppl 23.171 +| epoch 12 step 130400 | 4230 batches | lr 6.76e-05 | ms/batch 382.69 | loss 3.13 | ppl 22.817 +| epoch 12 step 130600 | 4430 batches | lr 6.72e-05 | ms/batch 382.08 | loss 3.13 | ppl 22.790 +| epoch 12 step 130800 | 4630 batches | lr 6.69e-05 | ms/batch 382.35 | loss 3.13 | ppl 22.794 +| epoch 12 step 131000 | 4830 batches | lr 6.65e-05 | ms/batch 382.00 | loss 3.11 | ppl 22.490 +| epoch 12 step 131200 | 5030 batches | lr 6.62e-05 | ms/batch 382.50 | loss 3.14 | ppl 23.008 +| epoch 12 step 131400 | 5230 batches | lr 6.58e-05 | ms/batch 382.93 | loss 3.12 | ppl 22.728 +| epoch 12 step 131600 | 5430 batches | lr 6.55e-05 | ms/batch 382.13 | loss 3.09 | ppl 22.037 +| epoch 12 step 131800 | 5630 batches | lr 6.51e-05 | ms/batch 382.22 | loss 3.13 | ppl 22.860 +| epoch 12 step 132000 | 5830 batches | lr 6.48e-05 | ms/batch 382.29 | loss 3.13 | ppl 22.808 +---------------------------------------------------------------------------------------------------- +| Eval 33 at step 132000 | time: 1535.91s | valid loss 3.20 | valid ppl 24.508 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 132200 | 6030 batches | lr 6.44e-05 | ms/batch 426.93 | loss 3.10 | ppl 22.292 +| epoch 12 step 132400 | 6230 batches | lr 6.41e-05 | ms/batch 382.45 | loss 3.12 | ppl 22.597 +| epoch 12 step 132600 | 6430 batches | lr 6.38e-05 | ms/batch 382.37 | loss 3.13 | ppl 22.881 +| epoch 12 step 132800 | 6630 batches | lr 6.34e-05 | ms/batch 383.06 | loss 3.08 | ppl 21.695 +| epoch 12 step 133000 | 6830 batches | lr 6.31e-05 | ms/batch 382.08 | loss 3.10 | ppl 22.302 +| epoch 12 step 133200 | 7030 batches | lr 6.27e-05 | ms/batch 382.16 | loss 3.11 | ppl 22.463 +| epoch 12 step 133400 | 7230 batches | lr 6.24e-05 | ms/batch 382.05 | loss 3.08 | ppl 21.683 +| epoch 12 step 133600 | 7430 batches | lr 6.2e-05 | ms/batch 383.07 | loss 3.09 | ppl 21.918 +| epoch 12 step 133800 | 7630 batches | lr 6.17e-05 | ms/batch 381.84 | loss 3.08 | ppl 21.792 +| epoch 12 step 134000 | 7830 batches | lr 6.14e-05 | ms/batch 382.49 | loss 3.10 | ppl 22.284 +| epoch 12 step 134200 | 8030 batches | lr 6.1e-05 | ms/batch 381.94 | loss 3.11 | ppl 22.330 +| epoch 12 step 134400 | 8230 batches | lr 6.07e-05 | ms/batch 382.66 | loss 3.10 | ppl 22.087 +| epoch 12 step 134600 | 8430 batches | lr 6.04e-05 | ms/batch 381.98 | loss 3.11 | ppl 22.433 +| epoch 12 step 134800 | 8630 batches | lr 6e-05 | ms/batch 382.70 | loss 3.10 | ppl 22.219 +| epoch 12 step 135000 | 8830 batches | lr 5.97e-05 | ms/batch 382.07 | loss 3.12 | ppl 22.686 +| epoch 12 step 135200 | 9030 batches | lr 5.94e-05 | ms/batch 382.67 | loss 3.12 | ppl 22.550 +| epoch 12 step 135400 | 9230 batches | lr 5.9e-05 | ms/batch 383.43 | loss 3.09 | ppl 21.869 +| epoch 12 step 135600 | 9430 batches | lr 5.87e-05 | ms/batch 382.29 | loss 3.12 | ppl 22.561 +| epoch 12 step 135800 | 9630 batches | lr 5.84e-05 | ms/batch 383.83 | loss 3.13 | ppl 22.883 +| epoch 12 step 136000 | 9830 batches | lr 5.8e-05 | ms/batch 382.22 | loss 3.09 | ppl 21.958 +---------------------------------------------------------------------------------------------------- +| Eval 34 at step 136000 | time: 1536.34s | valid loss 3.19 | valid ppl 24.347 +---------------------------------------------------------------------------------------------------- +| epoch 12 step 136200 | 10030 batches | lr 5.77e-05 | ms/batch 427.48 | loss 3.11 | ppl 22.491 +| epoch 12 step 136400 | 10230 batches | lr 5.74e-05 | ms/batch 382.28 
| loss 3.10 | ppl 22.171 +| epoch 12 step 136600 | 10430 batches | lr 5.7e-05 | ms/batch 382.31 | loss 3.11 | ppl 22.329 +| epoch 12 step 136800 | 10630 batches | lr 5.67e-05 | ms/batch 382.04 | loss 3.14 | ppl 23.048 +| epoch 12 step 137000 | 10830 batches | lr 5.64e-05 | ms/batch 382.41 | loss 3.08 | ppl 21.659 +| epoch 12 step 137200 | 11030 batches | lr 5.6e-05 | ms/batch 382.01 | loss 3.13 | ppl 22.971 +| epoch 12 step 137400 | 11230 batches | lr 5.57e-05 | ms/batch 382.43 | loss 3.13 | ppl 22.881 +| epoch 12 step 137600 | 11430 batches | lr 5.54e-05 | ms/batch 382.30 | loss 3.12 | ppl 22.562 +| epoch 13 step 137800 | 160 batches | lr 5.51e-05 | ms/batch 381.95 | loss 3.10 | ppl 22.208 +| epoch 13 step 138000 | 360 batches | lr 5.47e-05 | ms/batch 382.40 | loss 3.09 | ppl 21.933 +| epoch 13 step 138200 | 560 batches | lr 5.44e-05 | ms/batch 382.21 | loss 3.11 | ppl 22.330 +| epoch 13 step 138400 | 760 batches | lr 5.41e-05 | ms/batch 382.34 | loss 3.10 | ppl 22.150 +| epoch 13 step 138600 | 960 batches | lr 5.38e-05 | ms/batch 383.74 | loss 3.08 | ppl 21.791 +| epoch 13 step 138800 | 1160 batches | lr 5.34e-05 | ms/batch 382.09 | loss 3.12 | ppl 22.594 +| epoch 13 step 139000 | 1360 batches | lr 5.31e-05 | ms/batch 382.12 | loss 3.09 | ppl 22.023 +| epoch 13 step 139200 | 1560 batches | lr 5.28e-05 | ms/batch 382.56 | loss 3.09 | ppl 21.997 +| epoch 13 step 139400 | 1760 batches | lr 5.25e-05 | ms/batch 382.43 | loss 3.09 | ppl 21.979 +| epoch 13 step 139600 | 1960 batches | lr 5.22e-05 | ms/batch 382.02 | loss 3.12 | ppl 22.641 +| epoch 13 step 139800 | 2160 batches | lr 5.18e-05 | ms/batch 382.36 | loss 3.12 | ppl 22.747 +| epoch 13 step 140000 | 2360 batches | lr 5.15e-05 | ms/batch 382.02 | loss 3.11 | ppl 22.326 +---------------------------------------------------------------------------------------------------- +| Eval 35 at step 140000 | time: 1535.70s | valid loss 3.19 | valid ppl 24.249 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 140200 | 2560 batches | lr 5.12e-05 | ms/batch 426.40 | loss 3.11 | ppl 22.373 +| epoch 13 step 140400 | 2760 batches | lr 5.09e-05 | ms/batch 382.20 | loss 3.08 | ppl 21.833 +| epoch 13 step 140600 | 2960 batches | lr 5.06e-05 | ms/batch 382.51 | loss 3.09 | ppl 21.962 +| epoch 13 step 140800 | 3160 batches | lr 5.03e-05 | ms/batch 382.14 | loss 3.11 | ppl 22.316 +| epoch 13 step 141000 | 3360 batches | lr 4.99e-05 | ms/batch 382.34 | loss 3.10 | ppl 22.298 +| epoch 13 step 141200 | 3560 batches | lr 4.96e-05 | ms/batch 382.74 | loss 3.08 | ppl 21.850 +| epoch 13 step 141400 | 3760 batches | lr 4.93e-05 | ms/batch 382.82 | loss 3.09 | ppl 22.077 +| epoch 13 step 141600 | 3960 batches | lr 4.9e-05 | ms/batch 382.57 | loss 3.10 | ppl 22.157 +| epoch 13 step 141800 | 4160 batches | lr 4.87e-05 | ms/batch 382.56 | loss 3.10 | ppl 22.237 +| epoch 13 step 142000 | 4360 batches | lr 4.84e-05 | ms/batch 382.02 | loss 3.10 | ppl 22.210 +| epoch 13 step 142200 | 4560 batches | lr 4.81e-05 | ms/batch 382.78 | loss 3.12 | ppl 22.641 +| epoch 13 step 142400 | 4760 batches | lr 4.78e-05 | ms/batch 382.16 | loss 3.07 | ppl 21.546 +| epoch 13 step 142600 | 4960 batches | lr 4.75e-05 | ms/batch 382.39 | loss 3.11 | ppl 22.384 +| epoch 13 step 142800 | 5160 batches | lr 4.72e-05 | ms/batch 382.22 | loss 3.09 | ppl 21.989 +| epoch 13 step 143000 | 5360 batches | lr 4.68e-05 | ms/batch 382.58 | loss 3.08 | ppl 21.657 +| epoch 13 step 143200 | 5560 batches | lr 4.65e-05 | ms/batch 382.38 | loss 
3.08 | ppl 21.757 +| epoch 13 step 143400 | 5760 batches | lr 4.62e-05 | ms/batch 383.58 | loss 3.10 | ppl 22.194 +| epoch 13 step 143600 | 5960 batches | lr 4.59e-05 | ms/batch 382.90 | loss 3.09 | ppl 21.933 +| epoch 13 step 143800 | 6160 batches | lr 4.56e-05 | ms/batch 383.73 | loss 3.08 | ppl 21.719 +| epoch 13 step 144000 | 6360 batches | lr 4.53e-05 | ms/batch 382.50 | loss 3.13 | ppl 22.838 +---------------------------------------------------------------------------------------------------- +| Eval 36 at step 144000 | time: 1536.60s | valid loss 3.19 | valid ppl 24.245 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 144200 | 6560 batches | lr 4.5e-05 | ms/batch 428.61 | loss 3.03 | ppl 20.797 +| epoch 13 step 144400 | 6760 batches | lr 4.47e-05 | ms/batch 382.58 | loss 3.07 | ppl 21.616 +| epoch 13 step 144600 | 6960 batches | lr 4.44e-05 | ms/batch 382.81 | loss 3.09 | ppl 21.927 +| epoch 13 step 144800 | 7160 batches | lr 4.41e-05 | ms/batch 382.66 | loss 3.04 | ppl 20.999 +| epoch 13 step 145000 | 7360 batches | lr 4.38e-05 | ms/batch 382.36 | loss 3.07 | ppl 21.506 +| epoch 13 step 145200 | 7560 batches | lr 4.35e-05 | ms/batch 382.21 | loss 3.05 | ppl 21.175 +| epoch 13 step 145400 | 7760 batches | lr 4.32e-05 | ms/batch 382.89 | loss 3.08 | ppl 21.819 +| epoch 13 step 145600 | 7960 batches | lr 4.29e-05 | ms/batch 382.31 | loss 3.06 | ppl 21.426 +| epoch 13 step 145800 | 8160 batches | lr 4.26e-05 | ms/batch 383.34 | loss 3.07 | ppl 21.444 +| epoch 13 step 146000 | 8360 batches | lr 4.23e-05 | ms/batch 382.40 | loss 3.09 | ppl 22.010 +| epoch 13 step 146200 | 8560 batches | lr 4.2e-05 | ms/batch 382.59 | loss 3.07 | ppl 21.595 +| epoch 13 step 146400 | 8760 batches | lr 4.17e-05 | ms/batch 382.41 | loss 3.08 | ppl 21.741 +| epoch 13 step 146600 | 8960 batches | lr 4.15e-05 | ms/batch 382.37 | loss 3.10 | ppl 22.096 +| epoch 13 step 146800 | 9160 batches | lr 4.12e-05 | ms/batch 382.26 | loss 3.07 | ppl 21.442 +| epoch 13 step 147000 | 9360 batches | lr 4.09e-05 | ms/batch 382.91 | loss 3.08 | ppl 21.744 +| epoch 13 step 147200 | 9560 batches | lr 4.06e-05 | ms/batch 385.27 | loss 3.11 | ppl 22.345 +| epoch 13 step 147400 | 9760 batches | lr 4.03e-05 | ms/batch 384.15 | loss 3.08 | ppl 21.665 +| epoch 13 step 147600 | 9960 batches | lr 4e-05 | ms/batch 383.92 | loss 3.08 | ppl 21.738 +| epoch 13 step 147800 | 10160 batches | lr 3.97e-05 | ms/batch 383.83 | loss 3.05 | ppl 21.213 +| epoch 13 step 148000 | 10360 batches | lr 3.94e-05 | ms/batch 384.68 | loss 3.09 | ppl 21.995 +---------------------------------------------------------------------------------------------------- +| Eval 37 at step 148000 | time: 1538.82s | valid loss 3.18 | valid ppl 23.993 +---------------------------------------------------------------------------------------------------- +| epoch 13 step 148200 | 10560 batches | lr 3.91e-05 | ms/batch 468.97 | loss 3.11 | ppl 22.352 +| epoch 13 step 148400 | 10760 batches | lr 3.89e-05 | ms/batch 704.91 | loss 3.06 | ppl 21.398 +| epoch 13 step 148600 | 10960 batches | lr 3.86e-05 | ms/batch 703.78 | loss 3.07 | ppl 21.622 +| epoch 13 step 148800 | 11160 batches | lr 3.83e-05 | ms/batch 671.68 | loss 3.12 | ppl 22.641 +| epoch 13 step 149000 | 11360 batches | lr 3.8e-05 | ms/batch 704.17 | loss 3.09 | ppl 21.935 +| epoch 14 step 149200 | 90 batches | lr 3.77e-05 | ms/batch 707.89 | loss 3.08 | ppl 21.847 +| epoch 14 step 149400 | 290 batches | lr 3.74e-05 | ms/batch 692.06 | loss 3.06 | ppl 
21.250 +| epoch 14 step 149600 | 490 batches | lr 3.72e-05 | ms/batch 698.40 | loss 3.10 | ppl 22.096 +| epoch 14 step 149800 | 690 batches | lr 3.69e-05 | ms/batch 708.46 | loss 3.05 | ppl 21.130 +| epoch 14 step 150000 | 890 batches | lr 3.66e-05 | ms/batch 701.80 | loss 3.07 | ppl 21.611 +| epoch 14 step 150200 | 1090 batches | lr 3.63e-05 | ms/batch 684.70 | loss 3.08 | ppl 21.866 +| epoch 14 step 150400 | 1290 batches | lr 3.61e-05 | ms/batch 680.94 | loss 3.07 | ppl 21.455 +| epoch 14 step 150600 | 1490 batches | lr 3.58e-05 | ms/batch 682.02 | loss 3.07 | ppl 21.451 +| epoch 14 step 150800 | 1690 batches | lr 3.55e-05 | ms/batch 667.16 | loss 3.06 | ppl 21.432 +| epoch 14 step 151000 | 1890 batches | lr 3.52e-05 | ms/batch 687.92 | loss 3.08 | ppl 21.720 +| epoch 14 step 151200 | 2090 batches | lr 3.5e-05 | ms/batch 690.29 | loss 3.12 | ppl 22.629 +| epoch 14 step 151400 | 2290 batches | lr 3.47e-05 | ms/batch 695.24 | loss 3.09 | ppl 21.973 +| epoch 14 step 151600 | 2490 batches | lr 3.44e-05 | ms/batch 690.62 | loss 3.07 | ppl 21.541 +| epoch 14 step 151800 | 2690 batches | lr 3.41e-05 | ms/batch 691.73 | loss 3.08 | ppl 21.853 +| epoch 14 step 152000 | 2890 batches | lr 3.39e-05 | ms/batch 721.76 | loss 3.03 | ppl 20.724 +---------------------------------------------------------------------------------------------------- +| Eval 38 at step 152000 | time: 2730.88s | valid loss 3.17 | valid ppl 23.892 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 152200 | 3090 batches | lr 3.36e-05 | ms/batch 773.37 | loss 3.08 | ppl 21.734 +| epoch 14 step 152400 | 3290 batches | lr 3.33e-05 | ms/batch 682.72 | loss 3.09 | ppl 22.046 +| epoch 14 step 152600 | 3490 batches | lr 3.31e-05 | ms/batch 701.64 | loss 3.06 | ppl 21.282 +| epoch 14 step 152800 | 3690 batches | lr 3.28e-05 | ms/batch 716.98 | loss 3.07 | ppl 21.645 +| epoch 14 step 153000 | 3890 batches | lr 3.25e-05 | ms/batch 702.88 | loss 3.06 | ppl 21.403 +| epoch 14 step 153200 | 4090 batches | lr 3.23e-05 | ms/batch 682.68 | loss 3.09 | ppl 21.972 +| epoch 14 step 153400 | 4290 batches | lr 3.2e-05 | ms/batch 704.02 | loss 3.07 | ppl 21.549 +| epoch 14 step 153600 | 4490 batches | lr 3.18e-05 | ms/batch 703.61 | loss 3.09 | ppl 21.998 +| epoch 14 step 153800 | 4690 batches | lr 3.15e-05 | ms/batch 710.51 | loss 3.06 | ppl 21.290 +| epoch 14 step 154000 | 4890 batches | lr 3.12e-05 | ms/batch 713.73 | loss 3.07 | ppl 21.440 +| epoch 14 step 154200 | 5090 batches | lr 3.1e-05 | ms/batch 737.96 | loss 3.08 | ppl 21.739 +| epoch 14 step 154400 | 5290 batches | lr 3.07e-05 | ms/batch 711.39 | loss 3.06 | ppl 21.344 +| epoch 14 step 154600 | 5490 batches | lr 3.05e-05 | ms/batch 702.95 | loss 3.05 | ppl 21.190 +| epoch 14 step 154800 | 5690 batches | lr 3.02e-05 | ms/batch 719.75 | loss 3.07 | ppl 21.542 +| epoch 14 step 155000 | 5890 batches | lr 2.99e-05 | ms/batch 672.31 | loss 3.07 | ppl 21.580 +| epoch 14 step 155200 | 6090 batches | lr 2.97e-05 | ms/batch 709.44 | loss 3.07 | ppl 21.587 +| epoch 14 step 155400 | 6290 batches | lr 2.94e-05 | ms/batch 709.79 | loss 3.07 | ppl 21.648 +| epoch 14 step 155600 | 6490 batches | lr 2.92e-05 | ms/batch 688.42 | loss 3.05 | ppl 21.036 +| epoch 14 step 155800 | 6690 batches | lr 2.89e-05 | ms/batch 689.25 | loss 3.03 | ppl 20.757 +| epoch 14 step 156000 | 6890 batches | lr 2.87e-05 | ms/batch 721.47 | loss 3.06 | ppl 21.351 +---------------------------------------------------------------------------------------------------- 
+| Eval 39 at step 156000 | time: 2828.47s | valid loss 3.17 | valid ppl 23.854 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 156200 | 7090 batches | lr 2.84e-05 | ms/batch 761.55 | loss 3.06 | ppl 21.267 +| epoch 14 step 156400 | 7290 batches | lr 2.82e-05 | ms/batch 656.50 | loss 3.01 | ppl 20.271 +| epoch 14 step 156600 | 7490 batches | lr 2.79e-05 | ms/batch 694.99 | loss 3.06 | ppl 21.258 +| epoch 14 step 156800 | 7690 batches | lr 2.77e-05 | ms/batch 716.22 | loss 3.04 | ppl 20.894 +| epoch 14 step 157000 | 7890 batches | lr 2.74e-05 | ms/batch 713.94 | loss 3.04 | ppl 20.902 +| epoch 14 step 157200 | 8090 batches | lr 2.72e-05 | ms/batch 687.11 | loss 3.06 | ppl 21.311 +| epoch 14 step 157400 | 8290 batches | lr 2.7e-05 | ms/batch 682.84 | loss 3.05 | ppl 21.037 +| epoch 14 step 157600 | 8490 batches | lr 2.67e-05 | ms/batch 665.10 | loss 3.05 | ppl 21.110 +| epoch 14 step 157800 | 8690 batches | lr 2.65e-05 | ms/batch 742.98 | loss 3.07 | ppl 21.548 +| epoch 14 step 158000 | 8890 batches | lr 2.62e-05 | ms/batch 742.00 | loss 3.06 | ppl 21.303 +| epoch 14 step 158200 | 9090 batches | lr 2.6e-05 | ms/batch 682.98 | loss 3.06 | ppl 21.343 +| epoch 14 step 158400 | 9290 batches | lr 2.58e-05 | ms/batch 707.66 | loss 3.05 | ppl 21.196 +| epoch 14 step 158600 | 9490 batches | lr 2.55e-05 | ms/batch 700.45 | loss 3.06 | ppl 21.433 +| epoch 14 step 158800 | 9690 batches | lr 2.53e-05 | ms/batch 678.26 | loss 3.06 | ppl 21.401 +| epoch 14 step 159000 | 9890 batches | lr 2.5e-05 | ms/batch 678.52 | loss 3.04 | ppl 20.949 +| epoch 14 step 159200 | 10090 batches | lr 2.48e-05 | ms/batch 704.73 | loss 3.07 | ppl 21.508 +| epoch 14 step 159400 | 10290 batches | lr 2.46e-05 | ms/batch 705.36 | loss 3.05 | ppl 21.058 +| epoch 14 step 159600 | 10490 batches | lr 2.43e-05 | ms/batch 690.24 | loss 3.09 | ppl 21.881 +| epoch 14 step 159800 | 10690 batches | lr 2.41e-05 | ms/batch 698.55 | loss 3.05 | ppl 21.185 +| epoch 14 step 160000 | 10890 batches | lr 2.39e-05 | ms/batch 678.42 | loss 3.04 | ppl 20.881 +---------------------------------------------------------------------------------------------------- +| Eval 40 at step 160000 | time: 2795.13s | valid loss 3.17 | valid ppl 23.806 +---------------------------------------------------------------------------------------------------- +| epoch 14 step 160200 | 11090 batches | lr 2.36e-05 | ms/batch 743.16 | loss 3.09 | ppl 21.924 +| epoch 14 step 160400 | 11290 batches | lr 2.34e-05 | ms/batch 670.98 | loss 3.08 | ppl 21.781 +| epoch 15 step 160600 | 20 batches | lr 2.32e-05 | ms/batch 688.74 | loss 3.07 | ppl 21.534 +| epoch 15 step 160800 | 220 batches | lr 2.3e-05 | ms/batch 707.95 | loss 3.03 | ppl 20.736 +| epoch 15 step 161000 | 420 batches | lr 2.27e-05 | ms/batch 685.60 | loss 3.07 | ppl 21.451 +| epoch 15 step 161200 | 620 batches | lr 2.25e-05 | ms/batch 711.76 | loss 3.04 | ppl 20.824 +| epoch 15 step 161400 | 820 batches | lr 2.23e-05 | ms/batch 695.85 | loss 3.07 | ppl 21.648 +| epoch 15 step 161600 | 1020 batches | lr 2.21e-05 | ms/batch 680.45 | loss 3.04 | ppl 20.808 +| epoch 15 step 161800 | 1220 batches | lr 2.18e-05 | ms/batch 733.80 | loss 3.06 | ppl 21.352 +| epoch 15 step 162000 | 1420 batches | lr 2.16e-05 | ms/batch 702.32 | loss 3.05 | ppl 21.184 +| epoch 15 step 162200 | 1620 batches | lr 2.14e-05 | ms/batch 689.95 | loss 3.03 | ppl 20.716 +| epoch 15 step 162400 | 1820 batches | lr 2.12e-05 | ms/batch 700.66 | loss 3.07 | ppl 21.463 +| epoch 15 step 
162600 | 2020 batches | lr 2.1e-05 | ms/batch 673.18 | loss 3.09 | ppl 21.980 +| epoch 15 step 162800 | 2220 batches | lr 2.07e-05 | ms/batch 709.69 | loss 3.07 | ppl 21.463 +| epoch 15 step 163000 | 2420 batches | lr 2.05e-05 | ms/batch 709.74 | loss 3.07 | ppl 21.488 +| epoch 15 step 163200 | 2620 batches | lr 2.03e-05 | ms/batch 702.37 | loss 3.06 | ppl 21.232 +| epoch 15 step 163400 | 2820 batches | lr 2.01e-05 | ms/batch 695.04 | loss 3.03 | ppl 20.696 +| epoch 15 step 163600 | 3020 batches | lr 1.99e-05 | ms/batch 718.85 | loss 3.06 | ppl 21.244 +| epoch 15 step 163800 | 3220 batches | lr 1.97e-05 | ms/batch 674.99 | loss 3.05 | ppl 21.183 +| epoch 15 step 164000 | 3420 batches | lr 1.95e-05 | ms/batch 708.94 | loss 3.06 | ppl 21.252 +---------------------------------------------------------------------------------------------------- +| Eval 41 at step 164000 | time: 2798.25s | valid loss 3.17 | valid ppl 23.747 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 164200 | 3620 batches | lr 1.92e-05 | ms/batch 756.27 | loss 3.03 | ppl 20.794 +| epoch 15 step 164400 | 3820 batches | lr 1.9e-05 | ms/batch 686.46 | loss 3.06 | ppl 21.270 +| epoch 15 step 164600 | 4020 batches | lr 1.88e-05 | ms/batch 695.84 | loss 3.07 | ppl 21.566 +| epoch 15 step 164800 | 4220 batches | lr 1.86e-05 | ms/batch 708.79 | loss 3.05 | ppl 21.174 +| epoch 15 step 165000 | 4420 batches | lr 1.84e-05 | ms/batch 678.67 | loss 3.06 | ppl 21.240 +| epoch 15 step 165200 | 4620 batches | lr 1.82e-05 | ms/batch 696.74 | loss 3.06 | ppl 21.238 +| epoch 15 step 165400 | 4820 batches | lr 1.8e-05 | ms/batch 725.44 | loss 3.04 | ppl 20.967 +| epoch 15 step 165600 | 5020 batches | lr 1.78e-05 | ms/batch 682.40 | loss 3.07 | ppl 21.539 +| epoch 15 step 165800 | 5220 batches | lr 1.76e-05 | ms/batch 686.03 | loss 3.05 | ppl 21.048 +| epoch 15 step 166000 | 5420 batches | lr 1.74e-05 | ms/batch 705.11 | loss 3.02 | ppl 20.520 +| epoch 15 step 166200 | 5620 batches | lr 1.72e-05 | ms/batch 692.95 | loss 3.06 | ppl 21.245 +| epoch 15 step 166400 | 5820 batches | lr 1.7e-05 | ms/batch 680.20 | loss 3.05 | ppl 21.210 +| epoch 15 step 166600 | 6020 batches | lr 1.68e-05 | ms/batch 725.01 | loss 3.04 | ppl 20.885 +| epoch 15 step 166800 | 6220 batches | lr 1.66e-05 | ms/batch 696.24 | loss 3.05 | ppl 21.047 +| epoch 15 step 167000 | 6420 batches | lr 1.64e-05 | ms/batch 679.60 | loss 3.06 | ppl 21.386 +| epoch 15 step 167200 | 6620 batches | lr 1.62e-05 | ms/batch 685.90 | loss 3.01 | ppl 20.239 +| epoch 15 step 167400 | 6820 batches | lr 1.6e-05 | ms/batch 696.26 | loss 3.04 | ppl 20.831 +| epoch 15 step 167600 | 7020 batches | lr 1.58e-05 | ms/batch 667.73 | loss 3.05 | ppl 21.056 +| epoch 15 step 167800 | 7220 batches | lr 1.57e-05 | ms/batch 710.56 | loss 3.01 | ppl 20.250 +| epoch 15 step 168000 | 7420 batches | lr 1.55e-05 | ms/batch 684.67 | loss 3.02 | ppl 20.435 +---------------------------------------------------------------------------------------------------- +| Eval 42 at step 168000 | time: 2785.72s | valid loss 3.16 | valid ppl 23.632 +---------------------------------------------------------------------------------------------------- +| epoch 15 step 168200 | 7620 batches | lr 1.53e-05 | ms/batch 757.05 | loss 3.01 | ppl 20.240 +| epoch 15 step 168400 | 7820 batches | lr 1.51e-05 | ms/batch 723.60 | loss 3.04 | ppl 20.901 +| epoch 15 step 168600 | 8020 batches | lr 1.49e-05 | ms/batch 655.26 | loss 3.04 | ppl 20.915 +| epoch 15 step 168800 | 8220 
batches | lr 1.47e-05 | ms/batch 744.40 | loss 3.03 | ppl 20.637 +| epoch 15 step 169000 | 8420 batches | lr 1.45e-05 | ms/batch 683.70 | loss 3.04 | ppl 20.935 +| epoch 15 step 169200 | 8620 batches | lr 1.43e-05 | ms/batch 706.63 | loss 3.04 | ppl 20.841 +| epoch 15 step 169400 | 8820 batches | lr 1.42e-05 | ms/batch 673.37 | loss 3.06 | ppl 21.253 +| epoch 15 step 169600 | 9020 batches | lr 1.4e-05 | ms/batch 724.83 | loss 3.05 | ppl 21.077 +| epoch 15 step 169800 | 9220 batches | lr 1.38e-05 | ms/batch 710.05 | loss 3.02 | ppl 20.465 +| epoch 15 step 170000 | 9420 batches | lr 1.36e-05 | ms/batch 714.29 | loss 3.05 | ppl 21.075 +| epoch 15 step 170200 | 9620 batches | lr 1.34e-05 | ms/batch 708.96 | loss 3.06 | ppl 21.377 +| epoch 15 step 170400 | 9820 batches | lr 1.33e-05 | ms/batch 709.15 | loss 3.03 | ppl 20.644 +| epoch 15 step 170600 | 10020 batches | lr 1.31e-05 | ms/batch 675.72 | loss 3.04 | ppl 20.958 +| epoch 15 step 170800 | 10220 batches | lr 1.29e-05 | ms/batch 688.52 | loss 3.04 | ppl 20.876 +| epoch 15 step 171000 | 10420 batches | lr 1.27e-05 | ms/batch 685.00 | loss 3.04 | ppl 20.869 +| epoch 15 step 171200 | 10620 batches | lr 1.26e-05 | ms/batch 720.81 | loss 3.07 | ppl 21.626 +| epoch 15 step 171400 | 10820 batches | lr 1.24e-05 | ms/batch 688.74 | loss 3.02 | ppl 20.402 +| epoch 15 step 171600 | 11020 batches | lr 1.22e-05 | ms/batch 688.38 | loss 3.06 | ppl 21.433 +| epoch 15 step 171800 | 11220 batches | lr 1.21e-05 | ms/batch 725.25 | loss 3.06 | ppl 21.409 +| epoch 15 step 172000 | 11420 batches | lr 1.19e-05 | ms/batch 688.06 | loss 3.06 | ppl 21.341 +---------------------------------------------------------------------------------------------------- +| Eval 43 at step 172000 | time: 2811.86s | valid loss 3.16 | valid ppl 23.555 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 172200 | 150 batches | lr 1.17e-05 | ms/batch 733.80 | loss 3.04 | ppl 20.922 +| epoch 16 step 172400 | 350 batches | lr 1.16e-05 | ms/batch 716.14 | loss 3.02 | ppl 20.536 +| epoch 16 step 172600 | 550 batches | lr 1.14e-05 | ms/batch 697.95 | loss 3.05 | ppl 21.120 +| epoch 16 step 172800 | 750 batches | lr 1.12e-05 | ms/batch 677.36 | loss 3.03 | ppl 20.767 +| epoch 16 step 173000 | 950 batches | lr 1.11e-05 | ms/batch 688.14 | loss 3.02 | ppl 20.590 +| epoch 16 step 173200 | 1150 batches | lr 1.09e-05 | ms/batch 694.21 | loss 3.06 | ppl 21.245 +| epoch 16 step 173400 | 1350 batches | lr 1.08e-05 | ms/batch 687.60 | loss 3.04 | ppl 20.835 +| epoch 16 step 173600 | 1550 batches | lr 1.06e-05 | ms/batch 689.94 | loss 3.03 | ppl 20.718 +| epoch 16 step 173800 | 1750 batches | lr 1.04e-05 | ms/batch 701.32 | loss 3.03 | ppl 20.615 +| epoch 16 step 174000 | 1950 batches | lr 1.03e-05 | ms/batch 718.46 | loss 3.06 | ppl 21.302 +| epoch 16 step 174200 | 2150 batches | lr 1.01e-05 | ms/batch 701.55 | loss 3.07 | ppl 21.531 +| epoch 16 step 174400 | 2350 batches | lr 9.97e-06 | ms/batch 714.53 | loss 3.05 | ppl 21.045 +| epoch 16 step 174600 | 2550 batches | lr 9.82e-06 | ms/batch 688.64 | loss 3.05 | ppl 21.136 +| epoch 16 step 174800 | 2750 batches | lr 9.67e-06 | ms/batch 676.25 | loss 3.03 | ppl 20.650 +| epoch 16 step 175000 | 2950 batches | lr 9.52e-06 | ms/batch 672.01 | loss 3.03 | ppl 20.677 +| epoch 16 step 175200 | 3150 batches | lr 9.37e-06 | ms/batch 682.98 | loss 3.05 | ppl 21.058 +| epoch 16 step 175400 | 3350 batches | lr 9.22e-06 | ms/batch 703.95 | loss 3.05 | ppl 21.083 +| epoch 16 step 175600 | 3550 
batches | lr 9.07e-06 | ms/batch 725.15 | loss 3.03 | ppl 20.678 +| epoch 16 step 175800 | 3750 batches | lr 8.92e-06 | ms/batch 697.98 | loss 3.04 | ppl 20.887 +| epoch 16 step 176000 | 3950 batches | lr 8.78e-06 | ms/batch 714.39 | loss 3.04 | ppl 20.890 +---------------------------------------------------------------------------------------------------- +| Eval 44 at step 176000 | time: 2793.96s | valid loss 3.16 | valid ppl 23.555 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 176200 | 4150 batches | lr 8.63e-06 | ms/batch 740.62 | loss 3.05 | ppl 21.035 +| epoch 16 step 176400 | 4350 batches | lr 8.49e-06 | ms/batch 688.27 | loss 3.05 | ppl 21.013 +| epoch 16 step 176600 | 4550 batches | lr 8.35e-06 | ms/batch 709.61 | loss 3.07 | ppl 21.515 +| epoch 16 step 176800 | 4750 batches | lr 8.21e-06 | ms/batch 675.71 | loss 3.01 | ppl 20.389 +| epoch 16 step 177000 | 4950 batches | lr 8.07e-06 | ms/batch 680.17 | loss 3.05 | ppl 21.062 +| epoch 16 step 177200 | 5150 batches | lr 7.93e-06 | ms/batch 701.57 | loss 3.04 | ppl 20.847 +| epoch 16 step 177400 | 5350 batches | lr 7.79e-06 | ms/batch 675.55 | loss 3.02 | ppl 20.562 +| epoch 16 step 177600 | 5550 batches | lr 7.66e-06 | ms/batch 697.09 | loss 3.03 | ppl 20.635 +| epoch 16 step 177800 | 5750 batches | lr 7.52e-06 | ms/batch 694.86 | loss 3.04 | ppl 21.003 +| epoch 16 step 178000 | 5950 batches | lr 7.39e-06 | ms/batch 717.27 | loss 3.03 | ppl 20.709 +| epoch 16 step 178200 | 6150 batches | lr 7.26e-06 | ms/batch 708.80 | loss 3.03 | ppl 20.721 +| epoch 16 step 178400 | 6350 batches | lr 7.13e-06 | ms/batch 680.38 | loss 3.07 | ppl 21.498 +| epoch 16 step 178600 | 6550 batches | lr 7e-06 | ms/batch 690.85 | loss 2.99 | ppl 19.816 +| epoch 16 step 178800 | 6750 batches | lr 6.87e-06 | ms/batch 686.33 | loss 3.02 | ppl 20.487 +| epoch 16 step 179000 | 6950 batches | lr 6.74e-06 | ms/batch 700.78 | loss 3.03 | ppl 20.767 +| epoch 16 step 179200 | 7150 batches | lr 6.61e-06 | ms/batch 699.08 | loss 3.00 | ppl 20.040 +| epoch 16 step 179400 | 7350 batches | lr 6.49e-06 | ms/batch 731.67 | loss 3.01 | ppl 20.243 +| epoch 16 step 179600 | 7550 batches | lr 6.36e-06 | ms/batch 701.46 | loss 3.01 | ppl 20.274 +| epoch 16 step 179800 | 7750 batches | lr 6.24e-06 | ms/batch 708.31 | loss 3.03 | ppl 20.608 +| epoch 16 step 180000 | 7950 batches | lr 6.12e-06 | ms/batch 709.01 | loss 3.01 | ppl 20.331 +---------------------------------------------------------------------------------------------------- +| Eval 45 at step 180000 | time: 2799.41s | valid loss 3.16 | valid ppl 23.509 +---------------------------------------------------------------------------------------------------- +| epoch 16 step 180200 | 8150 batches | lr 6e-06 | ms/batch 762.66 | loss 3.02 | ppl 20.552 +| epoch 16 step 180400 | 8350 batches | lr 5.88e-06 | ms/batch 712.89 | loss 3.03 | ppl 20.748 +| epoch 16 step 180600 | 8550 batches | lr 5.76e-06 | ms/batch 697.51 | loss 3.02 | ppl 20.448 +| epoch 16 step 180800 | 8750 batches | lr 5.64e-06 | ms/batch 692.89 | loss 3.03 | ppl 20.772 +| epoch 16 step 181000 | 8950 batches | lr 5.53e-06 | ms/batch 704.48 | loss 3.04 | ppl 20.993 +| epoch 16 step 181200 | 9150 batches | lr 5.41e-06 | ms/batch 681.81 | loss 3.01 | ppl 20.388 +| epoch 16 step 181400 | 9350 batches | lr 5.3e-06 | ms/batch 739.49 | loss 3.03 | ppl 20.750 +| epoch 16 step 181600 | 9550 batches | lr 5.18e-06 | ms/batch 673.63 | loss 3.06 | ppl 21.365 +| epoch 16 step 181800 | 9750 batches | lr 
5.07e-06 | ms/batch 678.87 | loss 3.02 | ppl 20.486 +| epoch 16 step 182000 | 9950 batches | lr 4.96e-06 | ms/batch 688.93 | loss 3.03 | ppl 20.719 +| epoch 16 step 182200 | 10150 batches | lr 4.85e-06 | ms/batch 700.14 | loss 3.01 | ppl 20.286 +| epoch 16 step 182400 | 10350 batches | lr 4.75e-06 | ms/batch 698.98 | loss 3.04 | ppl 20.915 +| epoch 16 step 182600 | 10550 batches | lr 4.64e-06 | ms/batch 675.18 | loss 3.06 | ppl 21.356 +| epoch 16 step 182800 | 10750 batches | lr 4.53e-06 | ms/batch 675.41 | loss 3.01 | ppl 20.282 +| epoch 16 step 183000 | 10950 batches | lr 4.43e-06 | ms/batch 696.78 | loss 3.03 | ppl 20.604 +| epoch 16 step 183200 | 11150 batches | lr 4.33e-06 | ms/batch 705.01 | loss 3.08 | ppl 21.672 +| epoch 16 step 183400 | 11350 batches | lr 4.23e-06 | ms/batch 724.39 | loss 3.04 | ppl 20.891 +| epoch 17 step 183600 | 80 batches | lr 4.12e-06 | ms/batch 694.42 | loss 3.04 | ppl 20.978 +| epoch 17 step 183800 | 280 batches | lr 4.03e-06 | ms/batch 706.89 | loss 3.01 | ppl 20.311 +| epoch 17 step 184000 | 480 batches | lr 3.93e-06 | ms/batch 697.17 | loss 3.05 | ppl 21.175 +---------------------------------------------------------------------------------------------------- +| Eval 46 at step 184000 | time: 2799.09s | valid loss 3.16 | valid ppl 23.480 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 184200 | 680 batches | lr 3.83e-06 | ms/batch 724.21 | loss 3.01 | ppl 20.267 +| epoch 17 step 184400 | 880 batches | lr 3.73e-06 | ms/batch 717.97 | loss 3.04 | ppl 20.832 +| epoch 17 step 184600 | 1080 batches | lr 3.64e-06 | ms/batch 700.46 | loss 3.04 | ppl 20.875 +| epoch 17 step 184800 | 1280 batches | lr 3.55e-06 | ms/batch 707.72 | loss 3.02 | ppl 20.489 +| epoch 17 step 185000 | 1480 batches | lr 3.45e-06 | ms/batch 667.24 | loss 3.02 | ppl 20.563 +| epoch 17 step 185200 | 1680 batches | lr 3.36e-06 | ms/batch 734.80 | loss 3.02 | ppl 20.586 +| epoch 17 step 185400 | 1880 batches | lr 3.27e-06 | ms/batch 688.00 | loss 3.03 | ppl 20.797 +| epoch 17 step 185600 | 2080 batches | lr 3.18e-06 | ms/batch 689.00 | loss 3.08 | ppl 21.708 +| epoch 17 step 185800 | 2280 batches | lr 3.1e-06 | ms/batch 736.30 | loss 3.05 | ppl 21.169 +| epoch 17 step 186000 | 2480 batches | lr 3.01e-06 | ms/batch 688.24 | loss 3.03 | ppl 20.685 +| epoch 17 step 186200 | 2680 batches | lr 2.93e-06 | ms/batch 682.16 | loss 3.05 | ppl 21.041 +| epoch 17 step 186400 | 2880 batches | lr 2.84e-06 | ms/batch 733.76 | loss 2.99 | ppl 19.908 +| epoch 17 step 186600 | 3080 batches | lr 2.76e-06 | ms/batch 681.75 | loss 3.04 | ppl 20.892 +| epoch 17 step 186800 | 3280 batches | lr 2.68e-06 | ms/batch 694.90 | loss 3.05 | ppl 21.196 +| epoch 17 step 187000 | 3480 batches | lr 2.6e-06 | ms/batch 714.81 | loss 3.02 | ppl 20.444 +| epoch 17 step 187200 | 3680 batches | lr 2.52e-06 | ms/batch 739.94 | loss 3.04 | ppl 20.839 +| epoch 17 step 187400 | 3880 batches | lr 2.44e-06 | ms/batch 696.52 | loss 3.02 | ppl 20.547 +| epoch 17 step 187600 | 4080 batches | lr 2.36e-06 | ms/batch 711.46 | loss 3.05 | ppl 21.143 +| epoch 17 step 187800 | 4280 batches | lr 2.29e-06 | ms/batch 676.34 | loss 3.03 | ppl 20.690 +| epoch 17 step 188000 | 4480 batches | lr 2.21e-06 | ms/batch 721.67 | loss 3.05 | ppl 21.132 +---------------------------------------------------------------------------------------------------- +| Eval 47 at step 188000 | time: 2818.78s | valid loss 3.15 | valid ppl 23.437 
+---------------------------------------------------------------------------------------------------- +| epoch 17 step 188200 | 4680 batches | lr 2.14e-06 | ms/batch 744.57 | loss 3.02 | ppl 20.544 +| epoch 17 step 188400 | 4880 batches | lr 2.07e-06 | ms/batch 679.14 | loss 3.02 | ppl 20.582 +| epoch 17 step 188600 | 5080 batches | lr 2e-06 | ms/batch 683.64 | loss 3.04 | ppl 20.906 +| epoch 17 step 188800 | 5280 batches | lr 1.93e-06 | ms/batch 701.30 | loss 3.03 | ppl 20.615 +| epoch 17 step 189000 | 5480 batches | lr 1.86e-06 | ms/batch 708.69 | loss 3.01 | ppl 20.322 +| epoch 17 step 189200 | 5680 batches | lr 1.79e-06 | ms/batch 672.27 | loss 3.04 | ppl 20.907 +| epoch 17 step 189400 | 5880 batches | lr 1.73e-06 | ms/batch 732.04 | loss 3.03 | ppl 20.725 +| epoch 17 step 189600 | 6080 batches | lr 1.66e-06 | ms/batch 710.39 | loss 3.03 | ppl 20.774 +| epoch 17 step 189800 | 6280 batches | lr 1.6e-06 | ms/batch 692.23 | loss 3.04 | ppl 20.937 +| epoch 17 step 190000 | 6480 batches | lr 1.54e-06 | ms/batch 703.65 | loss 3.02 | ppl 20.415 +| epoch 17 step 190200 | 6680 batches | lr 1.48e-06 | ms/batch 695.33 | loss 2.99 | ppl 19.968 +| epoch 17 step 190400 | 6880 batches | lr 1.42e-06 | ms/batch 698.42 | loss 3.03 | ppl 20.649 +| epoch 17 step 190600 | 7080 batches | lr 1.36e-06 | ms/batch 685.73 | loss 3.02 | ppl 20.404 +| epoch 17 step 190800 | 7280 batches | lr 1.3e-06 | ms/batch 685.45 | loss 2.98 | ppl 19.645 +| epoch 17 step 191000 | 7480 batches | lr 1.25e-06 | ms/batch 684.16 | loss 3.02 | ppl 20.496 +| epoch 17 step 191200 | 7680 batches | lr 1.19e-06 | ms/batch 693.92 | loss 3.00 | ppl 20.163 +| epoch 17 step 191400 | 7880 batches | lr 1.14e-06 | ms/batch 687.54 | loss 3.01 | ppl 20.235 +| epoch 17 step 191600 | 8080 batches | lr 1.09e-06 | ms/batch 705.35 | loss 3.03 | ppl 20.600 +| epoch 17 step 191800 | 8280 batches | lr 1.04e-06 | ms/batch 708.66 | loss 3.01 | ppl 20.376 +| epoch 17 step 192000 | 8480 batches | lr 9.86e-07 | ms/batch 703.61 | loss 3.02 | ppl 20.442 +---------------------------------------------------------------------------------------------------- +| Eval 48 at step 192000 | time: 2792.73s | valid loss 3.15 | valid ppl 23.404 +---------------------------------------------------------------------------------------------------- +| epoch 17 step 192200 | 8680 batches | lr 9.37e-07 | ms/batch 738.99 | loss 3.03 | ppl 20.750 +| epoch 17 step 192400 | 8880 batches | lr 8.9e-07 | ms/batch 684.91 | loss 3.03 | ppl 20.652 +| epoch 17 step 192600 | 9080 batches | lr 8.44e-07 | ms/batch 697.17 | loss 3.03 | ppl 20.656 +| epoch 17 step 192800 | 9280 batches | lr 7.99e-07 | ms/batch 716.20 | loss 3.02 | ppl 20.529 +| epoch 17 step 193000 | 9480 batches | lr 7.55e-07 | ms/batch 708.87 | loss 3.03 | ppl 20.800 +| epoch 17 step 193200 | 9680 batches | lr 7.12e-07 | ms/batch 680.97 | loss 3.03 | ppl 20.765 +| epoch 17 step 193400 | 9880 batches | lr 6.71e-07 | ms/batch 701.09 | loss 3.01 | ppl 20.225 +| epoch 17 step 193600 | 10080 batches | lr 6.31e-07 | ms/batch 697.86 | loss 3.04 | ppl 20.959 +| epoch 17 step 193800 | 10280 batches | lr 5.92e-07 | ms/batch 704.29 | loss 3.01 | ppl 20.360 +| epoch 17 step 194000 | 10480 batches | lr 5.55e-07 | ms/batch 705.22 | loss 3.05 | ppl 21.131 +| epoch 17 step 194200 | 10680 batches | lr 5.18e-07 | ms/batch 690.06 | loss 3.03 | ppl 20.726 +| epoch 17 step 194400 | 10880 batches | lr 4.83e-07 | ms/batch 694.26 | loss 3.01 | ppl 20.253 +| epoch 17 step 194600 | 11080 batches | lr 4.49e-07 | ms/batch 691.17 | loss 3.05 | ppl 21.187 
+| epoch 17 step 194800 | 11280 batches | lr 4.17e-07 | ms/batch 706.39 | loss 3.05 | ppl 21.185 +| epoch 18 step 195000 | 10 batches | lr 3.85e-07 | ms/batch 710.81 | loss 3.04 | ppl 20.965 +| epoch 18 step 195200 | 210 batches | lr 3.55e-07 | ms/batch 698.26 | loss 3.01 | ppl 20.292 +| epoch 18 step 195400 | 410 batches | lr 3.26e-07 | ms/batch 694.39 | loss 3.04 | ppl 20.958 +| epoch 18 step 195600 | 610 batches | lr 2.98e-07 | ms/batch 691.04 | loss 3.01 | ppl 20.287 +| epoch 18 step 195800 | 810 batches | lr 2.72e-07 | ms/batch 701.33 | loss 3.05 | ppl 21.051 +| epoch 18 step 196000 | 1010 batches | lr 2.47e-07 | ms/batch 719.59 | loss 3.01 | ppl 20.240 +---------------------------------------------------------------------------------------------------- +| Eval 49 at step 196000 | time: 2804.08s | valid loss 3.15 | valid ppl 23.395 +---------------------------------------------------------------------------------------------------- +| epoch 18 step 196200 | 1210 batches | lr 2.23e-07 | ms/batch 743.40 | loss 3.04 | ppl 20.868 +| epoch 18 step 196400 | 1410 batches | lr 2e-07 | ms/batch 688.08 | loss 3.03 | ppl 20.707 +| epoch 18 step 196600 | 1610 batches | lr 1.78e-07 | ms/batch 698.43 | loss 3.01 | ppl 20.227 +| epoch 18 step 196800 | 1810 batches | lr 1.58e-07 | ms/batch 698.99 | loss 3.04 | ppl 20.847 +| epoch 18 step 197000 | 2010 batches | lr 1.39e-07 | ms/batch 711.49 | loss 3.06 | ppl 21.434 +| epoch 18 step 197200 | 2210 batches | lr 1.21e-07 | ms/batch 699.04 | loss 3.05 | ppl 21.071 +| epoch 18 step 197400 | 2410 batches | lr 1.04e-07 | ms/batch 678.89 | loss 3.04 | ppl 20.965 +| epoch 18 step 197600 | 2610 batches | lr 8.88e-08 | ms/batch 705.13 | loss 3.03 | ppl 20.720 +| epoch 18 step 197800 | 2810 batches | lr 7.46e-08 | ms/batch 712.00 | loss 3.01 | ppl 20.327 +| epoch 18 step 198000 | 3010 batches | lr 6.17e-08 | ms/batch 711.63 | loss 3.03 | ppl 20.694 +| epoch 18 step 198200 | 3210 batches | lr 5e-08 | ms/batch 692.05 | loss 3.03 | ppl 20.710 +| epoch 18 step 198400 | 3410 batches | lr 3.95e-08 | ms/batch 685.17 | loss 3.04 | ppl 20.895 +| epoch 18 step 198600 | 3610 batches | lr 3.02e-08 | ms/batch 692.91 | loss 3.01 | ppl 20.257 +| epoch 18 step 198800 | 3810 batches | lr 2.22e-08 | ms/batch 685.56 | loss 3.03 | ppl 20.780 +| epoch 18 step 199000 | 4010 batches | lr 1.54e-08 | ms/batch 699.55 | loss 3.05 | ppl 21.096 +| epoch 18 step 199200 | 4210 batches | lr 9.87e-09 | ms/batch 690.53 | loss 3.03 | ppl 20.654 +| epoch 18 step 199400 | 4410 batches | lr 5.55e-09 | ms/batch 688.91 | loss 3.04 | ppl 20.838 +| epoch 18 step 199600 | 4610 batches | lr 2.47e-09 | ms/batch 711.03 | loss 3.04 | ppl 20.891 +| epoch 18 step 199800 | 4810 batches | lr 6.17e-10 | ms/batch 686.10 | loss 3.02 | ppl 20.406 +| epoch 18 step 200000 | 5010 batches | lr 0 | ms/batch 702.14 | loss 3.05 | ppl 21.176 +---------------------------------------------------------------------------------------------------- +| Eval 50 at step 200000 | time: 2793.85s | valid loss 3.15 | valid ppl 23.396 +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +End of training +==================================================================================================== +| End of training | test loss 3.19 | test ppl 24.241 +==================================================================================================== diff --git 
a/NLP/Transformer-XL/mem_transformer.py b/NLP/Transformer-XL/mem_transformer.py new file mode 100644 index 0000000..45147df --- /dev/null +++ b/NLP/Transformer-XL/mem_transformer.py @@ -0,0 +1,812 @@ +import sys +import math +import functools + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.append('utils') +from proj_adaptive_softmax import ProjectedAdaptiveLogSoftmax +from log_uniform_sampler import LogUniformSampler, sample_logits + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = 
self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) + + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:]) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, 
self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -float('inf')).type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -float('inf')).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is 
not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def 
forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed = emb_flat.view(*inp.size(), self.d_proj) + + embed.mul_(self.emb_scale) + + return embed + +class MemTransformerLM(nn.Module): + def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner, + dropout, dropatt, tie_weight=True, d_embed=None, + div_val=1, tie_projs=[False], pre_lnorm=False, + tgt_len=None, ext_len=None, mem_len=None, + cutoffs=[], adapt_inp=False, + same_length=False, attn_type=0, clamp_len=-1, + sample_softmax=-1): + super(MemTransformerLM, self).__init__() + self.n_token = n_token + + d_embed = d_model if d_embed is None else d_embed + self.d_embed = d_embed + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + + self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, + div_val=div_val) + + self.drop = nn.Dropout(dropout) + + self.n_layer = n_layer + + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + self.max_klen = tgt_len + ext_len + mem_len + + self.attn_type = attn_type + + self.layers = nn.ModuleList() + if attn_type == 0: # the default attention + for i in range(n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type == 1: # learnable embeddings + for i in range(n_layer): + self.layers.append( + RelLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type in [2, 3]: # absolute embeddings + for i in range(n_layer): + self.layers.append( + DecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + self.sample_softmax = sample_softmax + # use sampled softmax + if sample_softmax > 0: + self.out_layer = nn.Linear(d_model, n_token) + if tie_weight: + self.out_layer.weight = self.word_emb.weight + self.tie_weight = tie_weight + self.sampler = LogUniformSampler(n_token, sample_softmax) + + # use adaptive softmax (including standard softmax) + else: + self.crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, + cutoffs, div_val=div_val) + + if tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight + + if tie_projs: + for i, tie_proj in enumerate(tie_projs): + if tie_proj and div_val == 1 and d_model != d_embed: + self.crit.out_projs[i] = self.word_emb.emb_projs[0] + elif tie_proj and div_val != 1: + self.crit.out_projs[i] = self.word_emb.emb_projs[i] + + self.same_length = same_length + self.clamp_len = clamp_len + + self._create_params() + + def backward_compatible(self): + self.sample_softmax = -1 + + def _create_params(self): + if self.attn_type == 0: 
# default attention + self.pos_emb = PositionalEmbedding(self.d_model) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer+1): + empty = torch.empty(0, dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. + with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + + word_emb = self.word_emb(dec_inp) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, self.r_w_bias, + self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + hids.append(core_out) + for i, layer in enumerate(self.layers): + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + 
elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, data, target, *mems): + # nn.DataParallel does not allow size(0) tensors to be broadcasted. + # So, have to initialize size(0) mems inside the model forward. + # Moreover, have to return new_mems to allow nn.DataParallel to piece + # them together. + if not mems: mems = self.init_mems() + + tgt_len = target.size(0) + hidden, new_mems = self._forward(data, mems=mems) + + pred_hid = hidden[-tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.tie_weight + logit = sample_logits(self.word_emb, + self.out_layer.bias, target, pred_hid, self.sampler) + loss = -F.log_softmax(logit, -1)[:, :, 0] + else: + loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1)) + loss = loss.view(tgt_len, -1) + + if new_mems is None: + return [loss] + else: + return [loss] + new_mems + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='unit test') + + parser.add_argument('--n_layer', type=int, default=4, help='') + parser.add_argument('--n_rel_layer', type=int, default=4, help='') + parser.add_argument('--n_head', type=int, default=2, help='') + parser.add_argument('--d_head', type=int, default=2, help='') + parser.add_argument('--d_model', type=int, default=200, help='') + parser.add_argument('--d_embed', type=int, default=200, help='') + parser.add_argument('--d_inner', type=int, default=200, help='') + parser.add_argument('--dropout', type=float, default=0.0, help='') + parser.add_argument('--cuda', action='store_true', help='') + parser.add_argument('--seed', type=int, default=1111, help='') + parser.add_argument('--multi_gpu', action='store_true', help='') + + args = parser.parse_args() + + device = torch.device("cuda" if args.cuda else "cpu") + + B = 4 + tgt_len, mem_len, ext_len = 36, 36, 0 + data_len = tgt_len * 20 + args.n_token = 10000 + + import data_utils + + data = torch.LongTensor(data_len*B).random_(0, args.n_token).to(device) + diter = data_utils.LMOrderedIterator(data, B, tgt_len, device=device, ext_len=ext_len) + + cutoffs = [args.n_token // 2] + tie_projs = [False] + [True] * len(cutoffs) + + for div_val in [1, 2]: + for d_embed in [200, 100]: + model = MemTransformerLM(args.n_token, args.n_layer, args.n_head, + 
args.d_model, args.d_head, args.d_inner, args.dropout, + dropatt=args.dropout, tie_weight=True, + d_embed=d_embed, div_val=div_val, + tie_projs=tie_projs, pre_lnorm=True, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + cutoffs=cutoffs, attn_type=0).to(device) + + print(sum(p.numel() for p in model.parameters())) + + mems = tuple() + for idx, (inp, tgt, seqlen) in enumerate(diter): + print('batch {}'.format(idx)) + out = model(inp, tgt, *mems) + mems = out[1:] diff --git a/NLP/Transformer-XL/run_wt103_adan.sh b/NLP/Transformer-XL/run_wt103_adan.sh new file mode 100644 index 0000000..8ea88ff --- /dev/null +++ b/NLP/Transformer-XL/run_wt103_adan.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +if [[ $1 == 'train' ]]; then + echo 'Run training...' + python train.py \ + --cuda \ + --data /root/autodl-tmp/data/wikitext-103/ \ + --dataset wt103 \ + --adaptive \ + --n_layer 16 \ + --d_model 410 \ + --n_head 10 \ + --d_head 41 \ + --d_inner 2100 \ + --dropout 0.1 \ + --dropatt 0.0 \ + --optim adan \ + --wd 0.02 \ + --lr 0.0015 \ + --opt-betas 0.9 0.9 0.999 \ + --clip 0.25 \ + --lr_min 1e-6 \ + --warmup_step 5000 \ + --max_step 200000 \ + --tgt_len 150 \ + --mem_len 150 \ + --eval_tgt_len 150 \ + --batch_size 60 \ + --multi_gpu \ + --gpu0_bsz 4 \ + ${@:2} +elif [[ $1 == 'eval' ]]; then + echo 'Run evaluation...' + python eval.py \ + --cuda \ + --data /root/autodl-tmp/data/wikitext-103/ \ + --dataset wt103 \ + --tgt_len 64 \ + --mem_len 640 \ + --clamp_len 400 \ + --same_length \ + --split test \ + ${@:2} +else + echo 'unknown argment 1' +fi diff --git a/NLP/Transformer-XL/train.py b/NLP/Transformer-XL/train.py new file mode 100644 index 0000000..be07202 --- /dev/null +++ b/NLP/Transformer-XL/train.py @@ -0,0 +1,581 @@ +# coding: utf-8 +import argparse +import time +import math +import os, sys +import itertools + +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +from adan import Adan + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/wikitext-103', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='wt103', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=10, + help='number of heads') +parser.add_argument('--d_head', type=int, default=50, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=500, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=1000, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.0, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + 
help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad', 'adan'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--wd', type=float, default=0.02, + help='weight decayss') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=60, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=70, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=50, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=0, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--cuda', action='store_true', + help='use CUDA') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention 
type. 0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--fp16', action='store_true', + help='Run in pseudo-fp16 mode (fp16 storage fp32 math).') +parser.add_argument('--static-loss-scale', type=float, default=1, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + +# Set the random seed manually for reproducibility. 
+np.random.seed(args.seed) +torch.manual_seed(args.seed) +if torch.cuda.is_available(): + if not args.cuda: + print('WARNING: You have a CUDA device, so you should probably run with --cuda') + else: + torch.cuda.manual_seed_all(args.seed) + +# Validate `--fp16` option +if args.fp16: + if not args.cuda: + print('WARNING: --fp16 requires --cuda, ignoring --fp16 option') + args.fp16 = False + else: + try: + from apex.fp16_utils import FP16_Optimizer + except: + print('WARNING: apex not installed, ignoring --fp16 option') + args.fp16 = False + +device = torch.device('cuda' if args.cuda else 'cpu') + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +eval_batch_size = 10 +tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def 
update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +if args.restart: + with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) + if not args.fp16: + model = model.float() + model.apply(update_dropout) + model.apply(update_dropatt) +else: + model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + +if args.fp16: + model = model.half() + +if args.multi_gpu: + model = model.to(device) + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(device) + else: + para_model = nn.DataParallel(model, dim=1).to(device) +else: + para_model = model.to(device) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = optim.Adam(model.parameters(), lr=args.lr) + +elif args.optim.lower() == 'adan': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = Adan(sparse_params,betas=args.opt_betas, lr=args.lr, weight_decay= args.wd) + optimizer = Adan(dense_params, lr=args.lr,betas=args.opt_betas, weight_decay= args.wd) + else: + optimizer = Adan(model.parameters(), lr=args.lr, betas=args.opt_betas, weight_decay= args.wd) + +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + +#### scheduler +if args.scheduler == 'cosine': + # here we do not set eta_min to lr_min to be backward compatible + # because in previous versions eta_min is default to 0 + # rather than the default value of lr_min 1e-6 + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) # should use eta_min arg + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) # should use eta_min arg +elif args.scheduler == 'inv_sqrt': + # originally used for Transformer (in Attention is all 
you need) + def lr_lambda(step): + # return a multiplier instead of a learning rate + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. / (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) +elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) +elif args.scheduler == 'constant': + pass + +if args.cuda and args.fp16: + # If args.dynamic_loss_scale is False, static_loss_scale will be used. + # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale. + optimizer = FP16_Optimizer(optimizer, + static_loss_scale = args.static_loss_scale, + dynamic_loss_scale = args.dynamic_loss_scale, + dynamic_loss_args = {'init_scale': 2 ** 16}) + +if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + +logging('=' * 100) +for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +############################################################################### +# Training code +############################################################################### + +def evaluate(eval_iter): + # Turn on evaluation mode which disables dropout. + model.eval() + + # If the model does not use memory at all, make the ext_len longer. + # Otherwise, make the mem_len longer and keep the ext_len the same. + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + # Switch back to the training mode + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + + return total_loss / total_len + + +def train(): + # Turn on training mode which enables dropout. 
+ global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + model.train() + if args.batch_chunk > 1: + mems = [tuple() for _ in range(args.batch_chunk)] + else: + mems = tuple() + train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter + for batch, (data, target, seq_len) in enumerate(train_iter): + model.zero_grad() + if args.batch_chunk > 1: + data_chunks = torch.chunk(data, args.batch_chunk, 1) + target_chunks = torch.chunk(target, args.batch_chunk, 1) + for i in range(args.batch_chunk): + data_i = data_chunks[i].contiguous() + target_i = target_chunks[i].contiguous() + ret = para_model(data_i, target_i, *mems[i]) + loss, mems[i] = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) / args.batch_chunk + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + train_loss += loss.float().item() + else: + ret = para_model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + train_loss += loss.float().item() + + if args.fp16: + optimizer.clip_master_grads(args.clip) + else: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) + + optimizer.step() + if args.sample_softmax > 0: + optimizer_sparse.step() + + # step-wise learning rate annealing + train_step += 1 + if args.scheduler in ['cosine', 'constant', 'dev_perf']: + # linear warmup stage + if train_step < args.warmup_step: + curr_lr = args.lr * train_step / args.warmup_step + optimizer.param_groups[0]['lr'] = curr_lr + if args.sample_softmax > 0: + optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 + else: + if args.scheduler == 'cosine': + scheduler.step(train_step) + if args.sample_softmax > 0: + scheduler_sparse.step(train_step) + elif args.scheduler == 'inv_sqrt': + scheduler.step(train_step) + + if train_step % args.log_interval == 0: + cur_loss = train_loss / args.log_interval + elapsed = time.time() - log_start_time + log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ + '| ms/batch {:5.2f} | loss {:5.2f}'.format( + epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, cur_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) + else: + log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) + logging(log_str) + train_loss = 0 + log_start_time = time.time() + + if train_step % args.eval_interval == 0: + val_loss = evaluate(va_iter) + logging('-' * 100) + log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ + '| valid loss {:5.2f}'.format( + train_step // args.eval_interval, train_step, + (time.time() - eval_start_time), val_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) + else: + log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) + logging(log_str) + logging('-' * 100) + # Save the model if the validation loss is the best we've seen so far. 
+ if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f: + torch.save(model, f) + with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + break + +# Loop over epochs. +train_step = 0 +train_loss = 0 +best_val_loss = None + +log_start_time = time.time() +eval_start_time = time.time() + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + break +except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + +# Load the best saved model. +with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) +para_model = model.to(device) + +# Run on test data. +test_loss = evaluate(te_iter) +logging('=' * 100) +if args.dataset in ['enwik8', 'text8']: + logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( + test_loss, test_loss / math.log(2))) +else: + logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( + test_loss, math.exp(test_loss))) +logging('=' * 100) diff --git a/NLP/Transformer-XL/utils/adaptive_softmax.py b/NLP/Transformer-XL/utils/adaptive_softmax.py new file mode 100644 index 0000000..68ae016 --- /dev/null +++ b/NLP/Transformer-XL/utils/adaptive_softmax.py @@ -0,0 +1,90 @@ +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class AdaptiveLogSoftmax(nn.Module): + def __init__(self, in_features, n_classes, cutoffs, keep_order=False): + super(AdaptiveLogSoftmax, self).__init__() + + cutoffs = list(cutoffs) + + if (cutoffs != sorted(cutoffs)) \ + or (min(cutoffs) <= 0) \ + or (max(cutoffs) >= (n_classes - 1)) \ + or (len(set(cutoffs)) != len(cutoffs)) \ + or any([int(c) != c for c in cutoffs]): + + raise ValueError("cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1") + + self.in_features = in_features + self.n_classes = n_classes + self.cutoffs = cutoffs + [n_classes] + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.keep_order = keep_order + + + def forward(self, hidden, target, weight, bias, keep_order=False): + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + head_weight = torch.cat( + [weight[:self.shortlist_size], self.cluster_weight], dim=0) + head_bias = torch.cat( + [bias[:self.shortlist_size], self.cluster_bias], dim=0) + + head_logit = F.linear(hidden, head_weight, bias=head_bias) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, h_idx = 
cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < h_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i = weight[l_idx:h_idx] + bias_i = bias[l_idx:h_idx] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/NLP/Transformer-XL/utils/data_parallel.py b/NLP/Transformer-XL/utils/data_parallel.py new file mode 100644 index 0000000..d7e1811 --- /dev/null +++ b/NLP/Transformer-XL/utils/data_parallel.py @@ -0,0 +1,91 @@ + +from torch.nn.parallel import DataParallel +import torch +from torch.nn.parallel._functions import Scatter +from torch.nn.parallel.parallel_apply import parallel_apply + +def scatter(inputs, target_gpus, chunk_sizes, dim=0): + r""" + Slices tensors into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not tensors. + """ + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + try: + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + except: + print('obj', obj.size()) + print('dim', dim) + print('chunk_sizes', chunk_sizes) + quit() + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict) and len(obj) > 0: + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). 
To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + +def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs + +class BalancedDataParallel(DataParallel): + def __init__(self, gpu0_bsz, *args, **kwargs): + self.gpu0_bsz = gpu0_bsz + super().__init__(*args, **kwargs) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + if self.gpu0_bsz == 0: + device_ids = self.device_ids[1:] + else: + device_ids = self.device_ids + inputs, kwargs = self.scatter(inputs, kwargs, device_ids) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids) + if self.gpu0_bsz == 0: + replicas = replicas[1:] + outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) + return self.gather(outputs, self.output_device) + + def parallel_apply(self, replicas, device_ids, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, device_ids) + + def scatter(self, inputs, kwargs, device_ids): + bsz = inputs[0].size(self.dim) + num_dev = len(self.device_ids) + gpu0_bsz = self.gpu0_bsz + bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) + if gpu0_bsz < bsz_unit: + chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) + delta = bsz - sum(chunk_sizes) + for i in range(delta): + chunk_sizes[i + 1] += 1 + if gpu0_bsz == 0: + chunk_sizes = chunk_sizes[1:] + else: + return super().scatter(inputs, kwargs, device_ids) + return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) + diff --git a/NLP/Transformer-XL/utils/exp_utils.py b/NLP/Transformer-XL/utils/exp_utils.py new file mode 100644 index 0000000..e44f7c2 --- /dev/null +++ b/NLP/Transformer-XL/utils/exp_utils.py @@ -0,0 +1,40 @@ +import functools +import os, shutil + +import numpy as np + +import torch + + +def logging(s, log_path, print_=True, log_=True): + if print_: + print(s) + if log_: + with open(log_path, 'a+') as f_log: + f_log.write(s + '\n') + +def get_logger(log_path, **kwargs): + return functools.partial(logging, log_path=log_path, **kwargs) + +def create_exp_dir(dir_path, scripts_to_save=None, debug=False): + if debug: + print('Debug Mode : no experiment dir created') + return functools.partial(logging, log_path=None, log_=False) + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + print('Experiment dir : {}'.format(dir_path)) + if scripts_to_save is not None: + script_path = os.path.join(dir_path, 'scripts') + if not os.path.exists(script_path): + os.makedirs(script_path) + for script in scripts_to_save: + dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script)) + shutil.copyfile(script, dst_file) + + return get_logger(log_path=os.path.join(dir_path, 'log.txt')) + +def save_checkpoint(model, optimizer, path, epoch): + torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) + torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) diff --git 
a/NLP/Transformer-XL/utils/log_uniform_sampler.py b/NLP/Transformer-XL/utils/log_uniform_sampler.py new file mode 100644 index 0000000..503f635 --- /dev/null +++ b/NLP/Transformer-XL/utils/log_uniform_sampler.py @@ -0,0 +1,147 @@ +import torch +from torch import nn +import numpy as np + +class LogUniformSampler(object): + def __init__(self, range_max, n_sample): + """ + Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py + `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` + + expected count can be approximated by 1 - (1 - p)^n + and we use a numerically stable version -expm1(num_tries * log1p(-p)) + + Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run + """ + with torch.no_grad(): + self.range_max = range_max + log_indices = torch.arange(1., range_max+2., 1.).log_() + self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + # print('P', self.dist.numpy().tolist()[-30:]) + + self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + + self.n_sample = n_sample + + def sample(self, labels): + """ + labels: [b1, b2] + Return + true_log_probs: [b1, b2] + samp_log_probs: [n_sample] + neg_samples: [n_sample] + """ + + # neg_samples = torch.empty(0).long() + n_sample = self.n_sample + n_tries = 2 * n_sample + + with torch.no_grad(): + neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() + device = labels.device + neg_samples = neg_samples.to(device) + true_log_probs = self.log_q[labels].to(device) + samp_log_probs = self.log_q[neg_samples].to(device) + return true_log_probs, samp_log_probs, neg_samples + +def sample_logits(embedding, bias, labels, inputs, sampler): + """ + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, -1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits + + +# class LogUniformSampler(object): +# def __init__(self, range_max, unique=False): +# """ +# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py +# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` +# """ +# self.range_max = range_max +# log_indices = torch.arange(1., range_max+2., 1.).log_() +# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + +# self.unique = unique + +# if self.unique: +# self.exclude_mask = torch.ByteTensor(range_max).fill_(0) + +# def sample(self, n_sample, labels): +# pos_sample, new_labels = labels.unique(return_inverse=True) +# n_pos_sample = pos_sample.size(0) +# n_neg_sample = n_sample - n_pos_sample + +# if self.unique: +# 
self.exclude_mask.index_fill_(0, pos_sample, 1) +# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) +# self.exclude_mask.index_fill_(0, pos_sample, 0) +# else: +# sample_dist = self.dist + +# neg_sample = torch.multinomial(sample_dist, n_neg_sample) + +# sample = torch.cat([pos_sample, neg_sample]) +# sample_prob = self.dist[sample] + +# return new_labels, sample, sample_prob + + +if __name__ == '__main__': + S, B = 3, 4 + n_vocab = 10000 + n_sample = 5 + H = 32 + + labels = torch.LongTensor(S, B).random_(0, n_vocab) + + # sampler = LogUniformSampler(n_vocab, unique=False) + # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) + + sampler = LogUniformSampler(n_vocab, unique=True) + # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) + + # print('true_probs', true_probs.numpy().tolist()) + # print('samp_probs', samp_probs.numpy().tolist()) + # print('neg_samples', neg_samples.numpy().tolist()) + + # print('sum', torch.sum(sampler.dist).item()) + + # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item() + + embedding = nn.Embedding(n_vocab, H) + bias = torch.zeros(n_vocab) + inputs = torch.Tensor(S, B, H).normal_() + + logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) + print('logits', logits.detach().numpy().tolist()) + print('logits shape', logits.size()) + print('out_labels', out_labels.detach().numpy().tolist()) + print('out_labels shape', out_labels.size()) + diff --git a/NLP/Transformer-XL/utils/proj_adaptive_softmax.py b/NLP/Transformer-XL/utils/proj_adaptive_softmax.py new file mode 100644 index 0000000..a0fbfeb --- /dev/null +++ b/NLP/Transformer-XL/utils/proj_adaptive_softmax.py @@ -0,0 +1,151 @@ +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = 
F.linear(proj_hid, weight, bias=bias) + # else: + # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + # if bias is not None: + # logit = logit + bias + + return logit + + def forward(self, hidden, target, keep_order=False): + ''' + hidden :: [len*bsz x d_proj] + target :: [len*bsz] + ''' + + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + nll = -F.log_softmax(logit, dim=-1) \ + .gather(1, target.unsqueeze(1)).squeeze(1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/NLP/Transformer-XL/utils/vocabulary.py b/NLP/Transformer-XL/utils/vocabulary.py new file mode 100644 index 0000000..b6b8249 --- /dev/null +++ b/NLP/Transformer-XL/utils/vocabulary.py @@ -0,0 +1,163 @@ +import os +from collections import Counter, OrderedDict + +import torch + +class Vocab(object): + def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True, + delimiter=None, vocab_file=None): + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + + def tokenize(self, line, add_eos=False, add_double_eos=False): + line = line.strip() + # convert to lower case + if self.lower_case: + line = line.lower() + + # empty delimiter '' will evaluate False + if self.delimiter == '': + symbols = line + else: + symbols = line.split(self.delimiter) + + if add_double_eos: # lm1b + return [''] + 
symbols + [''] + elif add_eos: + return symbols + [''] + else: + return symbols + + def count_file(self, path, verbose=False, add_eos=False): + if verbose: print('counting file {} ...'.format(path)) + assert os.path.exists(path) + + sents = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos) + self.counter.update(symbols) + sents.append(symbols) + + return sents + + def count_sents(self, sents, verbose=False): + """ + sents : a list of sentences, each a list of tokenized symbols + """ + if verbose: print('counting {} sents ...'.format(len(sents))) + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + self.counter.update(symbols) + + def _build_from_file(self, vocab_file): + self.idx2sym = [] + self.sym2idx = OrderedDict() + + with open(vocab_file, 'r', encoding='utf-8') as f: + for line in f: + symb = line.strip().split()[0] + self.add_symbol(symb) + self.unk_idx = self.sym2idx[''] + + def build_vocab(self): + if self.vocab_file: + print('building vocab from {}'.format(self.vocab_file)) + self._build_from_file(self.vocab_file) + print('final vocab size {}'.format(len(self))) + else: + print('building vocab with min_freq={}, max_size={}'.format( + self.min_freq, self.max_size)) + self.idx2sym = [] + self.sym2idx = OrderedDict() + + for sym in self.special: + self.add_special(sym) + + for sym, cnt in self.counter.most_common(self.max_size): + if cnt < self.min_freq: break + self.add_symbol(sym) + + print('final vocab size {} from {} unique tokens'.format( + len(self), len(self.counter))) + + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, + add_double_eos=False): + if verbose: print('encoding file {} ...'.format(path)) + assert os.path.exists(path) + encoded = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos, + add_double_eos=add_double_eos) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def encode_sents(self, sents, ordered=False, verbose=False): + if verbose: print('encoding {} sents ...'.format(len(sents))) + encoded = [] + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def add_special(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym]) + + def add_symbol(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + + def get_sym(self, idx): + assert 0 <= idx < len(self), 'Index {} out of range'.format(idx) + return self.idx2sym[idx] + + def get_idx(self, sym): + if sym in self.sym2idx: + return self.sym2idx[sym] + else: + # print('encounter unk {}'.format(sym)) + assert '' not in sym + assert hasattr(self, 'unk_idx') + return self.sym2idx.get(sym, self.unk_idx) + + def get_symbols(self, indices): + return [self.get_sym(idx) for idx in indices] + + def get_indices(self, symbols): + return [self.get_idx(sym) for sym in symbols] + + def 
convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.get_indices(symbols))
+
+    def convert_to_sent(self, indices, exclude=None):
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1b6a463
--- /dev/null
+++ b/README.md
@@ -0,0 +1,135 @@
+# Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
+
+This is an official PyTorch implementation of **Adan**. See the paper [here](https://arxiv.org/abs/2208.06677). If you find Adan helpful or inspiring for your projects, please cite this paper and also star this repository. Thanks!
+
+```tex
+@article{xie2022adan,
+  title={Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models},
+  author={Xie, Xingyu and Zhou, Pan and Li, Huan and Lin, Zhouchen and Yan, Shuicheng},
+  journal={arXiv preprint arXiv:2208.06677},
+  year={2022}
+}
+```
+
+## Usage
+
+For your convenience, we briefly provide some intuitive usage instructions below, then give some general experimental tips, and finally point to more details (e.g. the specific commands and hyper-parameters) for each experiment in the paper.
+
+#### 1) Two steps to use Adan
+
+**Step 1.** Add the Adan-dependent hyper-parameters to the config:
+
+```python
+parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
+parser.add_argument('--weight-decay', type=float, default=0.02, help='weight decay, similar to the one used in AdamW (default: 0.02)')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use opt default 1e-8 in adan)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use opt default [0.98, 0.92, 0.99] in Adan)')
+parser.add_argument('--no-prox', action='store_true', default=False, help='whether to perform weight decay like AdamW (default: False)')
+```
+`no-prox` determines the update rule for parameters with weight decay. By default, Adan updates the parameters in the way presented in Algorithm 1 of the paper:
+
+$$\boldsymbol{\theta}_{k+1} = ( 1+\lambda \eta)^{-1}\left[\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k)\right],$$
+
+but one can also update the parameters like AdamW:
+
+$$\boldsymbol{\theta}_{k+1} = ( 1-\lambda \eta)\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k).$$
+
+In all experiments in our paper, we set `no-prox=False`.
+
+**Step 2.** Create the Adan optimizer as follows. In this step, we can directly replace the vanilla optimizer with the following command:
+
+```python
+from adan import Adan
+optimizer = Adan(param, lr=args.lr, weight_decay=args.weight_decay, betas=args.opt_betas, eps=args.opt_eps, max_grad_norm=args.max_grad_norm, no_prox=args.no_prox)
+```
+
+#### 2) Tips for Experiments
+
+- To keep Adan simple, we do not use the restart strategy in any experiment except Table 12 in the paper. Table 12 shows that the restart strategy can further slightly improve the performance of Adan.
+- Adan often allows one to use a large peak learning rate at which other optimizers, e.g. Adam and AdamW, often fail. For example, in all experiments except the MAE pre-training and LSTM experiments, the learning rate used by Adan is **5-10 times larger** than that used with Adam/AdamW.
+- Adan seems to prefer a large batch size in large-scale experiments, e.g. a total batch size of 2,048 in our paper.
+- Adan is relatively robust to `beta1`, `beta2`, and `beta3`, especially to `beta2`. If you want better performance, you can first tune `beta3` and then `beta1`.
+- Interestingly, we found that `weight_decay = 0.02` is suitable for all experiments in our paper.
+
+#### 3) More detailed steps to reproduce the experimental results in the paper
+
+Please refer to the following links for detailed steps. In these detailed steps, we even include the **docker images** for reproducibility.
+
+- [Instruction](./CV/timm/) for **ViTs**, **ResNets**, and **ConvNext**.
+- [Instruction](./CV/MAE/) for **MAE**.
+- [Instruction](./NLP/BERT/) for **BERT**.
+- [Instruction](./NLP/Transformer-XL/) for **Transformer-XL**.
+
+## Model Zoo
+
+### Results on vision tasks
+
+For your convenience, we provide the configs and log files for the experiments on ImageNet-1k.
+
+| Model | Epoch | Training Setting | Acc. (%) | Config | Download |
+| ------------- | :-----: | :-----: | :------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| ViT-S | 150 | I | 80.1 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_150-I.csv)/model |
+| ViT-S | 150 | II | 79.6 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_150.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_150.csv)/model |
+| ViT-S | 300 | I | 81.1 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_300-I.csv)/model |
+| ViT-S | 300 | II | 80.7 | [config](./CV/timm/exp_results/ViT/small/args_vit-s_300.yaml) | [log](./CV/timm/exp_results/ViT/small/summary_vit-s_300.csv)/model |
+| ViT-B | 150 | II | 81.7 | [config](./CV/timm/exp_results/ViT/base/args_vit-B_150.yaml) | [log](./CV/timm/exp_results/ViT/base/summary_vit-B_150.csv)/model |
+| ViT-B | 300 | II | 82.3 | [config](./CV/timm/exp_results/ViT/base/args_vit-B_300.yaml) | [log](./CV/timm/exp_results/ViT/base/summary_vit-B_300.csv)/model |
+| ResNet-50 | 100 | I | 78.1 | [config](./CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml) | [log](./CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv)/model |
+| ResNet-50 | 200 | I | 79.7 | [config](./CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml) | [log](./CV/timm/exp_results/ResNet/Res50/summary_res50_200.csv)/model |
+| ResNet-50 | 300 | I | 80.2 | [config](./CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml) | [log](./CV/timm/exp_results/ResNet/Res50/summary_res50_300.csv)/model |
+| ConvNext-tiny | 150 | II | 81.7 | [config](./CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml) | [log](./CV/timm/exp_results/ConvNext/small/summary_cvnext_150.csv)/model |
+| ConvNext-tiny | 300 | II | 82.4 | [config](./CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml) | [log](./CV/timm/exp_results/ConvNext/small/summary_cvnext_300.csv)/model |
+| MAE-small | 800+100 | --- | 83.8 | [config](./CV/MAE/README.md) | [log-pretrain](./CV/MAE/exp_results/MAE/base/log_base_pretrain.txt)/[log-finetune](./CV/MAE/exp_results/MAE/base/log_base_ft.txt)/model |
+| MAE-Large | 800+50 | --- | 85.9 | [config](./CV/MAE/README.md) | [log-pretrain](./CV/MAE/exp_results/MAE/large/log_large_pretrain.txt)/[log-finetune](./CV/MAE/exp_results/MAE/large/log_large_ft.txt)/model |
+
+### Results on NLP tasks
+
+#### BERT-base
+
+We give the configs and log files for the BERT-base model pre-trained on the Bookcorpus and Wikipedia datasets and fine-tuned on GLUE tasks. Note that we provide the config, the log file, and a detailed [instruction](./NLP/BERT/README.md) for BERT-base in the folder `./NLP/BERT`.
+
+| Pretraining | Config | Log | Model |
+| --------- | :--------: | :--------- | :--------: |
+| Adan | [config](./NLP/BERT/config/pretraining/bert-adan.yaml) | [log](./NLP/BERT/exp_results/pretrain/hydra_train-adan.log) | model |
+
+| Fine-tuning on GLUE-Task | Metric | Result | Config |
+| -------------- | :--------------------------- | :-------: | :-----------------------------------------------------: |
+| CoLA | Matthew's corr. | 64.6 | [config](./NLP/BERT/config/finetuning/cola-adan.yaml) |
+| SST-2 | Accuracy | 93.2 | [config](./NLP/BERT/config/finetuning/sst_2-adan.yaml) |
+| STS-B | Pearson corr. | 89.3 | [config](./NLP/BERT/config/finetuning/sts_b-adan.yaml) |
+| QQP | Accuracy | 91.2 | [config](./NLP/BERT/config/finetuning/qqp-adan.yaml) |
+| MNLI | Matched acc./Mismatched acc. | 85.7/85.6 | [config](./NLP/BERT/config/finetuning/mnli-adan.yaml) |
+| QNLI | Accuracy | 91.3 | [config](./NLP/BERT/config/finetuning/qnli-adan.yaml) |
+| RTE | Accuracy | 73.3 | [config](./NLP/BERT/config/finetuning/rte-adan.yaml) |
+
+#### Transformer-XL-base
+
+We provide the config and log for Transformer-XL-base trained on the WikiText-103 dataset.
+
+| | Steps | Test PPL | Download |
+| ------------------- | :---: | :------: | :---------------------------------------------------------: |
+| Baseline (Adam) | 200k | 24.2 | [log&config](./NLP/Transformer-XL/exp_results/log-adam.txt) |
+| Transformer-XL-base | 50k | 26.2 | [log&config](./NLP/Transformer-XL/exp_results/log-50k.txt) |
+| Transformer-XL-base | 100k | 24.2 | [log&config](./NLP/Transformer-XL/exp_results/log-100k.txt) |
+| Transformer-XL-base | 200k | 23.5 | [log&config](./NLP/Transformer-XL/exp_results/log-200k.txt) |
diff --git a/adan.py b/adan.py
new file mode 100644
index 0000000..e2a224a
--- /dev/null
+++ b/adan.py
@@ -0,0 +1,154 @@
+# Copyright 2022 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from timm.utils import *
+
+
+class Adan(Optimizer):
+    """
+    Implements a PyTorch variant of Adan.
+
+    Adan was proposed in
+    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
+    https://arxiv.org/abs/2208.06677
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float, float], optional): coefficients used for computing the
+            running averages of the gradient, its difference, and its square. (default: (0.98, 0.92, 0.99))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0)
+        max_grad_norm (float, optional): value used to clip the
+            global grad norm (default: 0.0, no clipping)
+        no_prox (bool): if True, apply the decoupled weight decay multiplicatively as in AdamW;
+            if False, apply it via the proximal (division) update. (default: False)
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
+                 weight_decay=0.0, max_grad_norm=0.0, no_prox=False):
+        if not 0.0 <= max_grad_norm:
+            raise ValueError("Invalid max grad norm: {}".format(max_grad_norm))
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= betas[2] < 1.0:
+            raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2]))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay,
+                        max_grad_norm=max_grad_norm, no_prox=no_prox)
+        super(Adan, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(Adan, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('no_prox', False)
+
+    @torch.no_grad()
+    def restart_opt(self):
+        """Reset the per-parameter optimizer state and the step counter."""
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                if p.requires_grad:
+                    state = self.state[p]
+                    # State initialization
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p)
+                    # Exponential moving average of gradient difference
+                    state['exp_avg_diff'] = torch.zeros_like(p)
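+    # For reference, step() below performs the following update per parameter
+    # (this summary mirrors the code; g_t is the current, possibly clipped, gradient,
+    # m/v/n are exp_avg / exp_avg_diff / exp_avg_sq, and bc_i are the bias corrections):
+    #   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
+    #   v_t = beta2 * v_{t-1} + (1 - beta2) * (g_t - g_{t-1})
+    #   n_t = beta3 * n_{t-1} + (1 - beta3) * (g_t + beta2 * (g_t - g_{t-1}))**2
+    #   p_t = p_{t-1} - lr * (m_t / bc1 + beta2 * v_t / bc2) / (sqrt(n_t / bc3) + eps)
+    # followed by the decoupled weight decay (multiplicative if no_prox=True, proximal otherwise).
+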
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad