diff --git a/README-mmpretrain.md b/README-mmpretrain.md
new file mode 100644
index 00000000000..5318df5b958
--- /dev/null
+++ b/README-mmpretrain.md
@@ -0,0 +1,339 @@
+<div align="center">
+
+<img src="resources/mmpt-logo.png" width="600"/>
+  <div>&nbsp;</div>
+  <div align="center">
+    <b><font size="5">OpenMMLab website</font></b>
+    <sup>
+      <a href="https://openmmlab.com">
+        <i><font size="4">HOT</font></i>
+      </a>
+    </sup>
+    &nbsp;&nbsp;&nbsp;&nbsp;
+    <b><font size="5">OpenMMLab platform</font></b>
+    <sup>
+      <a href="https://platform.openmmlab.com">
+        <i><font size="4">TRY IT OUT</font></i>
+      </a>
+    </sup>
+  </div>
+  <div>&nbsp;</div>
+
+[![PyPI](https://img.shields.io/pypi/v/mmpretrain)](https://pypi.org/project/mmpretrain)
+[![Docs](https://img.shields.io/badge/docs-latest-blue)](https://mmpretrain.readthedocs.io/en/latest/)
+[![Build Status](https://github.com/open-mmlab/mmpretrain/workflows/build/badge.svg)](https://github.com/open-mmlab/mmpretrain/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmpretrain/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmpretrain)
+[![license](https://img.shields.io/github/license/open-mmlab/mmpretrain.svg)](https://github.com/open-mmlab/mmpretrain/blob/main/LICENSE)
+[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmpretrain.svg)](https://github.com/open-mmlab/mmpretrain/issues)
+[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmpretrain.svg)](https://github.com/open-mmlab/mmpretrain/issues)
+
+[📘 Documentation](https://mmpretrain.readthedocs.io/en/latest/) |
+[🛠️ Installation](https://mmpretrain.readthedocs.io/en/latest/get_started.html#installation) |
+[👀 Model Zoo](https://mmpretrain.readthedocs.io/en/latest/modelzoo_statistics.html) |
+[🆕 Update News](https://mmpretrain.readthedocs.io/en/latest/notes/changelog.html) |
+[🤔 Reporting Issues](https://github.com/open-mmlab/mmpretrain/issues/new/choose)
+
+<img src="https://user-images.githubusercontent.com/36138628/230307505-4727ad0a-7d71-4069-939d-b499c7e272b7.png" width="400"/>
+
+English | [简体中文](/README_zh-CN.md)
+
+</div>
+
+</div>
+
+<div align="center">
+  <a href="https://openmmlab.medium.com/" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219255827-67c1a27f-f8c5-46a9-811d-5e57448c61d1.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://discord.gg/raweFPmdzG" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://twitter.com/OpenMMLab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218346637-d30c8a0f-3eba-4699-8131-512fb06d46db.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://www.youtube.com/openmmlab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218346691-ceb2116a-465a-40af-8424-9f30d2348ca9.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://space.bilibili.com/1293512903" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219026751-d7d14cce-a7c9-4e82-9942-8375fca65b99.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://www.zhihu.com/people/openmmlab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219026120-ba71e48b-6e94-4bd4-b4e9-b7d175b5e362.png" width="3%" alt="" /></a>
+</div>
+
+## Introduction
+
+MMPreTrain is an open source pre-training toolbox based on PyTorch. It is a part of the [OpenMMLab](https://openmmlab.com/) project.
+
+The `main` branch works with **PyTorch 1.8+**.
+
+### Major features
+
+- Various backbones and pretrained models
+- Rich training strategies (supervised learning, self-supervised learning, multi-modality learning etc.)
+- Bag of training tricks
+- Large-scale training configs
+- High efficiency and extensibility
+- Powerful toolkits for model analysis and experiments
+- Various out-of-box inference tasks.
+  - Image Classification
+  - Image Caption
+  - Visual Question Answering
+  - Visual Grounding
+  - Retrieval (Image-To-Image, Text-To-Image, Image-To-Text)
+
+https://github.com/open-mmlab/mmpretrain/assets/26739999/e4dcd3a2-f895-4d1b-a351-fbc74a04e904
+
+## What's new
+
+🌟 v1.2.0 was released in 04/01/2023
+
+- Support LLaVA 1.5.
+- Implement of RAM with a gradio interface.
+
+🌟 v1.1.0 was released in 12/10/2023
+
+- Support Mini-GPT4 training and provide a Chinese model (based on Baichuan-7B)
+- Support zero-shot classification based on CLIP.
+
+🌟 v1.0.0 was released in 04/07/2023
+
+- Support inference of more **multi-modal** algorithms, such as [**LLaVA**](./configs/llava/), [**MiniGPT-4**](./configs/minigpt4), [**Otter**](./configs/otter/), etc.
+- Support around **10 multi-modal** datasets!
+- Add [**iTPN**](./configs/itpn/), [**SparK**](./configs/spark/) self-supervised learning algorithms.
+- Provide examples of [New Config](./mmpretrain/configs/) and [DeepSpeed/FSDP with FlexibleRunner](./configs/mae/benchmarks/). Here are the documentation links of [New Config](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) and [DeepSpeed/FSDP with FlexibleRunner](https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.runner.FlexibleRunner.html#mmengine.runner.FlexibleRunner).
+
+🌟 Upgrade from MMClassification to MMPreTrain
+
+- Integrated Self-supervised learning algorithms from **MMSelfSup**, such as **MAE**, **BEiT**, etc.
+- Support **RIFormer**, a simple but effective vision backbone by removing token mixer.
+- Refactor dataset pipeline visualization.
+- Support **LeViT**, **XCiT**, **ViG**, **ConvNeXt-V2**, **EVA**, **RevViT**, **EfficientnetV2**, **CLIP**, **TinyViT** and **MixMIM** backbones.
+
+This release introduced a brand new and flexible training & test engine, but it's still in progress. Welcome
+to try according to [the documentation](https://mmpretrain.readthedocs.io/en/latest/).
+
+And there are some BC-breaking changes. Please check [the migration tutorial](https://mmpretrain.readthedocs.io/en/latest/migration.html).
+
+Please refer to [changelog](https://mmpretrain.readthedocs.io/en/latest/notes/changelog.html) for more details and other release history.
+
+## Installation
+
+Below are quick steps for installation:
+
+```shell
+conda create -n open-mmlab python=3.8 pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -y
+conda activate open-mmlab
+pip install openmim
+git clone https://github.com/open-mmlab/mmpretrain.git
+cd mmpretrain
+mim install -e .
+```
+
+Please refer to [installation documentation](https://mmpretrain.readthedocs.io/en/latest/get_started.html) for more detailed installation and dataset preparation.
+
+For multi-modality models support, please install the extra dependencies by:
+
+```shell
+mim install -e ".[multimodal]"
+```
+
+## User Guides
+
+We provided a series of tutorials about the basic usage of MMPreTrain for new users:
+
+- [Learn about Configs](https://mmpretrain.readthedocs.io/en/latest/user_guides/config.html)
+- [Prepare Dataset](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html)
+- [Inference with existing models](https://mmpretrain.readthedocs.io/en/latest/user_guides/inference.html)
+- [Train](https://mmpretrain.readthedocs.io/en/latest/user_guides/train.html)
+- [Test](https://mmpretrain.readthedocs.io/en/latest/user_guides/test.html)
+- [Downstream tasks](https://mmpretrain.readthedocs.io/en/latest/user_guides/downstream.html)
+
+For more information, please refer to [our documentation](https://mmpretrain.readthedocs.io/en/latest/).
+
+## Model zoo
+
+Results and models are available in the [model zoo](https://mmpretrain.readthedocs.io/en/latest/modelzoo_statistics.html).
+
+<div align="center">
+  <b>Overview</b>
+</div>
+<table align="center">
+  <tbody>
+    <tr align="center" valign="bottom">
+      <td>
+        <b>Supported Backbones</b>
+      </td>
+      <td>
+        <b>Self-supervised Learning</b>
+      </td>
+      <td>
+        <b>Multi-Modality Algorithms</b>
+      </td>
+      <td>
+        <b>Others</b>
+      </td>
+    </tr>
+    <tr valign="top">
+      <td>
+        <ul>
+        <li><a href="configs/vgg">VGG</a></li>
+        <li><a href="configs/resnet">ResNet</a></li>
+        <li><a href="configs/resnext">ResNeXt</a></li>
+        <li><a href="configs/seresnet">SE-ResNet</a></li>
+        <li><a href="configs/seresnet">SE-ResNeXt</a></li>
+        <li><a href="configs/regnet">RegNet</a></li>
+        <li><a href="configs/shufflenet_v1">ShuffleNet V1</a></li>
+        <li><a href="configs/shufflenet_v2">ShuffleNet V2</a></li>
+        <li><a href="configs/mobilenet_v2">MobileNet V2</a></li>
+        <li><a href="configs/mobilenet_v3">MobileNet V3</a></li>
+        <li><a href="configs/swin_transformer">Swin-Transformer</a></li>
+        <li><a href="configs/swin_transformer_v2">Swin-Transformer V2</a></li>
+        <li><a href="configs/repvgg">RepVGG</a></li>
+        <li><a href="configs/vision_transformer">Vision-Transformer</a></li>
+        <li><a href="configs/tnt">Transformer-in-Transformer</a></li>
+        <li><a href="configs/res2net">Res2Net</a></li>
+        <li><a href="configs/mlp_mixer">MLP-Mixer</a></li>
+        <li><a href="configs/deit">DeiT</a></li>
+        <li><a href="configs/deit3">DeiT-3</a></li>
+        <li><a href="configs/conformer">Conformer</a></li>
+        <li><a href="configs/t2t_vit">T2T-ViT</a></li>
+        <li><a href="configs/twins">Twins</a></li>
+        <li><a href="configs/efficientnet">EfficientNet</a></li>
+        <li><a href="configs/edgenext">EdgeNeXt</a></li>
+        <li><a href="configs/convnext">ConvNeXt</a></li>
+        <li><a href="configs/hrnet">HRNet</a></li>
+        <li><a href="configs/van">VAN</a></li>
+        <li><a href="configs/convmixer">ConvMixer</a></li>
+        <li><a href="configs/cspnet">CSPNet</a></li>
+        <li><a href="configs/poolformer">PoolFormer</a></li>
+        <li><a href="configs/inception_v3">Inception V3</a></li>
+        <li><a href="configs/mobileone">MobileOne</a></li>
+        <li><a href="configs/efficientformer">EfficientFormer</a></li>
+        <li><a href="configs/mvit">MViT</a></li>
+        <li><a href="configs/hornet">HorNet</a></li>
+        <li><a href="configs/mobilevit">MobileViT</a></li>
+        <li><a href="configs/davit">DaViT</a></li>
+        <li><a href="configs/replknet">RepLKNet</a></li>
+        <li><a href="configs/beit">BEiT</a></li>
+        <li><a href="configs/mixmim">MixMIM</a></li>
+        <li><a href="configs/efficientnet_v2">EfficientNet V2</a></li>
+        <li><a href="configs/revvit">RevViT</a></li>
+        <li><a href="configs/convnext_v2">ConvNeXt V2</a></li>
+        <li><a href="configs/vig">ViG</a></li>
+        <li><a href="configs/xcit">XCiT</a></li>
+        <li><a href="configs/levit">LeViT</a></li>
+        <li><a href="configs/riformer">RIFormer</a></li>
+        <li><a href="configs/glip">GLIP</a></li>
+        <li><a href="configs/sam">ViT SAM</a></li>
+        <li><a href="configs/eva02">EVA02</a></li>
+        <li><a href="configs/dinov2">DINO V2</a></li>
+        <li><a href="configs/hivit">HiViT</a></li>
+        </ul>
+      </td>
+      <td>
+        <ul>
+        <li><a href="configs/mocov2">MoCo V1 (CVPR'2020)</a></li>
+        <li><a href="configs/simclr">SimCLR (ICML'2020)</a></li>
+        <li><a href="configs/mocov2">MoCo V2 (arXiv'2020)</a></li>
+        <li><a href="configs/byol">BYOL (NeurIPS'2020)</a></li>
+        <li><a href="configs/swav">SwAV (NeurIPS'2020)</a></li>
+        <li><a href="configs/densecl">DenseCL (CVPR'2021)</a></li>
+        <li><a href="configs/simsiam">SimSiam (CVPR'2021)</a></li>
+        <li><a href="configs/barlowtwins">Barlow Twins (ICML'2021)</a></li>
+        <li><a href="configs/mocov3">MoCo V3 (ICCV'2021)</a></li>
+        <li><a href="configs/beit">BEiT (ICLR'2022)</a></li>
+        <li><a href="configs/mae">MAE (CVPR'2022)</a></li>
+        <li><a href="configs/simmim">SimMIM (CVPR'2022)</a></li>
+        <li><a href="configs/maskfeat">MaskFeat (CVPR'2022)</a></li>
+        <li><a href="configs/cae">CAE (arXiv'2022)</a></li>
+        <li><a href="configs/milan">MILAN (arXiv'2022)</a></li>
+        <li><a href="configs/beitv2">BEiT V2 (arXiv'2022)</a></li>
+        <li><a href="configs/eva">EVA (CVPR'2023)</a></li>
+        <li><a href="configs/mixmim">MixMIM (arXiv'2022)</a></li>
+        <li><a href="configs/itpn">iTPN (CVPR'2023)</a></li>
+        <li><a href="configs/spark">SparK (ICLR'2023)</a></li>
+        <li><a href="configs/mff">MFF (ICCV'2023)</a></li>
+        </ul>
+      </td>
+      <td>
+        <ul>
+        <li><a href="configs/blip">BLIP (arxiv'2022)</a></li>
+        <li><a href="configs/blip2">BLIP-2 (arxiv'2023)</a></li>
+        <li><a href="configs/ofa">OFA (CoRR'2022)</a></li>
+        <li><a href="configs/flamingo">Flamingo (NeurIPS'2022)</a></li>
+        <li><a href="configs/chinese_clip">Chinese CLIP (arxiv'2022)</a></li>
+        <li><a href="configs/minigpt4">MiniGPT-4 (arxiv'2023)</a></li>
+        <li><a href="configs/llava">LLaVA (arxiv'2023)</a></li>
+        <li><a href="configs/otter">Otter (arxiv'2023)</a></li>
+        </ul>
+      </td>
+      <td>
+      Image Retrieval Task:
+        <ul>
+        <li><a href="configs/arcface">ArcFace (CVPR'2019)</a></li>
+        </ul>
+      Training&Test Tips:
+        <ul>
+        <li><a href="https://arxiv.org/abs/1909.13719">RandAug</a></li>
+        <li><a href="https://arxiv.org/abs/1805.09501">AutoAug</a></li>
+        <li><a href="mmpretrain/datasets/samplers/repeat_aug.py">RepeatAugSampler</a></li>
+        <li><a href="mmpretrain/models/tta/score_tta.py">TTA</a></li>
+        <li>...</li>
+        </ul>
+      </td>
+  </tbody>
+</table>
+
+## Contributing
+
+We appreciate all contributions to improve MMPreTrain.
+Please refer to [CONTRUBUTING](https://mmpretrain.readthedocs.io/en/latest/notes/contribution_guide.html) for the contributing guideline.
+
+## Acknowledgement
+
+MMPreTrain is an open source project that is contributed by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedbacks.
+We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and supporting their own academic research.
+
+## Citation
+
+If you find this project useful in your research, please consider cite:
+
+```BibTeX
+@misc{2023mmpretrain,
+    title={OpenMMLab's Pre-training Toolbox and Benchmark},
+    author={MMPreTrain Contributors},
+    howpublished = {\url{https://github.com/open-mmlab/mmpretrain}},
+    year={2023}
+}
+```
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Projects in OpenMMLab
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries.
+- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
+- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox.
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
+- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab.
diff --git a/README.md b/README.md
index 5318df5b958..b2cd1f6a75b 100644
--- a/README.md
+++ b/README.md
@@ -1,339 +1,7 @@
-<div align="center">
+# Bear MMPretrain
 
-<img src="resources/mmpt-logo.png" width="600"/>
-  <div>&nbsp;</div>
-  <div align="center">
-    <b><font size="5">OpenMMLab website</font></b>
-    <sup>
-      <a href="https://openmmlab.com">
-        <i><font size="4">HOT</font></i>
-      </a>
-    </sup>
-    &nbsp;&nbsp;&nbsp;&nbsp;
-    <b><font size="5">OpenMMLab platform</font></b>
-    <sup>
-      <a href="https://platform.openmmlab.com">
-        <i><font size="4">TRY IT OUT</font></i>
-      </a>
-    </sup>
-  </div>
-  <div>&nbsp;</div>
+## Dataset
+### 1. ImageNet-1K
+### 2. ImageNet-100
 
-[![PyPI](https://img.shields.io/pypi/v/mmpretrain)](https://pypi.org/project/mmpretrain)
-[![Docs](https://img.shields.io/badge/docs-latest-blue)](https://mmpretrain.readthedocs.io/en/latest/)
-[![Build Status](https://github.com/open-mmlab/mmpretrain/workflows/build/badge.svg)](https://github.com/open-mmlab/mmpretrain/actions)
-[![codecov](https://codecov.io/gh/open-mmlab/mmpretrain/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmpretrain)
-[![license](https://img.shields.io/github/license/open-mmlab/mmpretrain.svg)](https://github.com/open-mmlab/mmpretrain/blob/main/LICENSE)
-[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmpretrain.svg)](https://github.com/open-mmlab/mmpretrain/issues)
-[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmpretrain.svg)](https://github.com/open-mmlab/mmpretrain/issues)
-
-[📘 Documentation](https://mmpretrain.readthedocs.io/en/latest/) |
-[🛠️ Installation](https://mmpretrain.readthedocs.io/en/latest/get_started.html#installation) |
-[👀 Model Zoo](https://mmpretrain.readthedocs.io/en/latest/modelzoo_statistics.html) |
-[🆕 Update News](https://mmpretrain.readthedocs.io/en/latest/notes/changelog.html) |
-[🤔 Reporting Issues](https://github.com/open-mmlab/mmpretrain/issues/new/choose)
-
-<img src="https://user-images.githubusercontent.com/36138628/230307505-4727ad0a-7d71-4069-939d-b499c7e272b7.png" width="400"/>
-
-English | [简体中文](/README_zh-CN.md)
-
-</div>
-
-</div>
-
-<div align="center">
-  <a href="https://openmmlab.medium.com/" style="text-decoration:none;">
-    <img src="https://user-images.githubusercontent.com/25839884/219255827-67c1a27f-f8c5-46a9-811d-5e57448c61d1.png" width="3%" alt="" /></a>
-  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
-  <a href="https://discord.gg/raweFPmdzG" style="text-decoration:none;">
-    <img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a>
-  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
-  <a href="https://twitter.com/OpenMMLab" style="text-decoration:none;">
-    <img src="https://user-images.githubusercontent.com/25839884/218346637-d30c8a0f-3eba-4699-8131-512fb06d46db.png" width="3%" alt="" /></a>
-  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
-  <a href="https://www.youtube.com/openmmlab" style="text-decoration:none;">
-    <img src="https://user-images.githubusercontent.com/25839884/218346691-ceb2116a-465a-40af-8424-9f30d2348ca9.png" width="3%" alt="" /></a>
-  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
-  <a href="https://space.bilibili.com/1293512903" style="text-decoration:none;">
-    <img src="https://user-images.githubusercontent.com/25839884/219026751-d7d14cce-a7c9-4e82-9942-8375fca65b99.png" width="3%" alt="" /></a>
-  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
-  <a href="https://www.zhihu.com/people/openmmlab" style="text-decoration:none;">
-    <img src="https://user-images.githubusercontent.com/25839884/219026120-ba71e48b-6e94-4bd4-b4e9-b7d175b5e362.png" width="3%" alt="" /></a>
-</div>
-
-## Introduction
-
-MMPreTrain is an open source pre-training toolbox based on PyTorch. It is a part of the [OpenMMLab](https://openmmlab.com/) project.
-
-The `main` branch works with **PyTorch 1.8+**.
-
-### Major features
-
-- Various backbones and pretrained models
-- Rich training strategies (supervised learning, self-supervised learning, multi-modality learning etc.)
-- Bag of training tricks
-- Large-scale training configs
-- High efficiency and extensibility
-- Powerful toolkits for model analysis and experiments
-- Various out-of-box inference tasks.
-  - Image Classification
-  - Image Caption
-  - Visual Question Answering
-  - Visual Grounding
-  - Retrieval (Image-To-Image, Text-To-Image, Image-To-Text)
-
-https://github.com/open-mmlab/mmpretrain/assets/26739999/e4dcd3a2-f895-4d1b-a351-fbc74a04e904
-
-## What's new
-
-🌟 v1.2.0 was released in 04/01/2023
-
-- Support LLaVA 1.5.
-- Implement of RAM with a gradio interface.
-
-🌟 v1.1.0 was released in 12/10/2023
-
-- Support Mini-GPT4 training and provide a Chinese model (based on Baichuan-7B)
-- Support zero-shot classification based on CLIP.
-
-🌟 v1.0.0 was released in 04/07/2023
-
-- Support inference of more **multi-modal** algorithms, such as [**LLaVA**](./configs/llava/), [**MiniGPT-4**](./configs/minigpt4), [**Otter**](./configs/otter/), etc.
-- Support around **10 multi-modal** datasets!
-- Add [**iTPN**](./configs/itpn/), [**SparK**](./configs/spark/) self-supervised learning algorithms.
-- Provide examples of [New Config](./mmpretrain/configs/) and [DeepSpeed/FSDP with FlexibleRunner](./configs/mae/benchmarks/). Here are the documentation links of [New Config](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) and [DeepSpeed/FSDP with FlexibleRunner](https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.runner.FlexibleRunner.html#mmengine.runner.FlexibleRunner).
-
-🌟 Upgrade from MMClassification to MMPreTrain
-
-- Integrated Self-supervised learning algorithms from **MMSelfSup**, such as **MAE**, **BEiT**, etc.
-- Support **RIFormer**, a simple but effective vision backbone by removing token mixer.
-- Refactor dataset pipeline visualization.
-- Support **LeViT**, **XCiT**, **ViG**, **ConvNeXt-V2**, **EVA**, **RevViT**, **EfficientnetV2**, **CLIP**, **TinyViT** and **MixMIM** backbones.
-
-This release introduced a brand new and flexible training & test engine, but it's still in progress. Welcome
-to try according to [the documentation](https://mmpretrain.readthedocs.io/en/latest/).
-
-And there are some BC-breaking changes. Please check [the migration tutorial](https://mmpretrain.readthedocs.io/en/latest/migration.html).
-
-Please refer to [changelog](https://mmpretrain.readthedocs.io/en/latest/notes/changelog.html) for more details and other release history.
-
-## Installation
-
-Below are quick steps for installation:
-
-```shell
-conda create -n open-mmlab python=3.8 pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -y
-conda activate open-mmlab
-pip install openmim
-git clone https://github.com/open-mmlab/mmpretrain.git
-cd mmpretrain
-mim install -e .
-```
-
-Please refer to [installation documentation](https://mmpretrain.readthedocs.io/en/latest/get_started.html) for more detailed installation and dataset preparation.
-
-For multi-modality models support, please install the extra dependencies by:
-
-```shell
-mim install -e ".[multimodal]"
-```
-
-## User Guides
-
-We provided a series of tutorials about the basic usage of MMPreTrain for new users:
-
-- [Learn about Configs](https://mmpretrain.readthedocs.io/en/latest/user_guides/config.html)
-- [Prepare Dataset](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html)
-- [Inference with existing models](https://mmpretrain.readthedocs.io/en/latest/user_guides/inference.html)
-- [Train](https://mmpretrain.readthedocs.io/en/latest/user_guides/train.html)
-- [Test](https://mmpretrain.readthedocs.io/en/latest/user_guides/test.html)
-- [Downstream tasks](https://mmpretrain.readthedocs.io/en/latest/user_guides/downstream.html)
-
-For more information, please refer to [our documentation](https://mmpretrain.readthedocs.io/en/latest/).
-
-## Model zoo
-
-Results and models are available in the [model zoo](https://mmpretrain.readthedocs.io/en/latest/modelzoo_statistics.html).
-
-<div align="center">
-  <b>Overview</b>
-</div>
-<table align="center">
-  <tbody>
-    <tr align="center" valign="bottom">
-      <td>
-        <b>Supported Backbones</b>
-      </td>
-      <td>
-        <b>Self-supervised Learning</b>
-      </td>
-      <td>
-        <b>Multi-Modality Algorithms</b>
-      </td>
-      <td>
-        <b>Others</b>
-      </td>
-    </tr>
-    <tr valign="top">
-      <td>
-        <ul>
-        <li><a href="configs/vgg">VGG</a></li>
-        <li><a href="configs/resnet">ResNet</a></li>
-        <li><a href="configs/resnext">ResNeXt</a></li>
-        <li><a href="configs/seresnet">SE-ResNet</a></li>
-        <li><a href="configs/seresnet">SE-ResNeXt</a></li>
-        <li><a href="configs/regnet">RegNet</a></li>
-        <li><a href="configs/shufflenet_v1">ShuffleNet V1</a></li>
-        <li><a href="configs/shufflenet_v2">ShuffleNet V2</a></li>
-        <li><a href="configs/mobilenet_v2">MobileNet V2</a></li>
-        <li><a href="configs/mobilenet_v3">MobileNet V3</a></li>
-        <li><a href="configs/swin_transformer">Swin-Transformer</a></li>
-        <li><a href="configs/swin_transformer_v2">Swin-Transformer V2</a></li>
-        <li><a href="configs/repvgg">RepVGG</a></li>
-        <li><a href="configs/vision_transformer">Vision-Transformer</a></li>
-        <li><a href="configs/tnt">Transformer-in-Transformer</a></li>
-        <li><a href="configs/res2net">Res2Net</a></li>
-        <li><a href="configs/mlp_mixer">MLP-Mixer</a></li>
-        <li><a href="configs/deit">DeiT</a></li>
-        <li><a href="configs/deit3">DeiT-3</a></li>
-        <li><a href="configs/conformer">Conformer</a></li>
-        <li><a href="configs/t2t_vit">T2T-ViT</a></li>
-        <li><a href="configs/twins">Twins</a></li>
-        <li><a href="configs/efficientnet">EfficientNet</a></li>
-        <li><a href="configs/edgenext">EdgeNeXt</a></li>
-        <li><a href="configs/convnext">ConvNeXt</a></li>
-        <li><a href="configs/hrnet">HRNet</a></li>
-        <li><a href="configs/van">VAN</a></li>
-        <li><a href="configs/convmixer">ConvMixer</a></li>
-        <li><a href="configs/cspnet">CSPNet</a></li>
-        <li><a href="configs/poolformer">PoolFormer</a></li>
-        <li><a href="configs/inception_v3">Inception V3</a></li>
-        <li><a href="configs/mobileone">MobileOne</a></li>
-        <li><a href="configs/efficientformer">EfficientFormer</a></li>
-        <li><a href="configs/mvit">MViT</a></li>
-        <li><a href="configs/hornet">HorNet</a></li>
-        <li><a href="configs/mobilevit">MobileViT</a></li>
-        <li><a href="configs/davit">DaViT</a></li>
-        <li><a href="configs/replknet">RepLKNet</a></li>
-        <li><a href="configs/beit">BEiT</a></li>
-        <li><a href="configs/mixmim">MixMIM</a></li>
-        <li><a href="configs/efficientnet_v2">EfficientNet V2</a></li>
-        <li><a href="configs/revvit">RevViT</a></li>
-        <li><a href="configs/convnext_v2">ConvNeXt V2</a></li>
-        <li><a href="configs/vig">ViG</a></li>
-        <li><a href="configs/xcit">XCiT</a></li>
-        <li><a href="configs/levit">LeViT</a></li>
-        <li><a href="configs/riformer">RIFormer</a></li>
-        <li><a href="configs/glip">GLIP</a></li>
-        <li><a href="configs/sam">ViT SAM</a></li>
-        <li><a href="configs/eva02">EVA02</a></li>
-        <li><a href="configs/dinov2">DINO V2</a></li>
-        <li><a href="configs/hivit">HiViT</a></li>
-        </ul>
-      </td>
-      <td>
-        <ul>
-        <li><a href="configs/mocov2">MoCo V1 (CVPR'2020)</a></li>
-        <li><a href="configs/simclr">SimCLR (ICML'2020)</a></li>
-        <li><a href="configs/mocov2">MoCo V2 (arXiv'2020)</a></li>
-        <li><a href="configs/byol">BYOL (NeurIPS'2020)</a></li>
-        <li><a href="configs/swav">SwAV (NeurIPS'2020)</a></li>
-        <li><a href="configs/densecl">DenseCL (CVPR'2021)</a></li>
-        <li><a href="configs/simsiam">SimSiam (CVPR'2021)</a></li>
-        <li><a href="configs/barlowtwins">Barlow Twins (ICML'2021)</a></li>
-        <li><a href="configs/mocov3">MoCo V3 (ICCV'2021)</a></li>
-        <li><a href="configs/beit">BEiT (ICLR'2022)</a></li>
-        <li><a href="configs/mae">MAE (CVPR'2022)</a></li>
-        <li><a href="configs/simmim">SimMIM (CVPR'2022)</a></li>
-        <li><a href="configs/maskfeat">MaskFeat (CVPR'2022)</a></li>
-        <li><a href="configs/cae">CAE (arXiv'2022)</a></li>
-        <li><a href="configs/milan">MILAN (arXiv'2022)</a></li>
-        <li><a href="configs/beitv2">BEiT V2 (arXiv'2022)</a></li>
-        <li><a href="configs/eva">EVA (CVPR'2023)</a></li>
-        <li><a href="configs/mixmim">MixMIM (arXiv'2022)</a></li>
-        <li><a href="configs/itpn">iTPN (CVPR'2023)</a></li>
-        <li><a href="configs/spark">SparK (ICLR'2023)</a></li>
-        <li><a href="configs/mff">MFF (ICCV'2023)</a></li>
-        </ul>
-      </td>
-      <td>
-        <ul>
-        <li><a href="configs/blip">BLIP (arxiv'2022)</a></li>
-        <li><a href="configs/blip2">BLIP-2 (arxiv'2023)</a></li>
-        <li><a href="configs/ofa">OFA (CoRR'2022)</a></li>
-        <li><a href="configs/flamingo">Flamingo (NeurIPS'2022)</a></li>
-        <li><a href="configs/chinese_clip">Chinese CLIP (arxiv'2022)</a></li>
-        <li><a href="configs/minigpt4">MiniGPT-4 (arxiv'2023)</a></li>
-        <li><a href="configs/llava">LLaVA (arxiv'2023)</a></li>
-        <li><a href="configs/otter">Otter (arxiv'2023)</a></li>
-        </ul>
-      </td>
-      <td>
-      Image Retrieval Task:
-        <ul>
-        <li><a href="configs/arcface">ArcFace (CVPR'2019)</a></li>
-        </ul>
-      Training&Test Tips:
-        <ul>
-        <li><a href="https://arxiv.org/abs/1909.13719">RandAug</a></li>
-        <li><a href="https://arxiv.org/abs/1805.09501">AutoAug</a></li>
-        <li><a href="mmpretrain/datasets/samplers/repeat_aug.py">RepeatAugSampler</a></li>
-        <li><a href="mmpretrain/models/tta/score_tta.py">TTA</a></li>
-        <li>...</li>
-        </ul>
-      </td>
-  </tbody>
-</table>
-
-## Contributing
-
-We appreciate all contributions to improve MMPreTrain.
-Please refer to [CONTRUBUTING](https://mmpretrain.readthedocs.io/en/latest/notes/contribution_guide.html) for the contributing guideline.
-
-## Acknowledgement
-
-MMPreTrain is an open source project that is contributed by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedbacks.
-We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and supporting their own academic research.
-
-## Citation
-
-If you find this project useful in your research, please consider cite:
-
-```BibTeX
-@misc{2023mmpretrain,
-    title={OpenMMLab's Pre-training Toolbox and Benchmark},
-    author={MMPreTrain Contributors},
-    howpublished = {\url{https://github.com/open-mmlab/mmpretrain}},
-    year={2023}
-}
-```
-
-## License
-
-This project is released under the [Apache 2.0 license](LICENSE).
-
-## Projects in OpenMMLab
-
-- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
-- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
-- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
-- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries.
-- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark.
-- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
-- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
-- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
-- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
-- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
-- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
-- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
-- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
-- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
-- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
-- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
-- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
-- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
-- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
-- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox.
-- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
-- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
-- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab.
+## CSPNeXt Training
diff --git a/configs/cspnext/README.md b/configs/cspnext/README.md
new file mode 100644
index 00000000000..2db5a50ec5e
--- /dev/null
+++ b/configs/cspnext/README.md
@@ -0,0 +1,53 @@
+# CSPNeXt ImageNet Pre-training
+
+In this folder, we provide the imagenet pre-training config of RTMDet's backbone CSPNeXt.
+
+## Requirements
+
+To train with these configs, please install [MMClassification 1.x](https://github.com/open-mmlab/mmclassification/tree/1.x) first.
+
+Install by MIM:
+
+```shell
+mim install mmcls>=1.0.0rc0
+```
+
+or install by pip:
+
+```shell
+pip install mmcls>=1.0.0rc0
+```
+
+## Prepare Dataset
+
+To pre-train on ImageNet, you need to prepare the dataset first. Please refer to the [guide](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#imagenet).
+
+## How to Train
+
+You can use the classification config in the same way as the detection config.
+
+For single-GPU training, run:
+
+```shell
+python tools/train.py \
+    ${CONFIG_FILE} \
+    [optional arguments]
+```
+
+For multi-GPU training, run:
+
+```shell
+bash ./tools/dist_train.sh \
+    ${CONFIG_FILE} \
+    ${GPU_NUM} \
+    [optional arguments]
+```
+
+More details can be found in [user guides](https://mmdetection.readthedocs.io/en/3.x/user_guides/train.html).
+
+## Results and Models
+
+|    Model     | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) |                                                      Download                                                       |
+| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------------------------------------------------: |
+| CSPNeXt-tiny |  224x224   |   2.73    |  0.339   |   69.44   |   89.45   | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth) |
+|  CSPNeXt-s   |  224x224   |   4.89    |  0.664   |   74.41   |   92.23   |  [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth)   |
diff --git a/configs/cspnext/cpsnext-tiny-600e_in100.py b/configs/cspnext/cpsnext-tiny-600e_in100.py
new file mode 100644
index 00000000000..687a67ffb74
--- /dev/null
+++ b/configs/cspnext/cpsnext-tiny-600e_in100.py
@@ -0,0 +1,112 @@
+_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'
+work_dir = "./work_dir/cspnext-tiny-600e_in100"
+# act_cfg = dict(type='ReLU6', inplace=True)
+
+data_preprocessor = dict(
+    type='ClsDataPreprocessor',
+    num_classes=100,
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    to_rgb=True,
+)
+
+model = dict(
+    data_preprocessor=data_preprocessor,
+    backbone=dict(deepen_factor=0.167, widen_factor=0.375),
+    head=dict(in_channels=384,
+    num_classes=100))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=224),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),  # <--- [필수]
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=256, edge='short'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackInputs'),  # <--- [필수]
+]
+train_dataloader = dict(
+    batch_size=384,
+    num_workers=12,
+    dataset=dict(
+        type='ImageNet',
+        data_root='/mnt/hdd/OpenDataLab___ImageNet-100/raw/MyImagenet',
+        ann_file='meta/train.txt',
+        data_prefix='train/',
+        pipeline=train_pipeline), # <--- [추가]
+    sampler=dict(type='RepeatAugSampler', shuffle=True)
+)
+
+val_dataloader = dict(
+    batch_size=384,
+    num_workers=36,
+    dataset=dict(
+        type='ImageNet',
+        data_root='/mnt/hdd/OpenDataLab___ImageNet-100/raw/MyImagenet',
+        ann_file='meta/val.txt',
+        data_prefix='val/',
+        pipeline=test_pipeline), # <--- [추가]
+    sampler=dict(type='DefaultSampler', shuffle=False)
+)
+val_cfg = dict() 
+
+# 2. 평가 지표 설정 (정확도 Top-1, Top-5 측정)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+
+# test_dataloader = val_dataloader
+# test_evaluator = val_evaluator
+
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(
+        type='WandbVisBackend',
+        init_kwargs=dict(
+            project='csp', # WandB 프로젝트 이름
+            name='CSPNeXt-tiny-default',      # (선택) 실험 이름
+            # entity='my-username'            # (선택) 팀/유저 이름
+        )
+    )
+]
+
+visualizer = dict(
+    type='UniversalVisualizer',  # <--- [중요] mmpretrain 전용 비주얼라이저 사용
+    vis_backends=vis_backends
+)
+
+default_hooks = dict(
+    # 기존 설정 유지
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    
+    # [수정 1] Segmentation 전용 훅 제거 -> 일반 VisualizationHook 사용
+    visualization=dict(type='VisualizationHook', enable=True),
+    
+    # [수정 2] CheckpointHook의 저장 기준 변경
+    checkpoint=dict(
+        type='CheckpointHook',
+        interval=1,                # 1 Epoch마다 저장 시도
+        save_best='accuracy/top1', # [변경] Segmentation(mIoU) -> Classification(Top-1 Accuracy)
+        rule='greater',            # 정확도는 높을수록 좋으므로 greater
+        max_keep_ckpts=3,          # Best 모델 최대 3개 유지 (주석에 따라 1로 줄여도 됨)
+        save_last=True             # 가장 마지막 Epoch 모델 별도 저장
+    )
+)
+
+optim_wrapper = dict(
+    type='OptimWrapper', # 기본 OptimWrapper 사용
+    optimizer=dict(
+        type='Lamb',      # 여기에 반드시 type이 있어야 합니다.
+        lr=0.005,
+        weight_decay=0.02
+    ),
+    paramwise_cfg=dict(
+        bias_decay_mult=0.0,
+        norm_decay_mult=0.0
+    )
+)
\ No newline at end of file
diff --git a/configs/cspnext/cspnext-s_8xb256-rsb-a1-600e_in1k.py b/configs/cspnext/cspnext-s_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 00000000000..02b861c75e2
--- /dev/null
+++ b/configs/cspnext/cspnext-s_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,67 @@
+_base_ = [
+    'mmcls::_base_/datasets/imagenet_bs256_rsb_a12.py',
+    'mmcls::_base_/schedules/imagenet_bs2048_rsb.py',
+    'mmcls::_base_/default_runtime.py'
+]
+default_scope = 'mmpretrain'
+# custom_imports = dict(
+#     imports=['mmdet.models', 'mmyolo.models'], allow_failed_imports=False)
+
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='CSPNeXt',
+        arch='P5',
+        out_indices=(4, ),
+        expand_ratio=0.5,
+        deepen_factor=0.33,
+        widen_factor=0.5,
+        channel_attention=True,
+        norm_cfg=dict(type='BN'),
+        act_cfg=dict(type='SiLU')),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=512,
+        loss=dict(
+            type='LabelSmoothLoss',
+            label_smooth_val=0.1,
+            mode='original',
+            loss_weight=1.0),
+        topk=(1, 5)),
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.2),
+        dict(type='CutMix', alpha=1.0)
+    ]))
+
+# dataset settings
+train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True))
+
+# schedule settings
+optim_wrapper = dict(
+    optimizer=dict(weight_decay=0.01),
+    paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
+)
+
+param_scheduler = [
+    # warm up learning rate scheduler
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=True,
+        begin=0,
+        end=5,
+        # update by iter
+        convert_to_iter_based=True),
+    # main learning rate scheduler
+    dict(
+        type='CosineAnnealingLR',
+        T_max=595,
+        eta_min=1.0e-6,
+        by_epoch=True,
+        begin=5,
+        end=600)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=600)
diff --git a/configs/cspnext/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py b/configs/cspnext/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 00000000000..af3170bdc51
--- /dev/null
+++ b/configs/cspnext/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,5 @@
+_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'
+
+model = dict(
+    backbone=dict(deepen_factor=0.167, widen_factor=0.375),
+    head=dict(in_channels=384))
diff --git a/mmpretrain/models/backbones/__init__.py b/mmpretrain/models/backbones/__init__.py
index 60e37fb7b6e..35691539ff7 100644
--- a/mmpretrain/models/backbones/__init__.py
+++ b/mmpretrain/models/backbones/__init__.py
@@ -58,6 +58,7 @@
 from .vit_eva02 import ViTEVA02
 from .vit_sam import ViTSAM
 from .xcit import XCiT
+from .cspnext import CSPNeXt
 
 __all__ = [
     'LeNet5',
@@ -126,4 +127,5 @@
     'HiViT',
     'SparseResNet',
     'SparseConvNeXt',
+    'CSPNeXt',
 ]
diff --git a/mmpretrain/models/backbones/base_backbone_yolo.py b/mmpretrain/models/backbones/base_backbone_yolo.py
new file mode 100644
index 00000000000..c2f9bbbc7e6
--- /dev/null
+++ b/mmpretrain/models/backbones/base_backbone_yolo.py
@@ -0,0 +1,221 @@
+from abc import ABCMeta, abstractmethod
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from mmpretrain.registry import MODELS
+from mmengine.model import BaseModule
+
+@MODELS.register_module()
+class BaseBackbone(BaseModule, metaclass=ABCMeta):
+    """BaseBackbone backbone used in YOLO series.
+
+    .. code:: text
+
+     Backbone model structure diagram
+     +-----------+
+     |   input   |
+     +-----------+
+           v
+     +-----------+
+     |   stem    |
+     |   layer   |
+     +-----------+
+           v
+     +-----------+
+     |   stage   |
+     |  layer 1  |
+     +-----------+
+           v
+     +-----------+
+     |   stage   |
+     |  layer 2  |
+     +-----------+
+           v
+         ......
+           v
+     +-----------+
+     |   stage   |
+     |  layer n  |
+     +-----------+
+     In P5 model, n=4
+     In P6 model, n=5
+
+    Args:
+        arch_setting (list): Architecture of BaseBackbone.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        deepen_factor (float): Depth multiplier, multiply number of
+            blocks in CSP layer by this amount. Defaults to 1.0.
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Defaults to 1.0.
+        input_channels: Number of input image channels. Defaults to 3.
+        out_indices (Sequence[int]): Output from which stages.
+            Defaults to (2, 3, 4).
+        frozen_stages (int): Stages to be frozen (stop grad and set eval
+            mode). -1 means not freezing any parameters. Defaults to -1.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Defaults to None.
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to None.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Defaults to False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 arch_setting: list,
+                 deepen_factor: float = 1.0,
+                 widen_factor: float = 1.0,
+                 input_channels: int = 3,
+                 out_indices: Sequence[int] = (2, 3, 4),
+                 frozen_stages: int = -1,
+                 plugins: Union[dict, List[dict]] = None,
+                 norm_cfg = None,
+                 act_cfg = None,
+                 norm_eval: bool = False,
+                 init_cfg = None):
+        super().__init__(init_cfg)
+        self.num_stages = len(arch_setting)
+        self.arch_setting = arch_setting
+
+        assert set(out_indices).issubset(
+            i for i in range(len(arch_setting) + 1))
+
+        if frozen_stages not in range(-1, len(arch_setting) + 1):
+            raise ValueError('"frozen_stages" must be in range(-1, '
+                             'len(arch_setting) + 1). But received '
+                             f'{frozen_stages}')
+
+        self.input_channels = input_channels
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.widen_factor = widen_factor
+        self.deepen_factor = deepen_factor
+        self.norm_eval = norm_eval
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.plugins = plugins
+
+        self.stem = self.build_stem_layer()
+        self.layers = ['stem']
+
+        for idx, setting in enumerate(arch_setting):
+            stage = []
+            stage += self.build_stage_layer(idx, setting)
+            if plugins is not None:
+                stage += self.make_stage_plugins(plugins, idx, setting)
+            self.add_module(f'stage{idx + 1}', nn.Sequential(*stage))
+            self.layers.append(f'stage{idx + 1}')
+
+    @abstractmethod
+    def build_stem_layer(self):
+        """Build a stem layer."""
+        pass
+
+    @abstractmethod
+    def build_stage_layer(self, stage_idx: int, setting: list):
+        """Build a stage layer.
+
+        Args:
+            stage_idx (int): The index of a stage layer.
+            setting (list): The architecture setting of a stage layer.
+        """
+        pass
+
+    def make_stage_plugins(self, plugins, stage_idx, setting):
+        """Make plugins for backbone ``stage_idx`` th stage.
+
+        Currently we support to insert ``context_block``,
+        ``empirical_attention_block``, ``nonlocal_block``, ``dropout_block``
+        into the backbone.
+
+
+        An example of plugins format could be:
+
+        Examples:
+            >>> plugins=[
+            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
+            ...          stages=(False, True, True, True)),
+            ...     dict(cfg=dict(type='yyy'),
+            ...          stages=(True, True, True, True)),
+            ... ]
+            >>> model = YOLOv5CSPDarknet()
+            >>> stage_plugins = model.make_stage_plugins(plugins, 0, setting)
+            >>> assert len(stage_plugins) == 1
+
+        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1 -> conv2 -> conv3 -> yyy
+
+        Suppose ``stage_idx=1``, the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1 -> conv2 -> conv3 -> xxx -> yyy
+
+
+        Args:
+            plugins (list[dict]): List of plugins cfg to build. The postfix is
+                required if multiple same type plugins are inserted.
+            stage_idx (int): Index of stage to build
+                If stages is missing, the plugin would be applied to all
+                stages.
+            setting (list): The architecture setting of a stage layer.
+
+        Returns:
+            list[nn.Module]: Plugins for current stage
+        """
+        # TODO: It is not general enough to support any channel and needs
+        # to be refactored
+        in_channels = int(setting[1] * self.widen_factor)
+        plugin_layers = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            stages = plugin.pop('stages', None)
+            assert stages is None or len(stages) == self.num_stages
+            if stages is None or stages[stage_idx]:
+                name, layer = build_plugin_layer(
+                    plugin['cfg'], in_channels=in_channels)
+                plugin_layers.append(layer)
+        return plugin_layers
+
+    def _freeze_stages(self):
+        """Freeze the parameters of the specified stage so that they are no
+        longer updated."""
+        if self.frozen_stages >= 0:
+            for i in range(self.frozen_stages + 1):
+                m = getattr(self, self.layers[i])
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode: bool = True):
+        """Convert the model into training mode while keep normalization layer
+        frozen."""
+        super().train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+    def forward(self, x: torch.Tensor) -> tuple:
+        """Forward batch_inputs from the data_preprocessor."""
+        outs = []
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
diff --git a/mmpretrain/models/backbones/cspnext.py b/mmpretrain/models/backbones/cspnext.py
new file mode 100644
index 00000000000..fe87a12985c
--- /dev/null
+++ b/mmpretrain/models/backbones/cspnext.py
@@ -0,0 +1,194 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import List, Sequence, Union
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+
+from mmpretrain.registry import MODELS
+from mmpretrain.utils.typing_utils import ConfigType, OptConfigType, OptMultiConfig, MultiConfig
+#from ..layers import SPPFBottleneck
+from mmengine.model import BaseModule
+from .base_backbone_yolo import BaseBackbone
+
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Sequence, Union
+
+from mmengine.model import BaseModule
+
+from ..layers.yolo_blocks import DarknetBottleneck, CSPLayer, SPPFBottleneck
+
+@MODELS.register_module()
+class CSPNeXt(BaseBackbone):
+    """CSPNeXt backbone used in RTMDet.
+
+    Args:
+        arch (str): Architecture of CSPNeXt, from {P5, P6}.
+            Defaults to P5.
+        deepen_factor (float): Depth multiplier, multiply number of
+            blocks in CSP layer by this amount. Defaults to 1.0.
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Defaults to 1.0.
+        out_indices (Sequence[int]): Output from which stages.
+            Defaults to (2, 3, 4).
+        frozen_stages (int): Stages to be frozen (stop grad and set eval
+            mode). -1 means not freezing any parameters. Defaults to -1.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+            - cfg (dict, required): Cfg dict to build plugin.Defaults to
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Defaults to False.
+        expand_ratio (float): Ratio to adjust the number of channels of the
+            hidden layer. Defaults to 0.5.
+        arch_ovewrite (list): Overwrite default arch settings.
+            Defaults to None.
+        channel_attention (bool): Whether to add channel attention in each
+            stage. Defaults to True.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
+            config norm layer. Defaults to dict(type='BN', requires_grad=True).
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='SiLU', inplace=True).
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict.
+    """
+    # From left to right:
+    # in_channels, out_channels, num_blocks, add_identity, use_spp
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 6, True, False], [512, 1024, 3, False, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 6, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, False, True]]
+    }
+
+    def __init__(
+        self,
+        arch: str = 'P5',
+        deepen_factor: float = 1.0,
+        widen_factor: float = 1.0,
+        input_channels: int = 3,
+        out_indices: Sequence[int] = (2, 3, 4),
+        frozen_stages: int = -1,
+        plugins: Union[dict, List[dict]] = None,
+        use_depthwise: bool = False,
+        expand_ratio: float = 0.5,
+        arch_ovewrite: dict = None,
+        channel_attention: bool = True,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: ConfigType = dict(type='BN'),
+        act_cfg: ConfigType = dict(type='SiLU', inplace=True),
+        norm_eval: bool = False,
+        init_cfg: OptMultiConfig = dict(
+            type='Kaiming',
+            layer='Conv2d',
+            a=math.sqrt(5),
+            distribution='uniform',
+            mode='fan_in',
+            nonlinearity='leaky_relu')
+    ) -> None:
+        arch_setting = self.arch_settings[arch]
+        if arch_ovewrite:
+            arch_setting = arch_ovewrite
+        self.channel_attention = channel_attention
+        self.use_depthwise = use_depthwise
+        self.conv = DepthwiseSeparableConvModule \
+            if use_depthwise else ConvModule
+        self.expand_ratio = expand_ratio
+        self.conv_cfg = conv_cfg
+
+        super().__init__(
+            arch_setting,
+            deepen_factor,
+            widen_factor,
+            input_channels,
+            out_indices,
+            frozen_stages=frozen_stages,
+            plugins=plugins,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            norm_eval=norm_eval,
+            init_cfg=init_cfg)
+
+    def build_stem_layer(self) -> nn.Module:
+        """Build a stem layer."""
+        stem = nn.Sequential(
+            ConvModule(
+                3,
+                int(self.arch_setting[0][0] * self.widen_factor // 2),
+                3,
+                padding=1,
+                stride=2,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg),
+            ConvModule(
+                int(self.arch_setting[0][0] * self.widen_factor // 2),
+                int(self.arch_setting[0][0] * self.widen_factor // 2),
+                3,
+                padding=1,
+                stride=1,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg),
+            ConvModule(
+                int(self.arch_setting[0][0] * self.widen_factor // 2),
+                int(self.arch_setting[0][0] * self.widen_factor),
+                3,
+                padding=1,
+                stride=1,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg))
+        return stem
+
+    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
+        """Build a stage layer.
+
+        Args:
+            stage_idx (int): The index of a stage layer.
+            setting (list): The architecture setting of a stage layer.
+        """
+        in_channels, out_channels, num_blocks, add_identity, use_spp = setting
+
+        in_channels = int(in_channels * self.widen_factor)
+        out_channels = int(out_channels * self.widen_factor)
+        num_blocks = max(round(num_blocks * self.deepen_factor), 1)
+
+        stage = []
+        conv_layer = self.conv(
+            in_channels,
+            out_channels,
+            3,
+            stride=2,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        stage.append(conv_layer)
+        if use_spp:
+            spp = SPPFBottleneck(
+                out_channels,
+                out_channels,
+                kernel_sizes=5,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg)
+            stage.append(spp)
+        csp_layer = CSPLayer(
+            out_channels,
+            out_channels,
+            num_blocks=num_blocks,
+            add_identity=add_identity,
+            use_depthwise=self.use_depthwise,
+            # use_cspnext_block=True,
+            expand_ratio=self.expand_ratio,
+            # channel_attention=self.channel_attention,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        stage.append(csp_layer)
+        return stage
diff --git a/mmpretrain/models/layers/__init__.py b/mmpretrain/models/layers/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/mmpretrain/models/layers/yolo_blocks.py b/mmpretrain/models/layers/yolo_blocks.py
new file mode 100644
index 00000000000..ae374307b43
--- /dev/null
+++ b/mmpretrain/models/layers/yolo_blocks.py
@@ -0,0 +1,242 @@
+import torch
+import torch.nn as nn
+
+from typing import List, Optional, Sequence, Tuple, Union
+
+from mmengine.model import BaseModule
+from mmpretrain.utils import ConfigType, OptConfigType, OptMultiConfig
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+ 
+class DarknetBottleneck(BaseModule):
+    """The basic bottleneck block used in Darknet.
+
+    Each ResBlock consists of two ConvModules and the input is added to the
+    final output. Each ConvModule is composed of Conv, BN, and LeakyReLU.
+    The first convLayer has filter size of 1x1 and the second one has the
+    filter size of 3x3.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (int): The kernel size of the convolution. Default: 0.5
+        add_identity (bool): Whether to add identity to the out.
+            Default: True
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Default: False
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 expansion=0.5,
+                 add_identity=True,
+                 use_depthwise=False,
+                 conv_cfg: OptConfigType =None,
+                 norm_cfg: ConfigType =dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType =dict(type='Swish'),
+                 init_cfg: OptMultiConfig =None):
+        super().__init__(init_cfg)
+        hidden_channels = int(out_channels * expansion)
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.conv1 = ConvModule(
+            in_channels,
+            hidden_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.conv2 = conv(
+            hidden_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.add_identity = \
+            add_identity and in_channels == out_channels
+
+    def forward(self, x):
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+
+        if self.add_identity:
+            return out + identity
+        else:
+            return out
+
+
+class CSPLayer(BaseModule):
+    """Cross Stage Partial Layer.
+
+    Args:
+        in_channels (int): The input channels of the CSP layer.
+        out_channels (int): The output channels of the CSP layer.
+        expand_ratio (float): Ratio to adjust the number of channels of the
+            hidden layer. Default: 0.5
+        num_blocks (int): Number of blocks. Default: 1
+        add_identity (bool): Whether to add identity in blocks.
+            Default: True
+        use_depthwise (bool): Whether to depthwise separable convolution in
+            blocks. Default: False
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN')
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish')
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 expand_ratio=0.5,
+                 num_blocks=1,
+                 add_identity=True,
+                 use_depthwise=False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
+                 init_cfg: OptMultiConfig =None):
+        super().__init__(init_cfg)
+        mid_channels = int(out_channels * expand_ratio)
+        self.main_conv = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.short_conv = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.final_conv = ConvModule(
+            2 * mid_channels,
+            out_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        self.blocks = nn.Sequential(*[
+            DarknetBottleneck(
+                mid_channels,
+                mid_channels,
+                1.0,
+                add_identity,
+                use_depthwise,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg) for _ in range(num_blocks)
+        ])
+
+    def forward(self, x):
+        x_short = self.short_conv(x)
+
+        x_main = self.main_conv(x)
+        x_main = self.blocks(x_main)
+
+        x_final = torch.cat((x_main, x_short), dim=1)
+        return self.final_conv(x_final)
+
+
+class SPPFBottleneck(BaseModule):
+    """Spatial pyramid pooling - Fast (SPPF) layer for
+    YOLOv5, YOLOX and PPYOLOE by Glenn Jocher
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        kernel_sizes (int, tuple[int]): Sequential or number of kernel
+            sizes of pooling layers. Defaults to 5.
+        use_conv_first (bool): Whether to use conv before pooling layer.
+            In YOLOv5 and YOLOX, the para set to True.
+            In PPYOLOE, the para set to False.
+            Defaults to True.
+        mid_channels_scale (float): Channel multiplier, multiply in_channels
+            by this amount to get mid_channels. This parameter is valid only
+            when use_conv_fist=True.Defaults to 0.5.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None.
+            which means using conv2d. Defaults to None.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='SiLU', inplace=True).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_sizes: Union[int, Sequence[int]] = 5,
+                 use_conv_first: bool = True,
+                 mid_channels_scale: float = 0.5,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType  = dict(type='SiLU', inplace=True),
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg)
+
+        if use_conv_first:
+            mid_channels = int(in_channels * mid_channels_scale)
+            self.conv1 = ConvModule(
+                in_channels,
+                mid_channels,
+                1,
+                stride=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+        else:
+            mid_channels = in_channels
+            self.conv1 = None
+        self.kernel_sizes = kernel_sizes
+        if isinstance(kernel_sizes, int):
+            self.poolings = nn.MaxPool2d(
+                kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2)
+            conv2_in_channels = mid_channels * 4
+        else:
+            self.poolings = nn.ModuleList([
+                nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+                for ks in kernel_sizes
+            ])
+            conv2_in_channels = mid_channels * (len(kernel_sizes) + 1)
+
+        self.conv2 = ConvModule(
+            conv2_in_channels,
+            out_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def forward(self, x):
+        """Forward process
+        Args:
+            x (Tensor): The input tensor.
+        """
+        if self.conv1:
+            x = self.conv1(x)
+        if isinstance(self.kernel_sizes, int):
+            y1 = self.poolings(x)
+            y2 = self.poolings(y1)
+            x = torch.cat([x, y1, y2, self.poolings(y2)], dim=1)
+        else:
+            x = torch.cat(
+                [x] + [pooling(x) for pooling in self.poolings], dim=1)
+        x = self.conv2(x)
+        return x
\ No newline at end of file
diff --git a/mmpretrain/utils/__init__.py b/mmpretrain/utils/__init__.py
index 991e3217d2f..fab8559df85 100644
--- a/mmpretrain/utils/__init__.py
+++ b/mmpretrain/utils/__init__.py
@@ -5,8 +5,18 @@
 from .misc import get_ori_model
 from .progress import track, track_on_main_process
 from .setup_env import register_all_modules
+from .typing_utils import ConfigType, OptConfigType, MultiConfig, OptMultiConfig, InstanceList, OptInstanceList, PixelList, OptPixelList, RangeType
 
 __all__ = [
     'collect_env', 'register_all_modules', 'track_on_main_process',
-    'load_json_log', 'get_ori_model', 'track', 'require'
+    'load_json_log', 'get_ori_model', 'track', 'require',
+    'ConfigType', 
+    'OptConfigType', 
+    'MultiConfig', 
+    'OptMultiConfig', 
+    'InstanceList', 
+    'OptInstanceList', 
+    'PixelList', 
+    'OptPixelList', 
+    'RangeType'
 ]
diff --git a/mmpretrain/utils/typing_utils.py b/mmpretrain/utils/typing_utils.py
new file mode 100644
index 00000000000..127b5a361b0
--- /dev/null
+++ b/mmpretrain/utils/typing_utils.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Collecting some commonly used type hint in mmdetection."""
+from typing import List, Optional, Sequence, Tuple, Union
+
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData, PixelData
+
+# TODO: Need to avoid circular import with assigner and sampler
+# Type hint of config data
+ConfigType = Union[ConfigDict, dict]
+OptConfigType = Optional[ConfigType]
+# Type hint of one or more config data
+MultiConfig = Union[ConfigType, List[ConfigType]]
+OptMultiConfig = Optional[MultiConfig]
+
+InstanceList = List[InstanceData]
+OptInstanceList = Optional[InstanceList]
+
+PixelList = List[PixelData]
+OptPixelList = Optional[PixelList]
+
+RangeType = Sequence[Tuple[int, int]]
\ No newline at end of file