diff --git a/README.md b/README.md index 5a9c221305c..89748a970d0 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ We are excited to announce our latest work on real-time object recognition tasks **v3.1.0** was released on 30/6/2023: - Supports tracking algorithms, including the multi-object tracking (MOT) algorithms SORT, DeepSORT, StrongSORT, OCSORT, ByteTrack, and QDTrack, and the video instance segmentation (VIS) algorithms MaskTrackRCNN and Mask2Former-VIS. +- Supports [ViTDet](projects/ViTDet) - Supports inference and evaluation of the multimodal algorithms [GLIP](configs/glip) and [XDecoder](projects/XDecoder), and also supports datasets such as COCO semantic segmentation, COCO Caption, ADE20k general segmentation, and RefCOCO. GLIP fine-tuning will be supported in the future. - Provides a [gradio demo](https://github.com/open-mmlab/mmdetection/blob/dev-3.x/projects/gradio_demo/README.md) for MMDetection's image tasks, making it easy for users to try them out. diff --git a/README_zh-CN.md b/README_zh-CN.md index 3812169f7c7..7f2713dec75 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -120,6 +120,7 @@ MMDetection is an open-source object detection toolbox based on PyTorch. It is [Ope **v3.1.0** was released on 2023.6.30: - Supports tracking algorithms, including the multi-object tracking (MOT) algorithms SORT, DeepSORT, StrongSORT, OCSORT, ByteTrack, and QDTrack, and the video instance segmentation (VIS) algorithms MaskTrackRCNN and Mask2Former-VIS. +- Supports [ViTDet](projects/ViTDet) - Supports inference and evaluation of the multimodal open-set detection algorithms [GLIP](configs/glip) and [XDecoder](projects/XDecoder), along with datasets such as COCO semantic segmentation, COCO Caption, ADE20k general segmentation, and RefCOCO. GLIP fine-tuning will be supported later. - Provides a [gradio demo](https://github.com/open-mmlab/mmdetection/blob/dev-3.x/projects/gradio_demo/README.md) covering MMDetection's image tasks, so users can try it out quickly. diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 9c12195c0cd..88dfe98145f 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -5,6 +5,7 @@ ### Highlights - Supports tracking algorithms, including the multi-object tracking (MOT) algorithms SORT, DeepSORT, StrongSORT, OCSORT, ByteTrack, and QDTrack, and the video instance segmentation (VIS) algorithms MaskTrackRCNN and Mask2Former-VIS. +- Supports [ViTDet](../../../projects/ViTDet) - Supports inference and evaluation of the multimodal algorithms [GLIP](../../../configs/glip) and [XDecoder](../../../projects/XDecoder), and also supports datasets such as COCO semantic segmentation, COCO Caption, ADE20k general segmentation, and RefCOCO. GLIP fine-tuning will be supported in the future. - Provides a [gradio demo](https://github.com/open-mmlab/mmdetection/blob/dev-3.x/projects/gradio_demo/README.md) for MMDetection's image tasks, making it easy for users to try them out. diff --git a/docs/en/user_guides/tracking_inference.md b/docs/en/user_guides/tracking_inference.md index 4d3cad3593d..06a6912acf6 100644 --- a/docs/en/user_guides/tracking_inference.md +++ b/docs/en/user_guides/tracking_inference.md @@ -9,7 +9,7 @@ Note that if you use a folder as the input, the image names there must be **sor This script can run inference on an input video or a folder of images with a multi-object tracking or video instance segmentation model.
```shell -python demo/demo_mot.py \ +python demo/mot_demo.py \ ${INPUTS} ${CONFIG_FILE} \ [--checkpoint ${CHECKPOINT_FILE}] \ @@ -39,7 +39,7 @@ Optional arguments: ```shell # Example 1: do not specify --checkpoint to use --detector -python demo/demo_mot.py \ +python demo/mot_demo.py \ demo/demo_mot.mp4 \ configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ --detector \ @@ -47,7 +47,7 @@ python demo/demo_mot.py \ --out mot.mp4 # Example 2: use --checkpoint -python demo/demo_mot.py \ +python demo/mot_demo.py \ demo/demo_mot.mp4 \ configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ --checkpoint https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth \ diff --git a/mmdet/configs/_base_/datasets/coco_instance.py b/mmdet/configs/_base_/datasets/coco_instance.py new file mode 100644 index 00000000000..b9575432e26 --- /dev/null +++ b/mmdet/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco import CocoDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_metric import CocoMetric + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example of using a different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer it from the prefix (LMDB and Memcached are not yet supported) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: use `backend_args` (named `file_client_args` in versions before 3.0.0rc6) +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have ground-truth annotations, delete this pipeline step + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_val2017.json', +
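+        # evaluation uses the val2017 split; test_mode=True skips the training-time empty-GT filtering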
data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# Inference on the test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=CocoDataset, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoMetric, +# metric=['bbox', 'segm'], +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_instance/test') diff --git a/mmdet/configs/_base_/datasets/coco_instance_semantic.py b/mmdet/configs/_base_/datasets/coco_instance_semantic.py new file mode 100644 index 00000000000..7cf5b2cfab8 --- /dev/null +++ b/mmdet/configs/_base_/datasets/coco_instance_semantic.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco import CocoDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_metric import CocoMetric + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example of using a different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer it from the prefix (LMDB and Memcached are not yet supported) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: use `backend_args` (named `file_client_args` in versions before 3.0.0rc6) +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True, with_seg=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have ground-truth annotations, delete this pipeline step + dict(type=LoadAnnotations, with_bbox=True, with_mask=True, with_seg=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, +
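+        # with_seg=True in the pipeline reads semantic masks via the seg='stuffthingmaps/train2017/' prefix above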
backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdet/configs/_base_/datasets/coco_panoptic.py b/mmdet/configs/_base_/datasets/coco_panoptic.py new file mode 100644 index 00000000000..29d655ff619 --- /dev/null +++ b/mmdet/configs/_base_/datasets/coco_panoptic.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco_panoptic import CocoPanopticDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadPanopticAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_panoptic_metric import CocoPanopticMetric + +# dataset settings +dataset_type = 'CocoPanopticDataset' +data_root = 'data/coco/' + +# Example of using a different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer it from the prefix (LMDB and Memcached are not yet supported) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: use `backend_args` (named `file_client_args` in versions before 3.0.0rc6) +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadPanopticAnnotations, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadPanopticAnnotations, backend_args=backend_args), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoPanopticDataset, + data_root=data_root, + ann_file='annotations/panoptic_train2017.json', + data_prefix=dict( + img='train2017/', seg='annotations/panoptic_train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoPanopticDataset, + data_root=data_root, + ann_file='annotations/panoptic_val2017.json', + data_prefix=dict(img='val2017/', seg='annotations/panoptic_val2017/'), + test_mode=True, + pipeline=test_pipeline, +
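+        # panoptic PNG maps are read via the seg='annotations/panoptic_val2017/' prefix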
backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoPanopticMetric, + ann_file=data_root + 'annotations/panoptic_val2017.json', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=1, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=CocoPanopticDataset, +# data_root=data_root, +# ann_file='annotations/panoptic_image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoPanopticMetric, +# format_only=True, +# ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_panoptic/test') diff --git a/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 00000000000..b9132ac4033 --- /dev/null +++ b/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import SmoothL1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.cascade_roi_head import CascadeRoIHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=CascadeRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type=CascadeRoIHead, + 
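+        # three cascade stages; assigner IoU thresholds increase 0.5 -> 0.6 -> 0.7 (see train_cfg below)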
num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type=FCNMaskHead, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git 
a/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py b/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py new file mode 100644 index 00000000000..8e6654f381f --- /dev/null +++ b/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import SmoothL1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.cascade_roi_head import CascadeRoIHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=CascadeRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type=CascadeRoIHead, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + 
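+                # third (final) cascade stage with the tightest regression stds (0.033, 0.033, 0.067, 0.067)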
type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py b/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py new file mode 100644 index 00000000000..7e18de2224d --- /dev/null +++ b/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
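+# Faster R-CNN R-50-FPN base model in the pure-Python config style:
+# modules are referenced as imported classes instead of registry strings.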
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.faster_rcnn import FasterRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=FasterRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + 
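+        # inference: keep the top 1000 RPN proposals, then score-threshold and NMS in the R-CNN head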
rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) diff --git a/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py b/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 00000000000..96be6627d02 --- /dev/null +++ b/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.mask_rcnn import MaskRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=MaskRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + mask_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + 
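+            # the mask branch pools 14x14 RoI features (the bbox branch uses 7x7)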
out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type=FCNMaskHead, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py b/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000000..a81c25af8b9 --- /dev/null +++ b/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.cascade_mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py b/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000000..883f09be670 --- /dev/null +++ b/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + from .._base_.models.cascade_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py b/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000000..f0a6d5a2147 --- /dev/null +++ b/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + from .._base_.models.faster_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py b/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000000..8145d08fee8 --- /dev/null +++ b/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py b/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py new file mode 100644 index 00000000000..fc8932803ca --- /dev/null +++ b/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.datasets.coco_panoptic import * + from .._base_.schedules.schedule_1x import * + from .._base_.default_runtime import * + +from mmcv.ops import nms +from torch.nn import GroupNorm + +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.detectors.panoptic_fpn import PanopticFPN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.seg_heads.panoptic_fpn_head import PanopticFPNHead +from mmdet.models.seg_heads.panoptic_fusion_heads import HeuristicFusionHead + +model.update( + dict( + type=PanopticFPN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255), + semantic_head=dict( + type=PanopticFPNHead, + num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type=GroupNorm, num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict( + type=CrossEntropyLoss, ignore_index=255, loss_weight=0.5)), + panoptic_fusion_head=dict( + type=HeuristicFusionHead, + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict( + rcnn=dict( + score_thr=0.6, + nms=dict(type=nms, iou_threshold=0.5, class_agnostic=True), + max_per_img=100, + mask_thr_binary=0.5), + # used in HeuristicFusionHead + panoptic=dict(mask_overlap=0.5, stuff_area_limit=4096)))) + +# Forced to remove NumClassCheckHook +custom_hooks = [] diff --git a/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py b/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py new file mode 100644 index 00000000000..5dcda7bf994 --- /dev/null +++ b/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py @@ -0,0 +1,220 @@ +# Copyright (c) 
OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + from .._base_.schedules.schedule_1x import * + from .._base_.datasets.coco_detection import * + from .rtmdet_tta import * + +from mmcv.ops import nms +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR, LinearLR +from torch.nn import SyncBatchNorm +from torch.nn.modules.activation import SiLU +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.backbones.cspnext import CSPNeXt +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rtmdet_head import RTMDetSepBNHead +from mmdet.models.detectors.rtmdet import RTMDet +from mmdet.models.layers.ema import ExpMomentumEMA +from mmdet.models.losses.gfocal_loss import QualityFocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.necks.cspnext_pafpn import CSPNeXtPAFPN +from mmdet.models.task_modules.assigners.dynamic_soft_label_assigner import \ + DynamicSoftLabelAssigner +from mmdet.models.task_modules.coders.distance_point_bbox_coder import \ + DistancePointBBoxCoder +from mmdet.models.task_modules.prior_generators.point_generator import \ + MlvlPointGenerator + +model = dict( + type=RTMDet, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + batch_augments=None), + backbone=dict( + type=CSPNeXt, + arch='P5', + expand_ratio=0.5, + deepen_factor=1, + widen_factor=1, + channel_attention=True, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + neck=dict( + type=CSPNeXtPAFPN, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + bbox_head=dict( + type=RTMDetSepBNHead, + num_classes=80, + in_channels=256, + stacked_convs=2, + feat_channels=256, + anchor_generator=dict( + type=MlvlPointGenerator, offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type=DistancePointBBoxCoder), + loss_cls=dict( + type=QualityFocalLoss, use_sigmoid=True, beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type=GIoULoss, loss_weight=2.0), + with_objectness=False, + exp_on_reg=True, + share_conv=True, + pred_kernel_size=1, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + train_cfg=dict( + assigner=dict(type=DynamicSoftLabelAssigner, topk=13), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type=nms, iou_threshold=0.65), + max_per_img=300), +) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + 
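+    # stage-1 augmentation: mosaic, wide-ratio resize, crop, HSV jitter, flip, pad, mixup;
+    # PipelineSwitchHook swaps in the lighter train_pipeline_stage2 for the last 20 epochs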
dict(type=LoadAnnotations, with_bbox=True), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(640, 640), keep_ratio=True), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader.update( + dict( + batch_size=32, + num_workers=10, + batch_sampler=None, + pin_memory=True, + dataset=dict(pipeline=train_pipeline))) +val_dataloader.update( + dict(batch_size=5, num_workers=10, dataset=dict(pipeline=test_pipeline))) +test_dataloader = val_dataloader + +max_epochs = 300 +stage2_num_epochs = 20 +base_lr = 0.004 +interval = 10 + +train_cfg.update( + dict( + max_epochs=max_epochs, + val_interval=interval, + dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])) + +val_evaluator.update(dict(proposal_nums=(100, 1, 10))) +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type=LinearLR, start_factor=1.0e-5, by_epoch=False, begin=0, end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type=CosineAnnealingLR, + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks.update( + dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3 # only keep latest 3 checkpoints + ))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py b/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py new file mode 100644 index 00000000000..db21b747e95 --- /dev/null +++ b/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.layers.ema import ExpMomentumEMA + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model.update( + dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128, exp_on_reg=False))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdet/configs/rtmdet/rtmdet_tta.py b/mmdet/configs/rtmdet/rtmdet_tta.py new file mode 100644 index 00000000000..f27b7aa4a3b --- /dev/null +++ b/mmdet/configs/rtmdet/rtmdet_tta.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
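+# Test-time augmentation for RTMDet: three scales (640/320/960) x two flips = six passes,
+# merged by DetTTAModel with NMS (iou_threshold=0.6, up to 100 boxes per image).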
+from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import TestTimeAug + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import Pad, RandomFlip, Resize +from mmdet.models.test_time_augs.det_tta import DetTTAModel + +tta_model = dict( + type=DetTTAModel, + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[ + [dict(type=Resize, scale=s, keep_ratio=True) for s in img_scales], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type=RandomFlip, prob=1.), + dict(type=RandomFlip, prob=0.) + ], + [ + dict( + type=Pad, + size=(960, 960), + pad_val=dict(img=(114, 114, 114))), + ], + [dict(type=LoadAnnotations, with_bbox=True)], + [ + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +]
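For readers who want to try the new pure-Python configs above, here is a minimal sketch of how they can be loaded and launched with MMEngine. It assumes `mmengine >= 0.8.0` and `mmcv >= 2.0.1` (as the config headers note); the `work_dir` path and the batch-size override are illustrative, not part of this PR.

```python
# Minimal sketch: load a pure-Python-style config and start training.
# Assumes mmengine >= 0.8.0 / mmcv >= 2.0.1; work_dir is a hypothetical path.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py')
cfg.work_dir = './work_dirs/rtmdet_s_8xb32_300e_coco'  # hypothetical output dir

# Fields can be overridden as plain attributes, e.g. a smaller batch size:
cfg.train_dataloader.batch_size = 8

runner = Runner.from_cfg(cfg)
runner.train()
```

Because these configs reference imported classes rather than registry strings, IDEs can jump directly to definitions such as `RTMDet` or `CSPNeXt`.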