Update README.md and add more experiment results
tianhao2 committed Aug 4, 2023
1 parent 001242c commit 32b65ff
Showing 6 changed files with 1,808 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -45,7 +45,14 @@ The proposed approach achieves the new state-of-the-art **56.9\%** in terms of NDS
| [R101-DCN](https://github.com/zhiqi-li/storage/releases/download/v1.0/r101_dcn_fcos3d_pretrain.pth) | BEVFormer-base | 24ep | 51.7|41.6 |28500M |[config](projects/configs/bevformer/bevformer_base.py) | [model](https://github.com/zhiqi-li/storage/releases/download/v1.0/bevformer_r101_dcn_24ep.pth)/[log](https://github.com/zhiqi-li/storage/releases/download/v1.0/bevformer_r101_dcn_24ep.log) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1-base | 24ep | 42.6 | 35.1 | 23952M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1-base | 48ep | 43.9 | 35.9 | 23952M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1 | 24ep | 45.3 | 38.1 | 37579M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1 | 48ep | 46.5 | 39.5 | 37579M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t2 | 24ep | 51.8 | 42.0 | 38954M |[config](projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t2 | 48ep | 52.6 | 43.1 | 38954M |[config](projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t8 | 24ep | 55.3 | 46.0 | 40392M |[config](projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |

# Catalog
- [ ] BEVFormerV2 HyperQuery
- [ ] BEVFormerV2 Optimization, including memory, speed, and inference.
- [x] BEVFormerV2 Release
- [ ] BEV Segmentation checkpoints
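
For a quick sanity check of any entry in the table above, the config can be loaded standalone with mmcv. A minimal sketch, not part of this commit; it assumes `mmcv` (pre-2.0 `Config` API) is installed and is run from the repository root:

```python
# Load a BEVFormerV2 config and inspect a few derived values.
from mmcv import Config

cfg = Config.fromfile('projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py')
print(cfg.model.type)       # BEVFormerV2
print(cfg.total_epochs)     # 24
print(cfg.data.train.type)  # CustomNuScenesDatasetV2
```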
360 changes: 360 additions & 0 deletions projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
@@ -0,0 +1,360 @@
# mAP: 0.3805
# mATE: 0.7198
# mASE: 0.2805
# mAOE: 0.4131
# mAVE: 0.7652
# mAAE: 0.1951
# NDS: 0.4529
_base_ = [
'../_base_/default_runtime.py'
]
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
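# Layout: [x_min, y_min, z_min, x_max, y_max, z_max], in meters.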
# For nuScenes we usually do 10-class detection
class_names = [
'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for the nuScenes dataset; this is consistent with the
# submission format, which requires the information in input_modality.
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
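# Caffe-style normalization: ImageNet BGR means with unit std (to_rgb=False),
# matching the style='caffe' ResNet backbone configured below.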
bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
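# frames=(0,) uses only the current frame (the single-frame "t1" variant); the
# t2/t8 configs presumably extend this tuple with past frames.
# group_detr=11 trains 11 parallel query groups (Group-DETR-style one-to-many
# assignment); only a single group is needed at inference.
# With bev_h_ = bev_w_ = 200 over the 102.4 m x/y span of point_cloud_range,
# each BEV cell covers 102.4 / 200 = 0.512 m.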
ida_aug_conf = {
"reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
"crop": (0, 260, 1600, 900),
"H": 900,
"W": 1600,
"rand_flip": True,
}
ida_aug_conf_eval = {
"reisze": [640, ],
"crop": (0, 260, 1600, 900),
"H": 900,
"W": 1600,
"rand_flip": False,
}
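# Note: 'reisze' (sic) is kept as-is since it is the dict key the
# CropResizeFlipImage transform reads. The crop box (0, 260, 1600, 900) appears
# to drop the top 260 rows of the 900-pixel-high input, leaving a 1600x640
# region, consistent with the img_scale=(1600, 640) used at evaluation.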
# file_client_args = dict(backend='disk')
# Uncomment the following to use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
dict(type='GlobalRotScaleTransImage',
rot_range=[-22.5, 22.5],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
reverse_angle=True,
training=True,
flip_dx_ratio=0.5,
flip_dy_ratio=0.5,
only_gt=True,),
dict(
type='ObjectRangeFilter',
point_cloud_range=point_cloud_range),
dict(
type='ObjectNameFilter',
classes=class_names),
dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='CustomCollect3D',
keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
dict(type='DD3DMapper',
is_train=True,
tasks=dict(box2d_on=True, box3d_on=True),)
]
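# The trailing DD3DMapper step appears to turn the per-camera annotations
# collected via 'mono_input_dict' into 2D/3D targets for the auxiliary
# NuscenesDD3D head configured in the model below.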
eval_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1600, 640),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='CustomCollect3D',
keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
'lidar2ego_rotation', 'timestamp'])
])
]

data = dict(
samples_per_gpu=1,
workers_per_gpu=4,
persistent_workers=True,
train=dict(
type='CustomNuScenesDatasetV2',
frames=frames,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
box_type_3d='LiDAR',
mono_cfg=dict(
name='nusc_trainval',
data_root='data/nuscenes/',
min_num_lidar_points=3,
min_box_visibility=0.2)),
val=dict(
type='CustomNuScenesDatasetV2',
frames=frames,
data_root='data/nuscenes/',
ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
pipeline=eval_pipeline,
classes=class_names,
modality=input_modality,
samples_per_gpu=1),
test=dict(
type='CustomNuScenesDatasetV2',
frames=frames,
data_root='data/nuscenes/',
ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
pipeline=eval_pipeline,
classes=class_names,
modality=input_modality),
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)

# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
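# Dimension bookkeeping: LearnedPositionalEncoding concatenates row and column
# embeddings of _pos_dim_ channels each, so _dim_ = 2 * _pos_dim_. The FPN
# emits _num_mono_levels_ (5) feature maps; the BEV encoder consumes the first
# _num_levels_ (4), while the monocular DD3D head uses all five (strides 8-128).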

model = dict(
type='BEVFormerV2',
use_grid_mask=True,
video_test_mode=False,
num_levels=_num_levels_,
num_mono_levels=_num_mono_levels_,
mono_loss_weight=1.0,
frames=frames,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(1, 2, 3),
frozen_stages=-1,
norm_cfg=dict(type='SyncBN'),
norm_eval=False,
style='caffe'),
img_neck=dict(
type='FPN',
in_channels=[512, 1024, 2048],
out_channels=_dim_,
start_level=0,
add_extra_convs='on_output',
num_outs=_num_mono_levels_,
relu_before_extra_convs=True),
pts_bbox_head=dict(
type='BEVFormerHead_GroupDETR',
group_detr=group_detr,
bev_h=bev_h_,
bev_w=bev_w_,
num_query=900,
num_classes=10,
in_channels=_dim_,
sync_cls_avg_factor=True,
with_box_refine=True,
as_two_stage=False,
transformer=dict(
type='PerceptionTransformerV2',
embed_dims=_dim_,
frames=frames,
encoder=dict(
type='BEVFormerEncoder',
num_layers=6,
pc_range=point_cloud_range,
num_points_in_pillar=4,
return_intermediate=False,
transformerlayers=dict(
type='BEVFormerLayer',
attn_cfgs=[
dict(
type='TemporalSelfAttention',
embed_dims=_dim_,
num_levels=1),
dict(
type='SpatialCrossAttention',
pc_range=point_cloud_range,
deformable_attention=dict(
type='MSDeformableAttention3D',
embed_dims=_dim_,
num_points=8,
num_levels=4),
embed_dims=_dim_)
],
feedforward_channels=_ffn_dim_,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
decoder=dict(
type='DetectionTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='GroupMultiheadAttention',
group=group_detr,
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
type='CustomMSDeformableAttention',
embed_dims=_dim_,
num_levels=1)
],
feedforward_channels=_ffn_dim_,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
bbox_coder=dict(
type='NMSFreeCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
pc_range=point_cloud_range,
max_num=300,
voxel_size=voxel_size,
num_classes=10),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=_pos_dim_,
row_num_embed=bev_h_,
col_num_embed=bev_w_),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
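        # With loss_weight=0.0 here (and iou_cost weight 0.0 in the assigner
        # below), the GIoU term is effectively disabled; the detection head is
        # supervised by the focal classification loss and the SmoothL1 box
        # regression loss.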
fcos3d_bbox_head=dict(
type='NuscenesDD3D',
num_classes=10,
in_channels=_dim_,
strides=[8, 16, 32, 64, 128],
box3d_on=True,
feature_locations_offset='none',
fcos2d_cfg=dict(
num_cls_convs=4,
num_box_convs=4,
norm='SyncBN',
use_deformable=False,
use_scale=True,
box2d_scale_init_factor=1.0),
fcos2d_loss_cfg=dict(
focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
fcos3d_cfg=dict(
num_convs=4,
norm='SyncBN',
use_scale=True,
depth_scale_init_factor=0.3,
proj_ctr_scale_init_factor=1.0,
use_per_level_predictors=False,
class_agnostic=False,
use_deformable=False,
mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
fcos3d_loss_cfg=dict(
min_depth=0.1,
max_depth=80.0,
box3d_loss_weight=2.0,
conf3d_loss_weight=1.0,
conf_3d_temperature=1.0,
smooth_l1_loss_beta=0.05,
max_loss_per_group=20,
predict_allocentric_rot=True,
scale_depth_by_focal_lengths=True,
scale_depth_by_focal_lengths_factor=500.0,
class_agnostic=False,
predict_distance=False,
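            # Per-class canonical box-size priors in meters, apparently
            # (width, length, height), ordered as in class_names above.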
canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
[0.61416006, 1.7016163, 1.3054738],
[2.9139307, 10.725025, 3.2832346],
[1.9751819, 4.641267, 1.74352],
[2.772134, 6.565072, 3.2474296],
[0.7800532, 2.138673, 1.4437162],
[0.6667362, 0.7181772, 1.7616143],
[0.40246472, 0.4027083, 1.0084083],
[3.0059454, 12.8197, 4.1213827],
[2.4986045, 6.9310856, 2.8382742]]),
target_assign_cfg=dict(
center_sample=True,
pos_radius=1.5,
sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
(512, 100000000.0))),
nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
train_cfg=dict(
pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='HungarianAssigner3D',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='SmoothL1Cost', weight=0.75),
iou_cost=dict(type='IoUCost', weight=0.0),
pc_range=point_cloud_range))))

# optimizer
optimizer = dict(
type='AdamW',
lr=4e-4,
paramwise_cfg=dict(
custom_keys=dict(
img_backbone=dict(lr_mult=0.5),
)),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=2000,
warmup_ratio=1.0 / 3,
step=[20, ])
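# Step decay at epoch 20 of 24, after a 2000-iteration linear warmup starting
# at lr / 3 (warmup_ratio).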
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)