Update README.md and add more experiment results
tianhao2 committed Aug 4, 2023
1 parent 001242c commit 32b65ff
Showing 6 changed files with 1,808 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -45,7 +45,14 @@ The proposed approach achieves the new state-of-the-art **56.9\%** in terms of NDS
| [R101-DCN](https://github.com/zhiqi-li/storage/releases/download/v1.0/r101_dcn_fcos3d_pretrain.pth) | BEVFormer-base | 24ep | 51.7|41.6 |28500M |[config](projects/configs/bevformer/bevformer_base.py) | [model](https://github.com/zhiqi-li/storage/releases/download/v1.0/bevformer_r101_dcn_24ep.pth)/[log](https://github.com/zhiqi-li/storage/releases/download/v1.0/bevformer_r101_dcn_24ep.log) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1-base | 24ep | 42.6 | 35.1 | 23952M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1-base | 48ep | 43.9 | 35.9 | 23952M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1 | 24ep | 45.3 | 38.1 | 37579M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t1 | 48ep | 46.5 | 39.5 | 37579M |[config](projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t2 | 24ep | 51.8 | 42.0 | 38954M |[config](projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t2 | 48ep | 52.6 | 43.1 | 38954M |[config](projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |
| [R50](https://pan.baidu.com/s/1Jh5Aq2YwcD6tdj7Sl5BB3g?pwd=5rij) | BEVFormerV2-t8 | 24ep | 55.3 | 46.0 | 40392M |[config](projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py) | [model/log](https://pan.baidu.com/s/1ynzlAt1DQbH8NkqmisatTw?pwd=fdcv) |

# Catalog
- [ ] BEVFormerV2 HyperQuery
- [ ] BEVFormerV2 Optimization, including memory, speed, and inference.
- [x] BEVFormerV2 Release
- [ ] BEV Segmentation checkpoints
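
For a quick sanity check of any entry in the table above, the config can be loaded standalone with mmcv. A minimal sketch, not part of this commit; it assumes `mmcv` (pre-2.0 `Config` API) is installed and is run from the repository root:

```python
# Load a BEVFormerV2 config and inspect a few derived values.
from mmcv import Config

cfg = Config.fromfile('projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py')
print(cfg.model.type)       # BEVFormerV2
print(cfg.total_epochs)     # 24
print(cfg.data.train.type)  # CustomNuScenesDatasetV2
```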
360 changes: 360 additions & 0 deletions projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
@@ -0,0 +1,360 @@
# mAP: 0.3805
# mATE: 0.7198
# mASE: 0.2805
# mAOE: 0.4131
# mAVE: 0.7652
# mAAE: 0.1951
# NDS: 0.4529
_base_ = [
'../_base_/default_runtime.py'
]
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
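# Layout: [x_min, y_min, z_min, x_max, y_max, z_max], in meters.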
# For nuScenes we usually do 10-class detection
class_names = [
'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for the nuScenes dataset; this is consistent with the
# submission format, which requires the information in input_modality.
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
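# Caffe-style normalization: ImageNet BGR means with unit std (to_rgb=False),
# matching the style='caffe' ResNet backbone configured below.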
bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
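# frames=(0,) uses only the current frame (the single-frame "t1" variant); the
# t2/t8 configs presumably extend this tuple with past frames.
# group_detr=11 trains 11 parallel query groups (Group-DETR-style one-to-many
# assignment); only a single group is needed at inference.
# With bev_h_ = bev_w_ = 200 over the 102.4 m x/y span of point_cloud_range,
# each BEV cell covers 102.4 / 200 = 0.512 m.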
ida_aug_conf = {
"reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
"crop": (0, 260, 1600, 900),
"H": 900,
"W": 1600,
"rand_flip": True,
}
ida_aug_conf_eval = {
"reisze": [640, ],
"crop": (0, 260, 1600, 900),
"H": 900,
"W": 1600,
"rand_flip": False,
}
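# Note: 'reisze' (sic) is kept as-is since it is the dict key the
# CropResizeFlipImage transform reads. The crop box (0, 260, 1600, 900) appears
# to drop the top 260 rows of the 900-pixel-high input, leaving a 1600x640
# region, consistent with the img_scale=(1600, 640) used at evaluation.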
# file_client_args = dict(backend='disk')
# Uncomment the following to use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
dict(type='GlobalRotScaleTransImage',
rot_range=[-22.5, 22.5],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
reverse_angle=True,
training=True,
flip_dx_ratio=0.5,
flip_dy_ratio=0.5,
only_gt=True,),
dict(
type='ObjectRangeFilter',
point_cloud_range=point_cloud_range),
dict(
type='ObjectNameFilter',
classes=class_names),
dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='CustomCollect3D',
keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
dict(type='DD3DMapper',
is_train=True,
tasks=dict(box2d_on=True, box3d_on=True),)
]
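# The trailing DD3DMapper step appears to turn the per-camera annotations
# collected via 'mono_input_dict' into 2D/3D targets for the auxiliary
# NuscenesDD3D head configured in the model below.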
eval_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1600, 640),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='CustomCollect3D',
keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
'lidar2ego_rotation', 'timestamp'])
])
]

data = dict(
samples_per_gpu=1,
workers_per_gpu=4,
persistent_workers=True,
train=dict(
type='CustomNuScenesDatasetV2',
frames=frames,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
box_type_3d='LiDAR',
mono_cfg=dict(
name='nusc_trainval',
data_root='data/nuscenes/',
min_num_lidar_points=3,
min_box_visibility=0.2)),
val=dict(
type='CustomNuScenesDatasetV2',
frames=frames,
data_root='data/nuscenes/',
ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
pipeline=eval_pipeline,
classes=class_names,
modality=input_modality,
samples_per_gpu=1),
test=dict(
type='CustomNuScenesDatasetV2',
frames=frames,
data_root='data/nuscenes/',
ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
pipeline=eval_pipeline,
classes=class_names,
modality=input_modality),
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)

# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
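# Dimension bookkeeping: LearnedPositionalEncoding concatenates row and column
# embeddings of _pos_dim_ channels each, so _dim_ = 2 * _pos_dim_. The FPN
# emits _num_mono_levels_ (5) feature maps; the BEV encoder consumes the first
# _num_levels_ (4), while the monocular DD3D head uses all five (strides 8-128).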

model = dict(
type='BEVFormerV2',
use_grid_mask=True,
video_test_mode=False,
num_levels=_num_levels_,
num_mono_levels=_num_mono_levels_,
mono_loss_weight=1.0,
frames=frames,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(1, 2, 3),
frozen_stages=-1,
norm_cfg=dict(type='SyncBN'),
norm_eval=False,
style='caffe'),
img_neck=dict(
type='FPN',
in_channels=[512, 1024, 2048],
out_channels=_dim_,
start_level=0,
add_extra_convs='on_output',
num_outs=_num_mono_levels_,
relu_before_extra_convs=True),
pts_bbox_head=dict(
type='BEVFormerHead_GroupDETR',
group_detr=group_detr,
bev_h=bev_h_,
bev_w=bev_w_,
num_query=900,
num_classes=10,
in_channels=_dim_,
sync_cls_avg_factor=True,
with_box_refine=True,
as_two_stage=False,
transformer=dict(
type='PerceptionTransformerV2',
embed_dims=_dim_,
frames=frames,
encoder=dict(
type='BEVFormerEncoder',
num_layers=6,
pc_range=point_cloud_range,
num_points_in_pillar=4,
return_intermediate=False,
transformerlayers=dict(
type='BEVFormerLayer',
attn_cfgs=[
dict(
type='TemporalSelfAttention',
embed_dims=_dim_,
num_levels=1),
dict(
type='SpatialCrossAttention',
pc_range=point_cloud_range,
deformable_attention=dict(
type='MSDeformableAttention3D',
embed_dims=_dim_,
num_points=8,
num_levels=4),
embed_dims=_dim_)
],
feedforward_channels=_ffn_dim_,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
decoder=dict(
type='DetectionTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='GroupMultiheadAttention',
group=group_detr,
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
type='CustomMSDeformableAttention',
embed_dims=_dim_,
num_levels=1)
],
feedforward_channels=_ffn_dim_,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
bbox_coder=dict(
type='NMSFreeCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
pc_range=point_cloud_range,
max_num=300,
voxel_size=voxel_size,
num_classes=10),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=_pos_dim_,
row_num_embed=bev_h_,
col_num_embed=bev_w_),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
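        # With loss_weight=0.0 here (and iou_cost weight 0.0 in the assigner
        # below), the GIoU term is effectively disabled; the detection head is
        # supervised by the focal classification loss and the SmoothL1 box
        # regression loss.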
fcos3d_bbox_head=dict(
type='NuscenesDD3D',
num_classes=10,
in_channels=_dim_,
strides=[8, 16, 32, 64, 128],
box3d_on=True,
feature_locations_offset='none',
fcos2d_cfg=dict(
num_cls_convs=4,
num_box_convs=4,
norm='SyncBN',
use_deformable=False,
use_scale=True,
box2d_scale_init_factor=1.0),
fcos2d_loss_cfg=dict(
focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
fcos3d_cfg=dict(
num_convs=4,
norm='SyncBN',
use_scale=True,
depth_scale_init_factor=0.3,
proj_ctr_scale_init_factor=1.0,
use_per_level_predictors=False,
class_agnostic=False,
use_deformable=False,
mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
fcos3d_loss_cfg=dict(
min_depth=0.1,
max_depth=80.0,
box3d_loss_weight=2.0,
conf3d_loss_weight=1.0,
conf_3d_temperature=1.0,
smooth_l1_loss_beta=0.05,
max_loss_per_group=20,
predict_allocentric_rot=True,
scale_depth_by_focal_lengths=True,
scale_depth_by_focal_lengths_factor=500.0,
class_agnostic=False,
predict_distance=False,
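            # Per-class canonical box-size priors in meters, apparently
            # (width, length, height), ordered as in class_names above.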
canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
[0.61416006, 1.7016163, 1.3054738],
[2.9139307, 10.725025, 3.2832346],
[1.9751819, 4.641267, 1.74352],
[2.772134, 6.565072, 3.2474296],
[0.7800532, 2.138673, 1.4437162],
[0.6667362, 0.7181772, 1.7616143],
[0.40246472, 0.4027083, 1.0084083],
[3.0059454, 12.8197, 4.1213827],
[2.4986045, 6.9310856, 2.8382742]]),
target_assign_cfg=dict(
center_sample=True,
pos_radius=1.5,
sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
(512, 100000000.0))),
nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
train_cfg=dict(
pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='HungarianAssigner3D',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='SmoothL1Cost', weight=0.75),
iou_cost=dict(type='IoUCost', weight=0.0),
pc_range=point_cloud_range))))

# optimizer
optimizer = dict(
type='AdamW',
lr=4e-4,
paramwise_cfg=dict(
custom_keys=dict(
img_backbone=dict(lr_mult=0.5),
)),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=2000,
warmup_ratio=1.0 / 3,
step=[20, ])
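# Step decay at epoch 20 of 24, after a 2000-iteration linear warmup starting
# at lr / 3 (warmup_ratio).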
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)