SCsleepy
diff --git a/‎.flake8
+1-1 b/‎.flake8
+1-1
diff --git a/‎configs/common/coco_schedule.py
+1-1 b/‎configs/common/coco_schedule.py
+1-1
diff --git a/‎detrex/layers/__init__.py
+1-1 b/‎detrex/layers/__init__.py
+1-1
diff --git a/‎detrex/layers/denoising.py
+42-25 b/‎detrex/layers/denoising.py
+42-25
diff --git a/‎detrex/layers/multi_scale_deform_attn.py
+3-1 b/‎detrex/layers/multi_scale_deform_attn.py
+3-1
diff --git a/‎detrex/layers/position_embedding.py
+2-2 b/‎detrex/layers/position_embedding.py
+2-2
diff --git a/‎detrex/layers/shape_spec.py
+2-1 b/‎detrex/layers/shape_spec.py
+2-1
diff --git a/‎detrex/layers/transformer.py
+2-1 b/‎detrex/layers/transformer.py
+2-1
diff --git a/‎detrex/modeling/backbone/convnext.py
+2-2 b/‎detrex/modeling/backbone/convnext.py
+2-2
diff --git a/‎detrex/modeling/backbone/focalnet.py
-1 b/‎detrex/modeling/backbone/focalnet.py
-1
diff --git a/‎detrex/modeling/backbone/timm_backbone.py
+8-4 b/‎detrex/modeling/backbone/timm_backbone.py
+8-4
diff --git a/‎detrex/modeling/backbone/torchvision_backbone.py
+32-30 b/‎detrex/modeling/backbone/torchvision_backbone.py
+32-30
@@ -3,7 +3,7 @@
 
 [flake8]
 ignore = W503, E203, E221, C901, C408, E741, C407, B017
-max-line-length = 100
+max-line-length = 120
 max-complexity = 18
 select = B,C,E,F,W,T4,B9
 exclude = build, detectron2
 
@@ -84,4 +84,4 @@ def default_coco_scheduler(epochs=50, decay_epochs=40, warmup_epochs=0):
 
 # warmup scheduler for detr
 lr_multiplier_50ep_warmup = default_coco_scheduler(50, 40, 1e-3)
-lr_multiplier_12ep_warmup = default_coco_scheduler(12, 11, 1e-3)
+lr_multiplier_12ep_warmup = default_coco_scheduler(12, 11, 1e-3)
@@ -49,4 +49,4 @@
     apply_label_noise,
     GenerateDNQueries,
 )
-from .shape_spec import ShapeSpec
+from .shape_spec import ShapeSpec
@@ -20,8 +20,8 @@
 
 
 def apply_label_noise(
-    labels: torch.Tensor, 
-    label_noise_prob: float = 0.2, 
+    labels: torch.Tensor,
+    label_noise_prob: float = 0.2,
     num_classes: int = 80,
 ):
     """
@@ -57,16 +57,14 @@ def apply_box_noise(
         diff = torch.zeros_like(boxes)
         diff[:, :2] = boxes[:, 2:] / 2
         diff[:, 2:] = boxes[:, 2:]
-        boxes += (
-            torch.mul((torch.rand_like(boxes) * 2 - 1.0), diff) * box_noise_scale
-        )
+        boxes += torch.mul((torch.rand_like(boxes) * 2 - 1.0), diff) * box_noise_scale
         boxes = boxes.clamp(min=0.0, max=1.0)
     return boxes
 
 
 class GenerateDNQueries(nn.Module):
     """Generate denoising queries for DN-DETR
-    
+
     Args:
         num_queries (int): Number of total queries in DN-DETR. Default: 300
         num_classes (int): Number of total categories. Default: 80.
@@ -77,6 +75,7 @@ class GenerateDNQueries(nn.Module):
         with_indicator (bool): If True, add indicator in noised label/box queries.
 
     """
+
     def __init__(
         self,
         num_queries: int = 300,
@@ -95,7 +94,7 @@ def __init__(
         self.label_noise_prob = label_noise_prob
         self.box_noise_scale = box_noise_scale
         self.with_indicator = with_indicator
-        
+
         # leave one dim for indicator mentioned in DN-DETR
         if with_indicator:
             self.label_encoder = nn.Embedding(num_classes, label_embed_dim - 1)
@@ -116,15 +115,17 @@ def generate_query_masks(self, max_gt_num_per_image, device):
                 ] = True
             if i == self.denoising_groups - 1:
                 attn_mask[
-                    max_gt_num_per_image * i : max_gt_num_per_image * (i + 1), : max_gt_num_per_image * i
+                    max_gt_num_per_image * i : max_gt_num_per_image * (i + 1),
+                    : max_gt_num_per_image * i,
                 ] = True
             else:
                 attn_mask[
                     max_gt_num_per_image * i : max_gt_num_per_image * (i + 1),
                     max_gt_num_per_image * (i + 1) : noised_query_nums,
                 ] = True
                 attn_mask[
-                    max_gt_num_per_image * i : max_gt_num_per_image * (i + 1), : max_gt_num_per_image * i
+                    max_gt_num_per_image * i : max_gt_num_per_image * (i + 1),
+                    : max_gt_num_per_image * i,
                 ] = True
         return attn_mask
 
@@ -135,7 +136,7 @@ def forward(
     ):
         """
         Args:
-            gt_boxes_list (list[torch.Tensor]): Ground truth bounding boxes per image 
+            gt_boxes_list (list[torch.Tensor]): Ground truth bounding boxes per image
                 with normalized coordinates in format ``(x, y, w, h)`` in shape ``(num_gts, 4)``
             gt_labels_list (list[torch.Tensor]): Classification labels per image in shape ``(num_gt, )``.
         """
@@ -162,7 +163,6 @@ def forward(
         # means there are 2 instances in the first image and 3 instances in the second image
         gt_nums_per_image = [x.numel() for x in gt_labels_list]
 
-
         # Add noise on labels and boxes
         noised_labels = apply_label_noise(gt_labels, self.label_noise_prob, self.num_classes)
         noised_boxes = apply_box_noise(gt_boxes, self.box_noise_scale)
@@ -175,50 +175,67 @@ def forward(
         # add indicator to label encoding if with_indicator == True
         if self.with_indicator:
             label_embedding = torch.cat([label_embedding, torch.ones([query_num, 1]).to(device)], 1)
-        
+
         # calculate the max number of ground truth in one image inside the batch.
-        # e.g. gt_nums_per_image = [2, 3] which means the first image has 2 instances and the second image has 3 instances
+        # e.g. gt_nums_per_image = [2, 3] which means
+        # the first image has 2 instances and the second image has 3 instances
         # then the max_gt_num_per_image should be 3.
         max_gt_num_per_image = max(gt_nums_per_image)
-        
+
         # the total denoising queries is depended on denoising groups and max number of instances.
         noised_query_nums = max_gt_num_per_image * self.denoising_groups
 
         # initialize the generated noised queries to zero.
         # And the zero initialized queries will be assigned with noised embeddings later.
-        noised_label_queries = torch.zeros(noised_query_nums, self.label_embed_dim).to(device).repeat(batch_size, 1, 1)
+        noised_label_queries = (
+            torch.zeros(noised_query_nums, self.label_embed_dim).to(device).repeat(batch_size, 1, 1)
+        )
         noised_box_queries = torch.zeros(noised_query_nums, 4).to(device).repeat(batch_size, 1, 1)
 
-
         # batch index per image: [0, 1, 2, 3] for batch_size == 4
         batch_idx = torch.arange(0, batch_size)
-        
+
         # e.g. gt_nums_per_image = [2, 3]
         # batch_idx = [0, 1]
-        # then the "batch_idx_per_instance" equals to [0, 0, 1, 1, 1] which indicates which image the instance belongs to.
+        # then "batch_idx_per_instance" equals [0, 0, 1, 1, 1],
+        # which indicates the image each instance belongs to.
         # cuz the instances has been flattened before.
-        batch_idx_per_instance = torch.repeat_interleave(batch_idx, torch.tensor(gt_nums_per_image).long())
+        batch_idx_per_instance = torch.repeat_interleave(
+            batch_idx, torch.tensor(gt_nums_per_image).long()
+        )
 
         # indicate which image the noised labels belong to. For example:
         # noised label: tensor([0, 1, 2, 2, 3, 4, 0, 1, 2, 2, 3, 4])
         # batch_idx_per_group: tensor([0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1])
         # which means the first label "tensor([0])"" belongs to "image_0".
         batch_idx_per_group = batch_idx_per_instance.repeat(self.denoising_groups, 1).flatten()
 
-
-        # Cuz there might be different numbers of ground truth in each image of the same batch. 
+        # There might be a different number of ground truths in each image of the same batch.
         # So there might be some padding part in noising queries.
-        # Here we calculate the indexes for the valid queries and fill them with the noised embeddings.
+        # Here we calculate the indexes for the valid queries and
+        # fill them with the noised embeddings.
         # And leave the padding part to zeros.
         if len(gt_nums_per_image):
-            valid_index_per_group = torch.cat([torch.tensor(list(range(num))) for num in gt_nums_per_image])
             valid_index_per_group = torch.cat(
-                [valid_index_per_group + max_gt_num_per_image * i for i in range(self.denoising_groups)]).long()
+                [torch.tensor(list(range(num))) for num in gt_nums_per_image]
+            )
+            valid_index_per_group = torch.cat(
+                [
+                    valid_index_per_group + max_gt_num_per_image * i
+                    for i in range(self.denoising_groups)
+                ]
+            ).long()
         if len(batch_idx_per_group):
             noised_label_queries[(batch_idx_per_group, valid_index_per_group)] = label_embedding
             noised_box_queries[(batch_idx_per_group, valid_index_per_group)] = noised_boxes
 
         # generate attention masks for transformer layers
         attn_mask = self.generate_query_masks(max_gt_num_per_image, device)
 
-        return noised_label_queries, noised_box_queries, attn_mask, self.denoising_groups, max_gt_num_per_image
+        return (
+            noised_label_queries,
+            noised_box_queries,
+            attn_mask,
+            self.denoising_groups,
+            max_gt_num_per_image,
+        )
@@ -406,4 +406,6 @@ def _dummy(*args, **kwargs):
     # TODO: register ops natively so there is no need to import _C.
     _msg = "detrex is not compiled successfully, please build following the instructions!"
     _args = ("detrex._C", _msg)
-    MultiScaleDeformableAttention = create_dummy_class("MultiScaleDeformableAttention", *_args)
+    MultiScaleDeformableAttention = create_dummy_class(  # noqa
+        "MultiScaleDeformableAttention", *_args
+    )
@@ -189,11 +189,11 @@ def get_sine_pos_embed(
         temperature (int): The temperature used for scaling
             the position embedding. Default: 10000.
         exchange_xy (bool, optional): exchange pos x and pos y. \
-            For example, input tensor is `[x, y]`, the results will 
+            For example, input tensor is `[x, y]`, the results will
             be `[pos(y), pos(x)]`. Defaults: True.
 
     Returns:
-        torch.Tensor: Returned position embedding 
+        torch.Tensor: Returned position embedding
         with shape `(None, n * num_pos_feats)`.
     """
     scale = 2 * math.pi
 
@@ -19,6 +19,7 @@
 from dataclasses import dataclass
 from typing import Optional
 
+
 @dataclass
 class ShapeSpec:
     """
@@ -30,4 +31,4 @@ class ShapeSpec:
     channels: Optional[int] = None
     height: Optional[int] = None
     width: Optional[int] = None
-    stride: Optional[int] = None
+    stride: Optional[int] = None
@@ -67,7 +67,8 @@ def __init__(
         else:
             assert len(attn) == num_attn, (
                 f"The length of attn (nn.Module or List[nn.Module]) {num_attn}"
-                f"is not consistent with the number of attention in operation_order {operation_order}"
+                f"is not consistent with the number of attention in "
+                f"operation_order {operation_order}"
             )
 
         self.num_attn = num_attn
 
@@ -22,12 +22,12 @@
 from functools import partial
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from timm.models.layers import DropPath, trunc_normal_
-from detectron2.modeling.backbone import Backbone
 
 from detrex.layers import LayerNorm
 
+from detectron2.modeling.backbone import Backbone
+
 
 class Block(nn.Module):
     r"""ConvNeXt Block. There are two equivalent implementations:
 
@@ -19,7 +19,6 @@
 # https://github.com/microsoft/FocalNet/blob/main/detection/mmdet/models/backbones/focalnet.py
 # ------------------------------------------------------------------------------------------------
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -27,8 +27,8 @@
 import torch.nn as nn
 
 from detectron2.modeling.backbone import Backbone
-from detectron2.utils.logger import setup_logger
 from detectron2.utils import comm
+from detectron2.utils.logger import setup_logger
 
 try:
     import timm
@@ -135,17 +135,21 @@ def __init__(
 
         if feature_info is not None:
             output_feature_channels = {
-                "p{}".format(out_indices[i]): feature_info.channels()[i] for i in range(len(out_indices))
+                "p{}".format(out_indices[i]): feature_info.channels()[i]
+                for i in range(len(out_indices))
             }
             out_feature_strides = {
-                "p{}".format(out_indices[i]): feature_info.reduction()[i] for i in range(len(out_indices))
+                "p{}".format(out_indices[i]): feature_info.reduction()[i]
+                for i in range(len(out_indices))
             }
 
             self._out_features = {"p{}".format(out_indices[i]) for i in range(len(out_indices))}
             self._out_feature_channels = {
                 feat: output_feature_channels[feat] for feat in self._out_features
             }
-            self._out_feature_strides = {feat: out_feature_strides[feat] for feat in self._out_features}
+            self._out_feature_strides = {
+                feat: out_feature_strides[feat] for feat in self._out_features
+            }
 
     def forward(self, x):
         """Forward function of `TimmBackbone`.
 
@@ -22,14 +22,15 @@
     from torchvision.models.feature_extraction import (
         create_feature_extractor,
     )
+
     has_feature_extractor = True
 except ImportError:
     has_feature_extractor = False
 
 
 class TorchvisionBackbone(Backbone):
     """A wrapper for torchvision pretrained backbones
-    
+
     Please check `Feature extraction for model inspection
     <https://pytorch.org/vision/stable/feature_extraction.html>`_
     for more details.
@@ -41,51 +42,52 @@ class TorchvisionBackbone(Backbone):
         return_nodes (Dict[str, str]): The keys are the node names and the values are the
             user-specified keys for the graph module's returned dictionary.
     """
-    def __init__(self,
-                 model_name: str = "resnet50",
-                 pretrained: bool = False,
-                 return_nodes: Dict[str, str] = {
-                    "layer1": "res2",
-                    "layer2": "res3",
-                    "layer3": "res4",
-                    "layer4": "res5",
-                 },
-                 train_return_nodes: Dict[str, str] = None,
-                 eval_return_nodes: Dict[str, str] = None,
-                 tracer_kwargs: Dict[str, Any] = None,
-                 suppress_diff_warnings: bool = False,
-                 **kwargs,
-                ):
+
+    def __init__(
+        self,
+        model_name: str = "resnet50",
+        pretrained: bool = False,
+        return_nodes: Dict[str, str] = {
+            "layer1": "res2",
+            "layer2": "res3",
+            "layer3": "res4",
+            "layer4": "res5",
+        },
+        train_return_nodes: Dict[str, str] = None,
+        eval_return_nodes: Dict[str, str] = None,
+        tracer_kwargs: Dict[str, Any] = None,
+        suppress_diff_warnings: bool = False,
+        **kwargs,
+    ):
         super(TorchvisionBackbone, self).__init__()
-        
+
         # build torchvision models
-        self.model = getattr(torchvision.models, model_name)(
-            pretrained=pretrained,
-            **kwargs
-        )
-        
+        self.model = getattr(torchvision.models, model_name)(pretrained=pretrained, **kwargs)
+
         if has_feature_extractor is False:
-            raise RuntimeError('Failed to import create_feature_extractor from torchvision. \
-            Please install torchvision 1.10+.')
-        
+            raise RuntimeError(
+                "Failed to import create_feature_extractor from torchvision. \
+            Please install torchvision 1.10+."
+            )
+
         # turn models into feature extractor
         self.feature_extractor = create_feature_extractor(
-            model = self.model,
+            model=self.model,
             return_nodes=return_nodes,
             train_return_nodes=train_return_nodes,
             eval_return_nodes=eval_return_nodes,
             tracer_kwargs=tracer_kwargs,
-            suppress_diff_warning=suppress_diff_warnings
+            suppress_diff_warning=suppress_diff_warnings,
         )
 
     def forward(self, x):
         """Forward function of TorchvisionBackbone
-        
+
         Args:
             x (torch.Tensor): the input tensor for feature extraction.
-        
+
         Returns:
             dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
         """
         outs = self.feature_extractor(x)
-        return outs
+        return outs
Original file line number	Diff line number	Diff line change
`@@ -49,4 +49,4 @@`
`49`	`49`	`apply_label_noise,`
`50`	`50`	`GenerateDNQueries,`
`51`	`51`	`)`
`52`		`-from .shape_spec import ShapeSpec`
	`52`	`+from .shape_spec import ShapeSpec`
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,8 @@ def __init__(`
`67`	`67`	`else:`
`68`	`68`	`assert len(attn) == num_attn, (`
`69`	`69`	`f"The length of attn (nn.Module or List[nn.Module]) {num_attn}"`
`70`		`- f"is not consistent with the number of attention in operation_order {operation_order}"`
	`70`	`+ f"is not consistent with the number of attention in "`
	`71`	`+ f"operation_order {operation_order}"`
`71`	`72`	`)`
`72`	`73`
`73`	`74`	`self.num_attn = num_attn`