feat: generalize scripts with hydra configs

dgcnz · Oct 30, 2024 · 8cee790 · 8cee790
1 parent 303ad17
commit 8cee790
Show file tree

Hide file tree

Showing 11 changed files with 238 additions and 98 deletions.
diff --git a/cpp/src/benchmark.cpp b/cpp/src/benchmark.cpp
@@ -23,7 +23,7 @@ void benchmark(std::string model_name, int n_warmup = 5, int n_iter = 5)
 
     auto trt_mod = torch::jit::load(model_name, torch::kCUDA);
     trt_mod.eval();
-    torch::Tensor input_tensor = torch::rand({1, 3, 512, 512}).cuda();
+    torch::Tensor input_tensor = torch::rand({3, 512, 512}).cuda();
 
     std::cout << "warmup["<< n_warmup << "]" <<  std::endl;
     while (n_warmup--)

diff --git a/detrex b/detrex
diff --git a/poetry.lock b/poetry.lock
diff --git a/projects/dino_dinov2/modeling/exportable/dino_transformer.py b/projects/dino_dinov2/modeling/exportable/dino_transformer.py
@@ -200,6 +200,7 @@ def forward(
             if reference_points.shape[-1] == 4:
                 reference_points_input = (
                     reference_points[:, :, None]
+                    # DYNAMO REFACTOR
                     # small refactor to avoid: https://github.com/pytorch/pytorch/issues/129038
                     # * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
                     * valid_ratios.repeat(*[1] * (valid_ratios.ndim - 1), 2)[:, None]
@@ -272,6 +273,7 @@ def __init__(
         num_feature_levels=4,
         two_stage_num_proposals=900,
         learnt_init_query=True,
+        specialize_with_list: bool = False,
     ):
         super(DINOTransformer, self).__init__()
         self.encoder = encoder
@@ -289,6 +291,7 @@ def __init__(
             self.tgt_embed = nn.Embedding(self.two_stage_num_proposals, self.embed_dim)
         self.enc_output = nn.Linear(self.embed_dim, self.embed_dim)
         self.enc_output_norm = nn.LayerNorm(self.embed_dim)
+        self.specialize_with_list = specialize_with_list
 
         self.init_weights()
 
@@ -301,7 +304,7 @@ def init_weights(self):
                 m.init_weights()
         nn.init.normal_(self.level_embeds)
 
-    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes: List[Tuple[int, int]]):
+    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
         N, S, C = memory.shape
         proposals = []
         _cur = 0
@@ -348,7 +351,7 @@ def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shap
         return output_memory, output_proposals
 
     @staticmethod
-    def get_reference_points(spatial_shapes: List[Tuple[int, int]], valid_ratios: torch.Tensor, device: torch.device):
+    def get_reference_points(spatial_shapes, valid_ratios: torch.Tensor, device: torch.device):
         """Get the reference points used in decoder.
 
         Args:
@@ -422,17 +425,18 @@ def forward(
         feat_flatten = torch.cat(feat_flatten, 1)
         mask_flatten = torch.cat(mask_flatten, 1)
         lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
-        # spatial_shapes = torch.as_tensor(
-        #     spatial_shapes, dtype=torch.long, device=feat_flatten.device
-        # )
-        # list refactor
-        # level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
-        level_start_index = [0] + list(itertools.accumulate(list(map(math.prod, spatial_shapes))))[:-1]
+        if not self.specialize_with_list:
+            spatial_shapes = torch.tensor(
+                spatial_shapes, dtype=torch.long, device=feat_flatten.device
+            )
+            level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        else:
+            level_start_index = [0] + list(itertools.accumulate(list(map(math.prod, spatial_shapes))))[:-1]
         valid_ratios = torch.stack(
             [self.get_valid_ratio(m) for m in multi_level_masks], 1
         )
 
-        reference_points = self.get_reference_points( # DONE
+        reference_points = self.get_reference_points(
             spatial_shapes, valid_ratios, device=feat.device
         )
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,7 @@ viztracer = "^0.17.0"
 pandas = "^2.2.3"
 jupyter-book = "^1.0.3"
 livereload = "^2.7.0"
+pyppeteer = "^2.0.0"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/scripts/benchmark_gpu.py b/scripts/benchmark_gpu.py
@@ -1,74 +1,71 @@
 import torch
 import time
-from typing import Optional
 from functools import partial
 import contextlib
 from src.utils import (
     load_input_fixed,
-    plot_predictions,
     TracingAdapter,
 )
-from src.utils import load_model as _load_model
+from src.utils import load_model
 from statistics import stdev, mean
 import torch_tensorrt
 import logging
-import argparse
 from pathlib import Path
 import detrex
+import hydra
+from omegaconf import DictConfig, OmegaConf
+import importlib
 
-detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False
 
+logging.basicConfig(level=logging.INFO)
 
-def setup_parser():
-    DEFAULT_IMG = Path("artifacts/idea_raw.jpg")
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=Path, required=True)
-    parser.add_argument("--image", type=Path, default=DEFAULT_IMG)
-    parser.add_argument("--n_warmup", type=int, default=10)
-    parser.add_argument("--n_iter", type=int, default=10)
-    parser.add_argument("--output", type=Path, default=None)
-    parser.add_argument(
-        "--amp_dtype", type=str, default=None, choices=["fp16", "bf16", None]
-    )
-    return parser
 
+@hydra.main(
+    version_base=None, config_path="config/benchmark_gpu", config_name="default"
+)
+def main(cfg: DictConfig):
+    OUTPUT_DIR = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
+    print(OmegaConf.to_yaml(cfg))
+
+    n_iter = cfg.n_iter  # default 100
+    n_warmup = cfg.n_warmup  # default 10
+    amp_dtype = cfg.amp_dtype  # default None
+    compile_run_path = Path(cfg.compile_run_path)
+    compile_run_cfg = OmegaConf.load(compile_run_path / ".hydra" / "config.yaml")
+    print(OmegaConf.to_yaml(compile_run_cfg))
+
+    # Setting variables
+    for var, val in compile_run_cfg.env.items():
+        logging.info(f"Setting {var} to {val}")
+        module_name, attr_name = var.rsplit(".", 1)
+        module = importlib.import_module(module_name)
+        setattr(module, attr_name, val)
+
+    height, width = compile_run_cfg.image.height, compile_run_cfg.image.width
+
+    base_model = load_model(
+        config_file=compile_run_cfg.model.config,
+        ckpt_path=compile_run_cfg.model.ckpt_path,
+        opts=compile_run_cfg.model.opts,
+    )
 
-logging.basicConfig(level=logging.INFO)
+    _, inputs = load_input_fixed(height=height, width=width, device="cuda")
+    model = TracingAdapter(
+        base_model, inputs=inputs, allow_non_tensor=False, specialize_non_tensor=True
+    )
 
+    inputs = model.flattened_inputs
+    print(inputs[0].shape)
 
-def load_model(model_path: Path):
-    if model_path.suffix == ".ts":
-        *_, height, width = model_path.stem.split("_")
+    if cfg.load_ts:
+        del base_model, model
+        model_path = compile_run_path / "model.ts"
         model = torch.jit.load(model_path)
-    elif model_path.suffix == ".ep":
-        *_, height, width = model_path.stem.split("_")
-        model = torch.export.load(model_path).module()
-    elif model_path.suffix == ".pth":
-        height, width = 512, 512
-        model = _load_model().cuda()
-        model = TracingAdapter(model, *load_input_fixed(height=height, width=width))
-    else:
-        raise ValueError(f"Unsupported model format: {model_path.suffix}")
-
-    return model, int(height), int(width)
-
-
-def benchmark(
-    model_path: Path,
-    image_path: Path,
-    n_warmup: int,
-    n_iter: int,
-    output_path: Optional[Path],
-    amp_dtype: Optional[str] = None,
-):
-    # track cuda memory history
+
     torch.cuda.memory._record_memory_history()
-    model, height, width = load_model(model_path)
+
     model.eval()
     model.cuda()
-    logging.info("Loaded model")
-    img, example_kwargs = load_input_fixed(str(image_path), height, width)
-    input = (example_kwargs["images"].cuda(),)
 
     ctx = contextlib.nullcontext
     if amp_dtype is not None:
@@ -81,15 +78,15 @@ def benchmark(
     with torch.no_grad(), ctx():
         logging.info("warmup")
         for _ in range(n_warmup):
-            _ = model(*input)
+            _ = model(*inputs)
 
         torch.cuda.reset_peak_memory_stats()
         logging.info("measuring time")
         times = []
         for _ in range(n_iter):
             torch.cuda.synchronize()
             start_time = time.time()
-            _ = model(*input)
+            _ = model(*inputs)
             torch.cuda.synchronize()
             end_time = time.time()
             inference_time = end_time - start_time
@@ -101,24 +98,9 @@ def benchmark(
 
         # get max memory usage
         max_memory = torch.cuda.memory.max_memory_allocated()
-        torch.cuda.memory._dump_snapshot(f"artifacts/{model_path.stem}_mem.pickle")
+        torch.cuda.memory._dump_snapshot(OUTPUT_DIR / "mem.pickle")
         logging.info(f"Max memory usage: {max_memory / 1e6:.4f} MB")
 
-    if output_path is not None:
-        outputs = model(*input)
-        outputs = unflatten_repr(outputs)
-        plot_predictions(outputs, img, output_file=output_path)
-
-
-def main():
-    parser = setup_parser()
-    args = parser.parse_args()
-    logging.info("Loading model")
-    model_path = args.model
-    benchmark(
-        model_path, args.image, args.n_warmup, args.n_iter, args.output, args.amp_dtype
-    )
-
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/config/benchmark_gpu/default.yaml b/scripts/config/benchmark_gpu/default.yaml
@@ -0,0 +1,5 @@
+n_iter: 100
+n_warmup: 10
+amp_dtype: null
+compile_run_path: null
+load_ts: true
diff --git a/scripts/config/export_tensorrt/dinov2.yaml b/scripts/config/export_tensorrt/dinov2.yaml
@@ -6,10 +6,17 @@ amp_dtype: "fp32"
 trt:
   enabled_precisions:
     - "fp32"
+    - "fp16"
+    - "bf16"
 model:
   config: "projects/dino_dinov2/configs/models/dino_dinov2.py"
   ckpt_path: "artifacts/model_final.pth"
   opts:
     - "model.backbone.net.img_size=[512, 512]"
     - "model.backbone.net.dynamic_img_size=False"
     - "model.backbone.net.dynamic_img_pad=False"
+    - "model.transformer.specialize_with_list=True"
+
+env:
+  "torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT":  2000
+  "detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA": False
diff --git a/scripts/config/export_tensorrt/vit.yaml b/scripts/config/export_tensorrt/vit.yaml
@@ -14,4 +14,11 @@ trt:
   enable_experimental_decompositions: True
   min_block_size: 1
   use_fast_partitioner: True # doesn't make any difference in supported nodes
+  torch_executed_ops:
+    - "torch.ops.aten.sym_size.int"
 amp_dtype: "fp32"
+env:
+  "torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT":  2000
+  "detectron2.modeling.proposal_generator.proposal_utils.SKIP_NMS": True
+  "detectron2.modeling.roi_heads.fast_rcnn.SKIP_NMS": True
+  "detectron2.modeling.roi_heads.fast_rcnn.SKIP_FILTER_CONFIDENCE": True
diff --git a/scripts/export_tensorrt.py b/scripts/export_tensorrt.py
@@ -8,12 +8,13 @@
 import torch_tensorrt
 from omegaconf import DictConfig, OmegaConf
 
+import importlib
 import detrex
 from src.utils import TracingAdapter, load_input_fixed, load_model, plot_predictions
 
 logging.basicConfig(level=logging.INFO)
-torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT = 2000
-detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False
+# torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT = 2000
+# detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False
 
 
 def to_dtype(precision: str):
@@ -86,11 +87,18 @@ def compile(
         return trt_gm
 
 
-@hydra.main(version_base=None, config_path="config/export_tensorrt", config_name="vit")
+@hydra.main(version_base=None, config_path="config/export_tensorrt", config_name="dinov2")
 def main(cfg: DictConfig):
     OUTPUT_DIR = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
     print(OmegaConf.to_yaml(cfg))
 
+    # Setting variables
+    for var, val in cfg.env.items():
+        logging.info(f"Setting {var} to {val}")
+        module_name, attr_name = var.rsplit(".", 1)
+        module = importlib.import_module(module_name)
+        setattr(module, attr_name, val)
+
     # check that amp_dtype is in enabled_precisions
     if cfg.amp_dtype not in cfg.trt.enabled_precisions:
         raise ValueError(
@@ -116,6 +124,7 @@ def main(cfg: DictConfig):
     )
     inputs = model.flattened_inputs
     model.eval().cuda()
+    # This forward call is important: it ensures the model works before compilation.
     model(*inputs)
     try:
         trt_gm = compile(model, inputs, amp_dtype=cfg.amp_dtype, trt_cfg=cfg.trt)

diff --git a/src/utils/io.py b/src/utils/io.py
@@ -34,7 +34,7 @@ def load_input_fixed(
     with torch.no_grad():
         if input_format == "RGB":
             img = img[:, :, ::-1]
-        img = torch.as_tensor(img.astype("float32").transpose(2, 0, 1))
+        img = torch.as_tensor(img.astype("float32").transpose(2, 0, 1)).contiguous()
         return original_img, (
             [
                 {