Skip to content

Commit

Permalink
feat: generalize scripts with hydra configs
Browse files Browse the repository at this point in the history
  • Loading branch information
dgcnz committed Oct 30, 2024
1 parent 303ad17 commit 8cee790
Show file tree
Hide file tree
Showing 11 changed files with 238 additions and 98 deletions.
2 changes: 1 addition & 1 deletion cpp/src/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ void benchmark(std::string model_name, int n_warmup = 5, int n_iter = 5)

auto trt_mod = torch::jit::load(model_name, torch::kCUDA);
trt_mod.eval();
torch::Tensor input_tensor = torch::rand({1, 3, 512, 512}).cuda();
torch::Tensor input_tensor = torch::rand({3, 512, 512}).cuda();

std::cout << "warmup["<< n_warmup << "]" << std::endl;
while (n_warmup--)
Expand Down
2 changes: 1 addition & 1 deletion detrex
161 changes: 143 additions & 18 deletions poetry.lock

Large diffs are not rendered by default.

22 changes: 13 additions & 9 deletions projects/dino_dinov2/modeling/exportable/dino_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ def forward(
if reference_points.shape[-1] == 4:
reference_points_input = (
reference_points[:, :, None]
# DYNAMO REFACTOR
# small refactor to avoid: https://github.com/pytorch/pytorch/issues/129038
# * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
* valid_ratios.repeat(*[1] * (valid_ratios.ndim - 1), 2)[:, None]
Expand Down Expand Up @@ -272,6 +273,7 @@ def __init__(
num_feature_levels=4,
two_stage_num_proposals=900,
learnt_init_query=True,
specialize_with_list: bool = False,
):
super(DINOTransformer, self).__init__()
self.encoder = encoder
Expand All @@ -289,6 +291,7 @@ def __init__(
self.tgt_embed = nn.Embedding(self.two_stage_num_proposals, self.embed_dim)
self.enc_output = nn.Linear(self.embed_dim, self.embed_dim)
self.enc_output_norm = nn.LayerNorm(self.embed_dim)
self.specialize_with_list = specialize_with_list

self.init_weights()

Expand All @@ -301,7 +304,7 @@ def init_weights(self):
m.init_weights()
nn.init.normal_(self.level_embeds)

def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes: List[Tuple[int, int]]):
def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
N, S, C = memory.shape
proposals = []
_cur = 0
Expand Down Expand Up @@ -348,7 +351,7 @@ def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shap
return output_memory, output_proposals

@staticmethod
def get_reference_points(spatial_shapes: List[Tuple[int, int]], valid_ratios: torch.Tensor, device: torch.device):
def get_reference_points(spatial_shapes, valid_ratios: torch.Tensor, device: torch.device):
"""Get the reference points used in decoder.
Args:
Expand Down Expand Up @@ -422,17 +425,18 @@ def forward(
feat_flatten = torch.cat(feat_flatten, 1)
mask_flatten = torch.cat(mask_flatten, 1)
lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
# spatial_shapes = torch.as_tensor(
# spatial_shapes, dtype=torch.long, device=feat_flatten.device
# )
# list refactor
# level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
level_start_index = [0] + list(itertools.accumulate(list(map(math.prod, spatial_shapes))))[:-1]
if not self.specialize_with_list:
spatial_shapes = torch.tensor(
spatial_shapes, dtype=torch.long, device=feat_flatten.device
)
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
else:
level_start_index = [0] + list(itertools.accumulate(list(map(math.prod, spatial_shapes))))[:-1]
valid_ratios = torch.stack(
[self.get_valid_ratio(m) for m in multi_level_masks], 1
)

reference_points = self.get_reference_points( # DONE
reference_points = self.get_reference_points(
spatial_shapes, valid_ratios, device=feat.device
)

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ viztracer = "^0.17.0"
pandas = "^2.2.3"
jupyter-book = "^1.0.3"
livereload = "^2.7.0"
pyppeteer = "^2.0.0"

[build-system]
requires = ["poetry-core"]
Expand Down
112 changes: 47 additions & 65 deletions scripts/benchmark_gpu.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,71 @@
import torch
import time
from typing import Optional
from functools import partial
import contextlib
from src.utils import (
load_input_fixed,
plot_predictions,
TracingAdapter,
)
from src.utils import load_model as _load_model
from src.utils import load_model
from statistics import stdev, mean
import torch_tensorrt
import logging
import argparse
from pathlib import Path
import detrex
import hydra
from omegaconf import DictConfig, OmegaConf
import importlib

detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False

logging.basicConfig(level=logging.INFO)

def setup_parser():
DEFAULT_IMG = Path("artifacts/idea_raw.jpg")
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=Path, required=True)
parser.add_argument("--image", type=Path, default=DEFAULT_IMG)
parser.add_argument("--n_warmup", type=int, default=10)
parser.add_argument("--n_iter", type=int, default=10)
parser.add_argument("--output", type=Path, default=None)
parser.add_argument(
"--amp_dtype", type=str, default=None, choices=["fp16", "bf16", None]
)
return parser

@hydra.main(
version_base=None, config_path="config/benchmark_gpu", config_name="default"
)
def main(cfg: DictConfig):
OUTPUT_DIR = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
print(OmegaConf.to_yaml(cfg))

n_iter = cfg.n_iter # default 100
n_warmup = cfg.n_warmup # default 10
amp_dtype = cfg.amp_dtype # default None
compile_run_path = Path(cfg.compile_run_path)
compile_run_cfg = OmegaConf.load(compile_run_path / ".hydra" / "config.yaml")
print(OmegaConf.to_yaml(compile_run_cfg))

# Setting variables
for var, val in compile_run_cfg.env.items():
logging.info(f"Setting {var} to {val}")
module_name, attr_name = var.rsplit(".", 1)
module = importlib.import_module(module_name)
setattr(module, attr_name, val)

height, width = compile_run_cfg.image.height, compile_run_cfg.image.width

base_model = load_model(
config_file=compile_run_cfg.model.config,
ckpt_path=compile_run_cfg.model.ckpt_path,
opts=compile_run_cfg.model.opts,
)

logging.basicConfig(level=logging.INFO)
_, inputs = load_input_fixed(height=height, width=width, device="cuda")
model = TracingAdapter(
base_model, inputs=inputs, allow_non_tensor=False, specialize_non_tensor=True
)

inputs = model.flattened_inputs
print(inputs[0].shape)

def load_model(model_path: Path):
if model_path.suffix == ".ts":
*_, height, width = model_path.stem.split("_")
if cfg.load_ts:
del base_model, model
model_path = compile_run_path / "model.ts"
model = torch.jit.load(model_path)
elif model_path.suffix == ".ep":
*_, height, width = model_path.stem.split("_")
model = torch.export.load(model_path).module()
elif model_path.suffix == ".pth":
height, width = 512, 512
model = _load_model().cuda()
model = TracingAdapter(model, *load_input_fixed(height=height, width=width))
else:
raise ValueError(f"Unsupported model format: {model_path.suffix}")

return model, int(height), int(width)


def benchmark(
model_path: Path,
image_path: Path,
n_warmup: int,
n_iter: int,
output_path: Optional[Path],
amp_dtype: Optional[str] = None,
):
# track cuda memory history

torch.cuda.memory._record_memory_history()
model, height, width = load_model(model_path)

model.eval()
model.cuda()
logging.info("Loaded model")
img, example_kwargs = load_input_fixed(str(image_path), height, width)
input = (example_kwargs["images"].cuda(),)

ctx = contextlib.nullcontext
if amp_dtype is not None:
Expand All @@ -81,15 +78,15 @@ def benchmark(
with torch.no_grad(), ctx():
logging.info("warmup")
for _ in range(n_warmup):
_ = model(*input)
_ = model(*inputs)

torch.cuda.reset_peak_memory_stats()
logging.info("measuring time")
times = []
for _ in range(n_iter):
torch.cuda.synchronize()
start_time = time.time()
_ = model(*input)
_ = model(*inputs)
torch.cuda.synchronize()
end_time = time.time()
inference_time = end_time - start_time
Expand All @@ -101,24 +98,9 @@ def benchmark(

# get max memory usage
max_memory = torch.cuda.memory.max_memory_allocated()
torch.cuda.memory._dump_snapshot(f"artifacts/{model_path.stem}_mem.pickle")
torch.cuda.memory._dump_snapshot(OUTPUT_DIR / "mem.pickle")
logging.info(f"Max memory usage: {max_memory / 1e6:.4f} MB")

if output_path is not None:
outputs = model(*input)
outputs = unflatten_repr(outputs)
plot_predictions(outputs, img, output_file=output_path)


def main():
parser = setup_parser()
args = parser.parse_args()
logging.info("Loading model")
model_path = args.model
benchmark(
model_path, args.image, args.n_warmup, args.n_iter, args.output, args.amp_dtype
)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions scripts/config/benchmark_gpu/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Defaults for scripts/benchmark_gpu.py (Hydra config).

# Number of timed forward passes.
n_iter: 100
# Number of untimed warmup forward passes run before measuring.
n_warmup: 10
# Mixed-precision dtype; null keeps the no-op context manager.
# NOTE(review): accepted non-null values are not visible here -- presumably
# "fp16"/"bf16"; confirm against the script.
amp_dtype: null
# Directory of a previous Hydra run; the script reads
# `<compile_run_path>/.hydra/config.yaml` from it (and `model.ts` when
# load_ts is true).
# Mandatory: `???` makes OmegaConf raise MissingMandatoryValue when the value
# is accessed without an override, instead of failing later in Path(None).
compile_run_path: ???
# If true, benchmark the TorchScript module at `<compile_run_path>/model.ts`
# instead of the model built in Python.
load_ts: true
7 changes: 7 additions & 0 deletions scripts/config/export_tensorrt/dinov2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,17 @@ amp_dtype: "fp32"
trt:
enabled_precisions:
- "fp32"
- "fp16"
- "bf16"
model:
config: "projects/dino_dinov2/configs/models/dino_dinov2.py"
ckpt_path: "artifacts/model_final.pth"
opts:
- "model.backbone.net.img_size=[512, 512]"
- "model.backbone.net.dynamic_img_size=False"
- "model.backbone.net.dynamic_img_pad=False"
- "model.transformer.specialize_with_list=True"

env:
"torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT": 2000
"detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA": False
7 changes: 7 additions & 0 deletions scripts/config/export_tensorrt/vit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,11 @@ trt:
enable_experimental_decompositions: True
min_block_size: 1
use_fast_partitioner: True # doesn't make any difference in supported nodes
torch_executed_ops:
- "torch.ops.aten.sym_size.int"
amp_dtype: "fp32"
env:
"torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT": 2000
"detectron2.modeling.proposal_generator.proposal_utils.SKIP_NMS": True
"detectron2.modeling.roi_heads.fast_rcnn.SKIP_NMS": True
"detectron2.modeling.roi_heads.fast_rcnn.SKIP_FILTER_CONFIDENCE": True
15 changes: 12 additions & 3 deletions scripts/export_tensorrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import torch_tensorrt
from omegaconf import DictConfig, OmegaConf

import importlib
import detrex
from src.utils import TracingAdapter, load_input_fixed, load_model, plot_predictions

logging.basicConfig(level=logging.INFO)
torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT = 2000
detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False
# torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT = 2000
# detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False


def to_dtype(precision: str):
Expand Down Expand Up @@ -86,11 +87,18 @@ def compile(
return trt_gm


@hydra.main(version_base=None, config_path="config/export_tensorrt", config_name="vit")
@hydra.main(version_base=None, config_path="config/export_tensorrt", config_name="dinov2")
def main(cfg: DictConfig):
OUTPUT_DIR = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
print(OmegaConf.to_yaml(cfg))

# Setting variables
for var, val in cfg.env.items():
logging.info(f"Setting {var} to {val}")
module_name, attr_name = var.rsplit(".", 1)
module = importlib.import_module(module_name)
setattr(module, attr_name, val)

# check that amp_dtype is in enabled_precisions
if cfg.amp_dtype not in cfg.trt.enabled_precisions:
raise ValueError(
Expand All @@ -116,6 +124,7 @@ def main(cfg: DictConfig):
)
inputs = model.flattened_inputs
model.eval().cuda()
# This forward call is important: it ensures the model works before compilation.
model(*inputs)
try:
trt_gm = compile(model, inputs, amp_dtype=cfg.amp_dtype, trt_cfg=cfg.trt)
Expand Down
2 changes: 1 addition & 1 deletion src/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def load_input_fixed(
with torch.no_grad():
if input_format == "RGB":
img = img[:, :, ::-1]
img = torch.as_tensor(img.astype("float32").transpose(2, 0, 1))
img = torch.as_tensor(img.astype("float32").transpose(2, 0, 1)).contiguous()
return original_img, (
[
{
Expand Down

0 comments on commit 8cee790

Please sign in to comment.