From 8cee79092845ea2f357402ebe855fd061bdd2aeb Mon Sep 17 00:00:00 2001 From: Diego Canez Date: Wed, 30 Oct 2024 22:54:56 +0100 Subject: [PATCH] feat: generalize scripts with hydra configs --- cpp/src/benchmark.cpp | 2 +- detrex | 2 +- poetry.lock | 161 ++++++++++++++++-- .../modeling/exportable/dino_transformer.py | 22 ++- pyproject.toml | 1 + scripts/benchmark_gpu.py | 112 +++++------- scripts/config/benchmark_gpu/default.yaml | 5 + scripts/config/export_tensorrt/dinov2.yaml | 7 + scripts/config/export_tensorrt/vit.yaml | 7 + scripts/export_tensorrt.py | 15 +- src/utils/io.py | 2 +- 11 files changed, 238 insertions(+), 98 deletions(-) create mode 100644 scripts/config/benchmark_gpu/default.yaml diff --git a/cpp/src/benchmark.cpp b/cpp/src/benchmark.cpp index 7f72ecc..991eae8 100644 --- a/cpp/src/benchmark.cpp +++ b/cpp/src/benchmark.cpp @@ -23,7 +23,7 @@ void benchmark(std::string model_name, int n_warmup = 5, int n_iter = 5) auto trt_mod = torch::jit::load(model_name, torch::kCUDA); trt_mod.eval(); - torch::Tensor input_tensor = torch::rand({1, 3, 512, 512}).cuda(); + torch::Tensor input_tensor = torch::rand({3, 512, 512}).cuda(); std::cout << "warmup["<< n_warmup << "]" << std::endl; while (n_warmup--) diff --git a/detrex b/detrex index b392ec4..df9f69f 160000 --- a/detrex +++ b/detrex @@ -1 +1 @@ -Subproject commit b392ec497142bf3d93a040a8b6c40668e9339caf +Subproject commit df9f69ffa5c7902640081a16f2e5af2f24144ee1 diff --git a/poetry.lock b/poetry.lock index fc0353a..1b792c5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -93,6 +93,17 @@ doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21.0b1)"] trio = ["trio (>=0.26.1)"] +[[package]] +name = "appdirs" +version = "1.4.4" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+optional = false +python-versions = "*" +files = [ + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, +] + [[package]] name = "appnope" version = "0.1.4" @@ -3264,6 +3275,23 @@ doc = ["ablog (>=0.11.8)", "colorama", "graphviz", "ipykernel", "ipyleaflet", "i i18n = ["Babel", "jinja2"] test = ["pytest", "pytest-cov", "pytest-regressions", "sphinx[test]"] +[[package]] +name = "pyee" +version = "11.1.1" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyee-11.1.1-py3-none-any.whl", hash = "sha256:9e4cdd7c2f9fcf247db94bad39a260aceffefdbe52286ce71be01959de34a5c2"}, + {file = "pyee-11.1.1.tar.gz", hash = "sha256:82e1eb1853f8497c4ff1a0c7fa26b9cd2f1253e2b6ffb93b4700fda907017302"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "pyflakes" version = "3.2.0" @@ -3289,6 +3317,26 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pyppeteer" +version = "2.0.0" +description = "Headless chrome/chromium automation library (unofficial port of puppeteer)" +optional = false +python-versions = ">=3.8,<4.0" +files = [ + {file = "pyppeteer-2.0.0-py3-none-any.whl", hash = "sha256:96f4c574fb36f1d15e02746303ab742b98941f0da58337187e7c1d2ef982adea"}, + {file = "pyppeteer-2.0.0.tar.gz", hash = "sha256:4af63473ff36a746a53347b2336a49efda669bcd781e400bc1799b81838358d9"}, +] + +[package.dependencies] +appdirs = ">=1.4.3,<2.0.0" +certifi = ">=2023" +importlib-metadata = ">=1.4" +pyee = ">=11.0.0,<12.0.0" +tqdm = ">=4.42.1,<5.0.0" +urllib3 = ">=1.25.8,<2.0.0" +websockets = ">=10.0,<11.0" + [[package]] name = "pyreadline3" version = "3.5.4" @@ -4888,19 +4936,19 @@ reference = "torch-cu124" [[package]] name = "torch-tensorrt" -version = "2.6.0.dev20241028+cu124" +version = "2.6.0.dev20241030+cu124" description = "Torch-TensorRT is a package which allows users to automatically compile PyTorch and TorchScript modules to TensorRT while remaining in PyTorch" optional = false python-versions = ">=3.9" files = [ - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp310-cp310-linux_x86_64.whl", hash = "sha256:417367ff79c45207dafd4e00fac18f228f4eccee3b794df7d1716ace2e486ea1"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp310-cp310-win_amd64.whl", hash = "sha256:bad699a998ab36c62633508a4217757128c4d0a9d436b47c743b318f7dcbf7bb"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp311-cp311-linux_x86_64.whl", hash = "sha256:d021b8fb322a1a18e53f3bb0f8b30bf4f0b3ba1fa6013063087af8b391cf77c8"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp311-cp311-win_amd64.whl", hash = "sha256:fd82aafc9337a73d06777839245fb60d304ec0b5af246a43590b3f9e02e51369"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp312-cp312-linux_x86_64.whl", hash = "sha256:dc2e319cc362830abe86b5aaa85d98256754c1627c84c28060ffd2715270e1a1"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp312-cp312-win_amd64.whl", hash = 
"sha256:522cf309f322f21ced0e99383be1cc9e0f08e1229ba33e577f744fe5fb272d2e"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp39-cp39-linux_x86_64.whl", hash = "sha256:9322e21bc03fc3e889ab35fdf83b652736d7662098ba6ff610c008248508b5cf"}, - {file = "torch_tensorrt-2.6.0.dev20241028+cu124-cp39-cp39-win_amd64.whl", hash = "sha256:2391ff89e3011e032a5d984c6fc2cbb90bd462cffa63b716552f817fb4d68a14"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp310-cp310-linux_x86_64.whl", hash = "sha256:2b2afc87a6358d2f61ed5f722cfb674b1fb9ce766efc6e87e3df159340af8016"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp310-cp310-win_amd64.whl", hash = "sha256:7995d57316cef1ccf6c0db02fae581f4f102ee9d925b8c993d1b86d301a524e3"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp311-cp311-linux_x86_64.whl", hash = "sha256:75ce10f91c6eed7786bf050f27f168157b0f20a358040e5f380d908da6bb99f7"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp311-cp311-win_amd64.whl", hash = "sha256:79976077898c7a00999b80ab96131df45f8bcedd511f43252b3faa74cd63c9e6"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp312-cp312-linux_x86_64.whl", hash = "sha256:bf15559c2a61901e6711528ade8a3493cfd2aff839d818a2bcecda7b9c053678"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp312-cp312-win_amd64.whl", hash = "sha256:dc6640802b13685df1594c487cea56667bb88e3739a054348f896594f6d9c6ec"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp39-cp39-linux_x86_64.whl", hash = "sha256:b530b3dfd302b1a97cea30488fc46865c230cdd53365d1291466f4a94e619e45"}, + {file = "torch_tensorrt-2.6.0.dev20241030+cu124-cp39-cp39-win_amd64.whl", hash = "sha256:5fb9fc65083867cf11794c75d1071dd2836edac99aabaa942782032af2a4aa89"}, ] [package.dependencies] @@ -5154,20 +5202,19 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake [[package]] name = "urllib3" -version = "2.2.3" +version = "1.26.20" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=3.8" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, + {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, + {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] +brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "viztracer" @@ -5279,6 +5326,84 @@ docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "10.4" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "websockets-10.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d58804e996d7d2307173d56c297cf7bc132c52df27a3efaac5e8d43e36c21c48"}, + {file = "websockets-10.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc0b82d728fe21a0d03e65f81980abbbcb13b5387f733a1a870672c5be26edab"}, + {file = "websockets-10.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ba089c499e1f4155d2a3c2a05d2878a3428cf321c848f2b5a45ce55f0d7d310c"}, + {file = "websockets-10.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33d69ca7612f0ddff3316b0c7b33ca180d464ecac2d115805c044bf0a3b0d032"}, + {file = "websockets-10.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62e627f6b6d4aed919a2052efc408da7a545c606268d5ab5bfab4432734b82b4"}, + {file = "websockets-10.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ea7b82bfcae927eeffc55d2ffa31665dc7fec7b8dc654506b8e5a518eb4d50"}, + {file = "websockets-10.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e0cb5cc6ece6ffa75baccfd5c02cffe776f3f5c8bf486811f9d3ea3453676ce8"}, + {file = "websockets-10.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae5e95cfb53ab1da62185e23b3130e11d64431179debac6dc3c6acf08760e9b1"}, + {file = "websockets-10.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7c584f366f46ba667cfa66020344886cf47088e79c9b9d39c84ce9ea98aaa331"}, + {file = "websockets-10.4-cp310-cp310-win32.whl", hash = "sha256:b029fb2032ae4724d8ae8d4f6b363f2cc39e4c7b12454df8df7f0f563ed3e61a"}, + {file = "websockets-10.4-cp310-cp310-win_amd64.whl", hash = "sha256:8dc96f64ae43dde92530775e9cb169979f414dcf5cff670455d81a6823b42089"}, + {file = "websockets-10.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:47a2964021f2110116cc1125b3e6d87ab5ad16dea161949e7244ec583b905bb4"}, + {file = "websockets-10.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e789376b52c295c4946403bd0efecf27ab98f05319df4583d3c48e43c7342c2f"}, + {file = "websockets-10.4-cp311-cp311-macosx_11_0_arm64.whl", 
hash = "sha256:7d3f0b61c45c3fa9a349cf484962c559a8a1d80dae6977276df8fd1fa5e3cb8c"}, + {file = "websockets-10.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55b5905705725af31ccef50e55391621532cd64fbf0bc6f4bac935f0fccec46"}, + {file = "websockets-10.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00c870522cdb69cd625b93f002961ffb0c095394f06ba8c48f17eef7c1541f96"}, + {file = "websockets-10.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f38706e0b15d3c20ef6259fd4bc1700cd133b06c3c1bb108ffe3f8947be15fa"}, + {file = "websockets-10.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f2c38d588887a609191d30e902df2a32711f708abfd85d318ca9b367258cfd0c"}, + {file = "websockets-10.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fe10ddc59b304cb19a1bdf5bd0a7719cbbc9fbdd57ac80ed436b709fcf889106"}, + {file = "websockets-10.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:90fcf8929836d4a0e964d799a58823547df5a5e9afa83081761630553be731f9"}, + {file = "websockets-10.4-cp311-cp311-win32.whl", hash = "sha256:b9968694c5f467bf67ef97ae7ad4d56d14be2751000c1207d31bf3bb8860bae8"}, + {file = "websockets-10.4-cp311-cp311-win_amd64.whl", hash = "sha256:a7a240d7a74bf8d5cb3bfe6be7f21697a28ec4b1a437607bae08ac7acf5b4882"}, + {file = "websockets-10.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:74de2b894b47f1d21cbd0b37a5e2b2392ad95d17ae983e64727e18eb281fe7cb"}, + {file = "websockets-10.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3a686ecb4aa0d64ae60c9c9f1a7d5d46cab9bfb5d91a2d303d00e2cd4c4c5cc"}, + {file = "websockets-10.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0d15c968ea7a65211e084f523151dbf8ae44634de03c801b8bd070b74e85033"}, + {file = "websockets-10.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00213676a2e46b6ebf6045bc11d0f529d9120baa6f58d122b4021ad92adabd41"}, + {file = "websockets-10.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e23173580d740bf8822fd0379e4bf30aa1d5a92a4f252d34e893070c081050df"}, + {file = "websockets-10.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:dd500e0a5e11969cdd3320935ca2ff1e936f2358f9c2e61f100a1660933320ea"}, + {file = "websockets-10.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4239b6027e3d66a89446908ff3027d2737afc1a375f8fd3eea630a4842ec9a0c"}, + {file = "websockets-10.4-cp37-cp37m-win32.whl", hash = "sha256:8a5cc00546e0a701da4639aa0bbcb0ae2bb678c87f46da01ac2d789e1f2d2038"}, + {file = "websockets-10.4-cp37-cp37m-win_amd64.whl", hash = "sha256:a9f9a735deaf9a0cadc2d8c50d1a5bcdbae8b6e539c6e08237bc4082d7c13f28"}, + {file = "websockets-10.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c1289596042fad2cdceb05e1ebf7aadf9995c928e0da2b7a4e99494953b1b94"}, + {file = "websockets-10.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0cff816f51fb33c26d6e2b16b5c7d48eaa31dae5488ace6aae468b361f422b63"}, + {file = "websockets-10.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dd9becd5fe29773d140d68d607d66a38f60e31b86df75332703757ee645b6faf"}, + {file = "websockets-10.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45ec8e75b7dbc9539cbfafa570742fe4f676eb8b0d3694b67dabe2f2ceed8aa6"}, + {file = "websockets-10.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:4f72e5cd0f18f262f5da20efa9e241699e0cf3a766317a17392550c9ad7b37d8"}, + {file = "websockets-10.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:185929b4808b36a79c65b7865783b87b6841e852ef5407a2fb0c03381092fa3b"}, + {file = "websockets-10.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7d27a7e34c313b3a7f91adcd05134315002aaf8540d7b4f90336beafaea6217c"}, + {file = "websockets-10.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:884be66c76a444c59f801ac13f40c76f176f1bfa815ef5b8ed44321e74f1600b"}, + {file = "websockets-10.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:931c039af54fc195fe6ad536fde4b0de04da9d5916e78e55405436348cfb0e56"}, + {file = "websockets-10.4-cp38-cp38-win32.whl", hash = "sha256:db3c336f9eda2532ec0fd8ea49fef7a8df8f6c804cdf4f39e5c5c0d4a4ad9a7a"}, + {file = "websockets-10.4-cp38-cp38-win_amd64.whl", hash = "sha256:48c08473563323f9c9debac781ecf66f94ad5a3680a38fe84dee5388cf5acaf6"}, + {file = "websockets-10.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:40e826de3085721dabc7cf9bfd41682dadc02286d8cf149b3ad05bff89311e4f"}, + {file = "websockets-10.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:56029457f219ade1f2fc12a6504ea61e14ee227a815531f9738e41203a429112"}, + {file = "websockets-10.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f5fc088b7a32f244c519a048c170f14cf2251b849ef0e20cbbb0fdf0fdaf556f"}, + {file = "websockets-10.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fc8709c00704194213d45e455adc106ff9e87658297f72d544220e32029cd3d"}, + {file = "websockets-10.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0154f7691e4fe6c2b2bc275b5701e8b158dae92a1ab229e2b940efe11905dff4"}, + {file = "websockets-10.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c6d2264f485f0b53adf22697ac11e261ce84805c232ed5dbe6b1bcb84b00ff0"}, + {file = "websockets-10.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9bc42e8402dc5e9905fb8b9649f57efcb2056693b7e88faa8fb029256ba9c68c"}, + {file = "websockets-10.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:edc344de4dac1d89300a053ac973299e82d3db56330f3494905643bb68801269"}, + {file = "websockets-10.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:84bc2a7d075f32f6ed98652db3a680a17a4edb21ca7f80fe42e38753a58ee02b"}, + {file = "websockets-10.4-cp39-cp39-win32.whl", hash = "sha256:c94ae4faf2d09f7c81847c63843f84fe47bf6253c9d60b20f25edfd30fb12588"}, + {file = "websockets-10.4-cp39-cp39-win_amd64.whl", hash = "sha256:bbccd847aa0c3a69b5f691a84d2341a4f8a629c6922558f2a70611305f902d74"}, + {file = "websockets-10.4-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:82ff5e1cae4e855147fd57a2863376ed7454134c2bf49ec604dfe71e446e2193"}, + {file = "websockets-10.4-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d210abe51b5da0ffdbf7b43eed0cfdff8a55a1ab17abbec4301c9ff077dd0342"}, + {file = "websockets-10.4-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:942de28af58f352a6f588bc72490ae0f4ccd6dfc2bd3de5945b882a078e4e179"}, + {file = "websockets-10.4-pp37-pypy37_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9b27d6c1c6cd53dc93614967e9ce00ae7f864a2d9f99fe5ed86706e1ecbf485"}, + {file = "websockets-10.4-pp37-pypy37_pp73-win_amd64.whl", hash = 
"sha256:3d3cac3e32b2c8414f4f87c1b2ab686fa6284a980ba283617404377cd448f631"}, + {file = "websockets-10.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:da39dd03d130162deb63da51f6e66ed73032ae62e74aaccc4236e30edccddbb0"}, + {file = "websockets-10.4-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:389f8dbb5c489e305fb113ca1b6bdcdaa130923f77485db5b189de343a179393"}, + {file = "websockets-10.4-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09a1814bb15eff7069e51fed0826df0bc0702652b5cb8f87697d469d79c23576"}, + {file = "websockets-10.4-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff64a1d38d156d429404aaa84b27305e957fd10c30e5880d1765c9480bea490f"}, + {file = "websockets-10.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b343f521b047493dc4022dd338fc6db9d9282658862756b4f6fd0e996c1380e1"}, + {file = "websockets-10.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:932af322458da7e4e35df32f050389e13d3d96b09d274b22a7aa1808f292fee4"}, + {file = "websockets-10.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a4162139374a49eb18ef5b2f4da1dd95c994588f5033d64e0bbfda4b6b6fcf"}, + {file = "websockets-10.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c57e4c1349fbe0e446c9fa7b19ed2f8a4417233b6984277cce392819123142d3"}, + {file = "websockets-10.4-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b627c266f295de9dea86bd1112ed3d5fafb69a348af30a2422e16590a8ecba13"}, + {file = "websockets-10.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:05a7233089f8bd355e8cbe127c2e8ca0b4ea55467861906b80d2ebc7db4d6b72"}, + {file = "websockets-10.4.tar.gz", hash = "sha256:eef610b23933c54d5d921c92578ae5f89813438fded840c2e9809d378dc765d3"}, +] + [[package]] name = "wheel" version = "0.44.0" @@ -5326,4 +5451,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "3.10" -content-hash = "e569013ff95d0a9b7d56e385914b1ce0072d7098d4330bdaa71f9c92f7dd98bd" +content-hash = "0cbc14aea958585e09f8b0ff9310b2ee816c2fc6bd8a5bf6885054cdee20fe3d" diff --git a/projects/dino_dinov2/modeling/exportable/dino_transformer.py b/projects/dino_dinov2/modeling/exportable/dino_transformer.py index ab21990..345ef2e 100644 --- a/projects/dino_dinov2/modeling/exportable/dino_transformer.py +++ b/projects/dino_dinov2/modeling/exportable/dino_transformer.py @@ -200,6 +200,7 @@ def forward( if reference_points.shape[-1] == 4: reference_points_input = ( reference_points[:, :, None] + # DYNAMO REFACTOR # small refactor to avoid: https://github.com/pytorch/pytorch/issues/129038 # * torch.cat([valid_ratios, valid_ratios], -1)[:, None] * valid_ratios.repeat(*[1] * (valid_ratios.ndim - 1), 2)[:, None] @@ -272,6 +273,7 @@ def __init__( num_feature_levels=4, two_stage_num_proposals=900, learnt_init_query=True, + specialize_with_list: bool = False, ): super(DINOTransformer, self).__init__() self.encoder = encoder @@ -289,6 +291,7 @@ def __init__( self.tgt_embed = nn.Embedding(self.two_stage_num_proposals, self.embed_dim) self.enc_output = nn.Linear(self.embed_dim, self.embed_dim) self.enc_output_norm = nn.LayerNorm(self.embed_dim) + self.specialize_with_list = specialize_with_list self.init_weights() @@ -301,7 +304,7 @@ def init_weights(self): m.init_weights() nn.init.normal_(self.level_embeds) - 
def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes: List[Tuple[int, int]]): + def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes): N, S, C = memory.shape proposals = [] _cur = 0 @@ -348,7 +351,7 @@ def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shap return output_memory, output_proposals @staticmethod - def get_reference_points(spatial_shapes: List[Tuple[int, int]], valid_ratios: torch.Tensor, device: torch.device): + def get_reference_points(spatial_shapes, valid_ratios: torch.Tensor, device: torch.device): """Get the reference points used in decoder. Args: @@ -422,17 +425,18 @@ def forward( feat_flatten = torch.cat(feat_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - # spatial_shapes = torch.as_tensor( - # spatial_shapes, dtype=torch.long, device=feat_flatten.device - # ) - # list refactor - # level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) - level_start_index = [0] + list(itertools.accumulate(list(map(math.prod, spatial_shapes))))[:-1] + if not self.specialize_with_list: + spatial_shapes = torch.tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device + ) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + else: + level_start_index = [0] + list(itertools.accumulate(list(map(math.prod, spatial_shapes))))[:-1] valid_ratios = torch.stack( [self.get_valid_ratio(m) for m in multi_level_masks], 1 ) - reference_points = self.get_reference_points( # DONE + reference_points = self.get_reference_points( spatial_shapes, valid_ratios, device=feat.device ) diff --git a/pyproject.toml b/pyproject.toml index a754056..576e2f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ viztracer = "^0.17.0" pandas = "^2.2.3" jupyter-book = "^1.0.3" livereload = "^2.7.0" +pyppeteer = "^2.0.0" [build-system] requires = ["poetry-core"] diff --git a/scripts/benchmark_gpu.py b/scripts/benchmark_gpu.py index c68c687..3a65ad0 100644 --- a/scripts/benchmark_gpu.py +++ b/scripts/benchmark_gpu.py @@ -1,74 +1,71 @@ import torch import time -from typing import Optional from functools import partial import contextlib from src.utils import ( load_input_fixed, - plot_predictions, TracingAdapter, ) -from src.utils import load_model as _load_model +from src.utils import load_model from statistics import stdev, mean import torch_tensorrt import logging -import argparse from pathlib import Path import detrex +import hydra +from omegaconf import DictConfig, OmegaConf +import importlib -detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False +logging.basicConfig(level=logging.INFO) -def setup_parser(): - DEFAULT_IMG = Path("artifacts/idea_raw.jpg") - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=Path, required=True) - parser.add_argument("--image", type=Path, default=DEFAULT_IMG) - parser.add_argument("--n_warmup", type=int, default=10) - parser.add_argument("--n_iter", type=int, default=10) - parser.add_argument("--output", type=Path, default=None) - parser.add_argument( - "--amp_dtype", type=str, default=None, choices=["fp16", "bf16", None] - ) - return parser +@hydra.main( + version_base=None, config_path="config/benchmark_gpu", config_name="default" +) +def main(cfg: DictConfig): + OUTPUT_DIR = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir) + print(OmegaConf.to_yaml(cfg)) + + 
n_iter = cfg.n_iter  # default 100
+    n_warmup = cfg.n_warmup  # default 10
+    amp_dtype = cfg.amp_dtype  # default None
+    compile_run_path = Path(cfg.compile_run_path)
+    compile_run_cfg = OmegaConf.load(compile_run_path / ".hydra" / "config.yaml")
+    print(OmegaConf.to_yaml(compile_run_cfg))
+
+    # Apply env overrides recorded by the compile run (dotted module.attr paths)
+    for var, val in compile_run_cfg.env.items():
+        logging.info(f"Setting {var} to {val}")
+        module_name, attr_name = var.rsplit(".", 1)
+        module = importlib.import_module(module_name)
+        setattr(module, attr_name, val)
+
+    height, width = compile_run_cfg.image.height, compile_run_cfg.image.width
+
+    base_model = load_model(
+        config_file=compile_run_cfg.model.config,
+        ckpt_path=compile_run_cfg.model.ckpt_path,
+        opts=compile_run_cfg.model.opts,
+    )
 
-logging.basicConfig(level=logging.INFO)
+    _, inputs = load_input_fixed(height=height, width=width, device="cuda")
+    model = TracingAdapter(
+        base_model, inputs=inputs, allow_non_tensor=False, specialize_non_tensor=True
+    )
+    inputs = model.flattened_inputs
+    print(inputs[0].shape)
 
-def load_model(model_path: Path):
-    if model_path.suffix == ".ts":
-        *_, height, width = model_path.stem.split("_")
+    if cfg.load_ts:
+        del base_model, model
+        model_path = compile_run_path / "model.ts"
         model = torch.jit.load(model_path)
-    elif model_path.suffix == ".ep":
-        *_, height, width = model_path.stem.split("_")
-        model = torch.export.load(model_path).module()
-    elif model_path.suffix == ".pth":
-        height, width = 512, 512
-        model = _load_model().cuda()
-        model = TracingAdapter(model, *load_input_fixed(height=height, width=width))
-    else:
-        raise ValueError(f"Unsupported model format: {model_path.suffix}")
-
-    return model, int(height), int(width)
-
-
-def benchmark(
-    model_path: Path,
-    image_path: Path,
-    n_warmup: int,
-    n_iter: int,
-    output_path: Optional[Path],
-    amp_dtype: Optional[str] = None,
-):
-    # track cuda memory history
+
     torch.cuda.memory._record_memory_history()
-    model, height, width = load_model(model_path)
+
     model.eval()
     model.cuda()
-    logging.info("Loaded model")
 
-    img, example_kwargs = load_input_fixed(str(image_path), height, width)
-    input = (example_kwargs["images"].cuda(),)
 
     ctx = contextlib.nullcontext
     if amp_dtype is not None:
@@ -81,7 +78,7 @@
     with torch.no_grad(), ctx():
         logging.info("warmup")
         for _ in range(n_warmup):
-            _ = model(*input)
+            _ = model(*inputs)
 
         torch.cuda.reset_peak_memory_stats()
         logging.info("measuring time")
@@ -89,7 +86,7 @@
         for _ in range(n_iter):
            torch.cuda.synchronize()
            start_time = time.time()
-            _ = model(*input)
+            _ = model(*inputs)
            torch.cuda.synchronize()
            end_time = time.time()
            inference_time = end_time - start_time
@@ -101,24 +98,9 @@
     # get max memory usage
     max_memory = torch.cuda.memory.max_memory_allocated()
-    torch.cuda.memory._dump_snapshot(f"artifacts/{model_path.stem}_mem.pickle")
+    torch.cuda.memory._dump_snapshot(OUTPUT_DIR / "mem.pickle")
     logging.info(f"Max memory usage: {max_memory / 1e6:.4f} MB")
 
-    if output_path is not None:
-        outputs = model(*input)
-        outputs = unflatten_repr(outputs)
-        plot_predictions(outputs, img, output_file=output_path)
-
-
-def main():
-    parser = setup_parser()
-    args = parser.parse_args()
-    logging.info("Loading model")
-    model_path = args.model
-    benchmark(
-        model_path, args.image, args.n_warmup, args.n_iter, args.output, args.amp_dtype
-    )
-
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/config/benchmark_gpu/default.yaml b/scripts/config/benchmark_gpu/default.yaml
new file mode 100644
index 0000000..94780d1
--- /dev/null
+++ b/scripts/config/benchmark_gpu/default.yaml
@@ -0,0 +1,5 @@
+n_iter: 100
+n_warmup: 10
+amp_dtype: null
+compile_run_path: null
+load_ts: true
\ No newline at end of file
diff --git a/scripts/config/export_tensorrt/dinov2.yaml b/scripts/config/export_tensorrt/dinov2.yaml
index 426ba33..dec8fa9 100644
--- a/scripts/config/export_tensorrt/dinov2.yaml
+++ b/scripts/config/export_tensorrt/dinov2.yaml
@@ -6,6 +6,8 @@ amp_dtype: "fp32"
 trt:
   enabled_precisions:
     - "fp32"
+    - "fp16"
+    - "bf16"
 model:
   config: "projects/dino_dinov2/configs/models/dino_dinov2.py"
   ckpt_path: "artifacts/model_final.pth"
@@ -13,3 +15,8 @@ model:
     - "model.backbone.net.img_size=[512, 512]"
     - "model.backbone.net.dynamic_img_size=False"
     - "model.backbone.net.dynamic_img_pad=False"
+    - "model.transformer.specialize_with_list=True"
+
+env:
+  "torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT": 2000
+  "detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA": False
\ No newline at end of file
diff --git a/scripts/config/export_tensorrt/vit.yaml b/scripts/config/export_tensorrt/vit.yaml
index c0b6b05..4147d18 100644
--- a/scripts/config/export_tensorrt/vit.yaml
+++ b/scripts/config/export_tensorrt/vit.yaml
@@ -14,4 +14,11 @@ trt:
   enable_experimental_decompositions: True
   min_block_size: 1
   use_fast_partitioner: True # doesn't make any difference in supported nodes
+  torch_executed_ops:
+    - "torch.ops.aten.sym_size.int"
 amp_dtype: "fp32"
+env:
+  "torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT": 2000
+  "detectron2.modeling.proposal_generator.proposal_utils.SKIP_NMS": True
+  "detectron2.modeling.roi_heads.fast_rcnn.SKIP_NMS": True
+  "detectron2.modeling.roi_heads.fast_rcnn.SKIP_FILTER_CONFIDENCE": True
\ No newline at end of file
diff --git a/scripts/export_tensorrt.py b/scripts/export_tensorrt.py
index 983a8ff..ffc1a02 100644
--- a/scripts/export_tensorrt.py
+++ b/scripts/export_tensorrt.py
@@ -8,12 +8,13 @@
 import torch_tensorrt
 from omegaconf import DictConfig, OmegaConf
 
+import importlib
 import detrex
 from src.utils import TracingAdapter, load_input_fixed, load_model, plot_predictions
 
 logging.basicConfig(level=logging.INFO)
-torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT = 2000
-detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False
+# torch._subclasses.fake_tensor.CONSTANT_NUMEL_LIMIT = 2000
+# detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA = False
 
 
 def to_dtype(precision: str):
@@ -86,11 +87,18 @@ def compile(
     return trt_gm
 
 
-@hydra.main(version_base=None, config_path="config/export_tensorrt", config_name="vit")
+@hydra.main(version_base=None, config_path="config/export_tensorrt", config_name="dinov2")
 def main(cfg: DictConfig):
     OUTPUT_DIR = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
     print(OmegaConf.to_yaml(cfg))
 
+    # Apply env overrides from the config (dotted module.attr paths)
+    for var, val in cfg.env.items():
+        logging.info(f"Setting {var} to {val}")
+        module_name, attr_name = var.rsplit(".", 1)
+        module = importlib.import_module(module_name)
+        setattr(module, attr_name, val)
+
     # check that amp_dtype is in enabled_precisions
     if cfg.amp_dtype not in cfg.trt.enabled_precisions:
         raise ValueError(
@@ -116,6 +124,7 @@
     )
     inputs = model.flattened_inputs
     model.eval().cuda()
+    # This forward call is important: it verifies the model runs before compilation
     model(*inputs)
     try:
         trt_gm = compile(model, inputs, amp_dtype=cfg.amp_dtype, trt_cfg=cfg.trt)
diff --git a/src/utils/io.py b/src/utils/io.py
index d5dbca8..f0fbaf6 100644
--- a/src/utils/io.py
+++ b/src/utils/io.py
@@ -34,7 +34,7 @@ 
def load_input_fixed( with torch.no_grad(): if input_format == "RGB": img = img[:, :, ::-1] - img = torch.as_tensor(img.astype("float32").transpose(2, 0, 1)) + img = torch.as_tensor(img.astype("float32").transpose(2, 0, 1)).contiguous() return original_img, ( [ {
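
Usage note: with this change the two scripts communicate through Hydra run directories. Hydra writes each run's resolved config to <run_dir>/.hydra/config.yaml, and benchmark_gpu.py replays an export run by pointing compile_run_path at that directory: it reloads the recorded config, re-applies its env overrides, and, with load_ts: true, loads <run_dir>/model.ts. A minimal consumer-side sketch follows; the run directory is hypothetical and apply_env_overrides is an illustrative name for the inline loop both scripts share.

# Sketch only: how benchmark_gpu.py consumes a previous export run.
import importlib
import logging
from pathlib import Path

from omegaconf import OmegaConf


def apply_env_overrides(env) -> None:
    # Each key is a dotted module path plus an attribute name; the value is
    # assigned onto the imported module, monkey-patching it before any model
    # code runs (e.g. detrex.layers.multi_scale_deform_attn._ENABLE_CUDA_MSDA).
    for var, val in env.items():
        module_name, attr_name = var.rsplit(".", 1)
        logging.info(f"Setting {var} to {val}")
        setattr(importlib.import_module(module_name), attr_name, val)


compile_run_path = Path("outputs/2024-10-30/22-54-56")  # hypothetical run dir
compile_run_cfg = OmegaConf.load(compile_run_path / ".hydra" / "config.yaml")
apply_env_overrides(compile_run_cfg.env)  # replay the compile run's patches
# With load_ts: true, the benchmark then loads the serialized module:
# model = torch.jit.load(compile_run_path / "model.ts")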