From 218f8d8bd13f2d760294c6b139b38feec18e8d93 Mon Sep 17 00:00:00 2001 From: Diego Canez Date: Thu, 31 Oct 2024 10:54:18 +0100 Subject: [PATCH] docs: update docs --- cpp/src/benchmark.cpp | 3 +- docs/src/_toc.yml | 1 + docs/src/part3/results.md | 88 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 docs/src/part3/results.md diff --git a/cpp/src/benchmark.cpp b/cpp/src/benchmark.cpp index 991eae8..41d25c9 100644 --- a/cpp/src/benchmark.cpp +++ b/cpp/src/benchmark.cpp @@ -46,8 +46,7 @@ void benchmark(std::string model_name, int n_warmup = 5, int n_iter = 5) float mean = std::accumulate(durations.begin(), durations.end(), 0.0) / durations.size(); float sq_sum = std::inner_product(durations.begin(), durations.end(), durations.begin(), 0.0); float stdev = std::sqrt(sq_sum / durations.size() - mean * mean); - std::cout << "mean: " << mean << " ms" << std::endl; - std::cout << "std: " << stdev << " ms" << std::endl; + std::cout << "Average inference time: " << mean << " ± " << stdev << " ms" << std::endl; } int main(int argc, char *argv[]) diff --git a/docs/src/_toc.yml b/docs/src/_toc.yml index 0b37d6d..6673253 100644 --- a/docs/src/_toc.yml +++ b/docs/src/_toc.yml @@ -17,6 +17,7 @@ parts: - caption: Optimization chapters: - file: part3/compilation + - file: part3/results - caption: Other chapters: - file: bibliography diff --git a/docs/src/part3/results.md b/docs/src/part3/results.md new file mode 100644 index 0000000..3d296bb --- /dev/null +++ b/docs/src/part3/results.md @@ -0,0 +1,88 @@ +# Results + +## Running the benchmarks + +Before running the benchmarks make sure you have compiled your desired model. + +```bash +python -m scripts.export_tensorrt --config-name dinov2 amp_dtype=fp32 trt.enabled_precisions="[fp32, bf16, fp16]" +# ... +# OUTPUT DIR: outputs/2024-10-31/10-43-31 +``` + +The outputs of this script will be found in the directory specified by `OUTPUT DIR`. 
The directory will contain the following files:

```
├── export_tensorrt.log # log file
├── .hydra
│ ├── config.yaml # config file
│ ├── hydra.yaml
│ └── overrides.yaml
├── model.ts # compiled torchscript model
└── predictions.png # sample predictions for the model
```

There are three possible runtimes to benchmark; examples of how to run each are shown below:

**Python Runtime, no TensorRT**
```bash
python -m scripts.benchmark_gpu compile_run_path=outputs/2024-10-31/10-43-31 n_iter=100 load_ts=False amp_dtype=fp16
```

**Python Runtime with TensorRT**
```bash
python -m scripts.benchmark_gpu compile_run_path=outputs/2024-10-31/10-43-31 n_iter=100 load_ts=True
```

**C++ Runtime with TensorRT**
```bash
./build/benchmark --model outputs/2024-10-31/10-43-31/model.ts --n_iter=100
```

## Results

**Python Runtime, no TensorRT**

| model's precision | amp_dtype | latency |
| ----------------- | ---------------------- | -------------- |
| fp32 | fp32+fp16 | 66.322 ± 0.927 |
| fp32 | fp32+bf16 | 66.497 ± 1.052 |
| fp32 | fp32 | 76.275 ± 0.587 |

**Python Runtime, with TensorRT**

| model's precision | trt.enabled_precisions | latency |
| ----------------- | ---------------------- | -------------- |
| fp32+fp16 | fp32+bf16+fp16 | 15.369 ± 0.023 |
| fp32 | fp32+bf16+fp16 | 23.164 ± 0.031 |
| fp32 | fp32+bf16 | 25.148 ± 0.030 |
| fp32 | fp32 | 38.381 ± 0.022 |

**C++ Runtime, with TensorRT**

| model's precision | trt.enabled_precisions | latency |
| ----------------- | ---------------------- | -------------- |
| fp32+fp16 | fp32+bf16+fp16 | 15.433 ± 0.029 |
| fp32 | fp32+bf16+fp16 | 23.263 ± 0.027 |
| fp32 | fp32+bf16 | 25.255 ± 0.014 |
| fp32 | fp32 | 38.465 ± 0.029 |




Note: For some reason, in the latest version of torch_tensorrt, `bfloat16` precision is not working well: it does not achieve the previously measured performance (13–14 ms) and/or fails compilation.
+ +We include the previous results for completeness: + +| Runtime | model's precision | Enabled Precisions | Latency | Memory (MB) | +| ------- | ----------------- | ------------------ | ------- | ----------- | +| cpp+trt | fp32 | fp32+fp16 | 13.984 | 500 | +| cpp+trt | fp32 | fp32+bf16+fp16 | 13.898 | 500 | +| cpp+trt | fp32 | fp32+bf16 | 17.261 | 500 | +| cpp+trt | bf16 | fp32+bf16 | 22.913 | 500 | +| cpp+trt | bf16 | bf16 | 22.938 | 500 | +| cpp+trt | fp32 | fp32 | 37.639 | 770 | + + + +