From 218f8d8bd13f2d760294c6b139b38feec18e8d93 Mon Sep 17 00:00:00 2001 From: Diego Canez Date: Thu, 31 Oct 2024 10:54:18 +0100 Subject: [PATCH] docs: update docs --- cpp/src/benchmark.cpp | 3 +- docs/src/_toc.yml | 1 + docs/src/part3/results.md | 88 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 docs/src/part3/results.md diff --git a/cpp/src/benchmark.cpp b/cpp/src/benchmark.cpp index 991eae8..41d25c9 100644 --- a/cpp/src/benchmark.cpp +++ b/cpp/src/benchmark.cpp @@ -46,8 +46,7 @@ void benchmark(std::string model_name, int n_warmup = 5, int n_iter = 5) float mean = std::accumulate(durations.begin(), durations.end(), 0.0) / durations.size(); float sq_sum = std::inner_product(durations.begin(), durations.end(), durations.begin(), 0.0); float stdev = std::sqrt(sq_sum / durations.size() - mean * mean); - std::cout << "mean: " << mean << " ms" << std::endl; - std::cout << "std: " << stdev << " ms" << std::endl; + std::cout << "Average inference time: " << mean << " ± " << stdev << " ms" << std::endl; } int main(int argc, char *argv[]) diff --git a/docs/src/_toc.yml b/docs/src/_toc.yml index 0b37d6d..6673253 100644 --- a/docs/src/_toc.yml +++ b/docs/src/_toc.yml @@ -17,6 +17,7 @@ parts: - caption: Optimization chapters: - file: part3/compilation + - file: part3/results - caption: Other chapters: - file: bibliography diff --git a/docs/src/part3/results.md b/docs/src/part3/results.md new file mode 100644 index 0000000..3d296bb --- /dev/null +++ b/docs/src/part3/results.md @@ -0,0 +1,88 @@ +# Results + +## Running the benchmarks + +Before running the benchmarks make sure you have compiled your desired model. + +```bash +python -m scripts.export_tensorrt --config-name dinov2 amp_dtype=fp32 trt.enabled_precisions="[fp32, bf16, fp16]" +# ... +# OUTPUT DIR: outputs/2024-10-31/10-43-31 +``` + +The outputs of this script will be found in the directory specified by `OUTPUT DIR`. 
The directory will contain the following files:

```
├── export_tensorrt.log # log file
├── .hydra
│ ├── config.yaml # config file
│ ├── hydra.yaml
│ └── overrides.yaml
├── model.ts # compiled torchscript model
└── predictions.png # sample predictions for the model
```

There are three possible runtimes to benchmark; examples of how to run each are shown below:

**Python Runtime, no TensorRT**
```bash
python -m scripts.benchmark_gpu compile_run_path=outputs/2024-10-31/10-43-31 n_iter=100 load_ts=False amp_dtype=fp16
```

**Python Runtime with TensorRT**
```bash
python -m scripts.benchmark_gpu compile_run_path=outputs/2024-10-31/10-43-31 n_iter=100 load_ts=True
```

**C++ Runtime with TensorRT**
```bash
./build/benchmark --model outputs/2024-10-31/10-43-31/model.ts --n_iter=100
```

## Results

**Python Runtime, no TensorRT**

| model's precision | amp_dtype | latency |
| ----------------- | ---------------------- | -------------- |
| fp32 | fp32+fp16 | 66.322 ± 0.927 |
| fp32 | fp32+bf16 | 66.497 ± 1.052 |
| fp32 | fp32 | 76.275 ± 0.587 |

**Python Runtime, with TensorRT**

| model's precision | trt.enabled_precisions | latency |
| ----------------- | ---------------------- | -------------- |
| fp32+fp16 | fp32+bf16+fp16 | 15.369 ± 0.023 |
| fp32 | fp32+bf16+fp16 | 23.164 ± 0.031 |
| fp32 | fp32+bf16 | 25.148 ± 0.030 |
| fp32 | fp32 | 38.381 ± 0.022 |

**C++ Runtime, with TensorRT**

| model's precision | trt.enabled_precisions | latency |
| ----------------- | ---------------------- | -------------- |
| fp32+fp16 | fp32+bf16+fp16 | 15.433 ± 0.029 |
| fp32 | fp32+bf16+fp16 | 23.263 ± 0.027 |
| fp32 | fp32+bf16 | 25.255 ± 0.014 |
| fp32 | fp32 | 38.465 ± 0.029 |




Note: For some reason, in the latest version of torch_tensorrt, `bfloat16` precision is not working well: it does not achieve the previously measured performance (13–14 ms) and/or fails compilation.
+ +We include the previous results for completeness: + +| Runtime | model's precision | Enabled Precisions | Latency | Memory (MB) | +| ------- | ----------------- | ------------------ | ------- | ----------- | +| cpp+trt | fp32 | fp32+fp16 | 13.984 | 500 | +| cpp+trt | fp32 | fp32+bf16+fp16 | 13.898 | 500 | +| cpp+trt | fp32 | fp32+bf16 | 17.261 | 500 | +| cpp+trt | bf16 | fp32+bf16 | 22.913 | 500 | +| cpp+trt | bf16 | bf16 | 22.938 | 500 | +| cpp+trt | fp32 | fp32 | 37.639 | 770 | + + + +