Commit 2f4fef1

feat: Adding live progress monitoring to the engine building phase (#3087)

Signed-off-by: Naren Dasan <[email protected]>
narendasan authored Aug 21, 2024
1 parent 2d7fb4c commit 2f4fef1
Showing 5 changed files with 291 additions and 57 deletions.
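
To see the new monitor in action, here is a minimal sketch of a compilation that triggers it, assuming the dynamo frontend's debug setting (the model and inputs are illustrative):

import torch
import torch_tensorrt

model = torch.nn.Sequential(torch.nn.Conv2d(3, 16, 3), torch.nn.ReLU()).cuda().eval()
inputs = [torch.randn(1, 3, 224, 224).cuda()]

# With debug enabled, _populate_trt_builder_config (changed below) attaches
# TRTBulderMonitor to the TensorRT builder config, so the engine-building
# phases render as live progress bars in the terminal.
trt_model = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs, debug=True)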
8 changes: 8 additions & 0 deletions .github/workflows/build-test-linux.yml
@@ -77,6 +77,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
             pushd .
             cd tests/modules
@@ -112,6 +113,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 8 conversion/
@@ -140,6 +142,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/
@@ -168,6 +171,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
@@ -196,6 +200,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
@@ -226,6 +231,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/
@@ -256,6 +262,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             nvidia-smi
@@ -286,6 +293,7 @@ jobs:
           pre-script: ${{ matrix.pre-script }}
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/core
             python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
8 changes: 8 additions & 0 deletions .github/workflows/build-test-windows.yml
@@ -83,6 +83,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/modules
             python hub.py
@@ -114,6 +115,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 8 conversion/
@@ -139,6 +141,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/
@@ -164,6 +167,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
@@ -189,6 +193,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
@@ -216,6 +221,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/
@@ -243,6 +249,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/dynamo
             python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py
@@ -269,6 +276,7 @@ jobs:
           pre-script: packaging/driver_upgrade.bat
           script: |
             export USE_HOST_DEPS=1
+            export CI_BUILD=1
             pushd .
             cd tests/py/core
             python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
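
Each job above gains the same one-line change: exporting CI_BUILD=1 so the monitors introduced below can detect CI and skip live terminal rendering. The gate both monitor classes apply reduces to this sketch:

import os

# CI_BUILD=1 disables live rendering; any other value, or the variable
# being unset, leaves it enabled.
render = os.environ.get("CI_BUILD") != "1"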
159 changes: 159 additions & 0 deletions py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py
@@ -0,0 +1,159 @@
import os
import sys
from typing import Any, Dict, Optional

import tensorrt as trt


class _ASCIIMonitor(trt.IProgressMonitor):  # type: ignore
    def __init__(self, engine_name: str = "") -> None:
        trt.IProgressMonitor.__init__(self)
        self._active_phases: Dict[str, Dict[str, Any]] = {}
        self._step_result = True

        # Disable live rendering when running in CI (CI_BUILD=1).
        self._render = True
        if (ci_env_var := os.environ.get("CI_BUILD")) is not None:
            if ci_env_var == "1":
                self._render = False

    def phase_start(
        self, phase_name: str, parent_phase: Optional[str], num_steps: int
    ) -> None:
        try:
            if parent_phase is not None:
                nbIndents = 1 + self._active_phases[parent_phase]["nbIndents"]
            else:
                nbIndents = 0
            self._active_phases[phase_name] = {
                "title": phase_name,
                "steps": 0,
                "num_steps": num_steps,
                "nbIndents": nbIndents,
            }
            self._redraw()
        except KeyboardInterrupt:
            # Request cancellation; step_complete returns this flag to TensorRT.
            self._step_result = False

    def phase_finish(self, phase_name: str) -> None:
        try:
            del self._active_phases[phase_name]
            self._redraw(blank_lines=1)  # Clear the removed phase.
        except KeyboardInterrupt:
            self._step_result = False

    def step_complete(self, phase_name: str, step: int) -> bool:
        try:
            self._active_phases[phase_name]["steps"] = step
            self._redraw()
            return self._step_result
        except KeyboardInterrupt:
            return False

    def _redraw(self, *, blank_lines: int = 0) -> None:
        if self._render:

            def clear_line() -> None:
                print("\x1B[2K", end="")

            def move_to_start_of_line() -> None:
                print("\x1B[0G", end="")

            def move_cursor_up(lines: int) -> None:
                print("\x1B[{}A".format(lines), end="")

            def progress_bar(steps: int, num_steps: int) -> str:
                INNER_WIDTH = 10
                completed_bar_chars = int(INNER_WIDTH * steps / float(num_steps))
                return "[{}{}]".format(
                    "=" * completed_bar_chars, "-" * (INNER_WIDTH - completed_bar_chars)
                )

            # Set max_cols to a default of 200 if not run in interactive mode.
            max_cols = os.get_terminal_size().columns if sys.stdout.isatty() else 200

            move_to_start_of_line()
            for phase in self._active_phases.values():
                phase_prefix = "{indent}{bar} {title}".format(
                    indent=" " * phase["nbIndents"],
                    bar=progress_bar(phase["steps"], phase["num_steps"]),
                    title=phase["title"],
                )
                phase_suffix = "{steps}/{num_steps}".format(**phase)
                allowable_prefix_chars = max_cols - len(phase_suffix) - 2
                if allowable_prefix_chars < len(phase_prefix):
                    phase_prefix = phase_prefix[0 : allowable_prefix_chars - 3] + "..."
                clear_line()
                print(phase_prefix, phase_suffix)
            for line in range(blank_lines):
                clear_line()
                print()
            move_cursor_up(len(self._active_phases) + blank_lines)
            sys.stdout.flush()


# Prefer the rich-based monitor when rich is installed; otherwise fall back
# to the plain ASCII renderer above.
try:
    from rich.progress import BarColumn, Progress, TaskID, TextColumn, TimeElapsedColumn

    class _RichMonitor(trt.IProgressMonitor):  # type: ignore
        def __init__(self, engine_name: str = "") -> None:
            trt.IProgressMonitor.__init__(self)
            self._active_phases: Dict[str, TaskID] = {}
            self._step_result = True

            self._progress_monitors = Progress(
                TextColumn(" "),
                TimeElapsedColumn(),
                TextColumn("{task.description}: "),
                BarColumn(),
                TextColumn(" {task.percentage:.0f}% ({task.completed}/{task.total})"),
            )

            # Disable live rendering when running in CI (CI_BUILD=1).
            self._render = True
            if (ci_env_var := os.environ.get("CI_BUILD")) is not None:
                if ci_env_var == "1":
                    self._render = False

            if self._render:
                self._progress_monitors.start()

        def phase_start(
            self, phase_name: str, parent_phase: Optional[str], num_steps: int
        ) -> None:
            try:
                self._active_phases[phase_name] = self._progress_monitors.add_task(
                    phase_name, total=num_steps
                )
                self._progress_monitors.refresh()
            except KeyboardInterrupt:
                # The phase_start callback cannot directly cancel the build, so
                # request the cancellation from within step_complete.
                self._step_result = False

        def phase_finish(self, phase_name: str) -> None:
            try:
                self._progress_monitors.update(
                    self._active_phases[phase_name], visible=False
                )
                self._progress_monitors.stop_task(self._active_phases[phase_name])
                self._progress_monitors.remove_task(self._active_phases[phase_name])
                self._progress_monitors.refresh()
            except KeyboardInterrupt:
                self._step_result = False

        def step_complete(self, phase_name: str, step: int) -> bool:
            try:
                self._progress_monitors.update(
                    self._active_phases[phase_name], completed=step
                )
                self._progress_monitors.refresh()
                return self._step_result
            except KeyboardInterrupt:
                # There is no need to propagate this exception to TensorRT.
                # We can simply cancel the build.
                return False

        def __del__(self) -> None:
            if self._progress_monitors:
                self._progress_monitors.stop()

    TRTBulderMonitor: trt.IProgressMonitor = _RichMonitor
except ImportError:
    TRTBulderMonitor: trt.IProgressMonitor = _ASCIIMonitor  # type: ignore[no-redef]
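
For reference, a minimal sketch of attaching either monitor to a raw TensorRT build, assuming a TensorRT version whose Python API exposes trt.IProgressMonitor and IBuilderConfig.progress_monitor (network construction elided):

import tensorrt as trt
from torch_tensorrt.dynamo.conversion._TRTBuilderMonitor import TRTBulderMonitor

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(0)
# ... populate `network` with layers and mark its outputs ...

config = builder.create_builder_config()
# With a monitor attached, TensorRT invokes phase_start/step_complete/
# phase_finish as the build progresses, which drives the display above.
config.progress_monitor = TRTBulderMonitor()
engine_bytes = builder.build_serialized_network(network, config)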
35 changes: 15 additions & 20 deletions py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -6,7 +6,6 @@
 from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple

 import numpy as np
-import tensorrt as trt
 import torch
 import torch.fx
 from torch.fx.node import _get_qualified_name
@@ -21,6 +20,7 @@
     DYNAMO_CONVERTERS as CONVERTERS,
 )
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import CallingConvention
+from torch_tensorrt.dynamo.conversion._TRTBuilderMonitor import TRTBulderMonitor
 from torch_tensorrt.dynamo.conversion.converter_utils import (
     get_node_io,
     get_node_name,
@@ -30,6 +30,7 @@
 from torch_tensorrt.fx.observer import Observer
 from torch_tensorrt.logging import TRT_LOGGER

+import tensorrt as trt
 from packaging import version

 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -146,7 +147,7 @@ def clean_repr(x: Any, depth: int = 0) -> Any:
             else:
                 return "(...)"
         else:
-            return x
+            return f"{x} <{type(x).__name__}>"

         str_args = [clean_repr(a) for a in args]
         return repr(tuple(str_args))
@@ -176,6 +177,10 @@ def _populate_trt_builder_config(
     ) -> trt.IBuilderConfig:

         builder_config = self.builder.create_builder_config()
+
+        if self.compilation_settings.debug:
+            builder_config.progress_monitor = TRTBulderMonitor()
+
         if self.compilation_settings.workspace_size != 0:
             builder_config.set_memory_pool_limit(
                 trt.MemoryPoolType.WORKSPACE, self.compilation_settings.workspace_size
@@ -516,18 +521,18 @@ def run_node(self, n: torch.fx.Node) -> torch.fx.Node:
         kwargs["_itensor_to_tensor_meta"] = self._itensor_to_tensor_meta
         n.kwargs = kwargs

-        # run the node
-        _LOGGER.debug(
-            f"Running node {self._cur_node_name}, a {self._cur_node.op} node "
-            f"with target {self._cur_node.target} in the TensorRT Interpreter"
-        )
+        if _LOGGER.isEnabledFor(logging.DEBUG):
+            _LOGGER.debug(
+                f"Converting node {self._cur_node_name} (kind: {n.target}, args: {TRTInterpreter._args_str(n.args)})"
+            )

         trt_node: torch.fx.Node = super().run_node(n)

         if n.op == "get_attr":
             self.const_mapping[str(n)] = (tuple(trt_node.shape), str(trt_node.dtype))

-        _LOGGER.debug(
-            f"Ran node {self._cur_node_name} with properties: {get_node_io(n, self.const_mapping)}"
+        _LOGGER.info(
+            f"Converted node {self._cur_node_name} [{n.target}] ({get_node_io(n, self.const_mapping)})"
         )

         # remove "_itensor_to_tensor_meta"
@@ -611,9 +616,7 @@ def call_module(
         converter, calling_convention = converter_packet

         assert self._cur_node_name is not None
-        _LOGGER.debug(
-            f"Converting node {self._cur_node_name} (kind: {target}, args: {TRTInterpreter._args_str(args)})"
-        )
+
         if calling_convention is CallingConvention.LEGACY:
             return converter(self.ctx.net, submod, args, kwargs, self._cur_node_name)
         else:
@@ -629,10 +632,6 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any:

         converter, calling_convention = converter_packet

-        assert self._cur_node_name is not None
-        _LOGGER.debug(
-            f"Converting node {self._cur_node_name} (kind: {target}, args: {TRTInterpreter._args_str(args)})"
-        )
         if calling_convention is CallingConvention.LEGACY:
             return converter(self.ctx.net, target, args, kwargs, self._cur_node_name)
         else:
@@ -663,10 +662,6 @@ def call_method(self, target: str, args: Any, kwargs: Any) -> Any:
         )
         converter, calling_convention = converter_packet

-        assert self._cur_node_name is not None
-        _LOGGER.debug(
-            f"Converting node {self._cur_node_name} (kind: {target}, args: {TRTInterpreter._args_str(args)})"
-        )
 if calling_convention is CallingConvention.LEGACY:
             return converter(self.ctx.net, target, args, kwargs, self._cur_node_name)
         else:
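
A side note on the logging change in run_node: guarding the debug call with _LOGGER.isEnabledFor avoids evaluating the f-string, and therefore the argument walk in TRTInterpreter._args_str, unless DEBUG logging is actually enabled. The pattern in isolation, with a hypothetical stand-in for the expensive formatter:

import logging

logger = logging.getLogger("example")

def expensive_repr() -> str:
    # Stand-in for TRTInterpreter._args_str, which recursively formats
    # every argument of the node being converted.
    return "(...)"

# f-string arguments are evaluated before logger.debug is even called,
# so the guard is what keeps expensive_repr from running at other levels.
if logger.isEnabledFor(logging.DEBUG):
    logger.debug(f"Converting node (args: {expensive_repr()})")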