Skip to content

Commit 493b0a6

Browse files
committed
chore: reset context in weight budget setting
1 parent dde5e3c commit 493b0a6

File tree

9 files changed

+30
-77
lines changed

9 files changed

+30
-77
lines changed

core/runtime/TRTEngine.cpp

+16-23
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,12 @@ TRTEngine::TRTEngine(
9494
if (get_streamable_weights_size() > 0) {
9595
// Scratch memory size may change based on the current weight streaming budget
9696
// Required memory for full streaming is used as the minimum weight budget
97-
set_device_memory_budget(0);
97+
cuda_engine->setWeightStreamingBudgetV2(0);
9898
min_required_device_budget = cuda_engine->getWeightStreamingScratchMemorySize();
9999

100100
int64_t budget_bytes = get_weight_streaming_automatic_budget();
101101
LOG_INFO("Set automatic weight streaming budget bytes " << budget_bytes);
102-
set_device_memory_budget(budget_bytes);
102+
cuda_engine->setWeightStreamingBudgetV2(budget_bytes);
103103
}
104104

105105
exec_ctx = make_trt(cuda_engine->createExecutionContext());
@@ -276,7 +276,20 @@ int64_t TRTEngine::get_device_memory_budget() {
276276
}
277277

278278
bool TRTEngine::set_device_memory_budget(int64_t budget) {
279-
return cuda_engine->setWeightStreamingBudgetV2(budget);
279+
// Recreating the context because the weight streaming budget cannot be modified while there is an active context.
280+
if (exec_ctx.get() != nullptr) {
281+
exec_ctx.reset();
282+
}
283+
if (profile_execution) {
284+
trt_engine_profiler.reset();
285+
}
286+
bool result = cuda_engine->setWeightStreamingBudgetV2(budget);
287+
exec_ctx = make_trt(cuda_engine->createExecutionContext());
288+
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context");
289+
if (profile_execution) {
290+
enable_profiling();
291+
}
292+
return result;
280293
}
281294

282295
// Returns 0 if BuilderFlag::kWEIGHT_STREAMING is unset during engine building.
@@ -292,26 +305,6 @@ int64_t TRTEngine::get_weight_streaming_automatic_budget() {
292305
return cuda_engine->getWeightStreamingAutomaticBudget();
293306
}
294307

295-
void TRTEngine::init_context() {
296-
if (exec_ctx.get() == nullptr) {
297-
exec_ctx = make_trt(cuda_engine->createExecutionContext());
298-
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context");
299-
if (profile_execution) {
300-
enable_profiling();
301-
}
302-
}
303-
}
304-
305-
void TRTEngine::reset_context() {
306-
if (exec_ctx.get() != nullptr) {
307-
exec_ctx.reset();
308-
exec_ctx = nullptr;
309-
}
310-
if (profile_execution) {
311-
trt_engine_profiler.reset();
312-
}
313-
}
314-
315308
std::string TRTEngine::to_str() const {
316309
// clang-format off
317310
std::stringstream ss;

core/runtime/TRTEngine.h

-2
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ struct TRTEngine : torch::CustomClassHolder {
7676
int64_t get_streamable_weights_size();
7777
int64_t get_min_required_device_budget();
7878
int64_t get_weight_streaming_automatic_budget();
79-
void init_context();
80-
void reset_context();
8179
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
8280
static const char BINDING_DELIM = '%';
8381

core/runtime/register_jit_hooks.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,6 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
9393
.def_property("streamable_weights_size", &TRTEngine::get_streamable_weights_size)
9494
.def_property("min_required_device_budget", &TRTEngine::get_min_required_device_budget)
9595
.def_property("weight_streaming_automatic_budget", &TRTEngine::get_weight_streaming_automatic_budget)
96-
.def("init_context", &TRTEngine::init_context)
97-
.def("reset_context", &TRTEngine::reset_context)
9896
.def_pickle(
9997
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> {
10098
// Serialize TensorRT engine

py/torch_tensorrt/dynamo/_compiler.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def compile(
8888
engine_cache_dir: Optional[str] = _defaults.ENGINE_CACHE_DIR,
8989
engine_cache_size: Optional[int] = _defaults.ENGINE_CACHE_SIZE,
9090
custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE,
91-
enable_weight_streaming: bool = _defaults.WEIGHT_STREAMING,
91+
enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
9292
**kwargs: Any,
9393
) -> torch.fx.GraphModule:
9494
"""Compile an ExportedProgram module for NVIDIA GPUs using TensorRT

py/torch_tensorrt/dynamo/_defaults.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache")
4141
ENGINE_CACHE_SIZE = 1073741824
4242
CUSTOM_ENGINE_CACHE = None
43-
WEIGHT_STREAMING = False
43+
ENABLE_WEIGHT_STREAMING = False
4444

4545

4646
def default_device() -> Device:

py/torch_tensorrt/dynamo/_settings.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
DLA_SRAM_SIZE,
1515
DRYRUN,
1616
ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
17+
ENABLE_WEIGHT_STREAMING,
1718
ENABLED_PRECISIONS,
1819
ENGINE_CAPABILITY,
1920
HARDWARE_COMPATIBLE,
@@ -32,7 +33,6 @@
3233
USE_FAST_PARTITIONER,
3334
USE_PYTHON_RUNTIME,
3435
VERSION_COMPATIBLE,
35-
WEIGHT_STREAMING,
3636
WORKSPACE_SIZE,
3737
default_device,
3838
)
@@ -114,4 +114,4 @@ class CompilationSettings:
114114
lazy_engine_init: bool = LAZY_ENGINE_INIT
115115
cache_built_engines: bool = CACHE_BUILT_ENGINES
116116
reuse_cached_engines: bool = REUSE_CACHED_ENGINES
117-
enable_weight_streaming: bool = WEIGHT_STREAMING
117+
enable_weight_streaming: bool = ENABLE_WEIGHT_STREAMING

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

+6-15
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@
1212
from torch_tensorrt._Device import Device
1313
from torch_tensorrt._enums import Platform, dtype
1414
from torch_tensorrt.dynamo._settings import CompilationSettings
15-
from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (
16-
recreate_context_decorator,
17-
)
1815
from torch_tensorrt.dynamo.utils import DYNAMIC_DIM
1916
from torch_tensorrt.logging import TRT_LOGGER
2017
from torch_tensorrt.runtime._utils import (
@@ -115,16 +112,6 @@ def __init__(
115112
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
116113
self.setup_engine()
117114

118-
def init_context(self) -> None:
119-
assert self.engine, "Context is used before setting up the engine"
120-
if self.context is None:
121-
self.context = self.engine.create_execution_context()
122-
123-
def reset_context(self) -> None:
124-
if self.context is not None:
125-
del self.context
126-
self.context = None
127-
128115
def get_streamable_weights_size(self) -> Any:
129116
return self.engine.streamable_weights_size
130117

@@ -137,9 +124,13 @@ def get_weight_streaming_budget(self) -> Any:
137124
def get_automatic_weight_streaming_budget(self) -> Any:
138125
return self.engine.get_weight_streaming_automatic_budget()
139126

140-
@recreate_context_decorator
141127
def set_device_memory_budget(self, budget_bytes: int) -> int:
142-
return self._set_device_memory_budget(budget_bytes)
128+
# Recreating the context because the weight streaming budget cannot be modified while there is an active context.
129+
if self.context is not None:
130+
del self.context
131+
budget_bytes = self._set_device_memory_budget(budget_bytes)
132+
self.context = self.engine.create_execution_context()
133+
return budget_bytes
143134

144135
def _set_device_memory_budget(self, budget_bytes: int) -> int:
145136
# Disable weight streaming for invalid budget size

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

+1-28
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
import copy
55
import logging
66
import pickle
7-
from functools import wraps
8-
from typing import Any, Callable, List, Optional, Tuple, Union
7+
from typing import Any, List, Optional, Tuple, Union
98

109
import torch
1110
from torch_tensorrt._Device import Device
@@ -50,22 +49,6 @@
5049
SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN() # 9
5150

5251

53-
def recreate_context_decorator(method: Callable[..., Any]) -> Callable[..., Any]:
54-
"""
55-
A decorator that destroys a context before a method execution and
56-
creates it after the method execution within the same class instance.
57-
"""
58-
59-
@wraps(method)
60-
def wrapper(self: object, *args: Any, **kwargs: Any) -> Any:
61-
self.reset_context()
62-
result = method(self, *args, **kwargs)
63-
self.init_context()
64-
return result
65-
66-
return wrapper
67-
68-
6952
@for_all_methods(needs_torch_tensorrt_runtime)
7053
class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc]
7154
"""TorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine.
@@ -186,12 +169,6 @@ def _pack_engine_info(self) -> List[str | bytes]:
186169

187170
return engine_info
188171

189-
def init_context(self) -> None:
190-
self.engine.init_context()
191-
192-
def reset_context(self) -> None:
193-
self.engine.reset_context()
194-
195172
def get_streamable_weights_size(self) -> Any:
196173
return self.engine.streamable_weights_size
197174

@@ -204,11 +181,7 @@ def get_weight_streaming_budget(self) -> Any:
204181
def get_automatic_weight_streaming_budget(self) -> Any:
205182
return self.engine.weight_streaming_automatic_budget
206183

207-
@recreate_context_decorator
208184
def set_device_memory_budget(self, budget_bytes: int) -> int:
209-
return self._set_device_memory_budget(budget_bytes)
210-
211-
def _set_device_memory_budget(self, budget_bytes: int) -> int:
212185
# Disable weight streaming for invalid budget size
213186
if budget_bytes < 0:
214187
budget_bytes = self.get_streamable_weights_size()

tests/py/dynamo/runtime/test_004_weight_streaming.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ def test_weight_streaming_default(self, _, use_python_runtime):
4949
use_python_runtime=use_python_runtime,
5050
enable_weight_streaming=True,
5151
)
52-
# Checking default weight streaming budget(automatic) is applied
53-
with torchtrt.runtime.weight_streaming(optimized_model) as weight_streaming_ctx:
54-
assert weight_streaming_ctx.device_budget > 0
52+
# Checking that the default weight streaming budget (automatic) is applied when the compiler option was provided
53+
weight_streaming_ctx = torchtrt.runtime.weight_streaming(optimized_model)
54+
assert weight_streaming_ctx.device_budget > 0
5555

5656
ref = model(*input)
5757
out = optimized_model(*input)

0 commit comments

Comments
 (0)