Commit 13fb978

fix: bugfix for matmul when use_fp32_acc
1 parent 33afb83 commit 13fb978

File tree

5 files changed: 61 additions, 17 deletions

py/torch_tensorrt/dynamo/conversion/impl/matmul.py
tests/py/dynamo/models/test_llm_models.py
tools/llm/run_llm.py
tools/llm/run_vlm.py
tools/llm/torchtrt_ext/register_sdpa.py

py/torch_tensorrt/dynamo/conversion/impl/matmul.py

Lines changed: 6 additions & 1 deletion
@@ -48,9 +48,13 @@ def matrix_multiply(
     input, other = broadcast(
         ctx, input, other, f"{name}_input", f"{name}_other", preset_diff
     )
+    # Get the original input dtype
+    input_dtype = _enums.dtype._from(input.dtype).to(torch.dtype)
+
     if (
         ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
         and ctx.compilation_settings.use_fp32_acc
+        and input_dtype == torch.float16
     ):
         input = cast_trt_tensor(ctx, input, torch.float32, f"{name}_input_casted")
         other = cast_trt_tensor(ctx, other, torch.float32, f"{name}_other_casted")
@@ -63,9 +67,10 @@ def matrix_multiply(
     if (
         ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
         and ctx.compilation_settings.use_fp32_acc
+        and input_dtype == torch.float16
     ):
         matmul_output = cast_trt_tensor(
-            ctx, matmul_output, torch.float16, f"{name}_output_casted"
+            ctx, matmul_output, input_dtype, f"{name}_output_casted"
         )

     set_layer_name(matmul_layer, target, name, source_ir)
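
The change limits the FP32-accumulation casts to the case the flag was written for: previously the casts were inserted whenever use_fp32_acc was set under strong typing, and the output was always cast back to torch.float16 regardless of the original dtype. A minimal, self-contained sketch of the casting pattern the fixed converter follows (illustrative only; matmul_with_fp32_acc is a hypothetical helper, not Torch-TensorRT code):

# Hypothetical helper, for illustration only: mirrors the dtype handling the
# fixed matrix_multiply converter applies when use_fp32_acc is enabled.
import torch

def matmul_with_fp32_acc(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    input_dtype = a.dtype
    if input_dtype == torch.float16:
        # FP16 inputs are promoted so the accumulation happens in FP32 ...
        out = torch.matmul(a.to(torch.float32), b.to(torch.float32))
        # ... and the result is cast back to the original dtype
        # (previously this was hard-coded to torch.float16).
        return out.to(input_dtype)
    # Any other dtype is multiplied as-is; no extra casts are inserted.
    return torch.matmul(a, b)

# FP16 inputs come back as FP16, FP32 inputs stay FP32:
assert matmul_with_fp32_acc(torch.ones(2, 2, dtype=torch.float16),
                            torch.ones(2, 2, dtype=torch.float16)).dtype == torch.float16
assert matmul_with_fp32_acc(torch.rand(2, 2), torch.rand(2, 2)).dtype == torch.float32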

tests/py/dynamo/models/test_llm_models.py

Lines changed: 1 addition & 4 deletions
@@ -44,10 +44,7 @@ def test_llm_decoder_layer(precision):
         .to("cuda")
     )

-    if register_sdpa._SDPA_MAPPING.get(args.model, None) is not None:
-        register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
-    else:
-        register_sdpa._SDPA_MAPPING["default"](model_config=model.config)
+    register_sdpa.enable_sdpa_converter(args.model, model.config)
     model = model.to(dtype)
     # use randint will generate nan values in the logits, use a fixed input_ids for now
     # input_ids = torch.randint(0, model.config.vocab_size, (1, args.num_tokens)).to("cuda")

tools/llm/run_llm.py

Lines changed: 1 addition & 4 deletions
@@ -59,10 +59,7 @@ def get_model(args):
         .cuda()
     )
     # register SDPA variant for the model
-    if register_sdpa._SDPA_MAPPING.get(args.model, None) is not None:
-        register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
-    else:
-        register_sdpa._SDPA_MAPPING["default"](model_config=model.config)
+    register_sdpa.enable_sdpa_converter(args.model, model.config)

     if args.precision == "FP16":
         model = model.to(torch.float16)

tools/llm/run_vlm.py

Lines changed: 2 additions & 0 deletions
@@ -589,6 +589,8 @@ def print_outputs(backend_name: str, gen_tokens: torch.Tensor, tokenizer):
     print("--- Registering SDPA lowering pass locally for LM compilation ---")
     from torchtrt_ext import register_sdpa

+    register_sdpa.enable_sdpa_converter(args.model, model.config)
+
     if args.cache == "static_v1":
         import static_cache_v1  # noqa: F401
     elif args.cache not in ("", None):

tools/llm/torchtrt_ext/register_sdpa.py

Lines changed: 51 additions & 8 deletions
@@ -19,16 +19,27 @@

 logger = logging.getLogger(__name__)

-# Remove decompositions for aten.scaled_dot_product_attention, aten._scaled_dot_product_efficient_attention, aten._scaled_dot_product_flash_attention
-# This is because we want to have SDPA as a standalone operator in the graph and invoke the custom converter for it.
-TORCH_TRT_DECOMPOSITIONS.pop(torch.ops.aten.scaled_dot_product_attention.default, None)
-TORCH_TRT_DECOMPOSITIONS.pop(
-    torch.ops.aten._scaled_dot_product_efficient_attention.default, None
-)
-TORCH_TRT_DECOMPOSITIONS.pop(
-    torch.ops.aten._scaled_dot_product_flash_attention.default, None
+_SDPA_OPS_TO_REMOVE = (
+    torch.ops.aten.scaled_dot_product_attention.default,
+    torch.ops.aten._scaled_dot_product_efficient_attention.default,
+    torch.ops.aten._scaled_dot_product_flash_attention.default,
 )

+
+def _remove_decompositions():
+    """
+    Remove decompositions for SDPA operators.
+
+    This function is idempotent. It ensures that the SDPA operators are removed
+    from the decomposition table, allowing a custom converter to be used.
+    """
+    # Check if any of the decompositions still exist before proceeding
+    if any(op in TORCH_TRT_DECOMPOSITIONS for op in _SDPA_OPS_TO_REMOVE):
+        logger.debug("Removing SDPA decompositions to enable custom converter.")
+        for op in _SDPA_OPS_TO_REMOVE:
+            TORCH_TRT_DECOMPOSITIONS.pop(op, None)
+
+
 REPLACEABLE_ATEN_OPS = {
     torch.ops.aten._scaled_dot_product_efficient_attention.default,
     torch.ops.aten._scaled_dot_product_flash_attention.default,
@@ -271,3 +282,35 @@ def default_sdpa_pass(
     "google/gemma-3-1b-it": register_gemma3_sdpa_pass,
     "default": register_default_sdpa_pass,
 }
+
+
+def enable_sdpa_converter(model_name: str, model_config: Any) -> None:
+    """
+    Enables the custom SDPA converter for a given model.
+
+    This function performs two main actions:
+    1. Removes the default PyTorch SDPA decompositions from Torch-TensorRT's
+       lowering registry. This is necessary to prevent them from being used
+       instead of our custom converter.
+    2. Registers a model-specific or default lowering pass that replaces the
+       standard SDPA operators with a version optimized for TensorRT conversion.
+
+    Args:
+        model_name (str): The name of the model (e.g., from Hugging Face).
+        model_config (Any): The model's configuration object. This is used to
+                            extract parameters for model-specific optimizations,
+                            like sliding window attention.
+    """
+    _remove_decompositions()
+
+    pass_registrator = _SDPA_MAPPING.get(model_name)
+
+    if pass_registrator:
+        logger.info(f"Registering specific SDPA lowering pass for model: {model_name}")
+        pass_registrator(model_config=model_config)
+    else:
+        logger.info(
+            f"No specific SDPA lowering pass for model '{model_name}'. "
+            "Using default SDPA pass."
+        )
+        _SDPA_MAPPING["default"](model_config=model_config)
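
Call sites now go through this helper instead of indexing _SDPA_MAPPING directly, as the run_llm.py, run_vlm.py, and test changes above show. A minimal usage sketch, assuming torchtrt_ext is importable and the Hugging Face config for the chosen model is available locally (the scripts in this commit pass model.config from an already-loaded model instead):

# Usage sketch mirroring the updated call sites above. AutoConfig is used only
# to obtain a config object without loading model weights.
from transformers import AutoConfig
from torchtrt_ext import register_sdpa

model_name = "google/gemma-3-1b-it"  # has a dedicated pass in _SDPA_MAPPING
config = AutoConfig.from_pretrained(model_name)

# Removes the SDPA decompositions (idempotent) and registers the model-specific
# lowering pass; an unrecognized model name falls back to the "default" pass.
register_sdpa.enable_sdpa_converter(model_name, config)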

0 commit comments