NVIDIA · naoyam · Jan 14, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/python/nvfuser_direct/__init__.py b/python/nvfuser_direct/__init__.py
@@ -366,10 +366,15 @@ def execute(
             # A copy of fusion is created after construction FusionExecutorCache
             # Delete the _fusion and reference the fusion inside FusionExecutorCache
             del self._fusion
+
+        # Enable "id_model" by default; caller-supplied options are appended.
+        default_enable_options = ["id_model"]
+        merged_enable_options = default_enable_options + _enable_options
+
         return self.fec.execute(
             inputs,
             device=self._get_device_index(device),
-            _enable_options=_enable_options,
+            _enable_options=merged_enable_options,
             _disable_options=_disable_options,
         )
 

diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp
@@ -864,8 +864,9 @@ TEST_F(IndexingTest, Reshape) {
           // to provide the extent of the group. However, since everything
           // should be deterministic, string match should also work.
           return std::string(
-              "( ( ( ( ( i98 * 20 ) + ( ( i99 * 10 ) + i100 ) ) / 25 ) * 25 ) "
-              "+ ( ( ( i98 * 20 ) + ( ( i99 * 10 ) + i100 ) ) % 25 ) )");
+              "( ( ( ( ( i114 * 20 ) + ( ( i115 * 10 ) + i116 ) ) / 25 ) * 25 "
+              ") "
+              "+ ( ( ( i114 * 20 ) + ( ( i115 * 10 ) + i116 ) ) % 25 ) )");
         }
         default:
           return std::string();

diff --git a/tests/python/direct/test_python_direct.py b/tests/python/direct/test_python_direct.py
@@ -229,22 +229,20 @@ def test_fusion_execution_cache():
   i2 = i0 / 8;
   nvfuser_index_t i3;
   i3 = i0 % 8;
-  nvfuser_index_t i4;
-  i4 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
-  if ((i4 < 64)) {
+  if ((((nvfuser_index_t)threadIdx.x) < 64)) {
     Array<float, 1, 1> T4;
     T4[0] = 0;
     T4[0]
-       = T1[((((T1.alloc_stride[0LL] * i1) + (T1.alloc_stride[1LL] * i2)) + (T1.alloc_stride[2LL] * i3)) + ((4 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
+       = T1[(((T1.alloc_stride[0LL] * i1) + (T1.alloc_stride[1LL] * i2)) + (T1.alloc_stride[2LL] * i3))];
     Array<float, 1, 1> T3;
     T3[0] = 0;
     T3[0]
-       = T0[((((T0.alloc_stride[0LL] * i1) + (T0.alloc_stride[1LL] * i2)) + (T0.alloc_stride[2LL] * i3)) + ((4 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
+       = T0[(((T0.alloc_stride[0LL] * i1) + (T0.alloc_stride[1LL] * i2)) + (T0.alloc_stride[2LL] * i3))];
     Array<float, 1, 1> T5;
     T5[0]
       = T3[0]
       + T4[0];
-    T2[i4]
+    T2[((nvfuser_index_t)threadIdx.x)]
        = T5[0];
   }
 }\n"""