diff --git a/.github/scripts/bench/bench_op.py b/.github/scripts/bench/bench_op.py
index a0d99b2fa..7bbce06e9 100644
--- a/.github/scripts/bench/bench_op.py
+++ b/.github/scripts/bench/bench_op.py
@@ -14,7 +14,8 @@ def bench_matmul_f16(params: str, *args, **kwargs) -> float:
     c = hidet.ops.matmul(a, b)
     g = hidet.trace_from(c, inputs=[a, b])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 def bench_batch_matmul(params: str, *args, **kwargs) -> float:
     # Default to benchmarking f32 for now, though this op can run other dtypes
@@ -26,7 +27,8 @@ def bench_batch_matmul(params: str, *args, **kwargs) -> float:
     c = hidet.ops.matmul(a, b)
     g = hidet.trace_from(c, inputs=[a, b])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 def bench_conv2d(params: str, *args, **kwargs) -> float:
     x_shape, w_shape = params.split(',')
@@ -37,7 +39,8 @@ def bench_conv2d(params: str, *args, **kwargs) -> float:
     o = hidet.ops.conv2d(x, w)
     g = hidet.trace_from(o, inputs=[x, w])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 def bench_conv2d_gemm_f16(params: str, *args, **kwargs) -> float:
     x_shape, w_shape = params.split(',')
@@ -48,7 +51,8 @@ def bench_conv2d_gemm_f16(params: str, *args, **kwargs) -> float:
     o = hidet.ops.conv2d(x, w)
     g = hidet.trace_from(o, inputs=[x, w])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 def bench_attn(params: str, *args, **kwargs) -> float:
     bs, seqlen, nhead, hdim = [int(s) for s in params.split('x')]
@@ -61,7 +65,8 @@ def bench_attn(params: str, *args, **kwargs) -> float:
     o = hidet.ops.attention(q, k, v)
     g = hidet.trace_from(o, inputs=[q, k, v])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 def bench_attn_mask_add(params: str, *args, **kwargs) -> float:
     bs, seqlen, nhead, hdim = [int(s) for s in params.split('x')]
@@ -76,7 +81,8 @@ def bench_attn_mask_add(params: str, *args, **kwargs) -> float:
     o = hidet.ops.attention(q, k, v, mask=mask)
     g = hidet.trace_from(o, inputs=[q, k, v, mask])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 def bench_reduce(params: str, *args, **kwargs) -> float:
     x_shape, axis = params.split(',', maxsplit=1)
@@ -88,7 +94,8 @@ def bench_reduce(params: str, *args, **kwargs) -> float:
     o = hidet.ops.sum(x, dims=axis)
     g = hidet.trace_from(o, inputs=[x])
     g = hidet.graph.optimize(g)
-    return g.latency()
+    g = g.cuda_graph()
+    return bench_torch_model(lambda: g.run_async(), [])
 
 bench_func_map = {
     'matmul_f16': bench_matmul_f16,
diff --git a/.github/scripts/bench/bench_utils.py b/.github/scripts/bench/bench_utils.py
index 09cf862a8..3921eea7a 100644
--- a/.github/scripts/bench/bench_utils.py
+++ b/.github/scripts/bench/bench_utils.py
@@ -35,9 +35,10 @@ def bench_torch_model(model, torch_inputs, bench_iters=100, warmup_iters=10):
     return latency
 
 def enable_compile_server(enable=True):
-    hidet.option.compile_server.addr(os.environ.get('CI_CS_HOSTNAME'))
-    hidet.option.compile_server.port(int(os.environ.get('CI_CS_PORT')))
-    hidet.option.compile_server.username(os.environ.get('CI_CS_USERNAME'))
-    hidet.option.compile_server.password(os.environ.get('CI_CS_PASSWORD'))
-    hidet.option.compile_server.repo(os.environ.get('REPO_NAME').strip(), os.environ.get('REPO_BRANCH').strip())
-    hidet.option.compile_server.enable(flag=enable)
\ No newline at end of file
+    if os.environ.get('CI_CS_HOSTNAME'):  # configure the compile server only when a host name is provided via the environment
+        hidet.option.compile_server.addr(os.environ.get('CI_CS_HOSTNAME'))
+        hidet.option.compile_server.port(int(os.environ.get('CI_CS_PORT')))  # NOTE(review): int(None) raises TypeError if CI_CS_PORT is unset
+        hidet.option.compile_server.username(os.environ.get('CI_CS_USERNAME'))
+        hidet.option.compile_server.password(os.environ.get('CI_CS_PASSWORD'))
+        hidet.option.compile_server.repo(os.environ.get('REPO_NAME').strip(), os.environ.get('REPO_BRANCH').strip())  # NOTE(review): .strip() on None raises AttributeError if either variable is unset
+        hidet.option.compile_server.enable(flag=enable)  # inside the guard: `enable` has no effect when CI_CS_HOSTNAME is unset or empty
\ No newline at end of file
diff --git a/.github/workflows/launch.yaml b/.github/workflows/launch.yaml
new file mode 100644
index 000000000..15663bf63
--- /dev/null
+++ b/.github/workflows/launch.yaml
@@ -0,0 +1,37 @@
+name: Launch CI
+
+
+on:
+  workflow_dispatch:
+    inputs:
+      shutdown_instances:
+        description: 'Shut down GPU instances when finished.'
+        required: true
+        type: boolean
+        default: true
+  issue_comment:
+    types: [created]
+  
+
+jobs:
+  trigger:  # forwards a manual dispatch or a '$hidet-ci launch' PR comment to the internal repository's regression workflow
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'issue_comment' && github.event.issue.pull_request != '' &&
+      contains(fromJSON('["MEMBER", "OWNER", "COLLABORATOR"]'), github.event.comment.author_association) &&
+      contains(github.event.comment.body, '$hidet-ci launch')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger workflow in internal repository
+        run: |
+         curl -L \
+         -X POST \
+         -H "Accept: application/vnd.github+json" \
+         -H "Authorization: Bearer ${{ secrets.GH_PAT }}" \
+         -H "X-GitHub-Api-Version: 2022-11-28" \
+         https://api.github.com/repos/${{ secrets.REPO_NAME }}/actions/workflows/regression.yaml/dispatches \
+         -d "{\"ref\": \"main\", \"inputs\": {\"shutdown_instances\": \"${{ env.SHUTDOWN }}\", \"source_repo\": \"${{ env.SOURCE_REPO }}\", \"source_ref\": \"${{ env.SOURCE_REF }}\"}}"
+        env:
+          SHUTDOWN: ${{ github.event_name == 'workflow_dispatch' && inputs.shutdown_instances || github.event_name == 'issue_comment' && !contains(github.event.comment.body, '--keep') }} # each event is matched explicitly so a dispatch with shutdown_instances=false stays false
+          SOURCE_REPO: ${{ github.repository }}
+          SOURCE_REF: ${{ github.event_name == 'workflow_dispatch' && github.ref_name || github.event.issue.number }} # branch name for manual dispatch, PR number for comment triggers
diff --git a/.github/workflows/regression.yaml b/.github/workflows/regression.yaml
index 4d4729c29..210ec3d30 100644
--- a/.github/workflows/regression.yaml
+++ b/.github/workflows/regression.yaml
@@ -8,16 +8,19 @@ on:
         required: true
         type: boolean
         default: true
-  issue_comment:
-    types: [created]
+      source_repo:
+        description: 'Source Repository Name. E.g., hidet-org/hidet'
+        required: true
+        type: string
+        default: 'this'
+      source_ref:
+        description: 'Source repository ref (Branch name or PR number).'
+        required: true
+        type: string
+        default: 'this'
 
 jobs:
   start_instances:
-    if: |
-      github.event_name == 'workflow_dispatch' || 
-      github.event_name == 'issue_comment' && github.event.issue.pull_request != '' &&
-      contains(fromJSON('["MEMBER", "OWNER", "COLLABORATOR"]'), github.event.comment.author_association) &&
-      contains(github.event.comment.body, '$hidet-ci launch')
     runs-on: ubuntu-latest
     outputs:
       started_instances: ${{ steps.run_py_script.outputs.started_instances }}
@@ -61,13 +64,20 @@ jobs:
     container:
       image: nvcr.io/nvidia/pytorch:23.10-py3
       options: --gpus all
+    outputs:
+      commit_time: ${{ steps.get_commit_info.outputs.commit_time }}
+      commit_author: ${{ steps.get_commit_info.outputs.commit_author }}
+      commit_sha: ${{ steps.get_commit_info.outputs.commit_sha }}
     steps:
       - name: Checkout repo
         uses: actions/checkout@v4
         with:
+          repository: |
+            ${{ inputs.source_repo == 'this' && github.repository ||
+            inputs.source_repo }}
           ref: |
-            ${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
-            format('refs/pull/{0}/head', github.event.issue.number) }}
+            ${{ inputs.source_repo == 'this' && github.ref_name || 
+            format('refs/pull/{0}/head', inputs.source_ref) }}
           path: hidet
       
       - name: Checkout models
@@ -114,10 +124,8 @@ jobs:
           python hidet/.github/scripts/run_tests.py
         env:
           HW_CONFIG: ${{ matrix.hw_configs }}
-          REPO_NAME: ${{ github.repository }}
-          REPO_BRANCH: |
-            ${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
-            format('pull/{0}', github.event.issue.number) }}
+          REPO_NAME: ${{ inputs.source_repo == 'this' && github.repository || inputs.source_repo }}
+          REPO_BRANCH: ${{ inputs.source_repo == 'this' && github.ref_name || format('pull/{0}', inputs.source_ref) }}
           CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
           CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
           CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
@@ -130,6 +138,17 @@ jobs:
           name: run_configs_${{ matrix.hw_configs }}
           path: run_configs.json
           retention-days: 1
+      
+      - name: Retrieve commit properties
+        id: get_commit_info  # step outputs written below: commit_time, commit_author, commit_sha
+        run: |
+          cd hidet
+          COMMIT_TIME=$(git log -1 --format=%cd --date=format:'%Y-%m-%d %H:%M:%S')
+          COMMIT_AUTHOR=$(git log -1 --format=%an)
+          COMMIT_SHA=$(git log -1 --format=%H)
+          echo "commit_time=$COMMIT_TIME" >> "$GITHUB_OUTPUT"
+          echo "commit_author=$COMMIT_AUTHOR" >> "$GITHUB_OUTPUT"
+          echo "commit_sha=$COMMIT_SHA" >> "$GITHUB_OUTPUT"
   
   upload_results:
     runs-on: ubuntu-latest
@@ -143,26 +162,15 @@ jobs:
 
       - name: Download run configs
         uses: actions/download-artifact@v3
-      
-      - name: Setup ENV
-        run: |
-          COMMIT_TIME=$(git log -1 --format=%cd --date=format:'%Y-%m-%d %H:%M:%S')
-          COMMIT_AUTHOR=$(git log -1 --format=%an)
-          COMMIT_SHA=$(git log -1 --format=%H)
-          echo "COMMIT_TIME=$COMMIT_TIME" >> $GITHUB_ENV
-          echo "COMMIT_AUTHOR=$COMMIT_AUTHOR" >> $GITHUB_ENV
-          echo "COMMIT_SHA=$COMMIT_SHA" >> $GITHUB_ENV
 
       - name: Run main Python script
         run: python ./.github/scripts/upload_results.py
         env:
-          REPO_NAME: ${{ github.repository }}
-          REPO_BRANCH: |
-            ${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
-            format('pull/{0}', github.event.issue.number) }}
-          COMMIT_SHA: ${{ env.COMMIT_SHA }}
-          COMMIT_TIME: ${{ env.COMMIT_TIME }}
-          COMMIT_AUTHOR: ${{ env.COMMIT_AUTHOR }}
+          REPO_NAME: ${{ inputs.source_repo == 'this' && github.repository || inputs.source_repo }}
+          REPO_BRANCH: ${{ inputs.source_repo == 'this' && github.ref_name || format('pull/{0}', inputs.source_ref) }}
+          COMMIT_SHA: ${{ needs.run_tests.outputs.commit_sha }}
+          COMMIT_TIME: ${{ needs.run_tests.outputs.commit_time }}
+          COMMIT_AUTHOR: ${{ needs.run_tests.outputs.commit_author }}
           HW_CONFIGS: ${{ needs.start_instances.outputs.hw_configs }}
           CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
           CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
@@ -170,10 +178,7 @@ jobs:
           CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
 
   stop_instances:
-    if: |
-      github.event_name == 'workflow_dispatch' && inputs.shutdown_instances || 
-      github.event_name == 'issue_comment' && github.event.issue.pull_request != '' &&
-      !contains(github.event.comment.body, '--keep')
+    if: inputs.shutdown_instances
     runs-on: ubuntu-latest
     needs: [start_instances, run_tests]
     steps:
diff --git a/python/hidet/graph/ops/fusion/apply_prologue_epilogue.py b/python/hidet/graph/ops/fusion/apply_prologue_epilogue.py
index 10d9c6fca..2b2a7f74e 100644
--- a/python/hidet/graph/ops/fusion/apply_prologue_epilogue.py
+++ b/python/hidet/graph/ops/fusion/apply_prologue_epilogue.py
@@ -771,7 +771,6 @@ def process_module(self, ir_module: IRModule) -> IRModule:
         try:
             rewriter = PrologueEpilogueFuseRewriter(self.fused_task, prologues, epilogues, tensor_map, marks)
             ir_module = rewriter.rewrite(ir_module)
-            print('success')
             return ir_module
         except CanNotFuseError:
             pass
diff --git a/python/hidet/ir/schedulers/cuda/scheduler.py b/python/hidet/ir/schedulers/cuda/scheduler.py
index cae20ab74..558b56aa0 100644
--- a/python/hidet/ir/schedulers/cuda/scheduler.py
+++ b/python/hidet/ir/schedulers/cuda/scheduler.py
@@ -35,7 +35,13 @@ def schedule_grid_compute(self, node: GridCompute, tensor_map: Dict[TensorNode,
         grid_dim: Expr = (prod(node.shape) + block_dim - 1) // block_dim
 
         if self.task is not None:
-            name = f'{self.task.name}_compute_{node.name}'
+            from hidet.graph.ops.fusion.fused_operator import FusedTask
+
+            if isinstance(self.task, FusedTask):
+                fused_name = self.task.attrs['fused_ops'].replace(' ', '_')
+                name = f'fused_{fused_name}_{node.name}'
+            else:
+                name = f'{self.task.name}_{node.name}'
         else:
             name = f'compute_{node.name}'