diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index 28641951..91f599a7 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -82,10 +82,10 @@ def TaskflowReturnOp : TaskflowOpBase<"return", [Terminator]> {
 
 // Defines a uniform computation and control task operation within a Taskflow graph.
 def TaskflowTaskOp : TaskflowOpBase<"task", [
+    IsolatedFromAbove,
     AttrSizedOperandSegments,
     AttrSizedResultSegments,
-    SingleBlockImplicitTerminator<"TaskflowYieldOp">,
-    NoMemoryEffect,
+    SingleBlockImplicitTerminator<"TaskflowYieldOp">
 ]>{
   let summary = "Uniform computation and control task operation within a Taskflow graph";
 
@@ -156,10 +156,11 @@ def TaskflowDriveOp : TaskflowOpBase<"drive", [Pure]>{
   let results = (outs TaskflowPacketType:$target);
 
   let assemblyFormat = [{
-    $source attr-dict `:` type($source) `->` type($target)
+    $source attr-dict `:` qualified(type($source)) `->` qualified(type($target))
   }];
 }
 
+// Defines the data dependency edge operation that carries data dependencies between tasks in a Taskflow graph.
 def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{
   let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph";
   let description = [{
@@ -175,4 +176,30 @@ def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{
   }];
 }
 
+// Defines the emit operation to emit data and control packets from a task before termination (used in streaming or hierarchical control cases).
+// Deliberately NOT marked Pure: the op has no results, so a Pure emit would be trivially dead and
+// DCE/canonicalization could erase it, silently dropping the emitted packets.
+def TaskflowEmitOp : TaskflowOpBase<"emit", []>{
+  let summary = "Emit operation for taskflow.task to emit data and control packets before termination";
+  let description = [{
+    Emits control and data packets from a task before its termination.
+    This is useful in streaming or hierarchical control scenarios where tasks need to send out packets without terminating the entire task.
+    
+    Example:
+      taskflow.task(...) {
+        ...
+        taskflow.emit %control_packet, %data_packet : !taskflow.packet<...>, i32
+        taskflow.yield
+      }
+  }];
+  let arguments = (ins Variadic<AnyType>:$results);
+  let assemblyFormat = [{
+    ($results^ `:` type($results))? attr-dict
+  }];
+  let builders = [
+    // Default builder for empty emit.
+    OpBuilder<(ins), [{ build($_builder, $_state, ValueRange{}); }]>
+  ];
+}
+
 #endif // TASKFLOW_OPS_TD
\ No newline at end of file
diff --git a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp b/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp
index a4489f44..7b4e851f 100644
--- a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp
+++ b/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp
@@ -89,8 +89,7 @@ static void collectExternalValuesForOp(
     // Skips values defined inside graph ops or nested regions.
     Operation *def_op = operand.getDefiningOp();
     if (def_op) {
-      if (!graph_op_set.contains(def_op) &&
-          def_op->getBlock()->getParentOp() == func_op.getOperation()) {
+      if (def_op->getBlock()->getParentOp() == func_op.getOperation()) {
         external_values.insert(operand);
       }
     }
@@ -147,9 +146,22 @@ static SmallVector<Value> identifyGraphInputs(ArrayRef<Operation *> graph_ops,
                                               func::FuncOp func_op) {
   llvm::SetVector<Value> input_set;
   llvm::DenseSet<Operation *> graph_op_set(graph_ops.begin(), graph_ops.end());
-
+  DenseMap<Operation *, SmallVector<Value>> external_values_per_op =
+      collectExternalValuesPerOp(graph_ops, func_op);
   for (Operation *op : graph_ops) {
-    collectExternalValuesForOp(op, graph_op_set, func_op, input_set);
+    for (Value external_val : external_values_per_op[op]) {
+      if (external_val.getDefiningOp()) {
+        if (!graph_op_set.contains(external_val.getDefiningOp())) {
+          input_set.insert(external_val);
+        }
+      } else {
+        if (isa<BlockArgument>(external_val) &&
+            external_val.getParentBlock()->getParentOp() ==
+                func_op.getOperation()) {
+          input_set.insert(external_val);
+        }
+      }
+    }
   }
 
   return SmallVector<Value>(input_set.begin(), input_set.end());
@@ -396,24 +408,9 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) {
     return success();
   }
 
-  llvm::errs() << "Converting function: " << func_op.getName() << "\n";
-  llvm::errs() << "Collected taskflow graph operations:\n";
-  for (Operation *op : graph_ops) {
-    llvm::errs() << "  " << *op << "\n";
-  }
-
   SmallVector<Value> graph_inputs = identifyGraphInputs(graph_ops, func_op);
   SmallVector<Value> graph_outputs = identifyGraphOutputs(graph_ops, func_op);
 
-  llvm::errs() << "Identified graph inputs:\n";
-  for (Value input : graph_inputs) {
-    llvm::errs() << "  " << input << "\n";
-  }
-  llvm::errs() << "Identified graph outputs:\n";
-  for (Value output : graph_outputs) {
-    llvm::errs() << "  " << output << "\n";
-  }
-
   // Finds insertion point: after the last operation that defines a graph input.
   Operation *insertion_point = nullptr;
   for (Value input : graph_inputs) {
@@ -441,10 +438,6 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) {
   // Step 2 & 3 & 4: Creates the taskflow.graph op.
   auto result = buildTaskflowGraph(builder, func_op, graph_ops, graph_inputs,
                                    graph_outputs, op_external_values);
-  llvm::errs() << "Converted function to TaskFlow graph.\n";
-  llvm::errs() << "Resulting function:\n";
-  func_op.print(llvm::errs());
-  llvm::errs() << "\n";
 
   return result;
 }
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop-taskflow.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop-taskflow.mlir
new file mode 100644
index 00000000..615c7c50
--- /dev/null
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop-taskflow.mlir
@@ -0,0 +1,73 @@
+// RUN: mlir-neura-opt %s | FileCheck %s
+
+module {
+  func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c2_i32 = arith.constant 2 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %alloca = memref.alloca() : memref<i32>
+    %alloca_0 = memref.alloca() : memref<4x8xi32>
+    taskflow.graph(%c0_i32, %alloca_0, %alloca, %c2_i32, %c8_i32) {
+    ^bb0(%arg0: i32, %arg1: memref<4x8xi32>, %arg2: memref<i32>, %arg3: i32, %arg4: i32):
+      %data_outs = "taskflow.task"(%arg0) <{operandSegmentSizes = array<i32: 0, 1>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_0"}> ({
+      ^bb0(%arg5: i32):
+        %7 = affine.for %arg6 = 0 to 5 iter_args(%arg7 = %arg5) -> (i32) {
+          %8 = arith.index_cast %arg6 : index to i32
+          %9 = arith.addi %arg7, %8 : i32
+          affine.yield %9 : i32
+        }
+        taskflow.yield %7 : i32
+      }) : (i32) -> i32
+      %1 = taskflow.channel %data_outs : i32 -> i32
+      %control_outs, %data_outs_1 = "taskflow.task"(%arg4) <{operandSegmentSizes = array<i32: 0, 1>, resultSegmentSizes = array<i32: 1, 1>, task_name = "Controller_1"}> ({
+      ^bb0(%arg5: i32):
+        affine.for %arg6 = 0 to 4 {
+          %7 = arith.index_cast %arg6 : index to i32
+          %8 = arith.muli %7, %arg5 : i32
+          taskflow.emit %arg6, %8 : index, i32
+        }
+        taskflow.yield
+      }) : (i32) -> (!taskflow.packet<index>, i32)
+      %2 = taskflow.channel %data_outs_1 : i32 -> i32
+      %3 = taskflow.channel %data_outs_1 : i32 -> i32
+      %4 = taskflow.drive %control_outs : !taskflow.packet<index> -> !taskflow.packet<index>
+      %5 = taskflow.drive %control_outs : !taskflow.packet<index> -> !taskflow.packet<index>
+      %data_outs_2 = "taskflow.task"(%5, %3, %arg1) <{operandSegmentSizes = array<i32: 1, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_2"}> ({
+      ^bb0(%arg5: index, %arg6: i32, %arg7: memref<4x8xi32>):
+        affine.for %arg8 = 0 to 8 {
+          %7 = arith.index_cast %arg8 : index to i32
+          %8 = arith.addi %arg6, %7 : i32
+          memref.store %8, %arg7[%arg5, %arg8] : memref<4x8xi32>
+        }
+        taskflow.yield %arg7 : memref<4x8xi32>
+      }) : (!taskflow.packet<index>, i32, memref<4x8xi32>) -> memref<4x8xi32>
+      %6 = taskflow.channel %data_outs_2 : memref<4x8xi32> -> memref<4x8xi32>
+      "taskflow.task"(%4, %2, %6, %1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 1, 5>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_3"}> ({
+      ^bb0(%arg5: index, %arg6: i32, %arg7: memref<4x8xi32>, %arg8: i32, %arg9: memref<i32>, %arg10: i32):
+        affine.for %arg11 = 0 to 8 {
+          %7 = memref.load %arg7[%arg5, %arg11] : memref<4x8xi32>
+          %8 = arith.addi %7, %arg8 : i32
+          %c3 = arith.constant 3 : index
+          %9 = arith.cmpi eq, %arg5, %c3 : index
+          %c7 = arith.constant 7 : index
+          %10 = arith.cmpi eq, %arg11, %c7 : index
+          %11 = arith.andi %9, %10 : i1
+          scf.if %11 {
+            memref.store %8, %arg9[] : memref<i32>
+            %12 = arith.muli %8, %arg10 : i32
+            memref.store %12, %arg9[] : memref<i32>
+          }
+        }
+        taskflow.yield %arg9 : memref<i32>
+      }) : (!taskflow.packet<index>, i32, memref<4x8xi32>, i32, memref<i32>, i32) -> memref<i32>
+    } : (i32, memref<4x8xi32>, memref<i32>, i32, i32) -> ()
+    %0 = affine.load %alloca[] : memref<i32>
+    return %0 : i32
+  }
+}
+
+// CHECK-LABEL: func.func @_Z21irregularLoopExample1v
+// CHECK: taskflow.graph
+// CHECK: taskflow.task
+// CHECK: taskflow.channel
+// CHECK: taskflow.yield
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp
new file mode 100644
index 00000000..580e1971
--- /dev/null
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp
@@ -0,0 +1,35 @@
+using namespace std;
+
+#define M 4
+#define N 8
+#define K 5
+
+// Example 1: imperfectly nested loops with a RAW dependency between sibling inner loops; returns B[M-1][N-1].
+int irregularLoopExample1() {
+  // Plain local arrays stand in for: vector<vector<int>> A(M, vector<int>(N, 0));
+  int A[M][N];
+  int B[M][N];
+  int temp[N];
+
+  for (int i = 0; i < M; i++) {
+    // Independent loop 1: fills row i of A and zeroes temp.
+    for (int j = 0; j < N; j++) {
+      A[i][j] = i * N + j;
+      temp[j] = 0; // Initialize temp
+    }
+
+    // Reduction between the two j-loops: sum = 0 + 1 + ... + (K - 1); does not depend on i or j.
+    int sum = 0;
+    for (int k = 0; k < K; k++) {
+      sum += k;
+    }
+
+    // Independent loop 2: consumes the results of loop 1 and the reduction above.
+    // RAW dependency: reads the A[i][j] and temp[j] values written by loop 1.
+    for (int j = 0; j < N; j++) {
+      B[i][j] = A[i][j] + temp[j] + sum; // Read temp[j] (RAW dependency)
+      B[i][j] *= 2;
+    }
+  }
+  return B[M - 1][N - 1];
+}
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir
new file mode 100644
index 00000000..e1f27dba
--- /dev/null
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir
@@ -0,0 +1,84 @@
+// RUN: mlir-neura-opt %s | FileCheck %s
+
+module {
+  func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    taskflow.graph(%arg0, %arg5, %arg1, %arg2, %arg6, %arg9, %arg3, %arg7, %arg4, %arg8) {
+    ^bb0(%arg10: memref<?x8x6xi32>, %arg11: memref<?xi32>, %arg12: memref<?x8x5xi32>, %arg13: memref<?x8x5xi32>, %arg14: memref<?xi32>, %arg15: memref<?xi32>, %arg16: memref<?x7xi32>, %arg17: memref<?xi32>, %arg18: memref<?x9xi32>, %arg19: memref<?xi32>):
+      %data_outs = "taskflow.task"(%arg10, %arg11) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_0"}> ({
+      ^bb0(%arg20: memref<?x8x6xi32>, %arg21: memref<?xi32>):
+        affine.for %arg22 = 0 to 4 {
+          affine.for %arg23 = 0 to 8 {
+            affine.for %arg24 = 0 to 6 {
+              %4 = affine.load %arg20[%arg22, %arg23, %arg24] : memref<?x8x6xi32>
+              affine.store %4, %arg21[%arg24] : memref<?xi32>
+            }
+          }
+        }
+        taskflow.yield %arg21 : memref<?xi32>
+      }) : (memref<?x8x6xi32>, memref<?xi32>) -> memref<?xi32>
+      %1 = taskflow.channel %data_outs : memref<?xi32> -> memref<?xi32>
+      %data_outs_0 = "taskflow.task"(%arg12, %arg13, %arg14) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_1"}> ({
+      ^bb0(%arg20: memref<?x8x5xi32>, %arg21: memref<?x8x5xi32>, %arg22: memref<?xi32>):
+        affine.for %arg23 = 0 to 4 {
+          affine.for %arg24 = 0 to 8 {
+            affine.for %arg25 = 0 to 5 {
+              %4 = affine.load %arg20[%arg23, %arg24, %arg25] : memref<?x8x5xi32>
+              %5 = affine.load %arg21[%arg23, %arg24, %arg25] : memref<?x8x5xi32>
+              %6 = arith.addi %4, %5 : i32
+              affine.store %6, %arg22[%arg25] : memref<?xi32>
+            }
+          }
+        }
+        taskflow.yield %arg22 : memref<?xi32>
+      }) : (memref<?x8x5xi32>, memref<?x8x5xi32>, memref<?xi32>) -> memref<?xi32>
+      %2 = taskflow.channel %data_outs_0 : memref<?xi32> -> memref<?xi32>
+      %data_outs_1 = "taskflow.task"(%1, %2, %arg15) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_2"}> ({
+      ^bb0(%arg20: memref<?xi32>, %arg21: memref<?xi32>, %arg22: memref<?xi32>):
+        affine.for %arg23 = 0 to 4 {
+          affine.for %arg24 = 0 to 8 {
+            affine.for %arg25 = 0 to 6 {
+              %4 = affine.load %arg20[%arg25] : memref<?xi32>
+              %5 = affine.load %arg21[%arg25] : memref<?xi32>
+              %6 = arith.addi %4, %5 : i32
+              %7 = affine.load %arg22[0] : memref<?xi32>
+              %8 = arith.addi %7, %6 : i32
+              affine.store %8, %arg22[0] : memref<?xi32>
+            }
+          }
+        }
+        taskflow.yield %arg22 : memref<?xi32>
+      }) : (memref<?xi32>, memref<?xi32>, memref<?xi32>) -> memref<?xi32>
+      %data_outs_2 = "taskflow.task"(%arg16, %arg17) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_3"}> ({
+      ^bb0(%arg20: memref<?x7xi32>, %arg21: memref<?xi32>):
+        affine.for %arg22 = 0 to 4 {
+          affine.for %arg23 = 0 to 7 {
+            %4 = affine.load %arg20[%arg22, %arg23] : memref<?x7xi32>
+            affine.store %4, %arg21[%arg23] : memref<?xi32>
+          }
+        }
+        taskflow.yield %arg21 : memref<?xi32>
+      }) : (memref<?x7xi32>, memref<?xi32>) -> memref<?xi32>
+      %3 = taskflow.channel %data_outs_2 : memref<?xi32> -> memref<?xi32>
+      %data_outs_3 = "taskflow.task"(%arg18, %3, %arg19) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_4"}> ({
+      ^bb0(%arg20: memref<?x9xi32>, %arg21: memref<?xi32>, %arg22: memref<?xi32>):
+        affine.for %arg23 = 0 to 4 {
+          affine.for %arg24 = 0 to 9 {
+            %4 = affine.load %arg20[%arg23, %arg24] : memref<?x9xi32>
+            %5 = affine.load %arg21[%arg24] : memref<?xi32>
+            %6 = arith.addi %4, %5 : i32
+            affine.store %6, %arg22[%arg24] : memref<?xi32>
+          }
+        }
+        taskflow.yield %arg22 : memref<?xi32>
+      }) : (memref<?x9xi32>, memref<?xi32>, memref<?xi32>) -> memref<?xi32>
+    } : (memref<?x8x6xi32>, memref<?xi32>, memref<?x8x5xi32>, memref<?x8x5xi32>, memref<?xi32>, memref<?xi32>, memref<?x7xi32>, memref<?xi32>, memref<?x9xi32>, memref<?xi32>) -> ()
+    %0 = affine.load %arg9[0] : memref<?xi32>
+    return %0 : i32
+  }
+}
+
+// CHECK-LABEL: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_
+// CHECK: taskflow.graph
+// CHECK: taskflow.task
+// CHECK: taskflow.channel
+// CHECK: taskflow.yield
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.cpp b/test/multi-cgra/taskflow/multi-nested/multi-nested.cpp
new file mode 100644
index 00000000..e228ba86
--- /dev/null
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.cpp
@@ -0,0 +1,25 @@
+// Pure nested loop structure: loops A and B contain only other loops (no statements between them); returns *result.
+int pureNestedLoopExample(int d1[4][8][6], int d2[4][8][5], int d3[4][8][5],
+                          int d4[4][7], int d5[4][9], int m1[6], int m2[5],
+                          int m3[7], int m4[9], int *result) {
+  for (int i = 0; i < 4; i++) {     // Loop A
+    for (int j = 0; j < 8; j++) {   // Loop B
+      for (int k = 0; k < 6; k++) { // Loop C: copies d1[i][j][*] into m1
+        m1[k] = d1[i][j][k];
+      }
+      for (int k = 0; k < 5; k++) { // Loop D: m2 = d2[i][j][*] + d3[i][j][*]
+        m2[k] = d2[i][j][k] + d3[i][j][k];
+      }
+      for (int k = 0; k < 6; k++) { // Loop E; NOTE(review): reads m2[5] but m2 is declared int[5] - confirm intended bound
+        *result += m1[k] + m2[k];
+      }
+    }
+    for (int j = 0; j < 7; j++) { // Loop F: copies d4[i][*] into m3
+      m3[j] = d4[i][j];
+    }
+    for (int j = 0; j < 9; j++) { // Loop G; NOTE(review): reads m3[7..8] but m3 is declared int[7] - confirm intended bound
+      m4[j] = d5[i][j] + m3[j];
+    }
+  }
+  return *result;
+}
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/resenet/resnet.mlir b/test/multi-cgra/taskflow/resenet/resnet.mlir
index f537fe8f..8042874c 100644
--- a/test/multi-cgra/taskflow/resenet/resnet.mlir
+++ b/test/multi-cgra/taskflow/resenet/resnet.mlir
@@ -5,70 +5,71 @@
 
 // RUN: FileCheck %s --input-file=%t-resnet-taskflow.mlir
 
-// CHECK:      %2 = taskflow.graph(%arg0, %cst_1, %cst_0, %1, %0, %cst) {
-// CHECK-NEXT:   ^bb0(%arg1: tensor<1x64x8x8xf32>, %arg2: f32, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<1x64x8x8xf32>, %arg5: tensor<1x64x8x8xf32>, %arg6: tensor<64x64x3x3xf32>):
-// CHECK-NEXT:     %data_outs = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "task_0"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32):
-// CHECK-NEXT:       %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] {
-// CHECK-NEXT:       ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
-// CHECK-NEXT:         tensor.yield %arg8 : f32
-// CHECK-NEXT:       } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32>
-// CHECK-NEXT:       taskflow.yield %padded : tensor<1x64x10x10xf32>
-// CHECK-NEXT:     }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32>
-// CHECK-NEXT:     %3 = taskflow.channel %data_outs : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32>
-// CHECK-NEXT:     %data_outs_2 = "taskflow.task"(%arg3, %arg4, %3) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "conv2d_1"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>):
-// CHECK-NEXT:       %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:       taskflow.yield %9 : tensor<1x64x8x8xf32>
-// CHECK-NEXT:     }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %4 = taskflow.channel %data_outs_2 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %data_outs_3 = "taskflow.task"(%arg5, %arg2, %4) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "generic_2"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>):
-// CHECK-NEXT:       %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_2 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) {
-// CHECK-NEXT:       ^bb0(%in: f32, %out: f32):
-// CHECK-NEXT:         %10 = arith.cmpf ugt, %in, %arg8 : f32
-// CHECK-NEXT:         %11 = arith.select %10, %in, %arg8 : f32
-// CHECK-NEXT:         linalg.yield %11 : f32
-// CHECK-NEXT:       } -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:       taskflow.yield %9 : tensor<1x64x8x8xf32>
-// CHECK-NEXT:     }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %5 = taskflow.channel %data_outs_3 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %data_outs_4 = "taskflow.task"(%arg2, %5) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "task_3"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: f32, %arg8: tensor<1x64x8x8xf32>):
-// CHECK-NEXT:       %padded = tensor.pad %data_outs_3 low[0, 0, 1, 1] high[0, 0, 1, 1] {
-// CHECK-NEXT:       ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
-// CHECK-NEXT:         tensor.yield %arg7 : f32
-// CHECK-NEXT:       } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32>
-// CHECK-NEXT:       taskflow.yield %padded : tensor<1x64x10x10xf32>
-// CHECK-NEXT:     }) : (f32, tensor<1x64x8x8xf32>) -> tensor<1x64x10x10xf32>
-// CHECK-NEXT:     %6 = taskflow.channel %data_outs_4 : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32>
-// CHECK-NEXT:     %data_outs_5 = "taskflow.task"(%arg6, %arg4, %6) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "conv2d_4"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>):
-// CHECK-NEXT:       %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs_4, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:       taskflow.yield %9 : tensor<1x64x8x8xf32>
-// CHECK-NEXT:     }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %7 = taskflow.channel %data_outs_5 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %data_outs_6 = "taskflow.task"(%arg1, %arg5, %7) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "generic_5"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x8x8xf32>):
-// CHECK-NEXT:       %9 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_5, %arg7 : tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) {
-// CHECK-NEXT:       ^bb0(%in: f32, %in_8: f32, %out: f32):
-// CHECK-NEXT:         %10 = arith.addf %in, %in_8 : f32
-// CHECK-NEXT:         linalg.yield %10 : f32
-// CHECK-NEXT:       } -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:       taskflow.yield %9 : tensor<1x64x8x8xf32>
-// CHECK-NEXT:     }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %8 = taskflow.channel %data_outs_6 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     %data_outs_7 = "taskflow.task"(%arg5, %arg2, %8) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "generic_6"}> ({
-// CHECK-NEXT:     ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>):
-// CHECK-NEXT:       %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_6 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) {
-// CHECK-NEXT:       ^bb0(%in: f32, %out: f32):
-// CHECK-NEXT:         %10 = arith.cmpf ugt, %in, %arg8 : f32
-// CHECK-NEXT:         %11 = arith.select %10, %in, %arg8 : f32
-// CHECK-NEXT:         linalg.yield %11 : f32
-// CHECK-NEXT:       } -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:       taskflow.yield %9 : tensor<1x64x8x8xf32>
-// CHECK-NEXT:     }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:     taskflow.return %data_outs_7 : tensor<1x64x8x8xf32>
-// CHECK-NEXT:   } : (tensor<1x64x8x8xf32>, f32, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<64x64x3x3xf32>) -> tensor<1x64x8x8xf32>
-// CHECK-NEXT:   return %2 : tensor<1x64x8x8xf32>
+// CHECK:          %2 = taskflow.graph(%arg0, %cst_1, %cst_0, %1, %0, %cst) {
+// CHECK-NEXT:     ^bb0(%arg1: tensor<1x64x8x8xf32>, %arg2: f32, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<1x64x8x8xf32>, %arg5: tensor<1x64x8x8xf32>, %arg6: tensor<64x64x3x3xf32>):
+// CHECK-NEXT:       %data_outs = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "task_0"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32):
+// CHECK-NEXT:         %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] {
+// CHECK-NEXT:         ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
+// CHECK-NEXT:           tensor.yield %arg8 : f32
+// CHECK-NEXT:         } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32>
+// CHECK-NEXT:         taskflow.yield %padded : tensor<1x64x10x10xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32>
+// CHECK-NEXT:       %3 = taskflow.channel %data_outs : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32>
+// CHECK-NEXT:       %data_outs_2 = "taskflow.task"(%3, %arg3, %arg4) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "conv2d_1"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x10x10xf32>, %arg8: tensor<64x64x3x3xf32>, %arg9: tensor<1x64x8x8xf32>):
+// CHECK-NEXT:         %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg7, %arg8 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg9 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:         taskflow.yield %9 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %4 = taskflow.channel %data_outs_2 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %data_outs_3 = "taskflow.task"(%4, %arg5, %arg2) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "generic_2"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: f32):
+// CHECK-NEXT:         %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg7 : tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) {
+// CHECK-NEXT:         ^bb0(%in: f32, %out: f32):
+// CHECK-NEXT:           %10 = arith.cmpf ugt, %in, %arg9 : f32
+// CHECK-NEXT:           %11 = arith.select %10, %in, %arg9 : f32
+// CHECK-NEXT:           linalg.yield %11 : f32
+// CHECK-NEXT:         } -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:         taskflow.yield %9 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, f32) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %5 = taskflow.channel %data_outs_3 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %data_outs_4 = "taskflow.task"(%5, %arg2) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "task_3"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32):
+// CHECK-NEXT:         %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] {
+// CHECK-NEXT:         ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
+// CHECK-NEXT:           tensor.yield %arg8 : f32
+// CHECK-NEXT:         } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32>
+// CHECK-NEXT:         taskflow.yield %padded : tensor<1x64x10x10xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32>
+// CHECK-NEXT:       %6 = taskflow.channel %data_outs_4 : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32>
+// CHECK-NEXT:       %data_outs_5 = "taskflow.task"(%6, %arg6, %arg4) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "conv2d_4"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x10x10xf32>, %arg8: tensor<64x64x3x3xf32>, %arg9: tensor<1x64x8x8xf32>):
+// CHECK-NEXT:         %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg7, %arg8 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg9 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:         taskflow.yield %9 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %7 = taskflow.channel %data_outs_5 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %data_outs_6 = "taskflow.task"(%7, %arg1, %arg5) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "generic_5"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x8x8xf32>):
+// CHECK-NEXT:         %9 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg7, %arg8 : tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) outs(%arg9 : tensor<1x64x8x8xf32>) {
+// CHECK-NEXT:         ^bb0(%in: f32, %in_8: f32, %out: f32):
+// CHECK-NEXT:           %10 = arith.addf %in, %in_8 : f32
+// CHECK-NEXT:           linalg.yield %10 : f32
+// CHECK-NEXT:         } -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:         taskflow.yield %9 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %8 = taskflow.channel %data_outs_6 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       %data_outs_7 = "taskflow.task"(%8, %arg5, %arg2) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "generic_6"}> ({
+// CHECK-NEXT:       ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: f32):
+// CHECK-NEXT:         %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg7 : tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) {
+// CHECK-NEXT:         ^bb0(%in: f32, %out: f32):
+// CHECK-NEXT:           %10 = arith.cmpf ugt, %in, %arg9 : f32
+// CHECK-NEXT:           %11 = arith.select %10, %in, %arg9 : f32
+// CHECK-NEXT:           linalg.yield %11 : f32
+// CHECK-NEXT:         } -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:         taskflow.yield %9 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:       }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, f32) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:       taskflow.return %data_outs_7 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:     } : (tensor<1x64x8x8xf32>, f32, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<64x64x3x3xf32>) -> tensor<1x64x8x8xf32>
+// CHECK-NEXT:     return %2 : tensor<1x64x8x8xf32>
+// CHECK-NEXT:   }
 // CHECK-NEXT: }
\ No newline at end of file
diff --git a/tools/mlir-neura-opt/CMakeLists.txt b/tools/mlir-neura-opt/CMakeLists.txt
index 70c06a51..84980cd8 100644
--- a/tools/mlir-neura-opt/CMakeLists.txt
+++ b/tools/mlir-neura-opt/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LIBS
         MLIRNeuraTransforms
         MLIRConversion
         MLIRNeura
+        MLIRTaskflow
         MLIRTransforms
         MLIROptLib
         MLIRPass
diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp
index cd824879..55672b7c 100644
--- a/tools/mlir-neura-opt/mlir-neura-opt.cpp
+++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp
@@ -17,6 +17,7 @@
 #include "NeuraDialect/Architecture/ArchitectureSpec.h"
 #include "NeuraDialect/NeuraDialect.h"
 #include "NeuraDialect/NeuraPasses.h"
+#include "TaskflowDialect/TaskflowDialect.h"
 
 // Global variable to store architecture spec file path
 static std::string architecture_spec_file;
@@ -71,6 +72,7 @@ int main(int argc, char **argv) {
   registry.insert<mlir::ml_program::MLProgramDialect>();
   registry.insert<mlir::tensor::TensorDialect>();
   registry.insert<mlir::linalg::LinalgDialect>();
+  registry.insert<mlir::taskflow::TaskflowDialect>();
 
   mlir::neura::registerPasses();
   mlir::registerPasses();