diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h index f17b9fcb..550e6092 100644 --- a/include/Conversion/ConversionPasses.h +++ b/include/Conversion/ConversionPasses.h @@ -21,7 +21,7 @@ std::unique_ptr createLowerBuiltinToNeuraPass(); std::unique_ptr createLowerAffineToNeuraPass(); // TaskFlow Conversion Passes. -std::unique_ptr createConvertLinalgToTaskflowPass(); +std::unique_ptr createConvertAffineToTaskflowPass(); #define GEN_PASS_REGISTRATION #include "Conversion/ConversionPasses.h.inc" diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td index e1e477ca..a341d9fe 100644 --- a/include/Conversion/ConversionPasses.td +++ b/include/Conversion/ConversionPasses.td @@ -47,16 +47,29 @@ def LowerAffineToNeura : Pass<"lower-affine-to-neura", "func::FuncOp">{ //=========================================================// // TaskFlow Conversion Passes. //=========================================================// -def ConvertLinalgToTaskflow : Pass<"convert-linalg-to-taskflow", "ModuleOp">{ - let summary = "Convert Linalg operations to Taskflow dialect"; + +def ConvertAffineToTaskflow : Pass<"convert-affine-to-taskflow", "ModuleOp">{ + let summary = "Convert top-level affine.for operations to Taskflow dialect"; let description = [{ - Extracts compute-intensive linalg operations and wraps them into a Taskflow graph for spatial architecture execution. + This pass converts top-level affine.for loops in a function into + taskflow.task operations within a taskflow.graph. Each top-level loop + becomes a separate task, and data dependencies between tasks are made + explicit through taskflow.channel operations. + + The pass: + 1. Identifies all top-level affine.for operations + 2. Analyzes data dependencies (RAW, WAR, WAW) between loops + 3. Creates a taskflow.graph containing the loops + 4. Converts each loop to a taskflow.task + 5. 
Inserts taskflow.channel operations for data dependencies }]; - let constructor = "mlir::createConvertLinalgToTaskflowPass()"; + + let constructor = "mlir::createConvertAffineToTaskflowPass()"; let dependentDialects = [ "mlir::taskflow::TaskflowDialect", - "mlir::linalg::LinalgDialect", + "mlir::affine::AffineDialect", "mlir::func::FuncDialect", + "mlir::memref::MemRefDialect", "mlir::arith::ArithDialect" ]; } diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 72e97d72..36dc4c63 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -15,7 +15,7 @@ namespace neura { void registerNeuraConversionPassPipeline(); -// Passes defined in GraphPasses.td +// Passes defined in NeuraPasses.td #define GEN_PASS_DECL #include "NeuraDialect/NeuraPasses.h.inc" std::unique_ptr createInsertDataMovPass(); diff --git a/include/TaskflowDialect/CMakeLists.txt b/include/TaskflowDialect/CMakeLists.txt index c2588b05..26d2d8cd 100644 --- a/include/TaskflowDialect/CMakeLists.txt +++ b/include/TaskflowDialect/CMakeLists.txt @@ -1 +1,5 @@ add_mlir_dialect(Taskflow taskflow) + +set(LLVM_TARGET_DEFINITIONS TaskflowPasses.td) +mlir_tablegen(TaskflowPasses.h.inc --gen-pass-decls) +add_public_tablegen_target(MLIRTaskflowTransformsIncGen) \ No newline at end of file diff --git a/include/TaskflowDialect/Taskflow.td b/include/TaskflowDialect/Taskflow.td index 7b6cc8fd..753e7a51 100644 --- a/include/TaskflowDialect/Taskflow.td +++ b/include/TaskflowDialect/Taskflow.td @@ -3,5 +3,7 @@ include "TaskflowDialect.td" include "TaskflowOps.td" +include "TaskflowPasses.td" +include "TaskflowTypes.td" #endif // TASKFLOW_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowOps.h b/include/TaskflowDialect/TaskflowOps.h index 9dc984a8..dc8b87f4 100644 --- a/include/TaskflowDialect/TaskflowOps.h +++ b/include/TaskflowDialect/TaskflowOps.h @@ -8,6 +8,7 @@ #include "mlir/IR/DialectImplementation.h" #include 
"mlir/IR/OpDefinition.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" // First includes the interface declarations. diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index 28641951..66603f7d 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -7,6 +7,9 @@ include "mlir/IR/OpBase.td" include "mlir/IR/RegionKindInterface.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/IR/CommonTypeConstraints.td" +include "mlir/IR/CommonAttrConstraints.td" //---------------------------------------------------------------------- // Base Class for all Taskflow operations. @@ -14,165 +17,238 @@ include "mlir/Interfaces/SideEffectInterfaces.td" class TaskflowOpBase traits = []> : Op; //---------------------------------------------------------------------- -// Graph Level Operations. +// Task Level Operations. //---------------------------------------------------------------------- -// Defines the top-level graph operation representing the workload. -def TaskflowGraphOp : TaskflowOpBase<"graph", [ +// Defines a uniform computation task operation within a Taskflow graph. +def TaskflowTaskOp : TaskflowOpBase<"task", [ IsolatedFromAbove, AutomaticAllocationScope, - SingleBlockImplicitTerminator<"TaskflowReturnOp"> + AttrSizedOperandSegments, + AttrSizedResultSegments, + SingleBlockImplicitTerminator<"TaskflowYieldOp"> ]>{ - let summary = "Top-level graph operation representing workload on a scale-out/scale-up spatial architecture."; + let summary = "Computation task operation within a Taskflow graph."; let description = [{ - Defines a region where all operations are flat tasks connected by edges. - This is the boundary between Host (CPU) and Device (spatial architecture). 
- - The graph contains: - - A flat list of `taskflow.task` operations (nodes) - - `taskflow.drive` operations (control edges) - - `taskflow.connect` operations (data dependency edges) - - A single `taskflow.return` operation to terminate the graph. - - Example: + Represents a computational task that takes data inputs and produces + data outputs. Tasks are isolated from their surrounding scope and can only + communicate through explicit data dependencies. + Tasks have two types of inputs/outputs: + 1. Memory dependencies: memrefs that are read or written by the task + 2. Value dependencies: SSA values from producer tasks + + Example: + // Memory input: %mem, Value input: %val + %out_mem, %out_val = taskflow.task "Task_0" + memory_inputs(%mem : memref<4xi32>) + value_inputs(%val : i32) { + ^bb0(%a0: memref<4xi32>, %a1: i32): + affine.for %i = 0 to 4 { + %v = affine.load %a0[%i] : memref<4xi32> + %sum = arith.addi %v, %a1 : i32 + affine.store %sum, %a0[%i] : memref<4xi32> + } + taskflow.yield memory_outputs(%a0 : memref<4xi32>) value_outputs(%a1 : i32) + } : (memref<4xi32>, i32) -> (memref<4xi32>, i32) }]; - let arguments = (ins Variadic:$inputs); - let results = (outs Variadic:$results); + let arguments = (ins + Variadic:$memory_inputs, + Variadic:$value_inputs, + StrAttr:$task_name + ); + + let results = (outs + Variadic:$memory_outputs, + Variadic:$value_outputs + ); + let regions = (region SizedRegion<1>:$body); - let assemblyFormat = [{ - `(` $inputs `)` attr-dict-with-keyword $body `:` functional-type($inputs, $results) - }]; + // let hasCustomAssemblyFormat = 1; + + // let assemblyFormat = [{ + // (`memory_inputs` `(` $memory_inputs^ `:` type($memory_inputs) `)`)? + // (`value_inputs` `(` $value_inputs^ `:` type($value_inputs) `)`)? + // attr-dict-with-keyword + // $body + // `->` `(` type($memory_outputs) `,` type($value_outputs) `)` + // }]; + } -// Defines the return operation to terminate a Taskflow graph.
-def TaskflowReturnOp : TaskflowOpBase<"return", [Terminator]> { - let summary = "Return operation for Taskflow graph."; +// Defines the yield operation to terminate a Taskflow task. +def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, AttrSizedOperandSegments, ParentOneOf<["TaskflowTaskOp"]>]>{ + let summary = "Yield operation for Taskflow task"; let description = [{ - This operation terminates a Taskflow graph. - This acts as a interaction op between the spatial architecture and the host processor. - - Example" - taskflow.graph { + Yields values from a task body. The number and types of operands + must match the result types of the parent taskflow.task operation. + + Example: + taskflow.task "Task_0" (%arg0, %arg1) { ... - taskflow.return - } + taskflow.yield %a0 : memref<4xi32> + } : (memref<4xi32>, i32) -> memref<4xi32> }]; - let arguments = (ins Variadic:$results); + let arguments = (ins + Variadic:$memory_results, + Variadic:$value_results); - let assemblyFormat = [{ - ($results^ `:` type($results))? attr-dict - }]; + // let assemblyFormat = [{ + // (`memory_outputs` `(` $memory_results^ `:` type($memory_results) `)`)? + // (`value_outputs` `(` $value_results^ `:` type($value_results) `)`)? + // attr-dict + // }]; + + // let hasCustomAssemblyFormat = 1; let builders = [ - // Default builder for empty return. + // Default builder for empty yield. OpBuilder<(ins), [{ - build($_builder, $_state, ValueRange{}); + build($_builder, $_state, ValueRange{}, ValueRange{}); }]> ]; } -//---------------------------------------------------------------------- -// Task Level Operations. -//---------------------------------------------------------------------- - -// Defines a uniform computation and control task operation within a Taskflow graph. 
-def TaskflowTaskOp : TaskflowOpBase<"task", [ - AttrSizedOperandSegments, - AttrSizedResultSegments, - SingleBlockImplicitTerminator<"TaskflowYieldOp">, - NoMemoryEffect, -]>{ - let summary = "Uniform computation and control task operation within a Taskflow graph"; - +// Defines the data dependency edge operation that carries data dependencies between tasks in a Taskflow graph. +def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure, SameOperandsAndResultType]>{ + let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph"; let description = [{ - - }]; + Represents a data dependency edge between tasks in the taskflow graph. + A channel connects a producer task's output to a consumer task's input. - let arguments = (ins - // Optional control inputs. - Variadic:$control_ins, - // Optional data inputs. - Variadic:$data_ins, - StrAttr:$task_name, - - // Task metadata. - OptionalAttr:$indexing_maps, - OptionalAttr:$iterator_types - ); + Channels enforce explicit data dependencies and can be used for: + - Producer-consumer relationships + - Read-after-write (RAW) dependencies + - Write-after-read (WAR) dependencies + - Write-after-write (WAW) dependencies - let results = (outs - // Optional control outputs. - Variadic:$control_outs, - // Optional data outputs. - Variadic:$data_outs - ); + Example: + %0 = taskflow.task "producer_task" (...) { ... } : (...) -> memref<4xi32> + %1 = taskflow.channel %0 : memref<4xi32> + %2 = taskflow.task "consumer_task" (%1, ...) { ... } : (memref<4xi32>, ...) -> ... + }]; - let regions = (region SizedRegion<1>:$body); + let arguments = (ins AnyType:$source); + let results = (outs AnyType:$target); - // let assemblyFormat = [{ - // $task_name - // (`control_ins` `(` $control_ins^ `:` type($control_ins) `)`)? - // (`data_ins` `(` $data_ins^ `:` type($data_ins) `)`)? 
- // $body attr-dict - // `->` type(results) - // }]; + let assemblyFormat = [{ + $source attr-dict `:` type($source) `->` type($target) + }]; } -// Defines the yield operation to terminate a Taskflow task. -def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, ParentOneOf<["TaskflowTaskOp"]>]>{ - let summary = "Yield operation for Taskflow task"; +//---------------------------------------------------------------------- +// Intra-Task Operations. +//---------------------------------------------------------------------- +// Counter operation representing loop iteration control within a Taskflow task. +def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{ + let summary = "Loop counter operation with hardware counter semantics"; + let description = [{ - + Represents a loop counter that generates iteration indices. + The hardware counter produces a predicated index value. + + Counter behavior: + - Top-level counter: increments unconditionally each cycle. + - Nested counter: increments only when the parent counter is valid. + + Example: + // Top-level counter + %i = taskflow.counter { + lower_bound = 0 : index, + upper_bound = 16 : index, + step = 1 : index, + counter_name = "i" + } : index + // Nested counter + %j = taskflow.counter parent(%i) { + lower_bound = 0 : index, + upper_bound = 8 : index, + step = 1 : index, + counter_name = "j" + } : index }]; - let arguments = (ins Variadic:$results); + let arguments = (ins + Optional:$parent_index, + IndexAttr:$lower_bound, + IndexAttr:$upper_bound, + IndexAttr:$step + ); + + let results = (outs AnyType:$counter_index); let assemblyFormat = [{ - ($results^ `:` type($results))? attr-dict + (`parent` `(` $parent_index^ `:` type($parent_index) `)`)? + attr-dict-with-keyword + `:` type($counter_index) }]; - - let builders = [ - // Default builder for empty yield. 
- OpBuilder<(ins), [{ - build($_builder, $_state, ValueRange{}); - }]> - ]; } -// Defines the control edge operation that carries control packets between tasks in a Taskflow graph. -def TaskflowDriveOp : TaskflowOpBase<"drive", [Pure]>{ - let summary = "Control edge that carries control packets between tasks in a Taskflow graph"; +def TaskflowHyperblockOp : TaskflowOpBase<"hyperblock",[ + AutomaticAllocationScope, + SingleBlockImplicitTerminator<"TaskflowHyperblockYieldOp"> +]>{ + let summary = "Hyperblock operation containing loop body computation"; + let description = [{ - + Represents the loop body computation as a hyperblock controlled by taskflow.counter operation. + The hyperblock takes the counter indices as input to trigger its execution. + + If the hyperblock has a return value, it must return the final value produced by the hyperblock (i.e., from the last iteration). + + Example: + %result = taskflow.hyperblock indices(%i : index) { + ^bb0(%idx: index): + // Loop body computation using %idx + ... + taskflow.hyperblock.yield %output : i32 + } -> i32 }]; - let arguments = (ins TaskflowPacketType:$source); + let arguments = (ins + Variadic:$indices + ); + + let results = (outs + Variadic:$outputs + ); - let results = (outs TaskflowPacketType:$target); + let regions = (region SizedRegion<1>:$body); let assemblyFormat = [{ - $source attr-dict `:` type($source) `->` type($target) + (`indices` `(` $indices^ `:` type($indices) `)`)? + attr-dict-with-keyword + $body + `->` `(` type($outputs) `)` }]; } -def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{ - let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph"; +def TaskflowHyperblockYieldOp : TaskflowOpBase<"hyperblock.yield", [ + Terminator, + Pure, + ReturnLike, + ParentOneOf<["TaskflowHyperblockOp"]> +]>{ + let summary = "Yield operation for Taskflow hyperblock"; + let description = [{ - + Terminates the hyperblock body. 
}]; - let arguments = (ins AnyType:$source); - - let results = (outs AnyType:$target); + let arguments = (ins Variadic:$outputs); let assemblyFormat = [{ - $source attr-dict `:` type($source) `->` type($target) + (`outputs` `(` $outputs^ `:` type($outputs) `)`)? + attr-dict }]; + + let builders = [ + OpBuilder<(ins), [{build($_builder, $_state, ValueRange{});}]> + ]; } #endif // TASKFLOW_OPS_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h new file mode 100644 index 00000000..f6219511 --- /dev/null +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -0,0 +1,25 @@ +// TaskflowPasses.h - Header file for Taskflow passes + +#ifndef TASKFLOW_PASSES_H +#define TASKFLOW_PASSES_H + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" + +#include +namespace mlir { +namespace taskflow { +// Passes defined in TaskflowPasses.td +#define GEN_PASS_DECL +#include "TaskflowDialect/TaskflowPasses.h.inc" +std::unique_ptr createConstructHyperblockFromTaskPass(); + +#define GEN_PASS_REGISTRATION +#include "TaskflowDialect/TaskflowPasses.h.inc" +} // namespace taskflow +} // namespace mlir + +#endif // TASKFLOW_PASSES_H \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td new file mode 100644 index 00000000..1bcf3b22 --- /dev/null +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -0,0 +1,18 @@ +// TaskflowPasses.td - Passes for the Taskflow dialect + +#ifndef TASKFLOW_PASSES_TD +#define TASKFLOW_PASSES_TD + +include "mlir/Pass/PassBase.td" + +//=========================================================// +// Passes for the Taskflow dialect +//=========================================================// +def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { + let 
summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; + let description = [{ + This pass constructs hyperblocks and counter chain from Taskflow tasks. + }]; + let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; +} +#endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowTypes.td b/include/TaskflowDialect/TaskflowTypes.td index 120a5265..dacbf512 100644 --- a/include/TaskflowDialect/TaskflowTypes.td +++ b/include/TaskflowDialect/TaskflowTypes.td @@ -11,29 +11,4 @@ class TaskflowTypeBase traits = [] : TypeDef{ let mnemonic = typeMnemonic; } - -//---------------------------------------------------------------------- -// PacketType - Control conifguration packet type. -//---------------------------------------------------------------------- -def TaskflowPacketType : TaskflowTypeBase<"Packet", "packet">{ - let summary = "Control packet carrying conifguration metadata for affine controller"; - - let description = [{ - - }]; - - // Payload type carried by the packet. 
- let parameters = (ins "::mlir::Type":$payloadType); - - let assemblyFormat = [{ - `<` $payloadType `>` - }]; - - let builders = [ - TypeBuilderWithInferredContext<(ins "Type":$payloadType), - [{ - return $_get(payloadType.getContext(), payloadType); - }]> - ]; -} #endif //TASKFLOW_TYPES_TD \ No newline at end of file diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp new file mode 100644 index 00000000..f628364f --- /dev/null +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -0,0 +1,364 @@ +#include "Conversion/ConversionPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowTypes.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//------------------------------------------------------------------------------ +// Helper Functions. +//------------------------------------------------------------------------------ + +// Collects all top-level affine.for operations in a function. 
+static SmallVector +collectTopLevelLooops(func::FuncOp func_op) { + SmallVector top_level_loops; + for (Block &block : func_op.getBlocks()) { + for (Operation &op : block) { + if (auto for_op = dyn_cast(op)) { + top_level_loops.push_back(for_op); + } + } + } + + return top_level_loops; +} + +// Collects memrefs that are loaded (read) within a given operation scope. +static void collectReadMemrefs(Operation *op, SetVector &read_memrefs) { + op->walk([&](Operation *nested_op) { + if (auto load_op = dyn_cast(nested_op)) { + read_memrefs.insert(load_op.getMemRef()); + } else if (auto load_op = dyn_cast(nested_op)) { + read_memrefs.insert(load_op.getMemRef()); + } + }); +} + +// Collects memrefs that are stored (written) within a given operation scope. +static void collectWrittenMemrefs(Operation *op, + SetVector &written_memrefs) { + op->walk([&](Operation *nested_op) { + if (auto store_op = dyn_cast(nested_op)) { + written_memrefs.insert(store_op.getMemRef()); + } else if (auto store_op = dyn_cast(nested_op)) { + written_memrefs.insert(store_op.getMemRef()); + } + }); +} + +// Collects external values used within a given scope of operations. +static void collectExternalValues(Operation *root_op, + const DenseSet &scope_ops, + SetVector &external_values) { + for (Value operand : root_op->getOperands()) { + // Skips memref types (handled separately as memory dependencies). + if (isa(operand.getType())) { + continue; + } + + // Checks if it's a block argument. + if (auto block_arg = dyn_cast(operand)) { + // Only adds if the block argument is not from within the scope. + Operation *parent_op = block_arg.getOwner()->getParentOp(); + if (!scope_ops.contains(parent_op)) { + external_values.insert(operand); + } + continue; + } + + // Checks if the operand is defined outside the scope. + Operation *def_op = operand.getDefiningOp(); + if (def_op && !scope_ops.contains(def_op)) { + external_values.insert(operand); + } + } + + // Recursively processes nested operations. 
+ for (Region ®ion : root_op->getRegions()) { + for (Block &block : region.getBlocks()) { + for (Operation &op : block.getOperations()) { + collectExternalValues(&op, scope_ops, external_values); + } + } + } +} + +//------------------------------------------------------------------------------ +// Task Conversion +//------------------------------------------------------------------------------ + +// Converts a top-level affine.for to a taskflow.task operation. +static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, + affine::AffineForOp for_op, + DenseMap &value_mapping, + int task_id) { + Location loc = for_op.getLoc(); + std::string task_name = "Task_" + std::to_string(task_id); + + // Collects all operations in the loop scope. + DenseSet scope_ops; + scope_ops.insert(for_op.getOperation()); + for_op.walk([&](Operation *op) { scope_ops.insert(op); }); + + //------------------------------------------------------------------- + // Step 1: Collects read and written memrefs. + //------------------------------------------------------------------- + SetVector read_memrefs; + SetVector written_memrefs; + collectReadMemrefs(for_op.getOperation(), read_memrefs); + collectWrittenMemrefs(for_op.getOperation(), written_memrefs); + + llvm::errs() << "Read memrefs for loop:\n" << for_op << "\n"; + for (Value memref : read_memrefs) { + llvm::errs() << memref << "\n"; + } + + llvm::errs() << "Written memrefs for loop:\n" << for_op << "\n"; + for (Value memref : written_memrefs) { + llvm::errs() << memref << "\n"; + } + + //------------------------------------------------------------------- + // Step 2: Determines memory inputs and outputs. + //------------------------------------------------------------------- + // Memory inputs: ALL memrefs that are accessed (read OR written). + // This ensures WAR and WAW dependencies are respected. 
+ SetVector accessed_memrefs; + accessed_memrefs.insert(read_memrefs.begin(), read_memrefs.end()); + accessed_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); + + // Memory outputs: ONLY memrefs that are written. + // This ensures RAW and WAW dependencies are respected. + SetVector output_memrefs; + output_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); + + //------------------------------------------------------------------- + // Step 3: Collects external SSA values (non-memref). + //------------------------------------------------------------------- + SetVector external_values; + collectExternalValues(for_op.getOperation(), scope_ops, external_values); + + llvm::errs() << "External values for loop:\n" << for_op << "\n"; + for (Value val : external_values) { + llvm::errs() << val << "\n"; + } + + //------------------------------------------------------------------- + // Step 4: Resolves inputs through value mapping. + //------------------------------------------------------------------- + SmallVector memory_inputs; + SmallVector value_inputs; + IRMapping mapping; + + // Resolves memory inputs. + for (Value memref : accessed_memrefs) { + Value resolved_memref = value_mapping.lookup(memref); + if (!resolved_memref) { + resolved_memref = memref; + } + memory_inputs.push_back(resolved_memref); + mapping.map(memref, resolved_memref); + } + + // Resolves external SSA value inputs. + for (Value external_val : external_values) { + Value resolved_val = value_mapping.lookup(external_val); + if (!resolved_val) { + resolved_val = external_val; + } + value_inputs.push_back(resolved_val); + mapping.map(external_val, resolved_val); + } + + //------------------------------------------------------------------- + // Step 5: Prepares output types. 
+ //------------------------------------------------------------------- + SmallVector memory_output_types; + for (Value memref : output_memrefs) { + memory_output_types.push_back(memref.getType()); + } + + SmallVector value_output_types; + for (Type result_type : for_op.getResultTypes()) { + value_output_types.push_back(result_type); + } + + //------------------------------------------------------------------- + // Step 6: Creates the taskflow.task operation. + //------------------------------------------------------------------- + TaskflowTaskOp task_op = builder.create( + loc, + /*memory_outputs=*/memory_output_types, + /*value_outputs=*/value_output_types, + /*memory_inputs=*/memory_inputs, + /*value_inputs=*/value_inputs, + /*task_name=*/builder.getStringAttr(task_name)); + + //------------------------------------------------------------------- + // Step 7: Builds the task body. + //------------------------------------------------------------------- + Block *task_body = new Block(); + task_op.getBody().push_back(task_body); + + // Adds block arguments (memory inputs first, then value inputs). + DenseMap input_to_block_arg; + // Memory input arguments. + for (Value memref : accessed_memrefs) { + BlockArgument arg = task_body->addArgument(memref.getType(), loc); + mapping.map(memref, arg); + input_to_block_arg[memref] = arg; + } + + // Value input arguments. + for (Value val : external_values) { + BlockArgument arg = task_body->addArgument(val.getType(), loc); + mapping.map(val, arg); + input_to_block_arg[val] = arg; + } + + // Clones loop into the task body. + OpBuilder task_builder(task_body, task_body->begin()); + Operation *cloned_loop = task_builder.clone(*for_op.getOperation(), mapping); + + //--------------------------------------------------------------- + // Step 8: Creates the yield operation. 
+ //--------------------------------------------------------------- + task_builder.setInsertionPointToEnd(task_body); + SmallVector memory_yield_operands; + SmallVector value_yield_operands; + + // Memory yield outputs: yield the written memrefs. + for (Value memref : output_memrefs) { + if (input_to_block_arg.count(memref)) { + memory_yield_operands.push_back(input_to_block_arg[memref]); + } else { + assert(false && "Written memref not in inputs!"); + } + } + + // Value yield outputs: yield the loop results. + for (Value result : cloned_loop->getResults()) { + value_yield_operands.push_back(result); + } + task_builder.create(loc, memory_yield_operands, + value_yield_operands); + + //------------------------------------------------------------------- + // Step 9 : Updates value mapping with task outputs for subsequent tasks + // conversion. + //------------------------------------------------------------------- + // Memory outputs. + for (auto [memref, task_output] : + llvm::zip(output_memrefs, task_op.getMemoryOutputs())) { + value_mapping[memref] = task_output; + } + + return task_op; +} + +//------------------------------------------------------------------------------ +// Main Conversion Process. +//------------------------------------------------------------------------------ +// Converts a single function to TaskFlow operations. +static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { + // Collects top-level loops for conversion. + SmallVector top_level_loops = + collectTopLevelLooops(func_op); + + if (top_level_loops.empty()) { + // No loops to convert. 
+ llvm::errs() << "No top-level affine.for loops found in function '" + << func_op.getName() << "'.\n"; + return success(); + } + + llvm::errs() << "\n===Converting function: " << func_op.getName() << "===\n"; + llvm::errs() << "Found " << top_level_loops.size() + << " top-level affine.for loops to convert:\n"; + for (affine::AffineForOp for_op : top_level_loops) { + llvm::errs() << for_op.getLoc() << "\n"; + } + + OpBuilder builder(func_op.getContext()); + DenseMap value_mapping; + + // Converts each top-level loop to taskflow.task operation. + for (auto [idx, loop] : llvm::enumerate(top_level_loops)) { + builder.setInsertionPoint(loop); + TaskflowTaskOp task_op = + convertLoopToTask(builder, loop, value_mapping, idx); + + // Replaces uses of loop results with task value outputs. + for (auto [loop_result, task_value_output] : + llvm::zip(loop.getResults(), task_op.getValueOutputs())) { + loop_result.replaceAllUsesWith(task_value_output); + } + } + + // Erases the original loops after conversion. 
+ for (affine::AffineForOp for_op : top_level_loops) { + for_op.erase(); + } + + return success(); +} + +class ConvertAffineToTaskflowPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertAffineToTaskflowPass) + + StringRef getArgument() const final { return "convert-affine-to-taskflow"; } + + StringRef getDescription() const final { + return "Convert Affine operations to Taskflow operations"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + + WalkResult result = module.walk([](func::FuncOp func_op) { + if (failed(convertFuncToTaskflow(func_op))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) { + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr mlir::createConvertAffineToTaskflowPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/Conversion/LinalgToTaskflow/CMakeLists.txt b/lib/Conversion/AffineToTaskflow/CMakeLists.txt similarity index 74% rename from lib/Conversion/LinalgToTaskflow/CMakeLists.txt rename to lib/Conversion/AffineToTaskflow/CMakeLists.txt index c8e425a6..bb4f3f52 100644 --- a/lib/Conversion/LinalgToTaskflow/CMakeLists.txt +++ b/lib/Conversion/AffineToTaskflow/CMakeLists.txt @@ -1,7 +1,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) -add_mlir_conversion_library(MLIRLinalgToTaskflowPass - LinalgToTaskflowPass.cpp +add_mlir_conversion_library(MLIRAffineToTaskflowPass + AffineToTaskflowPass.cpp DEPENDS MLIRConversionIncGen diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 4f4e247f..cf66d518 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(AffineToNeura) add_subdirectory(LlvmToNeura) add_subdirectory(MemRefToNeura) add_subdirectory(BuiltinToNeura) 
-add_subdirectory(LinalgToTaskflow) +add_subdirectory(AffineToTaskflow) add_library(MLIRConversion INTERFACE) @@ -22,6 +22,6 @@ target_link_libraries(MLIRConversion INTERFACE MLIRNeuraLlvmToNeuraPass MLIRNeuraMemRefToNeuraPass MLIRNeuraBuiltinToNeuraPass - MLIRLinalgToTaskflowPass + MLIRAffineToTaskflowPass ${dialect_libs} ) \ No newline at end of file diff --git a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp b/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp deleted file mode 100644 index a4489f44..00000000 --- a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp +++ /dev/null @@ -1,487 +0,0 @@ -#include "Conversion/ConversionPasses.h" -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowTypes.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/Block.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { -//------------------------------------------------------------------------------ -// Helper Functions. -//------------------------------------------------------------------------------ -// Gets a descriptive task name based on the operation type. 
-static std::string generateTaskBaseName(Operation *op) { - if (isa(op)) { - return "conv2d"; - } - if (isa(op)) { - return "matmul"; - } - if (isa(op)) { - return "batch_matmul"; - } - if (isa(op)) { - return "pooling"; - } - if (auto generic_op = dyn_cast(op)) { - return "generic"; - } - return "task"; -} - -// Maintains conversion context during the conversion process. -struct ConversionContext { - // Maps original SSA values to task output values. - DenseMap value_mapping; - - // Maps graph input values to graph block arguments. - DenseMap graph_input_mapping; - - // Counter for generating unique task names. - int task_counter = 0; - - // Generates a unique task name. - std::string getTaskBaseName(StringRef base_name) { - return (base_name + "_" + Twine(task_counter++)).str(); - } -}; - -// Operation classification. -static bool isComputeIntensiveOp(Operation *op) { - // Returns true if the operation is one of the compute-intensive Linalg ops. - return isa(op); -} - -// Collects external values for a single operation. -static void collectExternalValuesForOp( - Operation *op, const DenseSet &graph_op_set, - func::FuncOp func_op, SetVector &external_values) { - for (Value operand : op->getOperands()) { - // Skips nested region block arguments. - if (auto block_arg = dyn_cast(operand)) { - if (block_arg.getOwner()->getParentOp() != func_op.getOperation()) { - continue; - } - external_values.insert(operand); - continue; - } - - // Skips values defined inside graph ops or nested regions. - Operation *def_op = operand.getDefiningOp(); - if (def_op) { - if (!graph_op_set.contains(def_op) && - def_op->getBlock()->getParentOp() == func_op.getOperation()) { - external_values.insert(operand); - } - } - } - - // Recurses into nested regions. 
- for (Region ®ion : op->getRegions()) { - for (Block &block : region) { - for (Operation &nested_op : block) { - collectExternalValuesForOp(&nested_op, graph_op_set, func_op, - external_values); - } - } - } -} - -// Collects external values used by each graph operation. -static DenseMap> -collectExternalValuesPerOp(ArrayRef graph_ops, - func::FuncOp func_op) { - DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - DenseMap> op_external_values; - - for (Operation *op : graph_ops) { - SetVector external_values; - collectExternalValuesForOp(op, graph_op_set, func_op, external_values); - op_external_values[op] = - SmallVector(external_values.begin(), external_values.end()); - } - - return op_external_values; -} - -//------------------------------------------------------------------------------ -// Step 1: Scope Identification - Collects operations for the taskflow.graph -// op. -//------------------------------------------------------------------------------ -// Collects all operations that should be included in the taskflow graph. -// Returns operations in topological order. -static SmallVector collectTaskflowGraphOps(func::FuncOp func_op) { - SmallVector graph_ops; - - func_op.walk([&](Operation *op) { - if (isComputeIntensiveOp(op)) { - graph_ops.push_back(op); - } - }); - return graph_ops; -} - -// Identifies external inputs to the taskflow graph (values defined outside the -// graph ops). -static SmallVector identifyGraphInputs(ArrayRef graph_ops, - func::FuncOp func_op) { - llvm::SetVector input_set; - llvm::DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - - for (Operation *op : graph_ops) { - collectExternalValuesForOp(op, graph_op_set, func_op, input_set); - } - - return SmallVector(input_set.begin(), input_set.end()); -} - -// Identifies outputs from the graph (values used outside the graph ops). 
-static SmallVector identifyGraphOutputs(ArrayRef graph_ops, - func::FuncOp func_op) { - SmallVector outputs; - DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - - for (Operation *op : graph_ops) { - for (Value result : op->getResults()) { - bool used_outside = false; - for (Operation *user : result.getUsers()) { - if (!graph_op_set.contains(user)) { - used_outside = true; - break; - } - } - if (used_outside) { - outputs.push_back(result); - } - } - } - return outputs; -} - -//------------------------------------------------------------------------------ -// Step 2: Task Contruction - Creates the taskflow.task ops. -//------------------------------------------------------------------------------ -// Reolves the input value for a task operand. -// Returns the corresponding buffer value from the context, or wraps the -// original value. -static Value resolveTaskInput(OpBuilder &builder, Location loc, - Value original_value, ConversionContext &ctx) { - // Checks if this value is produced by a task. - if (ctx.value_mapping.count(original_value)) { - return ctx.value_mapping[original_value]; - } - - // Checks if this value is a graph input. - if (ctx.graph_input_mapping.count(original_value)) { - return ctx.graph_input_mapping[original_value]; - } - - // Should not reach here for well-formed graphs. - assert(false && "Unable to resolve task input value"); - return Value(); -} - -// Creates a taskflow.task op from a given operation. -// For pure data dependent workloads (e.g., AI workloads), taskes have: -// - data_ins: input buffers -// - data_outs: output buffers -// - no control dependencies -static TaskflowTaskOp createTaskFromOp(OpBuilder &builder, Operation *op, - ConversionContext &ctx, - ArrayRef external_values) { - Location loc = op->getLoc(); - std::string task_name = ctx.getTaskBaseName(generateTaskBaseName(op)); - - // Resolves all external values to graph local values. 
- SmallVector data_ins; - IRMapping mapping; - - for (Value external_val : external_values) { - Value resolved_input = resolveTaskInput(builder, loc, external_val, ctx); - assert(resolved_input && "Failed to resolve task input"); - data_ins.push_back(resolved_input); - mapping.map(external_val, resolved_input); - } - - for (Value operand : op->getOperands()) { - if (llvm::is_contained(external_values, operand)) { - // Already mapped. - continue; - } - Value resolved_input = resolveTaskInput(builder, loc, operand, ctx); - assert(resolved_input && "Failed to resolve task input"); - data_ins.push_back(resolved_input); - mapping.map(operand, resolved_input); - } - - // Data outputs uses original result types. - SmallVector data_out_types; - for (Type result_type : op->getResultTypes()) { - data_out_types.push_back(result_type); - } - - // Creates the taskflow.task op. - auto task_op = builder.create( - loc, - /*control_outs=*/TypeRange{}, - /*data_outs=*/data_out_types, - /*control_ins=*/ValueRange{}, - /*data_ins=*/data_ins, builder.getStringAttr(task_name), - /*indexing_maps=*/nullptr, - /*iterator_types=*/nullptr); - - // Builds task body. - Block *task_body = new Block(); - task_op.getBody().push_back(task_body); - - // Block arguments have same types as data_ins (original tensor types). - for (Value input : data_ins) { - task_body->addArgument(input.getType(), loc); - } - - // Maps external values to task block arguments. - for (size_t i = 0; i < external_values.size(); i++) { - mapping.map(external_values[i], task_body->getArgument(i)); - } - - // Switches to the task body to clone the original operation. - OpBuilder task_builder(task_body, task_body->begin()); - Operation *cloned_op = task_builder.clone(*op, mapping); - // Yields the results. - task_builder.create(loc, cloned_op->getResults()); - - // Registers task outputs in context (same types as original results). 
- for (auto [orig_result, task_output] : - llvm::zip(op->getResults(), task_op.getDataOuts())) { - ctx.value_mapping[orig_result] = task_output; - } - - return task_op; -} - -//------------------------------------------------------------------------------ -// Step 3: Channel Insertion - Inserts taskflow.channel ops between tasks. -//------------------------------------------------------------------------------ -static void insertChannels(OpBuilder &builder, ArrayRef tasks) { - DenseSet task_set(tasks.begin(), tasks.end()); - - for (TaskflowTaskOp producer_task : tasks) { - Location loc = producer_task.getLoc(); - - // For each data output of this producer task. - for (Value data_out : producer_task.getDataOuts()) { - // Collects all consumer tasks that use this output. - SmallVector> consumer_tasks; - - for (OpOperand &use : data_out.getUses()) { - Operation *user = use.getOwner(); - if (auto consumer_task = dyn_cast(user)) { - if (task_set.contains(consumer_task)) { - consumer_tasks.push_back({consumer_task, &use}); - } - } - } - - // Creates a dedicated channel for each consumer task. - builder.setInsertionPointAfter(producer_task); - - for (auto [consumer_task, use] : consumer_tasks) { - // Creates a new channel for this specific producer->consumer edge. - auto channel_op = builder.create( - loc, data_out.getType(), data_out); - - // Replaces only this specific use with the channel output. - use->set(channel_op.getTarget()); - } - } - } -} - -//------------------------------------------------------------------------------ -// Step 4: Graph Construction - Creates the taskflow.graph op. 
-//------------------------------------------------------------------------------ -static LogicalResult buildTaskflowGraph( - OpBuilder &builder, func::FuncOp func_op, ArrayRef graph_ops, - ArrayRef graph_inputs, MutableArrayRef graph_outputs, - const DenseMap> &op_external_values) { - Location loc = func_op.getLoc(); - - // Graph result types = original output types (no conversion). - SmallVector result_types; - for (Value output : graph_outputs) { - result_types.push_back(output.getType()); - } - - // Creates graph op. - auto graph_op = - builder.create(loc, result_types, graph_inputs); - - // Builds graph body. - Block *graph_body = new Block(); - graph_op.getBody().push_back(graph_body); - - // Block arguments have same types as graph inputs. - ConversionContext ctx; - for (Value input : graph_inputs) { - BlockArgument arg = graph_body->addArgument(input.getType(), loc); - ctx.graph_input_mapping[input] = arg; - } - - // Converts each operation to a task. - builder.setInsertionPointToStart(graph_body); - SmallVector tasks; - for (Operation *op : graph_ops) { - const SmallVector &external_values = op_external_values.lookup(op); - auto task_op = createTaskFromOp(builder, op, ctx, external_values); - if (!task_op) { - return failure(); - } - tasks.push_back(task_op); - } - - // Inserts channels between tasks. - insertChannels(builder, tasks); - - // Creates graph return. - SmallVector return_values; - for (Value output : graph_outputs) { - Value resolved = ctx.value_mapping[output]; - return_values.push_back(resolved); - } - builder.create(loc, return_values); - - // Replaces original outputs with graph results. - for (auto [orig_output, graph_result] : - llvm::zip(graph_outputs, graph_op.getResults())) { - orig_output.replaceAllUsesExcept(graph_result, graph_op.getOperation()); - } - - // Erases original operations. 
- for (Operation *op : llvm::reverse(graph_ops)) { - op->erase(); - } - - return success(); -} - -//------------------------------------------------------------------------------ -// Main Conversion Process. -//------------------------------------------------------------------------------ -// Converts a single function to TaskFlow operations. -static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { - // Step 1: Collects operations for the taskflow.graph op. - SmallVector graph_ops = collectTaskflowGraphOps(func_op); - if (graph_ops.empty()) { - // No operations to convert. - return success(); - } - - llvm::errs() << "Converting function: " << func_op.getName() << "\n"; - llvm::errs() << "Collected taskflow graph operations:\n"; - for (Operation *op : graph_ops) { - llvm::errs() << " " << *op << "\n"; - } - - SmallVector graph_inputs = identifyGraphInputs(graph_ops, func_op); - SmallVector graph_outputs = identifyGraphOutputs(graph_ops, func_op); - - llvm::errs() << "Identified graph inputs:\n"; - for (Value input : graph_inputs) { - llvm::errs() << " " << input << "\n"; - } - llvm::errs() << "Identified graph outputs:\n"; - for (Value output : graph_outputs) { - llvm::errs() << " " << output << "\n"; - } - - // Finds insertion point: after the last operation that defines a graph input. - Operation *insertion_point = nullptr; - for (Value input : graph_inputs) { - if (auto *def_op = input.getDefiningOp()) { - if (!insertion_point || insertion_point->isBeforeInBlock(def_op)) { - insertion_point = def_op; - } - } - } - - // Set the insertion point for the builder. - OpBuilder builder(func_op.getContext()); - if (insertion_point) { - builder.setInsertionPointAfter(insertion_point); - } else { - // If no inputs are defined by an operation (i.e., they are all function - // arguments), insert the graph at the beginning of the function body. - builder.setInsertionPointToStart(&func_op.front()); - } - - // Collects external values for each graph operation. 
- DenseMap> op_external_values = - collectExternalValuesPerOp(graph_ops, func_op); - - // Step 2 & 3 & 4: Creates the taskflow.graph op. - auto result = buildTaskflowGraph(builder, func_op, graph_ops, graph_inputs, - graph_outputs, op_external_values); - llvm::errs() << "Converted function to TaskFlow graph.\n"; - llvm::errs() << "Resulting function:\n"; - func_op.print(llvm::errs()); - llvm::errs() << "\n"; - - return result; -} - -class ConvertLinalgToTaskflowPass - : public PassWrapper> { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertLinalgToTaskflowPass) - - StringRef getArgument() const final { return "convert-linalg-to-taskflow"; } - - StringRef getDescription() const final { - return "Convert Linalg operations to Taskflow operations"; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - void runOnOperation() override { - ModuleOp module = getOperation(); - - WalkResult result = module.walk([](func::FuncOp func_op) { - if (failed(convertFuncToTaskflow(func_op))) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - - if (result.wasInterrupted()) { - signalPassFailure(); - } - } -}; -} // namespace - -std::unique_ptr mlir::createConvertLinalgToTaskflowPass() { - return std::make_unique(); -} \ No newline at end of file diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index 5762784e..d8e5d7ff 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -1,11 +1,16 @@ add_mlir_dialect_library(MLIRTaskflow Taskflow.cpp + TaskflowPasses.cpp + TaskflowOps.cpp DEPENDS MLIRConversionIncGen + MLIRTaskflowTransformsIncGen LINK_LIBS PUBLIC MLIRIR MLIRSupport MLIRInferTypeOpInterface - ) \ No newline at end of file +) + +add_subdirectory(Transforms) \ No newline at end of file diff --git a/lib/TaskflowDialect/Taskflow.cpp b/lib/TaskflowDialect/Taskflow.cpp index 61ff3195..bff4ec54 100644 --- 
a/lib/TaskflowDialect/Taskflow.cpp +++ b/lib/TaskflowDialect/Taskflow.cpp @@ -40,4 +40,16 @@ void TaskflowDialect::printAttribute(mlir::Attribute attr, mlir::DialectAsmPrinter &printer) const { // Currently no custom attributes to print. llvm_unreachable("Unknown Taskflow attribute"); +} + +mlir::Type TaskflowDialect::parseType(mlir::DialectAsmParser &parser) const { + // Currently no custom types to parse. + parser.emitError(parser.getNameLoc()) << "unknown Taskflow type"; + return mlir::Type(); +} + +void TaskflowDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const { + // Currently no custom types to print. + llvm_unreachable("Unknown Taskflow type"); } \ No newline at end of file diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp new file mode 100644 index 00000000..e69de29b diff --git a/lib/TaskflowDialect/TaskflowPasses.cpp b/lib/TaskflowDialect/TaskflowPasses.cpp new file mode 100644 index 00000000..1a10c2ef --- /dev/null +++ b/lib/TaskflowDialect/TaskflowPasses.cpp @@ -0,0 +1,7 @@ +#include "TaskflowDialect/TaskflowPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" +#include "mlir/Transforms/Passes.h" \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt new file mode 100644 index 00000000..270ce96a --- /dev/null +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -0,0 +1,17 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_mlir_library(MLIRTaskflowTransforms + ConstructHyperblockFromTaskPass.cpp + + DEPENDS + MLIRTaskflowTransformsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRSupport + MLIRTransforms + MLIRTaskflow + ${dialect_libs} + LLVMSupport +) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp 
b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp new file mode 100644 index 00000000..7ba05060 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -0,0 +1,493 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/LogicalResult.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//--------------------------------------------------------------------------- +// Loop Info Structure +//---------------------------------------------------------------------------- +struct LoopInfo { + affine::AffineForOp for_op; + int lower_bound; + int upper_bound; + int step; + + // For nested loops + LoopInfo *parent_loop_info = nullptr; + SmallVector child_loops; + + // Generated counter index + Value counter_index; +}; + +//--------------------------------------------------------------------------- +// Hyperblock Info Structure +//---------------------------------------------------------------------------- +// Represents a code block that should become a hyperblock. +struct HyperblockInfo { + // The operations that belong to this hyperblock. + SmallVector operations; + + // The counter indices that trigger this hyperblock (empty for top-level + // operations before any loops). + SmallVector trigger_indices; + + // Whther this hyperblock is nested within loops. + bool is_loop_body = false; + + // The corresponding loop. 
+ affine::AffineForOp loop_op = nullptr; +}; + +//---------------------------------------------------------------------------- +// Helper Functions +//---------------------------------------------------------------------------- +// Extracts loop parameters from affine.for operation. +static std::optional extractLoopBound(affine::AffineForOp for_op) { + LoopInfo loop_info; + loop_info.for_op = for_op; + + // Gets lower bound. + if (for_op.hasConstantLowerBound()) { + loop_info.lower_bound = for_op.getConstantLowerBound(); + } else { + return std::nullopt; + } + + // Gets upper bound. + if (for_op.hasConstantUpperBound()) { + loop_info.upper_bound = for_op.getConstantUpperBound(); + } else { + return std::nullopt; + } + + // Gets step. + loop_info.step = for_op.getStepAsInt(); + + return loop_info; +} + +// Collects all affine.for loops and builds loop hierarchy. +static SmallVector collectLoopInfo(TaskflowTaskOp task_op) { + SmallVector loops_info; + DenseMap op_to_loopinfo; + + // Step 1: Collects all loops with its parameter. + task_op.walk([&](affine::AffineForOp for_op) { + auto info = extractLoopBound(for_op); + if (!info) { + assert(false && "Non-constant loop bounds are not supported."); + } + + loops_info.push_back(*info); + op_to_loopinfo[for_op.getOperation()] = &loops_info.back(); + }); + + // Step 2: Builds parent-child relationships among loops. 
+ for (auto &loop_info : loops_info) { + Operation *parent_op = loop_info.for_op->getParentOp(); + if (auto parent_for = dyn_cast(parent_op)) { + if (op_to_loopinfo.count(parent_for.getOperation())) { + LoopInfo *parent_loop_info = op_to_loopinfo[parent_for.getOperation()]; + loop_info.parent_loop_info = parent_loop_info; + parent_loop_info->child_loops.push_back(&loop_info); + } + } + } + + return loops_info; +} + +//---------------------------------------------------------------------------- +// Counter Chain Creation +//---------------------------------------------------------------------------- +// Recursively creates counter chain for each top-level loop. +static void createCounterChainRecursivly(OpBuilder &builder, Location loc, + LoopInfo *loop_info, + Value parent_counter) { + // Creates counter for this loop. + Value counter_index; + if (parent_counter) { + // Nested counter. + auto counter_op = builder.create( + loc, builder.getIndexType(), parent_counter, + builder.getIndexAttr(loop_info->lower_bound), + builder.getIndexAttr(loop_info->upper_bound), + builder.getIndexAttr(loop_info->step)); + counter_index = counter_op.getCounterIndex(); + } else { + // Top-level counter. + auto counter_op = builder.create( + loc, builder.getIndexType(), /*parent_index=*/nullptr, + builder.getIndexAttr(loop_info->lower_bound), + builder.getIndexAttr(loop_info->upper_bound), + builder.getIndexAttr(loop_info->step)); + counter_index = counter_op.getCounterIndex(); + } + + loop_info->counter_index = counter_index; + + // Recursively creates counters for child loops. + for (LoopInfo *child : loop_info->child_loops) { + createCounterChainRecursivly(builder, loc, child, counter_index); + } +} + +// Creates counter chain for all top-level loops. 
+static void createCounterChain(OpBuilder &builder, Location loc, + SmallVector &top_level_loops_info) { + for (LoopInfo *loop_info : top_level_loops_info) { + createCounterChainRecursivly(builder, loc, loop_info, nullptr); + } +} + +// Gets top-level loops' info (loops without parents). +static SmallVector +getTopLevelLoopsInfo(SmallVector &loops_info) { + SmallVector top_level_loops_info; + for (auto &loop_info : loops_info) { + if (!loop_info.parent_loop_info) { + top_level_loops_info.push_back(&loop_info); + } + } + return top_level_loops_info; +} + +//---------------------------------------------------------------------------- +// Hyperblock Creation +//---------------------------------------------------------------------------- +// Recursively extracts hyperblocks from a region. +static void extractHyperblocksInfoFromRegion( + Region &region, + const DenseMap &loop_info_map, + SmallVector parent_indices, + SmallVector &hyperblocks_info) { + Block &block = region.front(); + SmallVector current_block_ops; + + for (Operation &op : block.getOperations()) { + if (auto for_op = dyn_cast(&op)) { + // Before processing the loop, emits any accumulated operations as a + // hyperblock. + if (!current_block_ops.empty()) { + HyperblockInfo info; + info.operations = current_block_ops; + info.trigger_indices = parent_indices; + info.is_loop_body = !parent_indices.empty(); + hyperblocks_info.push_back(info); + current_block_ops.clear(); + } + + // Gets the loop info. + LoopInfo *loop_info = loop_info_map.lookup(for_op); + assert(loop_info && "Loop not found in loop_info_map"); + + // Builds trigger indices for this loop (parent indices + this loop's + // index). + SmallVector loop_indices = parent_indices; + loop_indices.push_back(loop_info->counter_index); + + // Recursively extracts hyperblocks from the loop body.
+ extractHyperblocksInfoFromRegion(for_op.getRegion(), loop_info_map, + loop_indices, hyperblocks_info); + } else if (isa(&op) || + (isa(&op) && op.getOperands().empty())) { + // Skips TaskflowYieldOp and TaskflowCounterOp. + continue; + } else { + // Regular operation, accumulates it. + current_block_ops.push_back(&op); + } + } + + // Emits any remaining operations as a hyperblock. + if (!current_block_ops.empty()) { + HyperblockInfo info; + info.operations = current_block_ops; + info.trigger_indices = parent_indices; + info.is_loop_body = !parent_indices.empty(); + hyperblocks_info.push_back(info); + current_block_ops.clear(); + } +} + +// Extracts all hyperblocks from a task. +static SmallVector extractHyperblocksInfo( + TaskflowTaskOp task_op, + const DenseMap &loop_info_map) { + SmallVector hyperblocks_info; + // No parent indices for top-level hyperblocks (Not nested in a loop). + SmallVector empty_indices; + + extractHyperblocksInfoFromRegion(task_op.getBody(), loop_info_map, + empty_indices, hyperblocks_info); + + return hyperblocks_info; +} + +// Collects all indices that are actually used by operations in the hyperblock. +static SmallVector collectUsedIndices( + const SmallVector &operations, + const SmallVector &candidate_indices, + const DenseMap &loop_info_map) { + // Builds reverse mapping: counter -> induction variable. + DenseMap counter_to_indvar; + for (auto [loop_op, loop_info] : loop_info_map) { + counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); + } + + // Collects all values used by operations. + SetVector used_indvars_set; + for (Operation *op : operations) { + for (Value operand : op->getOperands()) { + used_indvars_set.insert(operand); + } + } + + // Returns in the same order as candidate_indices to maintain parent->child + // order. 
+ SmallVector used_counters; + for (Value counter : candidate_indices) { + if (counter_to_indvar.count(counter)) { + Value indvar = counter_to_indvar[counter]; + if (used_indvars_set.contains(indvar)) { + used_counters.push_back(counter); + } + } + } + + return used_counters; +} + +// Determines output types for the hyperblock based on operations. +static SmallVector +determineHyperblockOutputTypes(const SmallVector &operations) { + SmallVector output_types = {}; + + // Checks if there's an affine.yield operation. + for (Operation *op : operations) { + if (auto affine_yield = dyn_cast(op)) { + // Uses the operand types of affine.yield as output types. + for (Value operand : affine_yield.getOperands()) { + output_types.push_back(operand.getType()); + } + return output_types; + } + } + + // No affine.yield found, no output types needed. + return output_types; +} + +// Creates a taskflow.hyperblock operation from HyperblockInfo. +static TaskflowHyperblockOp createHyperblock( + OpBuilder &builder, Location loc, const HyperblockInfo &info, + Block *task_body, + const DenseMap &loop_info_map) { + // Collects only the indices that are actually used in the hyperblock. + SmallVector used_indices = + collectUsedIndices(info.operations, info.trigger_indices, loop_info_map); + + // Determines output types for the hyperblock based on operations. + SmallVector output_types = + determineHyperblockOutputTypes(info.operations); + + // Creates the hyperblock operation. + TaskflowHyperblockOp hyperblock_op = + builder.create(loc, output_types, used_indices); + Block *hyperblock_body = new Block(); + hyperblock_op.getBody().push_back(hyperblock_body); + + // Adds block arguments for the used indices. + for (Value idx : used_indices) { + hyperblock_body->addArgument(idx.getType(), loc); + } + + // Clone operations into the hyperblock body. 
+ OpBuilder hyperblock_builder(hyperblock_body, hyperblock_body->begin()); + IRMapping mapping; + + // Maps used indices to block arguments + for (auto [idx, arg] : + llvm::zip(used_indices, hyperblock_body->getArguments())) { + mapping.map(idx, arg); + } + + // Creates a mapping from loop counters to loop induction variables. + DenseMap counter_to_indvar; + for (auto [loop_op, loop_info] : loop_info_map) { + counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); + } + + // Maps loop induction variables to hyperblock block arguments. + for (auto [idx, arg] : + llvm::zip(used_indices, hyperblock_body->getArguments())) { + if (counter_to_indvar.count(idx)) { + Value indvar = counter_to_indvar[idx]; + mapping.map(indvar, arg); + } + } + + // Clones all operations and handle terminators. + bool has_terminator = false; + for (Operation *op : info.operations) { + // Handles affine.yield specially - convert to hyperblock.yield. + if (auto affine_yield = dyn_cast(op)) { + // Maps the yield operands through the IRMapping. + SmallVector yield_operands; + for (Value operand : affine_yield.getOperands()) { + Value mapped_operand = mapping.lookupOrDefault(operand); + yield_operands.push_back(mapped_operand); + } + + // Creates hyperblock.yield with the mapped operands. + hyperblock_builder.create(loc, yield_operands); + has_terminator = true; + continue; + } + + // Clones regular operations. + hyperblock_builder.clone(*op, mapping); + } + + // Adds terminator if the last operation wasn't already a yield. 
+ if (!has_terminator) { + hyperblock_builder.setInsertionPointToEnd(hyperblock_body); + hyperblock_builder.create(loc); + } + + MLIRContext *context = hyperblock_op.getContext(); + RewritePatternSet patterns(context); + + populateAffineToStdConversionPatterns(patterns); + ConversionTarget target(*context); + target.addLegalDialect(); + target.addIllegalOp(); + if (failed( + applyPartialConversion(hyperblock_op, target, std::move(patterns)))) { + assert(false && "Affine to Standard conversion failed."); + } + + return hyperblock_op; +} + +//---------------------------------------------------------------------------- +// Task Transformation +//---------------------------------------------------------------------------- +// The main transformation function for TaskflowTaskOp. +static LogicalResult transformTask(TaskflowTaskOp task_op) { + Location loc = task_op.getLoc(); + + // Step 1: Collects loop information. + DenseMap loop_info_map; + SmallVector loops_info = collectLoopInfo(task_op); + for (auto &loop_info : loops_info) { + loop_info_map[loop_info.for_op] = &loop_info; + } + + // Gets the body block of the task. + Block *task_body = &task_op.getBody().front(); + + // Finds the first loop in the task body. + affine::AffineForOp first_loop_op = nullptr; + for (Operation &op : task_body->getOperations()) { + if (auto for_op = dyn_cast(&op)) { + first_loop_op = for_op; + break; + } + } + + assert(first_loop_op && "No loops found in the task body."); + + // Step 2: Creates counter chain before the first loop. + OpBuilder builder(first_loop_op); + SmallVector top_level_loops_info = + getTopLevelLoopsInfo(loops_info); + createCounterChain(builder, loc, top_level_loops_info); + + // Step 3: Extracts hyperblocks from task. + SmallVector hyperblocks_info = + extractHyperblocksInfo(task_op, loop_info_map); + + // Step 4: Creates taskflow.hyperblock operations for each hyperblock. + builder.setInsertionPoint(first_loop_op); + + // Collects all operations to erase. 
+  SmallVector<Operation *> ops_to_erase;
+  for (Operation &op : llvm::make_early_inc_range(task_body->getOperations())) {
+    if (!isa<CounterOp, TaskflowYieldOp>(&op)) {
+      ops_to_erase.push_back(&op);
+    }
+  }
+
+  // Creates hyperblock ops.
+  for (const auto &info : hyperblocks_info) {
+    createHyperblock(builder, loc, info, task_body, loop_info_map);
+  }
+
+  // Erases original operations.
+  for (Operation *op : ops_to_erase) {
+    op->erase();
+  }
+
+  return success();
+}
+
+struct ConstructHyperblockFromTaskPass
+    : public PassWrapper<ConstructHyperblockFromTaskPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConstructHyperblockFromTaskPass)
+
+  StringRef getArgument() const final {
+    return "construct-hyperblock-from-task";
+  }
+
+  StringRef getDescription() const final {
+    return "Constructs hyperblocks and counter chains from Taskflow tasks.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<taskflow::TaskflowDialect, affine::AffineDialect,
+                    memref::MemRefDialect, arith::ArithDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+    // Collects all tasks.
+    SmallVector<TaskflowTaskOp> tasks;
+    func_op.walk([&](TaskflowTaskOp task_op) { tasks.push_back(task_op); });
+
+    // Transforms each task.
+ for (TaskflowTaskOp task_op : tasks) { + if (failed(transformTask(task_op))) { + signalPassFailure(); + return; + } + } + } +}; +} // namespace + +std::unique_ptr mlir::taskflow::createConstructHyperblockFromTaskPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir new file mode 100644 index 00000000..ac2881c1 --- /dev/null +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -0,0 +1,141 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: | FileCheck %s --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: | FileCheck %s --check-prefixes=HYPERBLOCK + +module attributes {} { + func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + affine.for %arg10 = 0 to 4 { + affine.for %arg11 = 0 to 8 { + affine.for %arg12 = 0 to 6 { + %1 = affine.load %arg0[%arg10, %arg11, %arg12] : memref + affine.store %1, %arg5[%arg12] : memref + } + affine.for %arg12 = 0 to 5 { + %1 = affine.load %arg1[%arg10, %arg11, %arg12] : memref + %2 = affine.load %arg2[%arg10, %arg11, %arg12] : memref + %3 = arith.addi %1, %2 : i32 + affine.store %3, %arg6[%arg12] : memref + } + affine.for %arg12 = 0 to 6 { + %1 = affine.load %arg5[%arg12] : memref + %2 = affine.load %arg6[%arg12] : memref + %3 = arith.addi %1, %2 : i32 + %4 = affine.load %arg9[0] : memref + %5 = arith.addi %4, %3 : i32 + affine.store %5, %arg9[0] : memref + } + } + affine.for %arg11 = 0 to 7 { + %1 = affine.load %arg3[%arg10, %arg11] : memref + affine.store %1, %arg7[%arg11] : memref + } + affine.for %arg11 = 0 to 9 { + %1 = affine.load %arg4[%arg10, 
%arg11] : memref + %2 = affine.load %arg7[%arg11] : memref + %3 = arith.addi %1, %2 : i32 + affine.store %3, %arg8[%arg11] : memref + } + } + %0 = affine.load %arg9[0] : memref + return %0 : i32 + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): +// TASKFLOW-NEXT: affine.for %arg20 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg21 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: affine.store %1, %arg13[%arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for %arg22 = 0 to 5 { +// TASKFLOW-NEXT: %1 = affine.load %arg11[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg12[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 +// TASKFLOW-NEXT: affine.store %3, %arg14[%arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg13[%arg22] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg14[%arg22] : memref +// TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 +// TASKFLOW-NEXT: %4 = affine.load %arg15[0] : memref +// TASKFLOW-NEXT: %5 = arith.addi %4, %3 : i32 +// TASKFLOW-NEXT: affine.store %5, %arg15[0] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for 
%arg21 = 0 to 7 { +// TASKFLOW-NEXT: %1 = affine.load %arg16[%arg20, %arg21] : memref +// TASKFLOW-NEXT: affine.store %1, %arg18[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for %arg21 = 0 to 9 { +// TASKFLOW-NEXT: %1 = affine.load %arg17[%arg20, %arg21] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg18[%arg21] : memref +// TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 +// TASKFLOW-NEXT: affine.store %3, %arg19[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () +// TASKFLOW-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) +// TASKFLOW-NEXT: %0 = affine.load %arg9[0] : memref +// TASKFLOW-NEXT: return %0 : i32 +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT:} + +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) 
attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: %4 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: %5 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: %6 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index +// HYPERBLOCK-NEXT: %7 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %2, %3 : index, index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg10[%arg20, %arg21, %arg22] : memref +// HYPERBLOCK-NEXT: memref.store %8, %arg13[%arg22] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %2, %4 : index, index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg11[%arg20, %arg21, %arg22] : memref +// HYPERBLOCK-NEXT: %9 = memref.load %arg12[%arg20, %arg21, %arg22] : memref +// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 +// HYPERBLOCK-NEXT: memref.store %10, %arg14[%arg22] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%5 : index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg13[%arg20] : memref +// HYPERBLOCK-NEXT: %9 = memref.load %arg14[%arg20] : memref +// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %11 = memref.load %arg15[%c0] : memref +// HYPERBLOCK-NEXT: %12 = arith.addi %11, %10 : i32 +// HYPERBLOCK-NEXT: %c0_0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: 
memref.store %12, %arg15[%c0_0] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %6 : index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg16[%arg20, %arg21] : memref +// HYPERBLOCK-NEXT: memref.store %8, %arg18[%arg21] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %7 : index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg17[%arg20, %arg21] : memref +// HYPERBLOCK-NEXT: %9 = memref.load %arg18[%arg21] : memref +// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 +// HYPERBLOCK-NEXT: memref.store %10, %arg19[%arg21] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () +// HYPERBLOCK-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) +// HYPERBLOCK-NEXT: %0 = affine.load %arg9[0] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT:} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir new file mode 100644 index 00000000..ab4360ed --- /dev/null +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -0,0 +1,94 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: | FileCheck %s --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: | FileCheck %s --check-prefixes=HYPERBLOCK + +module { + // Example: Parallel nested loops scenario + // Task 0: Single-level loop (vector scaling) + // Task 1: Two-level nested loop (matrix multiplication) + func.func @parallel_nested_example(%A: 
memref<16xf32>, + %B: memref<8x8xf32>, + %C: memref<8x8xf32>, + %D: memref<8x8xf32>, + %scalar: f32) { + // Task 0: Single-level loop - Vector scaling + // Computes: A[i] = A[i] * scalar + affine.for %i = 0 to 16 { + %v = affine.load %A[%i] : memref<16xf32> + %scaled = arith.mulf %v, %scalar : f32 + affine.store %scaled, %A[%i] : memref<16xf32> + } + + // Task 1: Two-level nested loop - Matrix multiplication + // Computes: D[i][j] = B[i][j] * C[i][j] (element-wise) + affine.for %i = 0 to 8 { + affine.for %j = 0 to 8 { + %b_val = affine.load %B[%i, %j] : memref<8x8xf32> + %c_val = affine.load %C[%i, %j] : memref<8x8xf32> + %product = arith.mulf %b_val, %c_val : f32 + affine.store %product, %D[%i, %j] : memref<8x8xf32> + } + } + return + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { +// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): +// TASKFLOW-NEXT: affine.for %arg7 = 0 to 16 { +// TASKFLOW-NEXT: %0 = affine.load %arg5[%arg7] : memref<16xf32> +// TASKFLOW-NEXT: %1 = arith.mulf %0, %arg6 : f32 +// TASKFLOW-NEXT: affine.store %1, %arg5[%arg7] : memref<16xf32> +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// TASKFLOW-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> +// TASKFLOW-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): +// TASKFLOW-NEXT: affine.for %arg8 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg9 = 0 to 8 { +// TASKFLOW-NEXT: %0 = affine.load %arg5[%arg8, %arg9] : memref<8x8xf32> +// 
TASKFLOW-NEXT: %1 = affine.load %arg6[%arg8, %arg9] : memref<8x8xf32> +// TASKFLOW-NEXT: %2 = arith.mulf %0, %1 : f32 +// TASKFLOW-NEXT: affine.store %2, %arg7[%arg8, %arg9] : memref<8x8xf32> +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () +// TASKFLOW-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// TASKFLOW-NEXT: return +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } + +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { +// HYPERBLOCK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%0 : index) { +// HYPERBLOCK-NEXT: ^bb0(%arg7: index): +// HYPERBLOCK-NEXT: %1 = memref.load %arg5[%arg7] : memref<16xf32> +// HYPERBLOCK-NEXT: %2 = arith.mulf %1, %arg6 : f32 +// HYPERBLOCK-NEXT: memref.store %2, %arg5[%arg7] : memref<16xf32> +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// HYPERBLOCK-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> +// HYPERBLOCK-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes 
{lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%0, %1 : index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg8: index, %arg9: index): +// HYPERBLOCK-NEXT: %2 = memref.load %arg5[%arg8, %arg9] : memref<8x8xf32> +// HYPERBLOCK-NEXT: %3 = memref.load %arg6[%arg8, %arg9] : memref<8x8xf32> +// HYPERBLOCK-NEXT: %4 = arith.mulf %2, %3 : f32 +// HYPERBLOCK-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () +// HYPERBLOCK-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// HYPERBLOCK-NEXT: return +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resenet/resnet.mlir b/test/multi-cgra/taskflow/resenet/resnet.mlir deleted file mode 100644 index f537fe8f..00000000 --- a/test/multi-cgra/taskflow/resenet/resnet.mlir +++ /dev/null @@ -1,74 +0,0 @@ -// RUN: cd %S && python resnet.py - -// RUN: mlir-neura-opt %S/Output/simple_resnet.mlir \ -// RUN: --convert-linalg-to-taskflow -o %t-resnet-taskflow.mlir - -// RUN: FileCheck %s --input-file=%t-resnet-taskflow.mlir - -// CHECK: %2 = taskflow.graph(%arg0, %cst_1, %cst_0, %1, %0, %cst) { -// CHECK-NEXT: ^bb0(%arg1: tensor<1x64x8x8xf32>, %arg2: f32, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<1x64x8x8xf32>, %arg5: tensor<1x64x8x8xf32>, %arg6: tensor<64x64x3x3xf32>): -// CHECK-NEXT: %data_outs = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_0"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32): -// CHECK-NEXT: %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): -// CHECK-NEXT: tensor.yield %arg8 : f32 -// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> 
-// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %3 = taskflow.channel %data_outs : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %data_outs_2 = "taskflow.task"(%arg3, %arg4, %3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_1"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>): -// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %4 = taskflow.channel %data_outs_2 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_3 = "taskflow.task"(%arg5, %arg2, %4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_2"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_2 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %out: f32): -// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg8 : f32 -// CHECK-NEXT: %11 = arith.select %10, %in, %arg8 : f32 -// CHECK-NEXT: linalg.yield %11 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %5 = taskflow.channel %data_outs_3 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_4 = 
"taskflow.task"(%arg2, %5) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_3"}> ({ -// CHECK-NEXT: ^bb0(%arg7: f32, %arg8: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %padded = tensor.pad %data_outs_3 low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): -// CHECK-NEXT: tensor.yield %arg7 : f32 -// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> -// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> -// CHECK-NEXT: }) : (f32, tensor<1x64x8x8xf32>) -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %6 = taskflow.channel %data_outs_4 : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %data_outs_5 = "taskflow.task"(%arg6, %arg4, %6) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_4"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>): -// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs_4, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %7 = taskflow.channel %data_outs_5 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_6 = "taskflow.task"(%arg1, %arg5, %7) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_5"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_5, %arg7 : tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) { -// 
CHECK-NEXT: ^bb0(%in: f32, %in_8: f32, %out: f32): -// CHECK-NEXT: %10 = arith.addf %in, %in_8 : f32 -// CHECK-NEXT: linalg.yield %10 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %8 = taskflow.channel %data_outs_6 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_7 = "taskflow.task"(%arg5, %arg2, %8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_6"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_6 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %out: f32): -// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg8 : f32 -// CHECK-NEXT: %11 = arith.select %10, %in, %arg8 : f32 -// CHECK-NEXT: linalg.yield %11 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.return %data_outs_7 : tensor<1x64x8x8xf32> -// CHECK-NEXT: } : (tensor<1x64x8x8xf32>, f32, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<64x64x3x3xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: return %2 : tensor<1x64x8x8xf32> -// CHECK-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resenet/resnet.py b/test/multi-cgra/taskflow/resenet/resnet.py deleted file mode 100644 index 90523903..00000000 --- a/test/multi-cgra/taskflow/resenet/resnet.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch -import torch.nn as nn -from torch._inductor.decomposition import decompositions as inductor_decomp -import os - - -class 
SimpleResNetBlock(nn.Module): - """ - Minimal ResNet Block: Conv -> ReLU -> Conv -> Add (residual) - """ - - def __init__(self, channels=64): - super().__init__() - self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False) - self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False) - - def forward(self, x): - residual = x - out = self.conv1(x) - out = torch.relu(out) - out = self.conv2(out) - out = out + residual # Residual connection - out = torch.relu(out) - return out - - -def generate_mlir(): - """Generate MLIR with Linalg ops""" - model = SimpleResNetBlock(channels=64) - model.eval() - - # Small input for quick testing: [batch, channels, height, width] - x = torch.randn(1, 64, 8, 8) - - # Export to MLIR via torch-mlir - try: - from torch_mlir import compile - - mlir_module = compile( - model, x, output_type="linalg-on-tensors", use_tracing=True - ) - output_dir = os.path.dirname(os.path.abspath(__file__)) - output_dir = os.path.join(output_dir, "Output") - os.makedirs(output_dir, exist_ok=True) - filename = os.path.join(output_dir, "simple_resnet.mlir") - with open(filename, "w") as f: - f.write(str(mlir_module)) - except ImportError: - print("Error: torch-mlir is not installed.\n") - - -if __name__ == "__main__": - generate_mlir() diff --git a/tools/mlir-neura-opt/CMakeLists.txt b/tools/mlir-neura-opt/CMakeLists.txt index 70c06a51..e1e49db2 100644 --- a/tools/mlir-neura-opt/CMakeLists.txt +++ b/tools/mlir-neura-opt/CMakeLists.txt @@ -5,8 +5,10 @@ set(LIBS ${dialect_libs} ${conversion_libs} MLIRNeuraTransforms + MLIRTaskflowTransforms MLIRConversion MLIRNeura + MLIRTaskflow MLIRTransforms MLIROptLib MLIRPass diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index cd824879..a4ac0e2e 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -17,6 +17,8 @@ #include "NeuraDialect/Architecture/ArchitectureSpec.h" #include 
"NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowPasses.h" // Global variable to store architecture spec file path static std::string architecture_spec_file; @@ -71,10 +73,12 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); mlir::neura::registerPasses(); mlir::registerPasses(); mlir::registerViewOpGraphPass(); + mlir::taskflow::registerPasses(); // Register all standard conversion passes mlir::registerConversionPasses();