diff --git a/include/TaskflowDialect/TaskflowAttributes.h b/include/TaskflowDialect/TaskflowAttributes.h
new file mode 100644
index 00000000..668d649d
--- /dev/null
+++ b/include/TaskflowDialect/TaskflowAttributes.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+namespace taskflow {
+namespace attr {
+// Attribute keys on taskflow.task operations produced by the
+// TaskDivisibilityAnalysisPass.
+constexpr llvm::StringLiteral kDivisibilityInfo = "divisibility_info";
+constexpr llvm::StringLiteral kDivisibility = "divisibility";
+constexpr llvm::StringLiteral kParallelDims = "parallel_dims";
+constexpr llvm::StringLiteral kParallelSpace = "parallel_space";
+
+namespace val {
+constexpr llvm::StringLiteral kDivisible = "divisible";
+constexpr llvm::StringLiteral kAtomic = "atomic";
+} // namespace val
+} // namespace attr
+} // namespace taskflow
+} // namespace mlir
\ No newline at end of file
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 92393d7c..21d7f322 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -23,6 +23,7 @@ std::unique_ptr<Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<Pass> createClassifyCountersPass();
 std::unique_ptr<Pass> createMapTaskOnCgraPass();
 std::unique_ptr<Pass> createFuseTaskPass();
+std::unique_ptr<Pass> createTaskDivisibilityAnalysisPass();
 
 //=========================================================//
 // Optimization Passes
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 5cf07cd7..5494d329 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -95,6 +95,33 @@ def FuseTask : Pass<"fuse-task", "func::FuncOp"> {
                            "mlir::taskflow::TaskflowDialect"];
 }
 
+def TaskDivisibilityAnalysis : Pass<"task-divisibility-analysis", "func::FuncOp"> {
+  let summary = "Analyzes task divisibility based on loop 
parallelism";
+  let description = [{
+    Analyzes each taskflow.task operation to determine whether its loop nest
+    contains parallel loops that can be tiled for data-level parallelism.
+
+    Task divisibility categories:
+    - divisible: The task has at least one parallel loop (no loop-carried
+      dependencies) whose trip count > 1. Such tasks can be tiled into
+      sibling sub-tasks for runtime configuration duplication (DLP).
+    - atomic: The task has no exploitable parallel loops. It must execute
+      as a single indivisible unit.
+
+    The pass attaches a single `divisibility_info` dictionary attribute to
+    each taskflow.task, with three entries:
+    - divisibility   : StringAttr ("divisible" or "atomic")
+    - parallel_dims  : DenseI32ArrayAttr (loop depth indices of parallel loops)
+    - parallel_space : DenseI32ArrayAttr (trip counts of the parallel dims)
+
+    Parallel loop detection uses MLIR's affine dependence analysis
+    (isLoopParallel). Loops carrying reductions (iter_args) are not treated
+    as parallel.
+  }];
+  let constructor = "taskflow::createTaskDivisibilityAnalysisPass()";
+  let dependentDialects = ["mlir::affine::AffineDialect",
+                           "mlir::func::FuncDialect"];
+}
+
 def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> {
   let summary =
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index 60078298..02b69ee5 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@ add_mlir_library(MLIRTaskflowTransforms
   ClassifyCountersPass.cpp
   MapTaskOnCgraPass.cpp
   FuseTaskPass.cpp
+  TaskDivisibilityAnalysisPass.cpp
 
   DEPENDS
   MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp b/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp
new file mode 100644
index 00000000..f24b4004
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp
@@ -0,0 +1,220 @@
+//===- TaskDivisibilityAnalysisPass.cpp - Analyze 
task divisibility ----===//
+//
+// This pass analyzes each taskflow.task operation to determine whether its
+// loop nest contains parallel loops that can be tiled for data-level
+// parallelism (DLP).
+//
+// Task divisibility categories:
+//   - divisible: Has at least one parallel loop (no loop-carried deps) with
+//     trip_count > 1. Can be tiled into sibling sub-tasks for runtime
+//     configuration duplication.
+//   - atomic: No exploitable parallel loops. Must execute as a single
+//     indivisible unit.
+//
+// The pass attaches an attribute to each taskflow.task:
+//   divisibility_info = {
+//     divisibility   : StringAttr ("divisible" or "atomic")
+//     parallel_dims  : DenseI32ArrayAttr (loop depth indices of parallel loops)
+//     parallel_space : DenseI32ArrayAttr (trip counts of those parallel loops)
+//   }
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowAttributes.h"
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
+#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Loop Nest Traversal Helpers
+//===----------------------------------------------------------------------===//
+
+// Collects the full loop nest starting from `outermost`, walking into
+// perfectly and imperfectly nested loops (only follows the first nested
+// affine.for at each level to form the "spine" of the nest).
+static SmallVector<affine::AffineForOp, 4>
+collectLoopNest(affine::AffineForOp outermost) {
+  SmallVector<affine::AffineForOp, 4> nest;
+  affine::AffineForOp current = outermost;
+
+  while (current) {
+    nest.push_back(current);
+
+    // Looks for a single nested affine.for in the body.
+    affine::AffineForOp nested = nullptr;
+    for (Operation &op : current.getBody()->getOperations()) {
+      if (auto for_op = dyn_cast<affine::AffineForOp>(&op)) {
+        if (nested) {
+          // Multiple nested loops — stop descending (not a simple chain).
+          nested = nullptr;
+          break;
+        }
+        nested = for_op;
+      }
+    }
+    current = nested;
+  }
+
+  return nest;
+}
+
+//===----------------------------------------------------------------------===//
+// Per-Task Parallelism Analysis
+//===----------------------------------------------------------------------===//
+
+struct TaskParallelismInfo {
+  StringRef divisibility;              // "divisible" or "atomic"
+  SmallVector<int32_t> parallel_dims;  // Loop depth indices of parallel loops.
+  SmallVector<int32_t> parallel_space; // Trip counts of parallel dims.
+};
+
+// Analyzes a single taskflow.task and determines its category.
+static TaskParallelismInfo analyzeTask(TaskflowTaskOp task_op) {
+  TaskParallelismInfo info;
+  info.divisibility = attr::val::kAtomic; // Default: no parallelism found.
+
+  // Finds the outermost affine.for in the task body.
+  affine::AffineForOp outermost_loop = nullptr;
+  task_op.getBody().walk([&](affine::AffineForOp for_op) {
+    // We want the outermost loop. Walk visits ops in pre-order,
+    // so the first affine.for encountered at the top level is outermost.
+    if (!outermost_loop) {
+      // Checks that this loop is at the top level of the task body
+      // (its parent is the task's block, not another loop).
+      if (for_op->getParentOp() == task_op.getOperation()) {
+        outermost_loop = for_op;
+      }
+    }
+  });
+
+  if (!outermost_loop) {
+    llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+                 << ": no affine.for found, classified as atomic\n";
+    return info;
+  }
+
+  // Collects the loop nest spine.
+  SmallVector<affine::AffineForOp, 4> loop_nest = collectLoopNest(outermost_loop);
+
+  llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+               << ": loop nest depth = " << loop_nest.size() << "\n";
+
+  // Analyzes each loop level for parallelism.
+  for (size_t depth = 0; depth < loop_nest.size(); ++depth) {
+    affine::AffineForOp loop = loop_nest[depth];
+
+    // Checks if the loop is parallel (not including reduction-parallel).
+    bool is_parallel = affine::isLoopParallel(loop);
+
+    // Gets the trip count.
+    std::optional<uint64_t> trip_count = affine::getConstantTripCount(loop);
+    int tc = trip_count.has_value() ? static_cast<int>(*trip_count) : -1;
+
+    llvm::errs() << "[TaskDivisibilityAnalysis] depth " << depth
+                 << ": parallel=" << is_parallel << ", trip_count=" << tc
+                 << "\n";
+
+    if (is_parallel && tc > 1) {
+      info.parallel_dims.push_back(static_cast<int32_t>(depth));
+      info.parallel_space.push_back(tc);
+    }
+  }
+
+  // Classifies based on whether any parallel dims were found.
+  if (!info.parallel_dims.empty()) {
+    info.divisibility = attr::val::kDivisible;
+  }
+
+  llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+               << " -> " << info.divisibility;
+  if (!info.parallel_dims.empty()) {
+    llvm::errs() << ", parallel_dims=[";
+    for (size_t i = 0; i < info.parallel_dims.size(); ++i) {
+      if (i > 0)
+        llvm::errs() << ",";
+      llvm::errs() << info.parallel_dims[i];
+    }
+    llvm::errs() << "], parallel_space=[";
+    for (size_t i = 0; i < info.parallel_space.size(); ++i) {
+      if (i > 0)
+        llvm::errs() << ",";
+      llvm::errs() << info.parallel_space[i];
+    }
+    llvm::errs() << "]";
+  }
+  llvm::errs() << "\n";
+
+  return info;
+}
+
+//===----------------------------------------------------------------------===//
+// Task Divisibility Analysis Pass
+//===----------------------------------------------------------------------===//
+
+struct TaskDivisibilityAnalysisPass
+    : public PassWrapper<TaskDivisibilityAnalysisPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TaskDivisibilityAnalysisPass)
+
+  StringRef 
getArgument() const final { return "task-divisibility-analysis"; }
+
+  StringRef getDescription() const final {
+    return "Analyzes task divisibility based on loop parallelism";
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    llvm::errs() << "[TaskDivisibilityAnalysis] Running on function: "
+                 << func.getName() << "\n";
+
+    func.walk([&](TaskflowTaskOp task_op) {
+      // Analyzes the task.
+      TaskParallelismInfo info = analyzeTask(task_op);
+      // Attaches the divisibility_info attribute to each task.
+      MLIRContext *ctx = task_op.getContext();
+      OpBuilder builder(task_op);
+
+      SmallVector<NamedAttribute> div_attrs;
+      div_attrs.push_back(
+          NamedAttribute(StringAttr::get(ctx, attr::kDivisibility),
+                         StringAttr::get(ctx, info.divisibility)));
+      div_attrs.push_back(
+          NamedAttribute(StringAttr::get(ctx, attr::kParallelDims),
+                         DenseI32ArrayAttr::get(ctx, info.parallel_dims)));
+      div_attrs.push_back(
+          NamedAttribute(StringAttr::get(ctx, attr::kParallelSpace),
+                         DenseI32ArrayAttr::get(ctx, info.parallel_space)));
+
+      task_op->setAttr(attr::kDivisibilityInfo,
+                       builder.getDictionaryAttr(div_attrs));
+    });
+  }
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass Registration
+//===----------------------------------------------------------------------===//
+
+std::unique_ptr<Pass> mlir::taskflow::createTaskDivisibilityAnalysisPass() {
+  return std::make_unique<TaskDivisibilityAnalysisPass>();
+}
diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir
index cc8e85d8..ac231e87 100644
--- a/test/multi-cgra/kernel_mapping/fir/fir.mlir
+++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir
@@ -2,6 +2,11 @@
 // RUN:   -o %t.taskflow.mlir
 // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW
 
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN:   --task-divisibility-analysis \
+// RUN:   -o %t.div.mlir
+// RUN: FileCheck %s --input-file=%t.div.mlir --check-prefixes=DIV
+
// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o %t.hyperblock.mlir @@ -107,6 +112,24 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } +// DIV: module { +// DIV-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DIV-NEXT: %c0_i32 = arith.constant 0 : i32 +// DIV-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] {divisibility_info = {divisibility = "atomic", parallel_dims = array, parallel_space = array}} : (memref, memref, i32) -> (memref, memref, i32) { +// DIV-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DIV-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { +// DIV-NEXT: %1 = affine.load %arg3[%arg6] : memref +// DIV-NEXT: %2 = affine.load %arg4[%arg6] : memref +// DIV-NEXT: %3 = arith.muli %1, %2 : i32 +// DIV-NEXT: %4 = arith.addi %arg7, %3 : i32 +// DIV-NEXT: affine.yield %4 : i32 +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%0 : i32) +// DIV-NEXT: } +// DIV-NEXT: return %value_outputs : i32 +// DIV-NEXT: } +// DIV-NEXT: } + // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index 968031f0..aa7362eb 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -15,6 +15,15 @@ // RUN: -o %t.stream.mlir // RUN: FileCheck %s --input-file=%t.stream.mlir --check-prefixes=STREAM +// RUN: mlir-neura-opt %t.affine.mlir \ 
+// RUN: --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --memory-access-streaming-fusion \ +// RUN: --task-divisibility-analysis \ +// RUN: -o %t.div.mlir +// RUN: FileCheck %s --input-file=%t.div.mlir --check-prefixes=DIV + // RUN: mlir-neura-opt %t.stream.mlir \ // RUN: --affine-loop-tree-serialization \ // RUN: --affine-loop-perfection \ @@ -704,6 +713,187 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } +// DIV: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { +// DIV-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} +// DIV-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32_0 : memref<64x3x3x64xf32> = dense<-0.0151730878> {alignment = 64 : i64} +// DIV-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32 : memref<64x3x3x64xf32> = dense<0.0197670367> {alignment = 64 : i64} +// DIV-NEXT: func.func @forward(%arg0: memref<1x64x8x8xf32>) -> memref<1x64x8x8xf32> { +// DIV-NEXT: %cst = arith.constant 0.0197670367 : f32 +// DIV-NEXT: %cst_0 = arith.constant -0.0151730878 : f32 +// DIV-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 +// DIV-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +// DIV-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref<1x64x8x8xf32>) dependency_write_in(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): 
+// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg3, %arg6, %arg4, %arg5] : memref<1x64x8x8xf32> +// DIV-NEXT: affine.store %0, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// DIV-NEXT: %dependency_write_out_4 = taskflow.task @Task_1 dependency_write_in(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 10 { +// DIV-NEXT: affine.for %arg5 = 0 to 10 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x10x10x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_write_out_6 = taskflow.task @Task_2 dependency_write_in(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { 
+// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %dependency_read_out_7:2, %dependency_write_out_8 = taskflow.task @Task_3 dependency_read_in(%dependency_write_out_4, %dependency_write_out_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// DIV-NEXT: affine.for %arg5 = 0 to 1 { +// DIV-NEXT: affine.for %arg6 = 0 to 8 { +// DIV-NEXT: affine.for %arg7 = 0 to 8 { +// DIV-NEXT: affine.for %arg8 = 0 to 64 { +// DIV-NEXT: affine.for %arg9 = 0 to 3 { +// DIV-NEXT: affine.for %arg10 = 0 to 3 { +// DIV-NEXT: affine.for %arg11 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg5, %arg6 + %arg9, %arg7 + %arg10, %arg11] : memref<1x10x10x64xf32> +// DIV-NEXT: %1 = affine.load %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: %2 = arith.mulf %0, %arg4 : f32 +// DIV-NEXT: %3 = arith.addf %1, %2 : f32 +// DIV-NEXT: affine.store %3, %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield 
reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// DIV-NEXT: %dependency_read_out_10, %dependency_write_out_11 = taskflow.task @Task_4_Task_5_fused dependency_read_in(%dependency_write_out_8 : memref<1x8x8x64xf32>) dependency_write_in(%alloc_9 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): +// DIV-NEXT: affine.for %arg5 = 0 to 1 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.for %arg7 = 0 to 8 { +// DIV-NEXT: affine.for %arg8 = 0 to 8 { +// DIV-NEXT: %0 = affine.load %arg1[%arg5, %arg7, %arg8, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: %1 = arith.minimumf %0, %arg3 : f32 +// DIV-NEXT: %2 = arith.maximumf %1, %arg4 : f32 +// DIV-NEXT: affine.store %2, %arg2[%arg5, %arg6, %arg7, %arg8] : memref<1x64x8x8xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_read_out_13, %dependency_write_out_14 = taskflow.task @Task_6 dependency_read_in(%dependency_write_out_11 : memref<1x64x8x8xf32>) dependency_write_in(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, 
parallel_space = array}} : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg3, %arg6, %arg4, %arg5] : memref<1x64x8x8xf32> +// DIV-NEXT: affine.store %0, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// DIV-NEXT: %dependency_write_out_16 = taskflow.task @Task_7 dependency_write_in(%alloc_15 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_15 : memref<1x10x10x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 10 { +// DIV-NEXT: affine.for %arg5 = 0 to 10 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x10x10x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_write_out_18 = taskflow.task @Task_8 dependency_write_in(%alloc_17 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", 
parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %dependency_read_out_19:2, %dependency_write_out_20 = taskflow.task @Task_9 dependency_read_in(%dependency_write_out_16, %dependency_write_out_18 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_18 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_15, %alloc_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// DIV-NEXT: affine.for %arg5 = 0 to 1 { +// DIV-NEXT: affine.for %arg6 = 0 to 8 { +// DIV-NEXT: affine.for %arg7 = 0 to 8 { +// DIV-NEXT: affine.for %arg8 = 0 to 64 { +// DIV-NEXT: affine.for %arg9 = 0 to 3 { +// DIV-NEXT: affine.for %arg10 = 0 to 3 { +// DIV-NEXT: affine.for %arg11 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg5, %arg6 + %arg9, %arg7 + %arg10, %arg11] : memref<1x10x10x64xf32> +// DIV-NEXT: %1 = affine.load %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: %2 = arith.mulf %0, %arg4 : f32 +// DIV-NEXT: %3 = arith.addf %1, %2 : f32 +// DIV-NEXT: 
affine.store %3, %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// DIV-NEXT: %dependency_read_out_22:2, %dependency_write_out_23 = taskflow.task @Task_10_Task_11_Task_12_fused_fused dependency_read_in(%dependency_write_out_20, %dependency_read_out : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) dependency_write_in(%alloc_21 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_17, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>, %arg4: f32, %arg5: f32): +// DIV-NEXT: affine.for %arg6 = 0 to 1 { +// DIV-NEXT: affine.for %arg7 = 0 to 64 { +// DIV-NEXT: affine.for %arg8 = 0 to 8 { +// DIV-NEXT: affine.for %arg9 = 0 to 8 { +// DIV-NEXT: %0 = affine.load %arg1[%arg6, %arg8, %arg9, %arg7] : memref<1x8x8x64xf32> +// DIV-NEXT: %1 = affine.load %arg2[%arg6, %arg7, %arg8, %arg9] : memref<1x64x8x8xf32> +// DIV-NEXT: %2 = arith.addf %0, %1 : f32 +// DIV-NEXT: %3 = arith.minimumf %2, %arg4 : f32 +// DIV-NEXT: %4 = arith.maximumf %3, %arg5 : f32 +// DIV-NEXT: affine.store %4, %arg3[%arg6, %arg7, %arg8, %arg9] : memref<1x64x8x8xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1, %arg2 : memref<1x8x8x64xf32>, 
memref<1x64x8x8xf32>) writes(%arg3 : memref<1x64x8x8xf32>) +// DIV-NEXT: } +// DIV-NEXT: return %dependency_write_out_23 : memref<1x64x8x8xf32> +// DIV-NEXT: } +// DIV-NEXT: } + // RESOPT: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // RESOPT-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64}