coredac · ShangkunLi · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/include/TaskflowDialect/TaskflowAttributes.h b/include/TaskflowDialect/TaskflowAttributes.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+namespace taskflow {
+namespace attr {
+// Name of the dictionary attribute that TaskDivisibilityAnalysisPass attaches
+// to each taskflow.task operation.
+inline constexpr llvm::StringLiteral kDivisibilityInfo = "divisibility_info";
+
+// Keys of the entries stored inside the `divisibility_info` dictionary.
+// `inline` gives every translation unit the same entity instead of a private
+// copy per includer.
+inline constexpr llvm::StringLiteral kDivisibility = "divisibility";
+inline constexpr llvm::StringLiteral kParallelDims = "parallel_dims";
+inline constexpr llvm::StringLiteral kParallelSpace = "parallel_space";
+
+namespace val {
+// Values the pass writes for the `divisibility` entry.
+inline constexpr llvm::StringLiteral kDivisible = "divisible";
+inline constexpr llvm::StringLiteral kAtomic = "atomic";
+} // namespace val
+} // namespace attr
+} // namespace taskflow
+} // namespace mlir
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
@@ -23,6 +23,7 @@ std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createClassifyCountersPass();
 std::unique_ptr<mlir::Pass> createMapTaskOnCgraPass();
 std::unique_ptr<mlir::Pass> createFuseTaskPass();
+std::unique_ptr<mlir::Pass> createTaskDivisibilityAnalysisPass();
 
 //=========================================================//
 // Optimization Passes

diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
@@ -95,6 +95,33 @@ def FuseTask : Pass<"fuse-task", "func::FuncOp"> {
                            "mlir::taskflow::TaskflowDialect"];
 }
 
+def TaskDivisibilityAnalysis : Pass<"task-divisibility-analysis", "func::FuncOp"> {
+  let summary = "Analyzes task divisibility based on loop parallelism";
+  let description = [{
+    Analyzes each taskflow.task operation to determine whether its loop nest
+    contains parallel loops that can be tiled for data-level parallelism.
+
+    Task divisibility categories:
+    - divisible: The task has at least one parallel loop (no loop-carried
+      dependencies) whose constant trip count is greater than 1.  Such tasks
+      can be tiled into sibling sub-tasks for runtime configuration
+      duplication (DLP).
+    - atomic: The task has no exploitable parallel loops.  It must execute
+      as a single indivisible unit.
+
+    The pass attaches a single dictionary attribute, `divisibility_info`, to
+    each taskflow.task.  Its entries are:
+    - divisibility : StringAttr  ("divisible" or "atomic")
+    - parallel_dims : DenseI32ArrayAttr  (loop depth indices of parallel loops)
+    - parallel_space : DenseI32ArrayAttr  (trip counts of the parallel dims)
+
+    Parallel loop detection uses MLIR's affine dependence analysis
+    (isLoopParallel) without reduction detection, so loops that carry values
+    through iter_args (including reductions) are not counted as parallel.
+  }];
+  let constructor = "taskflow::createTaskDivisibilityAnalysisPass()";
+  let dependentDialects = ["mlir::affine::AffineDialect",
+                           "mlir::func::FuncDialect"];
+}
+
 def MemoryAccessStreamingFusion
     : Pass<"memory-access-streaming-fusion", "func::FuncOp"> {
   let summary =

diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@ add_mlir_library(MLIRTaskflowTransforms
     ClassifyCountersPass.cpp
     MapTaskOnCgraPass.cpp
     FuseTaskPass.cpp
+    TaskDivisibilityAnalysisPass.cpp
 
     DEPENDS
     MLIRTaskflowTransformsIncGen

diff --git a/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp b/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp
@@ -0,0 +1,220 @@
+//===- TaskDivisibilityAnalysisPass.cpp - Analyze task divisibility ----===//
+//
+// This pass analyzes each taskflow.task operation to determine whether its
+// loop nest contains parallel loops that can be tiled for data-level
+// parallelism (DLP).
+//
+// Task divisibility categories:
+//   - divisible: Has at least one parallel loop (no loop-carried deps) with
+//     trip_count > 1.  Can be tiled into sibling sub-tasks for runtime
+//     configuration duplication.
+//   - atomic: No exploitable parallel loops.  Must execute as a single
+//     indivisible unit.
+//
+// The pass attaches an attribute to each taskflow.task:
+//   divisibility_info = {
+//     divisibility   : StringAttr       ("divisible" or "atomic")
+//     parallel_dims  : DenseI32ArrayAttr (loop depth indices of parallel loops)
+//     parallel_space : DenseI32ArrayAttr (trip counts of those parallel loops)
+//   }
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowAttributes.h"
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
+#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstdint>
+#include <limits>
+#include <optional>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Loop Nest Traversal Helpers
+//===----------------------------------------------------------------------===//
+
+// Returns the chain of affine.for ops rooted at `outermost`, ordered from
+// outermost to innermost. At every level the descent continues only when the
+// current loop body directly contains exactly one affine.for; a body holding
+// zero or several affine.for ops ends the chain at the current loop. Other
+// (non-loop) ops in a body do not stop the descent, so imperfectly nested
+// loops are followed as well.
+static SmallVector<affine::AffineForOp>
+collectLoopNest(affine::AffineForOp outermost) {
+  SmallVector<affine::AffineForOp> chain;
+
+  for (affine::AffineForOp loop = outermost; loop;) {
+    chain.push_back(loop);
+
+    // Enumerates the affine.for ops sitting directly in this loop's body.
+    auto inner_loops = loop.getBody()->getOps<affine::AffineForOp>();
+    auto it = inner_loops.begin();
+    const auto end = inner_loops.end();
+
+    affine::AffineForOp only_child = nullptr;
+    if (it != end) {
+      only_child = *it;
+      // A second directly nested loop means this is not a simple chain.
+      if (++it != end)
+        only_child = nullptr;
+    }
+    loop = only_child;
+  }
+
+  return chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Per-Task Parallelism Analysis
+//===----------------------------------------------------------------------===//
+
+// Result of analyzing the loop nest of a single taskflow.task.
+struct TaskParallelismInfo {
+  // Classification of the task: "divisible" or "atomic". Defaults to
+  // "atomic" so that a default-constructed value is already a valid,
+  // conservative classification rather than an empty string.
+  StringRef divisibility = attr::val::kAtomic;
+  // Loop depth indices (0 = outermost loop of the nest) of parallel loops.
+  SmallVector<int> parallel_dims;
+  // Trip counts of the loops listed in `parallel_dims`, in the same order.
+  SmallVector<int> parallel_space;
+};
+
+// Analyzes a single taskflow.task and determines its category.
+//
+// Only the first affine.for located directly in the task body is considered.
+// Starting from it, the loop nest spine returned by collectLoopNest() is
+// scanned, and every loop that affine::isLoopParallel() reports as parallel
+// and that has a constant trip count greater than 1 is recorded. No reduction
+// list is passed to isLoopParallel(), so reduction-parallel loops are not
+// included.
+//
+// Returns "divisible" together with the recorded depths and trip counts when
+// at least one such loop exists, and "atomic" with empty lists otherwise.
+// Progress is logged to llvm::errs().
+static TaskParallelismInfo analyzeTask(TaskflowTaskOp task_op) {
+  // Largest trip count that can be stored in parallel_space, which is later
+  // emitted as a DenseI32ArrayAttr.
+  constexpr uint64_t kMaxTripCount =
+      static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
+
+  TaskParallelismInfo info;
+  info.divisibility = attr::val::kAtomic; // Default: no parallelism found.
+
+  // Finds the outermost affine.for: the first one nested directly in the task
+  // body. getOps only enumerates ops that sit directly in the body, so there
+  // is no need to visit (and filter out) every nested operation.
+  affine::AffineForOp outermost_loop = nullptr;
+  for (affine::AffineForOp for_op :
+       task_op.getBody().getOps<affine::AffineForOp>()) {
+    outermost_loop = for_op;
+    break;
+  }
+
+  if (!outermost_loop) {
+    llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+                 << ": no affine.for found, classified as atomic\n";
+    return info;
+  }
+
+  // Collects the loop nest spine.
+  SmallVector<affine::AffineForOp> loop_nest = collectLoopNest(outermost_loop);
+
+  llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+               << ": loop nest depth = " << loop_nest.size() << "\n";
+
+  // Analyzes each loop level for parallelism.
+  for (size_t depth = 0; depth < loop_nest.size(); ++depth) {
+    affine::AffineForOp loop = loop_nest[depth];
+
+    // Checks if the loop is parallel (not including reduction-parallel).
+    bool is_parallel = affine::isLoopParallel(loop);
+
+    // Gets the constant trip count, if any. It is kept in its native 64-bit
+    // unsigned form so that large counts are not silently truncated.
+    std::optional<uint64_t> trip_count = affine::getConstantTripCount(loop);
+
+    // An unknown (non-constant) trip count is logged as -1.
+    llvm::errs() << "[TaskDivisibilityAnalysis]   depth " << depth
+                 << ": parallel=" << is_parallel << ", trip_count=";
+    if (trip_count)
+      llvm::errs() << *trip_count;
+    else
+      llvm::errs() << -1;
+    llvm::errs() << "\n";
+
+    if (!is_parallel || !trip_count || *trip_count <= 1)
+      continue;
+
+    // A count that does not fit in i32 cannot be represented in
+    // parallel_space; skipping the loop is the conservative choice.
+    if (*trip_count > kMaxTripCount) {
+      llvm::errs() << "[TaskDivisibilityAnalysis]   depth " << depth
+                   << ": trip count exceeds i32 range, not recorded\n";
+      continue;
+    }
+
+    info.parallel_dims.push_back(static_cast<int>(depth));
+    info.parallel_space.push_back(static_cast<int>(*trip_count));
+  }
+
+  // Classifies based on whether any parallel dims were found.
+  if (!info.parallel_dims.empty()) {
+    info.divisibility = attr::val::kDivisible;
+  }
+
+  // Prints the elements of `values` separated by commas.
+  auto print_list = [](const SmallVector<int> &values) {
+    for (size_t i = 0; i < values.size(); ++i) {
+      if (i > 0)
+        llvm::errs() << ",";
+      llvm::errs() << values[i];
+    }
+  };
+
+  llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+               << " -> " << info.divisibility;
+  if (!info.parallel_dims.empty()) {
+    llvm::errs() << ", parallel_dims=[";
+    print_list(info.parallel_dims);
+    llvm::errs() << "], parallel_space=[";
+    print_list(info.parallel_space);
+    llvm::errs() << "]";
+  }
+  llvm::errs() << "\n";
+
+  return info;
+}
+
+//===----------------------------------------------------------------------===//
+// Task Divisibility Analysis Pass
+//===----------------------------------------------------------------------===//
+
+// Function pass that runs analyzeTask() on every taskflow.task nested in the
+// function and stores the result on the op as the `divisibility_info`
+// dictionary attribute.
+struct TaskDivisibilityAnalysisPass
+    : public PassWrapper<TaskDivisibilityAnalysisPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TaskDivisibilityAnalysisPass)
+
+  // Command-line argument that selects this pass.
+  StringRef getArgument() const final { return "task-divisibility-analysis"; }
+
+  // One-line description of the pass.
+  StringRef getDescription() const final {
+    return "Analyzes task divisibility based on loop parallelism";
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+    llvm::errs() << "[TaskDivisibilityAnalysis] Running on function: "
+                 << func_op.getName() << "\n";
+
+    func_op.walk([](TaskflowTaskOp task_op) {
+      const TaskParallelismInfo result = analyzeTask(task_op);
+      MLIRContext *context = task_op.getContext();
+
+      // Pairs one of the attr:: keys with its value.
+      auto entry = [context](StringRef key, Attribute value) {
+        return NamedAttribute(StringAttr::get(context, key), value);
+      };
+
+      // The three entries of the divisibility_info dictionary.
+      const NamedAttribute entries[] = {
+          entry(attr::kDivisibility,
+                StringAttr::get(context, result.divisibility)),
+          entry(attr::kParallelDims,
+                DenseI32ArrayAttr::get(context, result.parallel_dims)),
+          entry(attr::kParallelSpace,
+                DenseI32ArrayAttr::get(context, result.parallel_space)),
+      };
+
+      task_op->setAttr(attr::kDivisibilityInfo,
+                       DictionaryAttr::get(context, entries));
+    });
+  }
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass Registration
+//===----------------------------------------------------------------------===//
+
+// Creates a new instance of the task divisibility analysis pass.
+std::unique_ptr<Pass> mlir::taskflow::createTaskDivisibilityAnalysisPass() {
+  return std::make_unique<TaskDivisibilityAnalysisPass>();
+}
diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir
@@ -2,6 +2,11 @@
 // RUN: -o %t.taskflow.mlir
 // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW
 
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --task-divisibility-analysis \
+// RUN: -o %t.div.mlir
+// RUN: FileCheck %s --input-file=%t.div.mlir --check-prefixes=DIV
+
 // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
 // RUN: -o %t.hyperblock.mlir
@@ -107,6 +112,24 @@ module attributes {} {
 // TASKFLOW-NEXT:   }
 // TASKFLOW-NEXT: }
 
+// DIV:      module {
+// DIV-NEXT:   func.func @_Z6kernelPiS_S_(%arg0: memref<?xi32>, %arg1: memref<?xi32>, %arg2: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+// DIV-NEXT:     %c0_i32 = arith.constant 0 : i32
+// DIV-NEXT:     %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref<?xi32>, memref<?xi32>) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref<?xi32>, memref<?xi32>)] {divisibility_info = {divisibility = "atomic", parallel_dims = array<i32>, parallel_space = array<i32>}} : (memref<?xi32>, memref<?xi32>, i32) -> (memref<?xi32>, memref<?xi32>, i32) {
+// DIV-NEXT:     ^bb0(%arg3: memref<?xi32>, %arg4: memref<?xi32>, %arg5: i32):
+// DIV-NEXT:       %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) {
+// DIV-NEXT:         %1 = affine.load %arg3[%arg6] : memref<?xi32>
+// DIV-NEXT:         %2 = affine.load %arg4[%arg6] : memref<?xi32>
+// DIV-NEXT:         %3 = arith.muli %1, %2 : i32
+// DIV-NEXT:         %4 = arith.addi %arg7, %3 : i32
+// DIV-NEXT:         affine.yield %4 : i32
+// DIV-NEXT:       }
+// DIV-NEXT:       taskflow.yield reads(%arg3, %arg4 : memref<?xi32>, memref<?xi32>) values(%0 : i32)
+// DIV-NEXT:     }
+// DIV-NEXT:     return %value_outputs : i32
+// DIV-NEXT:   }
+// DIV-NEXT: }
+
 // HYPERBLOCK: module {
 // HYPERBLOCK-NEXT:   func.func @_Z6kernelPiS_S_(%arg0: memref<?xi32>, %arg1: memref<?xi32>, %arg2: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
 // HYPERBLOCK-NEXT:     %c0_i32 = arith.constant 0 : i32