diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 14e27a03..c376ac70 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -23,6 +23,10 @@ std::unique_ptr<mlir::Pass> createLowerAffineToNeuraPass();
 // TaskFlow Conversion Passes.
 std::unique_ptr<mlir::Pass> createConvertAffineToTaskflowPass();
 std::unique_ptr<mlir::Pass> createConvertTaskflowToNeuraPass();
+
+// Memref SubView and Copy Conversion Passes.
+std::unique_ptr<mlir::Pass> createFoldSubViewPass();
+std::unique_ptr<mlir::Pass> createConvertCopyToAffineLoopsPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index ec0b8a56..8d902055 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -93,4 +93,35 @@ def ConvertTaskflowToNeura : Pass<"convert-taskflow-to-neura", "ModuleOp">{
   ];
 }
 
+def FoldSubView : Pass<"fold-subview", "func::FuncOp"> {
+  let summary = "Folds memref.subview into affine load/store operations";
+  let description = [{
+    Eliminates memref.subview operations by folding them into their users
+    (affine.load and affine.store). Adjusts indices to account for subview
+    offsets and strides, enabling direct access to the source memref.
+  }];
+  let constructor = "mlir::createFoldSubViewPass()";
+  let dependentDialects = [
+    "mlir::affine::AffineDialect",
+    "mlir::memref::MemRefDialect",
+    "mlir::arith::ArithDialect",
+    "mlir::func::FuncDialect"
+  ];
+}
+
+def ConvertCopyToAffineLoops : Pass<"convert-copy-to-affine-loops", "func::FuncOp"> {
+  let summary = "Converts memref.copy to explicit affine loop nests";
+  let description = [{
+    Converts memref.copy operations into explicit affine.for loop nests
+    with affine.load and affine.store operations. This makes data movement
+    explicit for CGRA compilation with global addressing.
+  }];
+  let constructor = "mlir::createConvertCopyToAffineLoopsPass()";
+  let dependentDialects = [
+    "mlir::affine::AffineDialect",
+    "mlir::memref::MemRefDialect",
+    "mlir::func::FuncDialect"
+  ];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 71a3b510..bccdb84d 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -12,6 +12,10 @@
 #include <memory>
 namespace mlir {
 namespace taskflow {
+
+void registerTaskflowConversionPassPipeline();
+void registerTosaToAffineConversionPassPipeline();
+
 // Passes defined in TaskflowPasses.td
 #define GEN_PASS_DECL
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/lib/Conversion/AffineToTaskflow/CMakeLists.txt b/lib/Conversion/AffineToTaskflow/CMakeLists.txt
index bb4f3f52..a5e53989 100644
--- a/lib/Conversion/AffineToTaskflow/CMakeLists.txt
+++ b/lib/Conversion/AffineToTaskflow/CMakeLists.txt
@@ -2,6 +2,8 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 add_mlir_conversion_library(MLIRAffineToTaskflowPass
   AffineToTaskflowPass.cpp
+  FoldSubViewPass.cpp
+  ConvertCopyToAffineLoopsPass.cpp
 
   DEPENDS
   MLIRConversionIncGen
diff --git a/lib/Conversion/AffineToTaskflow/ConvertCopyToAffineLoopsPass.cpp b/lib/Conversion/AffineToTaskflow/ConvertCopyToAffineLoopsPass.cpp
new file mode 100644
index 00000000..f2b30c3d
--- /dev/null
+++ b/lib/Conversion/AffineToTaskflow/ConvertCopyToAffineLoopsPass.cpp
@@ -0,0 +1,99 @@
+//===- ConvertCopyToAffineLoopsPass.cpp - Converts memref.copy to loops ---===//
+//
+// This pass converts memref.copy operations into explicit affine loop nests.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Conversion/ConversionPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Converts memref.copy to nested affine loops with affine.load/store.
+struct CopyOpLoweringPattern : public OpRewritePattern<memref::CopyOp> {
+  using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::CopyOp copy,
+                                PatternRewriter &rewriter) const override {
+    // Checks if the target has any users besides this copy.
+    // If the target (e.g., a subview) is only used by this copy and nothing
+    // else, this copy is dead code and should be removed without conversion.
+    Value target = copy.getTarget();
+    bool has_other_users = false;
+    for (auto *user : target.getUsers()) {
+      if (user != copy.getOperation()) {
+        has_other_users = true;
+        break;
+      }
+    }
+
+    if (!has_other_users) {
+      // Target has no users besides this copy, so just erase the copy.
+      rewriter.eraseOp(copy);
+      return success();
+    }
+
+    // Target has other users, convert copy to affine loops.
+    rewriter.setInsertionPoint(copy);
+    auto loc = copy.getLoc();
+    MemRefType memref_type = dyn_cast<MemRefType>(copy.getSource().getType());
+
+    // Creates explicit memory copy using an affine loop nest.
+    SmallVector<Value> ivs;
+    for (auto dim_size : memref_type.getShape()) {
+      auto loop = rewriter.create<affine::AffineForOp>(loc, 0, dim_size);
+      rewriter.setInsertionPointToStart(loop.getBody());
+      ivs.push_back(loop.getInductionVar());
+    }
+
+    // Creates affine load from source and store to target.
+    Value value =
+        rewriter.create<affine::AffineLoadOp>(loc, copy.getSource(), ivs);
+    rewriter.create<affine::AffineStoreOp>(loc, value, copy.getTarget(), ivs);
+
+    rewriter.eraseOp(copy);
+    return success();
+  }
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct ConvertCopyToAffineLoopsPass
+    : public PassWrapper<ConvertCopyToAffineLoopsPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertCopyToAffineLoopsPass)
+
+  StringRef getArgument() const final { return "convert-copy-to-affine-loops"; }
+
+  StringRef getDescription() const final {
+    return "Convert memref.copy to explicit affine loop nests";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, memref::MemRefDialect,
+                    func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<CopyOpLoweringPattern>(&getContext());
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createConvertCopyToAffineLoopsPass() {
+  return std::make_unique<ConvertCopyToAffineLoopsPass>();
+}
diff --git a/lib/Conversion/AffineToTaskflow/FoldSubViewPass.cpp b/lib/Conversion/AffineToTaskflow/FoldSubViewPass.cpp
new file mode 100644
index 00000000..08c78a0e
--- /dev/null
+++ b/lib/Conversion/AffineToTaskflow/FoldSubViewPass.cpp
@@ -0,0 +1,191 @@
+//===- FoldSubViewPass.cpp - Fold memref.subview into load/store ---------===//
+//
+// This pass folds memref.subview operations into their affine.load and
+// affine.store users by adjusting the access indices. Designed for CGRA
+// systems with global addressing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Conversion/ConversionPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Utility functions.
+//===----------------------------------------------------------------------===//
+
+// Resolves the source indices for a load/store operation that accesses a
+// subview. Computes the adjusted indices to access the source memref directly.
+//
+// For example:
+//   %subview = memref.subview %source[%offset0, %offset1][...][%stride0,
+//   %stride1] %val = affine.load %subview[%i, %j]
+// becomes:
+//   %val = affine.load %source[%i * %stride0 + %offset0, %j * %stride1 +
+//   %offset1]
+static LogicalResult
+resolveSourceIndices(Location loc, PatternRewriter &rewriter,
+                     memref::SubViewOp sub_view_op, ValueRange indices,
+                     SmallVectorImpl<Value> &source_indices) {
+  SmallVector<OpFoldResult> mixed_offsets = sub_view_op.getMixedOffsets();
+  SmallVector<OpFoldResult> mixed_sizes = sub_view_op.getMixedSizes();
+  SmallVector<OpFoldResult> mixed_strides = sub_view_op.getMixedStrides();
+
+  SmallVector<Value> use_indices;
+  // Handles rank-reducing subviews: for every unit-dim size, adds a zero index.
+  unsigned result_dim = 0;
+  llvm::SmallBitVector unused_dims = sub_view_op.getDroppedDims();
+  for (auto dim :
+       llvm::seq<unsigned>(0, sub_view_op.getSourceType().getRank())) {
+    if (unused_dims.test(dim)) {
+      use_indices.push_back(rewriter.create<arith::ConstantIndexOp>(loc, 0));
+    } else {
+      use_indices.push_back(indices[result_dim++]);
+    }
+  }
+
+  if (use_indices.size() != mixed_offsets.size()) {
+    return failure();
+  }
+
+  source_indices.resize(use_indices.size());
+  for (auto index : llvm::seq<unsigned>(0, mixed_offsets.size())) {
+    SmallVector<Value> dynamic_operands;
+    AffineExpr expr = rewriter.getAffineDimExpr(0);
+    unsigned num_symbols = 0;
+    dynamic_operands.push_back(use_indices[index]);
+
+    // Multiplies by stride: index * stride.
+    if (auto attr = mixed_strides[index].dyn_cast<Attribute>()) {
+      int64_t stride_val = dyn_cast<IntegerAttr>(attr).getInt();
+      if (stride_val != 1) {
+        expr = expr * stride_val;
+      }
+    } else {
+      dynamic_operands.push_back(dyn_cast<Value>(mixed_strides[index]));
+      expr = expr * rewriter.getAffineSymbolExpr(num_symbols++);
+    }
+
+    // Adds offset: index * stride + offset.
+    if (auto attr = mixed_offsets[index].dyn_cast<Attribute>()) {
+      int64_t offset_val = dyn_cast<IntegerAttr>(attr).getInt();
+      if (offset_val != 0) {
+        expr = expr + offset_val;
+      }
+    } else {
+      dynamic_operands.push_back(dyn_cast<Value>(mixed_offsets[index]));
+      expr = expr + rewriter.getAffineSymbolExpr(num_symbols++);
+    }
+
+    source_indices[index] = rewriter.create<affine::AffineApplyOp>(
+        loc, AffineMap::get(1, num_symbols, expr), dynamic_operands);
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Folds affine.load from a subview into a load from the source memref.
+class LoadOpOfSubViewFolder final
+    : public OpRewritePattern<affine::AffineLoadOp> {
+public:
+  using OpRewritePattern<affine::AffineLoadOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(affine::AffineLoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    auto subViewOp = loadOp.getMemRef().getDefiningOp<memref::SubViewOp>();
+    if (!subViewOp)
+      return failure();
+
+    SmallVector<Value> sourceIndices;
+    if (failed(resolveSourceIndices(loadOp.getLoc(), rewriter, subViewOp,
+                                    loadOp.getIndices(), sourceIndices)))
+      return failure();
+
+    rewriter.replaceOpWithNewOp<affine::AffineLoadOp>(
+        loadOp, subViewOp.getSource(), sourceIndices);
+    return success();
+  }
+};
+
+/// Folds affine.store to a subview into a store to the source memref.
+class StoreOpOfSubViewFolder final
+    : public OpRewritePattern<affine::AffineStoreOp> {
+public:
+  using OpRewritePattern<affine::AffineStoreOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(affine::AffineStoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    auto subViewOp = storeOp.getMemRef().getDefiningOp<memref::SubViewOp>();
+    if (!subViewOp)
+      return failure();
+
+    SmallVector<Value> sourceIndices;
+    if (failed(resolveSourceIndices(storeOp.getLoc(), rewriter, subViewOp,
+                                    storeOp.getIndices(), sourceIndices)))
+      return failure();
+
+    rewriter.replaceOpWithNewOp<affine::AffineStoreOp>(
+        storeOp, storeOp.getValue(), subViewOp.getSource(), sourceIndices);
+    return success();
+  }
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct FoldSubViewPass
+    : public PassWrapper<FoldSubViewPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FoldSubViewPass)
+
+  StringRef getArgument() const final { return "fold-subview"; }
+
+  StringRef getDescription() const final {
+    return "Fold memref.subview into affine load/store operations";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, memref::MemRefDialect,
+                    arith::ArithDialect, func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    // Step 1: Folds subviews into their load/store users.
+    RewritePatternSet patterns(&getContext());
+    patterns.add<LoadOpOfSubViewFolder, StoreOpOfSubViewFolder>(&getContext());
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+    }
+
+    // Step 2: Cleans up dead subview operations that have no remaining users.
+    SmallVector<memref::SubViewOp> dead_sub_views;
+    getOperation().walk([&](memref::SubViewOp sub_view_op) {
+      if (sub_view_op->use_empty()) {
+        dead_sub_views.push_back(sub_view_op);
+      }
+    });
+
+    for (auto sub_view_op : dead_sub_views) {
+      sub_view_op.erase();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createFoldSubViewPass() {
+  return std::make_unique<FoldSubViewPass>();
+}
diff --git a/lib/TaskflowDialect/TaskflowPasses.cpp b/lib/TaskflowDialect/TaskflowPasses.cpp
index 1a10c2ef..6a11db81 100644
--- a/lib/TaskflowDialect/TaskflowPasses.cpp
+++ b/lib/TaskflowDialect/TaskflowPasses.cpp
@@ -1,7 +1,77 @@
 #include "TaskflowDialect/TaskflowPasses.h"
+#include "Conversion/ConversionPasses.h"
 #include "TaskflowDialect/TaskflowDialect.h"
 #include "TaskflowDialect/TaskflowOps.h"
-
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/TosaToArith/TosaToArith.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Tosa/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
-#include "mlir/Transforms/Passes.h"
\ No newline at end of file
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+using namespace mlir::bufferization;
+
+// This pass pipeline can convert affine dialect into taskflow dialect with
+// neura.kernel op.
+void mlir::taskflow::registerTaskflowConversionPassPipeline() {
+  PassPipelineRegistration<>(
+      "taskflow-conversion",
+      "Converts affine dialects to taskflow dialect with neura.kernel ops.",
+      [](OpPassManager &pm) {
+        pm.addPass(mlir::createConvertAffineToTaskflowPass());
+        pm.addPass(mlir::taskflow::createConstructHyperblockFromTaskPass());
+        pm.addPass(mlir::taskflow::createClassifyCountersPass());
+        pm.addPass(mlir::createConvertTaskflowToNeuraPass());
+      });
+}
+
+// This pass pipeline converts TOSA dialect to Affine dialect with cleanup.
+void mlir::taskflow::registerTosaToAffineConversionPassPipeline() {
+  PassPipelineRegistration<>(
+      "tosa-to-affine-conversion",
+      "Complete pipeline: TOSA to Linalg to Affine with subview/copy cleanup",
+      [](OpPassManager &pm) {
+        // Step 1-3: TOSA to Linalg (function-level passes).
+        pm.nest<func::FuncOp>().addPass(
+            mlir::tosa::createTosaInferShapesPass());
+        pm.nest<func::FuncOp>().addPass(
+            mlir::tosa::createTosaMakeBroadcastablePass());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToLinalgNamed());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToLinalg());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToArith());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToTensor());
+
+        // Step 4: Linalg Generalization (function-level).
+        pm.nest<func::FuncOp>().addPass(createLinalgGeneralizeNamedOpsPass());
+
+        // Step 5: Canonicalization (module-level).
+        pm.addPass(createCanonicalizerPass());
+
+        // Step 6: Bufferization with proper options (module-level).
+        OneShotBufferizationOptions bufferizationOptions;
+        bufferizationOptions.bufferizeFunctionBoundaries = true;
+        bufferizationOptions.setFunctionBoundaryTypeConversion(
+            LayoutMapOption::IdentityLayoutMap);
+        pm.addPass(createOneShotBufferizePass(bufferizationOptions));
+
+        // Step 7: Linalg to Affine Loops (function-level).
+        pm.nest<func::FuncOp>().addPass(createConvertLinalgToAffineLoopsPass());
+
+        // Step 8: Cleanup subview and copy operations (function-level).
+        pm.nest<func::FuncOp>().addPass(createFoldSubViewPass());
+        pm.nest<func::FuncOp>().addPass(createConvertCopyToAffineLoopsPass());
+
+        // Step 9: Final Affine cleanup (function-level).
+        pm.nest<func::FuncOp>().addPass(
+            mlir::affine::createAffineLoopNormalizePass());
+        pm.nest<func::FuncOp>().addPass(
+            mlir::affine::createSimplifyAffineStructuresPass());
+        pm.addPass(createCanonicalizerPass());
+      });
+}
\ No newline at end of file
diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir
index a5790c85..f70d99ca 100644
--- a/test/multi-cgra/kernel_mapping/fir/fir.mlir
+++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir
@@ -7,10 +7,7 @@
 // RUN:   -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
-// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
-// RUN:   --construct-hyperblock-from-task \
-// RUN:   --classify-counters \
-// RUN:   --convert-taskflow-to-neura \
+// RUN: neura-compiler %s --taskflow-conversion \
 // RUN:   -o %t.kernel.mlir
 // RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL
 
diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir
index 1671e85e..309c8512 100644
--- a/test/multi-cgra/kernel_mapping/relu/relu.mlir
+++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir
@@ -7,10 +7,7 @@
 // RUN:   -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
-// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
-// RUN:   --construct-hyperblock-from-task \
-// RUN:   --classify-counters \
-// RUN:   --convert-taskflow-to-neura \
+// RUN: neura-compiler %s --taskflow-conversion \
 // RUN:   -o %t.kernel.mlir
 // RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL
 
diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir
new file mode 100644
index 00000000..abc993f5
--- /dev/null
+++ 
b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -0,0 +1,488 @@ +// RUN: neura-compiler %s --tosa-to-affine-conversion \ +// RUN: -o %t.affine.mlir +// RUN: FileCheck %s --input-file=%t.affine.mlir --check-prefixes=AFFINE + +// RUN: neura-compiler %s --tosa-to-affine-conversion \ +// RUN: --taskflow-conversion \ +// RUN: -o %t.kernel.mlir +// RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL + +module attributes {torch.debug_module_name = "SimpleResNetBlock"} { + func.func @forward(%arg0: tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> { + %0 = "tosa.const"() <{value = dense<"0x7BEEA13C"> : tensor<64x64x3x3xf32>}> : () -> tensor<64x64x3x3xf32> + %1 = "tosa.const"() <{value = dense<"0x8B9878BC"> : tensor<64x64x3x3xf32>}> : () -> tensor<64x64x3x3xf32> + %2 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<64xf32>}> : () -> tensor<64xf32> + %3 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> + %4 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> + %5 = tosa.transpose %arg0, %3 : (tensor<1x64x8x8xf32>, tensor<4xi32>) -> tensor<1x8x8x64xf32> + %6 = tosa.transpose %1, %3 : (tensor<64x64x3x3xf32>, tensor<4xi32>) -> tensor<64x3x3x64xf32> + %7 = tosa.conv2d %5, %6, %2 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x8x8x64xf32>, tensor<64x3x3x64xf32>, tensor<64xf32>) -> tensor<1x8x8x64xf32> + %8 = tosa.transpose %7, %4 : (tensor<1x8x8x64xf32>, tensor<4xi32>) -> tensor<1x64x8x8xf32> + %9 = tosa.clamp %8 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> + %10 = tosa.transpose %9, %3 : (tensor<1x64x8x8xf32>, tensor<4xi32>) -> tensor<1x8x8x64xf32> + %11 = tosa.transpose %0, %3 : (tensor<64x64x3x3xf32>, tensor<4xi32>) -> tensor<64x3x3x64xf32> + %12 = tosa.conv2d %10, %11, %2 {acc_type = f32, dilation = array, pad = array, stride = array} 
: (tensor<1x8x8x64xf32>, tensor<64x3x3x64xf32>, tensor<64xf32>) -> tensor<1x8x8x64xf32> + %13 = tosa.transpose %12, %4 : (tensor<1x8x8x64xf32>, tensor<4xi32>) -> tensor<1x64x8x8xf32> + %14 = tosa.add %13, %arg0 : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> + %15 = tosa.clamp %14 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> + return %15 : tensor<1x64x8x8xf32> + } +} + +// AFFINE: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { +// AFFINE-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} +// AFFINE-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32_0 : memref<64x3x3x64xf32> = dense<-0.0151730878> {alignment = 64 : i64} +// AFFINE-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32 : memref<64x3x3x64xf32> = dense<0.0197670367> {alignment = 64 : i64} +// AFFINE-NEXT: func.func @forward(%arg0: memref<1x64x8x8xf32>) -> memref<1x64x8x8xf32> { +// AFFINE-NEXT: %cst = arith.constant 0.0197670367 : f32 +// AFFINE-NEXT: %cst_0 = arith.constant -0.0151730878 : f32 +// AFFINE-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 +// AFFINE-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +// AFFINE-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %arg0[%arg1, %arg4, %arg2, %arg3] : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.store %0, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 
to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<1x10x10x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg5 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg6 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg7 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %alloc_3[%arg1, %arg2 + %arg5, %arg3 + %arg6, %arg7] : memref<1x10x10x64xf32> +// AFFINE-NEXT: %1 = affine.load %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: %2 = arith.mulf %0, %cst_0 : f32 +// AFFINE-NEXT: %3 = arith.addf %1, %2 : f32 +// AFFINE-NEXT: affine.store %3, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_4[%arg1, %arg3, %arg4, %arg2] : memref<1x8x8x64xf32> +// AFFINE-NEXT: 
affine.store %0, %alloc_5[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_5[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %1 = arith.minimumf %0, %cst_1 : f32 +// AFFINE-NEXT: %2 = arith.maximumf %1, %cst_2 : f32 +// AFFINE-NEXT: affine.store %2, %alloc_6[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %alloc_6[%arg1, %arg4, %arg2, %arg3] : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.store %0, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<1x10x10x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 
8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg5 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg6 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg7 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %alloc_8[%arg1, %arg2 + %arg5, %arg3 + %arg6, %arg7] : memref<1x10x10x64xf32> +// AFFINE-NEXT: %1 = affine.load %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: %2 = arith.mulf %0, %cst : f32 +// AFFINE-NEXT: %3 = arith.addf %1, %2 : f32 +// AFFINE-NEXT: affine.store %3, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_9[%arg1, %arg3, %arg4, %arg2] : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.store %0, %alloc_10[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_10[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %1 = 
affine.load %arg0[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %2 = arith.addf %0, %1 : f32 +// AFFINE-NEXT: affine.store %2, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %1 = arith.minimumf %0, %cst_1 : f32 +// AFFINE-NEXT: %2 = arith.maximumf %1, %cst_2 : f32 +// AFFINE-NEXT: affine.store %2, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: return %alloc_12 : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } + + +// KERNEL: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { +// KERNEL-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} +// KERNEL-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32_0 : memref<64x3x3x64xf32> = dense<-0.0151730878> {alignment = 64 : i64} +// KERNEL-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32 : memref<64x3x3x64xf32> = dense<0.0197670367> {alignment = 64 : i64} +// KERNEL-NEXT: func.func @forward(%arg0: memref<1x64x8x8xf32>) -> memref<1x64x8x8xf32> { +// KERNEL-NEXT: %cst = arith.constant 0.0197670367 : f32 +// KERNEL-NEXT: %cst_0 = arith.constant -0.0151730878 : f32 +// KERNEL-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 +// KERNEL-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +// KERNEL-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs = 
taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x64x8x8xf32>, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %7, %5, %6] : 
memref<1x64x8x8xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// KERNEL-NEXT: %write_outputs_4 = taskflow.task @Task_1 write_memrefs(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x10x10x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, 
upper_bound = 10 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : memref<1x10x10x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_6 = taskflow.task @Task_2 write_memrefs(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, 
upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %write_outputs_7 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %4 = taskflow.counter parent(%3 : index) attributes 
{counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %5 = taskflow.counter parent(%4 : index) attributes {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %6 = taskflow.counter parent(%5 : index) attributes {counter_id = 6 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, f32) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x10x10x64xf32>, %arg6: memref<1x8x8x64xf32>, %arg7: f32): +// KERNEL-NEXT: %7 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %8 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %9 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %10 = neura.counter {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %11 = neura.counter {counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %12 = neura.counter {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %13 = neura.counter {counter_id = 6 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %14 = arith.addi %8, %11 : index +// KERNEL-NEXT: %15 = arith.addi %9, %12 : index +// KERNEL-NEXT: %16 = memref.load %arg5[%7, %14, %15, %13] : 
memref<1x10x10x64xf32> +// KERNEL-NEXT: %17 = memref.load %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: %18 = arith.mulf %16, %arg7 : f32 +// KERNEL-NEXT: %19 = arith.addf %17, %18 : f32 +// KERNEL-NEXT: memref.store %19, %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_9 = taskflow.task @Task_4 read_memrefs(%write_outputs_7 : memref<1x8x8x64xf32>) write_memrefs(%alloc_8 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_8 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x8x8x64xf32>, %arg4: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, 
upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %6, %7, %5] : memref<1x8x8x64xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_11 = taskflow.task @Task_5 read_memrefs(%write_outputs_9 : memref<1x64x8x8xf32>) write_memrefs(%alloc_10 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_8 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_10 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter 
parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4, %arg2 : memref<1x64x8x8xf32>, f32, f32, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x64x8x8xf32>, %arg6: f32, %arg7: f32, %arg8: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg5[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %9 = arith.minimumf %8, %arg6 : f32 +// KERNEL-NEXT: %10 = arith.maximumf %9, %arg7 : f32 +// KERNEL-NEXT: memref.store %10, %arg8[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_13 = taskflow.task @Task_6 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_10 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes 
{counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x64x8x8xf32>, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %7, %5, %6] : memref<1x64x8x8xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// KERNEL-NEXT: %write_outputs_15 = taskflow.task @Task_7 write_memrefs(%alloc_14 : 
memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_14 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x10x10x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : memref<1x10x10x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// KERNEL-NEXT: } 
+// KERNEL-NEXT: %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_17 = taskflow.task @Task_8 write_memrefs(%alloc_16 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_16 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : 
memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %write_outputs_18 = taskflow.task @Task_9 read_memrefs(%write_outputs_15, %write_outputs_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_17 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_14, %alloc_16 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_16 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %4 = taskflow.counter parent(%3 : index) attributes {counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %5 = taskflow.counter parent(%4 : index) attributes {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %6 = taskflow.counter parent(%5 : index) attributes {counter_id = 6 : i32, 
counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, f32) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x10x10x64xf32>, %arg6: memref<1x8x8x64xf32>, %arg7: f32): +// KERNEL-NEXT: %7 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %8 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %9 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %10 = neura.counter {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %11 = neura.counter {counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %12 = neura.counter {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %13 = neura.counter {counter_id = 6 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %14 = arith.addi %8, %11 : index +// KERNEL-NEXT: %15 = arith.addi %9, %12 : index +// KERNEL-NEXT: %16 = memref.load %arg5[%7, %14, %15, %13] : memref<1x10x10x64xf32> +// KERNEL-NEXT: %17 = memref.load %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: %18 = arith.mulf %16, %arg7 : f32 +// KERNEL-NEXT: %19 = arith.addf %17, %18 : f32 +// KERNEL-NEXT: memref.store %19, %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// 
KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_20 = taskflow.task @Task_10 read_memrefs(%write_outputs_18 : memref<1x8x8x64xf32>) write_memrefs(%alloc_19 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_16 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_19 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x8x8x64xf32>, %arg4: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : 
i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %6, %7, %5] : memref<1x8x8x64xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_22 = taskflow.task @Task_11 read_memrefs(%write_outputs_20, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_21 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_19, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2, %arg3 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg4: memref<1x64x8x8xf32>, %arg5: memref<1x64x8x8xf32>, %arg6: 
memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %9 = memref.load %arg5[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %10 = arith.addf %8, %9 : f32 +// KERNEL-NEXT: memref.store %10, %arg6[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_24 = taskflow.task @Task_12 read_memrefs(%write_outputs_22 : memref<1x64x8x8xf32>) write_memrefs(%alloc_23 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_21 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_23 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 
: index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4, %arg2 : memref<1x64x8x8xf32>, f32, f32, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x64x8x8xf32>, %arg6: f32, %arg7: f32, %arg8: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg5[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %9 = arith.minimumf %8, %arg6 : f32 +// KERNEL-NEXT: %10 = arith.maximumf %9, %arg7 : f32 +// KERNEL-NEXT: memref.store %10, %arg8[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: return %write_outputs_24 : memref<1x64x8x8xf32> +// KERNEL-NEXT: } +// KERNEL-NEXT: } + diff --git a/tools/neura-compiler/CMakeLists.txt b/tools/neura-compiler/CMakeLists.txt index 69e78747..fc4e13da 100644 --- a/tools/neura-compiler/CMakeLists.txt +++ b/tools/neura-compiler/CMakeLists.txt 
@@ -5,6 +5,7 @@ set(LIBS ${dialect_libs} ${conversion_libs} MLIRNeuraTransforms + MLIRTaskflowTransforms MLIRConversion MLIRNeura MLIRTransforms @@ -13,6 +14,8 @@ set(LIBS MLIRIR MLIRParser MLIRSupport + MLIRFuncAllExtensions + MLIRTensorAllExtensions ) target_link_libraries(neura-compiler PRIVATE ${LIBS}) \ No newline at end of file diff --git a/tools/neura-compiler/neura-compiler.cpp b/tools/neura-compiler/neura-compiler.cpp index e728b8b0..5e65f061 100644 --- a/tools/neura-compiler/neura-compiler.cpp +++ b/tools/neura-compiler/neura-compiler.cpp @@ -1,12 +1,7 @@ // neura-compiler.cpp - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/DLTI/DLTI.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/InitAllDialects.h" +#include "mlir/InitAllExtensions.h" #include "mlir/InitAllPasses.h" -#include "mlir/Support/FileUtilities.h" -#include "mlir/Support/LogicalResult.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" #include "Conversion/ConversionPasses.h" @@ -14,6 +9,7 @@ #include "NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraPasses.h" #include "NeuraDialect/Util/ArchParser.h" +#include "TaskflowDialect/TaskflowPasses.h" #include "mlir/Support/LogicalResult.h" using mlir::neura::Architecture; @@ -65,14 +61,14 @@ int main(int argc, char **argv) { // Registers MLIR dialects. mlir::DialectRegistry registry; registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); + registry.insert(); + + mlir::registerAllDialects(registry); + mlir::registerAllExtensions(registry); mlir::neura::registerNeuraConversionPassPipeline(); + mlir::taskflow::registerTosaToAffineConversionPassPipeline(); + mlir::taskflow::registerTaskflowConversionPassPipeline(); // Print architecture spec file info if (!architecture_spec_file.empty()) {