diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index b432ab7ab0..fea72079e5 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,7 +31,9 @@ using namespace vpux; namespace { -static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"}; +static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { + "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", + "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"}; // // AddSwKernelInstructionPrefetch @@ -66,12 +68,13 @@ class AddSwKernelInstructionPrefetch final : size_t clusterIdx, std::string& kernelName, mlir::SymbolRefAttr functionSymbol); - VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier, - size_t clusterIdx, std::string& kernelName); + VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, + mlir::ValueRange updateBarrier, size_t clusterIdx, + std::string& kernelName); mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier); std::pair getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp); - using SwKernelPrefetchVec = std::vector>; + using SwKernelPrefetchVec = std::vector>; std::pair getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp, VPURT::TaskConfigVec& allTasks); std::tuple getFirstSwTaskInIRAndBestUpdateBarrier( @@ -79,6 +82,9 @@ class AddSwKernelInstructionPrefetch final : std::vector insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch, mlir::Operation* firstShaveTaskInIR, mlir::Value bestUpdateBarrier); + std::vector insertPrefetchTasksDuringExec( + 
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks); bool hasVPUSWModule(mlir::Operation* funcOp); size_t getOffsetReservedMem(const mlir::ModuleOp module); @@ -94,6 +100,7 @@ class AddSwKernelInstructionPrefetch final : bool _minFreeCyclesHasValue = false; size_t _minimumFreeCyclesForPrefetch = 250000; bool _useDummyKernelForInstructionPrefetch = false; + size_t _dynamicPrefetchTileCounter = 0; }; bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) { @@ -186,21 +193,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer } // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel -VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, - mlir::Value updateBarrier, - size_t clusterIdx, - std::string& kernelName) { +VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask( + mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) { mlir::OpBuilder builder(firstSwTask); - auto moduleOp = firstSwTask->getParentOfType(); + auto kernelOp = kernelNameToOps[kernelName]; + auto moduleOp = kernelOp->getParentOfType(); auto reservedMemOffset = getOffsetReservedMem(moduleOp); auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset); - auto kernelOp = kernelNameToOps[kernelName]; + auto tileIndexAttr = kernelOp.getTileIndexAttr(); + VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc()); + const int64_t tileIndex = static_cast(clusterIdx); auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector& buffers) { if (auto bufOp = io.getDefiningOp()) { - auto newType = mlir::cast(io.getType()).changeShape({1, 1, 1, 1}); + auto origType = mlir::cast(io.getType()); + auto 
newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), + stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex); + auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex}); + auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr); auto newBuff = builder.create(appendLoc(bufOp->getLoc(), suffix), newType, - bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(), + bufOp.getSectionAttr(), newSectionIndexAttr, offsetAttr, bufOp.getSwizzlingKeyAttr()); buffers.push_back(newBuff); return true; @@ -230,14 +242,18 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp( builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers), - mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(), + mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex), kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr()); // The dummy kernels here are generated after ActShaveProfilingPass, // so we need to add skipProfiling as attribute to avoid capturing their metadata cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext())); - auto args = - (kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; + auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" || + kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" || + kernelName == "rms_norm") + ? 
mlir::ArrayAttr::get(moduleOp->getContext(), {}) + : kernelNameToArgs[kernelName]; + vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args, _log.nest(), /*swKernelRunOp=*/nullptr); @@ -316,7 +332,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat } if (!cache.isLoaded(kernelName)) { - kernelsToPrefetch.push_back(std::move(kernelNameAndSize)); + kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex)); } cache.loadKernel(kernelName, kernelSize); @@ -394,7 +410,7 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size()); shaveIdx++) { auto clusterIdx = shaveIdx / noOfShavesPerCluster; - auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx]; + auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx]; _log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx); auto newPrefetchKernel = _useDummyKernelForInstructionPrefetch @@ -410,6 +426,183 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas return prefetchedKernels; } +uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters, + std::map& swKernelCountsCache) { + // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels) + const size_t saturationThreshold = numClusters * 2; + + // Iterate through tasks strictly AFTER the startIndex + for (size_t i = startIndex + 1; i < allTasks.size(); ++i) { + uint64_t currentStartTime = static_cast(allTasks[i].cycleStart); + + if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) { + size_t swKernelCount = 0; + // Count all SW Kernels that start at this specific time + for (auto& task : allTasks) { + if (static_cast(task.cycleStart) == currentStartTime) { + if 
(mlir::isa(task.taskOp.getInnerTaskOp())) { + swKernelCount++; + } + } + if (static_cast(task.cycleStart) > currentStartTime) { + break; + } + } + swKernelCountsCache[currentStartTime] = swKernelCount; + } + + if (swKernelCountsCache[currentStartTime] >= saturationThreshold) { + return currentStartTime; + } + } + + return std::numeric_limits::max(); +} + +struct GapCandidate { + uint64_t lookaheadGap = 0; + int64_t insertionPointTaskIndex = -1; + + // used for sort + bool operator>(const GapCandidate& other) const { + return lookaheadGap > other.lookaheadGap; + } +}; + +size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) { + size_t count = 0; + for (auto& taskConfig : allTasks) { + if (static_cast(taskConfig.cycleStart) == startTime) { + if (mlir::isa(taskConfig.taskOp.getInnerTaskOp())) { + count++; + } + } + if (static_cast(taskConfig.cycleStart) > startTime) { + break; + } + } + return count; +} + +std::optional findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime, + VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) { + const int64_t targetInsertTile = 1; + const uint64_t GAP_THRESHOLD = 50000; + const size_t saturationThreshold = numClusters * 2; + + // + std::map> validGaps; + std::map swKernelCountsCache; // local cache + + int64_t previousT1TaskIndex = -1; + uint64_t previousT1TaskStartTime = 0; + + // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched + for (size_t i = 0; i < allTasks.size(); ++i) { + auto& currentTaskConfig = allTasks[i]; + uint64_t currentTaskStartTime = static_cast(currentTaskConfig.cycleStart); + if (currentTaskStartTime > targetKernelGroupStartTime) { + break; + } + + bool isT1Task = false; + if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) { + isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); + } + + if (previousT1TaskIndex != -1 && isT1Task) 
{ + auto& insertionPointTask = allTasks[previousT1TaskIndex]; + auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); + + size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); + + if (simultaneousSwKernels < saturationThreshold) { + uint64_t nextSaturationStart = + findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache); + uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); + uint64_t lookaheadGap = 0; + if (gapEnd > previousT1TaskStartTime) { + lookaheadGap = gapEnd - previousT1TaskStartTime; + } + + if (lookaheadGap >= GAP_THRESHOLD) { + GapCandidate gap; + gap.lookaheadGap = lookaheadGap; + gap.insertionPointTaskIndex = previousT1TaskIndex; + validGaps[lookaheadGap] = gap; + } + } + } + + if (isT1Task) { + previousT1TaskIndex = static_cast(i); + previousT1TaskStartTime = currentTaskStartTime; + } + } + + if (validGaps.empty()) { + log.trace("Kernel '{0}': No suitable insertion point found.", kernelName); + return std::nullopt; + } + + return validGaps.begin()->second; +} + +std::vector AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec( + mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks) { + auto moduleOp = funcOp->getParentOfType(); + const auto numClusters = getNumTiles(moduleOp); + VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero."); + + std::vector prefetchedKernels{}; + + for (auto& kernelInfo : kernelsToPrefetch) { + std::string kernelName = std::get<0>(kernelInfo); + size_t firstAppearanceIndex = std::get<2>(kernelInfo); + + if (firstAppearanceIndex >= allTasks.size()) { + _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex); + continue; + } + if (kernelNameToOps.count(kernelName) == 0) { + _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName); + continue; + 
} + + auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); + + auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log); + + if (!bestGapOpt.has_value()) { + _log.trace("Kernel '{0}': No valid gap found.", kernelName); + continue; + } + + GapCandidate bestGap = bestGapOpt.value(); + _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName, + bestGap.lookaheadGap, bestGap.insertionPointTaskIndex); + + if (bestGap.insertionPointTaskIndex < 0 || + static_cast(bestGap.insertionPointTaskIndex) >= allTasks.size()) { + _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName, + bestGap.insertionPointTaskIndex); + continue; + } + + auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp; + size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters; + _dynamicPrefetchTileCounter++; + + auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(), + dynamicExecTile, kernelName); + + prefetchedKernels.push_back(newPrefetchKernel); + } + + return prefetchedKernels; +} + void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto funcOp = getOperation(); if (!hasVPUSWModule(funcOp)) { @@ -444,10 +637,6 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks); auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] = getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex); - if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) { - return; - } - _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); if (_useDummyKernelForInstructionPrefetch) { auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); @@ -455,7 +644,15 @@ void 
AddSwKernelInstructionPrefetch::safeRunOnFunc() { VPUX_THROW_WHEN(dummyKernelResMem == nullptr, "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!"); } - auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + if (kernelsToPrefetch.empty()) { + return; + } + if (firstShaveTaskInIR != nullptr) { _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); } + + auto newPrefetchKernels = + (firstShaveTaskInIR == nullptr) + ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks) + : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); // Update dependencies for cache handling operations to meet requirements of control graph split. auto& barrierInfo = getAnalysis(); diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir new file mode 100644 index 0000000000..2e85f9a246 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir @@ -0,0 +1,180 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true enable-sw-kernel-fifo-per-shave-engine=false" --add-sw-kernel-instruction-prefetch %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +!DummyDDRT = memref<32000x1x1x1xf16, @DDR> +!DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]> +!DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]> +!DummyCMX0TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 0]> +!DummyCMX1TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 1]> + +// This test checks following schedule +// Barriers : 0 1 2 3 4 5 +// Cluster 0: | [ DMA ] | [ DMA ] | [ Softmax] | [ TopK ] | [ DMA ] | [ Softmax ] +// Cluster 1: | [ DMA ] | [ Softmax] | [ TopK ] +// Other : [ SyncDMA ] | +// + +module @subgraph attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096] + module @VPU.SW { + func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} + func.func private @builtin_TopK(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, i64, i64, i64, i64) attributes {VPU.kernel_code = "topk.cpp", VPU.kernel_entry = "topk", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of 
@SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + config.ExecutorResource 1 of @M2I + config.ExecutorResource 1 of @DMA_NN + config.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo {inferenceTiming = 369464 : i64} entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x3x62x62xui8> + } outputsInfo : { + DataInfo "out" : tensor<1x3x62x62xui8> + } + func.func @main(%arg0: memref<1x3x62x62xui8, @DDR>) -> memref<1x3x62x62xui8, @DDR> { + %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %6 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %7 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + // CHECK: [[BARRIER_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_2:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_3:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_4:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_5:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_6:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_7:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %28 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %ddr_buf = VPURT.DeclareBuffer <0> -> !DummyDDRT + %cmx_0 = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0T + %cmx_1 = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1T + + VPURT.Task updates(%0 : !VPURT.Barrier) { + %241 = VPUIP.SyncDMA {port = 0 : i64} inputs(%28 : memref<0x0x0x0xi32, @DDR>) outputs(%28 : memref<0x0x0x0xi32, @DDR>) -> memref<0x0x0x0xi32, @DDR> + } + 
+ VPURT.Task waits(%0: !VPURT.Barrier) updates(%1 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%1: !VPURT.Barrier) updates(%2 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%2: !VPURT.Barrier) updates(%3 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 1 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_1 : !DummyCMX1T) -> !DummyCMX1T + } + + VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T + } + } + + VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T) on tile 1 -> !DummyCMX1T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1T + } + } + + %cmx0_top_k = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0TopK + VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { + %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T, %cmx0_top_k as %arg5: !DummyCMX0TopK) on tile 0 -> (!DummyCMX0T, 
!DummyCMX0TopK) { + VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX0T, !DummyCMX0T, !DummyCMX0TopK + } + } + + %cmx1_top_k = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1TopK + VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { + %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T, %cmx1_top_k as %arg5: !DummyCMX1TopK) on tile 1 -> (!DummyCMX1T, !DummyCMX1TopK) { + VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX1T, !DummyCMX1T, !DummyCMX1TopK + } + } + + VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%7: !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T + } + } + + // CHECK: VPURT.Task updates([[BARRIER_0]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.SyncDMA + + // CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_1]] : !VPURT.Barrier) updates([[BARRIER_2]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_2]] : !VPURT.Barrier) updates([[BARRIER_3]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : 
!VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + // CHECK: VPURT.Task { + // CHECK-NEXT: VPUIP.SW.Kernel + // CHECK-SAME: skipProfiling + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_7]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + return %arg0 : memref<1x3x62x62xui8, @DDR> + } +} diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir index a2ae982802..8884adf385 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir @@ -20,6 +20,18 @@ // CHECK-LABEL: @SoftMax module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + 
config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} @@ -166,6 +178,18 @@ module @SoftMax attributes {config.arch = #config.arch_kind, config.com // CHECK-LABEL: @TwoFunctions module @TwoFunctions attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + // CHECK-DAG: {{ }}config.Resources VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir index 6dd21b5f43..ca02746827 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir @@ -9,6 +9,18 @@ !MemRef = memref<1x3x62x62xf16> module @ChainCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + 
config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x3x62x62xf16> } outputsInfo : { @@ -61,6 +73,18 @@ module @ChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsChainCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1x2x64xf16> } outputsInfo : { @@ -146,6 +170,18 @@ module @SwKernelsChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsIndependentCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1x2x64xf16> } outputsInfo : { diff --git 
a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir index 5406c523ac..b3bf4c898b 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir @@ -9,6 +9,18 @@ // CHECK-LABEL: @Gather module @Gather attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + VPURT.SW.Runtime entryPoint: @VPU.SW::@runtime stack_configuration: [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir index 02e4a016c5..61918ed50d 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir @@ -14,6 +14,17 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} } + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + 
config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}> } outputsInfo : {