diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index b432ab7ab0..fea72079e5 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,7 +31,9 @@ using namespace vpux; namespace { -static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"}; +static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { + "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", + "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"}; // // AddSwKernelInstructionPrefetch @@ -66,12 +68,13 @@ class AddSwKernelInstructionPrefetch final : size_t clusterIdx, std::string& kernelName, mlir::SymbolRefAttr functionSymbol); - VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier, - size_t clusterIdx, std::string& kernelName); + VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, + mlir::ValueRange updateBarrier, size_t clusterIdx, + std::string& kernelName); mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier); std::pair getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp); - using SwKernelPrefetchVec = std::vector>; + using SwKernelPrefetchVec = std::vector>; std::pair getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp, VPURT::TaskConfigVec& allTasks); std::tuple getFirstSwTaskInIRAndBestUpdateBarrier( @@ -79,6 +82,9 @@ class AddSwKernelInstructionPrefetch final : std::vector insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch, mlir::Operation* firstShaveTaskInIR, mlir::Value bestUpdateBarrier); + std::vector insertPrefetchTasksDuringExec( + 
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks); bool hasVPUSWModule(mlir::Operation* funcOp); size_t getOffsetReservedMem(const mlir::ModuleOp module); @@ -94,6 +100,7 @@ class AddSwKernelInstructionPrefetch final : bool _minFreeCyclesHasValue = false; size_t _minimumFreeCyclesForPrefetch = 250000; bool _useDummyKernelForInstructionPrefetch = false; + size_t _dynamicPrefetchTileCounter = 0; }; bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) { @@ -186,21 +193,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer } // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel -VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, - mlir::Value updateBarrier, - size_t clusterIdx, - std::string& kernelName) { +VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask( + mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) { mlir::OpBuilder builder(firstSwTask); - auto moduleOp = firstSwTask->getParentOfType(); + auto kernelOp = kernelNameToOps[kernelName]; + auto moduleOp = kernelOp->getParentOfType(); auto reservedMemOffset = getOffsetReservedMem(moduleOp); auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset); - auto kernelOp = kernelNameToOps[kernelName]; + auto tileIndexAttr = kernelOp.getTileIndexAttr(); + VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc()); + const int64_t tileIndex = static_cast(clusterIdx); auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector& buffers) { if (auto bufOp = io.getDefiningOp()) { - auto newType = mlir::cast(io.getType()).changeShape({1, 1, 1, 1}); + auto origType = mlir::cast(io.getType()); + auto 
newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), + stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex); + auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex}); + auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr); auto newBuff = builder.create(appendLoc(bufOp->getLoc(), suffix), newType, - bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(), + bufOp.getSectionAttr(), newSectionIndexAttr, offsetAttr, bufOp.getSwizzlingKeyAttr()); buffers.push_back(newBuff); return true; @@ -230,14 +242,18 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp( builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers), - mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(), + mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex), kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr()); // The dummy kernels here are generated after ActShaveProfilingPass, // so we need to add skipProfiling as attribute to avoid capturing their metadata cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext())); - auto args = - (kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; + auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" || + kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" || + kernelName == "rms_norm") + ? 
mlir::ArrayAttr::get(moduleOp->getContext(), {}) + : kernelNameToArgs[kernelName]; + vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args, _log.nest(), /*swKernelRunOp=*/nullptr); @@ -316,7 +332,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat } if (!cache.isLoaded(kernelName)) { - kernelsToPrefetch.push_back(std::move(kernelNameAndSize)); + kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex)); } cache.loadKernel(kernelName, kernelSize); @@ -394,7 +410,7 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size()); shaveIdx++) { auto clusterIdx = shaveIdx / noOfShavesPerCluster; - auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx]; + auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx]; _log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx); auto newPrefetchKernel = _useDummyKernelForInstructionPrefetch @@ -410,6 +426,183 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas return prefetchedKernels; } +uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters, + std::map& swKernelCountsCache) { + // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels) + const size_t saturationThreshold = numClusters * 2; + + // Iterate through tasks strictly AFTER the startIndex + for (size_t i = startIndex + 1; i < allTasks.size(); ++i) { + uint64_t currentStartTime = static_cast(allTasks[i].cycleStart); + + if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) { + size_t swKernelCount = 0; + // Count all SW Kernels that start at this specific time + for (auto& task : allTasks) { + if (static_cast(task.cycleStart) == currentStartTime) { + if 
(mlir::isa(task.taskOp.getInnerTaskOp())) { + swKernelCount++; + } + } + if (static_cast(task.cycleStart) > currentStartTime) { + break; + } + } + swKernelCountsCache[currentStartTime] = swKernelCount; + } + + if (swKernelCountsCache[currentStartTime] >= saturationThreshold) { + return currentStartTime; + } + } + + return std::numeric_limits::max(); +} + +struct GapCandidate { + uint64_t lookaheadGap = 0; + int64_t insertionPointTaskIndex = -1; + + // used for sort + bool operator>(const GapCandidate& other) const { + return lookaheadGap > other.lookaheadGap; + } +}; + +size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) { + size_t count = 0; + for (auto& taskConfig : allTasks) { + if (static_cast(taskConfig.cycleStart) == startTime) { + if (mlir::isa(taskConfig.taskOp.getInnerTaskOp())) { + count++; + } + } + if (static_cast(taskConfig.cycleStart) > startTime) { + break; + } + } + return count; +} + +std::optional findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime, + VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) { + const int64_t targetInsertTile = 1; + const uint64_t GAP_THRESHOLD = 50000; + const size_t saturationThreshold = numClusters * 2; + + // + std::map> validGaps; + std::map swKernelCountsCache; // local cache + + int64_t previousT1TaskIndex = -1; + uint64_t previousT1TaskStartTime = 0; + + // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched + for (size_t i = 0; i < allTasks.size(); ++i) { + auto& currentTaskConfig = allTasks[i]; + uint64_t currentTaskStartTime = static_cast(currentTaskConfig.cycleStart); + if (currentTaskStartTime > targetKernelGroupStartTime) { + break; + } + + bool isT1Task = false; + if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) { + isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); + } + + if (previousT1TaskIndex != -1 && isT1Task) 
{ + auto& insertionPointTask = allTasks[previousT1TaskIndex]; + auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); + + size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); + + if (simultaneousSwKernels < saturationThreshold) { + uint64_t nextSaturationStart = + findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache); + uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); + uint64_t lookaheadGap = 0; + if (gapEnd > previousT1TaskStartTime) { + lookaheadGap = gapEnd - previousT1TaskStartTime; + } + + if (lookaheadGap >= GAP_THRESHOLD) { + GapCandidate gap; + gap.lookaheadGap = lookaheadGap; + gap.insertionPointTaskIndex = previousT1TaskIndex; + validGaps[lookaheadGap] = gap; + } + } + } + + if (isT1Task) { + previousT1TaskIndex = static_cast(i); + previousT1TaskStartTime = currentTaskStartTime; + } + } + + if (validGaps.empty()) { + log.trace("Kernel '{0}': No suitable insertion point found.", kernelName); + return std::nullopt; + } + + return validGaps.begin()->second; +} + +std::vector AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec( + mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks) { + auto moduleOp = funcOp->getParentOfType(); + const auto numClusters = getNumTiles(moduleOp); + VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero."); + + std::vector prefetchedKernels{}; + + for (auto& kernelInfo : kernelsToPrefetch) { + std::string kernelName = std::get<0>(kernelInfo); + size_t firstAppearanceIndex = std::get<2>(kernelInfo); + + if (firstAppearanceIndex >= allTasks.size()) { + _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex); + continue; + } + if (kernelNameToOps.count(kernelName) == 0) { + _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName); + continue; + 
} + + auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); + + auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log); + + if (!bestGapOpt.has_value()) { + _log.trace("Kernel '{0}': No valid gap found.", kernelName); + continue; + } + + GapCandidate bestGap = bestGapOpt.value(); + _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName, + bestGap.lookaheadGap, bestGap.insertionPointTaskIndex); + + if (bestGap.insertionPointTaskIndex < 0 || + static_cast(bestGap.insertionPointTaskIndex) >= allTasks.size()) { + _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName, + bestGap.insertionPointTaskIndex); + continue; + } + + auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp; + size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters; + _dynamicPrefetchTileCounter++; + + auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(), + dynamicExecTile, kernelName); + + prefetchedKernels.push_back(newPrefetchKernel); + } + + return prefetchedKernels; +} + void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto funcOp = getOperation(); if (!hasVPUSWModule(funcOp)) { @@ -444,10 +637,6 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks); auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] = getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex); - if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) { - return; - } - _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); if (_useDummyKernelForInstructionPrefetch) { auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); @@ -455,7 +644,15 @@ void 
AddSwKernelInstructionPrefetch::safeRunOnFunc() { VPUX_THROW_WHEN(dummyKernelResMem == nullptr, "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!"); } - auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + if (kernelsToPrefetch.empty()) { + return; + } + if (firstShaveTaskInIR != nullptr) { _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); } + + auto newPrefetchKernels = + (firstShaveTaskInIR == nullptr) + ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks) + : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); // Update dependencies for cache handling operations to meet requirements of control graph split. auto& barrierInfo = getAnalysis(); diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir new file mode 100644 index 0000000000..2e85f9a246 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir @@ -0,0 +1,180 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true enable-sw-kernel-fifo-per-shave-engine=false" --add-sw-kernel-instruction-prefetch %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +!DummyDDRT = memref<32000x1x1x1xf16, @DDR> +!DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]> +!DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]> +!DummyCMX0TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 0]> +!DummyCMX1TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 1]> + +// This test checks following schedule +// Barriers : 0 1 2 3 4 5 +// Cluster 0: | [ DMA ] | [ DMA ] | [ Softmax] | [ TopK ] | [ DMA ] | [ Softmax ] +// Cluster 1: | [ DMA ] | [ Softmax] | [ TopK ] +// Other : [ SyncDMA ] | +// + +module @subgraph attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096] + module @VPU.SW { + func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} + func.func private @builtin_TopK(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, i64, i64, i64, i64) attributes {VPU.kernel_code = "topk.cpp", VPU.kernel_entry = "topk", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of 
@SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + config.ExecutorResource 1 of @M2I + config.ExecutorResource 1 of @DMA_NN + config.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo {inferenceTiming = 369464 : i64} entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x3x62x62xui8> + } outputsInfo : { + DataInfo "out" : tensor<1x3x62x62xui8> + } + func.func @main(%arg0: memref<1x3x62x62xui8, @DDR>) -> memref<1x3x62x62xui8, @DDR> { + %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %6 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %7 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + // CHECK: [[BARRIER_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_2:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_3:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_4:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_5:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_6:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_7:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %28 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %ddr_buf = VPURT.DeclareBuffer <0> -> !DummyDDRT + %cmx_0 = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0T + %cmx_1 = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1T + + VPURT.Task updates(%0 : !VPURT.Barrier) { + %241 = VPUIP.SyncDMA {port = 0 : i64} inputs(%28 : memref<0x0x0x0xi32, @DDR>) outputs(%28 : memref<0x0x0x0xi32, @DDR>) -> memref<0x0x0x0xi32, @DDR> + } + 
+ VPURT.Task waits(%0: !VPURT.Barrier) updates(%1 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%1: !VPURT.Barrier) updates(%2 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%2: !VPURT.Barrier) updates(%3 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 1 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_1 : !DummyCMX1T) -> !DummyCMX1T + } + + VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T + } + } + + VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T) on tile 1 -> !DummyCMX1T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1T + } + } + + %cmx0_top_k = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0TopK + VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { + %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T, %cmx0_top_k as %arg5: !DummyCMX0TopK) on tile 0 -> (!DummyCMX0T, 
!DummyCMX0TopK) { + VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX0T, !DummyCMX0T, !DummyCMX0TopK + } + } + + %cmx1_top_k = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1TopK + VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { + %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T, %cmx1_top_k as %arg5: !DummyCMX1TopK) on tile 1 -> (!DummyCMX1T, !DummyCMX1TopK) { + VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX1T, !DummyCMX1T, !DummyCMX1TopK + } + } + + VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%7: !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T + } + } + + // CHECK: VPURT.Task updates([[BARRIER_0]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.SyncDMA + + // CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_1]] : !VPURT.Barrier) updates([[BARRIER_2]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_2]] : !VPURT.Barrier) updates([[BARRIER_3]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : 
!VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + // CHECK: VPURT.Task { + // CHECK-NEXT: VPUIP.SW.Kernel + // CHECK-SAME: skipProfiling + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_7]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + return %arg0 : memref<1x3x62x62xui8, @DDR> + } +} diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir index a2ae982802..8884adf385 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir @@ -20,6 +20,18 @@ // CHECK-LABEL: @SoftMax module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + 
config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} @@ -166,6 +178,18 @@ module @SoftMax attributes {config.arch = #config.arch_kind, config.com // CHECK-LABEL: @TwoFunctions module @TwoFunctions attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + // CHECK-DAG: {{ }}config.Resources VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir index 6dd21b5f43..ca02746827 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir @@ -9,6 +9,18 @@ !MemRef = memref<1x3x62x62xf16> module @ChainCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + 
config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x3x62x62xf16> } outputsInfo : { @@ -61,6 +73,18 @@ module @ChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsChainCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1x2x64xf16> } outputsInfo : { @@ -146,6 +170,18 @@ module @SwKernelsChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsIndependentCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1x2x64xf16> } outputsInfo : { diff --git 
a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir index 5406c523ac..b3bf4c898b 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir @@ -9,6 +9,18 @@ // CHECK-LABEL: @Gather module @Gather attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + VPURT.SW.Runtime entryPoint: @VPU.SW::@runtime stack_configuration: [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir index 02e4a016c5..61918ed50d 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir @@ -14,6 +14,17 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} } + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + 
config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}> } outputsInfo : {