xgqdut2016 · xgqdut2016 · Feb 26, 2025
diff --git a/src/resize/mlu/resize_cnnl.cpp b/src/resize/mlu/resize_cnnl.cpp
@@ -0,0 +1,251 @@
+#include "cnnl.h"
+#include <vector>
+#include <numeric>
+struct ResizeMode
+{
+    enum Mode
+    {
+        // Arithmetic operations:
+        Nearest,
+        Bilinear,
+
+        Count, ///< Number of resize operation types (marker for counting purposes).
+    };
+
+    // This static constant holds the total number of defined resize operations.
+    static const size_t numResizeMode = Count;
+};
+struct CoordinateMode
+{
+    enum Mode
+    {
+        halfPixel,
+        pytorchHalfPixel,
+        alignCorners,
+        asymmetric,
+        tfCropAndResize,
+
+        Count, ///< Number of resize operation types (marker for counting purposes).
+    };
+
+    // This static constant holds the total number of defined resize operations.
+    static const size_t numCoordinateMode = Count;
+};
+template <typename T>
+void resizeCnnlDevice(void const *input, float const *roi, void *output, 
+                           int *x_shape, int *y_shape,
+                           int ndim,
+                           ResizeMode::Mode mode, CoordinateMode::Mode coMode,
+                           cnnlHandle_t &handle, cnrtQueue_t &queue)
+{
+    std::vector<int> permuteI(ndim);//从nchw做转置到nhwc
+    std::vector<int> permuteO(ndim);//从nhwc转置回nchw
+    for (int i = 0; i < ndim; i++) {
+        permuteI[i] = i;
+        permuteO[i] = i;
+    }
+    for (int i = 0; i < ndim; i++) {
+        if(i >= 1){
+            permuteI[i] = i + 1;
+        }
+        if(i >= 2){
+            permuteO[i] = i - 1;
+        }
+    }
+    permuteI[ndim - 1] = 1;
+    permuteO[1] = ndim - 1;
+
+    std::vector<int> inDim(ndim);//原始input的形状为[n,c,h,w]
+    std::vector<int> outDim(ndim);
+    int x_size = 1;//表示input的size
+    int y_size = 1;//表示output的size
+    for (int i = 0; i < ndim; i++) {
+        inDim[i] = x_shape[i];
+        outDim[i] = y_shape[i];
+        x_size *= x_shape[i];
+        y_size *= y_shape[i];
+
+    }
+    std::vector<int> x_tranDim(ndim);//tmpGdramI的形状
+    std::vector<int> y_tranDim(ndim);//tmpGdramO的形状
+    for(int i = 0; i < ndim; i++){
+        x_tranDim[i] = x_shape[permuteI[i]];
+        y_tranDim[i] = y_shape[permuteI[i]];
+    }
+    cnnlTensorLayout_t layoutI = CNNL_LAYOUT_NCHW;//只支持ndim=4
+    cnnlTensorLayout_t layoutO = CNNL_LAYOUT_NHWC;
+
+    cnnlDataType_t dataType;
+    if (sizeof(T) == 2)
+    {
+        dataType = CNNL_DTYPE_HALF;
+    }
+    else if (sizeof(T) == 4)
+    {
+        dataType = CNNL_DTYPE_FLOAT;
+    }
+    T *tmpGdramI, *tmpGdramO;
+    CNRT_CHECK(cnrtMalloc((void **)&tmpGdramI, x_size * sizeof(T)));
+    CNRT_CHECK(cnrtMalloc((void **)&tmpGdramO, y_size * sizeof(T)));
+
+    cnnlTensorDescriptor_t x_desc, y_desc, IDesc, ODesc;
+    cnnlCreateTensorDescriptor(&x_desc);
+    cnnlCreateTensorDescriptor(&y_desc);
+    cnnlCreateTensorDescriptor(&IDesc);
+    cnnlCreateTensorDescriptor(&ODesc);
+
+    cnnlSetTensorDescriptor(
+        x_desc, layoutI, dataType,
+        inDim.size(), inDim.data());//原始input,nchw
+    cnnlSetTensorDescriptor(
+        IDesc, layoutO, dataType,
+        x_tranDim.size(), x_tranDim.data());//转置以后的input,nhwc
+    cnnlSetTensorDescriptor(
+        y_desc, layoutI, dataType,
+        outDim.size(), outDim.data());
+    cnnlSetTensorDescriptor(
+        ODesc, layoutO, dataType,
+        y_tranDim.size(), y_tranDim.data());
+
+    cnnlTransposeDescriptor_t desc;
+    cnnlCreateTransposeDescriptor(&desc);
+    cnnlSetTransposeDescriptor(desc, ndim, permuteI.data());
+    //然后针对input做转置nchw2nhwc
+    size_t tSizeI;
+    cnnlGetTransposeWorkspaceSize(handle, x_desc, desc, &tSizeI);
+    void *workspaceI;
+    cnrtMalloc(&workspaceI, tSizeI);
+
+    cnnlTranspose_v2(handle, desc, x_desc, input, IDesc,
+                            tmpGdramI, workspaceI, tSizeI);
+    CNRT_CHECK(cnrtQueueSync(queue));
+    //下面开始做resize
+    cnnlTensorDescriptor_t boxesDesc, boxesIndexDesc;
+    cnnlCreateTensorDescriptor(&boxesDesc);
+    auto nBatch = x_shape[0];
+    std::vector<int> boxesDim = {nBatch, 4};
+    cnnlSetTensorDescriptor(
+        boxesDesc, CNNL_LAYOUT_ARRAY, dataType,
+        boxesDim.size(), boxesDim.data());
+
+    cnnlCreateTensorDescriptor(&boxesIndexDesc);
+    std::vector<int> boxesIndexDim = {nBatch};
+    cnnlSetTensorDescriptor(
+        boxesIndexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
+        boxesIndexDim.size(), boxesIndexDim.data());
+    std::vector<int32_t> boxesIndex(nBatch);
+    std::iota(boxesIndex.begin(), boxesIndex.end(), 0);//boxesIndex=[0,1,2,...]
+    int32_t *boxesIndexData;
+    cnrtMalloc((void**)&boxesIndexData, nBatch * sizeof(int32_t));
+    cnrtMemcpy(boxesIndexData, boxesIndex.data(), nBatch * sizeof(int32_t), cnrtMemcpyHostToDev);
+
+    cnnlCropAndResizeMode_t resizeOp;
+    if (mode == ResizeMode::Nearest){
+        resizeOp = CNNL_CROP_AND_RESIZE_NEAREST;
+    }
+    else if (mode == ResizeMode::Bilinear){
+        resizeOp = CNNL_CROP_AND_RESIZE_BILINEAR;
+    }
+
+    std::vector<float> box = {0, 0, 1.0, 1.0};
+    if (coMode == CoordinateMode::tfCropAndResize){
+        box = {roi[2], roi[3], roi[6], roi[7]};
+    }
+    float *boxesData;
+    cnrtMalloc((void**)&boxesData, nBatch * box.size() * sizeof(float));
+    for(int i = 0; i < nBatch; i++){
+        cnrtMemcpy(boxesData + i * box.size(),
+                    box.data(), box.size() * sizeof(float), cnrtMemcpyHostToDev);
+    }
+    cnnlCropAndResize(
+            handle, IDesc, tmpGdramI, boxesDesc, boxesData,
+            boxesIndexDesc, boxesIndexData, resizeOp, 0.0, ODesc, tmpGdramO);
+    //------------------------------------------------------------ 
+    //下面开始提前对output做转置：nhwc2nchw
+    size_t tSizeO;
+    cnnlGetTransposeWorkspaceSize(handle, ODesc, desc, &tSizeO);
+    void *workspaceO;
+    cnrtMalloc(&workspaceO, tSizeO);
+    cnnlSetTransposeDescriptor(desc, ndim, permuteO.data());
+    cnnlTranspose_v2(handle, desc, ODesc, tmpGdramO, y_desc,
+                            output, workspaceO, tSizeO);
+    CNRT_CHECK(cnrtQueueSync(queue));  
+
+    cnrtFree(tmpGdramI);
+    cnrtFree(tmpGdramO);
+
+    cnrtFree(boxesIndexData);
+    cnrtFree(boxesData);
+
+    cnrtFree(workspaceI);
+    cnrtFree(workspaceO);
+
+    cnnlDestroyTensorDescriptor(IDesc);
+    cnnlDestroyTensorDescriptor(ODesc);
+    cnnlDestroyTransposeDescriptor(desc);
+
+    cnnlDestroyTensorDescriptor(x_desc);
+    cnnlDestroyTensorDescriptor(y_desc);
+    cnnlDestroyTensorDescriptor(boxesDesc);
+    cnnlDestroyTensorDescriptor(boxesIndexDesc);
+
+}
+template <typename T>
+void resizeCnnl(void const *input, float const *roi, void *output, 
+                           int *x_shape, int *y_shape,
+                           int ndim,
+                           ResizeMode::Mode mode, CoordinateMode::Mode coMode)
+{
+    CNRT_CHECK(cnrtSetDevice(0));
+    cnnlHandle_t handle;
+    cnnlCreate(&handle);
+    cnrtQueue_t queue;
+    CNRT_CHECK(cnrtQueueCreate(&queue));
+    cnnlSetQueue(handle, queue); // 将队列绑定到 handle 中, 此接口也可用来更改句柄中的队列。
+
+    resizeCnnlDevice<T>(input, roi, output, 
+                           x_shape, y_shape,
+                           ndim,
+                           mode, coMode, handle, queue);
+
+    cnnlDestroy(handle);
+    CNRT_CHECK(cnrtQueueDestroy(queue));
+}
+extern "C" void nearest_cnnl(void const *input, float const *roi, void *output, 
+                           int *x_shape, int *y_shape,
+                           int ndim, int byteSize)
+{
+    if (byteSize == 2)
+    {
+        resizeCnnl<uint16_t>(input, roi, output, 
+                           x_shape, y_shape,
+                           ndim,
+                           ResizeMode::Nearest, CoordinateMode::tfCropAndResize);
+    }
+    else if (byteSize == 4)
+    {
+        resizeCnnl<float>(input, roi, output, 
+                           x_shape, y_shape,
+                           ndim,
+                           ResizeMode::Nearest, CoordinateMode::tfCropAndResize);
+    }
+}
+extern "C" void bilinear_cnnl(void const *input, float const *roi, void *output, 
+                           int *x_shape, int *y_shape,
+                           int ndim, int byteSize)
+{
+    if (byteSize == 2)
+    {
+        resizeCnnl<uint16_t>(input, roi, output, 
+                           x_shape, y_shape,
+                           ndim,
+                           ResizeMode::Bilinear, CoordinateMode::tfCropAndResize);
+    }
+    else if (byteSize == 4)
+    {
+        resizeCnnl<float>(input, roi, output, 
+                           x_shape, y_shape,
+                           ndim,
+                           ResizeMode::Bilinear, CoordinateMode::tfCropAndResize);
+    }
+}
diff --git a/test/resize.py b/test/resize.py
@@ -0,0 +1,65 @@
+import torch
+import torchvision
+import ctypes
+import numpy as np
+from functools import partial
+import argparse
+
+import performance
+# 添加上一层目录到模块搜索路径
+import sys
+import os
+
+lib_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.././build/lib/libmy_library.so')
+lib = ctypes.CDLL(lib_path)
+
+def crop_and_resize(input_image, boxes, box_indices, output_size, mode='bilinear'):
+    cropped_images = []
+
+    for box, index in zip(boxes, box_indices):
+        top, left, bottom, right = box
+        cropped_image = torchvision.transforms.functional.crop(input_image[index], top, left, bottom - top, right - left)
+        resized_image = torch.nn.functional(cropped_image.unsqueeze(0), size=output_size, mode=mode, align_corners=False)
+        cropped_images.append(resized_image)
+
+    return torch.cat(cropped_images, dim=0)
+
+def test(inputShape, roi, device):
+    operator = "nearest"
+    byteSize = 2
+
+    if byteSize == 2:
+        tensor_dtype = torch.float16
+    elif byteSize == 4:
+        tensor_dtype = torch.float32
+    print(
+        f"Testing {operator} reduce on {device} with inputShape:{inputShape}, roi:{roi}, dtype:{tensor_dtype}"
+    )
+
+
+    a = torch.rand(inputShape, dtype=tensor_dtype).to(device)
+    ndim = len(inputShape)
+
+    aData = ctypes.cast(a.data_ptr(), ctypes.POINTER(ctypes.c_void_p))
+
+    aShape = np.array(inputShape, dtype=np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int))
+
+
+    if operator == "nearest":
+        if device == "mlu":
+            torch_reduce_time = performance.BangProfile((maxReduce, (a, axes)))  
+            lib.nearest_cnnl.argtypes = [
+                ctypes.POINTER(ctypes.c_void_p),
+                ctypes.POINTER(ctypes.c_float),
+                ctypes.POINTER(ctypes.c_void_p),
+                ctypes.POINTER(ctypes.c_int),
+                ctypes.POINTER(ctypes.c_int),
+                ctypes.c_int,
+                ctypes.c_int
+            ]
+            custom_reduce_time = \
+            performance.BangProfile((lib.nearest_cnnl, (aData, axes_ptr, cData, aShape, cShape,
+                                    ndim, len(axes), byteSize)))
+        performance.logBenchmark(torch_reduce_time, custom_reduce_time)
+        # 将结果转换回 PyTorch 张量以进行比较
+        tmpa = maxReduce(a, axes).to('cpu').numpy().flatten()