Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions src/softmax/cpu/softmax.cpp → src/softmax/cpu/softmax_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <math.h>
#include "cpu/common_cpu.h"
template <typename T>
void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride)
void softmaxDevice(void const *input, void *output, int size, int dimsize, int stride)
{
int othersize = size / dimsize;
auto source = reinterpret_cast<const T *>(input);
Expand Down Expand Up @@ -59,11 +59,14 @@ void softmax_cpu(void const *input, void *output, int size, int dimsize, int str
}
}
}
extern "C" void softmax_cpu_f32(void const *input, void *output, int size, int dimsize, int stride)
extern "C" void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride, int byteSize)
{
    // C entry point: dispatch on the element byte width.
    // 4 -> float32; 2 -> fp16 carried as raw uint16_t bits (conversion is
    // presumably handled inside softmaxDevice / common_cpu.h — see that header).
    switch (byteSize)
    {
    case 4:
        softmaxDevice<float>(input, output, size, dimsize, stride);
        break;
    case 2:
        softmaxDevice<uint16_t>(input, output, size, dimsize, stride);
        break;
    default:
        // Any other width is a silent no-op, matching the original behavior.
        break;
    }
}
extern "C" void softmax_cpu_f16(void const *input, void *output, int size, int dimsize, int stride)
{
softmax_cpu<uint16_t>(input, output, size, dimsize, stride);
}
15 changes: 9 additions & 6 deletions src/softmax/gpu/softmax_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,14 @@ void softmaxLaunch(void const *input, void *output, int size, int dimsize, int s
}
cudaDeviceSynchronize();
}
extern "C" void softmax_nv_f32(void const *input, void *output, int size, int dimsize, int stride)
extern "C" void softmax_nv(void const *input, void *output, int size, int dimsize, int stride, int byteSize)
{
softmaxLaunch<float>(input, output, size, dimsize, stride);
}
extern "C" void softmax_nv_f16(void const *input, void *output, int size, int dimsize, int stride)
{
softmaxLaunch<half>(input, output, size, dimsize, stride);
if (byteSize == 4)
{
softmaxLaunch<float>(input, output, size, dimsize, stride);
}
else if (byteSize == 2)
{
softmaxLaunch<half>(input, output, size, dimsize, stride);
}
}
15 changes: 9 additions & 6 deletions src/softmax/gpu/softmax_cudnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ void softmaxCudnn(void const *input, void *output, int *shape, int ndim)
softmaxCudnnDevice<T>(handle, input, output, shape, ndim);
cudnnDestroy(handle);
}
extern "C" void softmax_cudnn_f32(void const *input, void *output, int *shape, int ndim)
extern "C" void softmax_cudnn(void const *input, void *output, int *shape, int ndim, int byteSize)
{
softmaxCudnn<float>(input, output, shape, ndim);
}
extern "C" void softmax_cudnn_f16(void const *input, void *output, int *shape, int ndim)
{
softmaxCudnn<uint16_t>(input, output, shape, ndim);
if (byteSize == 4)
{
softmaxCudnn<float>(input, output, shape, ndim);
}
else if (byteSize == 2)
{
softmaxCudnn<uint16_t>(input, output, shape, ndim);
}
}
19 changes: 10 additions & 9 deletions src/softmax/mlu/softmax_bang.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -958,17 +958,18 @@ void softmaxUnion1(cnrtQueue_t queue, void const *input, void *output, int other



extern "C" void softmax_bang_f32(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) {
extern "C" void softmax_bang(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim, int byteSize) {
cnrtQueue_t queue;
CNRT_CHECK(cnrtSetDevice(0));
CNRT_CHECK(cnrtQueueCreate(&queue));
softmaxUnion1<float>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
CNRT_CHECK(cnrtQueueDestroy(queue));
}
extern "C" void softmax_bang_f16(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) {
cnrtQueue_t queue;
CNRT_CHECK(cnrtSetDevice(0));
CNRT_CHECK(cnrtQueueCreate(&queue));
softmaxUnion1<half>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
if (byteSize == 4)
{
softmaxUnion1<float>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
}
else if (byteSize == 2)
{
softmaxUnion1<half>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
}

CNRT_CHECK(cnrtQueueDestroy(queue));
}
42 changes: 19 additions & 23 deletions src/softmax/mlu/softmax_cnnl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
#include "cnrt.h"
#include <vector>


template<typename T>
template <typename T>
void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int *shape, cnnlHandle_t &handle, cnrtQueue_t &queue)
{
cnnlSoftmaxMode_t mode;
Expand Down Expand Up @@ -79,10 +78,12 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int
cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&cDesc);
cnnlDataType_t dataType;
if(sizeof(T) == 2){
if (sizeof(T) == 2)
{
dataType = CNNL_DTYPE_HALF;
}
else if(sizeof(T) == 4){
else if (sizeof(T) == 4)
{
dataType = CNNL_DTYPE_FLOAT;
}
cnnlSetTensorDescriptor(
Expand All @@ -91,24 +92,22 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int
cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, dataType,
outDim.size(), outDim.data());

T alpha = 1.0;
T beta = 0.0;
cnnlStatus_t stat =
cnnlSoftmaxForward_v2(handle, CNNL_SOFTMAX_ACCURATE,
mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
&alpha, aDesc, source, &beta, cDesc, destination);

CNRT_CHECK(cnrtQueueSync(queue));


if (stat != CNNL_STATUS_SUCCESS)
return;
cnnlDestroyTensorDescriptor(aDesc);
cnnlDestroyTensorDescriptor(cDesc);

}
template<typename T>
template <typename T>
void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape)
{
auto source = reinterpret_cast<const T *>(input);
Expand All @@ -121,22 +120,19 @@ void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape
cnnlSetQueue(handle, queue); // 将队列绑定到 handle 中, 此接口也可用来更改句柄中的队列。

softmaxCnnlDevice(source, destination, nDim, axis, shape, handle, queue);

cnnlDestroy(handle);
CNRT_CHECK(cnrtQueueDestroy(queue));


}

extern "C" void softmax_cnnl_f32(void const *input, void *output, int nDim, int axis, int *shape){
softmaxCnnl<float>(input, output, nDim, axis, shape);
}
extern "C" void softmax_cnnl_f16(void const *input, void *output, int nDim, int axis, int *shape){
softmaxCnnl<uint16_t>(input, output, nDim, axis, shape);
extern "C" void softmax_cnnl(void const *input, void *output, int nDim, int axis, int *shape, int byteSize)
{
    // C entry point for the CNNL backend: pick the template instantiation
    // from the element byte width (4 -> float32, 2 -> fp16 as uint16_t bits).
    switch (byteSize)
    {
    case 4:
        softmaxCnnl<float>(input, output, nDim, axis, shape);
        break;
    case 2:
        softmaxCnnl<uint16_t>(input, output, nDim, axis, shape);
        break;
    default:
        // Unsupported widths fall through without doing anything, as before.
        break;
    }
}






112 changes: 112 additions & 0 deletions src/softmax/npu/softmax_aclnn.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#include "acl/acl.h"
#include "aclnnop/aclnn_softmax.h"
#include <iostream>
#include <vector>
#include "npu/common_npu.h"

template <typename T>
void softmaxAclnnDevice(void *input, void *output, int ndim, int axis, int *shape,
                        aclrtStream &stream)
{
    // Runs aclnn softmax on `input` along `axis`, writing the result to `output`.
    // T only selects the ACL dtype by its size: 2 bytes -> fp16, 4 bytes -> fp32.
    aclDataType dataType;
    if (sizeof(T) == 2)
    {
        dataType = aclDataType::ACL_FLOAT16;
    }
    else if (sizeof(T) == 4)
    {
        dataType = aclDataType::ACL_FLOAT;
    }
    else
    {
        // Bug fix: previously dataType stayed uninitialized for other sizes
        // and was still handed to aclCreateTensor. Bail out instead.
        printf("softmaxAclnnDevice: unsupported element size %d\n", (int)sizeof(T));
        return;
    }
    aclFormat format = aclFormat::ACL_FORMAT_ND;
    // aclCreateTensor only accepts int64_t arrays, so widen the int shape,
    // and build contiguous (row-major) strides from the innermost dim outward.
    std::vector<int64_t> inputDim(ndim);
    std::vector<int64_t> inputStride(ndim, 1);
    std::vector<int64_t> outputDim(ndim);
    std::vector<int64_t> outputStride(ndim, 1);
    for (int i = ndim - 1; i >= 0; i--)
    {
        inputDim[i] = int64_t(shape[i]);
        outputDim[i] = int64_t(shape[i]);
        if (i < ndim - 1)
        {
            inputStride[i] = inputDim[i + 1] * inputStride[i + 1];
            outputStride[i] = outputDim[i + 1] * outputStride[i + 1];
        }
    }
    auto inputTensor =
        aclCreateTensor(inputDim.data(), inputDim.size(), dataType,
                        inputStride.data(), 0, format,
                        inputDim.data(), inputDim.size(), input);
    auto outputTensor =
        aclCreateTensor(outputDim.data(), outputDim.size(), dataType,
                        outputStride.data(), 0, format,
                        outputDim.data(), outputDim.size(), output);

    // Two-phase aclnn call: query workspace size first, then launch.
    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
    auto ret = aclnnSoftmaxGetWorkspaceSize(inputTensor, int64_t(axis), outputTensor,
                                            &workspaceSize, &executor);
    if (ret != ACL_SUCCESS)
    {
        printf("aclnnSoftmaxGetWorkspaceSize failed. ERROR: %d\n", ret);
        // Bug fix: the original fell through and called aclnnSoftmax with an
        // invalid executor; release the tensors and stop here instead.
        aclDestroyTensor(inputTensor);
        aclDestroyTensor(outputTensor);
        return;
    }
    void *workspaceAddr = nullptr;
    if (workspaceSize > 0)
    {
        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        if (ret != ACL_SUCCESS)
        {
            printf("allocate workspace failed. ERROR: %d\n", ret);
            // Bug fix: don't launch without the workspace the op asked for.
            aclDestroyTensor(inputTensor);
            aclDestroyTensor(outputTensor);
            return;
        }
    }

    ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream);
    if (ret != ACL_SUCCESS)
    {
        printf("aclnnSoftmax failed. ERROR: %d\n", ret);
    }
    else
    {
        // Block until the async launch finishes so the caller can read `output`.
        ret = aclrtSynchronizeStream(stream);
        if (ret != ACL_SUCCESS)
        {
            printf("aclrtSynchronizeStream failed. ERROR: %d\n", ret);
        }
    }

    aclDestroyTensor(inputTensor);
    aclDestroyTensor(outputTensor);
    if (workspaceAddr != nullptr)
    {
        aclrtFree(workspaceAddr);
    }
}
template <typename T>
void softmaxAclnn(void *input, void *output, int ndim, int axis, int *shape)
{
    // Acquires device 0 and a stream, runs softmax on it, then tears down.
    // Init/Finalize come from npu/common_npu.h.
    int32_t deviceId = 0;

    aclrtStream stream;
    auto ret = Init(deviceId, &stream);
    if (ret != ACL_SUCCESS)
    {
        printf("Init acl failed. ERROR: %d\n", ret);
        // Bug fix: the original continued and launched the kernel (and called
        // Finalize) on a stream that was never set up; stop here instead.
        return;
    }

    softmaxAclnnDevice<T>(input, output, ndim, axis, shape, stream);
    Finalize(deviceId, stream);
}

extern "C" void softmax_aclnn(void *input, void *output, int ndim, int axis, int *shape, int byteSize)
{
    // C entry point for the NPU backend: the element byte width picks the
    // instantiation (4 -> float32, 2 -> fp16 carried as uint16_t bits).
    switch (byteSize)
    {
    case 4:
        softmaxAclnn<float>(input, output, ndim, axis, shape);
        break;
    case 2:
        softmaxAclnn<uint16_t>(input, output, ndim, axis, shape);
        break;
    default:
        // Other widths are silently ignored, matching the original behavior.
        break;
    }
}
Loading