diff --git a/src/softmax/cpu/softmax.cpp b/src/softmax/cpu/softmax_cpu.cpp
similarity index 84%
rename from src/softmax/cpu/softmax.cpp
rename to src/softmax/cpu/softmax_cpu.cpp
index 4c862c7..3898044 100644
--- a/src/softmax/cpu/softmax.cpp
+++ b/src/softmax/cpu/softmax_cpu.cpp
@@ -2,7 +2,7 @@
 #include <math.h>
 #include "cpu/common_cpu.h"
 template <typename T>
-void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride)
+void softmaxDevice(void const *input, void *output, int size, int dimsize, int stride)
 {
     int othersize = size / dimsize;
     auto source = reinterpret_cast<const T *>(input);
@@ -59,11 +59,14 @@ void softmax_cpu(void const *input, void *output, int size, int dimsize, int str
         }
     }
 }
-extern "C" void softmax_cpu_f32(void const *input, void *output, int size, int dimsize, int stride)
+extern "C" void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride, int byteSize)
 {
-    softmax_cpu<float>(input, output, size, dimsize, stride);
+    if (byteSize == 4)
+    {
+        softmaxDevice<float>(input, output, size, dimsize, stride);
+    }
+    else if (byteSize == 2)
+    {
+        softmaxDevice<uint16_t>(input, output, size, dimsize, stride);
+    }
 }
-extern "C" void softmax_cpu_f16(void const *input, void *output, int size, int dimsize, int stride)
-{
-    softmax_cpu<uint16_t>(input, output, size, dimsize, stride);
-}
\ No newline at end of file
diff --git a/src/softmax/gpu/softmax_cuda.cu b/src/softmax/gpu/softmax_cuda.cu
index 674db65..4f6be5c 100644
--- a/src/softmax/gpu/softmax_cuda.cu
+++ b/src/softmax/gpu/softmax_cuda.cu
@@ -365,11 +365,14 @@ void softmaxLaunch(void const *input, void *output, int size, int dimsize, int s
     }
     cudaDeviceSynchronize();
 }
-extern "C" void softmax_nv_f32(void const *input, void *output, int size, int dimsize, int stride)
+extern "C" void softmax_nv(void const *input, void *output, int size, int dimsize, int stride, int byteSize)
 {
-    softmaxLaunch<float>(input, output, size, dimsize, stride);
-}
-extern "C" void softmax_nv_f16(void const *input, void *output, int size, int dimsize, int stride)
-{
-    softmaxLaunch<half>(input, output, size, dimsize, stride);
+    if (byteSize == 4)
+    {
+        softmaxLaunch<float>(input, output, size, dimsize, stride);
+    }
+    else if (byteSize == 2)
+    {
+        softmaxLaunch<half>(input, output, size, dimsize, stride);
+    }
 }
diff --git a/src/softmax/gpu/softmax_cudnn.cpp b/src/softmax/gpu/softmax_cudnn.cpp
index 25ab423..182c0fe 100644
--- a/src/softmax/gpu/softmax_cudnn.cpp
+++ b/src/softmax/gpu/softmax_cudnn.cpp
@@ -42,11 +42,14 @@ void softmaxCudnn(void const *input, void *output, int *shape, int ndim)
     softmaxCudnnDevice<T>(handle, input, output, shape, ndim);
     cudnnDestroy(handle);
 }
-extern "C" void softmax_cudnn_f32(void const *input, void *output, int *shape, int ndim)
+extern "C" void softmax_cudnn(void const *input, void *output, int *shape, int ndim, int byteSize)
 {
-    softmaxCudnn<float>(input, output, shape, ndim);
-}
-extern "C" void softmax_cudnn_f16(void const *input, void *output, int *shape, int ndim)
-{
-    softmaxCudnn<uint16_t>(input, output, shape, ndim);
+    if (byteSize == 4)
+    {
+        softmaxCudnn<float>(input, output, shape, ndim);
+    }
+    else if (byteSize == 2)
+    {
+        softmaxCudnn<uint16_t>(input, output, shape, ndim);
+    }
 }
diff --git a/src/softmax/mlu/softmax_bang.mlu b/src/softmax/mlu/softmax_bang.mlu
index e0c7b41..e52288c 100644
--- a/src/softmax/mlu/softmax_bang.mlu
+++ b/src/softmax/mlu/softmax_bang.mlu
@@ -958,17 +958,18 @@ void softmaxUnion1(cnrtQueue_t queue, void const *input, void *output, int other
 
 
 
-extern "C" void softmax_bang_f32(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) {
+extern "C" void softmax_bang(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim, int byteSize) {
     cnrtQueue_t queue;
     CNRT_CHECK(cnrtSetDevice(0));
     CNRT_CHECK(cnrtQueueCreate(&queue));
-    softmaxUnion1<float>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
-    CNRT_CHECK(cnrtQueueDestroy(queue));
-}
-extern "C" void softmax_bang_f16(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) {
-    cnrtQueue_t queue;
-    CNRT_CHECK(cnrtSetDevice(0));
-    CNRT_CHECK(cnrtQueueCreate(&queue));
-    softmaxUnion1<half>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
+    if (byteSize == 4)
+    {
+        softmaxUnion1<float>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
+    }
+    else if (byteSize == 2)
+    {
+        softmaxUnion1<half>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
+    }
+    
     CNRT_CHECK(cnrtQueueDestroy(queue));
 }
diff --git a/src/softmax/mlu/softmax_cnnl.cpp b/src/softmax/mlu/softmax_cnnl.cpp
index c7eddd8..762aa0b 100644
--- a/src/softmax/mlu/softmax_cnnl.cpp
+++ b/src/softmax/mlu/softmax_cnnl.cpp
@@ -2,8 +2,7 @@
 #include "cnrt.h"
 #include <vector>
 
-
-template<typename T>
+template <typename T>
 void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int *shape, cnnlHandle_t &handle, cnrtQueue_t &queue)
 {
     cnnlSoftmaxMode_t mode;
@@ -79,10 +78,12 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int
     cnnlCreateTensorDescriptor(&aDesc);
     cnnlCreateTensorDescriptor(&cDesc);
     cnnlDataType_t dataType;
-    if(sizeof(T) == 2){
+    if (sizeof(T) == 2)
+    {
         dataType = CNNL_DTYPE_HALF;
     }
-    else if(sizeof(T) == 4){
+    else if (sizeof(T) == 4)
+    {
         dataType = CNNL_DTYPE_FLOAT;
     }
     cnnlSetTensorDescriptor(
@@ -91,24 +92,22 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int
     cnnlSetTensorDescriptor(
         cDesc, CNNL_LAYOUT_ARRAY, dataType,
         outDim.size(), outDim.data());
-    
+
     T alpha = 1.0;
     T beta = 0.0;
     cnnlStatus_t stat =
         cnnlSoftmaxForward_v2(handle, CNNL_SOFTMAX_ACCURATE,
                               mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
                               &alpha, aDesc, source, &beta, cDesc, destination);
-  
+
     CNRT_CHECK(cnrtQueueSync(queue));
 
-   
     if (stat != CNNL_STATUS_SUCCESS)
         return;
     cnnlDestroyTensorDescriptor(aDesc);
     cnnlDestroyTensorDescriptor(cDesc);
-    
 }
-template<typename T>
+template <typename T>
 void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape)
 {
     auto source = reinterpret_cast<const T *>(input);
@@ -121,22 +120,19 @@ void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape
     cnnlSetQueue(handle, queue); // 将队列绑定到 handle 中, 此接口也可用来更改句柄中的队列。
 
     softmaxCnnlDevice(source, destination, nDim, axis, shape, handle, queue);
-    
+
     cnnlDestroy(handle);
     CNRT_CHECK(cnrtQueueDestroy(queue));
-
-    
 }
 
-extern "C" void softmax_cnnl_f32(void const *input, void *output, int nDim, int axis, int *shape){
-    softmaxCnnl<float>(input, output, nDim, axis, shape);
-}
-extern "C" void softmax_cnnl_f16(void const *input, void *output, int nDim, int axis, int *shape){
-    softmaxCnnl<uint16_t>(input, output, nDim, axis, shape);
+extern "C" void softmax_cnnl(void const *input, void *output, int nDim, int axis, int *shape, int byteSize)
+{
+    if (byteSize == 4)
+    {
+        softmaxCnnl<float>(input, output, nDim, axis, shape);
+    }
+    else if (byteSize == 2)
+    {
+        softmaxCnnl<uint16_t>(input, output, nDim, axis, shape);
+    }
 }
-
-
-
-
-
-
diff --git a/src/softmax/npu/softmax_aclnn.cpp b/src/softmax/npu/softmax_aclnn.cpp
new file mode 100644
index 0000000..b08d2b8
--- /dev/null
+++ b/src/softmax/npu/softmax_aclnn.cpp
@@ -0,0 +1,112 @@
+#include "acl/acl.h"
+#include "aclnnop/aclnn_softmax.h"
+#include <iostream>
+#include <vector>
+#include "npu/common_npu.h"
+
+template <typename T>
+void softmaxAclnnDevice(void *input, void *output, int ndim, int axis, int *shape,
+                        aclrtStream &stream)
+{
+    aclDataType dataType;
+    if (sizeof(T) == 2)
+    {
+        dataType = aclDataType::ACL_FLOAT16;
+    }
+    else if (sizeof(T) == 4)
+    {
+        dataType = aclDataType::ACL_FLOAT;
+    }
+    aclFormat format = aclFormat::ACL_FORMAT_ND;
+    std::vector<int64_t> inputDim(ndim); // aclCreateTensor只支持int64_t的数组
+    std::vector<int64_t> inputStride(ndim, 1);
+    std::vector<int64_t> outputDim(ndim);
+    std::vector<int64_t> outputStride(ndim, 1);
+    for (int i = ndim - 1; i >= 0; i--)
+    {
+        inputDim[i] = int64_t(shape[i]);
+        outputDim[i] = int64_t(shape[i]);
+        if (i < ndim - 1)
+        {
+            inputStride[i] = inputDim[i + 1] * inputStride[i + 1];
+            outputStride[i] = outputDim[i + 1] * outputStride[i + 1];
+        }
+    }
+    auto inputTensor =
+        aclCreateTensor(inputDim.data(), inputDim.size(), dataType,
+                        inputStride.data(), 0, format,
+                        inputDim.data(), inputDim.size(), input); // const aclTensor *inputTensor
+
+    auto outputTensor =
+        aclCreateTensor(outputDim.data(), outputDim.size(), dataType,
+                        outputStride.data(), 0, format,
+                        outputDim.data(), outputDim.size(), output);
+    // 下面开始正式计算
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    auto ret = aclnnSoftmaxGetWorkspaceSize(inputTensor, int64_t(axis), outputTensor,
+                                            &workspaceSize, &executor);
+
+    if (ret != ACL_SUCCESS)
+    {
+        printf("aclnnSoftmaxGetWorkspaceSize failed. ERROR: %d\n", ret);
+    }
+    void *workspaceAddr = nullptr;
+    if (workspaceSize > 0)
+    {
+        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+        if (ret != ACL_SUCCESS)
+        {
+            printf("allocate workspace failed. ERROR: %d\n", ret);
+        }
+    }
+
+    ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor,
+                       stream);
+
+    if (ret != ACL_SUCCESS)
+    {
+        printf("aclnnSoftmax failed. ERROR: %d\n", ret);
+    }
+    ret = aclrtSynchronizeStream(stream);
+
+    if (ret != ACL_SUCCESS)
+    {
+        printf("aclrtSynchronizeStream failed. ERROR: %d\n", ret);
+    }
+
+    aclDestroyTensor(inputTensor);
+    aclDestroyTensor(outputTensor);
+    if (workspaceSize > 0)
+    {
+        aclrtFree(workspaceAddr);
+    }
+}
+template <typename T>
+void softmaxAclnn(void *input, void *output, int ndim, int axis, int *shape)
+{
+    int32_t deviceId = 0;
+
+    aclrtStream stream;
+    auto ret = Init(deviceId, &stream);
+    if (ret != ACL_SUCCESS)
+    {
+        printf("Init acl failed. ERROR: %d\n", ret);
+    }
+
+    softmaxAclnnDevice<T>(input, output, ndim, axis, shape, stream);
+    Finalize(deviceId, stream);
+}
+
+extern "C" void softmax_aclnn(void *input, void *output, int ndim, int axis, int *shape, int byteSize)
+{
+    if (byteSize == 4)
+    {
+        softmaxAclnn<float>(input, output, ndim, axis, shape);
+    }
+    else if (byteSize == 2)
+    {
+        softmaxAclnn<uint16_t>(input, output, ndim, axis, shape);
+    }
+}
diff --git a/test/softmax.py b/test/softmax.py
index 3187f3d..7fb9126 100644
--- a/test/softmax.py
+++ b/test/softmax.py
@@ -25,7 +25,11 @@ def dataPrew(test_shape, test_axis):
             break
         stride *= test_shape[i]
     return size, stride, dimsize
-def test(test_shape, test_axis, test_dtype, device):
+def test(test_shape, test_axis, device):
+    byteSize = 2
+    test_dtype = torch.float16
+    if byteSize == 4:
+        test_dtype = torch.float32
     print(
         f"Testing Softmax on {device} with x_shape:{test_shape} , axis:{test_axis} ,dtype:{test_dtype}"
     )
@@ -36,130 +40,79 @@ def test(test_shape, test_axis, test_dtype, device):
 
     input_ptr = ctypes.cast(Q.data_ptr(), ctypes.POINTER(ctypes.c_void_p))
     output_ptr = ctypes.cast(Q_output.data_ptr(), ctypes.POINTER(ctypes.c_void_p))
-    if test_dtype == torch.float32:
-        if device == "cuda":
-            torch_softmax_time = performance.CudaProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
-            lib.softmax_nv_f32.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.CudaProfile((lib.softmax_nv_f32, (input_ptr, output_ptr, size, dimsize, stride)))  # 以毫秒为单位
-        if device == "cpu":
-            torch_softmax_time = performance.CpuProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
-            lib.softmax_cpu_f32.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.CpuProfile((lib.softmax_cpu_f32, (input_ptr, output_ptr, size, dimsize, stride)))  # 以毫秒为单位
-        if device == "mlu":
-            torch_softmax_time = performance.BangProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
-            
-            '''
-            frontsize = 1
-            othersize = 1
-            for s in range(ndim - 1, -1, -1):
-                if (s < test_axis): 
-                    frontsize *= test_shape[s]
-                if (s != test_axis):
-                    othersize *= test_shape[s];
-            
-            lib.softmax_bang_f32.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.BangProfile((lib.softmax_bang_f32, (input_ptr, output_ptr, othersize, dimsize, frontsize, stride, test_axis, ndim)))
-            '''
-            lib.softmax_cnnl_f32.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.POINTER(ctypes.c_int)
-            ]
-            import numpy as np
-            np_array = np.array(test_shape, dtype=np.int32)
-            ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-            custom_softmax_time = performance.BangProfile((lib.softmax_cnnl_f32, (input_ptr, output_ptr, ndim, test_axis, ctypes_array)))
-    elif test_dtype == torch.float16:
-        if device == "cuda":
-            torch_softmax_time = performance.CudaProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
-            '''
-            lib.softmax_nv_f16.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.CudaProfile((lib.softmax_nv_f16, (input_ptr, output_ptr, size, dimsize, stride))) 
-            '''
-            import numpy as np
-            np_array = np.array(test_shape, dtype=np.int32)
-            ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-            lib.softmax_cudnn_f16.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_int),
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.CudaProfile((lib.softmax_cudnn_f16, (input_ptr, output_ptr, ctypes_array, ndim)))
-        if device == "cpu":
-            torch_softmax_time = performance.CpuProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
-            lib.softmax_cpu_f16.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.CpuProfile((lib.softmax_cpu_f16, (input_ptr, output_ptr, size, dimsize, stride)))  # 以毫秒为单位
-        if device == "mlu":
-            torch_softmax_time = performance.BangProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
-            ndim = len(test_shape)
-            '''
-            frontsize = 1
-            othersize = 1
-            for s in range(ndim - 1, -1, -1):
-                if (s < test_axis): 
-                    frontsize *= test_shape[s]
-                if (s != test_axis):
-                    othersize *= test_shape[s];
-            
-            lib.softmax_bang_f16.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.c_int
-            ]
-            custom_softmax_time = performance.BangProfile((lib.softmax_bang_f16, (input_ptr, output_ptr, othersize, dimsize, frontsize, stride, test_axis, ndim))) 
-            '''
-            lib.softmax_cnnl_f16.argtypes = [
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.POINTER(ctypes.c_void_p),
-                ctypes.c_int,
-                ctypes.c_int,
-                ctypes.POINTER(ctypes.c_int)
-            ]
-            import numpy as np
-            np_array = np.array(test_shape, dtype=np.int32)
-            ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-            custom_softmax_time = performance.BangProfile((lib.softmax_cnnl_f16, (input_ptr, output_ptr, ndim, test_axis, ctypes_array)))
+    
+    np_array = np.array(test_shape, dtype=np.int32)
+    ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
+
+    if device == "cuda":
+        torch_softmax_time = performance.CudaProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
+        lib.softmax_nv.argtypes = [
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int
+        ]
+        custom_softmax_time = performance.CudaProfile((lib.softmax_nv, (input_ptr, output_ptr, size, dimsize, stride, byteSize)))  # 以毫秒为单位
+    elif device == "cpu":
+        torch_softmax_time = performance.CpuProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
+        lib.softmax_cpu.argtypes = [
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int
+        ]
+        custom_softmax_time = performance.CpuProfile((lib.softmax_cpu, (input_ptr, output_ptr, size, dimsize, stride, byteSize)))  # 以毫秒为单位
+    elif device == "mlu":
+        torch_softmax_time = performance.BangProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
+        
+        '''
+        frontsize = 1
+        othersize = 1
+        for s in range(ndim - 1, -1, -1):
+            if (s < test_axis): 
+                frontsize *= test_shape[s]
+            if (s != test_axis):
+                othersize *= test_shape[s];
+        
+        lib.softmax_bang.argtypes = [
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.c_int
+        ]
+        custom_softmax_time = performance.BangProfile((lib.softmax_bang, (input_ptr, output_ptr, othersize, dimsize, frontsize, stride, test_axis, ndim, byteSize)))
+        '''
+        lib.softmax_cnnl.argtypes = [
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.POINTER(ctypes.c_int),
+            ctypes.c_int
+        ]
+        
+        custom_softmax_time = performance.BangProfile((lib.softmax_cnnl, (input_ptr, output_ptr, ndim, test_axis, ctypes_array, byteSize)))
+    elif device == "npu":
+        torch_softmax_time = performance.AscendProfile((torch.softmax, (Q, test_axis)))  # 以毫秒为单位
+        lib.softmax_aclnn.argtypes = [
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.c_int,
+            ctypes.c_int,
+            ctypes.POINTER(ctypes.c_int),
+            ctypes.c_int
+        ]
+        
+        custom_softmax_time = performance.AscendProfile((lib.softmax_aclnn, (input_ptr, output_ptr, ndim, test_axis, ctypes_array, byteSize)))
     performance.logBenchmark(torch_softmax_time, custom_softmax_time)
     # 将结果转换回 PyTorch 张量以进行比较
     tmpa = torch.softmax(Q, test_axis).to('cpu').numpy().flatten()
@@ -175,44 +128,25 @@ def test(test_shape, test_axis, test_dtype, device):
 
 # 解析命令行参数
 parser = argparse.ArgumentParser(description="Test softmax on different devices.")
-parser.add_argument('--device', choices=['cpu', 'cuda', 'mlu'], required=True, help="Device to run the tests on.")
+parser.add_argument('--device', choices=['cpu', 'cuda', 'mlu', 'npu'], required=True, help="Device to run the tests on.")
 args = parser.parse_args()    
 
 test_cases = [
         # x_shape, axis
-        ((700, 1200, 24), 0, torch.float32, 'cuda'),
-        ((700, 1200, 24), 1, torch.float32, 'cuda'), 
-        ((700, 1200, 24), 2, torch.float32, 'cuda'), 
-
-        ((700, 1200, 24), 0, torch.float32, 'mlu'),
-        ((700, 1200, 24), 1, torch.float32, 'mlu'), 
-        ((700, 1200, 24), 2, torch.float32, 'mlu'), 
-
-        ((70, 12, 24), 0, torch.float32, 'cpu'),
-        ((70, 12, 24), 1, torch.float32, 'cpu'), 
-        ((70, 12, 24), 2, torch.float32, 'cpu'), 
-
-        # x_shape, axis
-        ((700, 1200, 24), 0, torch.float16, 'cuda'),
-        ((700, 1200, 24), 1, torch.float16, 'cuda'), 
-        ((700, 1200, 24), 2, torch.float16, 'cuda'), 
-
-        ((700, 1200, 24), 0, torch.float16, 'mlu'),
-        ((700, 1200, 24), 1, torch.float16, 'mlu'), 
-        ((700, 1200, 24), 2, torch.float16, 'mlu'), 
-
-        ((70, 12, 24), 0, torch.float16, 'cpu'),
-        ((70, 12, 24), 1, torch.float16, 'cpu'), 
-        ((70, 12, 24), 2, torch.float16, 'cpu'), 
+        ((700, 1200, 24), 0),
+        ((700, 1200, 24), 1), 
+        ((700, 1200, 24), 2), 
+        # cpu用小规模数据
+        # ((70, 12, 24), 0),
+        # ((70, 12, 24), 1), 
+        # ((70, 12, 24), 2), 
          
 ]
-filtered_test_cases = [
-    (test_shape, test_axis, test_dtype, device)
-    for test_shape, test_axis, test_dtype, device in test_cases
-    if device == args.device
-]
+
 if args.device == 'mlu':
     import torch_mlu
+if args.device == 'npu':
+    import torch_npu
 # 执行过滤后的测试用例
-for test_shape, test_axis, test_dtype, device in filtered_test_cases:
-    test(test_shape, test_axis, test_dtype, device)
\ No newline at end of file
+for test_shape, test_axis in test_cases:
+    test(test_shape, test_axis, args.device)
\ No newline at end of file