diff --git a/src/softmax/cpu/softmax.cpp b/src/softmax/cpu/softmax_cpu.cpp similarity index 84% rename from src/softmax/cpu/softmax.cpp rename to src/softmax/cpu/softmax_cpu.cpp index 4c862c7..3898044 100644 --- a/src/softmax/cpu/softmax.cpp +++ b/src/softmax/cpu/softmax_cpu.cpp @@ -2,7 +2,7 @@ #include #include "cpu/common_cpu.h" template -void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride) +void softmaxDevice(void const *input, void *output, int size, int dimsize, int stride) { int othersize = size / dimsize; auto source = reinterpret_cast(input); @@ -59,11 +59,14 @@ void softmax_cpu(void const *input, void *output, int size, int dimsize, int str } } } -extern "C" void softmax_cpu_f32(void const *input, void *output, int size, int dimsize, int stride) +extern "C" void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride, int byteSize) { - softmax_cpu(input, output, size, dimsize, stride); + if (byteSize == 4) + { + softmaxDevice(input, output, size, dimsize, stride); + } + else if (byteSize == 2) + { + softmaxDevice(input, output, size, dimsize, stride); + } } -extern "C" void softmax_cpu_f16(void const *input, void *output, int size, int dimsize, int stride) -{ - softmax_cpu(input, output, size, dimsize, stride); -} \ No newline at end of file diff --git a/src/softmax/gpu/softmax_cuda.cu b/src/softmax/gpu/softmax_cuda.cu index 674db65..4f6be5c 100644 --- a/src/softmax/gpu/softmax_cuda.cu +++ b/src/softmax/gpu/softmax_cuda.cu @@ -365,11 +365,14 @@ void softmaxLaunch(void const *input, void *output, int size, int dimsize, int s } cudaDeviceSynchronize(); } -extern "C" void softmax_nv_f32(void const *input, void *output, int size, int dimsize, int stride) +extern "C" void softmax_nv(void const *input, void *output, int size, int dimsize, int stride, int byteSize) { - softmaxLaunch(input, output, size, dimsize, stride); -} -extern "C" void softmax_nv_f16(void const *input, void *output, int size, int dimsize, int stride) -{ - softmaxLaunch(input, output, size, dimsize, stride); + if (byteSize == 4) + { + softmaxLaunch(input, output, size, dimsize, stride); + } + else if (byteSize == 2) + { + softmaxLaunch(input, output, size, dimsize, stride); + } } diff --git a/src/softmax/gpu/softmax_cudnn.cpp b/src/softmax/gpu/softmax_cudnn.cpp index 25ab423..182c0fe 100644 --- a/src/softmax/gpu/softmax_cudnn.cpp +++ b/src/softmax/gpu/softmax_cudnn.cpp @@ -42,11 +42,14 @@ void softmaxCudnn(void const *input, void *output, int *shape, int ndim) softmaxCudnnDevice(handle, input, output, shape, ndim); cudnnDestroy(handle); } -extern "C" void softmax_cudnn_f32(void const *input, void *output, int *shape, int ndim) +extern "C" void softmax_cudnn(void const *input, void *output, int *shape, int ndim, int byteSize) { - softmaxCudnn(input, output, shape, ndim); -} -extern "C" void softmax_cudnn_f16(void const *input, void *output, int *shape, int ndim) -{ - softmaxCudnn(input, output, shape, ndim); + if (byteSize == 4) + { + softmaxCudnn(input, output, shape, ndim); + } + else if (byteSize == 2) + { + softmaxCudnn(input, output, shape, ndim); + } } diff --git a/src/softmax/mlu/softmax_bang.mlu b/src/softmax/mlu/softmax_bang.mlu index e0c7b41..e52288c 100644 --- a/src/softmax/mlu/softmax_bang.mlu +++ b/src/softmax/mlu/softmax_bang.mlu @@ -958,17 +958,18 @@ void softmaxUnion1(cnrtQueue_t queue, void const *input, void *output, int other -extern "C" void softmax_bang_f32(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) { +extern "C" void softmax_bang(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim, int byteSize) { cnrtQueue_t queue; CNRT_CHECK(cnrtSetDevice(0)); CNRT_CHECK(cnrtQueueCreate(&queue)); - softmaxUnion1(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim); - CNRT_CHECK(cnrtQueueDestroy(queue)); -} -extern "C" void softmax_bang_f16(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) { - cnrtQueue_t queue; - CNRT_CHECK(cnrtSetDevice(0)); - CNRT_CHECK(cnrtQueueCreate(&queue)); - softmaxUnion1(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim); + if (byteSize == 4) + { + softmaxUnion1(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim); + } + else if (byteSize == 2) + { + softmaxUnion1(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim); + } + CNRT_CHECK(cnrtQueueDestroy(queue)); } diff --git a/src/softmax/mlu/softmax_cnnl.cpp b/src/softmax/mlu/softmax_cnnl.cpp index c7eddd8..762aa0b 100644 --- a/src/softmax/mlu/softmax_cnnl.cpp +++ b/src/softmax/mlu/softmax_cnnl.cpp @@ -2,8 +2,7 @@ #include "cnrt.h" #include - -template +template void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int *shape, cnnlHandle_t &handle, cnrtQueue_t &queue) { cnnlSoftmaxMode_t mode; @@ -79,10 +78,12 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int cnnlCreateTensorDescriptor(&aDesc); cnnlCreateTensorDescriptor(&cDesc); cnnlDataType_t dataType; - if(sizeof(T) == 2){ + if (sizeof(T) == 2) + { dataType = CNNL_DTYPE_HALF; } - else if(sizeof(T) == 4){ + else if (sizeof(T) == 4) + { dataType = CNNL_DTYPE_FLOAT; } cnnlSetTensorDescriptor( @@ -91,24 +92,22 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int cnnlSetTensorDescriptor( cDesc, CNNL_LAYOUT_ARRAY, dataType, outDim.size(), outDim.data()); - + T alpha = 1.0; T beta = 0.0; cnnlStatus_t stat = cnnlSoftmaxForward_v2(handle, CNNL_SOFTMAX_ACCURATE, mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &alpha, aDesc, source, &beta, cDesc, destination); - + CNRT_CHECK(cnrtQueueSync(queue)); - if (stat != CNNL_STATUS_SUCCESS) return; cnnlDestroyTensorDescriptor(aDesc); cnnlDestroyTensorDescriptor(cDesc); - } -template +template void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape) { auto source = reinterpret_cast(input); @@ -121,22 +120,19 @@ void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape cnnlSetQueue(handle, queue); // 将队列绑定到 handle 中, 此接口也可用来更改句柄中的队列。 softmaxCnnlDevice(source, destination, nDim, axis, shape, handle, queue); - + cnnlDestroy(handle); CNRT_CHECK(cnrtQueueDestroy(queue)); - - } -extern "C" void softmax_cnnl_f32(void const *input, void *output, int nDim, int axis, int *shape){ - softmaxCnnl(input, output, nDim, axis, shape); -} -extern "C" void softmax_cnnl_f16(void const *input, void *output, int nDim, int axis, int *shape){ - softmaxCnnl(input, output, nDim, axis, shape); +extern "C" void softmax_cnnl(void const *input, void *output, int nDim, int axis, int *shape, int byteSize) +{ + if (byteSize == 4) + { + softmaxCnnl(input, output, nDim, axis, shape); + } + else if (byteSize == 2) + { + softmaxCnnl(input, output, nDim, axis, shape); + } } - - - - - - diff --git a/src/softmax/npu/softmax_aclnn.cpp b/src/softmax/npu/softmax_aclnn.cpp new file mode 100644 index 0000000..b08d2b8 --- /dev/null +++ b/src/softmax/npu/softmax_aclnn.cpp @@ -0,0 +1,112 @@ +#include "acl/acl.h" +#include "aclnnop/aclnn_softmax.h" +#include +#include +#include "npu/common_npu.h" + +template +void softmaxAclnnDevice(void *input, void *output, int ndim, int axis, int *shape, + aclrtStream &stream) +{ + aclDataType dataType; + if (sizeof(T) == 2) + { + dataType = aclDataType::ACL_FLOAT16; + } + else if (sizeof(T) == 4) + { + dataType = aclDataType::ACL_FLOAT; + } + aclFormat format = aclFormat::ACL_FORMAT_ND; + std::vector inputDim(ndim); // aclCreateTensor只支持int64_t的数组 + std::vector inputStride(ndim, 1); + std::vector outputDim(ndim); + std::vector outputStride(ndim, 1); + for (int i = ndim - 1; i >= 0; i--) + { + inputDim[i] = int64_t(shape[i]); + outputDim[i] = int64_t(shape[i]); + if (i < ndim - 1) + { + inputStride[i] = inputDim[i + 1] * inputStride[i + 1]; + outputStride[i] = outputDim[i + 1] * outputStride[i + 1]; + } + } + auto inputTensor = + aclCreateTensor(inputDim.data(), inputDim.size(), dataType, + inputStride.data(), 0, format, + inputDim.data(), inputDim.size(), input); // const aclTensor *inputTensor + + auto outputTensor = + aclCreateTensor(outputDim.data(), outputDim.size(), dataType, + outputStride.data(), 0, format, + outputDim.data(), outputDim.size(), output); + // 下面开始正式计算 + uint64_t workspaceSize = 0; + aclOpExecutor *executor; + auto ret = aclnnSoftmaxGetWorkspaceSize(inputTensor, int64_t(axis), outputTensor, + &workspaceSize, &executor); + + if (ret != ACL_SUCCESS) + { + printf("aclnnSoftmaxGetWorkspaceSize failed. ERROR: %d\n", ret); + } + void *workspaceAddr = nullptr; + if (workspaceSize > 0) + { + ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (ret != ACL_SUCCESS) + { + printf("allocate workspace failed. ERROR: %d\n", ret); + } + } + + ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor, + stream); + + if (ret != ACL_SUCCESS) + { + printf("aclnnSoftmax failed. ERROR: %d\n", ret); + } + ret = aclrtSynchronizeStream(stream); + + if (ret != ACL_SUCCESS) + { + printf("aclrtSynchronizeStream failed. ERROR: %d\n", ret); + } + + aclDestroyTensor(inputTensor); + aclDestroyTensor(outputTensor); + if (workspaceSize > 0) + { + aclrtFree(workspaceAddr); + } +} +template +void softmaxAclnn(void *input, void *output, int ndim, int axis, int *shape) +{ + int32_t deviceId = 0; + + aclrtStream stream; + auto ret = Init(deviceId, &stream); + if (ret != ACL_SUCCESS) + { + printf("Init acl failed. ERROR: %d\n", ret); + } + + softmaxAclnnDevice(input, output, ndim, axis, shape, stream); + Finalize(deviceId, stream); +} + +extern "C" void softmax_aclnn(void *input, void *output, int ndim, int axis, int *shape, int byteSize) +{ + if (byteSize == 4) + { + softmaxAclnn(input, output, ndim, axis, shape); + } + else if (byteSize == 2) + { + softmaxAclnn(input, output, ndim, axis, shape); + } +} diff --git a/test/softmax.py b/test/softmax.py index 3187f3d..7fb9126 100644 --- a/test/softmax.py +++ b/test/softmax.py @@ -25,7 +25,11 @@ def dataPrew(test_shape, test_axis): break stride *= test_shape[i] return size, stride, dimsize -def test(test_shape, test_axis, test_dtype, device): +def test(test_shape, test_axis, device): + byteSize = 2 + test_dtype = torch.float16 + if byteSize == 4: + test_dtype = torch.float32 print( f"Testing Softmax on {device} with x_shape:{test_shape} , axis:{test_axis} ,dtype:{test_dtype}" ) @@ -36,130 +40,79 @@ def test(test_shape, test_axis, test_dtype, device): input_ptr = ctypes.cast(Q.data_ptr(), ctypes.POINTER(ctypes.c_void_p)) output_ptr = ctypes.cast(Q_output.data_ptr(), ctypes.POINTER(ctypes.c_void_p)) - if test_dtype == torch.float32: - if device == "cuda": - torch_softmax_time = performance.CudaProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 - lib.softmax_nv_f32.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.c_int - ] - custom_softmax_time = performance.CudaProfile((lib.softmax_nv_f32, (input_ptr, output_ptr, size, dimsize, stride))) # 以毫秒为单位 - if device == "cpu": - torch_softmax_time = performance.CpuProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 - lib.softmax_cpu_f32.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.c_int - ] - custom_softmax_time = performance.CpuProfile((lib.softmax_cpu_f32, (input_ptr, output_ptr, size, dimsize, stride))) # 以毫秒为单位 - if device == "mlu": - torch_softmax_time = performance.BangProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 - - ''' - frontsize = 1 - othersize = 1 - for s in range(ndim - 1, -1, -1): - if (s < test_axis): - frontsize *= test_shape[s] - if (s != test_axis): - othersize *= test_shape[s]; - - lib.softmax_bang_f32.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int - ] - custom_softmax_time = performance.BangProfile((lib.softmax_bang_f32, (input_ptr, output_ptr, othersize, dimsize, frontsize, stride, test_axis, ndim))) - ''' - lib.softmax_cnnl_f32.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.POINTER(ctypes.c_int) - ] - import numpy as np - np_array = np.array(test_shape, dtype=np.int32) - ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - custom_softmax_time = performance.BangProfile((lib.softmax_cnnl_f32, (input_ptr, output_ptr, ndim, test_axis, ctypes_array))) - elif test_dtype == torch.float16: - if device == "cuda": - torch_softmax_time = performance.CudaProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 - ''' - lib.softmax_nv_f16.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.c_int - ] - custom_softmax_time = performance.CudaProfile((lib.softmax_nv_f16, (input_ptr, output_ptr, size, dimsize, stride))) - ''' - import numpy as np - np_array = np.array(test_shape, dtype=np.int32) - ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - lib.softmax_cudnn_f16.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_int), - ctypes.c_int - ] - custom_softmax_time = performance.CudaProfile((lib.softmax_cudnn_f16, (input_ptr, output_ptr, ctypes_array, ndim))) - if device == "cpu": - torch_softmax_time = performance.CpuProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 - lib.softmax_cpu_f16.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.c_int - ] - custom_softmax_time = performance.CpuProfile((lib.softmax_cpu_f16, (input_ptr, output_ptr, size, dimsize, stride))) # 以毫秒为单位 - if device == "mlu": - torch_softmax_time = performance.BangProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 - ndim = len(test_shape) - ''' - frontsize = 1 - othersize = 1 - for s in range(ndim - 1, -1, -1): - if (s < test_axis): - frontsize *= test_shape[s] - if (s != test_axis): - othersize *= test_shape[s]; - - lib.softmax_bang_f16.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int - ] - custom_softmax_time = performance.BangProfile((lib.softmax_bang_f16, (input_ptr, output_ptr, othersize, dimsize, frontsize, stride, test_axis, ndim))) - ''' - lib.softmax_cnnl_f16.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, - ctypes.c_int, - ctypes.POINTER(ctypes.c_int) - ] - import numpy as np - np_array = np.array(test_shape, dtype=np.int32) - ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - custom_softmax_time = performance.BangProfile((lib.softmax_cnnl_f16, (input_ptr, output_ptr, ndim, test_axis, ctypes_array))) + + np_array = np.array(test_shape, dtype=np.int32) + ctypes_array = np_array.ctypes.data_as(ctypes.POINTER(ctypes.c_int)) + + if device == "cuda": + torch_softmax_time = performance.CudaProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 + lib.softmax_nv.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int + ] + custom_softmax_time = performance.CudaProfile((lib.softmax_nv, (input_ptr, output_ptr, size, dimsize, stride, byteSize))) # 以毫秒为单位 + elif device == "cpu": + torch_softmax_time = performance.CpuProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 + lib.softmax_cpu.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int + ] + custom_softmax_time = performance.CpuProfile((lib.softmax_cpu, (input_ptr, output_ptr, size, dimsize, stride, byteSize))) # 以毫秒为单位 + elif device == "mlu": + torch_softmax_time = performance.BangProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 + + ''' + frontsize = 1 + othersize = 1 + for s in range(ndim - 1, -1, -1): + if (s < test_axis): + frontsize *= test_shape[s] + if (s != test_axis): + othersize *= test_shape[s]; + + lib.softmax_bang.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int + ] + custom_softmax_time = performance.BangProfile((lib.softmax_bang, (input_ptr, output_ptr, othersize, dimsize, frontsize, stride, test_axis, ndim, byteSize))) + ''' + lib.softmax_cnnl.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int), + ctypes.c_int + ] + + custom_softmax_time = performance.BangProfile((lib.softmax_cnnl, (input_ptr, output_ptr, ndim, test_axis, ctypes_array, byteSize))) + elif device == "npu": + torch_softmax_time = performance.AscendProfile((torch.softmax, (Q, test_axis))) # 以毫秒为单位 + lib.softmax_aclnn.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int), + ctypes.c_int + ] + + custom_softmax_time = performance.AscendProfile((lib.softmax_aclnn, (input_ptr, output_ptr, ndim, test_axis, ctypes_array, byteSize))) performance.logBenchmark(torch_softmax_time, custom_softmax_time) # 将结果转换回 PyTorch 张量以进行比较 tmpa = torch.softmax(Q, test_axis).to('cpu').numpy().flatten() @@ -175,44 +128,25 @@ def test(test_shape, test_axis, test_dtype, device): # 解析命令行参数 parser = argparse.ArgumentParser(description="Test softmax on different devices.") -parser.add_argument('--device', choices=['cpu', 'cuda', 'mlu'], required=True, help="Device to run the tests on.") +parser.add_argument('--device', choices=['cpu', 'cuda', 'mlu', 'npu'], required=True, help="Device to run the tests on.") args = parser.parse_args() test_cases = [ # x_shape, axis - ((700, 1200, 24), 0, torch.float32, 'cuda'), - ((700, 1200, 24), 1, torch.float32, 'cuda'), - ((700, 1200, 24), 2, torch.float32, 'cuda'), - - ((700, 1200, 24), 0, torch.float32, 'mlu'), - ((700, 1200, 24), 1, torch.float32, 'mlu'), - ((700, 1200, 24), 2, torch.float32, 'mlu'), - - ((70, 12, 24), 0, torch.float32, 'cpu'), - ((70, 12, 24), 1, torch.float32, 'cpu'), - ((70, 12, 24), 2, torch.float32, 'cpu'), - - # x_shape, axis - ((700, 1200, 24), 0, torch.float16, 'cuda'), - ((700, 1200, 24), 1, torch.float16, 'cuda'), - ((700, 1200, 24), 2, torch.float16, 'cuda'), - - ((700, 1200, 24), 0, torch.float16, 'mlu'), - ((700, 1200, 24), 1, torch.float16, 'mlu'), - ((700, 1200, 24), 2, torch.float16, 'mlu'), - - ((70, 12, 24), 0, torch.float16, 'cpu'), - ((70, 12, 24), 1, torch.float16, 'cpu'), - ((70, 12, 24), 2, torch.float16, 'cpu'), + ((700, 1200, 24), 0), + ((700, 1200, 24), 1), + ((700, 1200, 24), 2), + # cpu用小规模数据 + # ((70, 12, 24), 0), + # ((70, 12, 24), 1), + # ((70, 12, 24), 2), ] -filtered_test_cases = [ - (test_shape, test_axis, test_dtype, device) - for test_shape, test_axis, test_dtype, device in test_cases - if device == args.device -] + if args.device == 'mlu': import torch_mlu +if args.device == 'npu': + import torch_npu # 执行过滤后的测试用例 -for test_shape, test_axis, test_dtype, device in filtered_test_cases: - test(test_shape, test_axis, test_dtype, device) \ No newline at end of file +for test_shape, test_axis in test_cases: + test(test_shape, test_axis, args.device) \ No newline at end of file