Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions src/softmax/cpu/softmax.cpp → src/softmax/cpu/softmax_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <math.h>
#include "cpu/common_cpu.h"
template <typename T>
void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride)
void softmaxDevice(void const *input, void *output, int size, int dimsize, int stride)
{
int othersize = size / dimsize;
auto source = reinterpret_cast<const T *>(input);
Expand Down Expand Up @@ -59,11 +59,14 @@ void softmax_cpu(void const *input, void *output, int size, int dimsize, int str
}
}
}
extern "C" void softmax_cpu_f32(void const *input, void *output, int size, int dimsize, int stride)
extern "C" void softmax_cpu(void const *input, void *output, int size, int dimsize, int stride, int byteSize)
{
    // C entry point: dispatch on the element byte width.
    // 4 -> float32; 2 -> fp16 carried as raw uint16_t bits (conversion is
    // presumably handled inside softmaxDevice / common_cpu.h — see that header).
    switch (byteSize)
    {
    case 4:
        softmaxDevice<float>(input, output, size, dimsize, stride);
        break;
    case 2:
        softmaxDevice<uint16_t>(input, output, size, dimsize, stride);
        break;
    default:
        // Any other width is a silent no-op, matching the original behavior.
        break;
    }
}
extern "C" void softmax_cpu_f16(void const *input, void *output, int size, int dimsize, int stride)
{
softmax_cpu<uint16_t>(input, output, size, dimsize, stride);
}
15 changes: 9 additions & 6 deletions src/softmax/gpu/softmax_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,14 @@ void softmaxLaunch(void const *input, void *output, int size, int dimsize, int s
}
cudaDeviceSynchronize();
}
extern "C" void softmax_nv_f32(void const *input, void *output, int size, int dimsize, int stride)
extern "C" void softmax_nv(void const *input, void *output, int size, int dimsize, int stride, int byteSize)
{
softmaxLaunch<float>(input, output, size, dimsize, stride);
}
extern "C" void softmax_nv_f16(void const *input, void *output, int size, int dimsize, int stride)
{
softmaxLaunch<half>(input, output, size, dimsize, stride);
if (byteSize == 4)
{
softmaxLaunch<float>(input, output, size, dimsize, stride);
}
else if (byteSize == 2)
{
softmaxLaunch<half>(input, output, size, dimsize, stride);
}
}
15 changes: 9 additions & 6 deletions src/softmax/gpu/softmax_cudnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ void softmaxCudnn(void const *input, void *output, int *shape, int ndim)
softmaxCudnnDevice<T>(handle, input, output, shape, ndim);
cudnnDestroy(handle);
}
extern "C" void softmax_cudnn_f32(void const *input, void *output, int *shape, int ndim)
extern "C" void softmax_cudnn(void const *input, void *output, int *shape, int ndim, int byteSize)
{
softmaxCudnn<float>(input, output, shape, ndim);
}
extern "C" void softmax_cudnn_f16(void const *input, void *output, int *shape, int ndim)
{
softmaxCudnn<uint16_t>(input, output, shape, ndim);
if (byteSize == 4)
{
softmaxCudnn<float>(input, output, shape, ndim);
}
else if (byteSize == 2)
{
softmaxCudnn<uint16_t>(input, output, shape, ndim);
}
}
19 changes: 10 additions & 9 deletions src/softmax/mlu/softmax_bang.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -958,17 +958,18 @@ void softmaxUnion1(cnrtQueue_t queue, void const *input, void *output, int other



extern "C" void softmax_bang_f32(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) {
extern "C" void softmax_bang(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim, int byteSize) {
cnrtQueue_t queue;
CNRT_CHECK(cnrtSetDevice(0));
CNRT_CHECK(cnrtQueueCreate(&queue));
softmaxUnion1<float>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
CNRT_CHECK(cnrtQueueDestroy(queue));
}
extern "C" void softmax_bang_f16(void const *input, void *output, int othersize, int dimsize, int frontsize, int stride, int axis, int ndim) {
cnrtQueue_t queue;
CNRT_CHECK(cnrtSetDevice(0));
CNRT_CHECK(cnrtQueueCreate(&queue));
softmaxUnion1<half>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
if (byteSize == 4)
{
softmaxUnion1<float>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
}
else if (byteSize == 2)
{
softmaxUnion1<half>(queue, input, output, othersize, dimsize, frontsize, stride, axis, ndim);
}

CNRT_CHECK(cnrtQueueDestroy(queue));
}
42 changes: 19 additions & 23 deletions src/softmax/mlu/softmax_cnnl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
#include "cnrt.h"
#include <vector>


template<typename T>
template <typename T>
void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int *shape, cnnlHandle_t &handle, cnrtQueue_t &queue)
{
cnnlSoftmaxMode_t mode;
Expand Down Expand Up @@ -79,10 +78,12 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int
cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&cDesc);
cnnlDataType_t dataType;
if(sizeof(T) == 2){
if (sizeof(T) == 2)
{
dataType = CNNL_DTYPE_HALF;
}
else if(sizeof(T) == 4){
else if (sizeof(T) == 4)
{
dataType = CNNL_DTYPE_FLOAT;
}
cnnlSetTensorDescriptor(
Expand All @@ -91,24 +92,22 @@ void softmaxCnnlDevice(T const *source, T *destination, int nDim, int axis, int
cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, dataType,
outDim.size(), outDim.data());

T alpha = 1.0;
T beta = 0.0;
cnnlStatus_t stat =
cnnlSoftmaxForward_v2(handle, CNNL_SOFTMAX_ACCURATE,
mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
&alpha, aDesc, source, &beta, cDesc, destination);

CNRT_CHECK(cnrtQueueSync(queue));


if (stat != CNNL_STATUS_SUCCESS)
return;
cnnlDestroyTensorDescriptor(aDesc);
cnnlDestroyTensorDescriptor(cDesc);

}
template<typename T>
template <typename T>
void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape)
{
auto source = reinterpret_cast<const T *>(input);
Expand All @@ -121,22 +120,19 @@ void softmaxCnnl(void const *input, void *output, int nDim, int axis, int *shape
cnnlSetQueue(handle, queue); // 将队列绑定到 handle 中, 此接口也可用来更改句柄中的队列。

softmaxCnnlDevice(source, destination, nDim, axis, shape, handle, queue);

cnnlDestroy(handle);
CNRT_CHECK(cnrtQueueDestroy(queue));


}

extern "C" void softmax_cnnl_f32(void const *input, void *output, int nDim, int axis, int *shape){
softmaxCnnl<float>(input, output, nDim, axis, shape);
}
extern "C" void softmax_cnnl_f16(void const *input, void *output, int nDim, int axis, int *shape){
softmaxCnnl<uint16_t>(input, output, nDim, axis, shape);
extern "C" void softmax_cnnl(void const *input, void *output, int nDim, int axis, int *shape, int byteSize)
{
    // C entry point for the CNNL backend: pick the template instantiation
    // from the element byte width (4 -> float32, 2 -> fp16 as uint16_t bits).
    switch (byteSize)
    {
    case 4:
        softmaxCnnl<float>(input, output, nDim, axis, shape);
        break;
    case 2:
        softmaxCnnl<uint16_t>(input, output, nDim, axis, shape);
        break;
    default:
        // Unsupported widths fall through without doing anything, as before.
        break;
    }
}






112 changes: 112 additions & 0 deletions src/softmax/npu/softmax_aclnn.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#include "acl/acl.h"
#include "aclnnop/aclnn_softmax.h"
#include <iostream>
#include <vector>
#include "npu/common_npu.h"

template <typename T>
void softmaxAclnnDevice(void *input, void *output, int ndim, int axis, int *shape,
                        aclrtStream &stream)
{
    // Runs aclnn softmax on `input` along `axis`, writing the result to `output`.
    // T only selects the ACL dtype by its size: 2 bytes -> fp16, 4 bytes -> fp32.
    aclDataType dataType;
    if (sizeof(T) == 2)
    {
        dataType = aclDataType::ACL_FLOAT16;
    }
    else if (sizeof(T) == 4)
    {
        dataType = aclDataType::ACL_FLOAT;
    }
    else
    {
        // Bug fix: previously dataType stayed uninitialized for other sizes
        // and was still handed to aclCreateTensor. Bail out instead.
        printf("softmaxAclnnDevice: unsupported element size %d\n", (int)sizeof(T));
        return;
    }
    aclFormat format = aclFormat::ACL_FORMAT_ND;
    // aclCreateTensor only accepts int64_t arrays, so widen the int shape,
    // and build contiguous (row-major) strides from the innermost dim outward.
    std::vector<int64_t> inputDim(ndim);
    std::vector<int64_t> inputStride(ndim, 1);
    std::vector<int64_t> outputDim(ndim);
    std::vector<int64_t> outputStride(ndim, 1);
    for (int i = ndim - 1; i >= 0; i--)
    {
        inputDim[i] = int64_t(shape[i]);
        outputDim[i] = int64_t(shape[i]);
        if (i < ndim - 1)
        {
            inputStride[i] = inputDim[i + 1] * inputStride[i + 1];
            outputStride[i] = outputDim[i + 1] * outputStride[i + 1];
        }
    }
    auto inputTensor =
        aclCreateTensor(inputDim.data(), inputDim.size(), dataType,
                        inputStride.data(), 0, format,
                        inputDim.data(), inputDim.size(), input);
    auto outputTensor =
        aclCreateTensor(outputDim.data(), outputDim.size(), dataType,
                        outputStride.data(), 0, format,
                        outputDim.data(), outputDim.size(), output);

    // Two-phase aclnn call: query workspace size first, then launch.
    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
    auto ret = aclnnSoftmaxGetWorkspaceSize(inputTensor, int64_t(axis), outputTensor,
                                            &workspaceSize, &executor);
    if (ret != ACL_SUCCESS)
    {
        printf("aclnnSoftmaxGetWorkspaceSize failed. ERROR: %d\n", ret);
        // Bug fix: the original fell through and called aclnnSoftmax with an
        // invalid executor; release the tensors and stop here instead.
        aclDestroyTensor(inputTensor);
        aclDestroyTensor(outputTensor);
        return;
    }
    void *workspaceAddr = nullptr;
    if (workspaceSize > 0)
    {
        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        if (ret != ACL_SUCCESS)
        {
            printf("allocate workspace failed. ERROR: %d\n", ret);
            // Bug fix: don't launch without the workspace the op asked for.
            aclDestroyTensor(inputTensor);
            aclDestroyTensor(outputTensor);
            return;
        }
    }

    ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream);
    if (ret != ACL_SUCCESS)
    {
        printf("aclnnSoftmax failed. ERROR: %d\n", ret);
    }
    else
    {
        // Block until the async launch finishes so the caller can read `output`.
        ret = aclrtSynchronizeStream(stream);
        if (ret != ACL_SUCCESS)
        {
            printf("aclrtSynchronizeStream failed. ERROR: %d\n", ret);
        }
    }

    aclDestroyTensor(inputTensor);
    aclDestroyTensor(outputTensor);
    if (workspaceAddr != nullptr)
    {
        aclrtFree(workspaceAddr);
    }
}
template <typename T>
void softmaxAclnn(void *input, void *output, int ndim, int axis, int *shape)
{
    // Acquires device 0 and a stream, runs softmax on it, then tears down.
    // Init/Finalize come from npu/common_npu.h.
    int32_t deviceId = 0;

    aclrtStream stream;
    auto ret = Init(deviceId, &stream);
    if (ret != ACL_SUCCESS)
    {
        printf("Init acl failed. ERROR: %d\n", ret);
        // Bug fix: the original continued and launched the kernel (and called
        // Finalize) on a stream that was never set up; stop here instead.
        return;
    }

    softmaxAclnnDevice<T>(input, output, ndim, axis, shape, stream);
    Finalize(deviceId, stream);
}

extern "C" void softmax_aclnn(void *input, void *output, int ndim, int axis, int *shape, int byteSize)
{
    // C entry point for the NPU backend: the element byte width picks the
    // instantiation (4 -> float32, 2 -> fp16 carried as uint16_t bits).
    switch (byteSize)
    {
    case 4:
        softmaxAclnn<float>(input, output, ndim, axis, shape);
        break;
    case 2:
        softmaxAclnn<uint16_t>(input, output, ndim, axis, shape);
        break;
    default:
        // Other widths are silently ignored, matching the original behavior.
        break;
    }
}
Loading