diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 331a8a98..84108c51 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -23,6 +23,7 @@ jobs: - name: Install Python dependencies run: | + pip install numpy pip install torch - name: Install xmake @@ -33,15 +34,15 @@ jobs: - name: configure xmake run: xmake f --cpu=true -cv - - name: Build with XMake - run: xmake - - - name: Find and Set INFINI_ROOT - id: set_infini_root + - name: Set INFINI_ROOT run: | - export INFINI_ROOT=$GITHUB_WORKSPACE + export INFINI_ROOT=$GITHUB_WORKSPACE/.infini + mkdir -p $INFINI_ROOT echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV + - name: Build with XMake + run: xmake build && xmake install + - name: Run Python Tests run: | GREEN='\033[0;32m' diff --git a/README.md b/README.md index 98913cb9..674a874f 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc ## 一、使用说明 -### 配置 +### 1. 配置 #### 查看当前配置 @@ -99,23 +99,27 @@ xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv xmake f --cambricon-mlu=true -cv ``` -### 编译 +#### 配置 NPU + +````xmake +xmake f --ascend-npu=true -cv +```` + +### 2. 编译安装 ```xmake -xmake +xmake build && xmake install ``` -### 将编译好的算子库添加至环境变量 `INFINI_ROOT` +### 3. 设置环境变量 -```bash -export INFINI_ROOT=[PATH_TO_LIBRARY] -``` +按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。 -### 运行算子测试 +### 4. 
运行算子测试 ```bash cd operatorspy/tests -python operator_name.py +python operator_name.py [--cpu | --cuda | --cambricon | --ascend] ``` ## 二、开发说明 diff --git a/include/data_type.h b/include/data_type.h index 1044936a..954a42ea 100644 --- a/include/data_type.h +++ b/include/data_type.h @@ -9,6 +9,7 @@ typedef struct DataLayout { mantissa : 8, exponent : 8; +#ifdef __cplusplus bool operator==(const DataLayout &other) const { union TypePun { DataLayout layout; @@ -24,12 +25,13 @@ typedef struct DataLayout { bool operator!=(const DataLayout &other) const { return !(*this == other); } +#endif } DataLayout; typedef struct DataLayout DT; // clang-format off -constexpr static struct DataLayout +const static struct DataLayout I8 = {1, 1, 1, 7, 0}, I16 = {1, 1, 2, 15, 0}, I32 = {1, 1, 4, 31, 0}, diff --git a/include/ops/concat/concat.h b/include/ops/concat/concat.h new file mode 100644 index 00000000..20ca6339 --- /dev/null +++ b/include/ops/concat/concat.h @@ -0,0 +1,27 @@ +#ifndef CONCAT_H +#define CONCAT_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ConcatDescriptor { + Device device; +} ConcatDescriptor; + +typedef ConcatDescriptor *infiniopConcatDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateConcatDescriptor(infiniopHandle_t handle, + infiniopConcatDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis); + +__C __export infiniopStatus_t infiniopConcat(infiniopConcatDescriptor_t desc, + void *y, + void const **x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyConcatDescriptor(infiniopConcatDescriptor_t desc); + +#endif diff --git a/include/ops/gemm/gemm.h b/include/ops/gemm/gemm.h index 4a39da39..a6eac566 100644 --- a/include/ops/gemm/gemm.h +++ b/include/ops/gemm/gemm.h @@ -18,8 +18,8 @@ __C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t hand infiniopTensorDescriptor_t c_desc, float alpha, float beta, - bool transA, 
- bool transB); + char transA, + char transB); __C __export infiniopStatus_t infiniopGetGEMMWorkspaceSize(infiniopGEMMDescriptor_t desc, uint64_t *size); diff --git a/include/ops/mlp/mlp.h b/include/ops/mlp/mlp.h index 7150c427..9c4c7dd2 100644 --- a/include/ops/mlp/mlp.h +++ b/include/ops/mlp/mlp.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handl infiniopTensorDescriptor_t w12_desc, infiniopTensorDescriptor_t w3_desc, float alpha, - bool residual); + char residual); __C __export infiniopStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, uint64_t *size); diff --git a/operatorspy/liboperators.py b/operatorspy/liboperators.py index 868cc88d..fb58d6a7 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -10,7 +10,6 @@ LIB_OPERATORS_DIR = os.path.join(os.environ.get("INFINI_ROOT"), "lib") - class TensorDescriptor(Structure): _fields_ = [ ("dt", DataLayout), @@ -19,10 +18,8 @@ class TensorDescriptor(Structure): ("pattern", POINTER(c_int64)), ] - infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor) - class CTensor: def __init__(self, desc, data): self.descriptor = desc diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py index c7d54c24..a1f47374 100644 --- a/operatorspy/tests/avg_pool.py +++ b/operatorspy/tests/avg_pool.py @@ -152,10 +152,6 @@ def test( elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") - - print(x) - print(y) - print(ans) assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor)) @@ -205,7 +201,7 @@ def test_musa(lib, test_cases): # ((1, 1, 10), (3,), (1,), (1,)), ((1, 1, 2, 2), (2, 2), (1, 1), (1, 1)), ((32, 4, 224, 224), (3, 3), (1, 1), (2, 2)), - ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + # ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/concat.py 
b/operatorspy/tests/concat.py new file mode 100644 index 00000000..412a4b6f --- /dev/null +++ b/operatorspy/tests/concat.py @@ -0,0 +1,218 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_int64 +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + +class ConcatDescriptor(Structure): + _fields_ = [("device", c_int32),] + + +infiniopConcatDescriptor_t = POINTER(ConcatDescriptor) + + +def concat_py(*tensors, dim=0): + return torch.cat(tensors, dim=dim) + + +def test( + lib, + handle, + torch_device, + c_shape, + axis, + input_shapes, + tensor_dtype=torch.float32, + inplace=Inplace.OUT_OF_PLACE, +): + """ + 测试 concat 算子 + """ + print( + f"Testing Concat on {torch_device} with output_shape:{c_shape}, input_shapes:{input_shapes}, axis:{axis}, dtype:{tensor_dtype}, inplace: {inplace.name}" + ) + + inputs = [torch.rand(shape, dtype=tensor_dtype).to(torch_device) for shape in input_shapes] + + if inplace == Inplace.OUT_OF_PLACE: + c = torch.zeros(c_shape, dtype=tensor_dtype).to(torch_device) + else: + c = torch.zeros(c_shape, dtype=tensor_dtype).to(torch_device) + + ans = concat_py(*inputs, dim=axis) + + input_tensors = [to_tensor(t, lib) for t in inputs] + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else to_tensor(c, lib) + + descriptor = infiniopConcatDescriptor_t() + + num_inputs = len(input_tensors) + input_desc_array_type = infiniopTensorDescriptor_t * num_inputs + input_desc_array = input_desc_array_type(*[t.descriptor for t in input_tensors]) + + check_error( + lib.infiniopCreateConcatDescriptor( + handle, + ctypes.byref(descriptor), + 
c_tensor.descriptor, + input_desc_array, + c_uint64(num_inputs), + c_int64(axis), + ) + ) + + input_data_ptrs = (c_void_p * num_inputs)(*[t.data for t in input_tensors]) + check_error( + lib.infiniopConcat( + descriptor, + c_tensor.data, + ctypes.cast(input_data_ptrs, POINTER(c_void_p)), + None + ) + ) + + assert torch.allclose(c, ans, atol=0, rtol=0), "Concat result does not match PyTorch's result." + + check_error(lib.infiniopDestroyConcatDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, axis, input_shapes, inplace in test_cases: + test(lib, handle, "cpu", c_shape, axis, input_shapes, tensor_dtype = torch.float16, inplace = inplace) + test(lib, handle, "cpu", c_shape, axis, input_shapes, tensor_dtype = torch.float32, inplace = inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, axis, input_shapes, inplace in test_cases: + test(lib, handle, "cuda", c_shape, axis, input_shapes, tensor_dtype = torch.float16, inplace = inplace) + test(lib, handle, "cuda", c_shape, axis, input_shapes, tensor_dtype = torch.float32, inplace = inplace) + destroy_handle(lib, handle) + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, axis, input_shapes, inplace in test_cases: + test(lib, handle, "mlu", c_shape, axis, input_shapes, inplace=inplace) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for c_shape, axis, input_shapes,inplace in test_cases: + test(lib, handle, "musa", c_shape, axis, input_shapes, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + + test_cases = [ + #output_tensor, axis, inputs_tensors, inplace + + ((6,), 0, [(2,), (4,)], 
Inplace.OUT_OF_PLACE), + + ((6, 3), 0, [(2, 3), (4, 3)], Inplace.OUT_OF_PLACE), + ((3, 6), 1, [(3, 2), (3, 4)], Inplace.OUT_OF_PLACE), + ((3, 7), 1, [(3, 2), (3, 4), (3, 1)], Inplace.OUT_OF_PLACE), + ((3, 3, 10), 2, [(3, 3, 4), (3, 3, 6)], Inplace.OUT_OF_PLACE), + ((4, 3, 6), 0, [(3, 3, 6), (1, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 6, 3), 1, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 6), 2, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((4, 3, 5, 6), 0, [(1, 3, 5, 6), (3, 3, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 5, 5, 6), 1, [(2, 3, 5, 6), (2, 2, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), 2, [(2, 3, 2, 6), (2, 3, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), 3, [(2, 3, 5, 3), (2, 3, 5, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 15), 3, [(2, 3, 5, 3), (2, 3, 5, 3), (2, 3, 5, 9)], Inplace.OUT_OF_PLACE), + ((4, 2, 3, 4, 5), 0, [(1, 2, 3, 4, 5), (3, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((2, 4, 3, 2, 5), 1, [(2, 2, 3, 2, 5), (2, 2, 3, 2, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 4, 4, 5), 2, [(1, 2, 2, 4, 5), (1, 2, 2, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 8, 5), 3, [(1, 2, 3, 4, 5), (1, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 4, 5), 4, [(1, 2, 3, 4, 3), (1, 2, 3, 4, 2)], Inplace.OUT_OF_PLACE), + ((4, 14, 3, 4, 5), 1, [(4, 3, 3, 4, 5), (4, 5, 3, 4, 5), (4, 6, 3, 4, 5)], Inplace.OUT_OF_PLACE), + + ((6,), -1, [(2,), (4,)], Inplace.OUT_OF_PLACE), + ((6, 3), -2, [(2, 3), (4, 3)], Inplace.OUT_OF_PLACE), + ((3, 6), -1, [(3, 2), (3, 4)], Inplace.OUT_OF_PLACE), + ((3, 7), -1, [(3, 2), (3, 4), (3, 1)], Inplace.OUT_OF_PLACE), + ((3, 3, 10), -1, [(3, 3, 4), (3, 3, 6)], Inplace.OUT_OF_PLACE), + ((4, 3, 6), -3, [(3, 3, 6), (1, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 6, 3), -2, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 6), -1, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((4, 3, 5, 6), -4, [(1, 3, 5, 6), (3, 3, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 5, 5, 6), -3, [(2, 3, 5, 6), (2, 2, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), 
-2, [(2, 3, 2, 6), (2, 3, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), -1, [(2, 3, 5, 3), (2, 3, 5, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 15), -1, [(2, 3, 5, 3), (2, 3, 5, 3), (2, 3, 5, 9)], Inplace.OUT_OF_PLACE), + ((4, 2, 3, 4, 5), -5, [(1, 2, 3, 4, 5), (3, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((2, 4, 3, 2, 5), -4, [(2, 2, 3, 2, 5), (2, 2, 3, 2, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 4, 4, 5), -3, [(1, 2, 2, 4, 5), (1, 2, 2, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 8, 5), -2, [(1, 2, 3, 4, 5), (1, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 4, 5), -1, [(1, 2, 3, 4, 3), (1, 2, 3, 4, 2)], Inplace.OUT_OF_PLACE), + ((4, 14, 3, 4, 5), -4, [(4, 3, 3, 4, 5), (4, 5, 3, 4, 5), (4, 6, 3, 4, 5)], Inplace.OUT_OF_PLACE), + + ] + + args = get_args() + lib = open_lib() + + lib.infiniopCreateConcatDescriptor.restype = c_int32 + lib.infiniopCreateConcatDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopConcatDescriptor_t), + infiniopTensorDescriptor_t, + POINTER(infiniopTensorDescriptor_t), + c_uint64, # nums_input + c_int64, # axis + ] + + lib.infiniopConcat.restype = c_int32 + lib.infiniopConcat.argtypes = [ + infiniopConcatDescriptor_t, + c_void_p, + POINTER(c_void_p), + c_void_p, + ] + + lib.infiniopDestroyConcatDescriptor.restype = c_int32 + lib.infiniopDestroyConcatDescriptor.argtypes = [ + infiniopConcatDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mConcat Test passed!\033[0m") diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py index 3ddb6a03..7c13aa69 100644 --- a/operatorspy/tests/conv.py +++ b/operatorspy/tests/conv.py @@ -223,14 +223,14 @@ def test_musa(lib, test_cases): if __name__ == "__main__": test_cases = [ # x_shape, w_shape, pads, strides, dilations, x_strides - # ( - 
# (32, 3, 4), - # (32, 3, 5), - # (1,), - # (1,), - # (1,), - # None, - # ), + ( + (32, 3, 4), + (32, 3, 5), + (1,), + (1,), + (1,), + None, + ), ( (1, 3, 4, 4), (2, 3, 3, 3), diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py index e899c7cf..16278b7e 100644 --- a/operatorspy/tests/gemm.py +++ b/operatorspy/tests/gemm.py @@ -201,6 +201,29 @@ def test_cuda(lib, test_cases): test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) destroy_handle(lib, handle) +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + # test(lib, handle, "musa", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "musa", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + destroy_handle(lib, handle) + def test_bang(lib, test_cases): import torch_mlu @@ -362,6 +385,8 @@ def test_bang(lib, test_cases): test_cuda(lib, test_cases) if args.bang: test_bang(lib, test_cases) - if not (args.cpu or args.cuda or args.bang): + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.musa or args.bang): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py index 5c586546..96703c31 100644 --- a/operatorspy/tests/global_avg_pool.py +++ b/operatorspy/tests/global_avg_pool.py @@ -145,23 +145,32 @@ def test_bang(lib, test_cases): test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32) destroy_handle(lib, handle) +def test_musa(lib, test_cases): + import torch_musa + + device = 
DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "musa", x_shape, tensor_dtype = torch.float32) + destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ # x_shape - ((1, 3, 3)), - ((1, 3, 1, 1, 3)), - ((1, 3, 1, 1, 257)), - ((1, 2, 1, 1, 514)), - ((1, 3, 1, 1, 1025)), - ((32, 256, 1, 112, 112)), - ((2, 3, 2048000)), - ((2, 1, 10243)), - ((2, 20, 100)), - ((3, 33, 333)), - ((32, 20, 512)), - ((3, 3, 11, 11, 11, 3, 2)), - ((32, 256, 1, 112, 112)), + ((1, 1, 2, 2)), + ((1, 3, 3, 6)), + ((8, 4, 6, 12)), + # ((1, 3, 1, 1, 3)), + # ((1, 3, 1, 1, 257)), + # ((1, 2, 1, 1, 514)), + # ((1, 3, 1, 1, 1025)), + # ((32, 256, 1, 112, 112)), + # ((2, 3, 2048000)), + # ((2, 1, 10243)), + # ((2, 20, 100)), + # ((3, 33, 333)), + # ((32, 20, 512)), + # ((32, 256, 1, 112, 112)), ((32, 256, 112, 112)), ] args = get_args() @@ -198,6 +207,8 @@ def test_bang(lib, test_cases): test_cuda(lib, test_cases) if args.bang: test_bang(lib, test_cases) - if not (args.cpu or args.cuda or args.bang): + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index a434f7ca..ff4d8037 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -383,6 +383,6 @@ def test_musa(lib, test_cases): test_ascend(lib, test_cases) if args.musa: test_musa(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): + if not (args.cpu or args.cuda or args.musa or args.bang or args.ascend): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py index f80b8e4d..aea9767f 100644 --- a/operatorspy/tests/max_pool.py +++ b/operatorspy/tests/max_pool.py @@ -148,9 +148,9 @@ def test( ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: 
{elapsed :6f}") - print(x) - print(y) - print(ans) + # print(x) + # print(y) + # print(ans) assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor)) @@ -197,9 +197,12 @@ def test_musa(lib, test_cases): if __name__ == "__main__": test_cases = [ # x_shape, kernel_shape, padding, strides - ((1, 1, 10), (3,), (1,), (1,)), + # ((1, 1, 10), (3,), (1,), (1,)), ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), - ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + ((1, 3, 6, 6), (3, 3), (1, 1), (2, 2)), + ((8, 3, 12, 12), (3, 3), (1, 1), (2, 2)), + ((1, 1, 4, 4), (2, 2), (0, 0), (1, 1)), + # ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py index c3de7cc0..bb398f9e 100644 --- a/operatorspy/tests/random_sample.py +++ b/operatorspy/tests/random_sample.py @@ -30,7 +30,7 @@ class RandomSampleDescriptor(Structure): def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): - indices = torch.zeros([topk], dtype = torch.uint64) + indices = torch.zeros([topk], dtype = torch.int64) dataNp = data.clone().detach() sorted_indices = torch.arange(voc) @@ -52,7 +52,7 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): globalM = dataNp[0] dataNp = (dataNp - globalM) / temperature - dataNp = torch.softmax(dataNp, dim = 0) + dataNp = torch.softmax(dataNp.float(), dim = 0) sum_s = 0 for end in range(topk): sum_s += dataNp[end] @@ -63,8 +63,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): else: end = topk - - sum_s = 0 for i in range(end): sum_s += dataNp[i] @@ -78,12 +76,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): def random_sample_0(data): return torch.argmax(data) + def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16): print( 
f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}" ) - - data = torch.rand((voc), dtype=x_dtype).to(torch_device) + data = torch.arange(voc).float() * 0.0001 + _perm = torch.randperm(voc) + data = data[_perm].to(x_dtype).to(torch_device) if(topp > 0 and topk > 1): ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") else: @@ -96,7 +96,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ indices = torch.zeros([1], dtype = torch.uint64).to(torch_device) x_tensor = to_tensor(data, lib) indices_tensor = to_tensor(indices, lib) - if(torch_device == 'mlu' or torch_device == 'musa'): + if(torch_device == 'mlu' or torch_device == 'npu' or torch_device == 'musa'): indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 @@ -127,12 +127,12 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ None, ) ) - assert indices[0].type(ans.dtype) == ans or abs(data[indices[0]] - data[ans]) == 0.0, "compute error" + if torch_device == "npu": + torch.npu.synchronize() - - + assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) - + print("Test passed!") def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU @@ -179,13 +179,16 @@ def test_musa(lib, test_cases): if __name__ == "__main__": test_cases = [ # voc, random_val, topp, topk, temperature - (512, 0.92, 0.8, 3, 0.5), - (4096, 0.95, 0.9, 5, 1.0), - (16384, 0.85, 0.85, 10, 2.0), - (512, 0.92, 0, 3, 0.5), - (4096, 0.95, 0.9, 0, 1.0), - (16384, 0.85, 0, 0, 2.0), - (16384, 0.85, 0, 1, 2.0), + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), ] args = get_args() @@ -231,4 +234,4 @@ 
def test_musa(lib, test_cases): test_musa(lib, test_cases) if not (args.cpu or args.cuda or args.bang or args.ascend or args.musa): test_cpu(lib, test_cases) - print("Test passed!") + print("\033[92mTest passed!\033[0m") diff --git a/src/devices/ascend/CMakeLists.txt b/src/devices/ascend/CMakeLists.txt index 5498de24..8cc7f7f8 100644 --- a/src/devices/ascend/CMakeLists.txt +++ b/src/devices/ascend/CMakeLists.txt @@ -23,5 +23,6 @@ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) ascendc_library(ascend_kernels STATIC ../../ops/swiglu/ascend/swiglu_kernel.cpp ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp + ../../ops/random_sample/ascend/random_sample_kernel.cpp ) diff --git a/src/devices/ascend/common_ascend.cc b/src/devices/ascend/common_ascend.cc index e7b0e55d..fe988e5d 100644 --- a/src/devices/ascend/common_ascend.cc +++ b/src/devices/ascend/common_ascend.cc @@ -8,101 +8,138 @@ int64_t numElements(const int64_t *shape, int64_t num) { return numEle; } -void *mallocWorkspace(uint64_t workspaceSize) { - void *workspaceAddr = nullptr; +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) { + *workspaceAddr = nullptr; if (workspaceSize > 0) { - auto ret = aclrtMalloc(&workspaceAddr, workspaceSize, - ACL_MEM_MALLOC_HUGE_FIRST); + auto ret = aclrtMalloc(workspaceAddr, workspaceSize, + ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret)); + LOG_PRINT("aclrtMalloc failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); } - return workspaceAddr; + return STATUS_SUCCESS; } -void freeWorkspace(void *workspaceAddr) { - aclrtFree(workspaceAddr); +infiniopStatus_t freeWorkspace(void *workspaceAddr) { + if (workspaceAddr != nullptr) { + auto ret = aclrtFree(workspaceAddr); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + return STATUS_SUCCESS; } +aclDataType toAclDataType(DT dt) { + if (dt == I8) + return aclDataType::ACL_INT8; + else if (dt == I16) + return aclDataType::ACL_INT16; + else if (dt == I32) + return aclDataType::ACL_INT32; + else if (dt == I64) + return aclDataType::ACL_INT64; + else if (dt == U8) + return aclDataType::ACL_UINT8; + else if (dt == U16) + return aclDataType::ACL_UINT16; + else if (dt == U32) + return aclDataType::ACL_UINT32; + else if (dt == U64) + return aclDataType::ACL_UINT64; + else if (dt == F16) + return aclDataType::ACL_FLOAT16; + else if (dt == BF16) + return aclDataType::ACL_BF16; + else if (dt == F32) + return aclDataType::ACL_FLOAT; + else if (dt == F64) + return aclDataType::ACL_DOUBLE; + else + return aclDataType::ACL_DT_UNDEFINED; +} + + const char *dataTypeToString(aclDataType dtype) { switch (dtype) { - case ACL_DT_UNDEFINED: - return "ACL_DT_UNDEFINED"; - case ACL_FLOAT: - return "ACL_FLOAT"; - case ACL_FLOAT16: - return "ACL_FLOAT16"; - case ACL_INT8: - return "ACL_INT8"; - case ACL_INT32: - return "ACL_INT32"; - case ACL_UINT8: - return "ACL_UINT8"; - case ACL_INT16: - return "ACL_INT16"; - case ACL_UINT16: - return "ACL_UINT16"; - case ACL_UINT32: - return "ACL_UINT32"; - case ACL_INT64: - return "ACL_INT64"; - case ACL_UINT64: - return "ACL_UINT64"; - case ACL_DOUBLE: - return "ACL_DOUBLE"; - case ACL_BOOL: - return "ACL_BOOL"; - case ACL_STRING: - return "ACL_STRING"; - case ACL_COMPLEX64: - return "ACL_COMPLEX64"; - case ACL_COMPLEX128: - return "ACL_COMPLEX128"; - case ACL_BF16: - return "ACL_BF16"; - case 
ACL_INT4: - return "ACL_INT4"; - case ACL_UINT1: - return "ACL_UINT1"; - case ACL_COMPLEX32: - return "ACL_COMPLEX32"; - default: - return "UNKNOWN"; + case ACL_DT_UNDEFINED: + return "ACL_DT_UNDEFINED"; + case ACL_FLOAT: + return "ACL_FLOAT"; + case ACL_FLOAT16: + return "ACL_FLOAT16"; + case ACL_INT8: + return "ACL_INT8"; + case ACL_INT32: + return "ACL_INT32"; + case ACL_UINT8: + return "ACL_UINT8"; + case ACL_INT16: + return "ACL_INT16"; + case ACL_UINT16: + return "ACL_UINT16"; + case ACL_UINT32: + return "ACL_UINT32"; + case ACL_INT64: + return "ACL_INT64"; + case ACL_UINT64: + return "ACL_UINT64"; + case ACL_DOUBLE: + return "ACL_DOUBLE"; + case ACL_BOOL: + return "ACL_BOOL"; + case ACL_STRING: + return "ACL_STRING"; + case ACL_COMPLEX64: + return "ACL_COMPLEX64"; + case ACL_COMPLEX128: + return "ACL_COMPLEX128"; + case ACL_BF16: + return "ACL_BF16"; + case ACL_INT4: + return "ACL_INT4"; + case ACL_UINT1: + return "ACL_UINT1"; + case ACL_COMPLEX32: + return "ACL_COMPLEX32"; + default: + return "UNKNOWN"; } } const char *formatToString(aclFormat format) { switch (format) { - case ACL_FORMAT_UNDEFINED: - return "ACL_FORMAT_UNDEFINED"; - case ACL_FORMAT_NCHW: - return "ACL_FORMAT_NCHW"; - case ACL_FORMAT_NHWC: - return "ACL_FORMAT_NHWC"; - case ACL_FORMAT_ND: - return "ACL_FORMAT_ND"; - case ACL_FORMAT_NC1HWC0: - return "ACL_FORMAT_NC1HWC0"; - case ACL_FORMAT_FRACTAL_Z: - return "ACL_FORMAT_FRACTAL_Z"; - case ACL_FORMAT_NC1HWC0_C04: - return "ACL_FORMAT_NC1HWC0_C04"; - case ACL_FORMAT_HWCN: - return "ACL_FORMAT_HWCN"; - case ACL_FORMAT_NDHWC: - return "ACL_FORMAT_NDHWC"; - case ACL_FORMAT_FRACTAL_NZ: - return "ACL_FORMAT_FRACTAL_NZ"; - case ACL_FORMAT_NCDHW: - return "ACL_FORMAT_NCDHW"; - case ACL_FORMAT_NDC1HWC0: - return "ACL_FORMAT_NDC1HWC0"; - case ACL_FRACTAL_Z_3D: - return "ACL_FRACTAL_Z_3D"; - case ACL_FORMAT_NC: - return "ACL_FORMAT_NC"; - case ACL_FORMAT_NCL: - return "ACL_FORMAT_NCL"; - default: - return "UNKNOWN"; + case ACL_FORMAT_UNDEFINED: + 
return "ACL_FORMAT_UNDEFINED"; + case ACL_FORMAT_NCHW: + return "ACL_FORMAT_NCHW"; + case ACL_FORMAT_NHWC: + return "ACL_FORMAT_NHWC"; + case ACL_FORMAT_ND: + return "ACL_FORMAT_ND"; + case ACL_FORMAT_NC1HWC0: + return "ACL_FORMAT_NC1HWC0"; + case ACL_FORMAT_FRACTAL_Z: + return "ACL_FORMAT_FRACTAL_Z"; + case ACL_FORMAT_NC1HWC0_C04: + return "ACL_FORMAT_NC1HWC0_C04"; + case ACL_FORMAT_HWCN: + return "ACL_FORMAT_HWCN"; + case ACL_FORMAT_NDHWC: + return "ACL_FORMAT_NDHWC"; + case ACL_FORMAT_FRACTAL_NZ: + return "ACL_FORMAT_FRACTAL_NZ"; + case ACL_FORMAT_NCDHW: + return "ACL_FORMAT_NCDHW"; + case ACL_FORMAT_NDC1HWC0: + return "ACL_FORMAT_NDC1HWC0"; + case ACL_FRACTAL_Z_3D: + return "ACL_FRACTAL_Z_3D"; + case ACL_FORMAT_NC: + return "ACL_FORMAT_NC"; + case ACL_FORMAT_NCL: + return "ACL_FORMAT_NCL"; + default: + return "UNKNOWN"; } } diff --git a/src/devices/ascend/common_ascend.h b/src/devices/ascend/common_ascend.h index 7d3a71b0..9b23fd35 100644 --- a/src/devices/ascend/common_ascend.h +++ b/src/devices/ascend/common_ascend.h @@ -1,29 +1,30 @@ #ifndef __COMMON_ASCEND_H__ #define __COMMON_ASCEND_H__ +#include "operators.h" #include #include #include #include #include +#include #include #include -#include #ifdef __cplusplus extern "C" { #endif -#define CHECK_RET(cond, return_expr) \ - do { \ - if (!(cond)) { \ - return_expr; \ - } \ +#define CHECK_RET(cond, return_expr) \ + do { \ + if (!(cond)) { \ + return_expr; \ + } \ } while (0) -#define LOG_PRINT(message, ...) \ - do { \ - printf(message, ##__VA_ARGS__); \ +#define LOG_PRINT(message, ...) 
\ + do { \ + printf(message, ##__VA_ARGS__); \ } while (0) #ifdef __cplusplus @@ -33,7 +34,8 @@ extern "C" { int64_t numElements(const int64_t *shape, int64_t num); const char *dataTypeToString(aclDataType dtype); const char *formatToString(aclFormat format); -void *mallocWorkspace(uint64_t workspaceSize); -void freeWorkspace(void *workspaceAddr); +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize); +infiniopStatus_t freeWorkspace(void *workspaceAddr); +aclDataType toAclDataType(DT dt); #endif diff --git a/src/devices/ascend/tensor_aclnn.cc b/src/devices/ascend/tensor_aclnn.cc index 7fd41986..0a0fad74 100644 --- a/src/devices/ascend/tensor_aclnn.cc +++ b/src/devices/ascend/tensor_aclnn.cc @@ -2,25 +2,15 @@ #include "../../ops/utils.h" #include -infiniopStatus_t aclnnTensorDescriptor::setDescriptor(DT dtype, const std::vector &shape, const std::vector &strides) { +infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides) { if (shape.size() != strides.size()) { return STATUS_BAD_PARAM; } this->ndim = shape.size(); this->shape = std::vector(shape); this->strides = std::vector(strides); + this->dataType = dtype; - if (dtype_eq(dtype, F16)) { - this->dataType = aclDataType::ACL_FLOAT16; - } else if (dtype_eq(dtype, F32)) { - this->dataType = aclDataType::ACL_FLOAT; - } else if (dtype_eq(dtype, U64)) { - this->dataType = aclDataType::ACL_UINT64; - } else if (dtype_eq(dtype, I64)) { - this->dataType = aclDataType::ACL_INT64; - } else { - return STATUS_BAD_TENSOR_DTYPE; - } // Set format // TODO: Support other format aclFormat format = aclFormat::ACL_FORMAT_ND; @@ -31,59 +21,6 @@ infiniopStatus_t aclnnTensorDescriptor::setDescriptor(DT dtype, const std::vecto return STATUS_SUCCESS; } -// infiniopStatus_t aclnnTensorDescriptor::inferStorageShape(){ -// auto shape = std::vector(); -// auto strides = std::vector(); -// for (uint64_t i = 0; i < this->ndim; ++i) { -// if 
(this->shape[i] > 1){ -// shape.push_back(this->shape[i]); -// strides.push_back(this->strides[i]); -// }else if (this->shape[i] <= 0){ -// return STATUS_BAD_TENSOR_SHAPE; -// } -// } - -// this->storageNdim = shape.size(); -// this->storageShape = std::vector(this->storageNdim, 1); -// std::vector indices(this->storageNdim); -// for (int64_t i = 0; i < this->storageNdim; ++i) { -// indices[i] = i; -// } - -// std::sort(indices.begin(), indices.end(), [&](uint64_t a, uint64_t b) { -// return strides[a] > strides[b]; -// }); -// auto bound = 0; // upper bound of non-zero-strided dimension -// for (int64_t i = 0; i < this->storageNdim; ++i) { -// // sort shape and strides by strides -// shape[i] = this->shape[indices[i]]; -// strides[i] = this->strides[indices[i]]; -// if (strides[i] >= 1){ -// bound++; -// }else if (strides[i] < 0){ -// // negative stride not supported -// return STATUS_BAD_TENSOR_STRIDES; -// } -// } -// // Treat the last non-zero-strided dimension as continuous -// // All trilling zero-strided dimensions are treated as 1 -// shape[bound - 1] = shape[bound - 1] * strides[bound - 1]; -// strides[bound - 1] = 1; -// int64_t carry = 1; -// for (int64_t i = bound - 1; i > 0; --i) { -// // Each non-cummulative stride should be no smaller than corresponding dim -// // and storage shape is the bigger one -// this->storageShape[i] = strides[i-1] / carry; -// if (shape[i] > this->storageShape[i]){ -// return STATUS_BAD_TENSOR_STRIDES; -// } -// carry *= this->storageShape[i]; -// } -// this->storageShape[0] = shape[0]; - -// return STATUS_SUCCESS; -// } - /// @brief Infer storage shape. For now this ruturns a 1D shape of the total tensor storage size. /// We don't see why higher dimensional storage shape is ever needed. To change if necesary. 
@@ -108,7 +45,7 @@ infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTen shape[i] = static_cast(y->shape[i]); strides[i] = y->strides[i]; } - return setDescriptor(y->dt, shape, strides); + return setDescriptor(toAclDataType(y->dt), shape, strides); } /// @brief Wrapper of aclCreateTensor. Create aclTensor. @@ -117,7 +54,7 @@ infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTen /// @param data Data ptr on device global mem. /// @param tensor Pointer of pointer of aclTensor. /// @return -infiniopStatus_t aclnnTensorDescriptor::createTensor() { +infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) { if (this->t) { return STATUS_SUCCESS; } @@ -129,7 +66,7 @@ infiniopStatus_t aclnnTensorDescriptor::createTensor() { this->format, this->storageShape.data(), this->storageNdim, - nullptr); + data); return STATUS_SUCCESS; } diff --git a/src/devices/ascend/tensor_aclnn.h b/src/devices/ascend/tensor_aclnn.h index 44c9e051..4aa25074 100644 --- a/src/devices/ascend/tensor_aclnn.h +++ b/src/devices/ascend/tensor_aclnn.h @@ -2,9 +2,9 @@ #define __ACLNN_TENSOR__ #include "./common_ascend.h" -#include "tensor/tensor_descriptor.h" #include "operators.h" #include "tensor.h" +#include "tensor/tensor_descriptor.h" #include #include #include @@ -24,11 +24,12 @@ struct aclnnTensorDescriptor { aclTensor *t; - infiniopStatus_t setDescriptor(DT dtype, const std::vector &shape, const std::vector &strides); + // Transfer from infiniOp DT to aclDataType + infiniopStatus_t setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides); infiniopStatus_t inferStorageShape(); // Convert form InfiniOpTensorDescriptor infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc); - infiniopStatus_t createTensor(); + infiniopStatus_t createTensor(void *data = nullptr); infiniopStatus_t destroyTensor(); ~aclnnTensorDescriptor(); diff --git a/src/devices/cuda/common_cuda.h 
b/src/devices/cuda/common_cuda.h index 1afe8c3d..0c10122f 100644 --- a/src/devices/cuda/common_cuda.h +++ b/src/devices/cuda/common_cuda.h @@ -40,7 +40,7 @@ typedef struct DTCudnnMapping { } DTCudnnMapping; // DT cudnnDataType_t mapping table -constexpr DTCudnnMapping dtMappings[] = { +const DTCudnnMapping dtMappings[] = { {F16, CUDNN_DATA_HALF}, {F32, CUDNN_DATA_FLOAT}, {F64, CUDNN_DATA_DOUBLE}, diff --git a/src/devices/cuda/cuda_handle.h b/src/devices/cuda/cuda_handle.h index aa293377..f935ed5f 100644 --- a/src/devices/cuda/cuda_handle.h +++ b/src/devices/cuda/cuda_handle.h @@ -6,7 +6,6 @@ #include "device.h" #include "status.h" #include -#include #include #include diff --git a/src/devices/musa/musa_handle.h b/src/devices/musa/musa_handle.h index 0c715b83..cc6209ab 100644 --- a/src/devices/musa/musa_handle.h +++ b/src/devices/musa/musa_handle.h @@ -61,4 +61,17 @@ void use_mudnn(std::shared_ptr> mudnn_handles_t, int dev mudnn_handles_t->push(handle); } + +// template +// musa::dnn::Status use_mudnn(std::shared_ptr> mudnn_handles_t, int device_id, musaStream_t stream, T const &f) { +// auto handle = mudnn_handles_t->pop(); +// if (!handle) { +// musaSetDevice(device_id); +// handle = std::make_shared(device_id); +// } +// musa::dnn::Status status = f(*handle); +// mudnn_handles_t->push(std::move(handle)); +// return status; +// } + #endif // __MUSA_HANDLE_H__ \ No newline at end of file diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc index b010894f..eebcf4be 100644 --- a/src/ops/add/cuda/add.cc +++ b/src/ops/add/cuda/add.cc @@ -46,9 +46,9 @@ infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle, cudaGetDeviceProperties(&prop, handle->device_id); int64_t *a_strides_d, *b_strides_d, *c_strides_d; - checkCudaErrorWithCode(cudaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - 
checkCudaErrorWithCode(cudaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); diff --git a/src/ops/add/musa/add_musa.mu b/src/ops/add/musa/add_musa.mu index ac43d66c..e51197f7 100644 --- a/src/ops/add/musa/add_musa.mu +++ b/src/ops/add/musa/add_musa.mu @@ -84,6 +84,7 @@ void _add_mt_gpu(AddMusaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const add<<>>( c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); } + printf("[SUCCESS to execute add_mt_gpu]\n"); } template diff --git a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc index 38dd61c5..26ed34c1 100644 --- a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc +++ b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc @@ -54,8 +54,8 @@ infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, aclnn_strides[i] = aclnn_shape[i + 1] * aclnn_strides[i + 1]; } - CHECK_STATUS(aDesc->setDescriptor(y->dt, aclnn_shape, aclnn_strides), STATUS_SUCCESS); - CHECK_STATUS(outDesc->setDescriptor(y->dt, aclnn_shape, aclnn_strides), STATUS_SUCCESS); + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(y->dt), 
aclnn_shape, aclnn_strides), STATUS_SUCCESS); + CHECK_STATUS(outDesc->setDescriptor(toAclDataType(y->dt), aclnn_shape, aclnn_strides), STATUS_SUCCESS); // Set mask Desc auto &maskDesc = (*desc_ptr)->maskDesc; @@ -70,7 +70,7 @@ infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, } auto mask_strides = std::vector{total_seq_len * seq_len, total_seq_len, 1}; - CHECK_STATUS(maskDesc->setDescriptor(y->dt, mask_shape, mask_strides), STATUS_SUCCESS); + CHECK_STATUS(maskDesc->setDescriptor(toAclDataType(y->dt), mask_shape, mask_strides), STATUS_SUCCESS); // Create aclTensor CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); @@ -118,7 +118,7 @@ infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, // malloc mask space auto &maskAddr = (*desc_ptr)->maskAddr; auto mask_size = numElements(maskDesc->shape.data(), maskDesc->ndim) * ele_size; - maskAddr = mallocWorkspace(mask_size); + CHECK_STATUS(mallocWorkspace(&maskAddr, mask_size), STATUS_SUCCESS); // copy mask matrix to device mem ret = aclrtMemcpy(maskAddr, @@ -181,7 +181,7 @@ infiniopStatus_t aclnnDestroyCausalSoftmaxDescriptor(CausalSoftmaxAclnnDescripto delete desc->maskDesc; delete desc->outDesc; aclDestroyAclOpExecutor(desc->executor); - freeWorkspace(desc->maskAddr); + CHECK_STATUS(freeWorkspace(desc->maskAddr), STATUS_SUCCESS); delete desc; return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.cc b/src/ops/causal_softmax/bang/causal_softmax_bang.cc index e0e32ca8..cc9b6d37 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.cc +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.cc @@ -33,7 +33,7 @@ infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size) { if (desc->ndim > 3) 
{ *size = desc->ndim * sizeof(int) * 2; } else { diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.h b/src/ops/causal_softmax/bang/causal_softmax_bang.h index a2e503f9..c9e09921 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.h +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.h @@ -21,11 +21,11 @@ infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, CausalSoftmaxBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc); -infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, unsigned long int *size); +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size); infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream); diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu index 57c445a3..bd7fd1af 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu @@ -787,7 +787,7 @@ void causal_softmax_bang_f16(CausalSoftmaxBangDescriptor_t desc, void *workspace infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream) { if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { @@ -798,4 +798,4 @@ infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, return STATUS_SUCCESS; } return STATUS_BAD_TENSOR_DTYPE; -} \ No newline at end of file +} diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc index 5e27cdf1..02adc37f 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc @@ -38,7 +38,7 @@ infiniopStatus_t 
cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size) { *size = sizeof(bool) * desc->dims[0] * desc->dims[1] * desc->dims[2] * desc->dims[3]; return STATUS_SUCCESS; } @@ -52,7 +52,7 @@ infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_ infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream) { if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h index 74b35bf6..feaf274e 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h @@ -22,11 +22,11 @@ infiniopStatus_t cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, CausalSoftmaxCnnlDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc); -infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size); infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream); diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cc b/src/ops/causal_softmax/cuda/causal_softmax.cc index 12e16e33..c7f4d5ed 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cc +++ b/src/ops/causal_softmax/cuda/causal_softmax.cc @@ -1,11 +1,11 @@ #include "causal_softmax.cuh" -#include "../../utils.h" #include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" infiniopStatus_t 
cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, CausalSoftmaxCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y) { - unsigned long int ndim = y->ndim; + uint64_t ndim = y->ndim; // TODO: only support 2d or 3d tensor if (ndim != 2 && ndim != 3) { return STATUS_BAD_TENSOR_SHAPE; @@ -13,12 +13,12 @@ infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, if (!dtype_eq(y->dt, F16)) { return STATUS_BAD_TENSOR_DTYPE; } - unsigned long int total_seq_len = y->shape[ndim - 1]; - unsigned long int seq_len = y->shape[ndim - 2]; - unsigned long int batch_size = 1; - unsigned long int stride_b = 0; - unsigned long int stride_i = y->strides[ndim - 2]; - unsigned long int stride_j = y->strides[ndim - 1]; + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; if (stride_j != 1) { return STATUS_BAD_TENSOR_STRIDES; } @@ -44,7 +44,7 @@ infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cu b/src/ops/causal_softmax/cuda/causal_softmax.cu index 280420a7..09fd1741 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cu +++ b/src/ops/causal_softmax/cuda/causal_softmax.cu @@ -218,17 +218,17 @@ __global__ void fused_softmax_standard( } -void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void* y, void *stream) { - unsigned long int total_seq_len = desc->total_seq_len; - unsigned long int seq_len = desc->seq_len; - unsigned long int batch_size = desc->batch_size; - unsigned long int stride_x = desc->stride_b; - unsigned 
long int stride_y = desc->stride_i; - unsigned long int stride_z = desc->stride_j;// covert byte strides to element strides +void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void *y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides unsigned int max_items_per_thread = desc->max_items_per_thread; dim3 grid(batch_size, seq_len); - + if (max_items_per_thread == 1) { fused_softmax_padding <<>>((half *) (y), stride_x, stride_y, stride_z); @@ -243,13 +243,13 @@ void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void* y, void infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, - void *stream){ - if(cudaSetDevice(desc->device_id) != cudaSuccess){ + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { return STATUS_BAD_DEVICE; } - if (dtype_eq(desc->dtype, F16)){ + if (dtype_eq(desc->dtype, F16)) { causal_softmax_nv_gpu_f16(desc, data, stream); return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cuh b/src/ops/causal_softmax/cuda/causal_softmax.cuh index a2f1f8df..30516bee 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cuh +++ b/src/ops/causal_softmax/cuda/causal_softmax.cuh @@ -1,19 +1,19 @@ #ifndef __CUDA_CAUSAL_SOFTMAX_H__ #define __CUDA_CAUSAL_SOFTMAX_H__ -#include "operators.h" #include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" struct CausalSoftmaxCudaDescriptor { Device device; int device_id; DT dtype; - unsigned long int batch_size; - unsigned long int stride_b; - unsigned long int seq_len; - unsigned long int stride_i; - unsigned long int total_seq_len; - unsigned long int stride_j; + uint64_t batch_size; + 
uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; unsigned int max_items_per_thread; }; @@ -23,11 +23,11 @@ infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, CausalSoftmaxCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc); -infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream); diff --git a/src/ops/concat/cpu/concat_cpu.cc b/src/ops/concat/cpu/concat_cpu.cc new file mode 100644 index 00000000..6c9bd419 --- /dev/null +++ b/src/ops/concat/cpu/concat_cpu.cc @@ -0,0 +1,139 @@ +#include "concat_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateConcatDescriptor( + infiniopHandle_t handle, + ConcatCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis) { + if (y == nullptr || x == nullptr || desc_ptr == nullptr || num_inputs == 0) { + return STATUS_BAD_PARAM; + } + + if (!is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int64_t ndim = y->ndim; + if (axis >= ndim || axis < -ndim) { + return STATUS_BAD_PARAM; + } + + if(axis < 0){ + axis = axis + ndim; + } + + uint64_t total_size = 0; + std::vector> input_shapes(num_inputs); + + std::vector output_shape(y->shape, y->shape + ndim); + + for (size_t i = 0; i < num_inputs; ++i) { + + if (!is_contiguous(x[i])) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (x[i]->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (x[i]->ndim != ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t j = 0; j < ndim; ++j) { + if (j != axis && x[i]->shape[j] != 
y->shape[j]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + input_shapes[i] = std::vector(x[i]->shape, x[i]->shape + ndim); + total_size += x[i]->shape[axis]; + } + + if (total_size != y->shape[axis]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new ConcatCpuDescriptor{ + DevCpu, + y->dt, + axis, + num_inputs, + input_shapes, + output_shape, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyConcatDescriptor(ConcatCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t concatCompute(const ConcatCpuDescriptor_t& desc, + T* y, + void const** x) { + int64_t axis = desc->axis; + uint64_t num_inputs = desc->num_inputs; + const std::vector>& input_shapes = desc->input_shapes; + const std::vector& output_shape = desc->output_shape; + + size_t blockOffsetInner = 1; + for (size_t i = output_shape.size() - 1; i > axis; --i) { + blockOffsetInner *= output_shape[i]; + } + size_t blockOffset = output_shape[axis] * blockOffsetInner; + + for (size_t i = 0; i < num_inputs; ++i) { + const std::vector& input_shape = input_shapes[i]; + + size_t dimOffset = 0; + for (size_t j = 0; j < i; ++j) { + dimOffset += input_shapes[j][axis]; + } + + size_t localBlockOffset = 1; + for (size_t j = input_shape.size() - 1; j >= axis && j != static_cast(-1); --j) { + localBlockOffset *= input_shape[j]; + } + + size_t innerOffset = blockOffsetInner * dimOffset; + size_t inSize = 1; + for (auto dim : input_shape) { + inSize *= dim; + } + + T* input_data = static_cast(const_cast(x[i])); + + #pragma omp parallel for + for (size_t iOffset = 0; iOffset < inSize; ++iOffset) { + + size_t oOffset = iOffset % localBlockOffset + innerOffset + + iOffset / localBlockOffset * blockOffset; + + y[oOffset] = input_data[iOffset]; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuConcat(ConcatCpuDescriptor_t desc, + void *y, + void const **x, + void *stream) { + if (desc->dtype == F16) { + return concatCompute(desc, reinterpret_cast(y), x); 
+ } + if (desc->dtype == F32) { + return concatCompute(desc, reinterpret_cast(y), x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/concat/cpu/concat_cpu.h b/src/ops/concat/cpu/concat_cpu.h new file mode 100644 index 00000000..a8d4d71d --- /dev/null +++ b/src/ops/concat/cpu/concat_cpu.h @@ -0,0 +1,32 @@ +#ifndef __CPU_CONCAT_H__ +#define __CPU_CONCAT_H__ +#include "operators.h" +#include +#include + +struct ConcatCpuDescriptor { + Device device; + DT dtype; + int64_t axis; + uint64_t num_inputs; + std::vector> input_shapes; + std::vector output_shape; +}; + +typedef struct ConcatCpuDescriptor *ConcatCpuDescriptor_t; + +infiniopStatus_t cpuCreateConcatDescriptor(infiniopHandle_t handle, + ConcatCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis); + +infiniopStatus_t cpuConcat(ConcatCpuDescriptor_t desc, + void *y, + void const **x, + void *stream); + +infiniopStatus_t cpuDestroyConcatDescriptor(ConcatCpuDescriptor_t desc); + +#endif diff --git a/src/ops/concat/cuda/concat.cc b/src/ops/concat/cuda/concat.cc new file mode 100644 index 00000000..d99d167b --- /dev/null +++ b/src/ops/concat/cuda/concat.cc @@ -0,0 +1,73 @@ +#include "concat.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateConcatDescriptor(CudaHandle_t handle, + ConcatCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis){ + if (y == nullptr || x == nullptr || desc_ptr == nullptr || num_inputs == 0) { + return STATUS_BAD_PARAM; + } + + if (!is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int64_t ndim = y->ndim; + if (axis >= ndim || axis < -ndim) { + return STATUS_BAD_PARAM; + } + + if(axis < 0){ + axis = axis + ndim; + } + uint64_t total_size = 0; + + std::vector> input_shapes(num_inputs); + std::vector output_shape(y->shape, y->shape + ndim); + + for (size_t i 
= 0; i < num_inputs; ++i) { + + if (!is_contiguous(x[i])) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (x[i]->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (x[i]->ndim != ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t j = 0; j < ndim; ++j) { + if (j != axis && x[i]->shape[j] != y->shape[j]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + input_shapes[i] = std::vector(x[i]->shape, x[i]->shape + ndim); + total_size += x[i]->shape[axis]; + } + + if (total_size != y->shape[axis]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new ConcatCudaDescriptor{ + DevNvGpu, + y->dt, + axis, + num_inputs, + input_shapes, + output_shape, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyConcatDescriptor(ConcatCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/concat/cuda/concat.cu b/src/ops/concat/cuda/concat.cu new file mode 100644 index 00000000..4d4ae521 --- /dev/null +++ b/src/ops/concat/cuda/concat.cu @@ -0,0 +1,86 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "concat.cuh" + +// Kernel function to perform concatenation on NvGPU +template +__global__ void concatKernel(const T* x, T* y, + size_t inSize, + size_t localBlockOffset, + size_t innerOffset, + size_t blockOffset) { + size_t iOffset = blockIdx.x * blockDim.x + threadIdx.x; + if (iOffset < inSize) { + size_t oOffset = (iOffset % localBlockOffset) + innerOffset + + (iOffset / localBlockOffset) * blockOffset; + y[oOffset] = x[iOffset]; + } +} + +template +infiniopStatus_t concatCompute(ConcatCudaDescriptor_t& desc, + T* y, + void const** x, + cudaStream_t stream) { + int64_t axis = desc->axis; + uint64_t num_inputs = desc->num_inputs; + const std::vector>& input_shapes = desc->input_shapes; + const std::vector& output_shape = desc->output_shape; + + size_t blockOffsetInner = 1; + for (size_t i = output_shape.size() - 1; i > axis; --i) { + blockOffsetInner *= 
output_shape[i]; + } + size_t blockOffset = output_shape[axis] * blockOffsetInner; + +#pragma unroll + for (size_t i = 0; i < num_inputs; ++i) { + const std::vector& input_shape = input_shapes[i]; + + size_t dimOffset = 0; + for (size_t j = 0; j < i; ++j) { + dimOffset += input_shapes[j][axis]; + } + + size_t localBlockOffset = 1; + for (size_t j = input_shape.size() - 1; j >= axis && j != static_cast(-1); --j) { + localBlockOffset *= input_shape[j]; + } + + size_t innerOffset = blockOffsetInner * dimOffset; + size_t inSize = 1; + for (auto dim : input_shape) { + inSize *= dim; + } + + T* input_data = static_cast(const_cast(x[i])); + + // Launch CUDA kernel + int threads = 256; + int blocks = (inSize + threads - 1) / threads; + concatKernel<<>>(input_data, y, inSize, localBlockOffset, innerOffset, blockOffset); + + // Check for CUDA errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return STATUS_EXECUTION_FAILED; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaConcat(ConcatCudaDescriptor_t desc, + void* y, + void const** x, + void* stream) { + cudaStream_t cudaStream = reinterpret_cast(stream); + + if (desc->dtype == F16) { + return concatCompute(desc, reinterpret_cast(y), x, cudaStream); + } + if (desc->dtype == F32) { + return concatCompute(desc, reinterpret_cast(y), x, cudaStream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/concat/cuda/concat.cuh b/src/ops/concat/cuda/concat.cuh new file mode 100644 index 00000000..9eeaf06f --- /dev/null +++ b/src/ops/concat/cuda/concat.cuh @@ -0,0 +1,36 @@ +#ifndef __CUDA_CONCAT_H__ +#define __CUDA_CONCAT_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include +#include + +struct ConcatCudaDescriptor { + Device device; + DT dtype; + int64_t axis; + uint64_t num_inputs; + std::vector> input_shapes; + std::vector output_shape; +}; + +typedef struct 
ConcatCudaDescriptor *ConcatCudaDescriptor_t; + +infiniopStatus_t cudaCreateConcatDescriptor(CudaHandle_t handle, + ConcatCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t nums_input, + int64_t axis); + +infiniopStatus_t cudaConcat(ConcatCudaDescriptor_t desc, + void *y, + void const **x, + void *stream); + +infiniopStatus_t cudaDestroyConcatDescriptor(ConcatCudaDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/src/ops/concat/musa/concat_musa.cc b/src/ops/concat/musa/concat_musa.cc new file mode 100644 index 00000000..e4666dda --- /dev/null +++ b/src/ops/concat/musa/concat_musa.cc @@ -0,0 +1,73 @@ +#include "concat_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateConcatDescriptor(MusaHandle_t handle, + ConcatMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis){ + if (y == nullptr || x == nullptr || desc_ptr == nullptr || num_inputs == 0) { + return STATUS_BAD_PARAM; + } + + if (!is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int64_t ndim = y->ndim; + if (axis >= ndim || axis < -ndim) { + return STATUS_BAD_PARAM; + } + + if(axis < 0){ + axis = axis + ndim; + } + uint64_t total_size = 0; + + std::vector> input_shapes(num_inputs); + std::vector output_shape(y->shape, y->shape + ndim); + + for (size_t i = 0; i < num_inputs; ++i) { + + if (!is_contiguous(x[i])) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (x[i]->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (x[i]->ndim != ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t j = 0; j < ndim; ++j) { + if (j != axis && x[i]->shape[j] != y->shape[j]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + input_shapes[i] = std::vector(x[i]->shape, x[i]->shape + ndim); + total_size += x[i]->shape[axis]; + } + + if (total_size != y->shape[axis]) { + return STATUS_BAD_TENSOR_SHAPE; 
+ } + + *desc_ptr = new ConcatMusaDescriptor{ + DevMtGpu, + y->dt, + axis, + num_inputs, + input_shapes, + output_shape, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyConcatDescriptor(ConcatMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/concat/musa/concat_musa.h b/src/ops/concat/musa/concat_musa.h new file mode 100644 index 00000000..6bdea49d --- /dev/null +++ b/src/ops/concat/musa/concat_musa.h @@ -0,0 +1,35 @@ +#ifndef __MUSA_CONCAT_H__ +#define __MUSA_CONCAT_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct ConcatMusaDescriptor { + Device device; + DT dtype; + int64_t axis; + uint64_t num_inputs; + std::vector> input_shapes; + std::vector output_shape; +}; + +typedef struct ConcatMusaDescriptor *ConcatMusaDescriptor_t; + +infiniopStatus_t musaCreateConcatDescriptor(MusaHandle_t handle, + ConcatMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t nums_input, + int64_t axis); + +infiniopStatus_t musaConcat(ConcatMusaDescriptor_t desc, + void *y, + void const **x, + void *stream); + +infiniopStatus_t musaDestroyConcatDescriptor(ConcatMusaDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/src/ops/concat/musa/concat_musa.mu b/src/ops/concat/musa/concat_musa.mu new file mode 100644 index 00000000..734cf905 --- /dev/null +++ b/src/ops/concat/musa/concat_musa.mu @@ -0,0 +1,86 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "concat_musa.h" + +// Kernel function to perform concatenation on MtGPU +template +__global__ void concatKernel(const T* x, T* y, + size_t inSize, + size_t localBlockOffset, + size_t innerOffset, + size_t blockOffset) { + size_t iOffset = blockIdx.x * blockDim.x + threadIdx.x; + if (iOffset < inSize) { + size_t oOffset = (iOffset % localBlockOffset) + 
innerOffset + + (iOffset / localBlockOffset) * blockOffset; + y[oOffset] = x[iOffset]; + } +} + +template +infiniopStatus_t concatCompute(ConcatMusaDescriptor_t& desc, + T* y, + void const** x, + musaStream_t stream) { + int64_t axis = desc->axis; + uint64_t num_inputs = desc->num_inputs; + const std::vector>& input_shapes = desc->input_shapes; + const std::vector& output_shape = desc->output_shape; + + size_t blockOffsetInner = 1; + for (size_t i = output_shape.size() - 1; i > axis; --i) { + blockOffsetInner *= output_shape[i]; + } + size_t blockOffset = output_shape[axis] * blockOffsetInner; + +#pragma unroll + for (size_t i = 0; i < num_inputs; ++i) { + const std::vector& input_shape = input_shapes[i]; + + size_t dimOffset = 0; + for (size_t j = 0; j < i; ++j) { + dimOffset += input_shapes[j][axis]; + } + + size_t localBlockOffset = 1; + for (size_t j = input_shape.size() - 1; j >= axis && j != static_cast(-1); --j) { + localBlockOffset *= input_shape[j]; + } + + size_t innerOffset = blockOffsetInner * dimOffset; + size_t inSize = 1; + for (auto dim : input_shape) { + inSize *= dim; + } + + T* input_data = static_cast(const_cast(x[i])); + + // Launch MUSA kernel + int threads = 256; + int blocks = (inSize + threads - 1) / threads; + concatKernel<<>>(input_data, y, inSize, localBlockOffset, innerOffset, blockOffset); + + // Check for MUSA errors + musaError_t err = musaGetLastError(); + if (err != musaSuccess) { + return STATUS_EXECUTION_FAILED; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaConcat(ConcatMusaDescriptor_t desc, + void* y, + void const** x, + void* stream) { + musaStream_t musaStream = reinterpret_cast(stream); + + if (desc->dtype == F16) { + return concatCompute(desc, reinterpret_cast(y), x, musaStream); + } + if (desc->dtype == F32) { + return concatCompute(desc, reinterpret_cast(y), x, musaStream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/concat/operator.cc 
b/src/ops/concat/operator.cc new file mode 100644 index 00000000..f35bade6 --- /dev/null +++ b/src/ops/concat/operator.cc @@ -0,0 +1,87 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/concat/concat.h" + +#ifdef ENABLE_CPU +#include "cpu/concat_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/concat.cuh" +#endif +#ifdef ENABLE_MT_GPU +#include "../../devices/musa/musa_handle.h" +#include "musa/concat_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateConcatDescriptor( + infiniopHandle_t handle, + infiniopConcatDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateConcatDescriptor(handle, (ConcatCpuDescriptor_t *) desc_ptr, y, x, num_inputs, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateConcatDescriptor((CudaHandle_t) handle, (ConcatCudaDescriptor_t *) desc_ptr, y, x, num_inputs, axis); + } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu:{ + return musaCreateConcatDescriptor((MusaHandle_t) handle, (ConcatMusaDescriptor_t *) desc_ptr, y, x, num_inputs, axis); + } +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopConcat(infiniopConcatDescriptor_t desc, void *y, void const **x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuConcat((ConcatCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + printf("[INTO ENABLE_NV_GPU]\n"); + return cudaConcat((ConcatCudaDescriptor_t) desc, y, x, stream); + } + +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + + return musaConcat((ConcatMusaDescriptor_t) desc, y, x, stream); + } +#endif + + } + return STATUS_BAD_TENSOR_SHAPE; +} + + +__C infiniopStatus_t infiniopDestroyConcatDescriptor(infiniopConcatDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return 
cpuDestroyConcatDescriptor((ConcatCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyConcatDescriptor((ConcatCudaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaDestroyConcatDescriptor((ConcatMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/conv/cpu/conv_cpu.cc b/src/ops/conv/cpu/conv_cpu.cc index dd198d97..ece37d0b 100644 --- a/src/ops/conv/cpu/conv_cpu.cc +++ b/src/ops/conv/cpu/conv_cpu.cc @@ -173,7 +173,8 @@ void _conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_siz Ydata *y, Xdata const *x, Xdata const *w) { if (desc->padded_x_size > 0) { auto padded_x = reinterpret_cast(workspace); - uint64_t padded_shape[desc->ndim]; + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); std::fill(padded_x, padded_x + desc->padded_x_size, 0); getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); diff --git a/src/ops/conv/cuda/conv.cc b/src/ops/conv/cuda/conv.cc index 9a352878..2ccabfda 100644 --- a/src/ops/conv/cuda/conv.cc +++ b/src/ops/conv/cuda/conv.cc @@ -25,7 +25,7 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, return STATUS_BAD_TENSOR_DTYPE; } - const auto new_ndim = std::max(4UL, ndim); + const uint64_t new_ndim = std::max(ndim, (uint64_t)4); // convert pads, strides, dilations into int32[] int32_t *pad = new int32_t[new_ndim]; int32_t *stride = new int32_t[new_ndim]; @@ -87,12 +87,12 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, // create and set tensor descriptors for y cudnnTensorDescriptor_t y_desc; - int outDim[new_ndim]; + std::vector outDim_(new_ndim); + auto outDim = outDim_.data(); checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(op_desc, x_desc, w_desc, new_ndim, outDim)); checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); 
checkCudnnError(cudnnSetTensorNdDescriptorEx(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), new_ndim, y_shape)); - // tuning: get the best algorithm int requestedAlgoCount = 1; checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, @@ -101,7 +101,8 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, int chosenAlgoIndex = 0; bool chosen = false; size_t workspace_size = 0; - cudnnConvolutionFwdAlgoPerf_t perf_results[requestedAlgoCount]; + std::vector perf_results_(requestedAlgoCount); + auto perf_results = perf_results_.data(); checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, [&](cudnnHandle_t handle) { return cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc, op_desc, y_desc, requestedAlgoCount, &algoCounts, perf_results); })); if (algoCounts < 1) { diff --git a/src/ops/conv/musa/conv_musa.cc b/src/ops/conv/musa/conv_musa.cc index 581bc287..d966956e 100644 --- a/src/ops/conv/musa/conv_musa.cc +++ b/src/ops/conv/musa/conv_musa.cc @@ -1,7 +1,6 @@ #include "conv_musa.h" #include "../../../devices/musa/common_musa.h" #include "../../utils.h" -#include infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, ConvMusaDescriptor_t *desc_ptr, @@ -16,6 +15,9 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) { return STATUS_BAD_TENSOR_SHAPE; } + if (ndim > 5 ) { + return STATUS_BAD_TENSOR_SHAPE; + } if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) { return STATUS_BAD_TENSOR_SHAPE; } @@ -26,11 +28,11 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, return STATUS_BAD_TENSOR_DTYPE; } - const auto new_ndim = std::max(4UL, ndim); + const uint64_t new_ndim = std::max(ndim, (uint64_t)4); // convert pads, strides, dilations into int32[] - int *pad = new int[new_ndim]; - int *stride = new int[new_ndim]; - int *dilation = new int[new_ndim]; + int32_t *pad = new 
int32_t[new_ndim]; + int32_t *stride = new int32_t[new_ndim]; + int32_t *dilation = new int32_t[new_ndim]; int64_t *x_shape = new int64_t[new_ndim]; int64_t *w_shape = new int64_t[new_ndim]; int64_t *y_shape = new int64_t[new_ndim]; @@ -60,28 +62,74 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, w_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); } - x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); - y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); - w_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + if (new_ndim == 5) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + } + else if (new_ndim == 4) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + } + else if (new_ndim == 3) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + } + else { + return STATUS_BAD_TENSOR_SHAPE; + } x_tensor->SetNdInfo((int) new_ndim, x_shape); y_tensor->SetNdInfo((int) new_ndim, y_shape); w_tensor->SetNdInfo((int) new_ndim, w_shape); + // musa::dnn::Status status1 = y_tensor->SetNdInfo((int) new_ndim, y_shape); + // if (status1 == musa::dnn::Status::SUCCESS) { + // std::cerr << "Success to set y_tensor." << std::endl; + // } + + // 设置卷积的填充、步长和膨胀 musa::dnn::Convolution* conv_operator = new musa::dnn::Convolution(); - conv_operator->SetNdInfo((int) new_ndim-2, pad, stride, dilation); + musa::dnn::Status status2 = conv_operator->SetNdInfo(new_ndim - 2, pad, stride, dilation); + // if (status2 == musa::dnn::Status::SUCCESS) { + // std::cerr << "Success to set convolution dimensions." << std::endl; + // } + + musa::dnn::Status status3 = conv_operator->SetComputeMode(musa::dnn::Convolution::ComputeMode::TENSOR); + // if (status3 == musa::dnn::Status::SUCCESS) { + // std::cerr << "Success to set compute mode." 
<< std::endl; + // // printf("status3: %s\n",status3); + // printf("SetComputeMode Status:%d\n", static_cast(status3)); + // } + musa::dnn::Convolution::Algorithm algo = musa::dnn::Convolution::Algorithm::DIRECT; - size_t workspace_size = 0; - use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, [&](musa::dnn::Handle* handle) { - printf(" %d \n", conv_operator->GetRecommendForwardAlgorithm(*handle, algo, *y_tensor, *x_tensor, *w_tensor)); - // printf(" %d \n", conv_operator->GetForwardWorkspaceSize(*handle, workspace_size, *y_tensor, *x_tensor, *w_tensor, algo)); - }); + + use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, + [&](musa::dnn::Handle* handle) {conv_operator->GetRecommendForwardAlgorithm(*handle, algo, *y_tensor, *x_tensor, *w_tensor);}); + + size_t workspace_size = 2; + // printf("workspace_size before: %zu\n", workspace_size); + + use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, + [&](musa::dnn::Handle* handle) { + musa::dnn::Status status = conv_operator->GetForwardWorkspaceSize(*handle, workspace_size, *y_tensor, *x_tensor, *w_tensor, algo); + // printf("GetForwardWorkspaceSize status: %d\n", static_cast(status)); + }); + + // printf("workspace_size after: %zu\n", workspace_size); + const float alpha = 1.0f; const float beta = 0.0f; - printf("after: %d\n", algo); - printf("A\n"); + musa::dnn::MemoryMaintainer maintainer = [](size_t size) -> musa::dnn::MemoryHandler { + void* ptr = nullptr; + musaMalloc(&ptr, size); + return musa::dnn::MemoryHandler(ptr, [](void* p) { + if (p) musaFree(p); + }); + }; + + // musa::dnn::MemoryHandler workspace_mem = maintainer(workspace_size); *desc_ptr = new ConvMusaDescriptor{ DevMtGpu, @@ -95,7 +143,9 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, algo, alpha, beta, - workspace_size}; + workspace_size, + maintainer + }; delete[] pad; delete[] stride; diff --git a/src/ops/conv/musa/conv_musa.h b/src/ops/conv/musa/conv_musa.h index d24baf25..959520d3 100644 --- 
a/src/ops/conv/musa/conv_musa.h +++ b/src/ops/conv/musa/conv_musa.h @@ -19,6 +19,7 @@ struct ConvMusaDescriptor { const float alpha; const float beta; uint64_t workspace_size; + musa::dnn::MemoryMaintainer maintainer; }; typedef struct ConvMusaDescriptor *ConvMusaDescriptor_t; diff --git a/src/ops/conv/musa/conv_musa.mu b/src/ops/conv/musa/conv_musa.mu index cb1debec..f8f7cf12 100644 --- a/src/ops/conv/musa/conv_musa.mu +++ b/src/ops/conv/musa/conv_musa.mu @@ -8,12 +8,13 @@ infiniopStatus_t conv_mt_gpu(ConvMusaDescriptor_t desc, void *workspace, uint64_ desc->y_tensor->SetAddr(y); desc->x_tensor->SetAddr(x); desc->w_tensor->SetAddr(w); - printf("b\n"); use_mudnn(desc->mudnn_handles_t, desc->device_id, (musaStream_t) stream, [&](musa::dnn::Handle* handle) { - desc->conv_operator->Run(*handle, *(desc->y_tensor), *(desc->x_tensor), *(desc->w_tensor), desc->algo, nullptr); - + desc->conv_operator->Run(*handle, *(desc->y_tensor), *(desc->x_tensor), *(desc->w_tensor), desc->algo, desc->maintainer); }); + + printf("[SUCCESS to execute conv_mt_gpu]\n"); + return STATUS_SUCCESS; } diff --git a/src/ops/expand/cuda/expand.cc b/src/ops/expand/cuda/expand.cc index cf43b326..d0467c01 100644 --- a/src/ops/expand/cuda/expand.cc +++ b/src/ops/expand/cuda/expand.cc @@ -24,7 +24,7 @@ infiniopStatus_t cudaCreateExpandDescriptor(CudaHandle_t handle, int64_t *x_strides_d, *y_strides_d; char *strides_and_shape_d; - checkCudaErrorWithCode(cudaMalloc(&strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d, x_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + ndim * sizeof(int64_t), y->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), 
STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + 2 * ndim * sizeof(int64_t), y->shape, ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); diff --git a/src/ops/gemm/operator.cc b/src/ops/gemm/operator.cc index 071c2870..7036b032 100644 --- a/src/ops/gemm/operator.cc +++ b/src/ops/gemm/operator.cc @@ -21,8 +21,8 @@ __C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t hand infiniopTensorDescriptor_t c_desc, float alpha, float beta, - bool transA, - bool transB) { + char transA, + char transB) { // transpose a and b if needed a_desc = transA ? permute(a_desc, {1, 0}) : a_desc; b_desc = transB ? permute(b_desc, {1, 0}) : b_desc; diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cc b/src/ops/global_avg_pool/cuda/global_avg_pool.cc index da12cfb4..25d7acbe 100644 --- a/src/ops/global_avg_pool/cuda/global_avg_pool.cc +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cc @@ -82,13 +82,13 @@ infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, }; } else if (x->ndim <= 5) { - int x_shape[ndim]; - int x_strides[ndim]; - int y_shape[ndim]; - int y_strides[ndim]; - int k_shape[ndim - 2]; - int pads[ndim - 2]; - int strides[ndim - 2]; + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads(ndim - 2); + std::vector strides(ndim - 2); #pragma omp parallel for for (size_t i = 0; i < ndim; ++i) { @@ -109,7 +109,7 @@ infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, // create and set tensor descriptors for x cudnnTensorDescriptor_t x_desc; checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); - checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape, x_strides)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); // Create and set pooling 
descriptor for average pooling cudnnPoolingDescriptor_t pool_desc; @@ -118,14 +118,14 @@ infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, CUDNN_NOT_PROPAGATE_NAN, ndim - 2, - k_shape, - pads, - strides)); + k_shape.data(), + pads.data(), + strides.data())); // create and set tensor descriptors for y cudnnTensorDescriptor_t y_desc; checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); - checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape)); - checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape, y_strides)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); float alpha = 1.0f, beta = 0.0f; diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cuh b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh index 35e38d7b..cd97be5b 100644 --- a/src/ops/global_avg_pool/cuda/global_avg_pool.cuh +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh @@ -7,6 +7,7 @@ #include #include #include +#include struct GlobalAvgPoolCudaDescriptor { Device device; diff --git a/src/ops/global_avg_pool/musa/global_avg_pool_musa.cc b/src/ops/global_avg_pool/musa/global_avg_pool_musa.cc new file mode 100644 index 00000000..2739ce3b --- /dev/null +++ b/src/ops/global_avg_pool/musa/global_avg_pool_musa.cc @@ -0,0 +1,157 @@ +#include "global_avg_pool_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateGlobalAvgPoolDescriptor(MusaHandle_t handle, + GlobalAvgPoolMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim <= 2 || ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 0; i < ndim; ++i) { + if (i < 2 && y->shape[i] != x->shape[i]) { + 
return STATUS_BAD_TENSOR_SHAPE; + } else if (i >= 2 && y->shape[i] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + const uint64_t new_ndim = ndim; + + int64_t *x_shape = new int64_t[new_ndim]; + int64_t *y_shape = new int64_t[new_ndim]; + for(size_t i = 0; i < new_ndim; ++i){ + x_shape[i] = static_cast(x->shape[i]); + y_shape[i] = static_cast(y->shape[i]); + + } + + musa::dnn::Tensor *x_tensor = new musa::dnn::Tensor(); + musa::dnn::Tensor *y_tensor = new musa::dnn::Tensor(); + musa::dnn::Tensor *indices = new musa::dnn::Tensor(); + + x_tensor->SetNdInfo((int)new_ndim, x_shape); + y_tensor->SetNdInfo((int)new_ndim, y_shape); + indices->SetNdInfo((int)new_ndim, x_shape); + + if (y->dt == F16) { + x_tensor->SetType(musa::dnn::Tensor::Type::HALF); + y_tensor->SetType(musa::dnn::Tensor::Type::HALF); + indices->SetType(musa::dnn::Tensor::Type::HALF); + } else if (y->dt == F32) { + x_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); + y_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); + indices->SetType(musa::dnn::Tensor::Type::FLOAT); + } + + if (new_ndim == 5) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCDHW); + } + else if (new_ndim == 4) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCHW); + } + else if (new_ndim == 3) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + indices->SetFormat(musa::dnn::Tensor::Format::NCW); + } + else { + return STATUS_BAD_TENSOR_SHAPE; + } + + + int N = x_shape[0]; // batch size + int C = x_shape[1]; // channels + 
int D = x_shape[2]; + int H = x_shape[3]; // height + int W = x_shape[4]; // width + + int kernel[] = {D, H, W}; + int pad[] = {0, 0, 0}; + int stride[] = {D, H, W}; + int dilation[] = {1, 1, 1}; + + musa::dnn::Status status; + musa::dnn::Pooling *pool_desc = new musa::dnn::Pooling(); + + status = pool_desc->SetMode(musa::dnn::Pooling::Mode::GLOBAL_AVGPOOL); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetMode status:%d\n", static_cast(status)); + // } + + // status = pool_desc->SetNdInfo(3, kernel, pad, stride, dilation); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetNdInfo status:%d\n", static_cast(status)); + // } + + // status = pool_desc->SetNdInfo({H}, {0}, {H}, {1}); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetNdInfo status:%d\n", static_cast(status)); + // } + + // status = pool_desc->SetDivisor(H * W); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetDivisor status:%d\n", static_cast(status)); + // } + + const float alpha = 1.0f; + const float beta = 0.0f; + + *desc_ptr = new GlobalAvgPoolMusaDescriptor{ + DevMtGpu, + y->dt, + handle->device_id, + ndim, + 0, + 0, + 0, + 0, + 0, + 0, + handle->mudnn_handles_t, + x_tensor, + y_tensor, + pool_desc, + indices, + alpha, + beta, + }; + + delete[] x_shape; + delete[] y_shape; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolMusaDescriptor_t desc, uint64_t *size) { + *size = desc->ndim <= 5 ? 0 : (desc->dtype != F16 ? 
0 : std::min(desc->dtype.size * 2, 8) * desc->y_data_size); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolMusaDescriptor_t desc) { + if (desc->ndim <= 5) { + delete desc->x_desc; + delete desc->y_desc; + delete desc->pool_desc; + delete desc->indices; + } + desc->mudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/global_avg_pool/musa/global_avg_pool_musa.h b/src/ops/global_avg_pool/musa/global_avg_pool_musa.h new file mode 100644 index 00000000..ccb5ee6d --- /dev/null +++ b/src/ops/global_avg_pool/musa/global_avg_pool_musa.h @@ -0,0 +1,47 @@ +#ifndef __MUSA_GLOBAL_AVG_POOL_H__ +#define __MUSA_GLOBAL_AVG_POOL_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include +#include +#include + +struct GlobalAvgPoolMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t y_data_size; + uint64_t x_per_NC_data_size; + unsigned max_block_size; + uint64_t max_grid_size; + uint64_t items_per_thread; + std::shared_ptr> mudnn_handles_t; + musa::dnn::Tensor *x_desc; + musa::dnn::Tensor *y_desc; + musa::dnn::Pooling *pool_desc; + musa::dnn::Tensor *indices; + const float alpha; + const float beta; +}; + +typedef struct GlobalAvgPoolMusaDescriptor *GlobalAvgPoolMusaDescriptor_t; + +infiniopStatus_t musaCreateGlobalAvgPoolDescriptor(MusaHandle_t, + GlobalAvgPoolMusaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t musaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaGlobalAvgPool(GlobalAvgPoolMusaDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream); + +infiniopStatus_t musaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolMusaDescriptor_t desc); + +#endif \ No newline at end of file diff --git 
a/src/ops/global_avg_pool/musa/global_avg_pool_musa.mu b/src/ops/global_avg_pool/musa/global_avg_pool_musa.mu new file mode 100644 index 00000000..a55c3359 --- /dev/null +++ b/src/ops/global_avg_pool/musa/global_avg_pool_musa.mu @@ -0,0 +1,34 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "global_avg_pool_musa.h" + + +infiniopStatus_t global_avg_pool_mt_gpu(GlobalAvgPoolMusaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream, unsigned pack_size) { + // use muDNN lib + + checkMusaError(musaSetDevice(desc->device_id)); + desc->y_desc->SetAddr(y); + desc->x_desc->SetAddr(x); + + use_mudnn(desc->mudnn_handles_t, desc->device_id, (musaStream_t) stream, [&](musa::dnn::Handle* handle) { + desc->pool_desc->Run(*handle, *(desc->y_desc), *(desc->x_desc), *(desc->indices)); + }); + + printf("[SUCCESS to execute global_avg_pool_mt_gpu]\n"); + + return STATUS_SUCCESS; +} + + + +infiniopStatus_t musaGlobalAvgPool(GlobalAvgPoolMusaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + + if (desc->dtype == F32) { + return global_avg_pool_mt_gpu(desc, workspace, workspace_size, y, x, stream, 1); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/global_avg_pool/operator.cc b/src/ops/global_avg_pool/operator.cc index 92484283..30ed0de7 100644 --- a/src/ops/global_avg_pool/operator.cc +++ b/src/ops/global_avg_pool/operator.cc @@ -9,6 +9,10 @@ #include "../../devices/cuda/cuda_handle.h" #include "cuda/global_avg_pool.cuh" #endif +#ifdef ENABLE_MT_GPU +#include "../../devices/musa/musa_handle.h" +#include "musa/global_avg_pool_musa.h" +#endif #ifdef ENABLE_CAMBRICON_MLU // TODO: Cambricon #endif @@ -29,6 +33,12 @@ __C infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor( } #endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaCreateGlobalAvgPoolDescriptor((MusaHandle_t) handle, 
(GlobalAvgPoolMusaDescriptor_t *) desc_ptr, y, x); + } +#endif + #ifdef ENABLE_CAMBRICON_MLU // TODO #endif @@ -47,6 +57,12 @@ __C infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPool return cudaGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolCudaDescriptor_t) desc, size); } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolMusaDescriptor_t) desc, size); + } + #endif #ifdef ENABLE_CAMBRICON_MLU // TODO: Cambricon support @@ -67,6 +83,12 @@ __C infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t des return cudaGlobalAvgPool((GlobalAvgPoolCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaGlobalAvgPool((GlobalAvgPoolMusaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU // TODO @@ -86,6 +108,12 @@ __C infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoo return cudaDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolCudaDescriptor_t) desc); } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolMusaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU // TODO diff --git a/src/ops/matmul/ascend/matmul_aclnn.cc b/src/ops/matmul/ascend/matmul_aclnn.cc index 158e6d2c..1502469e 100644 --- a/src/ops/matmul/ascend/matmul_aclnn.cc +++ b/src/ops/matmul/ascend/matmul_aclnn.cc @@ -45,9 +45,9 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, auto &bDesc = (*desc_ptr)->bDesc; // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched operation - CHECK_STATUS(cDesc->setDescriptor(c_desc->dt, {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS); - CHECK_STATUS(aDesc->setDescriptor(a_desc->dt, {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, 
info->a_matrix.col_stride}), STATUS_SUCCESS); - CHECK_STATUS(bDesc->setDescriptor(b_desc->dt, {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS); CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS); CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); @@ -69,13 +69,12 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, // aclnnGemm support C = alpha * A @ B + beta * C // see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc, - (*desc_ptr)->mt, &workspaceSize, &executor); + (*desc_ptr)->mt, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret); - return STATUS_EXECUTION_FAILED); + LOG_PRINT("aclnnGemmGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); aclSetAclOpExecutorRepeatable(executor); - return STATUS_SUCCESS; } @@ -109,14 +108,14 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, aclrtSetDevice(desc->device_id); for (int i = 0; i < batch; i++) { - AclSetTensorAddr(executor, 0, ta, (char *)(a) + i * desc->info->a_matrix.stride * desc->dtype.size); - AclSetTensorAddr(executor, 1, tb, (char *)(b) + i * desc->info->b_matrix.stride * desc->dtype.size); - AclSetTensorAddr(executor, 2, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size); - AclSetTensorAddr(executor, 3, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); aclnnStatus ret = aclnnGemm(workspace, - workspaceSize, - executor, - stream); + workspaceSize, + executor, + stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnGemm failed. 
ERROR: %d\n", ret); return STATUS_EXECUTION_FAILED); diff --git a/src/ops/mlp/operator.cc b/src/ops/mlp/operator.cc index 1186a8dc..3cf7ab5d 100644 --- a/src/ops/mlp/operator.cc +++ b/src/ops/mlp/operator.cc @@ -26,7 +26,7 @@ __C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handl infiniopTensorDescriptor_t w12_desc, infiniopTensorDescriptor_t w3_desc, float alpha, - bool residual) { + char residual) { if (y_desc->ndim != 2 || x_desc->ndim != 2 || w12_desc->ndim != 2 || w3_desc->ndim != 2) { return STATUS_BAD_TENSOR_SHAPE; } diff --git a/src/ops/pooling/cpu/pooling_cpu.cc b/src/ops/pooling/cpu/pooling_cpu.cc index 6f411303..f5bd04d1 100644 --- a/src/ops/pooling/cpu/pooling_cpu.cc +++ b/src/ops/pooling/cpu/pooling_cpu.cc @@ -191,7 +191,8 @@ void _pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspa Ydata *y, Xdata const *x) { if (desc->padded_x_size > 0) { auto padded_x = reinterpret_cast(workspace); - uint64_t padded_shape[desc->ndim]; + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); std::fill(padded_x, padded_x + desc->padded_x_size, 0); getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); diff --git a/src/ops/pooling/cuda/pooling.cc b/src/ops/pooling/cuda/pooling.cc index 35f2f791..0cf45d64 100644 --- a/src/ops/pooling/cuda/pooling.cc +++ b/src/ops/pooling/cuda/pooling.cc @@ -91,16 +91,13 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, beta, }; } else { - int x_shape[ndim]; - int x_strides[ndim]; - int y_shape[ndim]; - int y_strides[ndim]; - int k_shape[ndim - 2]; - int pads_int[ndim - 2]; - int strides_int[ndim - 2]; - const auto kernel_ = reinterpret_cast(kernel_shape); - const auto pads_ = reinterpret_cast(pads); - const auto strides_ = reinterpret_cast(strides); + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector 
y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads_int(ndim - 2); + std::vector strides_int(ndim - 2); #pragma omp parallel for for (size_t i = 0; i < ndim; ++i) { @@ -109,9 +106,9 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, y_shape[i] = static_cast(y->shape[i]); y_strides[i] = static_cast(y->strides[i]); if (i < ndim - 2) { - k_shape[i] = static_cast(kernel_[i]); - pads_int[i] = static_cast(pads_[i]); - strides_int[i] = static_cast(strides_[i]); + k_shape[i] = static_cast(kernel_shape[i]); + pads_int[i] = static_cast(pads[i]); + strides_int[i] = static_cast(strides[i]); } } @@ -121,7 +118,7 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, // create and set tensor descriptors for x cudnnTensorDescriptor_t x_desc; checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); - checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape, x_strides)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); // Create and set pooling descriptor for average pooling cudnnPoolingDescriptor_t pool_desc; @@ -130,14 +127,14 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, getPoolingMode(pooling_type), CUDNN_NOT_PROPAGATE_NAN, ndim - 2, - k_shape, - pads_int, - strides_int)); + k_shape.data(), + pads_int.data(), + strides_int.data())); // create and set tensor descriptors for y cudnnTensorDescriptor_t y_desc; checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); - checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape)); - checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape, y_strides)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); *desc_ptr = new PoolingCudaDescriptor{ DevNvGpu, 
diff --git a/src/ops/pooling/cuda/pooling.cuh b/src/ops/pooling/cuda/pooling.cuh index ab26d280..dd080e1e 100644 --- a/src/ops/pooling/cuda/pooling.cuh +++ b/src/ops/pooling/cuda/pooling.cuh @@ -3,6 +3,7 @@ #include "../../../devices/cuda/cuda_handle.h" #include "operators.h" +#include struct PoolingCudaDescriptor { Device device; diff --git a/src/ops/pooling/musa/pooling_musa.cc b/src/ops/pooling/musa/pooling_musa.cc index ccb21b96..d7e1121c 100644 --- a/src/ops/pooling/musa/pooling_musa.cc +++ b/src/ops/pooling/musa/pooling_musa.cc @@ -32,44 +32,73 @@ infiniopStatus_t musaCreatePoolingDescriptor(MusaHandle_t handle, return STATUS_BAD_TENSOR_DTYPE; } - float alpha = 1.0f, beta = 0.0f; + const uint64_t new_ndim = ndim; - const auto kernel_ = reinterpret_cast(kernel_shape); - const auto pads_ = reinterpret_cast(pads); - const auto strides_ = reinterpret_cast(strides); + int64_t *x_shape = new int64_t[new_ndim]; + int64_t *y_shape = new int64_t[new_ndim]; + for(size_t i = 0; i < new_ndim; ++i){ + x_shape[i] = static_cast(x->shape[i]); + y_shape[i] = static_cast(y->shape[i]); - const auto x_shape = reinterpret_cast(x->shape); - const auto x_strides = reinterpret_cast(x->strides); - const auto y_shape = reinterpret_cast(y->shape); - const auto y_strides = reinterpret_cast(y->strides); + } musa::dnn::Tensor *x_tensor = new musa::dnn::Tensor(); musa::dnn::Tensor *y_tensor = new musa::dnn::Tensor(); - musa::dnn::Tensor *indices_tensor = new musa::dnn::Tensor(); + musa::dnn::Tensor *indices = new musa::dnn::Tensor(); + + x_tensor->SetNdInfo((int)new_ndim, x_shape); + y_tensor->SetNdInfo((int)new_ndim, y_shape); + indices->SetNdInfo((int)new_ndim, y_shape); if (y->dt == F16) { x_tensor->SetType(musa::dnn::Tensor::Type::HALF); y_tensor->SetType(musa::dnn::Tensor::Type::HALF); + indices->SetType(musa::dnn::Tensor::Type::HALF); } else if (y->dt == F32) { x_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); y_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); + 
indices->SetType(musa::dnn::Tensor::Type::FLOAT); } - x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); - y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + if (new_ndim == 5) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCDHW); + } + else if (new_ndim == 4) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCHW); + } + else if (new_ndim == 3) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + indices->SetFormat(musa::dnn::Tensor::Format::NCW); + } + else { + return STATUS_BAD_TENSOR_SHAPE; + } - x_tensor->SetNdInfo((int) ndim, x_shape, x_strides); - y_tensor->SetNdInfo((int) ndim, y_shape, y_strides); + musa::dnn::Status status; + musa::dnn::Pooling* pooling_operator = new musa::dnn::Pooling(); - int *indice = new int[ndim]; + status = pooling_operator->SetMode(getPoolingMode(pooling_type)); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetMode status:%d\n", static_cast(status)); + // } - musa::dnn::Pooling* pooling_operator = new musa::dnn::Pooling(); - pooling_operator->SetMode(getPoolingMode(pooling_type)); + std::initializer_list kernel = {static_cast(kernel_shape[0]), static_cast(kernel_shape[1])}; + std::initializer_list pad = {static_cast(pads[0]), static_cast(pads[1])}; + std::initializer_list stride = {static_cast(strides[0]), static_cast(strides[1])}; + std::initializer_list dilationList = {1, 1}; - int* dilation_ = new int[n]; - std::fill(dilation_, dilation_+((int) n), 1); + status = pooling_operator->SetNdInfo(kernel, pad, stride, dilationList); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetNdInfo status:%d\n", static_cast(status)); + // } - pooling_operator->SetNdInfo((int) n, kernel_, pads_, 
strides_, (const int*) dilation_); + const float alpha = 1.0f; + const float beta = 0.0f; *desc_ptr = new PoolingMusaDescriptor{ DevMtGpu, @@ -78,11 +107,14 @@ infiniopStatus_t musaCreatePoolingDescriptor(MusaHandle_t handle, handle->mudnn_handles_t, x_tensor, y_tensor, - indices_tensor, + indices, pooling_operator, alpha, beta, }; + + delete[] x_shape; + delete[] y_shape; return STATUS_SUCCESS; } diff --git a/src/ops/pooling/musa/pooling_musa.h b/src/ops/pooling/musa/pooling_musa.h index 1b8daf11..73339b87 100644 --- a/src/ops/pooling/musa/pooling_musa.h +++ b/src/ops/pooling/musa/pooling_musa.h @@ -46,6 +46,7 @@ inline musa::dnn::Pooling::Mode getPoolingMode(int pooling_type) { return musa::dnn::Pooling::Mode::MAXPOOL; case 1: return musa::dnn::Pooling::Mode::AVGPOOL_COUNT_PAD; + // return musa::dnn::Pooling::Mode::AVGPOOL_COUNT_WITHOUT_PAD; default: return musa::dnn::Pooling::Mode::MAXPOOL; } diff --git a/src/ops/pooling/musa/pooling_musa.mu b/src/ops/pooling/musa/pooling_musa.mu index ece291ad..bb8081ca 100644 --- a/src/ops/pooling/musa/pooling_musa.mu +++ b/src/ops/pooling/musa/pooling_musa.mu @@ -12,6 +12,7 @@ infiniopStatus_t pooling_mt_gpu(PoolingMusaDescriptor_t desc, void *y, void cons desc->pool_operator->Run(*handle, *(desc->y_tensor), *(desc->x_tensor), *(desc->indices_tensor)); }); + printf("[SUCCESS to execute pooling_mt_gpu]\n"); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/ascend/random_sample.cc b/src/ops/random_sample/ascend/random_sample.cc new file mode 100644 index 00000000..b16159dc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.cc @@ -0,0 +1,153 @@ +#include "random_sample.h" + +RandomSampleAscendDescriptor::RandomSampleAscendDescriptor(Device _device) { + device = _device; + device_id = 0; + pDesc = new aclnnTensorDescriptor(); + topkIdxDesc = new aclnnTensorDescriptor(); + topkValDesc = new aclnnTensorDescriptor(); + resDesc = new aclnnTensorDescriptor(); +} + +infiniopStatus_t 
ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + if (result->ndim != 1 && result->shape[0] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr) = new RandomSampleAscendDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), STATUS_SUCCESS); + CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), STATUS_SUCCESS); + // Ascend aclnnTopk doesn't support U64 type + (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size) { + auto &pDesc = desc->pDesc; + *size = numElements(pDesc->shape.data(), pDesc->ndim) * aclDataTypeSize(pDesc->dataType) + + numElements(pDesc->shape.data(), pDesc->ndim) * sizeof(I64); + + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (topk <= 0 || topp < 0 || topp > 1.0) { + return STATUS_BAD_PARAM; + } + + if (random_val < 0 || random_val > 1.0) { + return STATUS_BAD_PARAM; + } + + auto &pDesc = desc->pDesc; + auto &topkIdxDesc = desc->topkIdxDesc; + auto &topkValDesc = desc->topkValDesc; + auto ndim = static_cast(pDesc->ndim); + auto voc = pDesc->shape[0]; + auto topk_ = topk <= voc ? topk : voc; + bool doSample = topk_ > 1 && temperature != 0 && topp != 0; + + auto topkShape = std::vector(pDesc->shape); + topkShape[ndim - 1] = doSample ? 
topk_ : 1; + + auto topkStrides = std::vector(pDesc->strides); + // Infer contiguous strides + topkStrides[ndim - 1] = 1; + for (int64_t i = ndim - 2; i >= 0; --i) { + topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1]; + } + + CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), STATUS_SUCCESS); + + // Infer data ptr + auto workspaceTmp = workspace; + auto topkValAddr = workspaceTmp; + workspaceTmp = (void *) ((uint8_t *) workspace + + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType)); + auto topkIdxAddr = workspaceTmp; + auto pAddr = (void *) probs; + + // Create aclTensor + CHECK_STATUS(pDesc->createTensor(pAddr), STATUS_SUCCESS); + CHECK_STATUS(topkValDesc->createTensor(topkValAddr), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), STATUS_SUCCESS); + if (!doSample) { + CHECK_STATUS(desc->resDesc->createTensor(result), STATUS_SUCCESS); + } + + // Do Topk calculate + uint64_t topkWorkspaceSize = 0; + aclOpExecutor *topkExecutor = nullptr; + auto ret = aclnnTopkGetWorkspaceSize(pDesc->t, + topkShape[ndim - 1], + ndim - 1, + true, + true, + topkValDesc->t, + doSample ? 
topkIdxDesc->t + : desc->resDesc->t, + &topkWorkspaceSize, + &topkExecutor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + void *topkWorkspace; + CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), STATUS_SUCCESS); + ret = aclnnTopk(topkWorkspace, + topkWorkspaceSize, + topkExecutor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + CHECK_STATUS(freeWorkspace(topkWorkspace), STATUS_SUCCESS); + + if (doSample) { + // Do softmax and topp random sample + CHECK_STATUS(random_sample_do( + pAddr, + result, + topkValAddr, + topkIdxAddr, + topk, + static_cast(pDesc->shape[0]), + topp, + temperature, + random_val, + pDesc->dataType, + stream), + STATUS_SUCCESS); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc) { + delete desc->pDesc; + delete desc->topkIdxDesc; + delete desc->topkValDesc; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/ascend/random_sample.h b/src/ops/random_sample/ascend/random_sample.h new file mode 100644 index 00000000..1ecc16fc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.h @@ -0,0 +1,52 @@ +#ifndef __ASCEND_RANDOM_SAMPLE_H__ +#define __ASCEND_RANDOM_SAMPLE_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include +#include +#include +#include + + +struct RandomSampleAscendDescriptor { + Device device; + int device_id; + aclnnTensorDescriptor_t pDesc; + aclnnTensorDescriptor_t topkValDesc; + aclnnTensorDescriptor_t topkIdxDesc; + aclnnTensorDescriptor_t resDesc; + RandomSampleAscendDescriptor(Device _device); +}; + +typedef struct RandomSampleAscendDescriptor *RandomSampleAscendDescriptor_t; + +infiniopStatus_t 
ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc); + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream); + +#endif diff --git a/src/ops/random_sample/ascend/random_sample_aclnn.cc b/src/ops/random_sample/ascend/random_sample_aclnn.cc deleted file mode 100644 index e888b2f9..00000000 --- a/src/ops/random_sample/ascend/random_sample_aclnn.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "random_sample_aclnn.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" - -RandomSampleAclnnDescriptor::RandomSampleAclnnDescriptor(Device _device) { - device = _device; - device_id = 0; - argMaxExecutor = nullptr; - pDesc = new aclnnTensorDescriptor(); - rDesc = new aclnnTensorDescriptor(); - random_val = 1.0; - topp = 0; - topk = 0; - temperature = 1.0; - argMaxWorkspaceSize = 0; -} - -infiniopStatus_t aclnnCreateRandomSampleDescriptor(AscendHandle_t handle, - RandomSampleAclnnDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t result, - infiniopTensorDescriptor_t probs) { - - (*desc_ptr) = new RandomSampleAclnnDescriptor(handle->device); - (*desc_ptr)->device_id = handle->device_id; - (*desc_ptr)->random_val = 0; - (*desc_ptr)->topp = 0; - (*desc_ptr)->topk = 0; - (*desc_ptr)->temperature = 1.0; - - auto &pDesc = (*desc_ptr)->pDesc; - auto &rDesc = 
(*desc_ptr)->rDesc; - - CHECK_STATUS(pDesc->fromInfiniOpTensorDescriptor(probs), STATUS_SUCCESS); - CHECK_STATUS(pDesc->createTensor(), STATUS_SUCCESS); - - result->dt = I64; - CHECK_STATUS(rDesc->fromInfiniOpTensorDescriptor(result), STATUS_SUCCESS); - CHECK_STATUS(rDesc->createTensor(), STATUS_SUCCESS); - - aclTensor *tp = pDesc->t; - aclTensor *tr = rDesc->t; - - aclnnStatus ret; - - // temp = prob / temperature - auto &argmaxWorkspaceSize = (*desc_ptr)->argMaxWorkspaceSize; - auto &argmaxExecutor = (*desc_ptr)->argMaxExecutor; - ret = aclnnArgMaxGetWorkspaceSize(tp, - 0, - true, - tr, - &argmaxWorkspaceSize, - &argmaxExecutor); - CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclnnArgMaxGetWorkspaceSize failed, ERROR: %d\n", ret); - return STATUS_EXECUTION_FAILED); - aclSetAclOpExecutorRepeatable(argmaxExecutor); - return STATUS_SUCCESS; -} - -infiniopStatus_t aclnnGetRandomSampleWorkspaceSize(RandomSampleAclnnDescriptor_t desc, uint64_t *size) { - *size = desc->argMaxWorkspaceSize; - return STATUS_SUCCESS; -} - -infiniopStatus_t aclnnRandomSample(RandomSampleAclnnDescriptor_t desc, - void *workspace, - uint64_t workspace_size, - void *result, - void const *probs, - float random_val, - float topp, - int topk, - float temperature, - void *stream) { - auto &pDesc = desc->pDesc; - auto &rDesc = desc->rDesc; - - aclTensor *tp = pDesc->t; - aclTensor *tr = rDesc->t; - - aclrtSetDevice(desc->device_id); - - auto &argmaxWorkspaceSize = desc->argMaxWorkspaceSize; - auto &argmaxExecutor = desc->argMaxExecutor; - - AclSetTensorAddr(argmaxExecutor, 0, tp, (void *) probs); - AclSetTensorAddr(argmaxExecutor, 1, tr, (void *) result); - auto ret = aclnnArgMax(workspace, - argmaxWorkspaceSize, - argmaxExecutor, - stream); - CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclnnArgMax failed. 
ERROR: %d\n", ret); - return STATUS_EXECUTION_FAILED); - return STATUS_SUCCESS; -} - - -infiniopStatus_t aclnnDestroyRandomSampleDescriptor(RandomSampleAclnnDescriptor_t desc) { - delete desc->pDesc; - delete desc->rDesc; - aclDestroyAclOpExecutor(desc->argMaxExecutor); - delete desc; - - return STATUS_SUCCESS; -} diff --git a/src/ops/random_sample/ascend/random_sample_aclnn.h b/src/ops/random_sample/ascend/random_sample_aclnn.h deleted file mode 100644 index 8848cb99..00000000 --- a/src/ops/random_sample/ascend/random_sample_aclnn.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __ASCEND_RANDOM_SAMPLE_H__ -#define __ASCEND_RANDOM_SAMPLE_H__ - -#include "../../../devices/ascend/ascend_handle.h" -#include "../../../devices/ascend/tensor_aclnn.h" -#include "operators.h" -#include -#include -#include -#include - - -struct RandomSampleAclnnDescriptor { - Device device; - int device_id; - aclOpExecutor *argMaxExecutor; - aclnnTensorDescriptor_t pDesc; - aclnnTensorDescriptor_t rDesc; - float random_val; - float topp; - int topk; - float temperature; - uint64_t argMaxWorkspaceSize; - RandomSampleAclnnDescriptor(Device _device); -}; - -typedef struct RandomSampleAclnnDescriptor *RandomSampleAclnnDescriptor_t; - -infiniopStatus_t aclnnCreateRandomSampleDescriptor(AscendHandle_t handle, - RandomSampleAclnnDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t result, - infiniopTensorDescriptor_t probs); - -infiniopStatus_t aclnnGetRandomSampleWorkspaceSize(RandomSampleAclnnDescriptor_t desc, - uint64_t *size); - -infiniopStatus_t aclnnRandomSample(RandomSampleAclnnDescriptor_t desc, - void *workspace, - uint64_t workspace_size, - void *result, - void const *probs, - float random_val, - float topp, - int topk, - float temperature, - void *stream); - -infiniopStatus_t aclnnDestroyRandomSampleDescriptor(RandomSampleAclnnDescriptor_t desc); - - -#endif diff --git a/src/ops/random_sample/ascend/random_sample_kernel.cpp b/src/ops/random_sample/ascend/random_sample_kernel.cpp new file mode 
100644 index 00000000..18b482bc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample_kernel.cpp @@ -0,0 +1,232 @@ +#include "../../../../include/status.h" +#include "kernel_operator.h" + +using namespace AscendC; + +template +class KernelRandomSample { +public: + __aicore__ inline KernelRandomSample() {} + __aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + + topk = topk_; + voc = voc_; + topp = topp_; + temperature = temper_; + random = random_; + blockSize = 256 * 2; + + // CumSumInfo + if (sizeof(T) == sizeof(float)) { + topkAligned = (topk + 7) / 8 * 8; + vocAligned = (voc + 7) / 8 * 8; + } else { + topkAligned = (topk + 15) / 16 * 16; + vocAligned = (voc + 15) / 16 * 16; + } + topkIdxAligned = (topk + 3) / 4 * 4; + + // Set Gm + pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc); + topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk); + topkIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topkIdxAddr), topk); + resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1); + + // Global input and output + pipe.InitBuffer(pQue, 1, vocAligned * sizeof(T)); + pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T)); + pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t)); + pipe.InitBuffer(resQue, 1, 32);// 32 bytes for aligned + + pipe.InitBuffer(softMaxBuf1, blockSize); + pipe.InitBuffer(softMaxBuf2, blockSize); + pipe.InitBuffer(softMaxBuf3, blockSize); + pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T)); + + pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + // Softmax + __aicore__ inline void SoftMax(LocalTensor &valIn, + LocalTensor &topkValIn, + LocalTensor &softMaxOut) { + int32_t repeatTimes = vocAligned * sizeof(T) / blockSize; + int32_t remainder = vocAligned * sizeof(T) % 
blockSize / sizeof(T); + int32_t tileLength = blockSize / sizeof(T); + float negMax = -static_cast(topkValIn(0)); + float invTemperature = 1.0f / temperature; + float sum = 0.f; + float sum_s = 0.f; + LocalTensor tmpBuffer = softMaxBuf1.Get(); + LocalTensor tmpBuffer2 = softMaxBuf2.Get(); + LocalTensor tmpBuffer3 = softMaxBuf3.Get(); + for (int32_t i = 0; i < repeatTimes; i++) { + Adds(tmpBuffer, valIn[i * tileLength], static_cast(negMax), tileLength); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), tileLength); + Exp(tmpBuffer3, tmpBuffer2, tileLength); + sum_s = 0.f; + for (int j = 0; j < tileLength; ++j) { + sum_s += static_cast(tmpBuffer3(j)); + } + sum += sum_s; + } + if (remainder != 0) { + Adds(tmpBuffer, valIn[repeatTimes * tileLength], static_cast(negMax), remainder); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), remainder); + Exp(tmpBuffer3, tmpBuffer2, remainder); + sum_s = 0.f; + for (int i = 0; i < remainder; ++i) { + sum_s += static_cast(tmpBuffer3(i)); + } + sum += sum_s; + } + float invSum = 1.0f / sum; + Adds(tmpBuffer, topkValIn, static_cast(negMax), topk); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), topk); + Exp(tmpBuffer3, tmpBuffer2, topk); + Muls(softMaxOut, tmpBuffer3, static_cast(invSum), topk); + } + + // Cumsum + __aicore__ inline void InclusiveSum(LocalTensor &topkValIn, + LocalTensor &topkValOut) { + static constexpr CumSumConfig cumSumConfig{true, false, false}; + LocalTensor lastRowLocal; + CumSum(topkValOut, lastRowLocal, topkValIn, + {1, static_cast(topkAligned)}); + } + + // Random sample + __aicore__ inline void RandomSample(LocalTensor &valIn, + LocalTensor &Index, + LocalTensor &result) { + int end = 0; + for (end = 0; end < topk; end++) { + if (static_cast(valIn(end)) >= topp) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + auto randomVal = random * static_cast(valIn(end - 1)); + for (int i = 0; i < end; i++) { + if (randomVal < static_cast(valIn(i))) 
{ + result(0) = Index(i); + return; + } + } + result(0) = Index(end - 1); + } + + __aicore__ inline void CopyIn() { + LocalTensor pLocal = pQue.AllocTensor(); + LocalTensor topkValLocal = topkQue.AllocTensor(); + LocalTensor topkIdxLocal = topkIdxQue.AllocTensor(); + + DataCopy(pLocal, pGm, vocAligned); + DataCopy(topkValLocal, topkGm, topkAligned); + DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned); + + pQue.EnQue(pLocal); + topkQue.EnQue(topkValLocal); + topkIdxQue.EnQue(topkIdxLocal); + } + + __aicore__ inline void Compute() { + // Get input data + LocalTensor pLocal = pQue.DeQue(); + LocalTensor topkValLocal = topkQue.DeQue(); + + // SoftMax + LocalTensor softMaxOutLocal = softMaxOutBuf.Get(); + SoftMax(pLocal, topkValLocal, softMaxOutLocal); + + // InclusiveSum + LocalTensor inclusiveOutLocal = inclusiveSumOutBuf.Get(); + InclusiveSum(softMaxOutLocal, inclusiveOutLocal); + + // randomSample + LocalTensor topkIdxLocal = topkIdxQue.DeQue(); + LocalTensor resultLocal = resQue.AllocTensor(); + RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal); + + pQue.FreeTensor(pLocal); + topkQue.FreeTensor(topkValLocal); + topkIdxQue.FreeTensor(topkIdxLocal); + resQue.EnQue(resultLocal); + } + __aicore__ inline void CopyOut() { + LocalTensor resLocal = resQue.DeQue(); + DataCopy(resGm, resLocal, 32 / sizeof(int64_t)); + resQue.FreeTensor(resLocal); + } + +private: + GlobalTensor pGm; + GlobalTensor topkGm; + GlobalTensor topkIdxGm; + GlobalTensor resGm; + + TPipe pipe; + + TQue pQue; + TQue topkQue; + TQue topkIdxQue; + TQue resQue; + + TBuf softMaxBuf1; + TBuf softMaxBuf2; + TBuf softMaxBuf3; + TBuf softMaxOutBuf; + + TBuf inclusiveSumOutBuf; + + // Kernel params + int32_t topk; + int32_t voc; + float topp; + float temperature; + float random; + + int32_t topkAligned; + int32_t topkIdxAligned; + int32_t vocAligned; + int32_t blockSize; +}; + +extern "C" __global__ __aicore__ void +random_sample_kernel_f16(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR 
topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + KernelRandomSample op; + op.Init(p, res, topkAddr, topkIdxAddr, topk_, voc_, topp_, temper_, random_); + op.Process(); +} + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream) { + + switch (dtype) { + case 0: + return STATUS_SUCCESS; + case 1: + random_sample_kernel_f16<<<1, nullptr, stream>>>( + p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/bang/random_sample_bang.cc b/src/ops/random_sample/bang/random_sample_bang.cc index b1c7180e..ed1945da 100644 --- a/src/ops/random_sample/bang/random_sample_bang.cc +++ b/src/ops/random_sample/bang/random_sample_bang.cc @@ -28,7 +28,7 @@ infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size) { *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)) + sizeof(desc->dtype); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/bang/random_sample_bang.h b/src/ops/random_sample/bang/random_sample_bang.h index 1bb0b7d5..de830fbf 100644 --- a/src/ops/random_sample/bang/random_sample_bang.h +++ b/src/ops/random_sample/bang/random_sample_bang.h @@ -20,11 +20,11 @@ infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, RandomSampleBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); -infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, unsigned long int *size); +infiniopStatus_t 
bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size); infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *result, void const *probs, float random_val, diff --git a/src/ops/random_sample/bang/random_sample_bang.mlu b/src/ops/random_sample/bang/random_sample_bang.mlu index 5b6a0751..5fa66150 100644 --- a/src/ops/random_sample/bang/random_sample_bang.mlu +++ b/src/ops/random_sample/bang/random_sample_bang.mlu @@ -24,9 +24,9 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + taskDim * topk) * sizeof(T); uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum],必须要求maxNum >= max{step, topk} uint64_t *indGlobal = srcInd + maxNum;//[taskDim * topk] - + __sync_all(); - + T *src = (T *)nram_buffer;//[maxNum],必须要求maxNum >= max{step, topk} T *destSum = src + maxNum;//[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] @@ -36,14 +36,14 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t __bang_write_zero(destSumFinal, wSize); __memcpy(srcInd, indGdram, voc * sizeof(uint64_t), GDRAM2NRAM); - + if(step){ for(int i = 0; i < step; i++){ srcInd[i] = indStart + i; } __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); if(step >= topk){ - for(int i = 0; i < topk; i++){ + for(int i = 0; i < topk; i++){ for(int j = i + 1; j < step; j++){ if(src[i] < src[j]){ T tmp = src[i]; @@ -102,9 +102,9 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 for(int i = 0; i < strip ; i++){ __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); - } + } } - + __bang_reduce_sum(destSumFinal, destSum, wSize); } else{ @@ -116,27 +116,27 @@ __mlu_global__ void random_sampleX(T const *source, 
uint64_t *indices, uint64_t destSumFinal[0] = destSumFinal[0] - (maxNum - step);//把上面多加的(maxNum - step)减掉 } globalSum[0] = 0.0; - + __sync_all(); __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 - + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 - + if(taskId == 0){ __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 - + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); - + __bang_write_zero(destSum, 2 * topk); destSum[0] = srcGlobal[0]; for(int i = 1; i < topk; i++){ destSum[i] = destSum[i - 1] + srcGlobal[i]; } - + int end = 0; for(end = 0; end < topk; end++){ if(destSum[end] >= static_cast(topp)){ @@ -149,7 +149,7 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t else{ end = topk; } - + random_val *= destSum[end - 1]; for(int i = 0; i < end; i++){ if(random_val < destSum[i]){ @@ -164,7 +164,7 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t template __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t *indGdram, T *globalTopk, T *globalSum, float random_val, float topp, int topk, float temperature, int voc){ const int maxNum = SRC_MAX_SIZE/sizeof(T); - + int wSize = 128 / sizeof(T); int segNum = maxNum / wSize; @@ -178,7 +178,7 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); - + char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + 2 * topk + taskDim * topk) * sizeof(T); uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum] uint64_t *topkInd = srcInd + maxNum;//[2 * topk] @@ -196,7 +196,7 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t srcInd[j] = r * taskSize + taskId * maxNum + j; } __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - for(int i = 0; i < topk; i++){ + for(int i = 0; i < topk; i++){ for(int j = i + 1; j < maxNum; j++){ if(src[i] < src[j]){ T tmp = src[i]; @@ -230,17 +230,17 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t } } } - - + + } - + if(step){ for(int j = 0; j < step; j++){ srcInd[j] = repeat * taskSize + indStart + j; } __memcpy(src, source + repeat * taskSize + indStart, step * sizeof(T), GDRAM2NRAM); if(step >= topk){ - for(int i = 0; i < topk; i++){ + for(int i = 0; i < topk; i++){ for(int j = i + 1; j < step; j++){ if(src[i] < src[j]){ T tmp = src[i]; @@ -289,11 +289,11 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t } } } - + __memcpy(globalTopk + taskId * topk, srcTopk, topk * sizeof(T), NRAM2GDRAM); __memcpy(indGdram + taskId * topk, topkInd, topk * sizeof(uint64_t), NRAM2GDRAM); __sync_all(); - + if(taskId == 0){ __memcpy(srcGlobal, globalTopk, taskDim * topk * sizeof(T), GDRAM2NRAM); __memcpy(indGlobal, indGdram, taskDim * topk * sizeof(uint64_t), GDRAM2NRAM); @@ -337,44 +337,44 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 for(int i = 0; i < strip ; i++){ __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); - } + } } - + __bang_reduce_sum(destSumFinal, destSum, wSize); } - + else{ for(int i = 0; i < maxNum; i++){ - + 
destSumFinal[0] += destSum[i]; } - + } if(step){ destSumFinal[0] = destSumFinal[0] - (maxNum - step);//把上面多加的(maxNum - step)减掉 } globalSum[0] = 0.0; - + __sync_all(); __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 - + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 - + if(taskId == 0){ __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 - + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); - + __bang_write_zero(srcTopk, 2 * topk); srcTopk[0] = srcGlobal[0]; for(int i = 1; i < topk; i++){ srcTopk[i] = srcTopk[i - 1] + srcGlobal[i]; } - + int end = 0; for(end = 0; end < topk; end++){ if(srcTopk[end] >= static_cast(topp)){ @@ -387,7 +387,7 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t else{ end = topk; } - + random_val *= srcTopk[end - 1]; for(int i = 0; i < end; i++){ if(random_val < srcTopk[i]){ @@ -415,7 +415,7 @@ __mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t * T *src = (T *)nram_buffer; T *srcMax = src + maxNum; uint64_t index = 0; - + T newMax = -INFINITY; for(uint64_t r = 0; r < repeat; r++){ __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); @@ -424,7 +424,7 @@ __mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t * newMax = srcMax[0]; index = r * taskSize + taskId * maxNum + *((int64_t*)&srcMax[1]); } - + } if(step){ __bang_write_value(src, maxNum, -INFINITY); @@ -434,9 +434,9 @@ __mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t * newMax = srcMax[0]; index = indStart + *((int64_t*)&srcMax[1]); } - + } - + indGdram[taskId] = index; __sync_all(); if(taskId == 0){ @@ -462,7 +462,7 @@ void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, k_dim.y 
= 1; k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - + int taskNum = k_dim.x * k_dim.y * k_dim.z; if(topp > 0 && topk > 1){ const int maxNum = SRC_MAX_SIZE/sizeof(T); @@ -471,7 +471,7 @@ void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, uint64_t *indGdram = (uint64_t *)origin; T *globalTopk = (T *)indTmp; T *globalSum = globalTopk + taskNum * topk; - + if(voc >= taskNum * maxNum){ random_sampleD<<>>(logits_, index_, indGdram, globalTopk, globalSum, random_val, topp, topk, temperature, voc); } @@ -484,8 +484,8 @@ void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, random_sample<<>>(logits_, index_, indGdram, voc); } cnrtQueueSync(queue); - - + + } void random_sample_bang_f16(RandomSampleBangDescriptor_t desc, void *workspace, void *result, @@ -497,12 +497,12 @@ void random_sample_bang_f16(RandomSampleBangDescriptor_t desc, void *workspace, void *stream) { auto queue = reinterpret_cast(stream); int voc = desc->voc; - + random_sampleUnion(queue, workspace, probs, result, random_val, topp, topk, temperature, voc); } infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *result, void const *probs, float random_val, diff --git a/src/ops/random_sample/cpu/random_sample.cc b/src/ops/random_sample/cpu/random_sample.cc index 3706e1ea..28de5b93 100644 --- a/src/ops/random_sample/cpu/random_sample.cc +++ b/src/ops/random_sample/cpu/random_sample.cc @@ -31,7 +31,7 @@ infiniopStatus_t cpuCreateRandomSampleDescriptor(infiniopHandle_t, return STATUS_SUCCESS; } -infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, uint64_t *size) { *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/cuda/random_sample.cuh 
b/src/ops/random_sample/cuda/random_sample.cuh index 4230fabc..d3fff76d 100644 --- a/src/ops/random_sample/cuda/random_sample.cuh +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -19,7 +19,7 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); -infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, void *workspace, diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index a536ca19..022a113b 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -26,7 +26,7 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) { *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/operator.cc b/src/ops/random_sample/operator.cc index 9d3ef186..84b26ad0 100644 --- a/src/ops/random_sample/operator.cc +++ b/src/ops/random_sample/operator.cc @@ -12,7 +12,7 @@ #include "bang/random_sample_bang.h" #endif #ifdef ENABLE_ASCEND_NPU -#include "ascend/random_sample_aclnn.h" +#include "ascend/random_sample.h" #endif #ifdef ENABLE_MT_GPU #include "musa/random_sample_musa.h" @@ -37,8 +37,8 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnCreateRandomSampleDescriptor((AscendHandle_t) 
handle, - (RandomSampleAclnnDescriptor_t *) desc_ptr, result, probs); + return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle, + (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs); } #endif #ifdef ENABLE_MT_GPU @@ -69,7 +69,7 @@ __C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDe #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnGetRandomSampleWorkspaceSize((RandomSampleAclnnDescriptor_t) desc, size); + return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size); } #endif #ifdef ENABLE_MT_GPU @@ -107,7 +107,7 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnRandomSample((RandomSampleAclnnDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); } #endif #ifdef ENABLE_MT_GPU @@ -135,7 +135,7 @@ __C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleD #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnDestroyRandomSampleDescriptor((RandomSampleAclnnDescriptor_t) desc); + return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc); } #endif #ifdef ENABLE_MT_GPU diff --git a/src/ops/rearrange/ascend/rearrange_aclnn.cc b/src/ops/rearrange/ascend/rearrange_aclnn.cc index 4eead4a9..f1db82cd 100644 --- a/src/ops/rearrange/ascend/rearrange_aclnn.cc +++ b/src/ops/rearrange/ascend/rearrange_aclnn.cc @@ -56,24 +56,24 @@ infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, /// TODO: something is wrong with aclSetTensorAddr, do all the preparation here for now desc->dstDesc->t = aclCreateTensor(desc->dstDesc->shape.data(), - desc->dstDesc->ndim, - desc->dstDesc->dataType, - desc->dstDesc->strides.data(), - 
desc->dstDesc->offset, - desc->dstDesc->format, - desc->dstDesc->storageShape.data(), - desc->dstDesc->storageNdim, - dst); + desc->dstDesc->ndim, + desc->dstDesc->dataType, + desc->dstDesc->strides.data(), + desc->dstDesc->offset, + desc->dstDesc->format, + desc->dstDesc->storageShape.data(), + desc->dstDesc->storageNdim, + dst); desc->srcDesc->t = aclCreateTensor(desc->srcDesc->shape.data(), - desc->srcDesc->ndim, - desc->srcDesc->dataType, - desc->srcDesc->strides.data(), - desc->srcDesc->offset, - desc->srcDesc->format, - desc->srcDesc->storageShape.data(), - desc->srcDesc->storageNdim, - (void*)src); - + desc->srcDesc->ndim, + desc->srcDesc->dataType, + desc->srcDesc->strides.data(), + desc->srcDesc->offset, + desc->srcDesc->format, + desc->srcDesc->storageShape.data(), + desc->srcDesc->storageNdim, + (void *) src); + aclTensor *td = desc->dstDesc->t; aclTensor *ts = desc->srcDesc->t; aclOpExecutor *executor; @@ -82,7 +82,7 @@ infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, ts, &workspaceSize, &executor); - desc->workspaceAddr = mallocWorkspace(workspaceSize); + CHECK_STATUS(mallocWorkspace(&(desc->workspaceAddr), workspaceSize), STATUS_SUCCESS); // AclSetTensorAddr(executor, 0, td, dst); @@ -97,7 +97,7 @@ infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, desc->dstDesc->destroyTensor(); desc->srcDesc->destroyTensor(); - freeWorkspace(desc->workspaceAddr); + CHECK_STATUS(freeWorkspace(desc->workspaceAddr), STATUS_SUCCESS); return STATUS_SUCCESS; } diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cc b/src/ops/rearrange/cpu/rearrange_cpu.cc index 9dad108d..a5540727 100644 --- a/src/ops/rearrange/cpu/rearrange_cpu.cc +++ b/src/ops/rearrange/cpu/rearrange_cpu.cc @@ -11,41 +11,52 @@ infiniopStatus_t cpuCreateRearrangeDescriptor(infiniopHandle_t, if (!dtype_eq(dst->dt, src->dt)) { return STATUS_BAD_TENSOR_DTYPE; } - if (dst->ndim != src->ndim || dst->ndim < 2) { + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { return 
STATUS_BAD_TENSOR_SHAPE; } - std::vector shape; - std::vector strides_dst, strides_src; - auto ndim = dst->ndim; for (int i = 0; i < ndim; ++i) { if (dst->shape[i] != src->shape[i]) { return STATUS_BAD_TENSOR_SHAPE; } - shape.push_back(dst->shape[i]); - strides_dst.push_back(dst->strides[i]); - strides_src.push_back(src->strides[i]); } if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { return STATUS_BAD_TENSOR_STRIDES; } + + std::vector + shape(dst->shape, dst->shape + ndim); + std::vector + strides_dst(dst->strides, dst->strides + ndim), + strides_src(src->strides, src->strides + ndim); + unsigned int r = 0; - if (ndim == 2) { - r = dst->shape[0]; - } else if (ndim == 3) { - r = dst->shape[0] * dst->shape[1]; - } else { - for (int i = ndim - 3; i >= 1; --i) { - if (dst->shape[i] * dst->strides[i] != dst->strides[i - 1] || src->shape[i] * src->strides[i] != src->strides[i - 1]) { - return STATUS_BAD_TENSOR_STRIDES; + switch (ndim) { + case 1: + ndim = 2; + strides_dst.insert(strides_dst.begin(), shape[0]); + strides_src.insert(strides_src.begin(), shape[0]); + shape.insert(shape.begin(), 1); + case 2: + r = shape[0]; + break; + case 3: + r = shape[0] * shape[1]; + break; + default: + for (int i = ndim - 3; i >= 1; --i) { + if (shape[i] * strides_dst[i] != strides_dst[i - 1] || shape[i] * strides_src[i] != strides_src[i - 1]) { + return STATUS_BAD_TENSOR_STRIDES; + } } - } - r = std::accumulate(dst->shape, dst->shape + ndim - 1, 1, std::multiplies()); + r = std::accumulate(shape.begin(), shape.end() - 1, 1, std::multiplies{}); + break; } *desc_ptr = new RearrangeCpuDescriptor{ DevCpu, dst->dt, r, - ndim, shape, strides_dst, strides_src, @@ -70,11 +81,12 @@ inline int indices(uint64_t i, uint64_t ndim, std::vector strides, std: void reform_cpu(RearrangeCpuDescriptor_t desc, void *dst, void const *src) { auto dst_ptr = reinterpret_cast(dst); auto src_ptr = reinterpret_cast(src); - int bytes_size = desc->shape[desc->ndim - 1] * desc->dt.size; + auto ndim 
= desc->shape.size(); + int bytes_size = desc->shape[ndim - 1] * desc->dt.size; #pragma omp parallel for for (uint64_t i = 0; i < desc->r; ++i) { - auto dst_offset = indices(i, desc->ndim, desc->strides_dst, desc->shape); - auto src_offset = indices(i, desc->ndim, desc->strides_src, desc->shape); + auto dst_offset = indices(i, ndim, desc->strides_dst, desc->shape); + auto src_offset = indices(i, ndim, desc->strides_src, desc->shape); std::memcpy(dst_ptr + dst_offset * desc->dt.size, src_ptr + src_offset * desc->dt.size, bytes_size); } } diff --git a/src/ops/rearrange/cpu/rearrange_cpu.h b/src/ops/rearrange/cpu/rearrange_cpu.h index f75fe549..99cc62e6 100644 --- a/src/ops/rearrange/cpu/rearrange_cpu.h +++ b/src/ops/rearrange/cpu/rearrange_cpu.h @@ -7,7 +7,6 @@ struct RearrangeCpuDescriptor { Device device; DataLayout dt; uint64_t r; - uint64_t ndim; std::vector shape; std::vector strides_dst; std::vector strides_src; diff --git a/src/ops/rearrange/cuda/rearrange.cc b/src/ops/rearrange/cuda/rearrange.cc index 96e8a890..da23489b 100644 --- a/src/ops/rearrange/cuda/rearrange.cc +++ b/src/ops/rearrange/cuda/rearrange.cc @@ -7,13 +7,15 @@ infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, RearrangeCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) { - if (!dtype_eq(dst->dt, src->dt)) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { return STATUS_BAD_TENSOR_DTYPE; } - if (dst->ndim != src->ndim || dst->ndim < 2) { + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { return STATUS_BAD_TENSOR_SHAPE; } - auto ndim = dst->ndim; for (int i = 0; i < ndim; ++i) { if (dst->shape[i] != src->shape[i]) { return STATUS_BAD_TENSOR_SHAPE; @@ -22,52 +24,44 @@ infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { return STATUS_BAD_TENSOR_STRIDES; } - unsigned int r = 0, c = 0, b = 0; - unsigned int rsa = 0, csa = 0, rsb = 0, csb = 
0; - if (ndim == 2) { - c = dst->shape[0]; - b = dst->shape[1]; - csa = dst->strides[0]; - csb = src->strides[0]; - } else if (ndim == 3) { - r = dst->shape[0]; - c = dst->shape[1]; - b = dst->shape[2]; - csa = dst->strides[1]; - csb = src->strides[1]; - rsa = dst->strides[0]; - rsb = src->strides[0]; - } else { - for (int i = ndim - 3; i >= 1; --i) { - if (dst->shape[i] * dst->strides[i] != dst->strides[i - 1] || src->shape[i] * src->strides[i] != src->strides[i - 1]) { - return STATUS_BAD_TENSOR_STRIDES; - } - } - r = std::accumulate(dst->shape, dst->shape + ndim - 2, 1, std::multiplies()); - c = dst->shape[ndim - 2]; - b = dst->shape[ndim - 1]; - csa = dst->strides[ndim - 2]; - csb = src->strides[ndim - 2]; - rsa = dst->strides[ndim - 3]; - rsb = src->strides[ndim - 3]; - } - auto contiguous_bytes = b * dst->dt.size; - if (contiguous_bytes % WARP_SIZE != 0) { - return STATUS_BAD_PARAM; - } - auto bytes_per_thread = contiguous_bytes / WARP_SIZE; - if (bytes_per_thread <= 0 || bytes_per_thread > 32 || (bytes_per_thread & (bytes_per_thread - 1)) != 0) { - return STATUS_BAD_PARAM; + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; } - *desc_ptr = new RearrangeCudaDescriptor{ - handle->device, - handle->device_id, - rsa, - rsb, - csa, - csb, - r, c, b, - bytes_per_thread}; + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + 
(*desc_ptr)->src_cs *= dt.size; + return STATUS_SUCCESS; } infiniopStatus_t cudaDestroyRearrangeDescriptor(RearrangeCudaDescriptor_t desc) { diff --git a/src/ops/rearrange/cuda/rearrange.cu b/src/ops/rearrange/cuda/rearrange.cu index 68d3ddbf..04651f6b 100644 --- a/src/ops/rearrange/cuda/rearrange.cu +++ b/src/ops/rearrange/cuda/rearrange.cu @@ -4,11 +4,11 @@ template static __global__ void rearrange( void *__restrict__ dst, - unsigned int const rsa, - unsigned int const csa, + int const rsa, + int const csa, void const *__restrict__ src, - unsigned int const rsb, - unsigned int const csb, + int const rsb, + int const csb, unsigned int const ncols) { auto row = blockIdx.y, @@ -23,37 +23,44 @@ static __global__ void rearrange( reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; } - void rearrange_nv_gpu(RearrangeCudaDescriptor_t desc, void *y, void const *x, void *stream) { - unsigned long int rsa = desc->rsa, csa = desc->csa, rsb = desc->rsb, csb = desc->csb; - unsigned int r = desc->r, c = desc->c, b = desc->b, bytes_per_thread = desc->bytes_per_thread; - auto dst_ptr = static_cast(reinterpret_cast(y)); - rsa /= b; - csa /= b; - auto src_ptr = static_cast(reinterpret_cast(x)); - rsb /= b; - csb /= b; auto cuda_stream = reinterpret_cast(stream); - dim3 grid_dims = dim3((c + MAX_WARP_PER_BLOCK - 1) / MAX_WARP_PER_BLOCK, r); - dim3 block_dims = dim3(WARP_SIZE, (c + grid_dims.x - 1) / grid_dims.x); - switch (bytes_per_thread) { + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + cudaMemcpyAsync(y, x, unit, cudaMemcpyDeviceToDevice, cuda_stream); + return; + } + + auto warps = 1024 / WARP_SIZE; + auto grid = dim3((c + warps - 1) / warps, r); + auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { case 1: - rearrange<<>>(dst_ptr, 
rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 2: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 4: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 8: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 16: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 32: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; default: break; diff --git a/src/ops/rearrange/cuda/rearrange.cuh b/src/ops/rearrange/cuda/rearrange.cuh index 2b0da93e..f31f74b3 100644 --- a/src/ops/rearrange/cuda/rearrange.cuh +++ b/src/ops/rearrange/cuda/rearrange.cuh @@ -7,12 +7,8 @@ struct RearrangeCudaDescriptor { Device device; int device_id; - unsigned long int rsa; - unsigned long int rsb; - unsigned long int csa; - unsigned long int csb; - unsigned long int r, c, b; - unsigned long int bytes_per_thread; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; }; typedef struct RearrangeCudaDescriptor *RearrangeCudaDescriptor_t; diff --git a/src/ops/rms_norm/ascend/rms_norm_aclnn.cc b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc index e71f943a..d264be39 100644 --- a/src/ops/rms_norm/ascend/rms_norm_aclnn.cc +++ b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc @@ -62,7 +62,7 @@ infiniopStatus_t aclnnCreateRMSNormDescriptor(AscendHandle_t handle, for (int64_t i = xDesc->ndim - 2; i >= 0; --i) { rstd_strides[i] = rstd_strides[i + 1] * rstd_shape[i + 1]; } - CHECK_STATUS(rstdDesc->setDescriptor(F32, rstd_shape, rstd_strides), STATUS_SUCCESS); + CHECK_STATUS(rstdDesc->setDescriptor(toAclDataType(F32), rstd_shape, rstd_strides), STATUS_SUCCESS); if (wDesc->dataType != 
xDesc->dataType) { castDesc = new aclnnTensorDescriptor(); diff --git a/src/ops/rms_norm/bang/rms_norm_bang.cc b/src/ops/rms_norm/bang/rms_norm_bang.cc index 6d57d269..fbf7f689 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.cc +++ b/src/ops/rms_norm/bang/rms_norm_bang.cc @@ -16,8 +16,8 @@ infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, RMSNormBangDes return STATUS_BAD_TENSOR_SHAPE; } - unsigned long int stride_y = y_desc->strides[0]; - unsigned long int stride_x = x_desc->strides[0]; + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; auto w_datatype = w_desc->dt; *desc_ptr = new RMSNormBangDescriptor{ handle->device, @@ -33,7 +33,7 @@ infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, RMSNormBangDes return STATUS_SUCCESS; } -infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/rms_norm/bang/rms_norm_bang.h b/src/ops/rms_norm/bang/rms_norm_bang.h index 15210cd2..bfd94158 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.h +++ b/src/ops/rms_norm/bang/rms_norm_bang.h @@ -9,10 +9,10 @@ struct RMSNormBangDescriptor { Device device; int device_id; DT dtype; - unsigned long int n; - unsigned long int d; - unsigned long int stride_y; - unsigned long int stride_x; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; DT w_datatype; float epsilon; }; @@ -26,11 +26,11 @@ infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, infiniopTensorDescriptor_t w_desc, float epsilon); -infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, unsigned long int *size); +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size); infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t 
workspace_size, void *y, void const *x, void const *w, void *stream); diff --git a/src/ops/rms_norm/bang/rms_norm_bang.mlu b/src/ops/rms_norm/bang/rms_norm_bang.mlu index ac6c0d01..755e1e3c 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.mlu +++ b/src/ops/rms_norm/bang/rms_norm_bang.mlu @@ -18,7 +18,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, float const *weigh int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); if(dimsize >= maxNum){ - + char *nram_buffer1 = nram_buffer + (2 * maxNum + 3 * wSize) * sizeof(T); T *src = (T *)nram_buffer;//[maxNum] T *wet = src + maxNum;//[maxNum] @@ -43,7 +43,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, float const *weigh for(int s = 0; s < repeat; s++){ __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - + if(maxNum >= wSize){ for(int strip = segNum / 2; strip > 0; strip = strip / 2){ for(int j = 0; j < strip; j++){ @@ -111,7 +111,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, float const *weigh __bang_write_zero(srcTmp, wSize); float *wetTmp = (float *)nram_buffer1; - + int segNum = dimS / wSize; for(int i = indStart; i < indStart + step; i++){ @@ -159,9 +159,9 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - + if(dimsize >= maxNum){ - + T *src = (T *)nram_buffer;//[maxNum] T *wet = src + maxNum;//[maxNum] T *destSumFinal = wet + maxNum;//[wSize] @@ -184,7 +184,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i for(int s = 0; s < repeat; s++){ __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - + if(maxNum >= wSize){ for(int strip = segNum / 2; strip > 0; strip = strip / 2){ for(int j = 0; j < strip; j++){ @@ -241,14 +241,14 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i } } else{ - + T *src = (T *)nram_buffer;//[dimsize] T *wet = src + dimsize;//[dimsize] T *destSumFinal = wet + dimsize;//[wSize] T *destSum = destSumFinal + wSize;//[dimS] T *srcTmp = destSum + dimS;//[wSize] - + int segNum = dimS / wSize; for(int i = indStart; i < indStart + step; i++){ @@ -272,7 +272,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i else{ __memcpy(srcTmp, destSum, dimsize * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, srcTmp, wSize); - + } destSumFinal[0] /= dimsize; destSumFinal[0] += eps; @@ -309,7 +309,7 @@ void rms_normUnion(cnrtQueue_t queue, T *y, T const *x, Tw const *w, int stride_ } void rms_norm_bang_f16(RMSNormBangDescriptor_t desc, void *y, void const *x, void const *w, void *stream){ - auto queue = reinterpret_cast(stream); + auto queue = reinterpret_cast(stream); int n = static_cast(desc->n); int d = static_cast(desc->d); auto y_ = reinterpret_cast(y); @@ -328,11 +328,11 @@ void rms_norm_bang_f16(RMSNormBangDescriptor_t desc, void *y, void const *x, voi auto w_ = reinterpret_cast(w); rms_normUnion(queue, y_, x_, w_, stride_y, stride_x, epsilon, n, d); } - + } infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *y, 
void const *x, void const *w, void *stream){ if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { diff --git a/src/ops/rms_norm/bang/rms_norm_cnnl.cc b/src/ops/rms_norm/bang/rms_norm_cnnl.cc deleted file mode 100644 index 01e9aacd..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "rms_norm_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -RMSNormCnnlDescriptor::RMSNormCnnlDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); - - cnnlTensorDescriptor_t yDesc, xDesc, wDesc; - cnnlCreateTensorDescriptor(&yDesc); - cnnlCreateTensorDescriptor(&xDesc); - cnnlCreateTensorDescriptor(&wDesc); - setCnnlTensor(yDesc, y.layout); - setCnnlTensor(xDesc, x.layout); - setCnnlTensor(wDesc, w.layout); - - cnnlFuseNormDescriptor_t opDesc; - cnnlCreateFuseNormDescriptor(&opDesc); - cnnlSetFuseNormDescriptor(opDesc, epsilon, 1.0, true, - false, false, false, false, - CNNL_DTYPE_HALF, CNNL_TRANSFORMER_RMSNORM); - - void *workspace; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - size_t wsSize; - cnnlGetFuseNormWorkspaceSize(handle, opDesc, xDesc, &wsSize); - cnrtMalloc(&workspace, wsSize); - cnnlFuseNorm(handle, opDesc, xDesc, x.data, - wDesc, w.data, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace, wsSize, yDesc, y.data, nullptr, nullptr); - }); - - cnrtFree(workspace); - cnnlDestroyFuseNormDescriptor(opDesc); - cnnlDestroyTensorDescriptor(xDesc); - cnnlDestroyTensorDescriptor(yDesc); - cnnlDestroyTensorDescriptor(wDesc); -} diff --git 
a/src/ops/rms_norm/bang/rms_norm_cnnl.h b/src/ops/rms_norm/bang/rms_norm_cnnl.h deleted file mode 100644 index c76bf2d0..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __CNNL_RMS_NORM_H__ -#define __CNNL_RMS_NORM_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct RMSNormCnnlDescriptor { - Device device; - RMSNormCnnlDescriptor(Device device); -}; - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); - -#endif// __CNNL_RMS_NORM_H__ diff --git a/src/ops/rms_norm/cuda/rms_norm.cc b/src/ops/rms_norm/cuda/rms_norm.cc index a54b3616..92d34a99 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cc +++ b/src/ops/rms_norm/cuda/rms_norm.cc @@ -1,12 +1,12 @@ #include "rms_norm.cuh" -#include "../../utils.h" #include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t w_desc, - float epsilon) { + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { return STATUS_BAD_TENSOR_SHAPE; } @@ -18,8 +18,8 @@ infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDes return STATUS_BAD_TENSOR_SHAPE; } - unsigned long int stride_y = y_desc->strides[0]; - unsigned long int stride_x = x_desc->strides[0]; + int64_t stride_y = y_desc->strides[0]; + int64_t stride_x = x_desc->strides[0]; auto w_datatype = w_desc->dt; *desc_ptr = new RMSNormCudaDescriptor{ handle->device, @@ -35,7 +35,7 @@ infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDes return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t 
cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/rms_norm/cuda/rms_norm.cu b/src/ops/rms_norm/cuda/rms_norm.cu index aabbdc20..0dac45f0 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cu +++ b/src/ops/rms_norm/cuda/rms_norm.cu @@ -158,7 +158,7 @@ void rms_norm_nv_gpu_f16(RMSNormCudaDescriptor_t desc, void *y, void const *x, v infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *y, void const *x, void const *w, void *stream) { if (cudaSetDevice(desc->device_id) != cudaSuccess) { diff --git a/src/ops/rms_norm/cuda/rms_norm.cuh b/src/ops/rms_norm/cuda/rms_norm.cuh index 30701c2f..683011f2 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cuh +++ b/src/ops/rms_norm/cuda/rms_norm.cuh @@ -8,10 +8,10 @@ struct RMSNormCudaDescriptor { Device device; int device_id; DT dtype; - unsigned long int n; - unsigned long int d; - unsigned long int stride_y; - unsigned long int stride_x; + uint64_t n; + uint64_t d; + int64_t stride_y; + int64_t stride_x; DT w_datatype; float epsilon; }; @@ -25,11 +25,11 @@ infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, infiniopTensorDescriptor_t w_desc, float epsilon); -infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *y, void const *x, void const *w, void *stream); diff --git a/src/ops/rms_norm/operator.cc b/src/ops/rms_norm/operator.cc index 19752456..816ef5dc 100644 --- a/src/ops/rms_norm/operator.cc +++ b/src/ops/rms_norm/operator.cc @@ -13,7 +13,6 @@ #ifdef ENABLE_CAMBRICON_MLU #include "../../devices/bang/bang_handle.h" #include "bang/rms_norm_bang.h" -#include 
"bang/rms_norm_cnnl.h" #endif #ifdef ENABLE_ASCEND_NPU #include "ascend/rms_norm_aclnn.h" diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cc b/src/ops/rotary_embedding/cuda/rotary_embedding.cc index c92e6bd3..102eb474 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cc +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cc @@ -64,7 +64,7 @@ infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cu b/src/ops/rotary_embedding/cuda/rotary_embedding.cu index 99628248..a5f32a97 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cu +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cu @@ -4,7 +4,7 @@ static __global__ void padding_f16( half *__restrict__ x_, - unsigned long const *__restrict__ pos_, + uint64_t const *__restrict__ pos_, float const *__restrict__ sin_, float const *__restrict__ cos_, long const stride0, @@ -27,7 +27,7 @@ static __global__ void padding_f16( void rotary_embedding_nv_gpu_f16( RoPECudaDescriptor_t desc, half *t, - unsigned long const *pos, + uint64_t const *pos, float const *sin_, float const *cos_, void *stream) { auto nt = desc->seq_len, @@ -44,7 +44,7 @@ void rotary_embedding_nv_gpu_f16( infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *t, void const *pos_ids, void const *sin_table, @@ -56,7 +56,7 @@ infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, if (dtype_eq(desc->dtype, F16)) { rotary_embedding_nv_gpu_f16(desc, reinterpret_cast(t), - reinterpret_cast(pos_ids), + reinterpret_cast(pos_ids), reinterpret_cast(sin_table), reinterpret_cast(cos_table), stream); diff --git 
a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh index 6dd5ab11..36b14194 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh @@ -24,11 +24,11 @@ infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, infiniopTensorDescriptor_t sin_table, infiniopTensorDescriptor_t cos_table); -infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *t, void const *pos_ids, void const *sin_table, diff --git a/src/ops/swiglu/ascend/swiglu_kernel.cpp b/src/ops/swiglu/ascend/swiglu_kernel.cpp index 839cd8ea..3dab674f 100644 --- a/src/ops/swiglu/ascend/swiglu_kernel.cpp +++ b/src/ops/swiglu/ascend/swiglu_kernel.cpp @@ -1,5 +1,5 @@ -#include "kernel_operator.h" #include "../../../../include/status.h" +#include "kernel_operator.h" using namespace AscendC; constexpr int32_t BUFFER_NUM = 1; @@ -141,27 +141,27 @@ __aicore__ inline void KernelSwiGLU::Process() { } __global__ __aicore__ void swiglu_kernel_f16(GM_ADDR c, GM_ADDR a, GM_ADDR b, - float beta, int32_t nt, int32_t dh, - int32_t sta, int32_t stb, int32_t stc, - uint32_t remainder, uint32_t base) { + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { KernelSwiGLU op; op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); op.Process(); } __global__ __aicore__ void swiglu_kernel_f32(GM_ADDR c, GM_ADDR a, GM_ADDR b, - float beta, int32_t nt, int32_t dh, - int32_t sta, int32_t stb, int32_t stc, - uint32_t remainder, uint32_t base) { + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { KernelSwiGLU op; 
op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); op.Process(); } extern "C" infiniopStatus_t swiglu_kernel_do(void *c, void *a, void *b, - float beta, int32_t nt, int32_t dh, - int32_t sta, int32_t stb, int32_t stc, - int dtype, void *stream) { + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + int dtype, void *stream) { // Tiling params auto base = static_cast(dh / BLOCK_NUM); diff --git a/src/ops/utils.h b/src/ops/utils.h index 86d6baa9..b48cf419 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -106,7 +106,14 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, // check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c, - uint64_t *broadcast_shape, uint64_t *padded_shape1, uint64_t *padded_shape2, uint64_t broadcast_ndim) { + uint64_t broadcast_ndim) { + std::vector + broadcast_shape_(broadcast_ndim), + padded_shape1_(broadcast_ndim), + padded_shape2_(broadcast_ndim); + auto broadcast_shape = broadcast_shape_.data(), + padded_shape1 = padded_shape1_.data(), + padded_shape2 = padded_shape2_.data(); if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) { return false; } @@ -118,7 +125,8 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor if (dst->ndim < src->ndim) { return false; } - uint64_t padded_shape[dst->ndim]; + std::vector padded_shape_(dst->ndim); + auto padded_shape = padded_shape_.data(); std::fill(padded_shape, padded_shape + dst->ndim, 1); std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim); for (size_t i = 0; i < dst->ndim; ++i) { @@ -131,11 +139,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor // check if the 
shape of tensor c is valid after broadcasting tensors a and b inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) { - uint64_t broadcast_ndim = std::max(a->ndim, b->ndim); - uint64_t broadcast_shape[broadcast_ndim]; - uint64_t padded_shape1[broadcast_ndim]; - uint64_t padded_shape2[broadcast_ndim]; - return isValidBroadcastShape(a, b, c, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim); + return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim)); } inline uint64_t get_byte_size(infiniopTensorDescriptor_t desc) { @@ -220,7 +224,7 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uin // split the dimension dim of a tensor descriptor into multiple dimensions inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector<uint64_t> &dims) { uint64_t ndim = desc->ndim; - if (static_cast<int>(desc->shape[dim]) != std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int>())) { + if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies<uint64_t>{})) { return nullptr; } uint64_t new_ndim = ndim + dims.size() - 1; diff --git a/xmake.lua b/xmake.lua index 4bd0dd7e..a9d5a882 100644 --- a/xmake.lua +++ b/xmake.lua @@ -1,4 +1,8 @@ add_rules("mode.debug", "mode.release") +-- Define color codes +local GREEN = '\27[0;32m' +local YELLOW = '\27[1;33m' +local NC = '\27[0m' -- No Color add_includedirs("include") @@ -72,6 +76,15 @@ end if has_config("nv-gpu") then add_defines("ENABLE_NV_GPU") + local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") + local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH") + if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + end + if CUDNN_ROOT ~= nil then + add_includedirs(CUDNN_ROOT .. 
"/include") + end + target("nv-gpu") set_kind("static") on_install(function (target) end) @@ -84,6 +97,9 @@ if has_config("nv-gpu") then if is_plat("windows") then add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") + if CUDNN_ROOT ~= nil then + add_linkdirs(CUDNN_ROOT .. "\\lib\\x64") + end else add_cuflags("-Xcompiler=-fPIC") add_culdflags("-Xcompiler=-fPIC") @@ -123,7 +139,7 @@ if has_config("cambricon-mlu") then local includedirs = table.concat(target:get("includedirs"), " ") local args = {"-c", sourcefile, "-o", objectfile, "-I/usr/local/neuware/include", "--bang-mlu-arch=mtp_592", "-O3", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread"} - + for _, includedir in ipairs(target:get("includedirs")) do table.insert(args, "-I" .. includedir) end @@ -132,8 +148,7 @@ if has_config("cambricon-mlu") then table.insert(target:objectfiles(), objectfile) end) -rule_end() - + rule_end() target("cambricon-mlu") set_kind("static") @@ -204,7 +219,7 @@ if has_config("ascend-npu") then add_links("libascendcl.so") add_links("libnnopbase.so") add_links("libopapi.so") - add_links("libruntime.so") + add_links("libruntime.so") add_linkdirs(ASCEND_HOME .. "/../../driver/lib64/driver") add_links("libascend_hal.so") local builddir = string.format( @@ -221,7 +236,7 @@ if has_config("ascend-npu") then os.exec("make") os.exec("cp $(projectdir)/src/devices/ascend/build/lib/libascend_kernels.a "..builddir.."/") os.cd(os.projectdir()) - + end) after_clean(function () local ascend_build_dir = path.join(os.projectdir(), "src/devices/ascend") @@ -229,9 +244,9 @@ if has_config("ascend-npu") then os.exec("make clean") os.cd(os.projectdir()) os.rm(builddir.. 
"/libascend_kernels.a") - + end) - rule_end() + rule_end() target("ascend-npu") -- Other configs @@ -242,7 +257,7 @@ if has_config("ascend-npu") then add_files("src/devices/ascend/*.cc", "src/ops/*/ascend/*.cc") add_cxflags("-lstdc++ -Wall -Werror -fPIC") - -- Add operator + -- Add operator add_rules("ascend-kernels") add_links(builddir.."/libascend_kernels.a") @@ -271,50 +286,10 @@ target("infiniop") add_files("src/devices/handle.cc") add_files("src/ops/*/operator.cc") add_files("src/tensor/*.cc") + after_build(function (target) print(YELLOW .. "You can install the libraries with \"xmake install\"" .. NC) end) - after_build(function (target) - local builddir = string.format( - "%s/build/%s/%s/%s", - os.projectdir(), - get_config("plat"), - get_config("arch"), - get_config("mode") - ) - - os.exec("mkdir -p $(projectdir)/lib/") - os.exec("cp " ..builddir.. "/libinfiniop.so $(projectdir)/lib/") - os.exec("cp -r $(projectdir)/include $(projectdir)/lib/") - -- Define color codes - local GREEN = '\27[0;32m' - local YELLOW = '\27[1;33m' - local NC = '\27[0m' -- No Color - - -- Get the current directory - local current_dir = os.curdir() - - -- Output messages with colors - os.exec("echo -e '" .. GREEN .. "Compilation completed successfully." .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "Install the libraries with \"xmake install\" or set INFINI_ROOT=" .. current_dir .. NC .. "'") - end) - - on_install(function (target) - local home_dir = os.getenv("HOME") - local infini_dir = home_dir .. "/.infini/" - - if os.isdir(infini_dir) then - print("~/.infini/ detected, duplicated contents will be overwritten.") - else - os.mkdir(infini_dir) - end - os.exec("cp -r " .. "$(projectdir)/lib " .. infini_dir) - - local GREEN = '\27[0;32m' - local YELLOW = '\27[1;33m' - local NC = '\27[0m' -- No Color - os.exec("echo -e '" .. GREEN .. "Installation completed successfully at ~/.infini/." .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. 
"To set the environment variables, please run the following command:" .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "echo \"export INFINI_ROOT=~/.infini/\" >> ~/.bashrc" .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "echo \"export LD_LIBRARY_PATH=:~/.infini/lib:$LD_LIBRARY_PATH\" >> ~/.bashrc" .. NC .. "'") - end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) + add_installfiles("include/(**/*.h)", {prefixdir = "include"}) + add_installfiles("include/*.h", {prefixdir = "include"}) target_end()