diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 331a8a98..84108c51 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -23,6 +23,7 @@ jobs: - name: Install Python dependencies run: | + pip install numpy pip install torch - name: Install xmake @@ -33,15 +34,15 @@ jobs: - name: configure xmake run: xmake f --cpu=true -cv - - name: Build with XMake - run: xmake - - - name: Find and Set INFINI_ROOT - id: set_infini_root + - name: Set INFINI_ROOT run: | - export INFINI_ROOT=$GITHUB_WORKSPACE + export INFINI_ROOT=$GITHUB_WORKSPACE/.infini + mkdir -p $INFINI_ROOT echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV + - name: Build with XMake + run: xmake build && xmake install + - name: Run Python Tests run: | GREEN='\033[0;32m' diff --git a/README.md b/README.md index 98913cb9..674a874f 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc ## 一、使用说明 -### 配置 +### 1. 配置 #### 查看当前配置 @@ -99,23 +99,27 @@ xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv xmake f --cambricon-mlu=true -cv ``` -### 编译 +#### 配置 NPU + +````xmake +xmake f --ascend-npu=true -cv +```` + +### 2. 编译安装 ```xmake -xmake +xmake build && xmake install ``` -### 将编译好的算子库添加至环境变量 `INFINI_ROOT` +### 3. 设置环境变量 -```bash -export INFINI_ROOT=[PATH_TO_LIBRARY] -``` +按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。 -### 运行算子测试 +### 4. 
运行算子测试 ```bash cd operatorspy/tests -python operator_name.py +python operator_name.py [--cpu | --cuda | --cambricon | --ascend] ``` ## 二、开发说明 diff --git a/include/data_type.h b/include/data_type.h index 1044936a..954a42ea 100644 --- a/include/data_type.h +++ b/include/data_type.h @@ -9,6 +9,7 @@ typedef struct DataLayout { mantissa : 8, exponent : 8; +#ifdef __cplusplus bool operator==(const DataLayout &other) const { union TypePun { DataLayout layout; @@ -24,12 +25,13 @@ typedef struct DataLayout { bool operator!=(const DataLayout &other) const { return !(*this == other); } +#endif } DataLayout; typedef struct DataLayout DT; // clang-format off -constexpr static struct DataLayout +const static struct DataLayout I8 = {1, 1, 1, 7, 0}, I16 = {1, 1, 2, 15, 0}, I32 = {1, 1, 4, 31, 0}, diff --git a/include/ops/concat/concat.h b/include/ops/concat/concat.h new file mode 100644 index 00000000..20ca6339 --- /dev/null +++ b/include/ops/concat/concat.h @@ -0,0 +1,27 @@ +#ifndef CONCAT_H +#define CONCAT_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ConcatDescriptor { + Device device; +} ConcatDescriptor; + +typedef ConcatDescriptor *infiniopConcatDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateConcatDescriptor(infiniopHandle_t handle, + infiniopConcatDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis); + +__C __export infiniopStatus_t infiniopConcat(infiniopConcatDescriptor_t desc, + void *y, + void const **x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyConcatDescriptor(infiniopConcatDescriptor_t desc); + +#endif diff --git a/include/ops/gemm/gemm.h b/include/ops/gemm/gemm.h index 4a39da39..a6eac566 100644 --- a/include/ops/gemm/gemm.h +++ b/include/ops/gemm/gemm.h @@ -18,8 +18,8 @@ __C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t hand infiniopTensorDescriptor_t c_desc, float alpha, float beta, - bool transA, 
- bool transB); + char transA, + char transB); __C __export infiniopStatus_t infiniopGetGEMMWorkspaceSize(infiniopGEMMDescriptor_t desc, uint64_t *size); diff --git a/include/ops/mlp/mlp.h b/include/ops/mlp/mlp.h index 7150c427..9c4c7dd2 100644 --- a/include/ops/mlp/mlp.h +++ b/include/ops/mlp/mlp.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handl infiniopTensorDescriptor_t w12_desc, infiniopTensorDescriptor_t w3_desc, float alpha, - bool residual); + char residual); __C __export infiniopStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, uint64_t *size); diff --git a/operatorspy/liboperators.py b/operatorspy/liboperators.py index 868cc88d..fb58d6a7 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -10,7 +10,6 @@ LIB_OPERATORS_DIR = os.path.join(os.environ.get("INFINI_ROOT"), "lib") - class TensorDescriptor(Structure): _fields_ = [ ("dt", DataLayout), @@ -19,10 +18,8 @@ class TensorDescriptor(Structure): ("pattern", POINTER(c_int64)), ] - infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor) - class CTensor: def __init__(self, desc, data): self.descriptor = desc diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py index c7d54c24..a1f47374 100644 --- a/operatorspy/tests/avg_pool.py +++ b/operatorspy/tests/avg_pool.py @@ -152,10 +152,6 @@ def test( elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") - - print(x) - print(y) - print(ans) assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor)) @@ -205,7 +201,7 @@ def test_musa(lib, test_cases): # ((1, 1, 10), (3,), (1,), (1,)), ((1, 1, 2, 2), (2, 2), (1, 1), (1, 1)), ((32, 4, 224, 224), (3, 3), (1, 1), (2, 2)), - ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + # ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/concat.py 
b/operatorspy/tests/concat.py new file mode 100644 index 00000000..412a4b6f --- /dev/null +++ b/operatorspy/tests/concat.py @@ -0,0 +1,218 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_int64 +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + +class ConcatDescriptor(Structure): + _fields_ = [("device", c_int32),] + + +infiniopConcatDescriptor_t = POINTER(ConcatDescriptor) + + +def concat_py(*tensors, dim=0): + return torch.cat(tensors, dim=dim) + + +def test( + lib, + handle, + torch_device, + c_shape, + axis, + input_shapes, + tensor_dtype=torch.float32, + inplace=Inplace.OUT_OF_PLACE, +): + """ + 测试 concat 算子 + """ + print( + f"Testing Concat on {torch_device} with output_shape:{c_shape}, input_shapes:{input_shapes}, axis:{axis}, dtype:{tensor_dtype}, inplace: {inplace.name}" + ) + + inputs = [torch.rand(shape, dtype=tensor_dtype).to(torch_device) for shape in input_shapes] + + if inplace == Inplace.OUT_OF_PLACE: + c = torch.zeros(c_shape, dtype=tensor_dtype).to(torch_device) + else: + c = torch.zeros(c_shape, dtype=tensor_dtype).to(torch_device) + + ans = concat_py(*inputs, dim=axis) + + input_tensors = [to_tensor(t, lib) for t in inputs] + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else to_tensor(c, lib) + + descriptor = infiniopConcatDescriptor_t() + + num_inputs = len(input_tensors) + input_desc_array_type = infiniopTensorDescriptor_t * num_inputs + input_desc_array = input_desc_array_type(*[t.descriptor for t in input_tensors]) + + check_error( + lib.infiniopCreateConcatDescriptor( + handle, + ctypes.byref(descriptor), + 
c_tensor.descriptor, + input_desc_array, + c_uint64(num_inputs), + c_int64(axis), + ) + ) + + input_data_ptrs = (c_void_p * num_inputs)(*[t.data for t in input_tensors]) + check_error( + lib.infiniopConcat( + descriptor, + c_tensor.data, + ctypes.cast(input_data_ptrs, POINTER(c_void_p)), + None + ) + ) + + assert torch.allclose(c, ans, atol=0, rtol=0), "Concat result does not match PyTorch's result." + + check_error(lib.infiniopDestroyConcatDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, axis, input_shapes, inplace in test_cases: + test(lib, handle, "cpu", c_shape, axis, input_shapes, tensor_dtype = torch.float16, inplace = inplace) + test(lib, handle, "cpu", c_shape, axis, input_shapes, tensor_dtype = torch.float32, inplace = inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, axis, input_shapes, inplace in test_cases: + test(lib, handle, "cuda", c_shape, axis, input_shapes, tensor_dtype = torch.float16, inplace = inplace) + test(lib, handle, "cuda", c_shape, axis, input_shapes, tensor_dtype = torch.float32, inplace = inplace) + destroy_handle(lib, handle) + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, axis, input_shapes, inplace in test_cases: + test(lib, handle, "mlu", c_shape, axis, input_shapes, inplace=inplace) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for c_shape, axis, input_shapes,inplace in test_cases: + test(lib, handle, "musa", c_shape, axis, input_shapes, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + + test_cases = [ + #output_tensor, axis, inputs_tensors, inplace + + ((6,), 0, [(2,), (4,)], 
Inplace.OUT_OF_PLACE), + + ((6, 3), 0, [(2, 3), (4, 3)], Inplace.OUT_OF_PLACE), + ((3, 6), 1, [(3, 2), (3, 4)], Inplace.OUT_OF_PLACE), + ((3, 7), 1, [(3, 2), (3, 4), (3, 1)], Inplace.OUT_OF_PLACE), + ((3, 3, 10), 2, [(3, 3, 4), (3, 3, 6)], Inplace.OUT_OF_PLACE), + ((4, 3, 6), 0, [(3, 3, 6), (1, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 6, 3), 1, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 6), 2, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((4, 3, 5, 6), 0, [(1, 3, 5, 6), (3, 3, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 5, 5, 6), 1, [(2, 3, 5, 6), (2, 2, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), 2, [(2, 3, 2, 6), (2, 3, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), 3, [(2, 3, 5, 3), (2, 3, 5, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 15), 3, [(2, 3, 5, 3), (2, 3, 5, 3), (2, 3, 5, 9)], Inplace.OUT_OF_PLACE), + ((4, 2, 3, 4, 5), 0, [(1, 2, 3, 4, 5), (3, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((2, 4, 3, 2, 5), 1, [(2, 2, 3, 2, 5), (2, 2, 3, 2, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 4, 4, 5), 2, [(1, 2, 2, 4, 5), (1, 2, 2, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 8, 5), 3, [(1, 2, 3, 4, 5), (1, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 4, 5), 4, [(1, 2, 3, 4, 3), (1, 2, 3, 4, 2)], Inplace.OUT_OF_PLACE), + ((4, 14, 3, 4, 5), 1, [(4, 3, 3, 4, 5), (4, 5, 3, 4, 5), (4, 6, 3, 4, 5)], Inplace.OUT_OF_PLACE), + + ((6,), -1, [(2,), (4,)], Inplace.OUT_OF_PLACE), + ((6, 3), -2, [(2, 3), (4, 3)], Inplace.OUT_OF_PLACE), + ((3, 6), -1, [(3, 2), (3, 4)], Inplace.OUT_OF_PLACE), + ((3, 7), -1, [(3, 2), (3, 4), (3, 1)], Inplace.OUT_OF_PLACE), + ((3, 3, 10), -1, [(3, 3, 4), (3, 3, 6)], Inplace.OUT_OF_PLACE), + ((4, 3, 6), -3, [(3, 3, 6), (1, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 6, 3), -2, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 6), -1, [(2, 3, 3), (2, 3, 3)], Inplace.OUT_OF_PLACE), + ((4, 3, 5, 6), -4, [(1, 3, 5, 6), (3, 3, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 5, 5, 6), -3, [(2, 3, 5, 6), (2, 2, 5, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), 
-2, [(2, 3, 2, 6), (2, 3, 3, 6)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 6), -1, [(2, 3, 5, 3), (2, 3, 5, 3)], Inplace.OUT_OF_PLACE), + ((2, 3, 5, 15), -1, [(2, 3, 5, 3), (2, 3, 5, 3), (2, 3, 5, 9)], Inplace.OUT_OF_PLACE), + ((4, 2, 3, 4, 5), -5, [(1, 2, 3, 4, 5), (3, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((2, 4, 3, 2, 5), -4, [(2, 2, 3, 2, 5), (2, 2, 3, 2, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 4, 4, 5), -3, [(1, 2, 2, 4, 5), (1, 2, 2, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 8, 5), -2, [(1, 2, 3, 4, 5), (1, 2, 3, 4, 5)], Inplace.OUT_OF_PLACE), + ((1, 2, 3, 4, 5), -1, [(1, 2, 3, 4, 3), (1, 2, 3, 4, 2)], Inplace.OUT_OF_PLACE), + ((4, 14, 3, 4, 5), -4, [(4, 3, 3, 4, 5), (4, 5, 3, 4, 5), (4, 6, 3, 4, 5)], Inplace.OUT_OF_PLACE), + + ] + + args = get_args() + lib = open_lib() + + lib.infiniopCreateConcatDescriptor.restype = c_int32 + lib.infiniopCreateConcatDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopConcatDescriptor_t), + infiniopTensorDescriptor_t, + POINTER(infiniopTensorDescriptor_t), + c_uint64, # nums_input + c_int64, # axis + ] + + lib.infiniopConcat.restype = c_int32 + lib.infiniopConcat.argtypes = [ + infiniopConcatDescriptor_t, + c_void_p, + POINTER(c_void_p), + c_void_p, + ] + + lib.infiniopDestroyConcatDescriptor.restype = c_int32 + lib.infiniopDestroyConcatDescriptor.argtypes = [ + infiniopConcatDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mConcat Test passed!\033[0m") diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py index 3ddb6a03..7c13aa69 100644 --- a/operatorspy/tests/conv.py +++ b/operatorspy/tests/conv.py @@ -223,14 +223,14 @@ def test_musa(lib, test_cases): if __name__ == "__main__": test_cases = [ # x_shape, w_shape, pads, strides, dilations, x_strides - # ( - 
# (32, 3, 4), - # (32, 3, 5), - # (1,), - # (1,), - # (1,), - # None, - # ), + ( + (32, 3, 4), + (32, 3, 5), + (1,), + (1,), + (1,), + None, + ), ( (1, 3, 4, 4), (2, 3, 3, 3), diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py index e899c7cf..16278b7e 100644 --- a/operatorspy/tests/gemm.py +++ b/operatorspy/tests/gemm.py @@ -201,6 +201,29 @@ def test_cuda(lib, test_cases): test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) destroy_handle(lib, handle) +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + # test(lib, handle, "musa", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "musa", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + destroy_handle(lib, handle) + def test_bang(lib, test_cases): import torch_mlu @@ -362,6 +385,8 @@ def test_bang(lib, test_cases): test_cuda(lib, test_cases) if args.bang: test_bang(lib, test_cases) - if not (args.cpu or args.cuda or args.bang): + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.musa or args.bang): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py index 5c586546..96703c31 100644 --- a/operatorspy/tests/global_avg_pool.py +++ b/operatorspy/tests/global_avg_pool.py @@ -145,23 +145,32 @@ def test_bang(lib, test_cases): test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32) destroy_handle(lib, handle) +def test_musa(lib, test_cases): + import torch_musa + + device = 
DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "musa", x_shape, tensor_dtype = torch.float32) + destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ # x_shape - ((1, 3, 3)), - ((1, 3, 1, 1, 3)), - ((1, 3, 1, 1, 257)), - ((1, 2, 1, 1, 514)), - ((1, 3, 1, 1, 1025)), - ((32, 256, 1, 112, 112)), - ((2, 3, 2048000)), - ((2, 1, 10243)), - ((2, 20, 100)), - ((3, 33, 333)), - ((32, 20, 512)), - ((3, 3, 11, 11, 11, 3, 2)), - ((32, 256, 1, 112, 112)), + ((1, 1, 2, 2)), + ((1, 3, 3, 6)), + ((8, 4, 6, 12)), + # ((1, 3, 1, 1, 3)), + # ((1, 3, 1, 1, 257)), + # ((1, 2, 1, 1, 514)), + # ((1, 3, 1, 1, 1025)), + # ((32, 256, 1, 112, 112)), + # ((2, 3, 2048000)), + # ((2, 1, 10243)), + # ((2, 20, 100)), + # ((3, 33, 333)), + # ((32, 20, 512)), + # ((32, 256, 1, 112, 112)), ((32, 256, 112, 112)), ] args = get_args() @@ -198,6 +207,8 @@ def test_bang(lib, test_cases): test_cuda(lib, test_cases) if args.bang: test_bang(lib, test_cases) - if not (args.cpu or args.cuda or args.bang): + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index a434f7ca..ff4d8037 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -383,6 +383,6 @@ def test_musa(lib, test_cases): test_ascend(lib, test_cases) if args.musa: test_musa(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): + if not (args.cpu or args.cuda or args.musa or args.bang or args.ascend): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py index f80b8e4d..aea9767f 100644 --- a/operatorspy/tests/max_pool.py +++ b/operatorspy/tests/max_pool.py @@ -148,9 +148,9 @@ def test( ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: 
{elapsed :6f}") - print(x) - print(y) - print(ans) + # print(x) + # print(y) + # print(ans) assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor)) @@ -197,9 +197,12 @@ def test_musa(lib, test_cases): if __name__ == "__main__": test_cases = [ # x_shape, kernel_shape, padding, strides - ((1, 1, 10), (3,), (1,), (1,)), + # ((1, 1, 10), (3,), (1,), (1,)), ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), - ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + ((1, 3, 6, 6), (3, 3), (1, 1), (2, 2)), + ((8, 3, 12, 12), (3, 3), (1, 1), (2, 2)), + ((1, 1, 4, 4), (2, 2), (0, 0), (1, 1)), + # ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py index c3de7cc0..bb398f9e 100644 --- a/operatorspy/tests/random_sample.py +++ b/operatorspy/tests/random_sample.py @@ -30,7 +30,7 @@ class RandomSampleDescriptor(Structure): def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): - indices = torch.zeros([topk], dtype = torch.uint64) + indices = torch.zeros([topk], dtype = torch.int64) dataNp = data.clone().detach() sorted_indices = torch.arange(voc) @@ -52,7 +52,7 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): globalM = dataNp[0] dataNp = (dataNp - globalM) / temperature - dataNp = torch.softmax(dataNp, dim = 0) + dataNp = torch.softmax(dataNp.float(), dim = 0) sum_s = 0 for end in range(topk): sum_s += dataNp[end] @@ -63,8 +63,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): else: end = topk - - sum_s = 0 for i in range(end): sum_s += dataNp[i] @@ -78,12 +76,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): def random_sample_0(data): return torch.argmax(data) + def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16): print( 
f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}" ) - - data = torch.rand((voc), dtype=x_dtype).to(torch_device) + data = torch.arange(voc).float() * 0.0001 + _perm = torch.randperm(voc) + data = data[_perm].to(x_dtype).to(torch_device) if(topp > 0 and topk > 1): ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") else: @@ -96,7 +96,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ indices = torch.zeros([1], dtype = torch.uint64).to(torch_device) x_tensor = to_tensor(data, lib) indices_tensor = to_tensor(indices, lib) - if(torch_device == 'mlu' or torch_device == 'musa'): + if(torch_device == 'mlu' or torch_device == 'npu' or torch_device == 'musa'): indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 @@ -127,12 +127,12 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ None, ) ) - assert indices[0].type(ans.dtype) == ans or abs(data[indices[0]] - data[ans]) == 0.0, "compute error" + if torch_device == "npu": + torch.npu.synchronize() - - + assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) - + print("Test passed!") def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU @@ -179,13 +179,16 @@ def test_musa(lib, test_cases): if __name__ == "__main__": test_cases = [ # voc, random_val, topp, topk, temperature - (512, 0.92, 0.8, 3, 0.5), - (4096, 0.95, 0.9, 5, 1.0), - (16384, 0.85, 0.85, 10, 2.0), - (512, 0.92, 0, 3, 0.5), - (4096, 0.95, 0.9, 0, 1.0), - (16384, 0.85, 0, 0, 2.0), - (16384, 0.85, 0, 1, 2.0), + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), ] args = get_args() @@ -231,4 +234,4 @@ 
def test_musa(lib, test_cases): test_musa(lib, test_cases) if not (args.cpu or args.cuda or args.bang or args.ascend or args.musa): test_cpu(lib, test_cases) - print("Test passed!") + print("\033[92mTest passed!\033[0m") diff --git a/src/devices/ascend/CMakeLists.txt b/src/devices/ascend/CMakeLists.txt index 5498de24..8cc7f7f8 100644 --- a/src/devices/ascend/CMakeLists.txt +++ b/src/devices/ascend/CMakeLists.txt @@ -23,5 +23,6 @@ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) ascendc_library(ascend_kernels STATIC ../../ops/swiglu/ascend/swiglu_kernel.cpp ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp + ../../ops/random_sample/ascend/random_sample_kernel.cpp ) diff --git a/src/devices/ascend/common_ascend.cc b/src/devices/ascend/common_ascend.cc index e7b0e55d..fe988e5d 100644 --- a/src/devices/ascend/common_ascend.cc +++ b/src/devices/ascend/common_ascend.cc @@ -8,101 +8,138 @@ int64_t numElements(const int64_t *shape, int64_t num) { return numEle; } -void *mallocWorkspace(uint64_t workspaceSize) { - void *workspaceAddr = nullptr; +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) { + *workspaceAddr = nullptr; if (workspaceSize > 0) { - auto ret = aclrtMalloc(&workspaceAddr, workspaceSize, - ACL_MEM_MALLOC_HUGE_FIRST); + auto ret = aclrtMalloc(workspaceAddr, workspaceSize, + ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret)); + LOG_PRINT("aclrtMalloc failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); } - return workspaceAddr; + return STATUS_SUCCESS; } -void freeWorkspace(void *workspaceAddr) { - aclrtFree(workspaceAddr); +infiniopStatus_t freeWorkspace(void *workspaceAddr) { + if (workspaceAddr != nullptr) { + auto ret = aclrtFree(workspaceAddr); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + return STATUS_SUCCESS; } +aclDataType toAclDataType(DT dt) { + if (dt == I8) + return aclDataType::ACL_INT8; + else if (dt == I16) + return aclDataType::ACL_INT16; + else if (dt == I32) + return aclDataType::ACL_INT32; + else if (dt == I64) + return aclDataType::ACL_INT64; + else if (dt == U8) + return aclDataType::ACL_UINT8; + else if (dt == U16) + return aclDataType::ACL_UINT16; + else if (dt == U32) + return aclDataType::ACL_UINT32; + else if (dt == U64) + return aclDataType::ACL_UINT64; + else if (dt == F16) + return aclDataType::ACL_FLOAT16; + else if (dt == BF16) + return aclDataType::ACL_BF16; + else if (dt == F32) + return aclDataType::ACL_FLOAT; + else if (dt == F64) + return aclDataType::ACL_DOUBLE; + else + return aclDataType::ACL_DT_UNDEFINED; +} + + const char *dataTypeToString(aclDataType dtype) { switch (dtype) { - case ACL_DT_UNDEFINED: - return "ACL_DT_UNDEFINED"; - case ACL_FLOAT: - return "ACL_FLOAT"; - case ACL_FLOAT16: - return "ACL_FLOAT16"; - case ACL_INT8: - return "ACL_INT8"; - case ACL_INT32: - return "ACL_INT32"; - case ACL_UINT8: - return "ACL_UINT8"; - case ACL_INT16: - return "ACL_INT16"; - case ACL_UINT16: - return "ACL_UINT16"; - case ACL_UINT32: - return "ACL_UINT32"; - case ACL_INT64: - return "ACL_INT64"; - case ACL_UINT64: - return "ACL_UINT64"; - case ACL_DOUBLE: - return "ACL_DOUBLE"; - case ACL_BOOL: - return "ACL_BOOL"; - case ACL_STRING: - return "ACL_STRING"; - case ACL_COMPLEX64: - return "ACL_COMPLEX64"; - case ACL_COMPLEX128: - return "ACL_COMPLEX128"; - case ACL_BF16: - return "ACL_BF16"; - case 
ACL_INT4: - return "ACL_INT4"; - case ACL_UINT1: - return "ACL_UINT1"; - case ACL_COMPLEX32: - return "ACL_COMPLEX32"; - default: - return "UNKNOWN"; + case ACL_DT_UNDEFINED: + return "ACL_DT_UNDEFINED"; + case ACL_FLOAT: + return "ACL_FLOAT"; + case ACL_FLOAT16: + return "ACL_FLOAT16"; + case ACL_INT8: + return "ACL_INT8"; + case ACL_INT32: + return "ACL_INT32"; + case ACL_UINT8: + return "ACL_UINT8"; + case ACL_INT16: + return "ACL_INT16"; + case ACL_UINT16: + return "ACL_UINT16"; + case ACL_UINT32: + return "ACL_UINT32"; + case ACL_INT64: + return "ACL_INT64"; + case ACL_UINT64: + return "ACL_UINT64"; + case ACL_DOUBLE: + return "ACL_DOUBLE"; + case ACL_BOOL: + return "ACL_BOOL"; + case ACL_STRING: + return "ACL_STRING"; + case ACL_COMPLEX64: + return "ACL_COMPLEX64"; + case ACL_COMPLEX128: + return "ACL_COMPLEX128"; + case ACL_BF16: + return "ACL_BF16"; + case ACL_INT4: + return "ACL_INT4"; + case ACL_UINT1: + return "ACL_UINT1"; + case ACL_COMPLEX32: + return "ACL_COMPLEX32"; + default: + return "UNKNOWN"; } } const char *formatToString(aclFormat format) { switch (format) { - case ACL_FORMAT_UNDEFINED: - return "ACL_FORMAT_UNDEFINED"; - case ACL_FORMAT_NCHW: - return "ACL_FORMAT_NCHW"; - case ACL_FORMAT_NHWC: - return "ACL_FORMAT_NHWC"; - case ACL_FORMAT_ND: - return "ACL_FORMAT_ND"; - case ACL_FORMAT_NC1HWC0: - return "ACL_FORMAT_NC1HWC0"; - case ACL_FORMAT_FRACTAL_Z: - return "ACL_FORMAT_FRACTAL_Z"; - case ACL_FORMAT_NC1HWC0_C04: - return "ACL_FORMAT_NC1HWC0_C04"; - case ACL_FORMAT_HWCN: - return "ACL_FORMAT_HWCN"; - case ACL_FORMAT_NDHWC: - return "ACL_FORMAT_NDHWC"; - case ACL_FORMAT_FRACTAL_NZ: - return "ACL_FORMAT_FRACTAL_NZ"; - case ACL_FORMAT_NCDHW: - return "ACL_FORMAT_NCDHW"; - case ACL_FORMAT_NDC1HWC0: - return "ACL_FORMAT_NDC1HWC0"; - case ACL_FRACTAL_Z_3D: - return "ACL_FRACTAL_Z_3D"; - case ACL_FORMAT_NC: - return "ACL_FORMAT_NC"; - case ACL_FORMAT_NCL: - return "ACL_FORMAT_NCL"; - default: - return "UNKNOWN"; + case ACL_FORMAT_UNDEFINED: + 
return "ACL_FORMAT_UNDEFINED"; + case ACL_FORMAT_NCHW: + return "ACL_FORMAT_NCHW"; + case ACL_FORMAT_NHWC: + return "ACL_FORMAT_NHWC"; + case ACL_FORMAT_ND: + return "ACL_FORMAT_ND"; + case ACL_FORMAT_NC1HWC0: + return "ACL_FORMAT_NC1HWC0"; + case ACL_FORMAT_FRACTAL_Z: + return "ACL_FORMAT_FRACTAL_Z"; + case ACL_FORMAT_NC1HWC0_C04: + return "ACL_FORMAT_NC1HWC0_C04"; + case ACL_FORMAT_HWCN: + return "ACL_FORMAT_HWCN"; + case ACL_FORMAT_NDHWC: + return "ACL_FORMAT_NDHWC"; + case ACL_FORMAT_FRACTAL_NZ: + return "ACL_FORMAT_FRACTAL_NZ"; + case ACL_FORMAT_NCDHW: + return "ACL_FORMAT_NCDHW"; + case ACL_FORMAT_NDC1HWC0: + return "ACL_FORMAT_NDC1HWC0"; + case ACL_FRACTAL_Z_3D: + return "ACL_FRACTAL_Z_3D"; + case ACL_FORMAT_NC: + return "ACL_FORMAT_NC"; + case ACL_FORMAT_NCL: + return "ACL_FORMAT_NCL"; + default: + return "UNKNOWN"; } } diff --git a/src/devices/ascend/common_ascend.h b/src/devices/ascend/common_ascend.h index 7d3a71b0..9b23fd35 100644 --- a/src/devices/ascend/common_ascend.h +++ b/src/devices/ascend/common_ascend.h @@ -1,29 +1,30 @@ #ifndef __COMMON_ASCEND_H__ #define __COMMON_ASCEND_H__ +#include "operators.h" #include #include #include #include #include +#include #include #include -#include #ifdef __cplusplus extern "C" { #endif -#define CHECK_RET(cond, return_expr) \ - do { \ - if (!(cond)) { \ - return_expr; \ - } \ +#define CHECK_RET(cond, return_expr) \ + do { \ + if (!(cond)) { \ + return_expr; \ + } \ } while (0) -#define LOG_PRINT(message, ...) \ - do { \ - printf(message, ##__VA_ARGS__); \ +#define LOG_PRINT(message, ...) 
\ + do { \ + printf(message, ##__VA_ARGS__); \ } while (0) #ifdef __cplusplus @@ -33,7 +34,8 @@ extern "C" { int64_t numElements(const int64_t *shape, int64_t num); const char *dataTypeToString(aclDataType dtype); const char *formatToString(aclFormat format); -void *mallocWorkspace(uint64_t workspaceSize); -void freeWorkspace(void *workspaceAddr); +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize); +infiniopStatus_t freeWorkspace(void *workspaceAddr); +aclDataType toAclDataType(DT dt); #endif diff --git a/src/devices/ascend/tensor_aclnn.cc b/src/devices/ascend/tensor_aclnn.cc index 7fd41986..0a0fad74 100644 --- a/src/devices/ascend/tensor_aclnn.cc +++ b/src/devices/ascend/tensor_aclnn.cc @@ -2,25 +2,15 @@ #include "../../ops/utils.h" #include -infiniopStatus_t aclnnTensorDescriptor::setDescriptor(DT dtype, const std::vector &shape, const std::vector &strides) { +infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides) { if (shape.size() != strides.size()) { return STATUS_BAD_PARAM; } this->ndim = shape.size(); this->shape = std::vector(shape); this->strides = std::vector(strides); + this->dataType = dtype; - if (dtype_eq(dtype, F16)) { - this->dataType = aclDataType::ACL_FLOAT16; - } else if (dtype_eq(dtype, F32)) { - this->dataType = aclDataType::ACL_FLOAT; - } else if (dtype_eq(dtype, U64)) { - this->dataType = aclDataType::ACL_UINT64; - } else if (dtype_eq(dtype, I64)) { - this->dataType = aclDataType::ACL_INT64; - } else { - return STATUS_BAD_TENSOR_DTYPE; - } // Set format // TODO: Support other format aclFormat format = aclFormat::ACL_FORMAT_ND; @@ -31,59 +21,6 @@ infiniopStatus_t aclnnTensorDescriptor::setDescriptor(DT dtype, const std::vecto return STATUS_SUCCESS; } -// infiniopStatus_t aclnnTensorDescriptor::inferStorageShape(){ -// auto shape = std::vector(); -// auto strides = std::vector(); -// for (uint64_t i = 0; i < this->ndim; ++i) { -// if 
(this->shape[i] > 1){ -// shape.push_back(this->shape[i]); -// strides.push_back(this->strides[i]); -// }else if (this->shape[i] <= 0){ -// return STATUS_BAD_TENSOR_SHAPE; -// } -// } - -// this->storageNdim = shape.size(); -// this->storageShape = std::vector(this->storageNdim, 1); -// std::vector indices(this->storageNdim); -// for (int64_t i = 0; i < this->storageNdim; ++i) { -// indices[i] = i; -// } - -// std::sort(indices.begin(), indices.end(), [&](uint64_t a, uint64_t b) { -// return strides[a] > strides[b]; -// }); -// auto bound = 0; // upper bound of non-zero-strided dimension -// for (int64_t i = 0; i < this->storageNdim; ++i) { -// // sort shape and strides by strides -// shape[i] = this->shape[indices[i]]; -// strides[i] = this->strides[indices[i]]; -// if (strides[i] >= 1){ -// bound++; -// }else if (strides[i] < 0){ -// // negative stride not supported -// return STATUS_BAD_TENSOR_STRIDES; -// } -// } -// // Treat the last non-zero-strided dimension as continuous -// // All trilling zero-strided dimensions are treated as 1 -// shape[bound - 1] = shape[bound - 1] * strides[bound - 1]; -// strides[bound - 1] = 1; -// int64_t carry = 1; -// for (int64_t i = bound - 1; i > 0; --i) { -// // Each non-cummulative stride should be no smaller than corresponding dim -// // and storage shape is the bigger one -// this->storageShape[i] = strides[i-1] / carry; -// if (shape[i] > this->storageShape[i]){ -// return STATUS_BAD_TENSOR_STRIDES; -// } -// carry *= this->storageShape[i]; -// } -// this->storageShape[0] = shape[0]; - -// return STATUS_SUCCESS; -// } - /// @brief Infer storage shape. For now this ruturns a 1D shape of the total tensor storage size. /// We don't see why higher dimensional storage shape is ever needed. To change if necesary. 
@@ -108,7 +45,7 @@ infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTen shape[i] = static_cast(y->shape[i]); strides[i] = y->strides[i]; } - return setDescriptor(y->dt, shape, strides); + return setDescriptor(toAclDataType(y->dt), shape, strides); } /// @brief Wrapper of aclCreateTensor. Create aclTensor. @@ -117,7 +54,7 @@ infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTen /// @param data Data ptr on device global mem. /// @param tensor Pointer of pointer of aclTensor. /// @return -infiniopStatus_t aclnnTensorDescriptor::createTensor() { +infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) { if (this->t) { return STATUS_SUCCESS; } @@ -129,7 +66,7 @@ infiniopStatus_t aclnnTensorDescriptor::createTensor() { this->format, this->storageShape.data(), this->storageNdim, - nullptr); + data); return STATUS_SUCCESS; } diff --git a/src/devices/ascend/tensor_aclnn.h b/src/devices/ascend/tensor_aclnn.h index 44c9e051..4aa25074 100644 --- a/src/devices/ascend/tensor_aclnn.h +++ b/src/devices/ascend/tensor_aclnn.h @@ -2,9 +2,9 @@ #define __ACLNN_TENSOR__ #include "./common_ascend.h" -#include "tensor/tensor_descriptor.h" #include "operators.h" #include "tensor.h" +#include "tensor/tensor_descriptor.h" #include #include #include @@ -24,11 +24,12 @@ struct aclnnTensorDescriptor { aclTensor *t; - infiniopStatus_t setDescriptor(DT dtype, const std::vector &shape, const std::vector &strides); + // Transfer from infiniOp DT to aclDataType + infiniopStatus_t setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides); infiniopStatus_t inferStorageShape(); // Convert form InfiniOpTensorDescriptor infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc); - infiniopStatus_t createTensor(); + infiniopStatus_t createTensor(void *data = nullptr); infiniopStatus_t destroyTensor(); ~aclnnTensorDescriptor(); diff --git a/src/devices/cuda/common_cuda.h 
b/src/devices/cuda/common_cuda.h index 1afe8c3d..0c10122f 100644 --- a/src/devices/cuda/common_cuda.h +++ b/src/devices/cuda/common_cuda.h @@ -40,7 +40,7 @@ typedef struct DTCudnnMapping { } DTCudnnMapping; // DT cudnnDataType_t mapping table -constexpr DTCudnnMapping dtMappings[] = { +const DTCudnnMapping dtMappings[] = { {F16, CUDNN_DATA_HALF}, {F32, CUDNN_DATA_FLOAT}, {F64, CUDNN_DATA_DOUBLE}, diff --git a/src/devices/cuda/cuda_handle.h b/src/devices/cuda/cuda_handle.h index aa293377..f935ed5f 100644 --- a/src/devices/cuda/cuda_handle.h +++ b/src/devices/cuda/cuda_handle.h @@ -6,7 +6,6 @@ #include "device.h" #include "status.h" #include -#include #include #include diff --git a/src/devices/musa/musa_handle.h b/src/devices/musa/musa_handle.h index 0c715b83..cc6209ab 100644 --- a/src/devices/musa/musa_handle.h +++ b/src/devices/musa/musa_handle.h @@ -61,4 +61,17 @@ void use_mudnn(std::shared_ptr> mudnn_handles_t, int dev mudnn_handles_t->push(handle); } + +// template +// musa::dnn::Status use_mudnn(std::shared_ptr> mudnn_handles_t, int device_id, musaStream_t stream, T const &f) { +// auto handle = mudnn_handles_t->pop(); +// if (!handle) { +// musaSetDevice(device_id); +// handle = std::make_shared(device_id); +// } +// musa::dnn::Status status = f(*handle); +// mudnn_handles_t->push(std::move(handle)); +// return status; +// } + #endif // __MUSA_HANDLE_H__ \ No newline at end of file diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc index b010894f..eebcf4be 100644 --- a/src/ops/add/cuda/add.cc +++ b/src/ops/add/cuda/add.cc @@ -46,9 +46,9 @@ infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle, cudaGetDeviceProperties(&prop, handle->device_id); int64_t *a_strides_d, *b_strides_d, *c_strides_d; - checkCudaErrorWithCode(cudaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - 
checkCudaErrorWithCode(cudaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); diff --git a/src/ops/add/musa/add_musa.mu b/src/ops/add/musa/add_musa.mu index ac43d66c..e51197f7 100644 --- a/src/ops/add/musa/add_musa.mu +++ b/src/ops/add/musa/add_musa.mu @@ -84,6 +84,7 @@ void _add_mt_gpu(AddMusaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const add<<>>( c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); } + printf("[SUCCESS to execute add_mt_gpu]\n"); } template diff --git a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc index 38dd61c5..26ed34c1 100644 --- a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc +++ b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc @@ -54,8 +54,8 @@ infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, aclnn_strides[i] = aclnn_shape[i + 1] * aclnn_strides[i + 1]; } - CHECK_STATUS(aDesc->setDescriptor(y->dt, aclnn_shape, aclnn_strides), STATUS_SUCCESS); - CHECK_STATUS(outDesc->setDescriptor(y->dt, aclnn_shape, aclnn_strides), STATUS_SUCCESS); + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(y->dt), 
aclnn_shape, aclnn_strides), STATUS_SUCCESS); + CHECK_STATUS(outDesc->setDescriptor(toAclDataType(y->dt), aclnn_shape, aclnn_strides), STATUS_SUCCESS); // Set mask Desc auto &maskDesc = (*desc_ptr)->maskDesc; @@ -70,7 +70,7 @@ infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, } auto mask_strides = std::vector{total_seq_len * seq_len, total_seq_len, 1}; - CHECK_STATUS(maskDesc->setDescriptor(y->dt, mask_shape, mask_strides), STATUS_SUCCESS); + CHECK_STATUS(maskDesc->setDescriptor(toAclDataType(y->dt), mask_shape, mask_strides), STATUS_SUCCESS); // Create aclTensor CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); @@ -118,7 +118,7 @@ infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, // malloc mask space auto &maskAddr = (*desc_ptr)->maskAddr; auto mask_size = numElements(maskDesc->shape.data(), maskDesc->ndim) * ele_size; - maskAddr = mallocWorkspace(mask_size); + CHECK_STATUS(mallocWorkspace(&maskAddr, mask_size), STATUS_SUCCESS); // copy mask matrix to device mem ret = aclrtMemcpy(maskAddr, @@ -181,7 +181,7 @@ infiniopStatus_t aclnnDestroyCausalSoftmaxDescriptor(CausalSoftmaxAclnnDescripto delete desc->maskDesc; delete desc->outDesc; aclDestroyAclOpExecutor(desc->executor); - freeWorkspace(desc->maskAddr); + CHECK_STATUS(freeWorkspace(desc->maskAddr), STATUS_SUCCESS); delete desc; return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.cc b/src/ops/causal_softmax/bang/causal_softmax_bang.cc index e0e32ca8..cc9b6d37 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.cc +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.cc @@ -33,7 +33,7 @@ infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size) { if (desc->ndim > 3) 
{ *size = desc->ndim * sizeof(int) * 2; } else { diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.h b/src/ops/causal_softmax/bang/causal_softmax_bang.h index a2e503f9..c9e09921 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.h +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.h @@ -21,11 +21,11 @@ infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, CausalSoftmaxBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc); -infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, unsigned long int *size); +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size); infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream); diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu index 57c445a3..bd7fd1af 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu @@ -787,7 +787,7 @@ void causal_softmax_bang_f16(CausalSoftmaxBangDescriptor_t desc, void *workspace infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream) { if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { @@ -798,4 +798,4 @@ infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, return STATUS_SUCCESS; } return STATUS_BAD_TENSOR_DTYPE; -} \ No newline at end of file +} diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc index 5e27cdf1..02adc37f 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc @@ -38,7 +38,7 @@ infiniopStatus_t 
cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size) { *size = sizeof(bool) * desc->dims[0] * desc->dims[1] * desc->dims[2] * desc->dims[3]; return STATUS_SUCCESS; } @@ -52,7 +52,7 @@ infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_ infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream) { if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h index 74b35bf6..feaf274e 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h @@ -22,11 +22,11 @@ infiniopStatus_t cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, CausalSoftmaxCnnlDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc); -infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size); infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream); diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cc b/src/ops/causal_softmax/cuda/causal_softmax.cc index 12e16e33..c7f4d5ed 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cc +++ b/src/ops/causal_softmax/cuda/causal_softmax.cc @@ -1,11 +1,11 @@ #include "causal_softmax.cuh" -#include "../../utils.h" #include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" infiniopStatus_t 
cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, CausalSoftmaxCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y) { - unsigned long int ndim = y->ndim; + uint64_t ndim = y->ndim; // TODO: only support 2d or 3d tensor if (ndim != 2 && ndim != 3) { return STATUS_BAD_TENSOR_SHAPE; @@ -13,12 +13,12 @@ infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, if (!dtype_eq(y->dt, F16)) { return STATUS_BAD_TENSOR_DTYPE; } - unsigned long int total_seq_len = y->shape[ndim - 1]; - unsigned long int seq_len = y->shape[ndim - 2]; - unsigned long int batch_size = 1; - unsigned long int stride_b = 0; - unsigned long int stride_i = y->strides[ndim - 2]; - unsigned long int stride_j = y->strides[ndim - 1]; + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; if (stride_j != 1) { return STATUS_BAD_TENSOR_STRIDES; } @@ -44,7 +44,7 @@ infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cu b/src/ops/causal_softmax/cuda/causal_softmax.cu index 280420a7..09fd1741 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cu +++ b/src/ops/causal_softmax/cuda/causal_softmax.cu @@ -218,17 +218,17 @@ __global__ void fused_softmax_standard( } -void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void* y, void *stream) { - unsigned long int total_seq_len = desc->total_seq_len; - unsigned long int seq_len = desc->seq_len; - unsigned long int batch_size = desc->batch_size; - unsigned long int stride_x = desc->stride_b; - unsigned 
long int stride_y = desc->stride_i; - unsigned long int stride_z = desc->stride_j;// covert byte strides to element strides +void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void *y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides unsigned int max_items_per_thread = desc->max_items_per_thread; dim3 grid(batch_size, seq_len); - + if (max_items_per_thread == 1) { fused_softmax_padding <<>>((half *) (y), stride_x, stride_y, stride_z); @@ -243,13 +243,13 @@ void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void* y, void infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, - void *stream){ - if(cudaSetDevice(desc->device_id) != cudaSuccess){ + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { return STATUS_BAD_DEVICE; } - if (dtype_eq(desc->dtype, F16)){ + if (dtype_eq(desc->dtype, F16)) { causal_softmax_nv_gpu_f16(desc, data, stream); return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cuh b/src/ops/causal_softmax/cuda/causal_softmax.cuh index a2f1f8df..30516bee 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cuh +++ b/src/ops/causal_softmax/cuda/causal_softmax.cuh @@ -1,19 +1,19 @@ #ifndef __CUDA_CAUSAL_SOFTMAX_H__ #define __CUDA_CAUSAL_SOFTMAX_H__ -#include "operators.h" #include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" struct CausalSoftmaxCudaDescriptor { Device device; int device_id; DT dtype; - unsigned long int batch_size; - unsigned long int stride_b; - unsigned long int seq_len; - unsigned long int stride_i; - unsigned long int total_seq_len; - unsigned long int stride_j; + uint64_t batch_size; + 
uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; unsigned int max_items_per_thread; }; @@ -23,11 +23,11 @@ infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, CausalSoftmaxCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc); -infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *data, void *stream); diff --git a/src/ops/concat/cpu/concat_cpu.cc b/src/ops/concat/cpu/concat_cpu.cc new file mode 100644 index 00000000..6c9bd419 --- /dev/null +++ b/src/ops/concat/cpu/concat_cpu.cc @@ -0,0 +1,139 @@ +#include "concat_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateConcatDescriptor( + infiniopHandle_t handle, + ConcatCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis) { + if (y == nullptr || x == nullptr || desc_ptr == nullptr || num_inputs == 0) { + return STATUS_BAD_PARAM; + } + + if (!is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int64_t ndim = y->ndim; + if (axis >= ndim || axis < -ndim) { + return STATUS_BAD_PARAM; + } + + if(axis < 0){ + axis = axis + ndim; + } + + uint64_t total_size = 0; + std::vector> input_shapes(num_inputs); + + std::vector output_shape(y->shape, y->shape + ndim); + + for (size_t i = 0; i < num_inputs; ++i) { + + if (!is_contiguous(x[i])) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (x[i]->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (x[i]->ndim != ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t j = 0; j < ndim; ++j) { + if (j != axis && x[i]->shape[j] != 
y->shape[j]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + input_shapes[i] = std::vector(x[i]->shape, x[i]->shape + ndim); + total_size += x[i]->shape[axis]; + } + + if (total_size != y->shape[axis]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new ConcatCpuDescriptor{ + DevCpu, + y->dt, + axis, + num_inputs, + input_shapes, + output_shape, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyConcatDescriptor(ConcatCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t concatCompute(const ConcatCpuDescriptor_t& desc, + T* y, + void const** x) { + int64_t axis = desc->axis; + uint64_t num_inputs = desc->num_inputs; + const std::vector>& input_shapes = desc->input_shapes; + const std::vector& output_shape = desc->output_shape; + + size_t blockOffsetInner = 1; + for (size_t i = output_shape.size() - 1; i > axis; --i) { + blockOffsetInner *= output_shape[i]; + } + size_t blockOffset = output_shape[axis] * blockOffsetInner; + + for (size_t i = 0; i < num_inputs; ++i) { + const std::vector& input_shape = input_shapes[i]; + + size_t dimOffset = 0; + for (size_t j = 0; j < i; ++j) { + dimOffset += input_shapes[j][axis]; + } + + size_t localBlockOffset = 1; + for (size_t j = input_shape.size() - 1; j >= axis && j != static_cast(-1); --j) { + localBlockOffset *= input_shape[j]; + } + + size_t innerOffset = blockOffsetInner * dimOffset; + size_t inSize = 1; + for (auto dim : input_shape) { + inSize *= dim; + } + + T* input_data = static_cast(const_cast(x[i])); + + #pragma omp parallel for + for (size_t iOffset = 0; iOffset < inSize; ++iOffset) { + + size_t oOffset = iOffset % localBlockOffset + innerOffset + + iOffset / localBlockOffset * blockOffset; + + y[oOffset] = input_data[iOffset]; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuConcat(ConcatCpuDescriptor_t desc, + void *y, + void const **x, + void *stream) { + if (desc->dtype == F16) { + return concatCompute(desc, reinterpret_cast(y), x); 
+ } + if (desc->dtype == F32) { + return concatCompute(desc, reinterpret_cast(y), x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/concat/cpu/concat_cpu.h b/src/ops/concat/cpu/concat_cpu.h new file mode 100644 index 00000000..a8d4d71d --- /dev/null +++ b/src/ops/concat/cpu/concat_cpu.h @@ -0,0 +1,32 @@ +#ifndef __CPU_CONCAT_H__ +#define __CPU_CONCAT_H__ +#include "operators.h" +#include +#include + +struct ConcatCpuDescriptor { + Device device; + DT dtype; + int64_t axis; + uint64_t num_inputs; + std::vector> input_shapes; + std::vector output_shape; +}; + +typedef struct ConcatCpuDescriptor *ConcatCpuDescriptor_t; + +infiniopStatus_t cpuCreateConcatDescriptor(infiniopHandle_t handle, + ConcatCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis); + +infiniopStatus_t cpuConcat(ConcatCpuDescriptor_t desc, + void *y, + void const **x, + void *stream); + +infiniopStatus_t cpuDestroyConcatDescriptor(ConcatCpuDescriptor_t desc); + +#endif diff --git a/src/ops/concat/cuda/concat.cc b/src/ops/concat/cuda/concat.cc new file mode 100644 index 00000000..d99d167b --- /dev/null +++ b/src/ops/concat/cuda/concat.cc @@ -0,0 +1,73 @@ +#include "concat.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateConcatDescriptor(CudaHandle_t handle, + ConcatCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis){ + if (y == nullptr || x == nullptr || desc_ptr == nullptr || num_inputs == 0) { + return STATUS_BAD_PARAM; + } + + if (!is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int64_t ndim = y->ndim; + if (axis >= ndim || axis < -ndim) { + return STATUS_BAD_PARAM; + } + + if(axis < 0){ + axis = axis + ndim; + } + uint64_t total_size = 0; + + std::vector> input_shapes(num_inputs); + std::vector output_shape(y->shape, y->shape + ndim); + + for (size_t i 
= 0; i < num_inputs; ++i) { + + if (!is_contiguous(x[i])) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (x[i]->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (x[i]->ndim != ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t j = 0; j < ndim; ++j) { + if (j != axis && x[i]->shape[j] != y->shape[j]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + input_shapes[i] = std::vector(x[i]->shape, x[i]->shape + ndim); + total_size += x[i]->shape[axis]; + } + + if (total_size != y->shape[axis]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new ConcatCudaDescriptor{ + DevNvGpu, + y->dt, + axis, + num_inputs, + input_shapes, + output_shape, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyConcatDescriptor(ConcatCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/concat/cuda/concat.cu b/src/ops/concat/cuda/concat.cu new file mode 100644 index 00000000..4d4ae521 --- /dev/null +++ b/src/ops/concat/cuda/concat.cu @@ -0,0 +1,86 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "concat.cuh" + +// Kernel function to perform concatenation on NvGPU +template +__global__ void concatKernel(const T* x, T* y, + size_t inSize, + size_t localBlockOffset, + size_t innerOffset, + size_t blockOffset) { + size_t iOffset = blockIdx.x * blockDim.x + threadIdx.x; + if (iOffset < inSize) { + size_t oOffset = (iOffset % localBlockOffset) + innerOffset + + (iOffset / localBlockOffset) * blockOffset; + y[oOffset] = x[iOffset]; + } +} + +template +infiniopStatus_t concatCompute(ConcatCudaDescriptor_t& desc, + T* y, + void const** x, + cudaStream_t stream) { + int64_t axis = desc->axis; + uint64_t num_inputs = desc->num_inputs; + const std::vector>& input_shapes = desc->input_shapes; + const std::vector& output_shape = desc->output_shape; + + size_t blockOffsetInner = 1; + for (size_t i = output_shape.size() - 1; i > axis; --i) { + blockOffsetInner *= 
output_shape[i]; + } + size_t blockOffset = output_shape[axis] * blockOffsetInner; + +#pragma unroll + for (size_t i = 0; i < num_inputs; ++i) { + const std::vector& input_shape = input_shapes[i]; + + size_t dimOffset = 0; + for (size_t j = 0; j < i; ++j) { + dimOffset += input_shapes[j][axis]; + } + + size_t localBlockOffset = 1; + for (size_t j = input_shape.size() - 1; j >= axis && j != static_cast(-1); --j) { + localBlockOffset *= input_shape[j]; + } + + size_t innerOffset = blockOffsetInner * dimOffset; + size_t inSize = 1; + for (auto dim : input_shape) { + inSize *= dim; + } + + T* input_data = static_cast(const_cast(x[i])); + + // Launch CUDA kernel + int threads = 256; + int blocks = (inSize + threads - 1) / threads; + concatKernel<<>>(input_data, y, inSize, localBlockOffset, innerOffset, blockOffset); + + // Check for CUDA errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return STATUS_EXECUTION_FAILED; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaConcat(ConcatCudaDescriptor_t desc, + void* y, + void const** x, + void* stream) { + cudaStream_t cudaStream = reinterpret_cast(stream); + + if (desc->dtype == F16) { + return concatCompute(desc, reinterpret_cast(y), x, cudaStream); + } + if (desc->dtype == F32) { + return concatCompute(desc, reinterpret_cast(y), x, cudaStream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/concat/cuda/concat.cuh b/src/ops/concat/cuda/concat.cuh new file mode 100644 index 00000000..9eeaf06f --- /dev/null +++ b/src/ops/concat/cuda/concat.cuh @@ -0,0 +1,36 @@ +#ifndef __CUDA_CONCAT_H__ +#define __CUDA_CONCAT_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include +#include + +struct ConcatCudaDescriptor { + Device device; + DT dtype; + int64_t axis; + uint64_t num_inputs; + std::vector> input_shapes; + std::vector output_shape; +}; + +typedef struct 
ConcatCudaDescriptor *ConcatCudaDescriptor_t; + +infiniopStatus_t cudaCreateConcatDescriptor(CudaHandle_t handle, + ConcatCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t nums_input, + int64_t axis); + +infiniopStatus_t cudaConcat(ConcatCudaDescriptor_t desc, + void *y, + void const **x, + void *stream); + +infiniopStatus_t cudaDestroyConcatDescriptor(ConcatCudaDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/src/ops/concat/musa/concat_musa.cc b/src/ops/concat/musa/concat_musa.cc new file mode 100644 index 00000000..e4666dda --- /dev/null +++ b/src/ops/concat/musa/concat_musa.cc @@ -0,0 +1,73 @@ +#include "concat_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateConcatDescriptor(MusaHandle_t handle, + ConcatMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis){ + if (y == nullptr || x == nullptr || desc_ptr == nullptr || num_inputs == 0) { + return STATUS_BAD_PARAM; + } + + if (!is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int64_t ndim = y->ndim; + if (axis >= ndim || axis < -ndim) { + return STATUS_BAD_PARAM; + } + + if(axis < 0){ + axis = axis + ndim; + } + uint64_t total_size = 0; + + std::vector> input_shapes(num_inputs); + std::vector output_shape(y->shape, y->shape + ndim); + + for (size_t i = 0; i < num_inputs; ++i) { + + if (!is_contiguous(x[i])) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (x[i]->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (x[i]->ndim != ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t j = 0; j < ndim; ++j) { + if (j != axis && x[i]->shape[j] != y->shape[j]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + input_shapes[i] = std::vector(x[i]->shape, x[i]->shape + ndim); + total_size += x[i]->shape[axis]; + } + + if (total_size != y->shape[axis]) { + return STATUS_BAD_TENSOR_SHAPE; 
+ } + + *desc_ptr = new ConcatMusaDescriptor{ + DevMtGpu, + y->dt, + axis, + num_inputs, + input_shapes, + output_shape, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyConcatDescriptor(ConcatMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/concat/musa/concat_musa.h b/src/ops/concat/musa/concat_musa.h new file mode 100644 index 00000000..6bdea49d --- /dev/null +++ b/src/ops/concat/musa/concat_musa.h @@ -0,0 +1,35 @@ +#ifndef __MUSA_CONCAT_H__ +#define __MUSA_CONCAT_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct ConcatMusaDescriptor { + Device device; + DT dtype; + int64_t axis; + uint64_t num_inputs; + std::vector> input_shapes; + std::vector output_shape; +}; + +typedef struct ConcatMusaDescriptor *ConcatMusaDescriptor_t; + +infiniopStatus_t musaCreateConcatDescriptor(MusaHandle_t handle, + ConcatMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t nums_input, + int64_t axis); + +infiniopStatus_t musaConcat(ConcatMusaDescriptor_t desc, + void *y, + void const **x, + void *stream); + +infiniopStatus_t musaDestroyConcatDescriptor(ConcatMusaDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/src/ops/concat/musa/concat_musa.mu b/src/ops/concat/musa/concat_musa.mu new file mode 100644 index 00000000..734cf905 --- /dev/null +++ b/src/ops/concat/musa/concat_musa.mu @@ -0,0 +1,86 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "concat_musa.h" + +// Kernel function to perform concatenation on MtGPU +template +__global__ void concatKernel(const T* x, T* y, + size_t inSize, + size_t localBlockOffset, + size_t innerOffset, + size_t blockOffset) { + size_t iOffset = blockIdx.x * blockDim.x + threadIdx.x; + if (iOffset < inSize) { + size_t oOffset = (iOffset % localBlockOffset) + 
innerOffset + + (iOffset / localBlockOffset) * blockOffset; + y[oOffset] = x[iOffset]; + } +} + +template +infiniopStatus_t concatCompute(ConcatMusaDescriptor_t& desc, + T* y, + void const** x, + musaStream_t stream) { + int64_t axis = desc->axis; + uint64_t num_inputs = desc->num_inputs; + const std::vector>& input_shapes = desc->input_shapes; + const std::vector& output_shape = desc->output_shape; + + size_t blockOffsetInner = 1; + for (size_t i = output_shape.size() - 1; i > axis; --i) { + blockOffsetInner *= output_shape[i]; + } + size_t blockOffset = output_shape[axis] * blockOffsetInner; + +#pragma unroll + for (size_t i = 0; i < num_inputs; ++i) { + const std::vector& input_shape = input_shapes[i]; + + size_t dimOffset = 0; + for (size_t j = 0; j < i; ++j) { + dimOffset += input_shapes[j][axis]; + } + + size_t localBlockOffset = 1; + for (size_t j = input_shape.size() - 1; j >= axis && j != static_cast(-1); --j) { + localBlockOffset *= input_shape[j]; + } + + size_t innerOffset = blockOffsetInner * dimOffset; + size_t inSize = 1; + for (auto dim : input_shape) { + inSize *= dim; + } + + T* input_data = static_cast(const_cast(x[i])); + + // Launch MUSA kernel + int threads = 256; + int blocks = (inSize + threads - 1) / threads; + concatKernel<<>>(input_data, y, inSize, localBlockOffset, innerOffset, blockOffset); + + // Check for MUSA errors + musaError_t err = musaGetLastError(); + if (err != musaSuccess) { + return STATUS_EXECUTION_FAILED; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaConcat(ConcatMusaDescriptor_t desc, + void* y, + void const** x, + void* stream) { + musaStream_t musaStream = reinterpret_cast(stream); + + if (desc->dtype == F16) { + return concatCompute(desc, reinterpret_cast(y), x, musaStream); + } + if (desc->dtype == F32) { + return concatCompute(desc, reinterpret_cast(y), x, musaStream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/concat/operator.cc 
b/src/ops/concat/operator.cc new file mode 100644 index 00000000..f35bade6 --- /dev/null +++ b/src/ops/concat/operator.cc @@ -0,0 +1,87 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/concat/concat.h" + +#ifdef ENABLE_CPU +#include "cpu/concat_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/concat.cuh" +#endif +#ifdef ENABLE_MT_GPU +#include "../../devices/musa/musa_handle.h" +#include "musa/concat_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateConcatDescriptor( + infiniopHandle_t handle, + infiniopConcatDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t *x, + uint64_t num_inputs, + int64_t axis) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateConcatDescriptor(handle, (ConcatCpuDescriptor_t *) desc_ptr, y, x, num_inputs, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateConcatDescriptor((CudaHandle_t) handle, (ConcatCudaDescriptor_t *) desc_ptr, y, x, num_inputs, axis); + } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu:{ + return musaCreateConcatDescriptor((MusaHandle_t) handle, (ConcatMusaDescriptor_t *) desc_ptr, y, x, num_inputs, axis); + } +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopConcat(infiniopConcatDescriptor_t desc, void *y, void const **x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuConcat((ConcatCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + printf("[INTO ENABLE_NV_GPU]\n"); + return cudaConcat((ConcatCudaDescriptor_t) desc, y, x, stream); + } + +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + + return musaConcat((ConcatMusaDescriptor_t) desc, y, x, stream); + } +#endif + + } + return STATUS_BAD_TENSOR_SHAPE; +} + + +__C infiniopStatus_t infiniopDestroyConcatDescriptor(infiniopConcatDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return 
cpuDestroyConcatDescriptor((ConcatCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyConcatDescriptor((ConcatCudaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaDestroyConcatDescriptor((ConcatMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/conv/cpu/conv_cpu.cc b/src/ops/conv/cpu/conv_cpu.cc index dd198d97..ece37d0b 100644 --- a/src/ops/conv/cpu/conv_cpu.cc +++ b/src/ops/conv/cpu/conv_cpu.cc @@ -173,7 +173,8 @@ void _conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_siz Ydata *y, Xdata const *x, Xdata const *w) { if (desc->padded_x_size > 0) { auto padded_x = reinterpret_cast(workspace); - uint64_t padded_shape[desc->ndim]; + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); std::fill(padded_x, padded_x + desc->padded_x_size, 0); getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); diff --git a/src/ops/conv/cuda/conv.cc b/src/ops/conv/cuda/conv.cc index 9a352878..2ccabfda 100644 --- a/src/ops/conv/cuda/conv.cc +++ b/src/ops/conv/cuda/conv.cc @@ -25,7 +25,7 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, return STATUS_BAD_TENSOR_DTYPE; } - const auto new_ndim = std::max(4UL, ndim); + const uint64_t new_ndim = std::max(ndim, (uint64_t)4); // convert pads, strides, dilations into int32[] int32_t *pad = new int32_t[new_ndim]; int32_t *stride = new int32_t[new_ndim]; @@ -87,12 +87,12 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, // create and set tensor descriptors for y cudnnTensorDescriptor_t y_desc; - int outDim[new_ndim]; + std::vector outDim_(new_ndim); + auto outDim = outDim_.data(); checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(op_desc, x_desc, w_desc, new_ndim, outDim)); checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); 
checkCudnnError(cudnnSetTensorNdDescriptorEx(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), new_ndim, y_shape)); - // tuning: get the best algorithm int requestedAlgoCount = 1; checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, @@ -101,7 +101,8 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, int chosenAlgoIndex = 0; bool chosen = false; size_t workspace_size = 0; - cudnnConvolutionFwdAlgoPerf_t perf_results[requestedAlgoCount]; + std::vector perf_results_(requestedAlgoCount); + auto perf_results = perf_results_.data(); checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, [&](cudnnHandle_t handle) { return cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc, op_desc, y_desc, requestedAlgoCount, &algoCounts, perf_results); })); if (algoCounts < 1) { diff --git a/src/ops/conv/musa/conv_musa.cc b/src/ops/conv/musa/conv_musa.cc index 581bc287..d966956e 100644 --- a/src/ops/conv/musa/conv_musa.cc +++ b/src/ops/conv/musa/conv_musa.cc @@ -1,7 +1,6 @@ #include "conv_musa.h" #include "../../../devices/musa/common_musa.h" #include "../../utils.h" -#include infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, ConvMusaDescriptor_t *desc_ptr, @@ -16,6 +15,9 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) { return STATUS_BAD_TENSOR_SHAPE; } + if (ndim > 5 ) { + return STATUS_BAD_TENSOR_SHAPE; + } if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) { return STATUS_BAD_TENSOR_SHAPE; } @@ -26,11 +28,11 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, return STATUS_BAD_TENSOR_DTYPE; } - const auto new_ndim = std::max(4UL, ndim); + const uint64_t new_ndim = std::max(ndim, (uint64_t)4); // convert pads, strides, dilations into int32[] - int *pad = new int[new_ndim]; - int *stride = new int[new_ndim]; - int *dilation = new int[new_ndim]; + int32_t *pad = new 
int32_t[new_ndim]; + int32_t *stride = new int32_t[new_ndim]; + int32_t *dilation = new int32_t[new_ndim]; int64_t *x_shape = new int64_t[new_ndim]; int64_t *w_shape = new int64_t[new_ndim]; int64_t *y_shape = new int64_t[new_ndim]; @@ -60,28 +62,74 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, w_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); } - x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); - y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); - w_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + if (new_ndim == 5) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + } + else if (new_ndim == 4) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + } + else if (new_ndim == 3) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + } + else { + return STATUS_BAD_TENSOR_SHAPE; + } x_tensor->SetNdInfo((int) new_ndim, x_shape); y_tensor->SetNdInfo((int) new_ndim, y_shape); w_tensor->SetNdInfo((int) new_ndim, w_shape); + // musa::dnn::Status status1 = y_tensor->SetNdInfo((int) new_ndim, y_shape); + // if (status1 == musa::dnn::Status::SUCCESS) { + // std::cerr << "Success to set y_tensor." << std::endl; + // } + + // 设置卷积的填充、步长和膨胀 musa::dnn::Convolution* conv_operator = new musa::dnn::Convolution(); - conv_operator->SetNdInfo((int) new_ndim-2, pad, stride, dilation); + musa::dnn::Status status2 = conv_operator->SetNdInfo(new_ndim - 2, pad, stride, dilation); + // if (status2 == musa::dnn::Status::SUCCESS) { + // std::cerr << "Success to set convolution dimensions." << std::endl; + // } + + musa::dnn::Status status3 = conv_operator->SetComputeMode(musa::dnn::Convolution::ComputeMode::TENSOR); + // if (status3 == musa::dnn::Status::SUCCESS) { + // std::cerr << "Success to set compute mode." 
<< std::endl; + // // printf("status3: %s\n",status3); + // printf("SetComputeMode Status:%d\n", static_cast(status3)); + // } + musa::dnn::Convolution::Algorithm algo = musa::dnn::Convolution::Algorithm::DIRECT; - size_t workspace_size = 0; - use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, [&](musa::dnn::Handle* handle) { - printf(" %d \n", conv_operator->GetRecommendForwardAlgorithm(*handle, algo, *y_tensor, *x_tensor, *w_tensor)); - // printf(" %d \n", conv_operator->GetForwardWorkspaceSize(*handle, workspace_size, *y_tensor, *x_tensor, *w_tensor, algo)); - }); + + use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, + [&](musa::dnn::Handle* handle) {conv_operator->GetRecommendForwardAlgorithm(*handle, algo, *y_tensor, *x_tensor, *w_tensor);}); + + size_t workspace_size = 2; + // printf("workspace_size before: %zu\n", workspace_size); + + use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, + [&](musa::dnn::Handle* handle) { + musa::dnn::Status status = conv_operator->GetForwardWorkspaceSize(*handle, workspace_size, *y_tensor, *x_tensor, *w_tensor, algo); + // printf("GetForwardWorkspaceSize status: %d\n", static_cast(status)); + }); + + // printf("workspace_size after: %zu\n", workspace_size); + const float alpha = 1.0f; const float beta = 0.0f; - printf("after: %d\n", algo); - printf("A\n"); + musa::dnn::MemoryMaintainer maintainer = [](size_t size) -> musa::dnn::MemoryHandler { + void* ptr = nullptr; + musaMalloc(&ptr, size); + return musa::dnn::MemoryHandler(ptr, [](void* p) { + if (p) musaFree(p); + }); + }; + + // musa::dnn::MemoryHandler workspace_mem = maintainer(workspace_size); *desc_ptr = new ConvMusaDescriptor{ DevMtGpu, @@ -95,7 +143,9 @@ infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle, algo, alpha, beta, - workspace_size}; + workspace_size, + maintainer + }; delete[] pad; delete[] stride; diff --git a/src/ops/conv/musa/conv_musa.h b/src/ops/conv/musa/conv_musa.h index d24baf25..959520d3 100644 --- 
a/src/ops/conv/musa/conv_musa.h +++ b/src/ops/conv/musa/conv_musa.h @@ -19,6 +19,7 @@ struct ConvMusaDescriptor { const float alpha; const float beta; uint64_t workspace_size; + musa::dnn::MemoryMaintainer maintainer; }; typedef struct ConvMusaDescriptor *ConvMusaDescriptor_t; diff --git a/src/ops/conv/musa/conv_musa.mu b/src/ops/conv/musa/conv_musa.mu index cb1debec..f8f7cf12 100644 --- a/src/ops/conv/musa/conv_musa.mu +++ b/src/ops/conv/musa/conv_musa.mu @@ -8,12 +8,13 @@ infiniopStatus_t conv_mt_gpu(ConvMusaDescriptor_t desc, void *workspace, uint64_ desc->y_tensor->SetAddr(y); desc->x_tensor->SetAddr(x); desc->w_tensor->SetAddr(w); - printf("b\n"); use_mudnn(desc->mudnn_handles_t, desc->device_id, (musaStream_t) stream, [&](musa::dnn::Handle* handle) { - desc->conv_operator->Run(*handle, *(desc->y_tensor), *(desc->x_tensor), *(desc->w_tensor), desc->algo, nullptr); - + desc->conv_operator->Run(*handle, *(desc->y_tensor), *(desc->x_tensor), *(desc->w_tensor), desc->algo, desc->maintainer); }); + + printf("[SUCCESS to execute conv_mt_gpu]\n"); + return STATUS_SUCCESS; } diff --git a/src/ops/expand/cuda/expand.cc b/src/ops/expand/cuda/expand.cc index cf43b326..d0467c01 100644 --- a/src/ops/expand/cuda/expand.cc +++ b/src/ops/expand/cuda/expand.cc @@ -24,7 +24,7 @@ infiniopStatus_t cudaCreateExpandDescriptor(CudaHandle_t handle, int64_t *x_strides_d, *y_strides_d; char *strides_and_shape_d; - checkCudaErrorWithCode(cudaMalloc(&strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d, x_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + ndim * sizeof(int64_t), y->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), 
STATUS_EXECUTION_FAILED); checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + 2 * ndim * sizeof(int64_t), y->shape, ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); diff --git a/src/ops/gemm/operator.cc b/src/ops/gemm/operator.cc index 071c2870..7036b032 100644 --- a/src/ops/gemm/operator.cc +++ b/src/ops/gemm/operator.cc @@ -21,8 +21,8 @@ __C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t hand infiniopTensorDescriptor_t c_desc, float alpha, float beta, - bool transA, - bool transB) { + char transA, + char transB) { // transpose a and b if needed a_desc = transA ? permute(a_desc, {1, 0}) : a_desc; b_desc = transB ? permute(b_desc, {1, 0}) : b_desc; diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cc b/src/ops/global_avg_pool/cuda/global_avg_pool.cc index da12cfb4..25d7acbe 100644 --- a/src/ops/global_avg_pool/cuda/global_avg_pool.cc +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cc @@ -82,13 +82,13 @@ infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, }; } else if (x->ndim <= 5) { - int x_shape[ndim]; - int x_strides[ndim]; - int y_shape[ndim]; - int y_strides[ndim]; - int k_shape[ndim - 2]; - int pads[ndim - 2]; - int strides[ndim - 2]; + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads(ndim - 2); + std::vector strides(ndim - 2); #pragma omp parallel for for (size_t i = 0; i < ndim; ++i) { @@ -109,7 +109,7 @@ infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, // create and set tensor descriptors for x cudnnTensorDescriptor_t x_desc; checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); - checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape, x_strides)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); // Create and set pooling 
descriptor for average pooling cudnnPoolingDescriptor_t pool_desc; @@ -118,14 +118,14 @@ infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, CUDNN_NOT_PROPAGATE_NAN, ndim - 2, - k_shape, - pads, - strides)); + k_shape.data(), + pads.data(), + strides.data())); // create and set tensor descriptors for y cudnnTensorDescriptor_t y_desc; checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); - checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape)); - checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape, y_strides)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); float alpha = 1.0f, beta = 0.0f; diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cuh b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh index 35e38d7b..cd97be5b 100644 --- a/src/ops/global_avg_pool/cuda/global_avg_pool.cuh +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh @@ -7,6 +7,7 @@ #include #include #include +#include struct GlobalAvgPoolCudaDescriptor { Device device; diff --git a/src/ops/global_avg_pool/musa/global_avg_pool_musa.cc b/src/ops/global_avg_pool/musa/global_avg_pool_musa.cc new file mode 100644 index 00000000..2739ce3b --- /dev/null +++ b/src/ops/global_avg_pool/musa/global_avg_pool_musa.cc @@ -0,0 +1,157 @@ +#include "global_avg_pool_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateGlobalAvgPoolDescriptor(MusaHandle_t handle, + GlobalAvgPoolMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim <= 2 || ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 0; i < ndim; ++i) { + if (i < 2 && y->shape[i] != x->shape[i]) { + 
return STATUS_BAD_TENSOR_SHAPE; + } else if (i >= 2 && y->shape[i] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + const uint64_t new_ndim = ndim; + + int64_t *x_shape = new int64_t[new_ndim]; + int64_t *y_shape = new int64_t[new_ndim]; + for(size_t i = 0; i < new_ndim; ++i){ + x_shape[i] = static_cast(x->shape[i]); + y_shape[i] = static_cast(y->shape[i]); + + } + + musa::dnn::Tensor *x_tensor = new musa::dnn::Tensor(); + musa::dnn::Tensor *y_tensor = new musa::dnn::Tensor(); + musa::dnn::Tensor *indices = new musa::dnn::Tensor(); + + x_tensor->SetNdInfo((int)new_ndim, x_shape); + y_tensor->SetNdInfo((int)new_ndim, y_shape); + indices->SetNdInfo((int)new_ndim, x_shape); + + if (y->dt == F16) { + x_tensor->SetType(musa::dnn::Tensor::Type::HALF); + y_tensor->SetType(musa::dnn::Tensor::Type::HALF); + indices->SetType(musa::dnn::Tensor::Type::HALF); + } else if (y->dt == F32) { + x_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); + y_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); + indices->SetType(musa::dnn::Tensor::Type::FLOAT); + } + + if (new_ndim == 5) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCDHW); + } + else if (new_ndim == 4) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCHW); + } + else if (new_ndim == 3) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + indices->SetFormat(musa::dnn::Tensor::Format::NCW); + } + else { + return STATUS_BAD_TENSOR_SHAPE; + } + + + int N = x_shape[0]; // batch size + int C = x_shape[1]; // channels + 
int D = x_shape[2]; + int H = x_shape[3]; // height + int W = x_shape[4]; // width + + int kernel[] = {D, H, W}; + int pad[] = {0, 0, 0}; + int stride[] = {D, H, W}; + int dilation[] = {1, 1, 1}; + + musa::dnn::Status status; + musa::dnn::Pooling *pool_desc = new musa::dnn::Pooling(); + + status = pool_desc->SetMode(musa::dnn::Pooling::Mode::GLOBAL_AVGPOOL); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetMode status:%d\n", static_cast(status)); + // } + + // status = pool_desc->SetNdInfo(3, kernel, pad, stride, dilation); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetNdInfo status:%d\n", static_cast(status)); + // } + + // status = pool_desc->SetNdInfo({H}, {0}, {H}, {1}); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetNdInfo status:%d\n", static_cast(status)); + // } + + // status = pool_desc->SetDivisor(H * W); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetDivisor status:%d\n", static_cast(status)); + // } + + const float alpha = 1.0f; + const float beta = 0.0f; + + *desc_ptr = new GlobalAvgPoolMusaDescriptor{ + DevMtGpu, + y->dt, + handle->device_id, + ndim, + 0, + 0, + 0, + 0, + 0, + 0, + handle->mudnn_handles_t, + x_tensor, + y_tensor, + pool_desc, + indices, + alpha, + beta, + }; + + delete[] x_shape; + delete[] y_shape; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolMusaDescriptor_t desc, uint64_t *size) { + *size = desc->ndim <= 5 ? 0 : (desc->dtype != F16 ? 
0 : std::min(desc->dtype.size * 2, 8) * desc->y_data_size); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolMusaDescriptor_t desc) { + if (desc->ndim <= 5) { + delete desc->x_desc; + delete desc->y_desc; + delete desc->pool_desc; + delete desc->indices; + } + desc->mudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/global_avg_pool/musa/global_avg_pool_musa.h b/src/ops/global_avg_pool/musa/global_avg_pool_musa.h new file mode 100644 index 00000000..ccb5ee6d --- /dev/null +++ b/src/ops/global_avg_pool/musa/global_avg_pool_musa.h @@ -0,0 +1,47 @@ +#ifndef __MUSA_GLOBAL_AVG_POOL_H__ +#define __MUSA_GLOBAL_AVG_POOL_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include +#include +#include + +struct GlobalAvgPoolMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t y_data_size; + uint64_t x_per_NC_data_size; + unsigned max_block_size; + uint64_t max_grid_size; + uint64_t items_per_thread; + std::shared_ptr> mudnn_handles_t; + musa::dnn::Tensor *x_desc; + musa::dnn::Tensor *y_desc; + musa::dnn::Pooling *pool_desc; + musa::dnn::Tensor *indices; + const float alpha; + const float beta; +}; + +typedef struct GlobalAvgPoolMusaDescriptor *GlobalAvgPoolMusaDescriptor_t; + +infiniopStatus_t musaCreateGlobalAvgPoolDescriptor(MusaHandle_t, + GlobalAvgPoolMusaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t musaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaGlobalAvgPool(GlobalAvgPoolMusaDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream); + +infiniopStatus_t musaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolMusaDescriptor_t desc); + +#endif \ No newline at end of file diff --git 
a/src/ops/global_avg_pool/musa/global_avg_pool_musa.mu b/src/ops/global_avg_pool/musa/global_avg_pool_musa.mu new file mode 100644 index 00000000..a55c3359 --- /dev/null +++ b/src/ops/global_avg_pool/musa/global_avg_pool_musa.mu @@ -0,0 +1,34 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "global_avg_pool_musa.h" + + +infiniopStatus_t global_avg_pool_mt_gpu(GlobalAvgPoolMusaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream, unsigned pack_size) { + // use muDNN lib + + checkMusaError(musaSetDevice(desc->device_id)); + desc->y_desc->SetAddr(y); + desc->x_desc->SetAddr(x); + + use_mudnn(desc->mudnn_handles_t, desc->device_id, (musaStream_t) stream, [&](musa::dnn::Handle* handle) { + desc->pool_desc->Run(*handle, *(desc->y_desc), *(desc->x_desc), *(desc->indices)); + }); + + printf("[SUCCESS to execute global_avg_pool_mt_gpu]\n"); + + return STATUS_SUCCESS; +} + + + +infiniopStatus_t musaGlobalAvgPool(GlobalAvgPoolMusaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + + if (desc->dtype == F32) { + return global_avg_pool_mt_gpu(desc, workspace, workspace_size, y, x, stream, 1); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/global_avg_pool/operator.cc b/src/ops/global_avg_pool/operator.cc index 92484283..30ed0de7 100644 --- a/src/ops/global_avg_pool/operator.cc +++ b/src/ops/global_avg_pool/operator.cc @@ -9,6 +9,10 @@ #include "../../devices/cuda/cuda_handle.h" #include "cuda/global_avg_pool.cuh" #endif +#ifdef ENABLE_MT_GPU +#include "../../devices/musa/musa_handle.h" +#include "musa/global_avg_pool_musa.h" +#endif #ifdef ENABLE_CAMBRICON_MLU // TODO: Cambricon #endif @@ -29,6 +33,12 @@ __C infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor( } #endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaCreateGlobalAvgPoolDescriptor((MusaHandle_t) handle, 
(GlobalAvgPoolMusaDescriptor_t *) desc_ptr, y, x); + } +#endif + #ifdef ENABLE_CAMBRICON_MLU // TODO #endif @@ -47,6 +57,12 @@ __C infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPool return cudaGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolCudaDescriptor_t) desc, size); } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolMusaDescriptor_t) desc, size); + } + #endif #ifdef ENABLE_CAMBRICON_MLU // TODO: Cambricon support @@ -67,6 +83,12 @@ __C infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t des return cudaGlobalAvgPool((GlobalAvgPoolCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaGlobalAvgPool((GlobalAvgPoolMusaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU // TODO @@ -86,6 +108,12 @@ __C infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoo return cudaDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolCudaDescriptor_t) desc); } +#endif +#ifdef ENABLE_MT_GPU + case DevMtGpu: { + return musaDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolMusaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU // TODO diff --git a/src/ops/matmul/ascend/matmul_aclnn.cc b/src/ops/matmul/ascend/matmul_aclnn.cc index 158e6d2c..1502469e 100644 --- a/src/ops/matmul/ascend/matmul_aclnn.cc +++ b/src/ops/matmul/ascend/matmul_aclnn.cc @@ -45,9 +45,9 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, auto &bDesc = (*desc_ptr)->bDesc; // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched operation - CHECK_STATUS(cDesc->setDescriptor(c_desc->dt, {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS); - CHECK_STATUS(aDesc->setDescriptor(a_desc->dt, {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, 
info->a_matrix.col_stride}), STATUS_SUCCESS); - CHECK_STATUS(bDesc->setDescriptor(b_desc->dt, {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS); CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS); CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); @@ -69,13 +69,12 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, // aclnnGemm support C = alpha * A @ B + beta * C // see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc, - (*desc_ptr)->mt, &workspaceSize, &executor); + (*desc_ptr)->mt, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret); - return STATUS_EXECUTION_FAILED); + LOG_PRINT("aclnnGemmGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); aclSetAclOpExecutorRepeatable(executor); - return STATUS_SUCCESS; } @@ -109,14 +108,14 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, aclrtSetDevice(desc->device_id); for (int i = 0; i < batch; i++) { - AclSetTensorAddr(executor, 0, ta, (char *)(a) + i * desc->info->a_matrix.stride * desc->dtype.size); - AclSetTensorAddr(executor, 1, tb, (char *)(b) + i * desc->info->b_matrix.stride * desc->dtype.size); - AclSetTensorAddr(executor, 2, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size); - AclSetTensorAddr(executor, 3, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); aclnnStatus ret = aclnnGemm(workspace, - workspaceSize, - executor, - stream); + workspaceSize, + executor, + stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnGemm failed. 
ERROR: %d\n", ret); return STATUS_EXECUTION_FAILED); diff --git a/src/ops/mlp/operator.cc b/src/ops/mlp/operator.cc index 1186a8dc..3cf7ab5d 100644 --- a/src/ops/mlp/operator.cc +++ b/src/ops/mlp/operator.cc @@ -26,7 +26,7 @@ __C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handl infiniopTensorDescriptor_t w12_desc, infiniopTensorDescriptor_t w3_desc, float alpha, - bool residual) { + char residual) { if (y_desc->ndim != 2 || x_desc->ndim != 2 || w12_desc->ndim != 2 || w3_desc->ndim != 2) { return STATUS_BAD_TENSOR_SHAPE; } diff --git a/src/ops/pooling/cpu/pooling_cpu.cc b/src/ops/pooling/cpu/pooling_cpu.cc index 6f411303..f5bd04d1 100644 --- a/src/ops/pooling/cpu/pooling_cpu.cc +++ b/src/ops/pooling/cpu/pooling_cpu.cc @@ -191,7 +191,8 @@ void _pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspa Ydata *y, Xdata const *x) { if (desc->padded_x_size > 0) { auto padded_x = reinterpret_cast(workspace); - uint64_t padded_shape[desc->ndim]; + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); std::fill(padded_x, padded_x + desc->padded_x_size, 0); getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); diff --git a/src/ops/pooling/cuda/pooling.cc b/src/ops/pooling/cuda/pooling.cc index 35f2f791..0cf45d64 100644 --- a/src/ops/pooling/cuda/pooling.cc +++ b/src/ops/pooling/cuda/pooling.cc @@ -91,16 +91,13 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, beta, }; } else { - int x_shape[ndim]; - int x_strides[ndim]; - int y_shape[ndim]; - int y_strides[ndim]; - int k_shape[ndim - 2]; - int pads_int[ndim - 2]; - int strides_int[ndim - 2]; - const auto kernel_ = reinterpret_cast(kernel_shape); - const auto pads_ = reinterpret_cast(pads); - const auto strides_ = reinterpret_cast(strides); + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector 
y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads_int(ndim - 2); + std::vector strides_int(ndim - 2); #pragma omp parallel for for (size_t i = 0; i < ndim; ++i) { @@ -109,9 +106,9 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, y_shape[i] = static_cast(y->shape[i]); y_strides[i] = static_cast(y->strides[i]); if (i < ndim - 2) { - k_shape[i] = static_cast(kernel_[i]); - pads_int[i] = static_cast(pads_[i]); - strides_int[i] = static_cast(strides_[i]); + k_shape[i] = static_cast(kernel_shape[i]); + pads_int[i] = static_cast(pads[i]); + strides_int[i] = static_cast(strides[i]); } } @@ -121,7 +118,7 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, // create and set tensor descriptors for x cudnnTensorDescriptor_t x_desc; checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); - checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape, x_strides)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); // Create and set pooling descriptor for average pooling cudnnPoolingDescriptor_t pool_desc; @@ -130,14 +127,14 @@ infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, getPoolingMode(pooling_type), CUDNN_NOT_PROPAGATE_NAN, ndim - 2, - k_shape, - pads_int, - strides_int)); + k_shape.data(), + pads_int.data(), + strides_int.data())); // create and set tensor descriptors for y cudnnTensorDescriptor_t y_desc; checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); - checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape)); - checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape, y_strides)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); *desc_ptr = new PoolingCudaDescriptor{ DevNvGpu, 
diff --git a/src/ops/pooling/cuda/pooling.cuh b/src/ops/pooling/cuda/pooling.cuh index ab26d280..dd080e1e 100644 --- a/src/ops/pooling/cuda/pooling.cuh +++ b/src/ops/pooling/cuda/pooling.cuh @@ -3,6 +3,7 @@ #include "../../../devices/cuda/cuda_handle.h" #include "operators.h" +#include struct PoolingCudaDescriptor { Device device; diff --git a/src/ops/pooling/musa/pooling_musa.cc b/src/ops/pooling/musa/pooling_musa.cc index ccb21b96..d7e1121c 100644 --- a/src/ops/pooling/musa/pooling_musa.cc +++ b/src/ops/pooling/musa/pooling_musa.cc @@ -32,44 +32,73 @@ infiniopStatus_t musaCreatePoolingDescriptor(MusaHandle_t handle, return STATUS_BAD_TENSOR_DTYPE; } - float alpha = 1.0f, beta = 0.0f; + const uint64_t new_ndim = ndim; - const auto kernel_ = reinterpret_cast(kernel_shape); - const auto pads_ = reinterpret_cast(pads); - const auto strides_ = reinterpret_cast(strides); + int64_t *x_shape = new int64_t[new_ndim]; + int64_t *y_shape = new int64_t[new_ndim]; + for(size_t i = 0; i < new_ndim; ++i){ + x_shape[i] = static_cast(x->shape[i]); + y_shape[i] = static_cast(y->shape[i]); - const auto x_shape = reinterpret_cast(x->shape); - const auto x_strides = reinterpret_cast(x->strides); - const auto y_shape = reinterpret_cast(y->shape); - const auto y_strides = reinterpret_cast(y->strides); + } musa::dnn::Tensor *x_tensor = new musa::dnn::Tensor(); musa::dnn::Tensor *y_tensor = new musa::dnn::Tensor(); - musa::dnn::Tensor *indices_tensor = new musa::dnn::Tensor(); + musa::dnn::Tensor *indices = new musa::dnn::Tensor(); + + x_tensor->SetNdInfo((int)new_ndim, x_shape); + y_tensor->SetNdInfo((int)new_ndim, y_shape); + indices->SetNdInfo((int)new_ndim, y_shape); if (y->dt == F16) { x_tensor->SetType(musa::dnn::Tensor::Type::HALF); y_tensor->SetType(musa::dnn::Tensor::Type::HALF); + indices->SetType(musa::dnn::Tensor::Type::HALF); } else if (y->dt == F32) { x_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); y_tensor->SetType(musa::dnn::Tensor::Type::FLOAT); + 
indices->SetType(musa::dnn::Tensor::Type::FLOAT); } - x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); - y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + if (new_ndim == 5) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCDHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCDHW); + } + else if (new_ndim == 4) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW); + indices->SetFormat(musa::dnn::Tensor::Format::NCHW); + } + else if (new_ndim == 3) { + x_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + y_tensor->SetFormat(musa::dnn::Tensor::Format::NCW); + indices->SetFormat(musa::dnn::Tensor::Format::NCW); + } + else { + return STATUS_BAD_TENSOR_SHAPE; + } - x_tensor->SetNdInfo((int) ndim, x_shape, x_strides); - y_tensor->SetNdInfo((int) ndim, y_shape, y_strides); + musa::dnn::Status status; + musa::dnn::Pooling* pooling_operator = new musa::dnn::Pooling(); - int *indice = new int[ndim]; + status = pooling_operator->SetMode(getPoolingMode(pooling_type)); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetMode status:%d\n", static_cast(status)); + // } - musa::dnn::Pooling* pooling_operator = new musa::dnn::Pooling(); - pooling_operator->SetMode(getPoolingMode(pooling_type)); + std::initializer_list kernel = {static_cast(kernel_shape[0]), static_cast(kernel_shape[1])}; + std::initializer_list pad = {static_cast(pads[0]), static_cast(pads[1])}; + std::initializer_list stride = {static_cast(strides[0]), static_cast(strides[1])}; + std::initializer_list dilationList = {1, 1}; - int* dilation_ = new int[n]; - std::fill(dilation_, dilation_+((int) n), 1); + status = pooling_operator->SetNdInfo(kernel, pad, stride, dilationList); + // if (status == musa::dnn::Status::SUCCESS) { + // printf("pool_desc SetNdInfo status:%d\n", static_cast(status)); + // } - pooling_operator->SetNdInfo((int) n, kernel_, pads_, 
strides_, (const int*) dilation_); + const float alpha = 1.0f; + const float beta = 0.0f; *desc_ptr = new PoolingMusaDescriptor{ DevMtGpu, @@ -78,11 +107,14 @@ infiniopStatus_t musaCreatePoolingDescriptor(MusaHandle_t handle, handle->mudnn_handles_t, x_tensor, y_tensor, - indices_tensor, + indices, pooling_operator, alpha, beta, }; + + delete[] x_shape; + delete[] y_shape; return STATUS_SUCCESS; } diff --git a/src/ops/pooling/musa/pooling_musa.h b/src/ops/pooling/musa/pooling_musa.h index 1b8daf11..73339b87 100644 --- a/src/ops/pooling/musa/pooling_musa.h +++ b/src/ops/pooling/musa/pooling_musa.h @@ -46,6 +46,7 @@ inline musa::dnn::Pooling::Mode getPoolingMode(int pooling_type) { return musa::dnn::Pooling::Mode::MAXPOOL; case 1: return musa::dnn::Pooling::Mode::AVGPOOL_COUNT_PAD; + // return musa::dnn::Pooling::Mode::AVGPOOL_COUNT_WITHOUT_PAD; default: return musa::dnn::Pooling::Mode::MAXPOOL; } diff --git a/src/ops/pooling/musa/pooling_musa.mu b/src/ops/pooling/musa/pooling_musa.mu index ece291ad..bb8081ca 100644 --- a/src/ops/pooling/musa/pooling_musa.mu +++ b/src/ops/pooling/musa/pooling_musa.mu @@ -12,6 +12,7 @@ infiniopStatus_t pooling_mt_gpu(PoolingMusaDescriptor_t desc, void *y, void cons desc->pool_operator->Run(*handle, *(desc->y_tensor), *(desc->x_tensor), *(desc->indices_tensor)); }); + printf("[SUCCESS to execute pooling_mt_gpu]\n"); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/ascend/random_sample.cc b/src/ops/random_sample/ascend/random_sample.cc new file mode 100644 index 00000000..b16159dc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.cc @@ -0,0 +1,153 @@ +#include "random_sample.h" + +RandomSampleAscendDescriptor::RandomSampleAscendDescriptor(Device _device) { + device = _device; + device_id = 0; + pDesc = new aclnnTensorDescriptor(); + topkIdxDesc = new aclnnTensorDescriptor(); + topkValDesc = new aclnnTensorDescriptor(); + resDesc = new aclnnTensorDescriptor(); +} + +infiniopStatus_t 
ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + if (result->ndim != 1 && result->shape[0] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr) = new RandomSampleAscendDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), STATUS_SUCCESS); + CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), STATUS_SUCCESS); + // Ascend aclnnTopk doesn't support U64 type + (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size) { + auto &pDesc = desc->pDesc; + *size = numElements(pDesc->shape.data(), pDesc->ndim) * aclDataTypeSize(pDesc->dataType) + + numElements(pDesc->shape.data(), pDesc->ndim) * sizeof(I64); + + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (topk <= 0 || topp < 0 || topp > 1.0) { + return STATUS_BAD_PARAM; + } + + if (random_val < 0 || random_val > 1.0) { + return STATUS_BAD_PARAM; + } + + auto &pDesc = desc->pDesc; + auto &topkIdxDesc = desc->topkIdxDesc; + auto &topkValDesc = desc->topkValDesc; + auto ndim = static_cast(pDesc->ndim); + auto voc = pDesc->shape[0]; + auto topk_ = topk <= voc ? topk : voc; + bool doSample = topk_ > 1 && temperature != 0 && topp != 0; + + auto topkShape = std::vector(pDesc->shape); + topkShape[ndim - 1] = doSample ? 
topk_ : 1; + + auto topkStrides = std::vector(pDesc->strides); + // Infer contiguous strides + topkStrides[ndim - 1] = 1; + for (int64_t i = ndim - 2; i >= 0; --i) { + topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1]; + } + + CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), STATUS_SUCCESS); + + // Infer data ptr + auto workspaceTmp = workspace; + auto topkValAddr = workspaceTmp; + workspaceTmp = (void *) ((uint8_t *) workspace + + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType)); + auto topkIdxAddr = workspaceTmp; + auto pAddr = (void *) probs; + + // Create aclTensor + CHECK_STATUS(pDesc->createTensor(pAddr), STATUS_SUCCESS); + CHECK_STATUS(topkValDesc->createTensor(topkValAddr), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), STATUS_SUCCESS); + if (!doSample) { + CHECK_STATUS(desc->resDesc->createTensor(result), STATUS_SUCCESS); + } + + // Do Topk calculate + uint64_t topkWorkspaceSize = 0; + aclOpExecutor *topkExecutor = nullptr; + auto ret = aclnnTopkGetWorkspaceSize(pDesc->t, + topkShape[ndim - 1], + ndim - 1, + true, + true, + topkValDesc->t, + doSample ? 
topkIdxDesc->t + : desc->resDesc->t, + &topkWorkspaceSize, + &topkExecutor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + void *topkWorkspace; + CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), STATUS_SUCCESS); + ret = aclnnTopk(topkWorkspace, + topkWorkspaceSize, + topkExecutor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + CHECK_STATUS(freeWorkspace(topkWorkspace), STATUS_SUCCESS); + + if (doSample) { + // Do softmax and topp random sample + CHECK_STATUS(random_sample_do( + pAddr, + result, + topkValAddr, + topkIdxAddr, + topk, + static_cast(pDesc->shape[0]), + topp, + temperature, + random_val, + pDesc->dataType, + stream), + STATUS_SUCCESS); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc) { + delete desc->pDesc; + delete desc->topkIdxDesc; + delete desc->topkValDesc; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/ascend/random_sample.h b/src/ops/random_sample/ascend/random_sample.h new file mode 100644 index 00000000..1ecc16fc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.h @@ -0,0 +1,52 @@ +#ifndef __ASCEND_RANDOM_SAMPLE_H__ +#define __ASCEND_RANDOM_SAMPLE_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include +#include +#include +#include + + +struct RandomSampleAscendDescriptor { + Device device; + int device_id; + aclnnTensorDescriptor_t pDesc; + aclnnTensorDescriptor_t topkValDesc; + aclnnTensorDescriptor_t topkIdxDesc; + aclnnTensorDescriptor_t resDesc; + RandomSampleAscendDescriptor(Device _device); +}; + +typedef struct RandomSampleAscendDescriptor *RandomSampleAscendDescriptor_t; + +infiniopStatus_t 
ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc); + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream); + +#endif diff --git a/src/ops/random_sample/ascend/random_sample_aclnn.cc b/src/ops/random_sample/ascend/random_sample_aclnn.cc deleted file mode 100644 index e888b2f9..00000000 --- a/src/ops/random_sample/ascend/random_sample_aclnn.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "random_sample_aclnn.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" - -RandomSampleAclnnDescriptor::RandomSampleAclnnDescriptor(Device _device) { - device = _device; - device_id = 0; - argMaxExecutor = nullptr; - pDesc = new aclnnTensorDescriptor(); - rDesc = new aclnnTensorDescriptor(); - random_val = 1.0; - topp = 0; - topk = 0; - temperature = 1.0; - argMaxWorkspaceSize = 0; -} - -infiniopStatus_t aclnnCreateRandomSampleDescriptor(AscendHandle_t handle, - RandomSampleAclnnDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t result, - infiniopTensorDescriptor_t probs) { - - (*desc_ptr) = new RandomSampleAclnnDescriptor(handle->device); - (*desc_ptr)->device_id = handle->device_id; - (*desc_ptr)->random_val = 0; - (*desc_ptr)->topp = 0; - (*desc_ptr)->topk = 0; - (*desc_ptr)->temperature = 1.0; - - auto &pDesc = (*desc_ptr)->pDesc; - auto &rDesc = 
(*desc_ptr)->rDesc; - - CHECK_STATUS(pDesc->fromInfiniOpTensorDescriptor(probs), STATUS_SUCCESS); - CHECK_STATUS(pDesc->createTensor(), STATUS_SUCCESS); - - result->dt = I64; - CHECK_STATUS(rDesc->fromInfiniOpTensorDescriptor(result), STATUS_SUCCESS); - CHECK_STATUS(rDesc->createTensor(), STATUS_SUCCESS); - - aclTensor *tp = pDesc->t; - aclTensor *tr = rDesc->t; - - aclnnStatus ret; - - // temp = prob / temperature - auto &argmaxWorkspaceSize = (*desc_ptr)->argMaxWorkspaceSize; - auto &argmaxExecutor = (*desc_ptr)->argMaxExecutor; - ret = aclnnArgMaxGetWorkspaceSize(tp, - 0, - true, - tr, - &argmaxWorkspaceSize, - &argmaxExecutor); - CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclnnArgMaxGetWorkspaceSize failed, ERROR: %d\n", ret); - return STATUS_EXECUTION_FAILED); - aclSetAclOpExecutorRepeatable(argmaxExecutor); - return STATUS_SUCCESS; -} - -infiniopStatus_t aclnnGetRandomSampleWorkspaceSize(RandomSampleAclnnDescriptor_t desc, uint64_t *size) { - *size = desc->argMaxWorkspaceSize; - return STATUS_SUCCESS; -} - -infiniopStatus_t aclnnRandomSample(RandomSampleAclnnDescriptor_t desc, - void *workspace, - uint64_t workspace_size, - void *result, - void const *probs, - float random_val, - float topp, - int topk, - float temperature, - void *stream) { - auto &pDesc = desc->pDesc; - auto &rDesc = desc->rDesc; - - aclTensor *tp = pDesc->t; - aclTensor *tr = rDesc->t; - - aclrtSetDevice(desc->device_id); - - auto &argmaxWorkspaceSize = desc->argMaxWorkspaceSize; - auto &argmaxExecutor = desc->argMaxExecutor; - - AclSetTensorAddr(argmaxExecutor, 0, tp, (void *) probs); - AclSetTensorAddr(argmaxExecutor, 1, tr, (void *) result); - auto ret = aclnnArgMax(workspace, - argmaxWorkspaceSize, - argmaxExecutor, - stream); - CHECK_RET(ret == ACL_SUCCESS, - LOG_PRINT("aclnnArgMax failed. 
ERROR: %d\n", ret); - return STATUS_EXECUTION_FAILED); - return STATUS_SUCCESS; -} - - -infiniopStatus_t aclnnDestroyRandomSampleDescriptor(RandomSampleAclnnDescriptor_t desc) { - delete desc->pDesc; - delete desc->rDesc; - aclDestroyAclOpExecutor(desc->argMaxExecutor); - delete desc; - - return STATUS_SUCCESS; -} diff --git a/src/ops/random_sample/ascend/random_sample_aclnn.h b/src/ops/random_sample/ascend/random_sample_aclnn.h deleted file mode 100644 index 8848cb99..00000000 --- a/src/ops/random_sample/ascend/random_sample_aclnn.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __ASCEND_RANDOM_SAMPLE_H__ -#define __ASCEND_RANDOM_SAMPLE_H__ - -#include "../../../devices/ascend/ascend_handle.h" -#include "../../../devices/ascend/tensor_aclnn.h" -#include "operators.h" -#include -#include -#include -#include - - -struct RandomSampleAclnnDescriptor { - Device device; - int device_id; - aclOpExecutor *argMaxExecutor; - aclnnTensorDescriptor_t pDesc; - aclnnTensorDescriptor_t rDesc; - float random_val; - float topp; - int topk; - float temperature; - uint64_t argMaxWorkspaceSize; - RandomSampleAclnnDescriptor(Device _device); -}; - -typedef struct RandomSampleAclnnDescriptor *RandomSampleAclnnDescriptor_t; - -infiniopStatus_t aclnnCreateRandomSampleDescriptor(AscendHandle_t handle, - RandomSampleAclnnDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t result, - infiniopTensorDescriptor_t probs); - -infiniopStatus_t aclnnGetRandomSampleWorkspaceSize(RandomSampleAclnnDescriptor_t desc, - uint64_t *size); - -infiniopStatus_t aclnnRandomSample(RandomSampleAclnnDescriptor_t desc, - void *workspace, - uint64_t workspace_size, - void *result, - void const *probs, - float random_val, - float topp, - int topk, - float temperature, - void *stream); - -infiniopStatus_t aclnnDestroyRandomSampleDescriptor(RandomSampleAclnnDescriptor_t desc); - - -#endif diff --git a/src/ops/random_sample/ascend/random_sample_kernel.cpp b/src/ops/random_sample/ascend/random_sample_kernel.cpp new file mode 
100644 index 00000000..18b482bc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample_kernel.cpp @@ -0,0 +1,232 @@ +#include "../../../../include/status.h" +#include "kernel_operator.h" + +using namespace AscendC; + +template +class KernelRandomSample { +public: + __aicore__ inline KernelRandomSample() {} + __aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + + topk = topk_; + voc = voc_; + topp = topp_; + temperature = temper_; + random = random_; + blockSize = 256 * 2; + + // CumSumInfo + if (sizeof(T) == sizeof(float)) { + topkAligned = (topk + 7) / 8 * 8; + vocAligned = (voc + 7) / 8 * 8; + } else { + topkAligned = (topk + 15) / 16 * 16; + vocAligned = (voc + 15) / 16 * 16; + } + topkIdxAligned = (topk + 3) / 4 * 4; + + // Set Gm + pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc); + topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk); + topkIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topkIdxAddr), topk); + resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1); + + // Global input and output + pipe.InitBuffer(pQue, 1, vocAligned * sizeof(T)); + pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T)); + pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t)); + pipe.InitBuffer(resQue, 1, 32);// 32 bytes for aligned + + pipe.InitBuffer(softMaxBuf1, blockSize); + pipe.InitBuffer(softMaxBuf2, blockSize); + pipe.InitBuffer(softMaxBuf3, blockSize); + pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T)); + + pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + // Softmax + __aicore__ inline void SoftMax(LocalTensor &valIn, + LocalTensor &topkValIn, + LocalTensor &softMaxOut) { + int32_t repeatTimes = vocAligned * sizeof(T) / blockSize; + int32_t remainder = vocAligned * sizeof(T) % 
blockSize / sizeof(T); + int32_t tileLength = blockSize / sizeof(T); + float negMax = -static_cast(topkValIn(0)); + float invTemperature = 1.0f / temperature; + float sum = 0.f; + float sum_s = 0.f; + LocalTensor tmpBuffer = softMaxBuf1.Get(); + LocalTensor tmpBuffer2 = softMaxBuf2.Get(); + LocalTensor tmpBuffer3 = softMaxBuf3.Get(); + for (int32_t i = 0; i < repeatTimes; i++) { + Adds(tmpBuffer, valIn[i * tileLength], static_cast(negMax), tileLength); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), tileLength); + Exp(tmpBuffer3, tmpBuffer2, tileLength); + sum_s = 0.f; + for (int j = 0; j < tileLength; ++j) { + sum_s += static_cast(tmpBuffer3(j)); + } + sum += sum_s; + } + if (remainder != 0) { + Adds(tmpBuffer, valIn[repeatTimes * tileLength], static_cast(negMax), remainder); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), remainder); + Exp(tmpBuffer3, tmpBuffer2, remainder); + sum_s = 0.f; + for (int i = 0; i < remainder; ++i) { + sum_s += static_cast(tmpBuffer3(i)); + } + sum += sum_s; + } + float invSum = 1.0f / sum; + Adds(tmpBuffer, topkValIn, static_cast(negMax), topk); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), topk); + Exp(tmpBuffer3, tmpBuffer2, topk); + Muls(softMaxOut, tmpBuffer3, static_cast(invSum), topk); + } + + // Cumsum + __aicore__ inline void InclusiveSum(LocalTensor &topkValIn, + LocalTensor &topkValOut) { + static constexpr CumSumConfig cumSumConfig{true, false, false}; + LocalTensor lastRowLocal; + CumSum(topkValOut, lastRowLocal, topkValIn, + {1, static_cast(topkAligned)}); + } + + // Random sample + __aicore__ inline void RandomSample(LocalTensor &valIn, + LocalTensor &Index, + LocalTensor &result) { + int end = 0; + for (end = 0; end < topk; end++) { + if (static_cast(valIn(end)) >= topp) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + auto randomVal = random * static_cast(valIn(end - 1)); + for (int i = 0; i < end; i++) { + if (randomVal < static_cast(valIn(i))) 
{ + result(0) = Index(i); + return; + } + } + result(0) = Index(end - 1); + } + + __aicore__ inline void CopyIn() { + LocalTensor pLocal = pQue.AllocTensor(); + LocalTensor topkValLocal = topkQue.AllocTensor(); + LocalTensor topkIdxLocal = topkIdxQue.AllocTensor(); + + DataCopy(pLocal, pGm, vocAligned); + DataCopy(topkValLocal, topkGm, topkAligned); + DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned); + + pQue.EnQue(pLocal); + topkQue.EnQue(topkValLocal); + topkIdxQue.EnQue(topkIdxLocal); + } + + __aicore__ inline void Compute() { + // Get input data + LocalTensor pLocal = pQue.DeQue(); + LocalTensor topkValLocal = topkQue.DeQue(); + + // SoftMax + LocalTensor softMaxOutLocal = softMaxOutBuf.Get(); + SoftMax(pLocal, topkValLocal, softMaxOutLocal); + + // InclusiveSum + LocalTensor inclusiveOutLocal = inclusiveSumOutBuf.Get(); + InclusiveSum(softMaxOutLocal, inclusiveOutLocal); + + // randomSample + LocalTensor topkIdxLocal = topkIdxQue.DeQue(); + LocalTensor resultLocal = resQue.AllocTensor(); + RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal); + + pQue.FreeTensor(pLocal); + topkQue.FreeTensor(topkValLocal); + topkIdxQue.FreeTensor(topkIdxLocal); + resQue.EnQue(resultLocal); + } + __aicore__ inline void CopyOut() { + LocalTensor resLocal = resQue.DeQue(); + DataCopy(resGm, resLocal, 32 / sizeof(int64_t)); + resQue.FreeTensor(resLocal); + } + +private: + GlobalTensor pGm; + GlobalTensor topkGm; + GlobalTensor topkIdxGm; + GlobalTensor resGm; + + TPipe pipe; + + TQue pQue; + TQue topkQue; + TQue topkIdxQue; + TQue resQue; + + TBuf softMaxBuf1; + TBuf softMaxBuf2; + TBuf softMaxBuf3; + TBuf softMaxOutBuf; + + TBuf inclusiveSumOutBuf; + + // Kernel params + int32_t topk; + int32_t voc; + float topp; + float temperature; + float random; + + int32_t topkAligned; + int32_t topkIdxAligned; + int32_t vocAligned; + int32_t blockSize; +}; + +extern "C" __global__ __aicore__ void +random_sample_kernel_f16(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR 
topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + KernelRandomSample op; + op.Init(p, res, topkAddr, topkIdxAddr, topk_, voc_, topp_, temper_, random_); + op.Process(); +} + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream) { + + switch (dtype) { + case 0: + return STATUS_SUCCESS; + case 1: + random_sample_kernel_f16<<<1, nullptr, stream>>>( + p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/bang/random_sample_bang.cc b/src/ops/random_sample/bang/random_sample_bang.cc index b1c7180e..ed1945da 100644 --- a/src/ops/random_sample/bang/random_sample_bang.cc +++ b/src/ops/random_sample/bang/random_sample_bang.cc @@ -28,7 +28,7 @@ infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size) { *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)) + sizeof(desc->dtype); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/bang/random_sample_bang.h b/src/ops/random_sample/bang/random_sample_bang.h index 1bb0b7d5..de830fbf 100644 --- a/src/ops/random_sample/bang/random_sample_bang.h +++ b/src/ops/random_sample/bang/random_sample_bang.h @@ -20,11 +20,11 @@ infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, RandomSampleBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); -infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, unsigned long int *size); +infiniopStatus_t 
bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size); infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *result, void const *probs, float random_val, diff --git a/src/ops/random_sample/bang/random_sample_bang.mlu b/src/ops/random_sample/bang/random_sample_bang.mlu index 5b6a0751..5fa66150 100644 --- a/src/ops/random_sample/bang/random_sample_bang.mlu +++ b/src/ops/random_sample/bang/random_sample_bang.mlu @@ -24,9 +24,9 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + taskDim * topk) * sizeof(T); uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum],必须要求maxNum >= max{step, topk} uint64_t *indGlobal = srcInd + maxNum;//[taskDim * topk] - + __sync_all(); - + T *src = (T *)nram_buffer;//[maxNum],必须要求maxNum >= max{step, topk} T *destSum = src + maxNum;//[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] @@ -36,14 +36,14 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t __bang_write_zero(destSumFinal, wSize); __memcpy(srcInd, indGdram, voc * sizeof(uint64_t), GDRAM2NRAM); - + if(step){ for(int i = 0; i < step; i++){ srcInd[i] = indStart + i; } __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); if(step >= topk){ - for(int i = 0; i < topk; i++){ + for(int i = 0; i < topk; i++){ for(int j = i + 1; j < step; j++){ if(src[i] < src[j]){ T tmp = src[i]; @@ -102,9 +102,9 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 for(int i = 0; i < strip ; i++){ __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); - } + } } - + __bang_reduce_sum(destSumFinal, destSum, wSize); } else{ @@ -116,27 +116,27 @@ __mlu_global__ void random_sampleX(T const *source, 
uint64_t *indices, uint64_t destSumFinal[0] = destSumFinal[0] - (maxNum - step);//把上面多加的(maxNum - step)减掉 } globalSum[0] = 0.0; - + __sync_all(); __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 - + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 - + if(taskId == 0){ __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 - + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); - + __bang_write_zero(destSum, 2 * topk); destSum[0] = srcGlobal[0]; for(int i = 1; i < topk; i++){ destSum[i] = destSum[i - 1] + srcGlobal[i]; } - + int end = 0; for(end = 0; end < topk; end++){ if(destSum[end] >= static_cast(topp)){ @@ -149,7 +149,7 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t else{ end = topk; } - + random_val *= destSum[end - 1]; for(int i = 0; i < end; i++){ if(random_val < destSum[i]){ @@ -164,7 +164,7 @@ __mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t template __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t *indGdram, T *globalTopk, T *globalSum, float random_val, float topp, int topk, float temperature, int voc){ const int maxNum = SRC_MAX_SIZE/sizeof(T); - + int wSize = 128 / sizeof(T); int segNum = maxNum / wSize; @@ -178,7 +178,7 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); - + char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + 2 * topk + taskDim * topk) * sizeof(T); uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum] uint64_t *topkInd = srcInd + maxNum;//[2 * topk] @@ -196,7 +196,7 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t srcInd[j] = r * taskSize + taskId * maxNum + j; } __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - for(int i = 0; i < topk; i++){ + for(int i = 0; i < topk; i++){ for(int j = i + 1; j < maxNum; j++){ if(src[i] < src[j]){ T tmp = src[i]; @@ -230,17 +230,17 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t } } } - - + + } - + if(step){ for(int j = 0; j < step; j++){ srcInd[j] = repeat * taskSize + indStart + j; } __memcpy(src, source + repeat * taskSize + indStart, step * sizeof(T), GDRAM2NRAM); if(step >= topk){ - for(int i = 0; i < topk; i++){ + for(int i = 0; i < topk; i++){ for(int j = i + 1; j < step; j++){ if(src[i] < src[j]){ T tmp = src[i]; @@ -289,11 +289,11 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t } } } - + __memcpy(globalTopk + taskId * topk, srcTopk, topk * sizeof(T), NRAM2GDRAM); __memcpy(indGdram + taskId * topk, topkInd, topk * sizeof(uint64_t), NRAM2GDRAM); __sync_all(); - + if(taskId == 0){ __memcpy(srcGlobal, globalTopk, taskDim * topk * sizeof(T), GDRAM2NRAM); __memcpy(indGlobal, indGdram, taskDim * topk * sizeof(uint64_t), GDRAM2NRAM); @@ -337,44 +337,44 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 for(int i = 0; i < strip ; i++){ __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); - } + } } - + __bang_reduce_sum(destSumFinal, destSum, wSize); } - + else{ for(int i = 0; i < maxNum; i++){ - + 
destSumFinal[0] += destSum[i]; } - + } if(step){ destSumFinal[0] = destSumFinal[0] - (maxNum - step);//把上面多加的(maxNum - step)减掉 } globalSum[0] = 0.0; - + __sync_all(); __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 - + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 - + if(taskId == 0){ __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 - + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); - + __bang_write_zero(srcTopk, 2 * topk); srcTopk[0] = srcGlobal[0]; for(int i = 1; i < topk; i++){ srcTopk[i] = srcTopk[i - 1] + srcGlobal[i]; } - + int end = 0; for(end = 0; end < topk; end++){ if(srcTopk[end] >= static_cast(topp)){ @@ -387,7 +387,7 @@ __mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t else{ end = topk; } - + random_val *= srcTopk[end - 1]; for(int i = 0; i < end; i++){ if(random_val < srcTopk[i]){ @@ -415,7 +415,7 @@ __mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t * T *src = (T *)nram_buffer; T *srcMax = src + maxNum; uint64_t index = 0; - + T newMax = -INFINITY; for(uint64_t r = 0; r < repeat; r++){ __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); @@ -424,7 +424,7 @@ __mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t * newMax = srcMax[0]; index = r * taskSize + taskId * maxNum + *((int64_t*)&srcMax[1]); } - + } if(step){ __bang_write_value(src, maxNum, -INFINITY); @@ -434,9 +434,9 @@ __mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t * newMax = srcMax[0]; index = indStart + *((int64_t*)&srcMax[1]); } - + } - + indGdram[taskId] = index; __sync_all(); if(taskId == 0){ @@ -462,7 +462,7 @@ void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, k_dim.y 
= 1; k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - + int taskNum = k_dim.x * k_dim.y * k_dim.z; if(topp > 0 && topk > 1){ const int maxNum = SRC_MAX_SIZE/sizeof(T); @@ -471,7 +471,7 @@ void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, uint64_t *indGdram = (uint64_t *)origin; T *globalTopk = (T *)indTmp; T *globalSum = globalTopk + taskNum * topk; - + if(voc >= taskNum * maxNum){ random_sampleD<<>>(logits_, index_, indGdram, globalTopk, globalSum, random_val, topp, topk, temperature, voc); } @@ -484,8 +484,8 @@ void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, random_sample<<>>(logits_, index_, indGdram, voc); } cnrtQueueSync(queue); - - + + } void random_sample_bang_f16(RandomSampleBangDescriptor_t desc, void *workspace, void *result, @@ -497,12 +497,12 @@ void random_sample_bang_f16(RandomSampleBangDescriptor_t desc, void *workspace, void *stream) { auto queue = reinterpret_cast(stream); int voc = desc->voc; - + random_sampleUnion(queue, workspace, probs, result, random_val, topp, topk, temperature, voc); } infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *result, void const *probs, float random_val, diff --git a/src/ops/random_sample/cpu/random_sample.cc b/src/ops/random_sample/cpu/random_sample.cc index 3706e1ea..28de5b93 100644 --- a/src/ops/random_sample/cpu/random_sample.cc +++ b/src/ops/random_sample/cpu/random_sample.cc @@ -31,7 +31,7 @@ infiniopStatus_t cpuCreateRandomSampleDescriptor(infiniopHandle_t, return STATUS_SUCCESS; } -infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, uint64_t *size) { *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/cuda/random_sample.cuh 
b/src/ops/random_sample/cuda/random_sample.cuh index 4230fabc..d3fff76d 100644 --- a/src/ops/random_sample/cuda/random_sample.cuh +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -19,7 +19,7 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); -infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, void *workspace, diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index a536ca19..022a113b 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -26,7 +26,7 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) { *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/operator.cc b/src/ops/random_sample/operator.cc index 9d3ef186..84b26ad0 100644 --- a/src/ops/random_sample/operator.cc +++ b/src/ops/random_sample/operator.cc @@ -12,7 +12,7 @@ #include "bang/random_sample_bang.h" #endif #ifdef ENABLE_ASCEND_NPU -#include "ascend/random_sample_aclnn.h" +#include "ascend/random_sample.h" #endif #ifdef ENABLE_MT_GPU #include "musa/random_sample_musa.h" @@ -37,8 +37,8 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnCreateRandomSampleDescriptor((AscendHandle_t) 
handle, - (RandomSampleAclnnDescriptor_t *) desc_ptr, result, probs); + return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle, + (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs); } #endif #ifdef ENABLE_MT_GPU @@ -69,7 +69,7 @@ __C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDe #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnGetRandomSampleWorkspaceSize((RandomSampleAclnnDescriptor_t) desc, size); + return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size); } #endif #ifdef ENABLE_MT_GPU @@ -107,7 +107,7 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnRandomSample((RandomSampleAclnnDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); } #endif #ifdef ENABLE_MT_GPU @@ -135,7 +135,7 @@ __C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleD #endif #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: { - return aclnnDestroyRandomSampleDescriptor((RandomSampleAclnnDescriptor_t) desc); + return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc); } #endif #ifdef ENABLE_MT_GPU diff --git a/src/ops/rearrange/ascend/rearrange_aclnn.cc b/src/ops/rearrange/ascend/rearrange_aclnn.cc index 4eead4a9..f1db82cd 100644 --- a/src/ops/rearrange/ascend/rearrange_aclnn.cc +++ b/src/ops/rearrange/ascend/rearrange_aclnn.cc @@ -56,24 +56,24 @@ infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, /// TODO: something is wrong with aclSetTensorAddr, do all the preparation here for now desc->dstDesc->t = aclCreateTensor(desc->dstDesc->shape.data(), - desc->dstDesc->ndim, - desc->dstDesc->dataType, - desc->dstDesc->strides.data(), - 
desc->dstDesc->offset, - desc->dstDesc->format, - desc->dstDesc->storageShape.data(), - desc->dstDesc->storageNdim, - dst); + desc->dstDesc->ndim, + desc->dstDesc->dataType, + desc->dstDesc->strides.data(), + desc->dstDesc->offset, + desc->dstDesc->format, + desc->dstDesc->storageShape.data(), + desc->dstDesc->storageNdim, + dst); desc->srcDesc->t = aclCreateTensor(desc->srcDesc->shape.data(), - desc->srcDesc->ndim, - desc->srcDesc->dataType, - desc->srcDesc->strides.data(), - desc->srcDesc->offset, - desc->srcDesc->format, - desc->srcDesc->storageShape.data(), - desc->srcDesc->storageNdim, - (void*)src); - + desc->srcDesc->ndim, + desc->srcDesc->dataType, + desc->srcDesc->strides.data(), + desc->srcDesc->offset, + desc->srcDesc->format, + desc->srcDesc->storageShape.data(), + desc->srcDesc->storageNdim, + (void *) src); + aclTensor *td = desc->dstDesc->t; aclTensor *ts = desc->srcDesc->t; aclOpExecutor *executor; @@ -82,7 +82,7 @@ infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, ts, &workspaceSize, &executor); - desc->workspaceAddr = mallocWorkspace(workspaceSize); + CHECK_STATUS(mallocWorkspace(&(desc->workspaceAddr), workspaceSize), STATUS_SUCCESS); // AclSetTensorAddr(executor, 0, td, dst); @@ -97,7 +97,7 @@ infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, desc->dstDesc->destroyTensor(); desc->srcDesc->destroyTensor(); - freeWorkspace(desc->workspaceAddr); + CHECK_STATUS(freeWorkspace(desc->workspaceAddr), STATUS_SUCCESS); return STATUS_SUCCESS; } diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cc b/src/ops/rearrange/cpu/rearrange_cpu.cc index 9dad108d..a5540727 100644 --- a/src/ops/rearrange/cpu/rearrange_cpu.cc +++ b/src/ops/rearrange/cpu/rearrange_cpu.cc @@ -11,41 +11,52 @@ infiniopStatus_t cpuCreateRearrangeDescriptor(infiniopHandle_t, if (!dtype_eq(dst->dt, src->dt)) { return STATUS_BAD_TENSOR_DTYPE; } - if (dst->ndim != src->ndim || dst->ndim < 2) { + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { return 
STATUS_BAD_TENSOR_SHAPE; } - std::vector shape; - std::vector strides_dst, strides_src; - auto ndim = dst->ndim; for (int i = 0; i < ndim; ++i) { if (dst->shape[i] != src->shape[i]) { return STATUS_BAD_TENSOR_SHAPE; } - shape.push_back(dst->shape[i]); - strides_dst.push_back(dst->strides[i]); - strides_src.push_back(src->strides[i]); } if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { return STATUS_BAD_TENSOR_STRIDES; } + + std::vector + shape(dst->shape, dst->shape + ndim); + std::vector + strides_dst(dst->strides, dst->strides + ndim), + strides_src(src->strides, src->strides + ndim); + unsigned int r = 0; - if (ndim == 2) { - r = dst->shape[0]; - } else if (ndim == 3) { - r = dst->shape[0] * dst->shape[1]; - } else { - for (int i = ndim - 3; i >= 1; --i) { - if (dst->shape[i] * dst->strides[i] != dst->strides[i - 1] || src->shape[i] * src->strides[i] != src->strides[i - 1]) { - return STATUS_BAD_TENSOR_STRIDES; + switch (ndim) { + case 1: + ndim = 2; + strides_dst.insert(strides_dst.begin(), shape[0]); + strides_src.insert(strides_src.begin(), shape[0]); + shape.insert(shape.begin(), 1); + case 2: + r = shape[0]; + break; + case 3: + r = shape[0] * shape[1]; + break; + default: + for (int i = ndim - 3; i >= 1; --i) { + if (shape[i] * strides_dst[i] != strides_dst[i - 1] || shape[i] * strides_src[i] != strides_src[i - 1]) { + return STATUS_BAD_TENSOR_STRIDES; + } } - } - r = std::accumulate(dst->shape, dst->shape + ndim - 1, 1, std::multiplies()); + r = std::accumulate(shape.begin(), shape.end() - 1, 1, std::multiplies{}); + break; } *desc_ptr = new RearrangeCpuDescriptor{ DevCpu, dst->dt, r, - ndim, shape, strides_dst, strides_src, @@ -70,11 +81,12 @@ inline int indices(uint64_t i, uint64_t ndim, std::vector strides, std: void reform_cpu(RearrangeCpuDescriptor_t desc, void *dst, void const *src) { auto dst_ptr = reinterpret_cast(dst); auto src_ptr = reinterpret_cast(src); - int bytes_size = desc->shape[desc->ndim - 1] * desc->dt.size; + auto ndim 
= desc->shape.size(); + int bytes_size = desc->shape[ndim - 1] * desc->dt.size; #pragma omp parallel for for (uint64_t i = 0; i < desc->r; ++i) { - auto dst_offset = indices(i, desc->ndim, desc->strides_dst, desc->shape); - auto src_offset = indices(i, desc->ndim, desc->strides_src, desc->shape); + auto dst_offset = indices(i, ndim, desc->strides_dst, desc->shape); + auto src_offset = indices(i, ndim, desc->strides_src, desc->shape); std::memcpy(dst_ptr + dst_offset * desc->dt.size, src_ptr + src_offset * desc->dt.size, bytes_size); } } diff --git a/src/ops/rearrange/cpu/rearrange_cpu.h b/src/ops/rearrange/cpu/rearrange_cpu.h index f75fe549..99cc62e6 100644 --- a/src/ops/rearrange/cpu/rearrange_cpu.h +++ b/src/ops/rearrange/cpu/rearrange_cpu.h @@ -7,7 +7,6 @@ struct RearrangeCpuDescriptor { Device device; DataLayout dt; uint64_t r; - uint64_t ndim; std::vector shape; std::vector strides_dst; std::vector strides_src; diff --git a/src/ops/rearrange/cuda/rearrange.cc b/src/ops/rearrange/cuda/rearrange.cc index 96e8a890..da23489b 100644 --- a/src/ops/rearrange/cuda/rearrange.cc +++ b/src/ops/rearrange/cuda/rearrange.cc @@ -7,13 +7,15 @@ infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, RearrangeCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) { - if (!dtype_eq(dst->dt, src->dt)) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { return STATUS_BAD_TENSOR_DTYPE; } - if (dst->ndim != src->ndim || dst->ndim < 2) { + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { return STATUS_BAD_TENSOR_SHAPE; } - auto ndim = dst->ndim; for (int i = 0; i < ndim; ++i) { if (dst->shape[i] != src->shape[i]) { return STATUS_BAD_TENSOR_SHAPE; @@ -22,52 +24,44 @@ infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { return STATUS_BAD_TENSOR_STRIDES; } - unsigned int r = 0, c = 0, b = 0; - unsigned int rsa = 0, csa = 0, rsb = 0, csb = 
0; - if (ndim == 2) { - c = dst->shape[0]; - b = dst->shape[1]; - csa = dst->strides[0]; - csb = src->strides[0]; - } else if (ndim == 3) { - r = dst->shape[0]; - c = dst->shape[1]; - b = dst->shape[2]; - csa = dst->strides[1]; - csb = src->strides[1]; - rsa = dst->strides[0]; - rsb = src->strides[0]; - } else { - for (int i = ndim - 3; i >= 1; --i) { - if (dst->shape[i] * dst->strides[i] != dst->strides[i - 1] || src->shape[i] * src->strides[i] != src->strides[i - 1]) { - return STATUS_BAD_TENSOR_STRIDES; - } - } - r = std::accumulate(dst->shape, dst->shape + ndim - 2, 1, std::multiplies()); - c = dst->shape[ndim - 2]; - b = dst->shape[ndim - 1]; - csa = dst->strides[ndim - 2]; - csb = src->strides[ndim - 2]; - rsa = dst->strides[ndim - 3]; - rsb = src->strides[ndim - 3]; - } - auto contiguous_bytes = b * dst->dt.size; - if (contiguous_bytes % WARP_SIZE != 0) { - return STATUS_BAD_PARAM; - } - auto bytes_per_thread = contiguous_bytes / WARP_SIZE; - if (bytes_per_thread <= 0 || bytes_per_thread > 32 || (bytes_per_thread & (bytes_per_thread - 1)) != 0) { - return STATUS_BAD_PARAM; + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; } - *desc_ptr = new RearrangeCudaDescriptor{ - handle->device, - handle->device_id, - rsa, - rsb, - csa, - csb, - r, c, b, - bytes_per_thread}; + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + 
(*desc_ptr)->src_cs *= dt.size; + return STATUS_SUCCESS; } infiniopStatus_t cudaDestroyRearrangeDescriptor(RearrangeCudaDescriptor_t desc) { diff --git a/src/ops/rearrange/cuda/rearrange.cu b/src/ops/rearrange/cuda/rearrange.cu index 68d3ddbf..04651f6b 100644 --- a/src/ops/rearrange/cuda/rearrange.cu +++ b/src/ops/rearrange/cuda/rearrange.cu @@ -4,11 +4,11 @@ template static __global__ void rearrange( void *__restrict__ dst, - unsigned int const rsa, - unsigned int const csa, + int const rsa, + int const csa, void const *__restrict__ src, - unsigned int const rsb, - unsigned int const csb, + int const rsb, + int const csb, unsigned int const ncols) { auto row = blockIdx.y, @@ -23,37 +23,44 @@ static __global__ void rearrange( reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; } - void rearrange_nv_gpu(RearrangeCudaDescriptor_t desc, void *y, void const *x, void *stream) { - unsigned long int rsa = desc->rsa, csa = desc->csa, rsb = desc->rsb, csb = desc->csb; - unsigned int r = desc->r, c = desc->c, b = desc->b, bytes_per_thread = desc->bytes_per_thread; - auto dst_ptr = static_cast(reinterpret_cast(y)); - rsa /= b; - csa /= b; - auto src_ptr = static_cast(reinterpret_cast(x)); - rsb /= b; - csb /= b; auto cuda_stream = reinterpret_cast(stream); - dim3 grid_dims = dim3((c + MAX_WARP_PER_BLOCK - 1) / MAX_WARP_PER_BLOCK, r); - dim3 block_dims = dim3(WARP_SIZE, (c + grid_dims.x - 1) / grid_dims.x); - switch (bytes_per_thread) { + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + cudaMemcpyAsync(y, x, unit, cudaMemcpyDeviceToDevice, cuda_stream); + return; + } + + auto warps = 1024 / WARP_SIZE; + auto grid = dim3((c + warps - 1) / warps, r); + auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { case 1: - rearrange<<>>(dst_ptr, 
rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 2: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 4: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 8: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 16: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; case 32: - rearrange<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); break; default: break; diff --git a/src/ops/rearrange/cuda/rearrange.cuh b/src/ops/rearrange/cuda/rearrange.cuh index 2b0da93e..f31f74b3 100644 --- a/src/ops/rearrange/cuda/rearrange.cuh +++ b/src/ops/rearrange/cuda/rearrange.cuh @@ -7,12 +7,8 @@ struct RearrangeCudaDescriptor { Device device; int device_id; - unsigned long int rsa; - unsigned long int rsb; - unsigned long int csa; - unsigned long int csb; - unsigned long int r, c, b; - unsigned long int bytes_per_thread; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; }; typedef struct RearrangeCudaDescriptor *RearrangeCudaDescriptor_t; diff --git a/src/ops/rms_norm/ascend/rms_norm_aclnn.cc b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc index e71f943a..d264be39 100644 --- a/src/ops/rms_norm/ascend/rms_norm_aclnn.cc +++ b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc @@ -62,7 +62,7 @@ infiniopStatus_t aclnnCreateRMSNormDescriptor(AscendHandle_t handle, for (int64_t i = xDesc->ndim - 2; i >= 0; --i) { rstd_strides[i] = rstd_strides[i + 1] * rstd_shape[i + 1]; } - CHECK_STATUS(rstdDesc->setDescriptor(F32, rstd_shape, rstd_strides), STATUS_SUCCESS); + CHECK_STATUS(rstdDesc->setDescriptor(toAclDataType(F32), rstd_shape, rstd_strides), STATUS_SUCCESS); if (wDesc->dataType != 
xDesc->dataType) { castDesc = new aclnnTensorDescriptor(); diff --git a/src/ops/rms_norm/bang/rms_norm_bang.cc b/src/ops/rms_norm/bang/rms_norm_bang.cc index 6d57d269..fbf7f689 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.cc +++ b/src/ops/rms_norm/bang/rms_norm_bang.cc @@ -16,8 +16,8 @@ infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, RMSNormBangDes return STATUS_BAD_TENSOR_SHAPE; } - unsigned long int stride_y = y_desc->strides[0]; - unsigned long int stride_x = x_desc->strides[0]; + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; auto w_datatype = w_desc->dt; *desc_ptr = new RMSNormBangDescriptor{ handle->device, @@ -33,7 +33,7 @@ infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, RMSNormBangDes return STATUS_SUCCESS; } -infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/rms_norm/bang/rms_norm_bang.h b/src/ops/rms_norm/bang/rms_norm_bang.h index 15210cd2..bfd94158 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.h +++ b/src/ops/rms_norm/bang/rms_norm_bang.h @@ -9,10 +9,10 @@ struct RMSNormBangDescriptor { Device device; int device_id; DT dtype; - unsigned long int n; - unsigned long int d; - unsigned long int stride_y; - unsigned long int stride_x; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; DT w_datatype; float epsilon; }; @@ -26,11 +26,11 @@ infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, infiniopTensorDescriptor_t w_desc, float epsilon); -infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, unsigned long int *size); +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size); infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t 
workspace_size, void *y, void const *x, void const *w, void *stream); diff --git a/src/ops/rms_norm/bang/rms_norm_bang.mlu b/src/ops/rms_norm/bang/rms_norm_bang.mlu index ac6c0d01..755e1e3c 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.mlu +++ b/src/ops/rms_norm/bang/rms_norm_bang.mlu @@ -18,7 +18,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, float const *weigh int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); if(dimsize >= maxNum){ - + char *nram_buffer1 = nram_buffer + (2 * maxNum + 3 * wSize) * sizeof(T); T *src = (T *)nram_buffer;//[maxNum] T *wet = src + maxNum;//[maxNum] @@ -43,7 +43,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, float const *weigh for(int s = 0; s < repeat; s++){ __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - + if(maxNum >= wSize){ for(int strip = segNum / 2; strip > 0; strip = strip / 2){ for(int j = 0; j < strip; j++){ @@ -111,7 +111,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, float const *weigh __bang_write_zero(srcTmp, wSize); float *wetTmp = (float *)nram_buffer1; - + int segNum = dimS / wSize; for(int i = indStart; i < indStart + step; i++){ @@ -159,9 +159,9 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - + if(dimsize >= maxNum){ - + T *src = (T *)nram_buffer;//[maxNum] T *wet = src + maxNum;//[maxNum] T *destSumFinal = wet + maxNum;//[wSize] @@ -184,7 +184,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i for(int s = 0; s < repeat; s++){ __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - + if(maxNum >= wSize){ for(int strip = segNum / 2; strip > 0; strip = strip / 2){ for(int j = 0; j < strip; j++){ @@ -241,14 +241,14 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i } } else{ - + T *src = (T *)nram_buffer;//[dimsize] T *wet = src + dimsize;//[dimsize] T *destSumFinal = wet + dimsize;//[wSize] T *destSum = destSumFinal + wSize;//[dimS] T *srcTmp = destSum + dimS;//[wSize] - + int segNum = dimS / wSize; for(int i = indStart; i < indStart + step; i++){ @@ -272,7 +272,7 @@ __mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, i else{ __memcpy(srcTmp, destSum, dimsize * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, srcTmp, wSize); - + } destSumFinal[0] /= dimsize; destSumFinal[0] += eps; @@ -309,7 +309,7 @@ void rms_normUnion(cnrtQueue_t queue, T *y, T const *x, Tw const *w, int stride_ } void rms_norm_bang_f16(RMSNormBangDescriptor_t desc, void *y, void const *x, void const *w, void *stream){ - auto queue = reinterpret_cast(stream); + auto queue = reinterpret_cast(stream); int n = static_cast(desc->n); int d = static_cast(desc->d); auto y_ = reinterpret_cast(y); @@ -328,11 +328,11 @@ void rms_norm_bang_f16(RMSNormBangDescriptor_t desc, void *y, void const *x, voi auto w_ = reinterpret_cast(w); rms_normUnion(queue, y_, x_, w_, stride_y, stride_x, epsilon, n, d); } - + } infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *y, 
void const *x, void const *w, void *stream){ if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { diff --git a/src/ops/rms_norm/bang/rms_norm_cnnl.cc b/src/ops/rms_norm/bang/rms_norm_cnnl.cc deleted file mode 100644 index 01e9aacd..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "rms_norm_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -RMSNormCnnlDescriptor::RMSNormCnnlDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); - - cnnlTensorDescriptor_t yDesc, xDesc, wDesc; - cnnlCreateTensorDescriptor(&yDesc); - cnnlCreateTensorDescriptor(&xDesc); - cnnlCreateTensorDescriptor(&wDesc); - setCnnlTensor(yDesc, y.layout); - setCnnlTensor(xDesc, x.layout); - setCnnlTensor(wDesc, w.layout); - - cnnlFuseNormDescriptor_t opDesc; - cnnlCreateFuseNormDescriptor(&opDesc); - cnnlSetFuseNormDescriptor(opDesc, epsilon, 1.0, true, - false, false, false, false, - CNNL_DTYPE_HALF, CNNL_TRANSFORMER_RMSNORM); - - void *workspace; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - size_t wsSize; - cnnlGetFuseNormWorkspaceSize(handle, opDesc, xDesc, &wsSize); - cnrtMalloc(&workspace, wsSize); - cnnlFuseNorm(handle, opDesc, xDesc, x.data, - wDesc, w.data, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace, wsSize, yDesc, y.data, nullptr, nullptr); - }); - - cnrtFree(workspace); - cnnlDestroyFuseNormDescriptor(opDesc); - cnnlDestroyTensorDescriptor(xDesc); - cnnlDestroyTensorDescriptor(yDesc); - cnnlDestroyTensorDescriptor(wDesc); -} diff --git 
a/src/ops/rms_norm/bang/rms_norm_cnnl.h b/src/ops/rms_norm/bang/rms_norm_cnnl.h deleted file mode 100644 index c76bf2d0..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __CNNL_RMS_NORM_H__ -#define __CNNL_RMS_NORM_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct RMSNormCnnlDescriptor { - Device device; - RMSNormCnnlDescriptor(Device device); -}; - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); - -#endif// __CNNL_RMS_NORM_H__ diff --git a/src/ops/rms_norm/cuda/rms_norm.cc b/src/ops/rms_norm/cuda/rms_norm.cc index a54b3616..92d34a99 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cc +++ b/src/ops/rms_norm/cuda/rms_norm.cc @@ -1,12 +1,12 @@ #include "rms_norm.cuh" -#include "../../utils.h" #include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t w_desc, - float epsilon) { + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { return STATUS_BAD_TENSOR_SHAPE; } @@ -18,8 +18,8 @@ infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDes return STATUS_BAD_TENSOR_SHAPE; } - unsigned long int stride_y = y_desc->strides[0]; - unsigned long int stride_x = x_desc->strides[0]; + int64_t stride_y = y_desc->strides[0]; + int64_t stride_x = x_desc->strides[0]; auto w_datatype = w_desc->dt; *desc_ptr = new RMSNormCudaDescriptor{ handle->device, @@ -35,7 +35,7 @@ infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDes return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t 
cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/rms_norm/cuda/rms_norm.cu b/src/ops/rms_norm/cuda/rms_norm.cu index aabbdc20..0dac45f0 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cu +++ b/src/ops/rms_norm/cuda/rms_norm.cu @@ -158,7 +158,7 @@ void rms_norm_nv_gpu_f16(RMSNormCudaDescriptor_t desc, void *y, void const *x, v infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *y, void const *x, void const *w, void *stream) { if (cudaSetDevice(desc->device_id) != cudaSuccess) { diff --git a/src/ops/rms_norm/cuda/rms_norm.cuh b/src/ops/rms_norm/cuda/rms_norm.cuh index 30701c2f..683011f2 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cuh +++ b/src/ops/rms_norm/cuda/rms_norm.cuh @@ -8,10 +8,10 @@ struct RMSNormCudaDescriptor { Device device; int device_id; DT dtype; - unsigned long int n; - unsigned long int d; - unsigned long int stride_y; - unsigned long int stride_x; + uint64_t n; + uint64_t d; + int64_t stride_y; + int64_t stride_x; DT w_datatype; float epsilon; }; @@ -25,11 +25,11 @@ infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, infiniopTensorDescriptor_t w_desc, float epsilon); -infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *y, void const *x, void const *w, void *stream); diff --git a/src/ops/rms_norm/operator.cc b/src/ops/rms_norm/operator.cc index 19752456..816ef5dc 100644 --- a/src/ops/rms_norm/operator.cc +++ b/src/ops/rms_norm/operator.cc @@ -13,7 +13,6 @@ #ifdef ENABLE_CAMBRICON_MLU #include "../../devices/bang/bang_handle.h" #include "bang/rms_norm_bang.h" -#include 
"bang/rms_norm_cnnl.h" #endif #ifdef ENABLE_ASCEND_NPU #include "ascend/rms_norm_aclnn.h" diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cc b/src/ops/rotary_embedding/cuda/rotary_embedding.cc index c92e6bd3..102eb474 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cc +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cc @@ -64,7 +64,7 @@ infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, unsigned long int *size) { +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size) { *size = 0; return STATUS_SUCCESS; } diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cu b/src/ops/rotary_embedding/cuda/rotary_embedding.cu index 99628248..a5f32a97 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cu +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cu @@ -4,7 +4,7 @@ static __global__ void padding_f16( half *__restrict__ x_, - unsigned long const *__restrict__ pos_, + uint64_t const *__restrict__ pos_, float const *__restrict__ sin_, float const *__restrict__ cos_, long const stride0, @@ -27,7 +27,7 @@ static __global__ void padding_f16( void rotary_embedding_nv_gpu_f16( RoPECudaDescriptor_t desc, half *t, - unsigned long const *pos, + uint64_t const *pos, float const *sin_, float const *cos_, void *stream) { auto nt = desc->seq_len, @@ -44,7 +44,7 @@ void rotary_embedding_nv_gpu_f16( infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *t, void const *pos_ids, void const *sin_table, @@ -56,7 +56,7 @@ infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, if (dtype_eq(desc->dtype, F16)) { rotary_embedding_nv_gpu_f16(desc, reinterpret_cast(t), - reinterpret_cast(pos_ids), + reinterpret_cast(pos_ids), reinterpret_cast(sin_table), reinterpret_cast(cos_table), stream); diff --git 
a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh index 6dd5ab11..36b14194 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh @@ -24,11 +24,11 @@ infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, infiniopTensorDescriptor_t sin_table, infiniopTensorDescriptor_t cos_table); -infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, unsigned long int *size); +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size); infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, void *workspace, - unsigned long int workspace_size, + uint64_t workspace_size, void *t, void const *pos_ids, void const *sin_table, diff --git a/src/ops/swiglu/ascend/swiglu_kernel.cpp b/src/ops/swiglu/ascend/swiglu_kernel.cpp index 839cd8ea..3dab674f 100644 --- a/src/ops/swiglu/ascend/swiglu_kernel.cpp +++ b/src/ops/swiglu/ascend/swiglu_kernel.cpp @@ -1,5 +1,5 @@ -#include "kernel_operator.h" #include "../../../../include/status.h" +#include "kernel_operator.h" using namespace AscendC; constexpr int32_t BUFFER_NUM = 1; @@ -141,27 +141,27 @@ __aicore__ inline void KernelSwiGLU::Process() { } __global__ __aicore__ void swiglu_kernel_f16(GM_ADDR c, GM_ADDR a, GM_ADDR b, - float beta, int32_t nt, int32_t dh, - int32_t sta, int32_t stb, int32_t stc, - uint32_t remainder, uint32_t base) { + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { KernelSwiGLU op; op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); op.Process(); } __global__ __aicore__ void swiglu_kernel_f32(GM_ADDR c, GM_ADDR a, GM_ADDR b, - float beta, int32_t nt, int32_t dh, - int32_t sta, int32_t stb, int32_t stc, - uint32_t remainder, uint32_t base) { + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { KernelSwiGLU op; 
op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); op.Process(); } extern "C" infiniopStatus_t swiglu_kernel_do(void *c, void *a, void *b, - float beta, int32_t nt, int32_t dh, - int32_t sta, int32_t stb, int32_t stc, - int dtype, void *stream) { + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + int dtype, void *stream) { // Tiling params auto base = static_cast(dh / BLOCK_NUM); diff --git a/src/ops/utils.h b/src/ops/utils.h index 86d6baa9..b48cf419 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -106,7 +106,14 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, // check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c, - uint64_t *broadcast_shape, uint64_t *padded_shape1, uint64_t *padded_shape2, uint64_t broadcast_ndim) { + uint64_t broadcast_ndim) { + std::vector + broadcast_shape_(broadcast_ndim), + padded_shape1_(broadcast_ndim), + padded_shape2_(broadcast_ndim); + auto broadcast_shape = broadcast_shape_.data(), + padded_shape1 = padded_shape1_.data(), + padded_shape2 = padded_shape2_.data(); if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) { return false; } @@ -118,7 +125,8 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor if (dst->ndim < src->ndim) { return false; } - uint64_t padded_shape[dst->ndim]; + std::vector padded_shape_(dst->ndim); + auto padded_shape = padded_shape_.data(); std::fill(padded_shape, padded_shape + dst->ndim, 1); std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim); for (size_t i = 0; i < dst->ndim; ++i) { @@ -131,11 +139,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor // check if the 
shape of tensor c is valid after broadcasting tensors a and b inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) { - uint64_t broadcast_ndim = std::max(a->ndim, b->ndim); - uint64_t broadcast_shape[broadcast_ndim]; - uint64_t padded_shape1[broadcast_ndim]; - uint64_t padded_shape2[broadcast_ndim]; - return isValidBroadcastShape(a, b, c, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim); + return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim)); } inline uint64_t get_byte_size(infiniopTensorDescriptor_t desc) { @@ -220,7 +224,7 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uin // split the dimension dim of a tensor descriptor into multiple dimensions inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector<uint64_t> &dims) { uint64_t ndim = desc->ndim; - if (static_cast<int>(desc->shape[dim]) != std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int>())) { + if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies<uint64_t>{})) { return nullptr; } uint64_t new_ndim = ndim + dims.size() - 1; diff --git a/xmake.lua b/xmake.lua index 4bd0dd7e..a9d5a882 100644 --- a/xmake.lua +++ b/xmake.lua @@ -1,4 +1,8 @@ add_rules("mode.debug", "mode.release") +-- Define color codes +local GREEN = '\27[0;32m' +local YELLOW = '\27[1;33m' +local NC = '\27[0m' -- No Color add_includedirs("include") @@ -72,6 +76,15 @@ end if has_config("nv-gpu") then add_defines("ENABLE_NV_GPU") + local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") + local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH") + if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + end + if CUDNN_ROOT ~= nil then + add_includedirs(CUDNN_ROOT .. 
"/include") + end + target("nv-gpu") set_kind("static") on_install(function (target) end) @@ -84,6 +97,9 @@ if has_config("nv-gpu") then if is_plat("windows") then add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") + if CUDNN_ROOT ~= nil then + add_linkdirs(CUDNN_ROOT .. "\\lib\\x64") + end else add_cuflags("-Xcompiler=-fPIC") add_culdflags("-Xcompiler=-fPIC") @@ -123,7 +139,7 @@ if has_config("cambricon-mlu") then local includedirs = table.concat(target:get("includedirs"), " ") local args = {"-c", sourcefile, "-o", objectfile, "-I/usr/local/neuware/include", "--bang-mlu-arch=mtp_592", "-O3", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread"} - + for _, includedir in ipairs(target:get("includedirs")) do table.insert(args, "-I" .. includedir) end @@ -132,8 +148,7 @@ if has_config("cambricon-mlu") then table.insert(target:objectfiles(), objectfile) end) -rule_end() - + rule_end() target("cambricon-mlu") set_kind("static") @@ -204,7 +219,7 @@ if has_config("ascend-npu") then add_links("libascendcl.so") add_links("libnnopbase.so") add_links("libopapi.so") - add_links("libruntime.so") + add_links("libruntime.so") add_linkdirs(ASCEND_HOME .. "/../../driver/lib64/driver") add_links("libascend_hal.so") local builddir = string.format( @@ -221,7 +236,7 @@ if has_config("ascend-npu") then os.exec("make") os.exec("cp $(projectdir)/src/devices/ascend/build/lib/libascend_kernels.a "..builddir.."/") os.cd(os.projectdir()) - + end) after_clean(function () local ascend_build_dir = path.join(os.projectdir(), "src/devices/ascend") @@ -229,9 +244,9 @@ if has_config("ascend-npu") then os.exec("make clean") os.cd(os.projectdir()) os.rm(builddir.. 
"/libascend_kernels.a") - + end) - rule_end() + rule_end() target("ascend-npu") -- Other configs @@ -242,7 +257,7 @@ if has_config("ascend-npu") then add_files("src/devices/ascend/*.cc", "src/ops/*/ascend/*.cc") add_cxflags("-lstdc++ -Wall -Werror -fPIC") - -- Add operator + -- Add operator add_rules("ascend-kernels") add_links(builddir.."/libascend_kernels.a") @@ -271,50 +286,10 @@ target("infiniop") add_files("src/devices/handle.cc") add_files("src/ops/*/operator.cc") add_files("src/tensor/*.cc") + after_build(function (target) print(YELLOW .. "You can install the libraries with \"xmake install\"" .. NC) end) - after_build(function (target) - local builddir = string.format( - "%s/build/%s/%s/%s", - os.projectdir(), - get_config("plat"), - get_config("arch"), - get_config("mode") - ) - - os.exec("mkdir -p $(projectdir)/lib/") - os.exec("cp " ..builddir.. "/libinfiniop.so $(projectdir)/lib/") - os.exec("cp -r $(projectdir)/include $(projectdir)/lib/") - -- Define color codes - local GREEN = '\27[0;32m' - local YELLOW = '\27[1;33m' - local NC = '\27[0m' -- No Color - - -- Get the current directory - local current_dir = os.curdir() - - -- Output messages with colors - os.exec("echo -e '" .. GREEN .. "Compilation completed successfully." .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "Install the libraries with \"xmake install\" or set INFINI_ROOT=" .. current_dir .. NC .. "'") - end) - - on_install(function (target) - local home_dir = os.getenv("HOME") - local infini_dir = home_dir .. "/.infini/" - - if os.isdir(infini_dir) then - print("~/.infini/ detected, duplicated contents will be overwritten.") - else - os.mkdir(infini_dir) - end - os.exec("cp -r " .. "$(projectdir)/lib " .. infini_dir) - - local GREEN = '\27[0;32m' - local YELLOW = '\27[1;33m' - local NC = '\27[0m' -- No Color - os.exec("echo -e '" .. GREEN .. "Installation completed successfully at ~/.infini/." .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. 
"To set the environment variables, please run the following command:" .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "echo \"export INFINI_ROOT=~/.infini/\" >> ~/.bashrc" .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "echo \"export LD_LIBRARY_PATH=:~/.infini/lib:$LD_LIBRARY_PATH\" >> ~/.bashrc" .. NC .. "'") - end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) + add_installfiles("include/(**/*.h)", {prefixdir = "include"}) + add_installfiles("include/*.h", {prefixdir = "include"}) target_end()