issue/1105 fix kernel header includes

PanZezhong1725 · PanZezhong1725 · commit 3dfb950ea92d · 2026-03-25T13:52:57.000+08:00
diff --git a/src/infiniop/devices/metax/metax_common.h b/src/infiniop/devices/metax/metax_common.h
@@ -6,9 +6,11 @@
 #ifdef ENABLE_METAX_MC_API
 #include <mcblas/mcblas.h>
 #include <mcdnn/mcdnn.h>
+#include <mcr/mc_runtime.h>
 #else
 #include <hcblas/hcblas.h>
 #include <hcdnn/hcdnn.h>
+#include <hcr/hc_runtime.h>
 #endif
 #include <functional>
 #include <memory>
diff --git a/src/infiniop/devices/metax/metax_kernel_common.h b/src/infiniop/devices/metax/metax_kernel_common.h
@@ -1,8 +1,12 @@
 #define INFINIOP_METAX_KERNEL __global__ void
 
 #ifdef ENABLE_METAX_MC_API
+#include <maca_bfloat16.h>
+#include <maca_fp16.h>
 #include <maca_fp8.h>
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
 #include <hpcc_fp8.h>
 #endif
 
diff --git a/src/infiniop/ops/addcmul/cuda/kernel.cuh b/src/infiniop/ops/addcmul/cuda/kernel.cuh
@@ -1,10 +1,6 @@
 #ifndef __ADDCMUL_CUDA_CUH__
 #define __ADDCMUL_CUDA_CUH__
 
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>
 
 namespace op::addcmul::cuda {
diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc
@@ -37,27 +37,36 @@ infiniStatus_t calculateArgWhere(
     const void *x) {
 
     const Tdata *x_data = reinterpret_cast<const Tdata *>(x);
-    // int64_t *y_data = reinterpret_cast<int64_t *>(y);
-    std::vector<size_t> positions;
-    // #pragma omp parallel for
+
+    std::vector<int64_t> positions;
+    const size_t ndim = info.shapes.size();
+
     for (size_t i = 0; i < info.num_elements; i++) {
-        size_t pos = 0, tem = i;
-        std::vector<size_t> position(info.strides.size());
-        for (size_t j = info.strides.size() - 1; j >= 0; j--) {
-            position[j] = tem % info.shapes[j];
-            tem /= info.shapes[j];
-            pos += position[j] * info.strides[j];
+        size_t pos = 0;
+        size_t tmp = i;
+
+        std::vector<int64_t> coord(ndim);
+
+        // unravel index
+        for (size_t j = ndim; j-- > 0;) {
+            coord[j] = tmp % info.shapes[j];
+            tmp /= info.shapes[j];
+            pos += coord[j] * info.strides[j];
         }
-        if (fabs(x_data[pos] - 0.0f) > 1e-5) {
-            for (auto p : position) {
-                positions.push_back(p);
+
+        // PyTorch semantics: != 0
+        if (x_data[pos] != Tdata(0)) {
+            for (size_t j = 0; j < ndim; j++) {
+                positions.push_back(coord[j]);
             }
         }
     }
 
+    *count = positions.size() / ndim;
+
     *y = new int64_t[positions.size()];
     memcpy(*y, positions.data(), positions.size() * sizeof(int64_t));
-    *count = positions.size() / info.strides.size();
+
     return INFINI_STATUS_SUCCESS;
 }
 
diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh
@@ -1,11 +1,6 @@
 #ifndef __ATANH_CUDA_H__
 #define __ATANH_CUDA_H__
 
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-
 namespace op::atanh::cuda {
 typedef struct AtanhOp {
 public:
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca
@@ -1,12 +1,9 @@
 #include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
 #include "../../../devices/metax/metax_kernel_common.h"
+
 #include "binary_cross_entropy_with_logits_metax.h"
-#if defined(ENABLE_METAX_MC_API)
-#include <mc_runtime.h>
-#else
-#include <hc_runtime.h>
-#endif
+
 #include <type_traits>
 
 namespace op::bce_with_logits::metax {
diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh
@@ -1,18 +1,6 @@
 #ifndef __EQUAL_CUDA_H__
 #define __EQUAL_CUDA_H__
 
-#if ENABLE_METAX_API
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>
 
 namespace op::equal::cuda {
diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh
@@ -2,18 +2,6 @@
 #define __HARDSWISH_CUDA_H__
 
 #include <cmath>
-#if ENABLE_METAX_API
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 
 namespace op::hardswish::cuda {
 
diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh
@@ -1,18 +1,6 @@
 #ifndef __HARDTANH_CUDA_H__
 #define __HARDTANH_CUDA_H__
 
-#if ENABLE_METAX_API
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>
 
 namespace op::hardtanh::cuda {
diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh
@@ -3,18 +3,6 @@
 
 #include <cmath>
 #include <type_traits>
-#if ENABLE_METAX_API
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 
 namespace op::hypot::cuda {
 
diff --git a/src/infiniop/ops/index_add/cuda/kernel.cuh b/src/infiniop/ops/index_add/cuda/kernel.cuh
@@ -1,20 +1,6 @@
 #ifndef __INDEX_ADD_CUDA_H__
 #define __INDEX_ADD_CUDA_H__
 
-#if ENABLE_METAX_API
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <cstdint>
 
 namespace op::index_add::cuda {
diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.maca b/src/infiniop/ops/index_add/metax/index_add_metax.maca
@@ -2,21 +2,12 @@
 #include "../../../devices/metax/metax_handle.h"
 #include "../../../devices/metax/metax_kernel_common.h"
 #include "../../../tensor.h"
+
 #include "../cuda/kernel.cuh"
 #include "index_add_metax.h"
 #include <cmath>
 #include <cstdio>
-#if defined(ENABLE_METAX_MC_API)
-#include <common/mc_library_types.h>
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#include <mcr/mc_runtime.h>
-#else
-#include <common/hc_library_types.h>
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#include <hcr/hc_runtime.h>
-#endif
+
 #include <vector>
 
 namespace op::index_add::metax {
diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu
@@ -1,3 +1,5 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include "../../../handle.h"
 
 // Iluvatar does not support atomic add yet
@@ -7,10 +9,6 @@
 
 #include "index_add_nvidia.cuh"
 #include <cstdint>
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 
 namespace op::index_add::nvidia {
 
diff --git a/src/infiniop/ops/index_copy/cuda/kernel.cuh b/src/infiniop/ops/index_copy/cuda/kernel.cuh
@@ -1,21 +1,6 @@
 #ifndef __INDEX_COPY_CUDA_H__
 #define __INDEX_COPY_CUDA_H__
 
-// #include <cuda_runtime.h>
-#if defined(ENABLE_METAX_API)
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-
 #include <cstdint>
 
 namespace op::index_copy::cuda {
diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca
@@ -5,19 +5,6 @@
 #include <cmath>
 #include <cstdio>
 #include <vector>
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#include <common/mc_library_types.h>
-#include <mcr/mc_runtime.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#include <common/hc_library_types.h>
-#include <hcr/hc_runtime.h>
-#endif
-#include "../../../tensor.h"
-#include "../cuda/kernel.cuh"
 
 #include "../../../tensor.h"
 #include "../cuda/kernel.cuh"
diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu
@@ -1,14 +1,11 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include "../../../handle.h"
-#include "../cuda/kernel.cuh" // 假设这是通用 kernel 头文件路径，或者是 index_copy_cuda.h
+
+#include "../cuda/kernel.cuh"
 #include "index_copy_nvidia.cuh"
 #include <cstdint>
 
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-
 namespace op::index_copy::nvidia {
 
 // ==================================================================
diff --git a/src/infiniop/ops/sigmoid/cuda/kernel.cuh b/src/infiniop/ops/sigmoid/cuda/kernel.cuh
@@ -1,12 +1,6 @@
 #ifndef __SIDMOID_CUDA_H__
 #define __SIDMOID_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-
 namespace op::sigmoid::cuda {
 typedef struct SigmoidOp {
 public:
@@ -20,8 +14,8 @@ public:
         } else if constexpr (std::is_same_v<T, half>) {
             half denominator = __hadd(__float2half(1.0f), hexp(__hneg(x)));
             return hrcp(denominator);
-        } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
-            __nv_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x))));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            cuda_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x))));
             return __float2bfloat16(1.0f) / denominator;
         } else if constexpr (std::is_same_v<T, float>) {
             if (x >= 0.0f) {
diff --git a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu
@@ -1,3 +1,5 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
 #include "../cuda/kernel.cuh"
 #include "sigmoid_nvidia.cuh"
 
@@ -43,7 +45,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F16:
         return _device_info->calculate<256, cuda::SigmoidOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::SigmoidOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SigmoidOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
         return _device_info->calculate<256, cuda::SigmoidOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
diff --git a/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh
@@ -1,21 +1,6 @@
 #ifndef __SMOOTH_L1_LOSS_CUDA_CUH__
 #define __SMOOTH_L1_LOSS_CUDA_CUH__
 
-#if defined(ENABLE_METAX_API)
-#if defined(ENABLE_METAX_MC_API)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <hpcc_bfloat16.h>
-#include <hpcc_fp16.h>
-#endif
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#endif
-
 #include <cmath>
 
 namespace op::smooth_l1_loss::cuda {
diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca
@@ -5,13 +5,6 @@
 #include <cmath>
 #include <cstdio>
 #include <vector>
-#if defined(ENABLE_METAX_MC_API)
-#include <common/mc_library_types.h>
-#include <mcr/mc_runtime.h>
-#else
-#include <common/hc_library_types.h>
-#include <hcr/hc_runtime.h>
-#endif
 
 #include "../../../tensor.h"
 #include "../cuda/kernel.cuh"
@@ -217,9 +210,7 @@ infiniStatus_t Descriptor::calculate(
         LAUNCH(__half);
         break;
     case INFINI_DTYPE_BF16:
-#if defined(__MACA__) || defined(ENABLE_METAX_MC_API)
         LAUNCH(cuda_bfloat16);
-#endif
         break;
     case INFINI_DTYPE_F32:
         LAUNCH(float);
diff --git a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu
diff --git a/src/infiniop/ops/take/cuda/kernel.cuh b/src/infiniop/ops/take/cuda/kernel.cuh
diff --git a/src/infiniop/ops/take/metax/take_metax.maca b/src/infiniop/ops/take/metax/take_metax.maca
diff --git a/src/infiniop/ops/take/nvidia/take_nvidia.cu b/src/infiniop/ops/take/nvidia/take_nvidia.cu
diff --git a/test/infinicore/ops/topk.py b/test/infinicore/ops/topk.py