diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index cd7c5cb4a..53b3a2f10 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -3,13 +3,9 @@ #include "ops/add.hpp" #include "ops/add_rms_norm.hpp" #include "ops/attention.hpp" -#include "ops/avg_pool1d.hpp" #include "ops/causal_softmax.hpp" -#include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" -#include "ops/hardswish.hpp" -#include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" #include "ops/matmul.hpp" #include "ops/ones.hpp" diff --git a/include/infinicore/ops/all.hpp b/include/infinicore/ops/all.hpp deleted file mode 100644 index 50d76f2d7..000000000 --- a/include/infinicore/ops/all.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" -#include -#include -namespace infinicore::op { -class All { -public: - using schema = void (*)(Tensor, Tensor, std::vector, bool); - static void execute(Tensor output, Tensor input, std::vector dim, bool keepdim = false); - static common::OpDispatcher &dispatcher(); -}; - -Tensor all(Tensor input, std::vector dim, bool keepdim = false); -void all_(Tensor output, Tensor input, std::vector dim, bool keepdim = false); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/avg_pool1d.hpp b/include/infinicore/ops/avg_pool1d.hpp deleted file mode 100644 index 4bf69bc2a..000000000 --- a/include/infinicore/ops/avg_pool1d.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" - -namespace infinicore::op { - -class AvgPool1d { -public: - using schema = void (*)(Tensor, Tensor, size_t, size_t, size_t); - static void execute(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding); - static common::OpDispatcher &dispatcher(); -}; - -Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0); -void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/cross_entropy.hpp b/include/infinicore/ops/cross_entropy.hpp deleted file mode 100644 index 9a6d446d2..000000000 --- a/include/infinicore/ops/cross_entropy.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" - -namespace infinicore::op { - -class CrossEntropy { -public: - // Schema 定义:函数指针类型 - // CrossEntropy 需要接收三个 Tensor: Output (Loss), Input (Logits), Target (Labels) - using schema = void (*)(Tensor, Tensor, Tensor); - - // 执行入口 - static void execute(Tensor output, Tensor input, Tensor target); - - // 分发器访问接口 - static common::OpDispatcher &dispatcher(); -}; - -// ================================================================== -// 对外 Functional API -// ================================================================== - -// 1. Out-of-place 接口: -// 输入 Logits 和 Target,内部自动创建 Output Tensor 并返回 -Tensor cross_entropy(Tensor input, Tensor target); - -// 2. Explicit Output 接口 (类似于 In-place 风格): -// 用户显式提供 Output Tensor 用于存储结果 -// 注意:虽然命名带有下划线 _,但通常 CrossEntropy 无法真正原地修改 input, -// 所以这里只是表示“写入指定的 output 内存” -void cross_entropy_(Tensor output, Tensor input, Tensor target); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/equal.hpp b/include/infinicore/ops/equal.hpp deleted file mode 100644 index 1a158bf1e..000000000 --- a/include/infinicore/ops/equal.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" - -namespace infinicore::op { - -class Equal { -public: - using schema = void (*)(Tensor, Tensor, Tensor); - - static void execute(Tensor out, Tensor a, Tensor b); - static common::OpDispatcher &dispatcher(); -}; - -Tensor equal(Tensor a, Tensor b); -void equal_(Tensor out, Tensor a, Tensor b); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/hardswish.hpp b/include/infinicore/ops/hardswish.hpp deleted file mode 100644 index 15313f461..000000000 --- a/include/infinicore/ops/hardswish.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" - -namespace infinicore::op { - -class Hardswish { -public: - using schema = void (*)(Tensor, Tensor); - static void execute(Tensor output, Tensor input); - static common::OpDispatcher &dispatcher(); -}; - -Tensor hardswish(Tensor input); -void hardswish_(Tensor output, Tensor input); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/hardtanh.hpp b/include/infinicore/ops/hardtanh.hpp deleted file mode 100644 index 511408fee..000000000 --- a/include/infinicore/ops/hardtanh.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" - -namespace infinicore::op { - -class HardTanh { -public: - using schema = void (*)(Tensor, Tensor, float, float); - static void execute(Tensor output, Tensor input, float min_val, float max_val); - static common::OpDispatcher &dispatcher(); -}; - -Tensor hardtanh(Tensor input, float min_val = -1.0f, float max_val = 1.0f); -void hardtanh_(Tensor output, Tensor input, float min_val = -1.0f, float max_val = 1.0f); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/sum.hpp b/include/infinicore/ops/sum.hpp deleted file mode 100644 index 0ead8de26..000000000 --- a/include/infinicore/ops/sum.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" -#include -#include - -namespace infinicore::op { -class Sum { -public: - using schema = void (*)(Tensor, Tensor, std::vector, bool); - static void execute(Tensor output, Tensor input, std::vector dim, bool keepdim = false); - static common::OpDispatcher &dispatcher(); -}; - -Tensor sum(Tensor input, std::vector dim, bool keepdim = false); -void sum_(Tensor output, Tensor input, std::vector dim, bool keepdim = false); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/topk.hpp b/include/infinicore/ops/topk.hpp deleted file mode 100644 index d8486112c..000000000 --- a/include/infinicore/ops/topk.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" -namespace infinicore::op { -class TopK { -public: - using schema = void (*)(Tensor, Tensor, Tensor, size_t, size_t, bool, bool); - static void execute(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest = true, bool sorted = true); - static common::OpDispatcher &dispatcher(); -}; - -std::pair topk(Tensor input, size_t k, size_t dim, bool largest = true, bool sorted = true); -void topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest = true, bool sorted = true); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/var.hpp b/include/infinicore/ops/var.hpp deleted file mode 100644 index d1e01e1bf..000000000 --- a/include/infinicore/ops/var.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" -#include -#include -#include -namespace infinicore::op { -class Var { -public: - using schema = void (*)(Tensor, Tensor, std::vector, bool, bool); // var_output, input, dim, unbiased, keepdim - static void execute(Tensor var_output, Tensor input, std::vector dim, bool unbiased = true, bool keepdim = false); - static common::OpDispatcher &dispatcher(); -}; - -Tensor var(Tensor input, std::vector dim, bool unbiased = true, bool keepdim = false); -void var_(Tensor var_output, Tensor input, std::vector dim, bool unbiased = true, bool keepdim = false); - -} // namespace infinicore::op diff --git a/include/infinicore/ops/var_mean.hpp b/include/infinicore/ops/var_mean.hpp deleted file mode 100644 index a9679187c..000000000 --- a/include/infinicore/ops/var_mean.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "../device.hpp" -#include "common/op.hpp" -#include -#include -#include -namespace infinicore::op { -class Var_Mean { -public: - using schema = void (*)(Tensor, Tensor, Tensor, std::vector, bool, bool); // var_output, mean_output, input, dim, unbiased, keepdim - static void execute(Tensor var_output, Tensor mean_output, Tensor input, std::vector dim, bool unbiased = true, bool keepdim = false); - static common::OpDispatcher &dispatcher(); -}; - -std::pair var_mean(Tensor input, std::vector dim, bool unbiased = true, bool keepdim = false); -void var_mean_(Tensor var_output, Tensor mean_output, Tensor input, std::vector dim, bool unbiased = true, bool keepdim = false); - -} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index f596a312b..11d42c1d1 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -4,7 +4,6 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" -#include "infiniop/ops/all.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" @@ -36,21 +35,11 @@ #include "infiniop/ops/softmax.h" #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" -#include "infiniop/ops/sum.h" #include "infiniop/ops/swiglu.h" #include "infiniop/ops/tanh.h" -#include "infiniop/ops/topk.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" -#include "infiniop/ops/var.h" -#include "infiniop/ops/var_mean.h" #include "infiniop/ops/zeros.h" #include "infiniop/tensor_descriptor.h" -#include "infiniop/ops/cross_entropy.h" -#include "infiniop/ops/hardswish.h" -#include "infiniop/ops/avg_pool1d.h" -#include "infiniop/ops/equal.h" -#include "infiniop/ops/hardtanh.h" - #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/all.h b/include/infiniop/ops/all.h deleted file mode 100644 index 41d74cf9a..000000000 --- a/include/infiniop/ops/all.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __INFINIOP_ALL_API_H__ -#define __INFINIOP_ALL_API_H__ - -#include "../operator_descriptor.h" -#include -#include -typedef struct InfiniopDescriptor *infiniopAllDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateAllDescriptor(infiniopHandle_t handle, - infiniopAllDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim); - -__INFINI_C __export infiniStatus_t infiniopGetAllWorkspaceSize(infiniopAllDescriptor_t desc, size_t *size); - -__INFINI_C __export infiniStatus_t infiniopAll(infiniopAllDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyAllDescriptor(infiniopAllDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/avg_pool1d.h b/include/infiniop/ops/avg_pool1d.h deleted file mode 100644 index 81c489dd7..000000000 --- a/include/infiniop/ops/avg_pool1d.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_API_H__ -#define __INFINIOP_AVG_POOL1D_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopAvgPool1dDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateAvgPool1dDescriptor( - infiniopHandle_t handle, - infiniopAvgPool1dDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - size_t kernel_size, - size_t stride, - size_t padding); - -__INFINI_C __export infiniStatus_t infiniopGetAvgPool1dWorkspaceSize( - infiniopAvgPool1dDescriptor_t desc, - size_t *size); - -__INFINI_C __export infiniStatus_t infiniopAvgPool1d( - infiniopAvgPool1dDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor( - infiniopAvgPool1dDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/cross_entropy.h b/include/infiniop/ops/cross_entropy.h deleted file mode 100644 index 2ebd4b168..000000000 --- a/include/infiniop/ops/cross_entropy.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __INFINIOP_CROSS_ENTROPY_API_H__ -#define __INFINIOP_CROSS_ENTROPY_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopCrossEntropyDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateCrossEntropyDescriptor( - infiniopHandle_t handle, - infiniopCrossEntropyDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t target_desc); - -__INFINI_C __export infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( - infiniopCrossEntropyDescriptor_t desc, - size_t *size); - -__INFINI_C __export infiniStatus_t infiniopCrossEntropy( - infiniopCrossEntropyDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - const void *target, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor( - infiniopCrossEntropyDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h deleted file mode 100644 index 90c4f3386..000000000 --- a/include/infiniop/ops/equal.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __INFINIOP_EQUAL_API_H__ -#define __INFINIOP_EQUAL_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateEqualDescriptor( - infiniopHandle_t handle, - infiniopEqualDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__INFINI_C __export infiniStatus_t infiniopGetEqualWorkspaceSize( - infiniopEqualDescriptor_t desc, - size_t *size); - -__INFINI_C __export infiniStatus_t infiniopEqual( - infiniopEqualDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyEqualDescriptor( - infiniopEqualDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h deleted file mode 100644 index 1cdeecf67..000000000 --- a/include/infiniop/ops/hardswish.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef __INFINIOP_HARDSWISH_API_H__ -#define __INFINIOP_HARDSWISH_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateHardSwishDescriptor( - infiniopHandle_t handle, - infiniopHardSwishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); - -__INFINI_C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize( - infiniopHardSwishDescriptor_t desc, - size_t *size); - -__INFINI_C __export infiniStatus_t infiniopHardSwish( - infiniopHardSwishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyHardSwishDescriptor( - infiniopHardSwishDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/hardtanh.h b/include/infiniop/ops/hardtanh.h deleted file mode 100644 index d2f98cedd..000000000 --- a/include/infiniop/ops/hardtanh.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef __INFINIOP_HARDTANH_API_H__ -#define __INFINIOP_HARDTANH_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopHardTanhDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateHardTanhDescriptor(infiniopHandle_t handle, - infiniopHardTanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - float min_val, - float max_val); - -__INFINI_C __export infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, - size_t *size); - -__INFINI_C __export infiniStatus_t infiniopHardTanh(infiniopHardTanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/sum.h b/include/infiniop/ops/sum.h deleted file mode 100644 index c97104c90..000000000 --- a/include/infiniop/ops/sum.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __INFINIOP_SUM_API_H__ -#define __INFINIOP_SUM_API_H__ - -#include "../operator_descriptor.h" -#include -#include -typedef struct InfiniopDescriptor *infiniopSumDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateSumDescriptor(infiniopHandle_t handle, - infiniopSumDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim); - -__INFINI_C __export infiniStatus_t infiniopGetSumWorkspaceSize(infiniopSumDescriptor_t desc, size_t *size); - -__INFINI_C __export infiniStatus_t infiniopSum(infiniopSumDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroySumDescriptor(infiniopSumDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/topk.h b/include/infiniop/ops/topk.h deleted file mode 100644 index 3eaf94289..000000000 --- a/include/infiniop/ops/topk.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __INFINIOP_TOPK_API_H__ -#define __INFINIOP_TOPK_API_H__ - -#include "../operator_descriptor.h" -#include -#include -typedef struct InfiniopDescriptor *infiniopTopKDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateTopKDescriptor(infiniopHandle_t handle, - infiniopTopKDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted); - -__INFINI_C __export infiniStatus_t infiniopGetTopKWorkspaceSize(infiniopTopKDescriptor_t desc, size_t *size); - -__INFINI_C __export infiniStatus_t infiniopTopK(infiniopTopKDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *values_output, - void *indices_output, - const void *input, - size_t k, - size_t dim, - bool largest, - bool sorted, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyTopKDescriptor(infiniopTopKDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/var.h b/include/infiniop/ops/var.h deleted file mode 100644 index 7dc601a94..000000000 --- a/include/infiniop/ops/var.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __INFINIOP_VAR_API_H__ -#define __INFINIOP_VAR_API_H__ - -#include "../operator_descriptor.h" -#include -#include -typedef struct InfiniopDescriptor *infiniopVarDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateVarDescriptor(infiniopHandle_t handle, - infiniopVarDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim); - -__INFINI_C __export infiniStatus_t infiniopGetVarWorkspaceSize(infiniopVarDescriptor_t desc, size_t *size); - -__INFINI_C __export infiniStatus_t infiniopVar(infiniopVarDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *var_output, - const void *input, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyVarDescriptor(infiniopVarDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/var_mean.h b/include/infiniop/ops/var_mean.h deleted file mode 100644 index 358a55636..000000000 --- a/include/infiniop/ops/var_mean.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __INFINIOP_VAR_MEAN_API_H__ -#define __INFINIOP_VAR_MEAN_API_H__ - -#include "../operator_descriptor.h" -#include -#include -typedef struct InfiniopDescriptor *infiniopVarMeanDescriptor_t; - -__INFINI_C __export infiniStatus_t infiniopCreateVarMeanDescriptor(infiniopHandle_t handle, - infiniopVarMeanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t mean_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim); - -__INFINI_C __export infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size); - -__INFINI_C __export infiniStatus_t infiniopVarMean(infiniopVarMeanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *var_output, - void *mean_output, - const void *input, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim, - void *stream); - -__INFINI_C __export infiniStatus_t infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc); - -#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 229792b39..0b3eb9655 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -49,10 +49,7 @@ ) from infinicore.ops.add import add from infinicore.ops.add_rms_norm import add_rms_norm -from infinicore.ops.all import all from infinicore.ops.attention import attention -from infinicore.ops.cross_entropy import cross_entropy -from infinicore.ops.equal import equal from infinicore.ops.kv_caching import kv_caching from infinicore.ops.matmul import matmul from infinicore.ops.mha_kvcache import mha_kvcache @@ -64,11 +61,7 @@ from infinicore.ops.paged_caching import paged_caching from infinicore.ops.rearrange import rearrange from infinicore.ops.squeeze import squeeze -from infinicore.ops.sum import sum -from infinicore.ops.topk import topk from infinicore.ops.unsqueeze import unsqueeze -from infinicore.ops.var import var -from infinicore.ops.var_mean import var_mean from infinicore.tensor import ( Tensor, empty, @@ -127,22 +120,16 @@ "uint8", # Operators. "add", - "addcmul", "add_rms_norm", "add_rms_norm_", - "atanh", "attention", - "binary_cross_entropy_with_logits", - "cdist", "kv_caching", "matmul", - "equal", "mul", "narrow", "squeeze", "unsqueeze", "rearrange", - "cross_entropy", "empty", "empty_like", "from_blob", @@ -155,15 +142,9 @@ "paged_attention", "paged_attention_prefill", "ones", - "reciprocal", "strided_empty", "strided_from_blob", "zeros", - "sum", - "var_mean", - "var", - "topk", - "all", ] use_ntops = False diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 46ff04ae4..934930d56 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,9 +1,6 @@ -from .avg_pool1d import avg_pool1d from .causal_softmax import causal_softmax from .embedding import embedding from .flash_attention import flash_attention -from .hardswish import hardswish -from .hardtanh import hardtanh from .linear import linear from .linear_w8a8i8 import linear_w8a8i8 from .random_sample import random_sample @@ -23,9 +20,6 @@ "RopeAlgo", "rope", "silu", - "hardswish", - "hardtanh", - "avg_pool1d", "swiglu", "linear_w8a8i8", "silu_and_mul", diff --git a/python/infinicore/nn/functional/avg_pool1d.py b/python/infinicore/nn/functional/avg_pool1d.py deleted file mode 100644 index 0cf4759ad..000000000 --- a/python/infinicore/nn/functional/avg_pool1d.py +++ /dev/null @@ -1,24 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def avg_pool1d( - input: Tensor, - kernel_size: int, - stride: int | None = None, - padding: int = 0, - *, - out=None, -) -> Tensor: - if stride is None: - stride = 0 - - if out is None: - return Tensor( - _infinicore.avg_pool1d(input._underlying, kernel_size, stride, padding) - ) - - _infinicore.avg_pool1d_( - out._underlying, input._underlying, kernel_size, stride, padding - ) - return out diff --git a/python/infinicore/nn/functional/hardswish.py b/python/infinicore/nn/functional/hardswish.py deleted file mode 100644 index b054b8978..000000000 --- a/python/infinicore/nn/functional/hardswish.py +++ /dev/null @@ -1,28 +0,0 @@ -import infinicore -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor: - r"""Apply the Hardswish activation function element-wise.""" - - if ( - infinicore.use_ntops - and input.device.type in ("cuda", "musa") - and out is None - and hasattr(infinicore.ntops.torch, "hardswish") - ): - try: - return infinicore.ntops.torch.hardswish(input, inplace=inplace) - except AttributeError: - pass - - if inplace: - _infinicore.hardswish_(input._underlying, input._underlying) - return input - - if out is None: - return Tensor(_infinicore.hardswish(input._underlying)) - - _infinicore.hardswish_(out._underlying, input._underlying) - return out diff --git a/python/infinicore/nn/functional/hardtanh.py b/python/infinicore/nn/functional/hardtanh.py deleted file mode 100644 index 925de33d6..000000000 --- a/python/infinicore/nn/functional/hardtanh.py +++ /dev/null @@ -1,46 +0,0 @@ -import infinicore -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def hardtanh( - input: Tensor, - min_val: float = -1.0, - max_val: float = 1.0, - inplace: bool = False, - *, - out=None, -) -> Tensor: - """Clamp the input tensor to the range [min_val, max_val].""" - - if min_val > max_val: - raise ValueError("min_val must be less than or equal to max_val") - - if ( - infinicore.use_ntops - and input.device.type in ("cuda", "musa") - and out is None - and hasattr(infinicore.ntops.torch, "hardtanh") - ): - try: - return infinicore.ntops.torch.hardtanh( - input, min_val=min_val, max_val=max_val, inplace=inplace - ) - except AttributeError: - pass - - if inplace: - _infinicore.hardtanh_( - input._underlying, input._underlying, float(min_val), float(max_val) - ) - return input - - if out is None: - return Tensor( - _infinicore.hardtanh(input._underlying, float(min_val), float(max_val)) - ) - - _infinicore.hardtanh_( - out._underlying, input._underlying, float(min_val), float(max_val) - ) - return out diff --git a/python/infinicore/ops/all.py b/python/infinicore/ops/all.py deleted file mode 100644 index 6aacd519d..000000000 --- a/python/infinicore/ops/all.py +++ /dev/null @@ -1,11 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def all(input, dim=None, keepdim=False, out=None): - if out is None: - return Tensor(_infinicore.all(input._underlying, dim, keepdim)) - - _infinicore.all_(out._underlying, input._underlying, dim, keepdim) - - return out diff --git a/python/infinicore/ops/cross_entropy.py b/python/infinicore/ops/cross_entropy.py deleted file mode 100644 index 5b47697b5..000000000 --- a/python/infinicore/ops/cross_entropy.py +++ /dev/null @@ -1,33 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def cross_entropy( - logits, - target, - weight=None, - *, - ignore_index=None, - reduction="none", - out=None, -): - """ - Token-wise cross entropy without reduction. The output tensor has the same - shape as target and uses the logits dtype. - """ - if weight is not None: - raise NotImplementedError("class weights are not supported yet.") - if ignore_index is not None: - raise NotImplementedError("ignore_index is not supported yet.") - if reduction not in (None, "none"): - raise NotImplementedError("Only reduction='none' is implemented.") - - if out is None: - return Tensor(_infinicore.cross_entropy(logits._underlying, target._underlying)) - - _infinicore.cross_entropy_( - out._underlying, - logits._underlying, - target._underlying, - ) - return out diff --git a/python/infinicore/ops/equal.py b/python/infinicore/ops/equal.py deleted file mode 100644 index 5a656ab30..000000000 --- a/python/infinicore/ops/equal.py +++ /dev/null @@ -1,10 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def equal(input, other, *, out=None): - if out is None: - return Tensor(_infinicore.equal(input._underlying, other._underlying)) - - _infinicore.equal_(out._underlying, input._underlying, other._underlying) - return out diff --git a/python/infinicore/ops/sum.py b/python/infinicore/ops/sum.py deleted file mode 100644 index 5f264c24b..000000000 --- a/python/infinicore/ops/sum.py +++ /dev/null @@ -1,28 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def sum(input, dim=None, keepdim=False, out=None): - """ - Sum the elements of the input tensor along the given dimensions. - - Args: - input (Tensor): The input tensor. - out (Tensor, optional): The output tensor. - - Returns: - Tensor: The output tensor. - - Example: - >>> import infinicore - >>> input = infinicore.tensor([[1, 2, 3], [4, 5, 6]]) - >>> output = infinicore.sum(input) - >>> print(output) - tensor([15]) - """ - if out is None: - return Tensor(_infinicore.sum(input._underlying, dim, keepdim)) - - _infinicore.sum_(out._underlying, input._underlying, dim, keepdim) - - return out diff --git a/python/infinicore/ops/topk.py b/python/infinicore/ops/topk.py deleted file mode 100644 index 86eb32ee6..000000000 --- a/python/infinicore/ops/topk.py +++ /dev/null @@ -1,12 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def topk(input, k, dim, largest=True, sorted=True, out=None): - if out is None: - values, indices = _infinicore.topk(input._underlying, k, dim, largest, sorted) - return Tensor(values), Tensor(indices) - - _infinicore.topk_(out._underlying, input._underlying, k, dim, largest, sorted) - - return out diff --git a/python/infinicore/ops/var.py b/python/infinicore/ops/var.py deleted file mode 100644 index 71911ab10..000000000 --- a/python/infinicore/ops/var.py +++ /dev/null @@ -1,12 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def var(input, dim=None, unbiased=True, keepdim=False, out=None): - if out is None: - var_tensor = _infinicore.var(input._underlying, dim, unbiased, keepdim) - return Tensor(var_tensor) - var_output = out - _infinicore.var_(var_output._underlying, input._underlying, dim, unbiased, keepdim) - - return out diff --git a/python/infinicore/ops/var_mean.py b/python/infinicore/ops/var_mean.py deleted file mode 100644 index 0a9573938..000000000 --- a/python/infinicore/ops/var_mean.py +++ /dev/null @@ -1,21 +0,0 @@ -from infinicore.lib import _infinicore -from infinicore.tensor import Tensor - - -def var_mean(input, dim=None, unbiased=True, keepdim=False, out=None): - if out is None: - var_tensor, mean_tensor = _infinicore.var_mean( - input._underlying, dim, unbiased, keepdim - ) - return Tensor(var_tensor), Tensor(mean_tensor) - var_output, mean_output = out - _infinicore.var_mean_( - var_output._underlying, - mean_output._underlying, - input._underlying, - dim, - unbiased, - keepdim, - ) - - return out diff --git a/python/infinicore/utils.py b/python/infinicore/utils.py index e0019dc89..094b2230e 100644 --- a/python/infinicore/utils.py +++ b/python/infinicore/utils.py @@ -1,13 +1,9 @@ +import ml_dtypes import numpy as np import torch import infinicore -try: - import ml_dtypes -except ModuleNotFoundError: - ml_dtypes = None - def to_torch_dtype(infini_dtype): """Convert infinicore data type to PyTorch data type""" @@ -61,9 +57,7 @@ def numpy_to_infinicore_dtype(numpy_dtype): return infinicore.float64 elif numpy_dtype == np.float16: return infinicore.float16 - elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16: - return infinicore.bfloat16 - elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16: + elif numpy_dtype == ml_dtypes.bfloat16: return infinicore.bfloat16 elif numpy_dtype == np.int8: return infinicore.int8 @@ -92,13 +86,6 @@ def infinicore_to_numpy_dtype(infini_dtype): elif infini_dtype == infinicore.int16: return np.int16 elif infini_dtype == infinicore.bfloat16: - if hasattr(np, "bfloat16"): - return np.bfloat16 - if ml_dtypes is None: - raise ModuleNotFoundError( - "ml_dtypes is required for bfloat16 numpy conversion. " - "Please install ml_dtypes." - ) return ml_dtypes.bfloat16 elif infini_dtype == infinicore.int32: return np.int32 diff --git a/scripts/python_test.py b/scripts/python_test.py index 13b69a013..0bd8bc26d 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -17,12 +17,12 @@ def run_tests(args): "causal_softmax.py", "clip.py", "conv.py", - # "dequantize_awq.py", + #"dequantize_awq.py", "gelu.py", "gemm.py", - # "layer_norm.py", + #"layer_norm.py", "logsoftmax.py", - # "lp_norm.py", + #"lp_norm.py", "mul.py", "ones.py", "random_sample.py", @@ -31,7 +31,7 @@ def run_tests(args): "rms_norm.py", "rope.py", "sigmoid.py", - # "softmax.py", + #"softmax.py", "softplus.py", "sub.py", "swiglu.py", @@ -42,7 +42,6 @@ def run_tests(args): # "paged_attention.py", # "paged_caching.py", # "paged_attention_prefill.py" - "cross_entropy.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infinicore/ops/all/al_infiniop.cc b/src/infinicore/ops/all/al_infiniop.cc deleted file mode 100644 index 094716ba8..000000000 --- a/src/infinicore/ops/all/al_infiniop.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/all.hpp" -#include "infinicore/ops/common/cache.hpp" -#include - -namespace infinicore::op::all_impl::infiniop { - -thread_local common::OpCache caches( - 100, // capacity - [](infiniopAllDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyAllDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor output, Tensor input, std::vector dim, bool keepdim) { - size_t seed = hash_combine(output, input, dim.size(), keepdim); - - auto device_type = context::getDevice().getType(); - auto device_index = context::getDevice().getIndex(); - - auto &cache = caches.getCache(device_type, device_index); - - auto desc_opt = cache.get(seed); - infiniopAllDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateAllDescriptor( - context::getInfiniopHandle(output->device()), &desc, - output->desc(), input->desc(), dim.data(), dim.size(), keepdim)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetAllWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopAll( - desc, workspace->data(), workspace_size, - output->data(), input->data(), dim.data(), dim.size(), keepdim, context::getStream())); -} - -static bool registered = []() { - All::dispatcher().registerDevice({Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR}, - &calculate, false); - return true; -}(); - -} // namespace infinicore::op::all_impl::infiniop diff --git a/src/infinicore/ops/all/all.cc b/src/infinicore/ops/all/all.cc deleted file mode 100644 index c695623b8..000000000 --- a/src/infinicore/ops/all/all.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include "infinicore/ops/all.hpp" - -#include "../../utils.hpp" -#include -#include -#include -namespace infinicore::op { - -common::OpDispatcher &All::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; -void All::execute(Tensor output, Tensor input, std::vector dim, bool keepdim) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); - infinicore::context::setDevice(input->device()); - auto device_type = context::getDevice().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error("No All implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(output, input, dim, keepdim); -} - -Tensor all(Tensor input, std::vector dim, bool keepdim) { - auto in_shape = input->shape(); - std::vector out_shape; - if (dim.empty()) { - for (size_t i = 0; i < in_shape.size(); i++) { - dim.push_back(i); - } - } - std::sort(dim.begin(), dim.end()); - if (dim.size() == in_shape.size() && !keepdim) { - out_shape = {}; - } else { - if (keepdim) { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - out_shape.push_back(1); - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } else { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } - } - auto output = Tensor::empty(out_shape, DataType::BOOL, input->device()); - all_(output, input, dim, keepdim); - return output; -} - -void all_(Tensor output, Tensor input, std::vector dim, bool keepdim) { - All::execute(output, input, dim, keepdim); -} -} // namespace infinicore::op diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc deleted file mode 100644 index 907b25b00..000000000 --- a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include "infinicore/ops/avg_pool1d.hpp" - -#include "../../utils.hpp" - -#include - -namespace infinicore::op { - -common::OpDispatcher &AvgPool1d::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -} - -void AvgPool1d::execute( - Tensor output, - Tensor input, - size_t kernel_size, - size_t stride, - size_t padding) { - - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); - if (stride == 0) { - stride = kernel_size; - } - - infinicore::context::setDevice(output->device()); - auto device_type = output->device().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error( - "No AvgPool1d implementation for device type: " + std::to_string(static_cast(device_type))); - } - - func(output, input, kernel_size, stride, padding); -} - -Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) { - if (stride == 0) { - stride = kernel_size; - } - - const auto &shape = input->shape(); - if (shape.size() != 3) { - throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]"); - } - - const size_t n = shape[0]; - const size_t c = shape[1]; - const size_t l_in = shape[2]; - - if (l_in + 2 * padding < kernel_size) { - throw std::runtime_error("AvgPool1d kernel_size is larger than padded length"); - } - - const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1; - - Shape out_shape = {n, c, out_width}; - auto output = Tensor::empty(out_shape, input->dtype(), input->device()); - avg_pool1d_(output, input, kernel_size, stride, padding); - return output; -} - -void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) { - AvgPool1d::execute(output, input, kernel_size, stride, padding); -} - -} // namespace infinicore::op diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc deleted file mode 100644 index df7ebda8d..000000000 --- a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc +++ /dev/null @@ -1,69 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/avg_pool1d.hpp" -#include "infinicore/ops/common/cache.hpp" -#include - -namespace infinicore::op::avg_pool1d_impl::infiniop { - -thread_local common::OpCache caches( - 100, - [](infiniopAvgPool1dDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate( - Tensor output, - Tensor input, - size_t kernel_size, - size_t stride, - size_t padding) { - - if (stride == 0) { - stride = kernel_size; - } - - size_t seed = hash_combine(output, input, kernel_size, stride, padding); - - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - auto desc_opt = cache.get(seed); - infiniopAvgPool1dDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor( - context::getInfiniopHandle(device), - &desc, - output->desc(), - input->desc(), - kernel_size, - stride, - padding)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopAvgPool1d( - desc, - workspace->data(), - workspace_size, - output->data(), - input->data(), - context::getStream())); -} - -static bool registered = []() { - AvgPool1d::dispatcher().registerAll(&calculate, false); - return true; -}(); - -} // namespace infinicore::op::avg_pool1d_impl::infiniop diff --git a/src/infinicore/ops/cross_entropy/cross_entropy.cc b/src/infinicore/ops/cross_entropy/cross_entropy.cc deleted file mode 100644 index 84aebc1b1..000000000 --- a/src/infinicore/ops/cross_entropy/cross_entropy.cc +++ /dev/null @@ -1,45 +0,0 @@ -#include "infinicore/ops/cross_entropy.hpp" - -#include "../../utils.hpp" - -#include - -namespace infinicore::op { - -common::OpDispatcher &CrossEntropy::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; - -void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) { - - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target); - - infinicore::context::setDevice(output->device()); - auto device_type = output->device().getType(); - - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(output, input, target); -} - -Tensor cross_entropy(Tensor input, Tensor target) { - - Shape shape = target->shape(); - - auto output = Tensor::empty(shape, input->dtype(), input->device()); - - cross_entropy_(output, input, target); - return output; -} - -void cross_entropy_(Tensor output, Tensor input, Tensor target) { - CrossEntropy::execute(output, input, target); -} - -} // namespace infinicore::op diff --git a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc deleted file mode 100644 index 5fa7963d7..000000000 --- a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" - -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/cross_entropy.hpp" - -#include - -namespace infinicore::op::cross_entropy_impl::infiniop { - -thread_local common::OpCache caches( - 100, - [](infiniopCrossEntropyDescriptor_t &desc) { - if (desc != nullptr) { - - INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor output, Tensor input, Tensor target) { - - size_t seed = hash_combine(output, input, target); - - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - auto desc_opt = cache.get(seed); - infiniopCrossEntropyDescriptor_t desc = nullptr; - - if (!desc_opt) { - - INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor( - context::getInfiniopHandle(device), - &desc, - output->desc(), - input->desc(), - target->desc())); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size)); - - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopCrossEntropy( - desc, - workspace->data(), - workspace_size, - output->data(), - input->data(), - target->data(), - context::getStream())); -} - -static bool registered = []() { - CrossEntropy::dispatcher().registerAll(&calculate, false); - return true; -}(); - -} // namespace infinicore::op::cross_entropy_impl::infiniop diff --git a/src/infinicore/ops/equal/equal.cc b/src/infinicore/ops/equal/equal.cc deleted file mode 100644 index b6acc4d25..000000000 --- a/src/infinicore/ops/equal/equal.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "infinicore/ops/equal.hpp" - -#include "../../utils.hpp" - -namespace infinicore::op { - -common::OpDispatcher &Equal::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; - -void Equal::execute(Tensor out, Tensor a, Tensor b) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b); - infinicore::context::setDevice(out->device()); - dispatcher().lookup(out->device().getType())(out, a, b); -} - -Tensor equal(Tensor a, Tensor b) { - auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device()); - equal_(out, a, b); - return out; -} - -void equal_(Tensor out, Tensor a, Tensor b) { - if (out->dtype() != DataType::BOOL) { - throw std::runtime_error("Equal expects bool output tensor."); - } - Equal::execute(out, a, b); -} - -} // namespace infinicore::op diff --git a/src/infinicore/ops/equal/equal_infiniop.cc b/src/infinicore/ops/equal/equal_infiniop.cc deleted file mode 100644 index 1b4e4cffa..000000000 --- a/src/infinicore/ops/equal/equal_infiniop.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/equal.hpp" -#include - -namespace infinicore::op::equal_impl::infiniop { - -thread_local common::OpCache caches( - 100, - [](infiniopEqualDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor out, Tensor a, Tensor b) { - size_t seed = hash_combine(out, a, b); - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - infiniopEqualDescriptor_t desc = nullptr; - if (auto cached = cache.get(seed)) { - desc = *cached; - } else { - INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor( - context::getInfiniopHandle(device), &desc, - out->desc(), a->desc(), b->desc())); - cache.put(seed, desc); - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace; - void *workspace_ptr = nullptr; - if (workspace_size != 0) { - workspace = context::allocateMemory(workspace_size); - workspace_ptr = workspace->data(); - } - - INFINICORE_CHECK_ERROR(infiniopEqual( - desc, - workspace_ptr, - workspace_size, - out->data(), - a->data(), - b->data(), - context::getStream())); -} - -static bool registered = []() { - Equal::dispatcher().registerAll(&calculate, false); - return true; -}(); - -} // namespace infinicore::op::equal_impl::infiniop diff --git a/src/infinicore/ops/hardswish/hardswish.cc b/src/infinicore/ops/hardswish/hardswish.cc deleted file mode 100644 index ec8db75ff..000000000 --- a/src/infinicore/ops/hardswish/hardswish.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include "infinicore/ops/hardswish.hpp" - -#include "../../utils.hpp" - -#include - -namespace infinicore::op { - -common::OpDispatcher &Hardswish::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -} - -void Hardswish::execute(Tensor output, Tensor input) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); - infinicore::context::setDevice(output->device()); - auto device_type = output->device().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error( - "No Hardswish implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(output, input); -} - -Tensor hardswish(Tensor input) { - auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); - hardswish_(output, input); - return output; -} - -void hardswish_(Tensor output, Tensor input) { - Hardswish::execute(output, input); -} - -} // namespace infinicore::op diff --git a/src/infinicore/ops/hardswish/hardswish_infiniop.cc b/src/infinicore/ops/hardswish/hardswish_infiniop.cc deleted file mode 100644 index 44d4054e8..000000000 --- a/src/infinicore/ops/hardswish/hardswish_infiniop.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/hardswish.hpp" -#include - -namespace infinicore::op::hardswish_impl::infiniop { - -thread_local common::OpCache caches( - 100, - [](infiniopHardSwishDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor output, Tensor input) { - size_t seed = hash_combine(output, input); - - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - auto desc_opt = cache.get(seed); - infiniopHardSwishDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor( - context::getInfiniopHandle(device), - &desc, - output->desc(), - input->desc())); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace; - void *workspace_ptr = nullptr; - if (workspace_size != 0) { - workspace = context::allocateMemory(workspace_size); - workspace_ptr = workspace->data(); - } - - INFINICORE_CHECK_ERROR(infiniopHardSwish( - desc, - workspace_ptr, - workspace_size, - output->data(), - input->data(), - context::getStream())); -} - -static bool registered = []() { - Hardswish::dispatcher().registerAll(&calculate, false); - return true; -}(); - -} // namespace infinicore::op::hardswish_impl::infiniop diff --git a/src/infinicore/ops/hardtanh/hardtanh.cc b/src/infinicore/ops/hardtanh/hardtanh.cc deleted file mode 100644 index 5a4df2142..000000000 --- a/src/infinicore/ops/hardtanh/hardtanh.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include "infinicore/ops/hardtanh.hpp" - -#include "../../utils.hpp" - -#include - -namespace infinicore::op { - -common::OpDispatcher &HardTanh::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -} - -void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); - infinicore::context::setDevice(output->device()); - - auto device_type = output->device().getType(); - auto func = dispatcher().lookup(device_type); - if (func == nullptr) { - throw std::runtime_error( - "No HardTanh implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(output, input, min_val, max_val); -} - -Tensor hardtanh(Tensor input, float min_val, float max_val) { - auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); - hardtanh_(output, input, min_val, max_val); - return output; -} - -void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) { - HardTanh::execute(output, input, min_val, max_val); -} - -} // namespace infinicore::op diff --git a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc deleted file mode 100644 index d8af439d8..000000000 --- a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc +++ /dev/null @@ -1,63 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/hardtanh.hpp" -#include - -namespace infinicore::op::hardtanh_impl::infiniop { - -thread_local common::OpCache caches( - 100, - [](infiniopHardTanhDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor output, Tensor input, float min_val, float max_val) { - size_t seed = hash_combine(output, input, min_val, max_val); - - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - auto desc_opt = cache.get(seed); - infiniopHardTanhDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor( - context::getInfiniopHandle(device), - &desc, - output->desc(), - input->desc(), - min_val, - max_val)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace; - void *workspace_ptr = nullptr; - if (workspace_size != 0) { - workspace = context::allocateMemory(workspace_size); - workspace_ptr = workspace->data(); - } - - INFINICORE_CHECK_ERROR(infiniopHardTanh( - desc, - workspace_ptr, - workspace_size, - output->data(), - input->data(), - context::getStream())); -} - -static bool registered = []() { - HardTanh::dispatcher().registerAll(&calculate, false); - return true; -}(); - -} // namespace infinicore::op::hardtanh_impl::infiniop diff --git a/src/infinicore/ops/sum/sum.cc b/src/infinicore/ops/sum/sum.cc deleted file mode 100644 index 5fcecda5e..000000000 --- a/src/infinicore/ops/sum/sum.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include "infinicore/ops/sum.hpp" - -#include "../../utils.hpp" -#include -#include - -namespace infinicore::op { - -common::OpDispatcher &Sum::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; -void Sum::execute(Tensor output, Tensor input, std::vector dim, bool keepdim) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); - infinicore::context::setDevice(input->device()); - auto device_type = context::getDevice().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error("No Sum implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(output, input, dim, keepdim); -} - -Tensor sum(Tensor input, std::vector dim, bool keepdim) { - auto in_shape = input->shape(); - std::vector out_shape; - if (dim.empty()) { - for (size_t i = 0; i < in_shape.size(); i++) { - dim.push_back(i); - } - } - std::sort(dim.begin(), dim.end()); - if (dim.size() == in_shape.size() && !keepdim) { - out_shape = {}; - } else { - if (keepdim) { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - out_shape.push_back(1); - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } else { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } - } - auto output = Tensor::empty(out_shape, input->dtype(), input->device()); - sum_(output, input, dim, keepdim); - return output; -} - -void sum_(Tensor output, Tensor input, std::vector dim, bool keepdim) { - Sum::execute(output, input, dim, keepdim); -} -} // namespace infinicore::op diff --git a/src/infinicore/ops/sum/sum_infiniop.cc b/src/infinicore/ops/sum/sum_infiniop.cc deleted file mode 100644 index 9a696a9b5..000000000 --- a/src/infinicore/ops/sum/sum_infiniop.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/sum.hpp" -#include - -namespace infinicore::op::sum_impl::infiniop { - -thread_local common::OpCache caches( - 100, // capacity - [](infiniopSumDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroySumDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor output, Tensor input, std::vector dim, bool keepdim) { - size_t seed = hash_combine(output, input, dim.size(), keepdim); - - auto device_type = context::getDevice().getType(); - auto device_index = context::getDevice().getIndex(); - - auto &cache = caches.getCache(device_type, device_index); - - auto desc_opt = cache.get(seed); - infiniopSumDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateSumDescriptor( - context::getInfiniopHandle(output->device()), &desc, - output->desc(), input->desc(), dim.data(), dim.size(), keepdim)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetSumWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopSum( - desc, workspace->data(), workspace_size, - output->data(), input->data(), dim.data(), dim.size(), keepdim, context::getStream())); -} - -static bool registered = []() { - Sum::dispatcher().registerDevice({Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR}, - &calculate, false); - return true; -}(); - -} // namespace infinicore::op::sum_impl::infiniop diff --git a/src/infinicore/ops/topk/topk.cc b/src/infinicore/ops/topk/topk.cc deleted file mode 100644 index a5b52fccf..000000000 --- a/src/infinicore/ops/topk/topk.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "infinicore/ops/topk.hpp" - -#include "../../utils.hpp" -#include -#include - -namespace infinicore::op { - -common::OpDispatcher &TopK::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; -void TopK::execute(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(values_output, input); - infinicore::context::setDevice(input->device()); - auto device_type = context::getDevice().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error("No Topk implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(values_output, indices_output, input, k, dim, largest, sorted); -} - -std::pair topk(Tensor input, size_t k, size_t dim, bool largest, bool sorted) { - auto in_shape = input->shape(); - std::vector out_shape = in_shape; - out_shape[dim] = k; - - auto values_output = Tensor::empty(out_shape, input->dtype(), input->device()); - auto indices_output = Tensor::empty(out_shape, DataType::I32, input->device()); - topk_(values_output, indices_output, input, k, dim, largest, sorted); - return {values_output, indices_output}; -} - -void topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) { - TopK::execute(values_output, indices_output, input, k, dim, largest, sorted); -} -} // namespace infinicore::op diff --git a/src/infinicore/ops/topk/topk_infiniop.cc b/src/infinicore/ops/topk/topk_infiniop.cc deleted file mode 100644 index 5cc8d4d98..000000000 --- a/src/infinicore/ops/topk/topk_infiniop.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/topk.hpp" -#include - -namespace infinicore::op::topk_impl::infiniop { - -thread_local common::OpCache caches( - 100, // capacity - [](infiniopTopKDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyTopKDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) { - size_t seed = hash_combine(values_output, indices_output, input, k, dim, largest, sorted); - - auto device_type = context::getDevice().getType(); - auto device_index = context::getDevice().getIndex(); - - auto &cache = caches.getCache(device_type, device_index); - - auto desc_opt = cache.get(seed); - infiniopTopKDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateTopKDescriptor( - context::getInfiniopHandle(values_output->device()), &desc, - values_output->desc(), indices_output->desc(), input->desc(), k, dim, largest, sorted)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetTopKWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopTopK( - desc, workspace->data(), workspace_size, - values_output->data(), indices_output->data(), input->data(), k, dim, largest, sorted, context::getStream())); -} - -static bool registered = []() { - TopK::dispatcher().registerDevice({Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR}, - &calculate, false); - return true; -}(); - -} // namespace infinicore::op::topk_impl::infiniop diff --git a/src/infinicore/ops/var/var.cc b/src/infinicore/ops/var/var.cc deleted file mode 100644 index bc0849e64..000000000 --- a/src/infinicore/ops/var/var.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include "infinicore/ops/var.hpp" - -#include "../../utils.hpp" -#include -#include - -namespace infinicore::op { - -common::OpDispatcher &Var::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; - -void Var::execute(Tensor var_output, Tensor input, std::vector dim, bool unbiased, bool keepdim) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, input); - infinicore::context::setDevice(input->device()); - auto device_type = context::getDevice().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error("No Var implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(var_output, input, dim, unbiased, keepdim); -} - -Tensor var(Tensor input, std::vector dim, bool unbiased, bool keepdim) { - auto in_shape = input->shape(); - std::vector out_shape; - if (dim.empty()) { - for (size_t i = 0; i < in_shape.size(); i++) { - dim.push_back(i); - } - } - std::sort(dim.begin(), dim.end()); - if (dim.size() == in_shape.size() && !keepdim) { - out_shape = {}; - } else { - if (keepdim) { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - out_shape.push_back(1); - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } else { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } - } - auto var_output = Tensor::empty(out_shape, input->dtype(), input->device()); - var_(var_output, input, dim, unbiased, keepdim); - return var_output; -} - -void var_(Tensor var_output, Tensor input, std::vector dim, bool unbiased, bool keepdim) { - Var::execute(var_output, input, dim, unbiased, keepdim); -} -} // namespace infinicore::op diff --git a/src/infinicore/ops/var/var_infiniop.cc b/src/infinicore/ops/var/var_infiniop.cc deleted file mode 100644 index c74eb2628..000000000 --- a/src/infinicore/ops/var/var_infiniop.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/var.hpp" -#include - -namespace infinicore::op::var_impl::infiniop { - -thread_local common::OpCache caches( - 100, // capacity - [](infiniopVarDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyVarDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor var_output, Tensor input, std::vector dim, bool unbiased, bool keepdim) { - size_t seed = hash_combine(var_output, input, dim.size(), unbiased, keepdim); - - auto device_type = context::getDevice().getType(); - auto device_index = context::getDevice().getIndex(); - - auto &cache = caches.getCache(device_type, device_index); - - auto desc_opt = cache.get(seed); - infiniopVarDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateVarDescriptor( - context::getInfiniopHandle(var_output->device()), &desc, - var_output->desc(), input->desc(), dim.data(), dim.size(), unbiased, keepdim)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetVarWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopVar( - desc, workspace->data(), workspace_size, - var_output->data(), input->data(), dim.data(), dim.size(), unbiased, keepdim, context::getStream())); -} - -static bool registered = []() { - Var::dispatcher().registerDevice({Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR}, - &calculate, false); - return true; -}(); - -} // namespace infinicore::op::var_impl::infiniop diff --git a/src/infinicore/ops/var_mean/var_mean.cc b/src/infinicore/ops/var_mean/var_mean.cc deleted file mode 100644 index 817be7bcf..000000000 --- a/src/infinicore/ops/var_mean/var_mean.cc +++ /dev/null @@ -1,69 +0,0 @@ -#include "infinicore/ops/var_mean.hpp" - -#include "../../utils.hpp" -#include -#include - -namespace infinicore::op { - -common::OpDispatcher &Var_Mean::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; - -void Var_Mean::execute(Tensor var_output, Tensor mean_output, Tensor input, std::vector dim, bool unbiased, bool keepdim) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, mean_output, input); - infinicore::context::setDevice(input->device()); - auto device_type = context::getDevice().getType(); - auto func = dispatcher().lookup(device_type); - - if (func == nullptr) { - throw std::runtime_error("No Var_Mean implementation found for device type: " + std::to_string(static_cast(device_type))); - } - - func(var_output, mean_output, input, dim, unbiased, keepdim); -} - -std::pair var_mean(Tensor input, std::vector dim, bool unbiased, bool keepdim) { - auto in_shape = input->shape(); - std::vector out_shape; - if (dim.empty()) { - for (size_t i = 0; i < in_shape.size(); i++) { - dim.push_back(i); - } - } - std::sort(dim.begin(), dim.end()); - if (dim.size() == in_shape.size() && !keepdim) { - out_shape = {}; - } else { - if (keepdim) { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - out_shape.push_back(1); - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } else { - size_t j = 0; - for (size_t i = 0; i < in_shape.size(); i++) { - if (j < dim.size() && dim[j] == i) { - j++; - } else { - out_shape.push_back(in_shape[i]); - } - } - } - } - auto var_output = Tensor::empty(out_shape, input->dtype(), input->device()); - auto mean_output = Tensor::empty(out_shape, input->dtype(), input->device()); - var_mean_(var_output, mean_output, input, dim, unbiased, keepdim); - return {var_output, mean_output}; -} - -void var_mean_(Tensor var_output, Tensor mean_output, Tensor input, std::vector dim, bool unbiased, bool keepdim) { - Var_Mean::execute(var_output, mean_output, input, dim, unbiased, keepdim); -} -} // namespace infinicore::op diff --git a/src/infinicore/ops/var_mean/var_mean_infiniop.cc b/src/infinicore/ops/var_mean/var_mean_infiniop.cc deleted file mode 100644 index 89332d074..000000000 --- a/src/infinicore/ops/var_mean/var_mean_infiniop.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" -#include "infinicore/ops/var_mean.hpp" -#include - -// todo 实现需要修改calculate函数 - -namespace infinicore::op::var_mean_impl::infiniop { - -thread_local common::OpCache caches( - 100, // capacity - [](infiniopVarMeanDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyVarMeanDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor var_output, Tensor mean_output, Tensor input, std::vector dim, bool unbiased, bool keepdim) { - size_t seed = hash_combine(var_output, mean_output, input, dim.size(), unbiased, keepdim); - - auto device_type = context::getDevice().getType(); - auto device_index = context::getDevice().getIndex(); - - auto &cache = caches.getCache(device_type, device_index); - - auto desc_opt = cache.get(seed); - infiniopVarMeanDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateVarMeanDescriptor( - context::getInfiniopHandle(var_output->device()), &desc, - var_output->desc(), mean_output->desc(), input->desc(), dim.data(), dim.size(), unbiased, keepdim)); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetVarMeanWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); - - INFINICORE_CHECK_ERROR(infiniopVarMean( - desc, workspace->data(), workspace_size, - var_output->data(), mean_output->data(), input->data(), dim.data(), dim.size(), unbiased, keepdim, context::getStream())); -} - -static bool registered = []() { - Var_Mean::dispatcher().registerDevice({Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR}, - &calculate, false); - return true; -}(); - -} // namespace infinicore::op::var_mean_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 1c841961a..2eecb843f 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -4,16 +4,10 @@ #include "ops/add.hpp" #include "ops/add_rms_norm.hpp" -#include "ops/all.hpp" #include "ops/attention.hpp" -#include "ops/avg_pool1d.hpp" #include "ops/causal_softmax.hpp" -#include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" -#include "ops/equal.hpp" #include "ops/flash_attention.hpp" -#include "ops/hardswish.hpp" -#include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" @@ -30,11 +24,7 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/silu_and_mul.hpp" -#include "ops/sum.hpp" #include "ops/swiglu.hpp" -#include "ops/topk.hpp" -#include "ops/var.hpp" -#include "ops/var_mean.hpp" namespace py = pybind11; @@ -52,28 +42,18 @@ inline void bind(py::module &m) { bind_mul(m); bind_mha_kvcache(m); bind_mha_varlen(m); - bind_hardswish(m); - bind_hardtanh(m); bind_paged_attention(m); bind_paged_attention_prefill(m); bind_paged_caching(m); bind_random_sample(m); - bind_cross_entropy(m); bind_rearrange(m); bind_rms_norm(m); - bind_avg_pool1d(m); bind_silu(m); bind_swiglu(m); bind_rope(m); bind_embedding(m); bind_linear_w8a8i8(m); bind_silu_and_mul(m); - bind_sum(m); - bind_var_mean(m); - bind_var(m); - bind_topk(m); - bind_all(m); - bind_equal(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/all.hpp b/src/infinicore/pybind11/ops/all.hpp deleted file mode 100644 index 4ccac685b..000000000 --- a/src/infinicore/pybind11/ops/all.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/all.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -Tensor py_all(Tensor input, py::object dim, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - return op::all(input, dim_vec, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - return op::all(input, dim.cast>(), keepdim); - } else if (py::isinstance(dim)) { - return op::all(input, std::vector(1, dim.cast()), keepdim); - } else { - throw std::invalid_argument("dim must be a tuple or an integer"); - } -} - -void py_all_(Tensor output, Tensor input, py::object dim, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - op::all_(output, input, dim_vec, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - op::all_(output, input, dim.cast>(), keepdim); - } else if (py::isinstance(dim)) { - op::all_(output, input, std::vector(1, dim.cast()), keepdim); - } else { - throw std::invalid_argument("dim must be a tuple or an integer"); - } -} - -inline void bind_all(py::module &m) { - m.def("all", - &py_all, - py::arg("input"), - py::arg("dim"), - py::arg("keepdim"), - R"doc(All of input tensor along the given dimensions.)doc"); - - m.def("all_", - &py_all_, - py::arg("output"), - py::arg("input"), - py::arg("dim"), - py::arg("keepdim"), - R"doc(In-place tensor all.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/avg_pool1d.hpp b/src/infinicore/pybind11/ops/avg_pool1d.hpp deleted file mode 100644 index 32394552a..000000000 --- a/src/infinicore/pybind11/ops/avg_pool1d.hpp +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include -#include - -#include "infinicore/ops/avg_pool1d.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -inline void bind_avg_pool1d(py::module &m) { - m.def( - "avg_pool1d", - [](::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { - return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding); - }, - py::arg("input"), - py::arg("kernel_size"), - py::arg("stride") = py::none(), - py::arg("padding") = 0, - R"doc(AvgPool1d out-of-place.)doc"); - - m.def( - "avg_pool1d_", - [](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { - op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding); - }, - py::arg("output"), - py::arg("input"), - py::arg("kernel_size"), - py::arg("stride") = py::none(), - py::arg("padding") = 0, - R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/cross_entropy.hpp b/src/infinicore/pybind11/ops/cross_entropy.hpp deleted file mode 100644 index 8105642a6..000000000 --- a/src/infinicore/pybind11/ops/cross_entropy.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/cross_entropy.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -inline void bind_cross_entropy(py::module &m) { - m.def("cross_entropy", - &op::cross_entropy, - py::arg("logits"), - py::arg("target"), - R"doc(Token-wise cross entropy loss without reduction.)doc"); - - m.def("cross_entropy_", - &op::cross_entropy_, - py::arg("loss"), - py::arg("logits"), - py::arg("target"), - R"doc(Write cross entropy loss into a provided tensor.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/equal.hpp b/src/infinicore/pybind11/ops/equal.hpp deleted file mode 100644 index d14a6b61d..000000000 --- a/src/infinicore/pybind11/ops/equal.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/equal.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -inline void bind_equal(py::module &m) { - m.def("equal", - &op::equal, - py::arg("a"), - py::arg("b"), - R"doc(Elementwise equality returning a bool tensor.)doc"); - - m.def("equal_", - &op::equal_, - py::arg("out"), - py::arg("a"), - py::arg("b"), - R"doc(In-place elementwise equality writing into `out`.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardswish.hpp b/src/infinicore/pybind11/ops/hardswish.hpp deleted file mode 100644 index daaccec62..000000000 --- a/src/infinicore/pybind11/ops/hardswish.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/hardswish.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -inline void bind_hardswish(py::module &m) { - m.def("hardswish", - &op::hardswish, - py::arg("input"), - R"doc(Out-of-place Hardswish activation.)doc"); - - m.def("hardswish_", - &op::hardswish_, - py::arg("output"), - py::arg("input"), - R"doc(In-place Hardswish activation.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardtanh.hpp b/src/infinicore/pybind11/ops/hardtanh.hpp deleted file mode 100644 index ff9abb872..000000000 --- a/src/infinicore/pybind11/ops/hardtanh.hpp +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/hardtanh.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -inline void bind_hardtanh(py::module &m) { - m.def("hardtanh", - &op::hardtanh, - py::arg("input"), - py::arg("min_val") = -1.0f, - py::arg("max_val") = 1.0f, - R"doc(Apply the HardTanh activation.)doc"); - - m.def("hardtanh_", - &op::hardtanh_, - py::arg("output"), - py::arg("input"), - py::arg("min_val") = -1.0f, - py::arg("max_val") = 1.0f, - R"doc(In-place HardTanh activation.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/sum.hpp b/src/infinicore/pybind11/ops/sum.hpp deleted file mode 100644 index 50fef7539..000000000 --- a/src/infinicore/pybind11/ops/sum.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/sum.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -Tensor py_sum(Tensor input, py::object dim, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - return op::sum(input, dim_vec, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - return op::sum(input, dim.cast>(), keepdim); - } else if (py::isinstance(dim)) { - return op::sum(input, std::vector(1, dim.cast()), keepdim); - } else { - throw std::invalid_argument("dim must be a tuple or an integer"); - } -} - -void py_sum_(Tensor output, Tensor input, py::object dim, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - op::sum_(output, input, dim_vec, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - op::sum_(output, input, dim.cast>(), keepdim); - } else if (py::isinstance(dim)) { - op::sum_(output, input, std::vector(1, dim.cast()), keepdim); - } else { - throw std::invalid_argument("dim must be a tuple or an integer"); - } -} - -inline void bind_sum(py::module &m) { - m.def("sum", - &py_sum, - py::arg("input"), - py::arg("dim"), - py::arg("keepdim"), - R"doc(Sum of input tensor along the given dimensions.)doc"); - - m.def("sum_", - &py_sum_, - py::arg("output"), - py::arg("input"), - py::arg("dim"), - py::arg("keepdim"), - R"doc(In-place tensor sum.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/topk.hpp b/src/infinicore/pybind11/ops/topk.hpp deleted file mode 100644 index 1341f39fa..000000000 --- a/src/infinicore/pybind11/ops/topk.hpp +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include // 添加这行 - -#include "infinicore/ops/topk.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -std::pair py_topk(Tensor input, size_t k, int dim, bool largest, bool sorted) { - if (dim == -1) { - return op::topk(input, k, input->ndim() - 1, largest, sorted); - } else if (dim >= 0) { - return op::topk(input, k, static_cast(dim), largest, sorted); - } else { - throw std::invalid_argument("invalid argument: dim"); - } -} - -void py_topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, int dim, bool largest, bool sorted) { - if (dim == -1) { - op::topk_(values_output, indices_output, input, k, input->ndim() - 1, largest, sorted); - } else if (dim >= 0) { - op::topk_(values_output, indices_output, input, k, static_cast(dim), largest, sorted); - } else { - throw std::invalid_argument("invalid argument: dim"); - } -} - -inline void bind_topk(py::module &m) { - m.def("topk", - &py_topk, - py::arg("input"), - py::arg("k"), - py::arg("dim"), - py::arg("largest"), - py::arg("sorted"), - R"doc(topk of input tensor along the given dimensions.)doc"); - - m.def("topk_", - &py_topk_, - py::arg("values_output"), - py::arg("indices_output"), - py::arg("input"), - py::arg("k"), - py::arg("dim"), - py::arg("largest"), - py::arg("sorted"), - R"doc(In-place tensor topk_.)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/var.hpp b/src/infinicore/pybind11/ops/var.hpp deleted file mode 100644 index 9668fef5f..000000000 --- a/src/infinicore/pybind11/ops/var.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/var.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -Tensor py_var(Tensor input, py::object dim, bool unbiased, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - return op::var(input, dim_vec, unbiased, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - return op::var(input, dim.cast>(), unbiased, keepdim); - } else if (py::isinstance(dim)) { - return op::var(input, std::vector(1, dim.cast()), unbiased, keepdim); - } else { - throw std::invalid_argument("dim must be a tuple or an integer"); - } -} - -void py_var_(Tensor var_output, Tensor input, py::object dim, bool unbiased, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - op::var_(var_output, input, dim_vec, unbiased, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - op::var_(var_output, input, dim.cast>(), unbiased, keepdim); - } else if (py::isinstance(dim)) { - op::var_(var_output, input, std::vector(1, dim.cast()), unbiased, keepdim); - } else { - throw std::invalid_argument("dim must be a list/tuple or an integer"); - } -} - -inline void bind_var(py::module &m) { - m.def("var", - &py_var, - py::arg("input"), - py::arg("dim"), - py::arg("unbiased"), - py::arg("keepdim"), - R"doc(Var of input tensor along the given dimensions.)doc"); - - m.def("var_", - &py_var_, - py::arg("var_output"), - py::arg("input"), - py::arg("dim"), - py::arg("unbiased"), - py::arg("keepdim"), - R"doc(In-place tensor Var .)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/var_mean.hpp b/src/infinicore/pybind11/ops/var_mean.hpp deleted file mode 100644 index 986ec49f7..000000000 --- a/src/infinicore/pybind11/ops/var_mean.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once - -#include - -#include "infinicore/ops/var_mean.hpp" - -namespace py = pybind11; - -namespace infinicore::ops { - -std::pair py_var_mean(Tensor input, py::object dim, bool unbiased, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - return op::var_mean(input, dim_vec, unbiased, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - return op::var_mean(input, dim.cast>(), unbiased, keepdim); - } else if (py::isinstance(dim)) { - return op::var_mean(input, std::vector(1, dim.cast()), unbiased, keepdim); - } else { - throw std::invalid_argument("dim must be a tuple or an integer"); - } -} - -void py_var_mean_(Tensor var_output, Tensor mean_output, Tensor input, py::object dim, bool unbiased, bool keepdim) { - if (dim.is_none()) { - std::vector dim_vec; - for (int i = 0; i < input->shape().size(); i++) { - dim_vec.push_back(i); - } - op::var_mean_(var_output, mean_output, input, dim_vec, unbiased, keepdim); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - op::var_mean_(var_output, mean_output, input, dim.cast>(), unbiased, keepdim); - } else if (py::isinstance(dim)) { - op::var_mean_(var_output, mean_output, input, std::vector(1, dim.cast()), unbiased, keepdim); - } else { - throw std::invalid_argument("dim must be a list/tuple or an integer"); - } -} - -inline void bind_var_mean(py::module &m) { - m.def("var_mean", - &py_var_mean, - py::arg("input"), - py::arg("dim"), - py::arg("unbiased"), - py::arg("keepdim"), - R"doc(Var & Mean of input tensor along the given dimensions.)doc"); - - m.def("var_mean_", - &py_var_mean_, - py::arg("var_output"), - py::arg("mean_output"), - py::arg("input"), - py::arg("dim"), - py::arg("unbiased"), - py::arg("keepdim"), - R"doc(In-place tensor Var & Mean .)doc"); -} - -} // namespace infinicore::ops diff --git a/src/infiniop/ops/all/all_desc.h b/src/infiniop/ops/all/all_desc.h deleted file mode 100644 index 9b7a1e0d6..000000000 --- a/src/infiniop/ops/all/all_desc.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef INFINIOP_ALL_DESCRIPTOR_H_ -#define INFINIOP_ALL_DESCRIPTOR_H_ -#include "../../../utils.h" -#include "../../operator.h" -#include "../../tensor.h" - -#include "info.h" - -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::all::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - AllInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - AllInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t output_desc, \ - infiniopTensorDescriptor_t input_desc, \ - size_t *dim, \ - size_t dim_size, \ - bool keepdim); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - void *output, \ - const void *input, \ - size_t *dim, \ - size_t dim_size, \ - bool keepdim, \ - void *stream) const; \ - }; \ - } - -#endif diff --git a/src/infiniop/ops/all/cpu/all_cpu.cc b/src/infiniop/ops/all/cpu/all_cpu.cc deleted file mode 100644 index dbe03fc3b..000000000 --- a/src/infiniop/ops/all/cpu/all_cpu.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "all_cpu.h" -#include "../../../../utils.h" -#include "../../../devices/cpu/common_cpu.h" -#include -namespace op::all::cpu { - -Descriptor::~Descriptor() {} -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - - *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -template -infiniStatus_t calculateAll( - const AllInfo &info, - bool *output, - const Tdata *input, - size_t *dim, - size_t dim_size, - bool keepdim) { - if (info.reduce_dim_size == info.ndim) { - bool result = true; - for (size_t index = 0; index < info.input_size; index++) { - size_t input_offset = op::common_cpu::indexToOffset(index, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data()); - result = result && input[input_offset]; - } - output[0] = result; - return INFINI_STATUS_SUCCESS; - } else { - for (size_t i = info.output_size; i-- > 0;) { - size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data()); - bool result = true; - for (size_t j = 0; j < info.reduce_num; j++) { - size_t input_flat = j + i * info.reduce_num; - size_t input_offset = op::common_cpu::indexToOffset(input_flat, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data()); - Tdata input_val = input[input_offset]; - bool bool_val = static_cast(input_val); - result = result && bool_val; - } - output[output_offset] = result; - } - return INFINI_STATUS_SUCCESS; - } -} -} // namespace -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream) const { - switch (_info.dtype) { - case INFINI_DTYPE_BOOL: - return calculateAll(_info, reinterpret_cast(output), reinterpret_cast(input), dim, dim_size, keepdim); - case INFINI_DTYPE_U8: - return calculateAll(_info, reinterpret_cast(output), reinterpret_cast(input), dim, dim_size, keepdim); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::all::cpu diff --git a/src/infiniop/ops/all/cpu/all_cpu.h b/src/infiniop/ops/all/cpu/all_cpu.h deleted file mode 100644 index 71fd83689..000000000 --- a/src/infiniop/ops/all/cpu/all_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ALL_CPU_H__ -#define __INFINIOP_ALL_CPU_H__ - -#include "../all_desc.h" - -DESCRIPTOR(cpu); - -#endif // __INFINIOP_ALL_CPU_H__ diff --git a/src/infiniop/ops/all/cuda/kernel.cuh b/src/infiniop/ops/all/cuda/kernel.cuh deleted file mode 100644 index b32d1da23..000000000 --- a/src/infiniop/ops/all/cuda/kernel.cuh +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef __ALL_CUDA_H__ -#define __ALL_CUDA_H__ - -__forceinline__ __device__ __host__ size_t -indexToOffset( - size_t flat_index, - size_t ndim, - const size_t *shape, - const ptrdiff_t *strides) { - size_t res = 0; - for (size_t i = ndim; i-- > 0;) { - res += (flat_index % shape[i]) * strides[i]; - flat_index /= shape[i]; - } - return res; -} - -template -__global__ void allReduceTempKernel( - bool *temp_output, - const Tdata *input, - size_t input_size, - size_t permuted_input_shape_size, - size_t *permuted_input_shape, - ptrdiff_t *permuted_input_strides) { - __shared__ bool s_data[BLOCK_SIZE]; - size_t tid = threadIdx.x; - size_t idx = tid + blockIdx.x * blockDim.x; - if (idx < input_size) { - size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides); - s_data[tid] = static_cast(input[input_offset]); - } else { - s_data[tid] = true; - } - __syncthreads(); - for (size_t s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - s_data[tid] = s_data[tid] && s_data[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - temp_output[blockIdx.x] = s_data[0]; - } -} - -template -__global__ void finalAllReduceKernel( - bool *output, - const bool *block_results, - size_t num_blocks) { - __shared__ bool s_data[BLOCK_SIZE]; - size_t tid = threadIdx.x; - bool thread_val = true; - for (size_t i = tid; i < num_blocks; i += blockDim.x) { - thread_val = thread_val && block_results[i]; - } - s_data[tid] = thread_val; - __syncthreads(); - for (size_t s = BLOCK_SIZE / 2; s > 0; s >>= 1) { - if (tid < s) { - s_data[tid] = s_data[tid] && s_data[tid + s]; - } - __syncthreads(); - } - - if (tid == 0) { - *output = s_data[0]; - } -} - -template -__global__ void allKernel( - bool *output, - const Tdata *input, - size_t permuted_input_shape_size, - size_t output_shape_size, - size_t output_size, - size_t reduce_num, - size_t *permuted_input_shape, - size_t *output_shape, - ptrdiff_t *permuted_input_strides, - ptrdiff_t *output_strides) { - size_t tid = threadIdx.x; - size_t idx = tid + blockIdx.x * blockDim.x; - if (idx >= output_size) { - return; - } - size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides); - bool tempRes = true; - for (size_t i = 0; i < reduce_num; i++) { - size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides); - tempRes = tempRes && static_cast(input[input_offset]); - } - output[output_index] = tempRes; -} - -#endif // __ALL_CUDA_H__ diff --git a/src/infiniop/ops/all/info.h b/src/infiniop/ops/all/info.h deleted file mode 100644 index f3f333fc8..000000000 --- a/src/infiniop/ops/all/info.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef __ALL_INFO_H__ -#define __ALL_INFO_H__ -#include "../../../utils.h" -#include "../../tensor.h" -#include -#include -#include - -namespace op::all { -class AllInfo { - AllInfo() = default; - -public: - infiniDtype_t dtype; - std::vector permuted_input_shape; // need to permute - std::vector output_shape; - std::vector permuted_input_strides; // need to permute - std::vector output_strides; - size_t reduce_dim_size; // reduce dim size - size_t reduce_num; // number of elements to reduce for each output element - size_t input_size; // total number of input elements - size_t output_size; // total number of output elements - size_t ndim; // number of dimensions - static utils::Result create( - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto input_shape = input_desc->shape(); - auto input_strides = input_desc->strides(); - size_t input_ndim = input_desc->ndim(); - size_t reduce_num = 1; - for (size_t i = 0; i < dim_size; i++) { - reduce_num *= input_shape[dim[i]]; - } - std::vector permute_order; - for (size_t i = 0; i < input_ndim; i++) { - if (std::find(dim, dim + dim_size, i) == dim + dim_size) { - permute_order.push_back(i); - } - } - for (size_t i = 0; i < dim_size; i++) { - permute_order.push_back(dim[i]); - } - std::vector permuted_input_shape; - std::vector permuted_input_strides; - for (size_t i = 0; i < permute_order.size(); i++) { - permuted_input_shape.push_back(input_shape[permute_order[i]]); - permuted_input_strides.push_back(input_strides[permute_order[i]]); - } - return utils::Result(AllInfo{input_desc->dtype(), - permuted_input_shape, - output_desc->shape(), - permuted_input_strides, - output_desc->strides(), - dim_size, - reduce_num, - input_desc->numel(), - output_desc->numel(), - input_ndim}); - } -}; -} // namespace op::all - -#endif diff --git a/src/infiniop/ops/all/metax/all_metax.h b/src/infiniop/ops/all/metax/all_metax.h deleted file mode 100644 index 0f0ecc742..000000000 --- a/src/infiniop/ops/all/metax/all_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ALL_METAX_H__ -#define __ALL_METAX_H__ - -#include "../all_desc.h" - -DESCRIPTOR(metax); - -#endif diff --git a/src/infiniop/ops/all/metax/all_metax.maca b/src/infiniop/ops/all/metax/all_metax.maca deleted file mode 100644 index b95936585..000000000 --- a/src/infiniop/ops/all/metax/all_metax.maca +++ /dev/null @@ -1,117 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "all_metax.h" - -namespace op::all::metax { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const AllInfo &info, - bool *output, const Tdata *input, - hcStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_hc = permuted_input_shape_hc + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - - if (info.reduce_num == input_size) { - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - bool *temp_output; - CHECK_METAX(hcMalloc(&temp_output, grid_size * sizeof(bool))); - allReduceTempKernel<<>>( - temp_output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc); - finalAllReduceKernel<<<1, BLOCK_SIZE>>>(output, temp_output, grid_size); - CHECK_METAX(hcFree(temp_output)); - } else { - size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - allKernel<<>>( - output, input, input_ndim, output_ndim, output_size, reduce_num, - permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream_) const { - - hcStream_t stream = (hcStream_t)stream_; - -#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \ - launchKernel( \ - _info, \ - (bool *)output, (const Tdata *)input, \ - stream, workspace, workspace_size) - -#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BOOL) \ - return CALCULATE_ALL(BLOCK_SIZE, bool); \ - else if (_info.dtype == INFINI_DTYPE_U8) \ - return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_ALL_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::all::metax diff --git a/src/infiniop/ops/all/moore/all_moore.h b/src/infiniop/ops/all/moore/all_moore.h deleted file mode 100644 index d7dab5396..000000000 --- a/src/infiniop/ops/all/moore/all_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ALL_MOORE_H__ -#define __ALL_MOORE_H__ - -#include "../all_desc.h" - -DESCRIPTOR(moore); - -#endif diff --git a/src/infiniop/ops/all/moore/all_moore.mu b/src/infiniop/ops/all/moore/all_moore.mu deleted file mode 100644 index 624d47391..000000000 --- a/src/infiniop/ops/all/moore/all_moore.mu +++ /dev/null @@ -1,117 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "all_moore.h" - -namespace op::all::moore { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const AllInfo &info, - bool *output, const Tdata *input, - musaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_musa = permuted_input_shape_musa + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - - if (info.reduce_num == input_size) { - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - bool *temp_output; - CHECK_MOORE(musaMalloc(&temp_output, grid_size * sizeof(bool))); - allReduceTempKernel<<>>( - temp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa); - finalAllReduceKernel<<<1, BLOCK_SIZE>>>(output, temp_output, grid_size); - CHECK_MOORE(musaFree(temp_output)); - } else { - size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - allKernel<<>>( - output, input, input_ndim, output_ndim, output_size, reduce_num, - permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream_) const { - - musaStream_t stream = (musaStream_t)stream_; - -#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \ - launchKernel( \ - _info, \ - (bool *)output, (const Tdata *)input, \ - stream, workspace, workspace_size) - -#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BOOL) \ - return CALCULATE_ALL(BLOCK_SIZE, bool); \ - else if (_info.dtype == INFINI_DTYPE_U8) \ - return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_ALL_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::all::moore diff --git a/src/infiniop/ops/all/nvidia/all_nvidia.cu b/src/infiniop/ops/all/nvidia/all_nvidia.cu deleted file mode 100644 index f0858d2f7..000000000 --- a/src/infiniop/ops/all/nvidia/all_nvidia.cu +++ /dev/null @@ -1,117 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "all_nvidia.cuh" - -namespace op::all::nvidia { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const AllInfo &info, - bool *output, const Tdata *input, - cudaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - - if (info.reduce_num == input_size) { - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - bool *temp_output; - CHECK_CUDA(cudaMalloc(&temp_output, grid_size * sizeof(bool))); - allReduceTempKernel<<>>( - temp_output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda); - finalAllReduceKernel<<<1, BLOCK_SIZE>>>(output, temp_output, grid_size); - CHECK_CUDA(cudaFree(temp_output)); - } else { - size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - allKernel<<>>( - output, input, input_ndim, output_ndim, output_size, reduce_num, - permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream_) const { - - cudaStream_t stream = (cudaStream_t)stream_; - -#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \ - launchKernel( \ - _info, \ - (bool *)output, (const Tdata *)input, \ - stream, workspace, workspace_size) - -#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BOOL) \ - return CALCULATE_ALL(BLOCK_SIZE, bool); \ - else if (_info.dtype == INFINI_DTYPE_U8) \ - return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_ALL_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::all::nvidia diff --git a/src/infiniop/ops/all/nvidia/all_nvidia.cuh b/src/infiniop/ops/all/nvidia/all_nvidia.cuh deleted file mode 100644 index 111e0816f..000000000 --- a/src/infiniop/ops/all/nvidia/all_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ALL_NVIDIA_H__ -#define __ALL_NVIDIA_H__ - -#include "../all_desc.h" - -DESCRIPTOR(nvidia); - -#endif // __ALL_CUDA_API_H__ diff --git a/src/infiniop/ops/all/operator.cc b/src/infiniop/ops/all/operator.cc deleted file mode 100644 index c7e70caa7..000000000 --- a/src/infiniop/ops/all/operator.cc +++ /dev/null @@ -1,194 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/all.h" -#include - -#ifdef ENABLE_CPU_API -#include "cpu/all_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) -#include "nvidia/all_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/all_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/all_kunlun.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/all_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateAllDescriptor( - infiniopHandle_t handle, - infiniopAllDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::all::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - input_desc, \ - dim, \ - dim_size, \ - keepdim) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetAllWorkspaceSize(infiniopAllDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopAll( - infiniopAllDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, input, dim, dim_size, keepdim, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyAllDescriptor(infiniopAllDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/avg_pool1d/avg_pool1d.h b/src/infiniop/ops/avg_pool1d/avg_pool1d.h deleted file mode 100644 index a81f46464..000000000 --- a/src/infiniop/ops/avg_pool1d/avg_pool1d.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef __AVG_POOL1D_H__ -#define __AVG_POOL1D_H__ - -#include "../../../utils.h" -#include "../../operator.h" -#include "../../tensor.h" -#include "infiniop/ops/avg_pool1d.h" - -#define DESCRIPTOR(NAMESPACE) \ - namespace op::avg_pool1d::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - AvgPool1dInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - AvgPool1dInfo info, \ - size_t workspace_size_, \ - Opaque *opaque, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size_) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t y_desc, \ - infiniopTensorDescriptor_t x_desc, \ - size_t kernel_size, \ - size_t stride, \ - size_t padding); \ - \ - infiniStatus_t calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *y, \ - const void *x, \ - void *stream) const; \ - }; \ - } - -class AvgPool1dInfo { -private: - AvgPool1dInfo() = default; - -public: - infiniDtype_t dtype; - size_t batch, channels, in_width, out_width; - size_t kernel_size, stride, padding; - - ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width; - ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width; - - static utils::Result createAvgPool1dInfo( - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - size_t kernel_size, - size_t stride, - size_t padding) { - - CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER); - - const infiniDtype_t dtype = y_desc->dtype(); - CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - - CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE); - - size_t batch = x_desc->dim(0); - size_t channels = x_desc->dim(1); - size_t in_width = x_desc->dim(2); - - CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE); - CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE); - - size_t padded_len = in_width + 2 * padding; - - CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE); - - size_t expected_out_width = (padded_len - kernel_size) / stride + 1; - CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE); - - size_t out_width = expected_out_width; - - return utils::Result(AvgPool1dInfo{ - dtype, - batch, channels, in_width, out_width, - kernel_size, stride, padding, - y_desc->stride(0), y_desc->stride(1), y_desc->stride(2), - x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)}); - } -}; - -#endif diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc deleted file mode 100644 index 67e5b6623..000000000 --- a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include "avg_pool1d_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include - -namespace op::avg_pool1d::cpu { - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - size_t kernel_size, - size_t stride, - size_t padding) { - - auto handle = reinterpret_cast(handle_); - - auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); - CHECK_RESULT(info); - - *desc_ptr = new Descriptor( - info.take(), - 0, - nullptr, - handle->device, - handle->device_id); - - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info, - T *y, - const T *x) { - const float inv_kernel = 1.0f / static_cast(info.kernel_size); - -#pragma omp parallel for - for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) { - - ptrdiff_t b = bc / info.channels; - ptrdiff_t c = bc % info.channels; - - size_t y_base = b * info.y_stride_batch + c * info.y_stride_channel; - size_t x_base = b * info.x_stride_batch + c * info.x_stride_channel; - - for (size_t ow = 0; ow < info.out_width; ++ow) { - size_t y_offset = y_base + ow * info.y_stride_width; - - long long start_w = static_cast(ow * info.stride) - info.padding; - long long end_w = start_w + info.kernel_size; - - long long valid_start = std::max(0LL, start_w); - long long valid_end = std::min(static_cast(info.in_width), end_w); - - float sum = 0.0f; - for (long long iw = valid_start; iw < valid_end; ++iw) { - size_t x_offset = x_base + iw * info.x_stride_width; - sum += utils::cast(x[x_offset]); - } - - const float avg = sum * inv_kernel; - y[y_offset] = utils::cast(avg); - } - } - - return INFINI_STATUS_SUCCESS; -} - -#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x) - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) const { - - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return CALCULATE(fp16_t); - case INFINI_DTYPE_BF16: - return CALCULATE(bf16_t); - case INFINI_DTYPE_F32: - return CALCULATE(float); - case INFINI_DTYPE_F64: - return CALCULATE(double); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -#undef CALCULATE - -} // namespace op::avg_pool1d::cpu diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h deleted file mode 100644 index 0b9f6c666..000000000 --- a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_CPU_H__ -#define __INFINIOP_AVG_POOL1D_CPU_H__ - -#include "../avg_pool1d.h" - -DESCRIPTOR(cpu) - -#endif diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh deleted file mode 100644 index 36a11acfc..000000000 --- a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ -#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ - -template -__device__ void avgPool1dKernel( - T *y, - const T *x, - size_t batch, - size_t channels, - size_t in_width, - size_t out_width, - size_t kernel_size, - size_t stride, - size_t padding, - - ptrdiff_t y_stride_batch, - ptrdiff_t y_stride_channel, - ptrdiff_t y_stride_width, - ptrdiff_t x_stride_batch, - ptrdiff_t x_stride_channel, - ptrdiff_t x_stride_width) { - - size_t total_elements = batch * channels * out_width; - - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < total_elements; - idx += gridDim.x * blockDim.x) { - - size_t ow = idx % out_width; - size_t temp = idx / out_width; - size_t c = temp % channels; - size_t b = temp / channels; - - size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; - - long long start_w = static_cast(ow * stride) - padding; - - T sum = 0; - - for (size_t k = 0; k < kernel_size; ++k) { - long long iw = start_w + k; - - if (iw >= 0 && iw < static_cast(in_width)) { - size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width; - sum += x[x_offset]; - } - } - -#if defined(ENABLE_ILUVATAR_API) - // Iluvatar __half doesn't accept size_t directly. - y[y_offset] = sum / static_cast(static_cast(kernel_size)); -#else - y[y_offset] = sum / static_cast(kernel_size); -#endif - } -} - -#endif diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h deleted file mode 100644 index 576da66de..000000000 --- a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_METAX_H__ -#define __INFINIOP_AVG_POOL1D_METAX_H__ - -#include "../avg_pool1d.h" - -DESCRIPTOR(metax) - -#endif // __INFINIOP_AVG_POOL1D_METAX_H__ diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca deleted file mode 100644 index 9b3f15b9a..000000000 --- a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca +++ /dev/null @@ -1,170 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "avg_pool1d_metax.h" -#include "../../../devices/metax/metax_kernel_common.h" - -#include - -namespace op::avg_pool1d::metax { - -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - size_t kernel_size, - size_t stride, - size_t padding) { - - auto handle = reinterpret_cast(handle_); - - auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); - CHECK_RESULT(info); - - *desc_ptr = new Descriptor( - info.take(), - 0, - new Opaque{handle->internal()}, - handle->device, - handle->device_id); - - return INFINI_STATUS_SUCCESS; -} - -template -__device__ __forceinline__ Tdata castToOutput(Tcompute val) { - if constexpr (std::is_same_v) { - return __float2half(static_cast(val)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16(static_cast(val)); - } else { - return static_cast(val); - } -} - -template -INFINIOP_METAX_KERNEL avgPool1dGlobalKernel( - Tdata *y, - const Tdata *x, - size_t batch, - size_t channels, - size_t in_width, - size_t out_width, - size_t kernel_size, - size_t stride, - size_t padding, - ptrdiff_t y_stride_batch, - ptrdiff_t y_stride_channel, - ptrdiff_t y_stride_width, - ptrdiff_t x_stride_batch, - ptrdiff_t x_stride_channel, - ptrdiff_t x_stride_width) { - - size_t total_elements = batch * channels * out_width; - Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); - - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < total_elements; - idx += gridDim.x * blockDim.x) { - - size_t ow = idx % out_width; - size_t temp = idx / out_width; - size_t c = temp % channels; - size_t b = temp / channels; - - size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; - size_t x_base = b * x_stride_batch + c * x_stride_channel; - - long long start_w = static_cast(ow * stride) - static_cast(padding); - long long end_w = start_w + static_cast(kernel_size); - long long iw_start = start_w < 0 ? 0 : start_w; - long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; - - Tcompute sum = Tcompute(0); - if (iw_start < iw_end) { - size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; - for (long long iw = iw_start; iw < iw_end; ++iw) { - sum += static_cast(x[x_offset]); - x_offset += x_stride_width; - } - } - - y[y_offset] = castToOutput(sum * inv_kernel); - } -} - -template -infiniStatus_t calculateAvgPool1d( - const AvgPool1dInfo &info, - int max_threads_per_block, - Tdata *y, - const Tdata *x, - hcStream_t stream) { - - size_t total_elements = info.batch * info.channels * info.out_width; - - int block_size = 256; - if (max_threads_per_block > 0 && max_threads_per_block < block_size) { - block_size = max_threads_per_block; - } - - size_t grid_size = (total_elements + block_size - 1) / block_size; - if (grid_size > 65535) { - grid_size = 65535; - } - - avgPool1dGlobalKernel<<>>( - y, x, - info.batch, info.channels, info.in_width, info.out_width, - info.kernel_size, info.stride, info.padding, - info.y_stride_batch, info.y_stride_channel, info.y_stride_width, - info.x_stride_batch, info.x_stride_channel, info.x_stride_width); - - return INFINI_STATUS_SUCCESS; -} - -#define CALCULATE(TDATA, TCOMPUTE) \ - calculateAvgPool1d( \ - _info, \ - _opaque->internal->maxThreadsPerBlock(), \ - (TDATA *)y, \ - (const TDATA *)x, \ - (hcStream_t)stream) - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) const { - - (void)workspace; - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return CALCULATE(half, float); - case INFINI_DTYPE_BF16: - return CALCULATE(cuda_bfloat16, float); - case INFINI_DTYPE_F32: - return CALCULATE(float, float); - case INFINI_DTYPE_F64: - return CALCULATE(double, double); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -#undef CALCULATE - -} // namespace op::avg_pool1d::metax diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h deleted file mode 100644 index 9034d7358..000000000 --- a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ -#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ - -#include - -namespace op::avg_pool1d::moore { - -template -__device__ __forceinline__ Tdata castToOutput(Tcompute val) { - if constexpr (std::is_same_v) { - return __float2half(static_cast(val)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(static_cast(val)); - } else { - return static_cast(val); - } -} - -template -__device__ void avgPool1dKernel( - Tdata *y, - const Tdata *x, - size_t batch, - size_t channels, - size_t in_width, - size_t out_width, - size_t kernel_size, - size_t stride, - size_t padding, - ptrdiff_t y_stride_batch, - ptrdiff_t y_stride_channel, - ptrdiff_t y_stride_width, - ptrdiff_t x_stride_batch, - ptrdiff_t x_stride_channel, - ptrdiff_t x_stride_width) { - - size_t total_elements = batch * channels * out_width; - Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); - - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < total_elements; - idx += gridDim.x * blockDim.x) { - - size_t ow = idx % out_width; - size_t temp = idx / out_width; - size_t c = temp % channels; - size_t b = temp / channels; - - size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; - size_t x_base = b * x_stride_batch + c * x_stride_channel; - - long long start_w = static_cast(ow * stride) - static_cast(padding); - long long end_w = start_w + static_cast(kernel_size); - long long iw_start = start_w < 0 ? 0 : start_w; - long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; - - Tcompute sum = Tcompute(0); - if (iw_start < iw_end) { - size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; - for (long long iw = iw_start; iw < iw_end; ++iw) { - sum += static_cast(x[x_offset]); - x_offset += x_stride_width; - } - } - - y[y_offset] = castToOutput(sum * inv_kernel); - } -} - -} // namespace op::avg_pool1d::moore - -#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h deleted file mode 100644 index 604d06012..000000000 --- a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__ -#define __INFINIOP_AVG_POOL1D_MOORE_H__ - -#include "../avg_pool1d.h" - -DESCRIPTOR(moore) - -#endif // __INFINIOP_AVG_POOL1D_MOORE_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu deleted file mode 100644 index 518d249b9..000000000 --- a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu +++ /dev/null @@ -1,135 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "avg_pool1d_moore.h" - -#include "../../../devices/moore/moore_kernel_common.h" - -#include "avg_pool1d_kernel.h" - -namespace op::avg_pool1d::moore { - -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - size_t kernel_size, - size_t stride, - size_t padding) { - - auto handle = reinterpret_cast(handle_); - - auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); - CHECK_RESULT(info); - - *desc_ptr = new Descriptor( - info.take(), - 0, - new Opaque{handle->internal()}, - handle->device, - handle->device_id); - - return INFINI_STATUS_SUCCESS; -} - -template -INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel( - Tdata *y, - const Tdata *x, - size_t batch, - size_t channels, - size_t in_width, - size_t out_width, - size_t kernel_size, - size_t stride, - size_t padding, - ptrdiff_t y_stride_batch, - ptrdiff_t y_stride_channel, - ptrdiff_t y_stride_width, - ptrdiff_t x_stride_batch, - ptrdiff_t x_stride_channel, - ptrdiff_t x_stride_width) { - - avgPool1dKernel( - y, x, - batch, channels, in_width, out_width, - kernel_size, stride, padding, - y_stride_batch, y_stride_channel, y_stride_width, - x_stride_batch, x_stride_channel, x_stride_width); -} - -template -infiniStatus_t calculateAvgPool1d( - const AvgPool1dInfo &info, - int max_threads_per_block, - Tdata *y, - const Tdata *x, - musaStream_t stream) { - - size_t total_elements = info.batch * info.channels * info.out_width; - - int block_size = 256; - if (max_threads_per_block > 0 && max_threads_per_block < block_size) { - block_size = max_threads_per_block; - } - - size_t grid_size = (total_elements + block_size - 1) / block_size; - if (grid_size > 65535) { - grid_size = 65535; - } - - avgPool1dGlobalKernel<<>>( - y, x, - info.batch, info.channels, info.in_width, info.out_width, - info.kernel_size, info.stride, info.padding, - info.y_stride_batch, info.y_stride_channel, info.y_stride_width, - info.x_stride_batch, info.x_stride_channel, info.x_stride_width); - - return INFINI_STATUS_SUCCESS; -} - -#define CALCULATE(TDATA, TCOMPUTE) \ - calculateAvgPool1d(\ - _info,\ - _opaque->internal->maxThreadsPerBlock(),\ - (TDATA *)y,\ - (const TDATA *)x,\ - (musaStream_t)stream) - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) const { - - (void)workspace; - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return CALCULATE(half, float); - case INFINI_DTYPE_BF16: - return CALCULATE(cuda_bfloat16, float); - case INFINI_DTYPE_F32: - return CALCULATE(float, float); - case INFINI_DTYPE_F64: - return CALCULATE(double, double); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -#undef CALCULATE - -} // namespace op::avg_pool1d::moore diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu deleted file mode 100644 index 634ce9018..000000000 --- a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu +++ /dev/null @@ -1,126 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "avg_pool1d_nvidia.cuh" - -template -__global__ void avgPool1dGlobalKernel( - T *y, - const T *x, - size_t batch, - size_t channels, - size_t in_width, - size_t out_width, - size_t kernel_size, - size_t stride, - size_t padding, - ptrdiff_t y_stride_batch, - ptrdiff_t y_stride_channel, - ptrdiff_t y_stride_width, - ptrdiff_t x_stride_batch, - ptrdiff_t x_stride_channel, - ptrdiff_t x_stride_width) { - - avgPool1dKernel( - y, x, - batch, channels, in_width, out_width, - kernel_size, stride, padding, - y_stride_batch, y_stride_channel, y_stride_width, - x_stride_batch, x_stride_channel, x_stride_width); -} - -namespace op::avg_pool1d::nvidia { - -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - size_t kernel_size, - size_t stride, - size_t padding) { - - auto handle = reinterpret_cast(handle_); - - auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); - CHECK_RESULT(info); - - *desc_ptr = new Descriptor( - info.take(), - 0, - new Opaque{reinterpret_cast(handle)->internal()}, - handle->device, - handle->device_id); - - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t calculateAvgPool1d( - const AvgPool1dInfo &info, - int max_threads_per_block, - T *y, - const T *x, - cudaStream_t stream) { - - size_t total_elements = info.batch * info.channels * info.out_width; - - int block_size = 256; - if (max_threads_per_block > 0 && max_threads_per_block < 256) { - block_size = max_threads_per_block; - } - - size_t grid_size = (total_elements + block_size - 1) / block_size; - if (grid_size > 65535) { - grid_size = 65535; - } - - avgPool1dGlobalKernel<<>>( - y, x, - info.batch, info.channels, info.in_width, info.out_width, - info.kernel_size, info.stride, info.padding, - info.y_stride_batch, info.y_stride_channel, info.y_stride_width, - info.x_stride_batch, info.x_stride_channel, info.x_stride_width); - - return INFINI_STATUS_SUCCESS; -} - -#define CALCULATE(TDATA) \ - calculateAvgPool1d(_info, \ - _opaque->internal->maxThreadsPerBlock(), \ - (TDATA *)y, \ - (const TDATA *)x, \ - (cudaStream_t)stream) - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) const { - - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return CALCULATE(half); - case INFINI_DTYPE_BF16: - return CALCULATE(cuda_bfloat16); - case INFINI_DTYPE_F32: - return CALCULATE(float); - case INFINI_DTYPE_F64: - return CALCULATE(double); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -#undef CALCULATE - -} // namespace op::avg_pool1d::nvidia diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh deleted file mode 100644 index 1019ce354..000000000 --- a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__ -#define __INFINIOP_AVG_POOL1D_CUDA_H__ - -#include "../avg_pool1d.h" - -DESCRIPTOR(nvidia) - -#endif diff --git a/src/infiniop/ops/avg_pool1d/operator.cc b/src/infiniop/ops/avg_pool1d/operator.cc deleted file mode 100644 index c3696daa1..000000000 --- a/src/infiniop/ops/avg_pool1d/operator.cc +++ /dev/null @@ -1,225 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/avg_pool1d.h" - -#ifdef ENABLE_CPU_API -#include "cpu/avg_pool1d_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) -#include "nvidia/avg_pool1d_nvidia.cuh" -#endif -#ifdef ENABLE_ASCEND_API -#include "ascend/avg_pool1d_ascend.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/avg_pool1d_bang.h" -#endif -#ifdef ENABLE_METAX_API -#include "metax/avg_pool1d_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/avg_pool1d_kunlun.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/avg_pool1d_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor( - infiniopHandle_t handle, - infiniopAvgPool1dDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - size_t kernel_size, - size_t stride, - size_t padding) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::avg_pool1d::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y, \ - x, \ - kernel_size, \ - stride, \ - padding) - - switch (handle->device) { -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - CREATE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc, - size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - GET(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef GET -} - -__INFINI_C infiniStatus_t infiniopAvgPool1d( - infiniopAvgPool1dDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, x, stream) - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - CALCULATE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - DELETE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - DELETE(INFINI_DEVICE_ASCEND, ascend); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc deleted file mode 100644 index c1098f3ee..000000000 --- a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc +++ /dev/null @@ -1,99 +0,0 @@ -#include "cross_entropy_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../../reduce/cpu/reduce.h" -#include -#include - -namespace op::cross_entropy::cpu { - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t target_desc) { - - auto x_dtype = x_desc->dtype(); - auto t_dtype = target_desc->dtype(); - - CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); - CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); - - CrossEntropyInfo info{}; - info.dtype = x_dtype; - info.target_dtype = t_dtype; - - info.outer_size = target_desc->numel(); - - info.vocab_size = x_desc->shape().back(); - - info.x_stride = static_cast(info.vocab_size); - - *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info, - T *y, const T *x, const void *target) { - const Tidx *label = reinterpret_cast(target); - -#pragma omp parallel for - for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) { - const T *row = x + i * info->x_stride; - Tidx idx = label[i]; - - if (idx < 0 || static_cast(idx) >= info->vocab_size) { - y[i] = utils::cast(0.f); - continue; - } - - float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1); - - float sum_exp = 0.f; - for (size_t j = 0; j < info->vocab_size; ++j) { - sum_exp += std::exp(utils::cast(row[j]) - max_val); - } - - float log_term = std::log(sum_exp) + max_val; - float target_logit = utils::cast(row[idx]); - y[i] = utils::cast(log_term - target_logit); - } - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info, - T *y, const T *x, const void *target) { - - if (info->target_dtype == INFINI_DTYPE_I32) { - return cross_entropy_kernel(info, y, x, target); - } else if (info->target_dtype == INFINI_DTYPE_I64) { - return cross_entropy_kernel(info, y, x, target); - } - return INFINI_STATUS_BAD_TENSOR_DTYPE; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *y, - const void *x, - const void *target, - void *stream) const { - - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target); - case INFINI_DTYPE_BF16: - return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target); - case INFINI_DTYPE_F32: - return dispatch_target_type(&_info, (float *)y, (const float *)x, target); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -} // namespace op::cross_entropy::cpu diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h deleted file mode 100644 index e274efc9d..000000000 --- a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __CROSS_ENTROPY_CPU_H__ -#define __CROSS_ENTROPY_CPU_H__ - -#include "../cross_entropy.h" - -DESCRIPTOR(cpu) - -#endif diff --git a/src/infiniop/ops/cross_entropy/cross_entropy.h b/src/infiniop/ops/cross_entropy/cross_entropy.h deleted file mode 100644 index 075b17142..000000000 --- a/src/infiniop/ops/cross_entropy/cross_entropy.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef CROSS_ENTROPY_H -#define CROSS_ENTROPY_H - -#include "../../operator.h" -#include "info.h" - -#define DESCRIPTOR(NAMESPACE) \ - namespace op::cross_entropy::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - CrossEntropyInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor(Opaque *opaque, \ - CrossEntropyInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - static infiniStatus_t create(infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t y_desc, \ - infiniopTensorDescriptor_t x_desc, \ - infiniopTensorDescriptor_t target_desc); \ - infiniStatus_t calculate(void *workspace, \ - size_t workspace_size, \ - void *y, \ - const void *x, \ - const void *target, \ - void *stream) const; \ - }; \ - } - -#endif diff --git a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh deleted file mode 100644 index c048c1233..000000000 --- a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef __CROSS_ENTROPY_KERNEL_CUH__ -#define __CROSS_ENTROPY_KERNEL_CUH__ - -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../reduce/cuda/reduce.cuh" - -template -__device__ void crossEntropyKernel( - Tdata *y_, - const Tdata *x_, - const void *target_, - size_t outer_size, - size_t vocab_size, - ptrdiff_t x_stride) { - - size_t row_idx = blockIdx.x; - if (row_idx >= outer_size) { - return; - } - - const Tdata *x = x_ + row_idx * x_stride; - const Tidx *target = reinterpret_cast(target_); - - Tidx label = target[row_idx]; - - Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); - __shared__ Tcompute max_val_shared; - if (threadIdx.x == 0) { - max_val_shared = static_cast(max_val_raw); - } - __syncthreads(); - Tcompute max_val = max_val_shared; - - Tcompute thread_sum = 0.0f; - for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { - Tcompute val = static_cast(x[col]); - thread_sum += expf(val - max_val); - } - - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset); - } - - static __shared__ Tcompute shared_sum[32]; - int lane = threadIdx.x % warpSize; - int warp = threadIdx.x / warpSize; - - if (lane == 0) { - shared_sum[warp] = thread_sum; - } - __syncthreads(); - - Tcompute block_sum = 0.0f; - if (warp == 0) { - - if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) { - block_sum = shared_sum[lane]; - } - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - block_sum += __shfl_down_sync(0xffffffff, block_sum, offset); - } - } - - if (threadIdx.x == 0) { - Tcompute log_term = logf(block_sum) + max_val; - - Tcompute target_logit = 0.0f; - - if (label >= 0 && static_cast(label) < vocab_size) { - target_logit = static_cast(x[label]); - } else { - - log_term = 0.0f; - } - - y_[row_idx] = static_cast(log_term - target_logit); - } -} - -#endif diff --git a/src/infiniop/ops/cross_entropy/info.h b/src/infiniop/ops/cross_entropy/info.h deleted file mode 100644 index a915a4fe4..000000000 --- a/src/infiniop/ops/cross_entropy/info.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef CROSS_ENTROPY_INFO_H -#define CROSS_ENTROPY_INFO_H -#include "../../../utils.h" -#include "../../tensor.h" -#include - -#include - -struct CrossEntropyInfo { - int dtype; - int target_dtype; - size_t outer_size; - size_t vocab_size; - ptrdiff_t x_stride; -}; - -#endif diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h deleted file mode 100644 index 57bccea91..000000000 --- a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __CROSS_ENTROPY_METAX_H__ -#define __CROSS_ENTROPY_METAX_H__ - -#include "../cross_entropy.h" - -DESCRIPTOR(metax) - -#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca deleted file mode 100644 index efd791183..000000000 --- a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca +++ /dev/null @@ -1,188 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "cross_entropy_metax.h" -#include "../../../devices/metax/metax_kernel_common.h" - -#include - -#include "../../../reduce/cuda/reduce.cuh" - -#include - -namespace { - -template -__device__ void crossEntropyKernel( - Tdata *y_, - const Tdata *x_, - const void *target_, - size_t outer_size, - size_t vocab_size, - ptrdiff_t x_stride) { - - size_t row_idx = blockIdx.x; - if (row_idx >= outer_size) { - return; - } - - const Tdata *x = x_ + row_idx * x_stride; - const Tidx *target = reinterpret_cast(target_); - - Tidx label = target[row_idx]; - - Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); - __shared__ Tcompute max_val_shared; - if (threadIdx.x == 0) { - max_val_shared = static_cast(max_val_raw); - } - __syncthreads(); - - Tcompute max_val = max_val_shared; - - Tcompute thread_sum = Tcompute(0); - for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { - Tcompute val = static_cast(x[col]); - thread_sum += expf(val - max_val); - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum); - - if (threadIdx.x == 0) { - if (label < 0 || static_cast(label) >= vocab_size) { - y_[row_idx] = static_cast(0.0f); - return; - } - Tcompute log_term = logf(block_sum) + max_val; - Tcompute target_logit = static_cast(x[label]); - y_[row_idx] = static_cast(log_term - target_logit); - } -} - -template -INFINIOP_METAX_KERNEL crossEntropy( - Tdata *y, const Tdata *x, const void *target, - size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { - crossEntropyKernel( - y, x, target, outer_size, vocab_size, x_stride); -} - -} // namespace - -namespace op::cross_entropy::metax { - -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t target_desc) { - - (void)y_desc; - - auto x_dtype = x_desc->dtype(); - auto t_dtype = target_desc->dtype(); - - CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); - CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); - - CrossEntropyInfo info{}; - info.dtype = x_dtype; - info.target_dtype = t_dtype; - info.vocab_size = x_desc->shape().back(); - info.outer_size = target_desc->numel(); - info.x_stride = static_cast(info.vocab_size); - - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t launchKernel(void *y, const void *x, const void *target, - const CrossEntropyInfo &info, hcStream_t stream) { - dim3 grid(static_cast(info.outer_size), 1, 1); - - if (info.target_dtype == INFINI_DTYPE_I64) { - if (info.dtype == INFINI_DTYPE_F16) { - crossEntropy - <<>>( - (half *)y, (const half *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_BF16) { - crossEntropy - <<>>( - (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_F32) { - crossEntropy - <<>>( - (float *)y, (const float *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - } else if (info.target_dtype == INFINI_DTYPE_I32) { - if (info.dtype == INFINI_DTYPE_F16) { - crossEntropy - <<>>( - (half *)y, (const half *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_BF16) { - crossEntropy - <<>>( - (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_F32) { - crossEntropy - <<>>( - (float *)y, (const float *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *y, - const void *x, - const void *target, - void *stream_) const { - - (void)workspace; - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - auto stream = reinterpret_cast(stream_); - int max_threads = _opaque->internal->maxThreadsPerBlock(); - - if (max_threads >= METAX_BLOCK_SIZE_1024) { - CHECK_STATUS(launchKernel(y, x, target, _info, stream)); - } else if (max_threads >= METAX_BLOCK_SIZE_512) { - CHECK_STATUS(launchKernel(y, x, target, _info, stream)); - } else { - CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::cross_entropy::metax diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h deleted file mode 100644 index 6648b0e32..000000000 --- a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef __CROSS_ENTROPY_KERNEL_CUH__ -#define __CROSS_ENTROPY_KERNEL_CUH__ - -template -__device__ void crossEntropyKernel( - Tdata *y_, - const Tdata *x_, - const void *target_, - size_t outer_size, - size_t vocab_size, - ptrdiff_t x_stride) { - - size_t row_idx = blockIdx.x; - if (row_idx >= outer_size) { - return; - } - - const Tdata *x = x_ + row_idx * x_stride; - const Tidx *target = reinterpret_cast(target_); - - Tidx label = target[row_idx]; - - Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); - __shared__ Tcompute max_val_shared; - if (threadIdx.x == 0) { - max_val_shared = static_cast(max_val_raw); - } - __syncthreads(); - - Tcompute max_val = max_val_shared; - - Tcompute thread_sum = Tcompute(0); - for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { - Tcompute val = static_cast(x[col]); - thread_sum += expf(val - max_val); - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum); - - if (threadIdx.x == 0) { - if (label < 0 || static_cast(label) >= vocab_size) { - y_[row_idx] = static_cast(0.0f); - return; - } - Tcompute log_term = logf(block_sum) + max_val; - Tcompute target_logit = static_cast(x[label]); - y_[row_idx] = static_cast(log_term - target_logit); - } -} - -#endif diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h deleted file mode 100644 index 454b14617..000000000 --- a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __CROSS_ENTROPY_MOORE_H__ -#define __CROSS_ENTROPY_MOORE_H__ - -#include "../cross_entropy.h" - -DESCRIPTOR(moore) - -#endif diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu deleted file mode 100644 index 2535679dd..000000000 --- a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu +++ /dev/null @@ -1,129 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "cross_entropy_moore.h" - -#include -#include "../../../devices/moore/moore_kernel_common.h" - -#include "../../../reduce/cuda/reduce.cuh" - -#include "cross_entropy_kernel.h" - -template -INFINIOP_MOORE_KERNEL crossEntropy( - Tdata *y, const Tdata *x, const void *target, - size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { - crossEntropyKernel( - y, x, target, outer_size, vocab_size, x_stride); -} - -namespace op::cross_entropy::moore { - -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t target_desc) { - - (void)y_desc; - - auto x_dtype = x_desc->dtype(); - auto t_dtype = target_desc->dtype(); - - CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); - CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); - - CrossEntropyInfo info{}; - info.dtype = x_dtype; - info.target_dtype = t_dtype; - info.vocab_size = x_desc->shape().back(); - info.outer_size = target_desc->numel(); - info.x_stride = static_cast(info.vocab_size); - - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t launchKernel(void *y, const void *x, const void *target, - const CrossEntropyInfo &info, musaStream_t stream) { - dim3 grid(static_cast(info.outer_size), 1, 1); - - if (info.target_dtype == INFINI_DTYPE_I64) { - if (info.dtype == INFINI_DTYPE_F16) { - crossEntropy - <<>>( - (half *)y, (const half *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_BF16) { - crossEntropy - <<>>( - (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_F32) { - crossEntropy - <<>>( - (float *)y, (const float *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - } else if (info.target_dtype == INFINI_DTYPE_I32) { - if (info.dtype == INFINI_DTYPE_F16) { - crossEntropy - <<>>( - (half *)y, (const half *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_BF16) { - crossEntropy - <<>>( - (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_F32) { - crossEntropy - <<>>( - (float *)y, (const float *)x, target, - info.outer_size, info.vocab_size, info.x_stride); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, - void *y, - const void *x, - const void *target, - void *stream_) const { - musaStream_t stream = (musaStream_t)stream_; - (void)workspace; - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { - CHECK_STATUS(launchKernel(y, x, target, _info, stream)); - } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { - CHECK_STATUS(launchKernel(y, x, target, _info, stream)); - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::cross_entropy::moore diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu deleted file mode 100644 index 77e3d2d58..000000000 --- a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu +++ /dev/null @@ -1,107 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "cross_entropy_nvidia.cuh" - -template -INFINIOP_CUDA_KERNEL crossEntropy( - Tdata *y, const Tdata *x, const void *target, - size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { - - crossEntropyKernel( - y, x, target, outer_size, vocab_size, x_stride); -} - -namespace op::cross_entropy::nvidia { - -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t target_desc) { - - auto x_dtype = x_desc->dtype(); - auto t_dtype = target_desc->dtype(); - - CrossEntropyInfo info; - info.dtype = x_dtype; - info.target_dtype = t_dtype; - - info.vocab_size = x_desc->shape().back(); - info.outer_size = target_desc->numel(); - info.x_stride = static_cast(info.vocab_size); - - auto internal = reinterpret_cast(handle)->internal(); - - *desc_ptr = new Descriptor( - new Opaque{internal}, - info, 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -template -infiniStatus_t launchKernel(void *y, const void *x, const void *target, - const CrossEntropyInfo &info, cudaStream_t stream) { - - dim3 grid(static_cast(info.outer_size), 1, 1); - - if (info.target_dtype == INFINI_DTYPE_I64) { - if (info.dtype == INFINI_DTYPE_F16) { - crossEntropy - <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_BF16) { - crossEntropy - <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_F32) { - crossEntropy - <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); - } - } else if (info.target_dtype == INFINI_DTYPE_I32) { - - if (info.dtype == INFINI_DTYPE_F16) { - crossEntropy - <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_BF16) { - crossEntropy - <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); - } else if (info.dtype == INFINI_DTYPE_F32) { - crossEntropy - <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); - } - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, - void *y, - const void *x, - const void *target, - void *stream_) const { - cudaStream_t stream = (cudaStream_t)stream_; - - int max_threads = _opaque->internal->maxThreadsPerBlock(); - - if (max_threads >= 1024) { - CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream)); - } else if (max_threads >= 512) { - CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream)); - } else { - CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::cross_entropy::nvidia diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh deleted file mode 100644 index 786e9d88f..000000000 --- a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __CROSS_ENTROPY_NVIDIA_H__ -#define __CROSS_ENTROPY_NVIDIA_H__ - -#include "../cross_entropy.h" - -DESCRIPTOR(nvidia) - -#endif diff --git a/src/infiniop/ops/cross_entropy/operator.cc b/src/infiniop/ops/cross_entropy/operator.cc deleted file mode 100644 index 75f35fcb7..000000000 --- a/src/infiniop/ops/cross_entropy/operator.cc +++ /dev/null @@ -1,174 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/cross_entropy.h" - -#ifdef ENABLE_CPU_API -#include "cpu/cross_entropy_cpu.h" -#endif - -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) -#include "nvidia/cross_entropy_nvidia.cuh" -#endif - -#ifdef ENABLE_MOORE_API -#include "moore/cross_entropy_moore.h" -#endif -#ifdef ENABLE_METAX_API -#include "metax/cross_entropy_metax.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor( - infiniopHandle_t handle, - infiniopCrossEntropyDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t target_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cross_entropy::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, x_desc, target_desc); - - switch (handle->device) { -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia) -#endif -#ifdef ENABLE_HYGON_API - CREATE(INFINI_DEVICE_HYGON, nvidia) -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore) -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( - infiniopCrossEntropyDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif -#ifdef ENABLE_HYGON_API - GET(INFINI_DEVICE_HYGON, nvidia) -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore) -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET -} - -__INFINI_C infiniStatus_t infiniopCrossEntropy( - infiniopCrossEntropyDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - const void *target, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, x, target, stream); - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia) -#endif -#ifdef ENABLE_HYGON_API - CALCULATE(INFINI_DEVICE_HYGON, nvidia) -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore) -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef CALCULATE -} - -__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor( - infiniopCrossEntropyDescriptor_t desc) { - -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - DESTROY(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - DESTROY(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - DESTROY(INFINI_DEVICE_QY, nvidia) -#endif -#ifdef ENABLE_HYGON_API - DESTROY(INFINI_DEVICE_HYGON, nvidia) -#endif -#ifdef ENABLE_MOORE_API - DESTROY(INFINI_DEVICE_MOORE, moore) -#endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DESTROY -} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc deleted file mode 100644 index ff8ebe395..000000000 --- a/src/infiniop/ops/equal/cpu/equal_cpu.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include - -#include "equal_cpu.h" - -namespace op::equal::cpu { - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - auto compute_dtype = a_desc->dtype(); - auto out_dtype = out_desc->dtype(); - - if (compute_dtype != b_desc->dtype()) { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); - - CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, - INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); - - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_I32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_I64: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::equal::cpu diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h deleted file mode 100644 index fd811f4b0..000000000 --- a/src/infiniop/ops/equal/cpu/equal_cpu.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __EQUAL_CPU_H__ -#define __EQUAL_CPU_H__ - -#include - -#include "../../../elementwise/cpu/elementwise_cpu.h" - -ELEMENTWISE_DESCRIPTOR(equal, cpu) - -namespace op::equal::cpu { - -typedef struct EqualOp { -public: - static constexpr size_t num_inputs = 2; - - template - bool operator()(const Tin0 &a, const Tin1 &b) { - if constexpr (std::is_same_v) { - return a == b; - } else { - return false; - } - } -} EqualOp; - -} // namespace op::equal::cpu - -#endif diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh deleted file mode 100644 index 11ad5981e..000000000 --- a/src/infiniop/ops/equal/cuda/kernel.cuh +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __EQUAL_CUDA_H__ -#define __EQUAL_CUDA_H__ - -#if defined(__MACACC__) -#include -#include -#else -#include -#include -#endif -#include - -namespace op::equal::cuda { - -typedef struct EqualOp { -public: - static constexpr size_t num_inputs = 2; - - template - __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const { - if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) { - static_assert(!std::is_same_v, "half2 is not supported for mixed output dtype"); - } else if constexpr (std::is_same_v) { - return static_cast(__heq(a, b)); - } else { - return static_cast(a == b); - } - } else { - return false; - } - } -} EqualOp; - -} // namespace op::equal::cuda - -#endif diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h deleted file mode 100644 index 6e4cd64b9..000000000 --- a/src/infiniop/ops/equal/metax/equal_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __EQUAL_METAX_API_H__ -#define __EQUAL_METAX_API_H__ - -#include "../../../elementwise/metax/elementwise_metax_api.h" - -ELEMENTWISE_DESCRIPTOR(equal, metax) - -#endif // __EQUAL_METAX_API_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca deleted file mode 100644 index 265e5b5a6..000000000 --- a/src/infiniop/ops/equal/metax/equal_metax.maca +++ /dev/null @@ -1,69 +0,0 @@ -#include "equal_metax.h" - -#include "../../../elementwise/metax/elementwise_metax.h" - -#include "../cuda/kernel.cuh" - -namespace op::equal::metax { - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - - const auto &a_desc = input_desc_vec.at(0); - auto compute_dtype = a_desc->dtype(); - auto out_dtype = out_desc->dtype(); - - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, - INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); - - CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_I32: - return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_I64: - return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/moore/equal_moore.h b/src/infiniop/ops/equal/moore/equal_moore.h deleted file mode 100644 index 2fed1bb40..000000000 --- a/src/infiniop/ops/equal/moore/equal_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __EQUAL_MOORE_API_H__ -#define __EQUAL_MOORE_API_H__ - -#include "../../../elementwise/moore/elementwise_moore_api.h" - -ELEMENTWISE_DESCRIPTOR(equal, moore) - -#endif // __EQUAL_MOORE_API_H__ diff --git a/src/infiniop/ops/equal/moore/equal_moore.mu b/src/infiniop/ops/equal/moore/equal_moore.mu deleted file mode 100644 index d0eb8395d..000000000 --- a/src/infiniop/ops/equal/moore/equal_moore.mu +++ /dev/null @@ -1,140 +0,0 @@ -#include "equal_moore.h" - -#include "../../../elementwise/moore/elementwise_moore.h" - -#include "equal_moore_kernel.h" - -namespace op::equal::moore { -namespace { - -inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { - if (!info.isOutputContiguous()) { - return false; - } - const bool *input_contiguous = info.getInputContiguous(); - const bool *input_broadcasted = info.getInputBroadcasted(); - for (size_t i = 0; i < 2; ++i) { - if (!input_contiguous[i] || input_broadcasted[i]) { - return false; - } - } - return true; -} - -template -INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) { - const auto op = op::equal::moore::EqualOp{}; - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - for (; idx < numel; idx += stride) { - output[idx] = op.template operator()(a[idx], b[idx]); - } -} - -template -infiniStatus_t launch_fast_path(size_t numel, - void *output, - const std::vector &inputs, - void *stream) { - if (numel == 0) { - return INFINI_STATUS_SUCCESS; - } - - constexpr int kBlockSize = 256; - int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); - if (grid > 65535) { - grid = 65535; - } - - auto musa_stream = reinterpret_cast(stream); - equal_contiguous_kernel<<>>( - numel, - reinterpret_cast(output), - reinterpret_cast(inputs[0]), - reinterpret_cast(inputs[1])); - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - - const auto &a_desc = input_desc_vec.at(0); - auto compute_dtype = a_desc->dtype(); - auto out_dtype = out_desc->dtype(); - - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, - INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); - - CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create MOORE elementwise descriptor - CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (can_use_contiguous_fast_path(_info)) { - size_t numel = _info.getOutputSize(); - switch (_dtype) { - case INFINI_DTYPE_F16: - return launch_fast_path(numel, output, inputs, stream); - case INFINI_DTYPE_BF16: - return launch_fast_path(numel, output, inputs, stream); - case INFINI_DTYPE_F32: - return launch_fast_path(numel, output, inputs, stream); - case INFINI_DTYPE_I32: - return launch_fast_path(numel, output, inputs, stream); - case INFINI_DTYPE_I64: - return launch_fast_path(numel, output, inputs, stream); - case INFINI_DTYPE_F64: - return launch_fast_path(numel, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - } - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_I32: - return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_I64: - return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -} // namespace op::equal::moore diff --git a/src/infiniop/ops/equal/moore/equal_moore_kernel.h b/src/infiniop/ops/equal/moore/equal_moore_kernel.h deleted file mode 100644 index a4e32880b..000000000 --- a/src/infiniop/ops/equal/moore/equal_moore_kernel.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __EQUAL_MOORE_KERNEL_H__ -#define __EQUAL_MOORE_KERNEL_H__ - -#include - -namespace op::equal::moore { - -typedef struct EqualOp { -public: - static constexpr size_t num_inputs = 2; - - template - __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const { - if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) { - return __half2float(a) == __half2float(b); - } else if constexpr (std::is_same_v) { - return __bfloat162float(a) == __bfloat162float(b); - } else { - return a == b; - } - } else { - return false; - } - } -} EqualOp; - -} // namespace op::equal::moore - -#endif // __EQUAL_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu deleted file mode 100644 index 5bdf92e6c..000000000 --- a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu +++ /dev/null @@ -1,137 +0,0 @@ -#include -#include -#include - -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" - -#include "../cuda/kernel.cuh" -#include "equal_nvidia.cuh" - -namespace { - -template -INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) { - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - op::equal::cuda::EqualOp op{}; - for (; idx < n; idx += stride) { - output[idx] = op.template operator()(a[idx], b[idx]); - } -} - -template -infiniStatus_t launchFastEqualKernel(size_t numel, - void *output, - const std::vector &inputs, - void *stream) { - if (numel == 0) { - return INFINI_STATUS_SUCCESS; - } - constexpr int block = 256; - int grid = static_cast((numel + block - 1) / block); - grid = std::min(grid, 65535); - auto cuda_stream = reinterpret_cast(stream); - FastEqualKernel<<>>( - numel, - reinterpret_cast(output), - reinterpret_cast(inputs[0]), - reinterpret_cast(inputs[1])); - auto err = cudaGetLastError(); - return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; -} - -} // namespace - -namespace op::equal::nvidia { - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - - const auto &a_desc = input_desc_vec.at(0); - auto compute_dtype = a_desc->dtype(); - auto out_dtype = out_desc->dtype(); - - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, - INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); - - CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - bool fast_path = _info.isOutputContiguous(); - if (fast_path) { - const bool *input_contiguous = _info.getInputContiguous(); - const bool *input_broadcasted = _info.getInputBroadcasted(); - for (size_t i = 0; i < 2; ++i) { - fast_path &= input_contiguous[i] && !input_broadcasted[i]; - } - } - - if (fast_path) { - size_t numel = _info.getOutputSize(); - switch (_dtype) { - case INFINI_DTYPE_F16: - return launchFastEqualKernel(numel, output, inputs, stream); - case INFINI_DTYPE_BF16: - return launchFastEqualKernel(numel, output, inputs, stream); - case INFINI_DTYPE_F32: - return launchFastEqualKernel(numel, output, inputs, stream); - case INFINI_DTYPE_I32: - return launchFastEqualKernel(numel, output, inputs, stream); - case INFINI_DTYPE_I64: - return launchFastEqualKernel(numel, output, inputs, stream); - case INFINI_DTYPE_F64: - return launchFastEqualKernel(numel, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - } - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_I32: - return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_I64: - return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh deleted file mode 100644 index 6565a80b5..000000000 --- a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __EQUAL_CUDA_API_H__ -#define __EQUAL_CUDA_API_H__ - -#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" - -ELEMENTWISE_DESCRIPTOR(equal, nvidia) - -#endif diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc deleted file mode 100644 index 80da07e01..000000000 --- a/src/infiniop/ops/equal/operator.cc +++ /dev/null @@ -1,201 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/equal.h" - -#ifdef ENABLE_CPU_API -#include "cpu/equal_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) -#include "nvidia/equal_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/equal_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/equal_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/equal_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/equal_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor( - infiniopHandle_t handle, - infiniopEqualDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::equal::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopEqual( - infiniopEqualDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc deleted file mode 100644 index f47198580..000000000 --- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "hardswish_cpu.h" - -#include - -namespace op::hardswish::cpu { -namespace { - -inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { - return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; -} - -template -infiniStatus_t launch_contiguous_cpu(const op::elementwise::ElementwiseInfo &info, - void *output, - const std::vector &inputs) { - const T *in = reinterpret_cast(inputs[0]); - T *out = reinterpret_cast(output); - const ptrdiff_t size = static_cast(info.getOutputSize()); - -#pragma omp parallel for if (size > 1024) - for (ptrdiff_t i = 0; i < size; ++i) { - out[i] = HardSwishOp{}(in[i]); - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - const bool fast_path = can_use_contiguous_fast_path(_info); - if (fast_path) { - switch (_dtype) { - case INFINI_DTYPE_BF16: - return launch_contiguous_cpu(_info, output, inputs); - case INFINI_DTYPE_F16: - return launch_contiguous_cpu(_info, output, inputs); - case INFINI_DTYPE_F32: - return launch_contiguous_cpu(_info, output, inputs); - case INFINI_DTYPE_F64: - return launch_contiguous_cpu(_info, output, inputs); - default: - break; - } - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h deleted file mode 100644 index b853663aa..000000000 --- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __HARDSWISH_CPU_H__ -#define __HARDSWISH_CPU_H__ - -#include "../../../elementwise/cpu/elementwise_cpu.h" - -ELEMENTWISE_DESCRIPTOR(hardswish, cpu) - -#include -#include - -namespace op::hardswish::cpu { - -typedef struct HardSwishOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - const float x_f = utils::cast(x); - const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f); - const float result = x_f * clamped * (1.0f / 6.0f); - return utils::cast(result); - } -} HardSwishOp; - -typedef struct HardSwishContiguousOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - - T three = static_cast(3); - T zero = static_cast(0); - T six = static_cast(6); - - T scale = static_cast(0.16666667f); - - T val = x + three; - - val = std::max(zero, val); - val = std::min(six, val); - - return x * val * scale; - } -} HardSwishContiguousOp; - -} // namespace op::hardswish::cpu - -#endif diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh deleted file mode 100644 index 21b6a5f8d..000000000 --- a/src/infiniop/ops/hardswish/cuda/kernel.cuh +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef __HARDSWISH_CUDA_H__ -#define __HARDSWISH_CUDA_H__ - -#include -#if defined(__MACACC__) -#include -#include -#else -#include -#include -#endif - -namespace op::hardswish::cuda { - -typedef struct HardSwishOp { -public: - static constexpr size_t num_inputs = 1; - - template - __device__ __forceinline__ T operator()(const T &x) const { - - if constexpr (std::is_same_v) { - - const half2 three = __float2half2_rn(3.0f); - const half2 scale = __float2half2_rn(0.16666667f); - - half2 val = __hadd2(x, three); - -#if defined(ENABLE_ILUVATAR_API) - - float2 val_f = __half22float2(val); - val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f); - val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f); - val = __floats2half2_rn(val_f.x, val_f.y); -#else - - const half2 zero = __float2half2_rn(0.0f); - const half2 six = __float2half2_rn(6.0f); - -#if __CUDA_ARCH__ >= 800 - - val = __hmin2(__hmax2(val, zero), six); -#else - - val = __hmax2(val, zero); - val = __hmin2(val, six); -#endif -#endif - - return __hmul2(__hmul2(x, val), scale); - - } - - else if constexpr (std::is_same_v) { - - const float x_f = __bfloat162float(x); - - const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); - return __float2bfloat16(x_f * val * 0.16666667f); - - } - - else if constexpr (std::is_same_v) { - const float x_f = __half2float(x); - const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); - return __float2half(x_f * val * 0.16666667f); - - } - - else if constexpr (std::is_same_v) { - - const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f); - return x * val * 0.16666667f; - - } - - else if constexpr (std::is_same_v) { - const double val = fmin(fmax(x + 3.0, 0.0), 6.0); - return x * val * (1.0 / 6.0); - } - } -} HardSwishOp; - -} // namespace op::hardswish::cuda - -#endif diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h deleted file mode 100644 index 16b131aa9..000000000 --- a/src/infiniop/ops/hardswish/metax/hardswish_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __HARDSWISH_METAX_API_H__ -#define __HARDSWISH_METAX_API_H__ - -#include "../../../elementwise/metax/elementwise_metax_api.h" - -ELEMENTWISE_DESCRIPTOR(hardswish, metax) - -#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca deleted file mode 100644 index fc57a9b20..000000000 --- a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca +++ /dev/null @@ -1,58 +0,0 @@ -#include "hardswish_metax.h" - -#include "../../../elementwise/metax/elementwise_metax.h" - -#include "../cuda/kernel.cuh" - -namespace op::hardswish::metax { - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.h b/src/infiniop/ops/hardswish/moore/hardswish_moore.h deleted file mode 100644 index e5861a158..000000000 --- a/src/infiniop/ops/hardswish/moore/hardswish_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __HARDSWISH_MOORE_API_H__ -#define __HARDSWISH_MOORE_API_H__ - -#include "../../../elementwise/moore/elementwise_moore_api.h" - -ELEMENTWISE_DESCRIPTOR(hardswish, moore) - -#endif // __HARDSWISH_MOORE_API_H__ diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu deleted file mode 100644 index 3a1290b35..000000000 --- a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu +++ /dev/null @@ -1,118 +0,0 @@ -#include "hardswish_moore.h" - -#include "../../../elementwise/moore/elementwise_moore.h" - -#include "hardswish_moore_kernel.h" - -namespace op::hardswish::moore { -namespace { - -inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { - return info.isOutputContiguous() && info.getInputSize() == 1 && - info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; -} - -template -INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) { - const auto op = op::hardswish::moore::HardSwishOp{}; - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - for (; idx < numel; idx += stride) { - out[idx] = op(in[idx]); - } -} - -template -infiniStatus_t launch_fast_path(size_t numel, - void *output, - const std::vector &inputs, - void *stream) { - if (numel == 0) { - return INFINI_STATUS_SUCCESS; - } - - constexpr int kBlockSize = 256; - int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); - if (grid > 65535) { - grid = 65535; - } - - auto musa_stream = reinterpret_cast(stream); - hardswish_contiguous_kernel<<>>( - numel, - reinterpret_cast(output), - reinterpret_cast(inputs[0])); - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create MOORE elementwise descriptor - CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - const bool fast_path = can_use_contiguous_fast_path(_info); - if (fast_path) { - switch (_dtype) { - case INFINI_DTYPE_BF16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - case INFINI_DTYPE_F16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - case INFINI_DTYPE_F32: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - case INFINI_DTYPE_F64: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - default: - break; - } - } - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::hardswish::moore diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h deleted file mode 100644 index 60e3dbc60..000000000 --- a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __HARDSWISH_MOORE_KERNEL_H__ -#define __HARDSWISH_MOORE_KERNEL_H__ - -#include -#include - -namespace op::hardswish::moore { - -typedef struct HardSwishOp { -public: - static constexpr size_t num_inputs = 1; - - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - float x_f = __half2float(x); - float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); - return __float2half(x_f * val * 0.16666667f); - } else if constexpr (std::is_same_v) { - float x_f = __bfloat162float(x); - float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); - return __float2bfloat16_rn(x_f * val * 0.16666667f); - } else if constexpr (std::is_same_v) { - float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f); - return x * val * 0.16666667f; - } else if constexpr (std::is_same_v) { - double val = fmin(fmax(x + 3.0, 0.0), 6.0); - return x * val * (1.0 / 6.0); - } else { - float x_f = static_cast(x); - float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); - return static_cast(x_f * val * 0.16666667f); - } - } -} HardSwishOp; - -} // namespace op::hardswish::moore - -#endif // __HARDSWISH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu deleted file mode 100644 index f7736a7fd..000000000 --- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu +++ /dev/null @@ -1,115 +0,0 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" - -#include "../cuda/kernel.cuh" -#include "hardswish_nvidia.cuh" - -#include - -namespace op::hardswish::nvidia { -namespace { - -inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { - return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; -} - -template -__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) { - const auto op = op::hardswish::cuda::HardSwishOp{}; - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - while (idx < numel) { - out[idx] = op(in[idx]); - idx += blockDim.x * gridDim.x; - } -} - -template -infiniStatus_t launch_fast_path(size_t numel, - void *output, - const std::vector &inputs, - void *stream) { - if (numel == 0) { - return INFINI_STATUS_SUCCESS; - } - - constexpr int BLOCK_SIZE = 256; - int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid = std::min(grid, 65535); - - auto *out_ptr = reinterpret_cast(output); - auto *in_ptr = reinterpret_cast(inputs[0]); - auto cuda_stream = reinterpret_cast(stream); - - hardswish_contiguous_kernel<<>>(numel, out_ptr, in_ptr); - cudaError_t err = cudaGetLastError(); - return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; -} - -} // namespace - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - const bool fast_path = can_use_contiguous_fast_path(_info); - if (fast_path) { - switch (_dtype) { - case INFINI_DTYPE_BF16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - case INFINI_DTYPE_F16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - case INFINI_DTYPE_F32: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - case INFINI_DTYPE_F64: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream); - default: - break; - } - } - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh deleted file mode 100644 index eac0dd994..000000000 --- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __HARDSWISH_CUDA_API_H__ -#define __HARDSWISH_CUDA_API_H__ - -#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" - -ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) - -#endif diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc deleted file mode 100644 index ddce97f16..000000000 --- a/src/infiniop/ops/hardswish/operator.cc +++ /dev/null @@ -1,157 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/hardswish.h" - -#ifdef ENABLE_CPU_API -#include "cpu/hardswish_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) -#include "nvidia/hardswish_nvidia.cuh" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/hardswish_moore.h" -#endif -#ifdef ENABLE_METAX_API -#include "metax/hardswish_metax.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor( - infiniopHandle_t handle, - infiniopHardSwishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::hardswish::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopHardSwish( - infiniopHardSwishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, {input}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc deleted file mode 100644 index 1bd276308..000000000 --- a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc +++ /dev/null @@ -1,124 +0,0 @@ -#include "hardtanh_cpu.h" - -#include - -namespace op::hardtanh::cpu { - -Descriptor::Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val) - : InfiniopDescriptor{device_type, device_id}, - _dtype(dtype), - _info(std::move(info)), - _workspace_size(workspace_size), - _min_val(min_val), - _max_val(max_val) {} - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - CHECK_SAME_SHAPE(output_shape, input_shape); - - auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); - CHECK_RESULT(info_result); - - *desc_ptr = new Descriptor( - dtype, - info_result.take(), - 0, - handle->device, - handle->device_id, - min_val, - max_val); - - return INFINI_STATUS_SUCCESS; -} - -template -static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info, - void *output, - const std::vector &inputs, - float min_val, - float max_val) { - if (inputs.empty()) { - return INFINI_STATUS_BAD_PARAM; - } - - T *out = reinterpret_cast(output); - const T *in = reinterpret_cast(inputs[0]); - const auto ndim = info.getNdim(); - const auto *output_shape = info.getOutputShape(); - const auto *output_strides = info.getOutputStrides(); - const auto *input_shape = info.getInputShape(0); - const auto *input_strides = info.getInputStrides(0); - const auto *input_contiguous = info.getInputContiguous(); - ptrdiff_t output_size = info.getOutputSize(); - -#pragma omp parallel for if (output_size > 1024) - for (ptrdiff_t i = 0; i < output_size; ++i) { - const size_t out_idx = info.isOutputContiguous() - ? static_cast(i) - : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides); - const size_t in_idx = input_contiguous[0] - ? static_cast(i) - : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides); - - if constexpr (std::is_same_v || std::is_same_v) { - float value = utils::cast(in[in_idx]); - float clamped = HardTanhOp{}(value, min_val, max_val); - out[out_idx] = utils::cast(clamped); - } else { - out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val); - } - } - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - (void)workspace; - (void)workspace_size; - (void)stream; - - if (inputs.size() != 1) { - return INFINI_STATUS_BAD_PARAM; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); - case INFINI_DTYPE_F16: - return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); - case INFINI_DTYPE_F32: - return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); - case INFINI_DTYPE_F64: - return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} -} // namespace op::hardtanh::cpu diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h deleted file mode 100644 index 09bfb340c..000000000 --- a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef __HARDTANH_CPU_H__ -#define __HARDTANH_CPU_H__ - -#include "../../../elementwise/cpu/elementwise_cpu.h" -#include - -namespace op::hardtanh::cpu { - -class Descriptor final : public InfiniopDescriptor { - infiniDtype_t _dtype; - op::elementwise::ElementwiseInfo _info; - size_t _workspace_size; - float _min_val; - float _max_val; - - Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val); - -public: - ~Descriptor(); - - size_t workspaceSize() const { return _workspace_size; } - - static infiniStatus_t create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val); - - infiniStatus_t calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const; - - float minVal() const { return _min_val; } - float maxVal() const { return _max_val; } -}; - -typedef struct HardTanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x, float min_val, float max_val) const { - T low = static_cast(min_val); - T high = static_cast(max_val); - T val = x < low ? low : x; - return val > high ? high : val; - } -} HardTanhOp; - -} // namespace op::hardtanh::cpu - -#endif diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh deleted file mode 100644 index 28987f82c..000000000 --- a/src/infiniop/ops/hardtanh/cuda/kernel.cuh +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __HARDTANH_CUDA_H__ -#define __HARDTANH_CUDA_H__ - -#if defined(__MACACC__) -#include -#include -#else -#include -#include -#endif -#include - -namespace op::hardtanh::cuda { - -typedef struct HardTanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { - if constexpr (std::is_same_v) { - - float2 x_f2 = __half22float2(x); - x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x)); - x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y)); - return __float22half2_rn(x_f2); - - } else if constexpr (std::is_same_v) { - - float x_f = __bfloat162float(x); - return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f))); - - } else if constexpr (std::is_same_v) { - - float x_f = __half2float(x); - return __float2half(fminf(max_val, fmaxf(min_val, x_f))); - - } else if constexpr (std::is_same_v) { - - return fminf(max_val, fmaxf(min_val, x)); - - } else if constexpr (std::is_same_v) { - - return fmin((double)max_val, fmax((double)min_val, x)); - } - } -} HardTanhOp; - -} // namespace op::hardtanh::cuda - -#endif diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h deleted file mode 100644 index 182157116..000000000 --- a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef __HARDTANH_METAX_API_H__ -#define __HARDTANH_METAX_API_H__ - -#include "../../../elementwise/metax/elementwise_metax_api.h" - -namespace op::hardtanh::metax { - -class Descriptor final : public InfiniopDescriptor { - infiniDtype_t _dtype; - op::elementwise::ElementwiseInfo _info; - std::unique_ptr _device_info; - size_t _workspace_size; - float _min_val; - float _max_val; - - Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - op::elementwise::metax::DeviceImpl *device_info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val); - -public: - ~Descriptor(); - - size_t workspaceSize() const { return _workspace_size; } - - static infiniStatus_t create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val); - - infiniStatus_t calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const; -}; - -} // namespace op::hardtanh::metax - -#endif // __HARDTANH_METAX_API_H__ diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca deleted file mode 100644 index 596316e23..000000000 --- a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca +++ /dev/null @@ -1,95 +0,0 @@ -#include "hardtanh_metax.h" - -#include "../../../elementwise/metax/elementwise_metax.h" - -#include "../cuda/kernel.cuh" - -namespace op::hardtanh::metax { - -Descriptor::Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - op::elementwise::metax::DeviceImpl *device_info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val) - : InfiniopDescriptor{device_type, device_id}, - _dtype(dtype), - _info(std::move(info)), - _device_info(device_info), - _workspace_size(workspace_size), - _min_val(min_val), - _max_val(max_val) {} - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - CHECK_SAME_SHAPE(output_shape, input_shape); - - auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); - CHECK_RESULT(info_result); - auto info = info_result.take(); - auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); - - auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); - CHECK_RESULT(device_impl_result); - - *desc_ptr = new Descriptor( - dtype, - std::move(info), - device_impl_result.take(), - workspace_size, - handle->device, - handle->device_id, - min_val, - max_val); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>( - _info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::HardTanhOp, half>( - _info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::HardTanhOp, float>( - _info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::HardTanhOp, double>( - _info, workspace, output, inputs, stream, _min_val, _max_val); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } -} - -} // namespace op::hardtanh::metax diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h deleted file mode 100644 index 470790d52..000000000 --- a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __HARDTANH_MOORE_API_H__ -#define __HARDTANH_MOORE_API_H__ - -#include "../../../elementwise/moore/elementwise_moore_api.h" - -namespace op::hardtanh::moore { - -class Descriptor final : public InfiniopDescriptor { - infiniDtype_t _dtype; - op::elementwise::ElementwiseInfo _info; - std::unique_ptr _device_info; - size_t _workspace_size; - float _min_val; - float _max_val; - - Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - op::elementwise::moore::DeviceImpl *device_info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val); - -public: - ~Descriptor(); - - size_t workspaceSize() const { return _workspace_size; } - - static infiniStatus_t create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val); - - infiniStatus_t calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const; - - float minVal() const { return _min_val; } - float maxVal() const { return _max_val; } -}; - -} // namespace op::hardtanh::moore - -#endif // __HARDTANH_MOORE_API_H__ diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu deleted file mode 100644 index 40e3dbe41..000000000 --- a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu +++ /dev/null @@ -1,158 +0,0 @@ -#include "hardtanh_moore.h" - -#include "../../../elementwise/moore/elementwise_moore.h" - -#include "hardtanh_moore_kernel.h" - -namespace op::hardtanh::moore { -namespace { - -inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { - return info.isOutputContiguous() && info.getInputSize() == 1 && - info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; -} - -template -INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel, - T *out, - const T *in, - float min_val, - float max_val) { - const auto op = op::hardtanh::moore::HardTanhOp{}; - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - for (; idx < numel; idx += stride) { - out[idx] = op(in[idx], min_val, max_val); - } -} - -template -infiniStatus_t launch_fast_path(size_t numel, - void *output, - const std::vector &inputs, - void *stream, - float min_val, - float max_val) { - if (numel == 0) { - return INFINI_STATUS_SUCCESS; - } - - constexpr int kBlockSize = 256; - int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); - if (grid > 65535) { - grid = 65535; - } - - auto musa_stream = reinterpret_cast(stream); - hardtanh_contiguous_kernel<<>>( - numel, - reinterpret_cast(output), - reinterpret_cast(inputs[0]), - min_val, - max_val); - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -Descriptor::Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - op::elementwise::moore::DeviceImpl *device_info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val) - : InfiniopDescriptor{device_type, device_id}, - _dtype(dtype), - _info(std::move(info)), - _device_info(device_info), - _workspace_size(workspace_size), - _min_val(min_val), - _max_val(max_val) {} - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - CHECK_SAME_SHAPE(output_shape, input_shape); - - auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); - CHECK_RESULT(info_result); - auto info = info_result.take(); - auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); - - auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal()); - CHECK_RESULT(device_impl_result); - - *desc_ptr = new Descriptor( - dtype, - std::move(info), - device_impl_result.take(), - workspace_size, - handle->device, - handle->device_id, - min_val, - max_val); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - const bool fast_path = can_use_contiguous_fast_path(_info); - if (fast_path) { - switch (_dtype) { - case INFINI_DTYPE_BF16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F32: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F64: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - default: - break; - } - } - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::hardtanh::moore diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h deleted file mode 100644 index db0a3c024..000000000 --- a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __HARDTANH_MOORE_KERNEL_H__ -#define __HARDTANH_MOORE_KERNEL_H__ - -#include -#include - -namespace op::hardtanh::moore { - -typedef struct HardTanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { - if constexpr (std::is_same_v) { - float x_f = __half2float(x); - return __float2half(fminf(max_val, fmaxf(min_val, x_f))); - } else if constexpr (std::is_same_v) { - float x_f = __bfloat162float(x); - return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f))); - } else if constexpr (std::is_same_v) { - return fminf(max_val, fmaxf(min_val, x)); - } else if constexpr (std::is_same_v) { - return fmin((double)max_val, fmax((double)min_val, x)); - } else { - float x_f = static_cast(x); - return static_cast(fminf(max_val, fmaxf(min_val, x_f))); - } - } -} HardTanhOp; - -} // namespace op::hardtanh::moore - -#endif // __HARDTANH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu deleted file mode 100644 index 31ba489ab..000000000 --- a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu +++ /dev/null @@ -1,150 +0,0 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" - -#include "../cuda/kernel.cuh" -#include "hardtanh_nvidia.cuh" - -#include - -namespace op::hardtanh::nvidia { -namespace { - -inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { - return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; -} - -template -__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in, float min_val, float max_val) { - const auto op = op::hardtanh::cuda::HardTanhOp{}; - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - while (idx < numel) { - out[idx] = op(in[idx], min_val, max_val); - idx += blockDim.x * gridDim.x; - } -} - -template -infiniStatus_t launch_fast_path(size_t numel, - void *output, - const std::vector &inputs, - void *stream, - float min_val, - float max_val) { - if (numel == 0) { - return INFINI_STATUS_SUCCESS; - } - - constexpr int BLOCK_SIZE = 256; - int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid = std::min(grid, 65535); - - auto *out_ptr = reinterpret_cast(output); - auto *in_ptr = reinterpret_cast(inputs[0]); - auto cuda_stream = reinterpret_cast(stream); - - hardtanh_contiguous_kernel<<>>(numel, out_ptr, in_ptr, min_val, max_val); - cudaError_t err = cudaGetLastError(); - return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; -} - -} // namespace - -Descriptor::Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - op::elementwise::nvidia::DeviceImpl *device_info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val) - : InfiniopDescriptor{device_type, device_id}, - _dtype(dtype), - _info(std::move(info)), - _device_info(device_info), - _workspace_size(workspace_size), - _min_val(min_val), - _max_val(max_val) {} - -Descriptor::~Descriptor() = default; - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); - CHECK_SAME_SHAPE(output_shape, input_shape); - - auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); - CHECK_RESULT(info_result); - auto info = info_result.take(); - auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); - - auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); - CHECK_RESULT(device_impl_result); - - *desc_ptr = new Descriptor( - dtype, - std::move(info), - device_impl_result.take(), - workspace_size, - handle->device, - handle->device_id, - min_val, - max_val); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - const bool fast_path = can_use_contiguous_fast_path(_info); - if (fast_path) { - switch (_dtype) { - case INFINI_DTYPE_BF16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F16: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F32: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F64: - return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); - default: - break; - } - } - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::hardtanh::nvidia diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh deleted file mode 100644 index ebd27d80e..000000000 --- a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __HARDTANH_CUDA_API_H__ -#define __HARDTANH_CUDA_API_H__ - -#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" - -namespace op::hardtanh::nvidia { - -class Descriptor final : public InfiniopDescriptor { - infiniDtype_t _dtype; - op::elementwise::ElementwiseInfo _info; - std::unique_ptr _device_info; - size_t _workspace_size; - float _min_val; - float _max_val; - - Descriptor(infiniDtype_t dtype, - op::elementwise::ElementwiseInfo info, - op::elementwise::nvidia::DeviceImpl *device_info, - size_t workspace_size, - infiniDevice_t device_type, - int device_id, - float min_val, - float max_val); - -public: - ~Descriptor(); - - size_t workspaceSize() const { return _workspace_size; } - - static infiniStatus_t create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec, - float min_val, - float max_val); - - infiniStatus_t calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const; - - float minVal() const { return _min_val; } - float maxVal() const { return _max_val; } -}; - -} // namespace op::hardtanh::nvidia - -#endif diff --git a/src/infiniop/ops/hardtanh/operator.cc b/src/infiniop/ops/hardtanh/operator.cc deleted file mode 100644 index f3c782224..000000000 --- a/src/infiniop/ops/hardtanh/operator.cc +++ /dev/null @@ -1,161 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/hardtanh.h" - -#ifdef ENABLE_CPU_API -#include "cpu/hardtanh_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) -#include "nvidia/hardtanh_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/hardtanh_metax.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/hardtanh_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor( - infiniopHandle_t handle, - infiniopHardTanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - float min_val, - float max_val) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::hardtanh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}, \ - min_val, \ - max_val) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopHardTanh( - infiniopHardTanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, {input}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/sum/cpu/sum_cpu.cc b/src/infiniop/ops/sum/cpu/sum_cpu.cc deleted file mode 100644 index cbc9c6fe0..000000000 --- a/src/infiniop/ops/sum/cpu/sum_cpu.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include "sum_cpu.h" -#include "../../../../utils.h" -#include "../../../devices/cpu/common_cpu.h" -namespace op::sum::cpu { - -Descriptor::~Descriptor() {} -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - - *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -template -infiniStatus_t calculateSum( - const SumInfo *info, - T *output, - const T *input) { - if (info->reduce_dim_size == info->permuted_input_shape.size()) { // 规约到标量 - float tempSum = 0.; - for (size_t index = 0; index < info->input_size; index++) { - size_t input_offset = op::common_cpu::indexToOffset(index, info->permuted_input_shape.size(), info->permuted_input_shape.data(), info->permuted_input_strides.data()); - tempSum += utils::cast(input[input_offset]); - } - output[0] = utils::cast(tempSum); - return INFINI_STATUS_SUCCESS; - } else { - for (size_t i = 0; i < info->output_size; i++) { - size_t output_offset = op::common_cpu::indexToOffset(i, info->output_shape.size(), info->output_shape.data(), info->output_strides.data()); - float tempSum = 0.; - for (size_t j = 0; j < info->reduce_num; j++) { - size_t input_offset = op::common_cpu::indexToOffset(j + i * info->reduce_num, info->permuted_input_shape.size(), info->permuted_input_shape.data(), info->permuted_input_strides.data()); - tempSum += utils::cast(input[input_offset]); - } - output[output_offset] = utils::cast(tempSum); - } - return INFINI_STATUS_SUCCESS; - } -} -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) const { - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return calculateSum(&_info, (fp16_t *)output, reinterpret_cast(input)); - case INFINI_DTYPE_F32: - return calculateSum(&_info, (float *)output, reinterpret_cast(input)); - case INFINI_DTYPE_BF16: - return calculateSum(&_info, (bf16_t *)output, reinterpret_cast(input)); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::sum::cpu diff --git a/src/infiniop/ops/sum/cpu/sum_cpu.h b/src/infiniop/ops/sum/cpu/sum_cpu.h deleted file mode 100644 index 26d6789d1..000000000 --- a/src/infiniop/ops/sum/cpu/sum_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SUM_CPU_H__ -#define __INFINIOP_SUM_CPU_H__ - -#include "../sum_desc.h" - -DESCRIPTOR(cpu); - -#endif // __INFINIOP_SUM_CPU_H__ diff --git a/src/infiniop/ops/sum/cuda/kernel.cuh b/src/infiniop/ops/sum/cuda/kernel.cuh deleted file mode 100644 index 5808446b4..000000000 --- a/src/infiniop/ops/sum/cuda/kernel.cuh +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef __SUM_CUDA_H__ -#define __SUM_CUDA_H__ - -__forceinline__ __device__ __host__ size_t -indexToOffset( - size_t flat_index, - size_t ndim, - const size_t *shape, - const ptrdiff_t *strides) { - size_t res = 0; - for (size_t i = ndim; i-- > 0;) { - res += (flat_index % shape[i]) * strides[i]; - flat_index /= shape[i]; - } - return res; -} - -template -__global__ void sumAllKernel( - Tcompute *output, - const Tdata *input, - size_t input_size, - size_t permuted_input_shape_size, - size_t *permuted_input_shape, - ptrdiff_t *permuted_input_strides) { - __shared__ Tcompute s_data[BLOCK_SIZE]; - size_t tid = threadIdx.x; - size_t idx = tid + blockIdx.x * blockDim.x; - if (idx < input_size) { - size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides); - s_data[tid] = static_cast(input[input_offset]); - } else { - s_data[tid] = static_cast(0.f); - } - __syncthreads(); - for (size_t s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - s_data[tid] += s_data[tid + s]; - } - __syncthreads(); - } - - if (tid == 0) { - atomicAdd(output, s_data[0]); - } -} - -template -__global__ void sumKernel( - T *output, - const T *input, - size_t permuted_input_shape_size, - size_t output_shape_size, - size_t output_size, - size_t reduce_num, - size_t *permuted_input_shape, - size_t *output_shape, - ptrdiff_t *permuted_input_strides, - ptrdiff_t *output_strides) { - size_t tid = threadIdx.x; - size_t idx = tid + blockIdx.x * blockDim.x; - if (idx >= output_size) { - return; - } - size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides); - float tempSum = static_cast(0.f); - for (size_t i = 0; i < reduce_num; i++) { - size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides); - tempSum += static_cast(input[input_offset]); - } - output[output_index] = static_cast(tempSum); -} - -#endif // __SUM_CUDA_H__ diff --git a/src/infiniop/ops/sum/info.h b/src/infiniop/ops/sum/info.h deleted file mode 100644 index a69af8b44..000000000 --- a/src/infiniop/ops/sum/info.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef __SUM_INFO_H__ -#define __SUM_INFO_H__ -#include "../../../utils.h" -#include "../../tensor.h" -#include -#include -#include - -namespace op::sum { -class SumInfo { - SumInfo() = default; - -public: - infiniDtype_t dtype; - std::vector permuted_input_shape; // need to permute - std::vector output_shape; - std::vector permuted_input_strides; // need to permute - std::vector output_strides; - size_t reduce_dim_size; // reduce dim size - size_t reduce_num; // number of elements to reduce for each output element - size_t input_size; // total number of input elements - size_t output_size; // total number of output elements - static utils::Result create( - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto input_shape = input_desc->shape(); - auto input_strides = input_desc->strides(); - size_t input_ndim = input_desc->ndim(); - size_t reduce_num = 1; - for (size_t i = 0; i < dim_size; i++) { - reduce_num *= input_shape[dim[i]]; - } - std::vector permute_order; - for (size_t i = 0; i < input_ndim; i++) { - if (std::find(dim, dim + dim_size, i) == dim + dim_size) { - permute_order.push_back(i); - } - } - for (size_t i = 0; i < dim_size; i++) { - permute_order.push_back(dim[i]); - } - std::vector permuted_input_shape; - std::vector permuted_input_strides; - for (size_t i = 0; i < permute_order.size(); i++) { - permuted_input_shape.push_back(input_shape[permute_order[i]]); - permuted_input_strides.push_back(input_strides[permute_order[i]]); - } - return utils::Result(SumInfo{input_desc->dtype(), - permuted_input_shape, - output_desc->shape(), - permuted_input_strides, - output_desc->strides(), - dim_size, - reduce_num, - input_desc->numel(), - output_desc->numel()}); - } -}; -} // namespace op::sum - -#endif diff --git a/src/infiniop/ops/sum/metax/sum_metax.h b/src/infiniop/ops/sum/metax/sum_metax.h deleted file mode 100644 index 5e8e6754c..000000000 --- a/src/infiniop/ops/sum/metax/sum_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __SUM_METAX_H__ -#define __SUM_METAX_H__ - -#include "../sum_desc.h" - -DESCRIPTOR(metax); - -#endif diff --git a/src/infiniop/ops/sum/metax/sum_metax.maca b/src/infiniop/ops/sum/metax/sum_metax.maca deleted file mode 100644 index 5affe779f..000000000 --- a/src/infiniop/ops/sum/metax/sum_metax.maca +++ /dev/null @@ -1,116 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "sum_metax.h" - -namespace op::sum::metax { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const SumInfo &info, - T *output, const T *input, - hcStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_hc = permuted_input_shape_hc + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - - if (info.reduce_num == input_size) { - T zero = static_cast(0.0f); - CHECK_METAX(hcMemcpyAsync(output, &zero, sizeof(T), hcMemcpyHostToDevice, stream)); - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumAllKernel<<>>( - output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc); - } else { - size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumKernel<<>>( - output, input, input_ndim, output_ndim, output_size, reduce_num, - permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream_) const { - - hcStream_t stream = (hcStream_t)stream_; - -#define CALCULATE_SUM(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (T *)output, (const T *)input, \ - stream, workspace, workspace_size) - -#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_SUM(BLOCK_SIZE, __hpcc_bfloat16); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_SUM(BLOCK_SIZE, half); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_SUM(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { - CALCULATE_SUM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024) - } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { - CALCULATE_SUM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::sum::metax diff --git a/src/infiniop/ops/sum/moore/sum_moore.h b/src/infiniop/ops/sum/moore/sum_moore.h deleted file mode 100644 index ca7e18aa3..000000000 --- a/src/infiniop/ops/sum/moore/sum_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __SUM_MOORE_H__ -#define __SUM_MOORE_H__ - -#include "../sum_desc.h" - -DESCRIPTOR(moore); - -#endif diff --git a/src/infiniop/ops/sum/moore/sum_moore.mu b/src/infiniop/ops/sum/moore/sum_moore.mu deleted file mode 100644 index 8c465460e..000000000 --- a/src/infiniop/ops/sum/moore/sum_moore.mu +++ /dev/null @@ -1,133 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "sum_moore.h" - -namespace op::sum::moore { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const SumInfo &info, - T *output, const T *input, - musaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_musa = permuted_input_shape_musa + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - - if (info.reduce_num == input_size) { - if constexpr (std::is_same_v) { - // 需要解决 moore不支持bf16的atomic add的问题 - float zero = 0.0f; - float *tmp_output; - CHECK_MOORE(musaMalloc(&tmp_output, sizeof(float))); - CHECK_MOORE(musaMemcpyAsync(tmp_output, &zero, sizeof(float), musaMemcpyHostToDevice, stream)); - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumAllKernel<<>>( - tmp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa); - // 可以自定义 kernel,将 float -> T,这里直接memcpy了 - float host_val; - CHECK_MOORE(musaMemcpy(&host_val, tmp_output, sizeof(float), musaMemcpyDeviceToHost)); - T out_val = static_cast(host_val); - CHECK_MOORE(musaMemcpyAsync(output, &out_val, sizeof(T), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaFree(tmp_output)); - } else { - T zero = static_cast(0.0f); - CHECK_MOORE(musaMemcpyAsync(output, &zero, sizeof(T), musaMemcpyHostToDevice, stream)); - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumAllKernel<<>>( - output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa); - } - } else { - size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumKernel<<>>( - output, input, input_ndim, output_ndim, output_size, reduce_num, - permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream_) const { - - musaStream_t stream = (musaStream_t)stream_; - -#define CALCULATE_SUM(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (T *)output, (const T *)input, \ - stream, workspace, workspace_size) - -#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_SUM(BLOCK_SIZE, __mt_bfloat16); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_SUM(BLOCK_SIZE, half); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_SUM(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { - CALCULATE_SUM_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024) - } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { - CALCULATE_SUM_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::sum::moore diff --git a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu b/src/infiniop/ops/sum/nvidia/sum_nvidia.cu deleted file mode 100644 index af052be0a..000000000 --- a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu +++ /dev/null @@ -1,118 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "sum_nvidia.cuh" - -namespace op::sum::nvidia { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const SumInfo &info, - T *output, const T *input, - cudaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - - if (info.reduce_num == input_size) { - T zero = static_cast(0.0f); - CHECK_CUDA(cudaMemcpyAsync(output, &zero, sizeof(T), cudaMemcpyHostToDevice, stream)); - size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumAllKernel<<>>( - output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda); - } else { - size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - sumKernel<<>>( - output, input, input_ndim, output_ndim, output_size, reduce_num, - permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream_) const { - - cudaStream_t stream = (cudaStream_t)stream_; - -#define CALCULATE_SUM(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (T *)output, (const T *)input, \ - stream, workspace, workspace_size) - -#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_SUM(BLOCK_SIZE, __nv_bfloat16); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_SUM(BLOCK_SIZE, half); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_SUM(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { - CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) - } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { - CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) - } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { - CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::sum::nvidia diff --git a/src/infiniop/ops/sum/nvidia/sum_nvidia.cuh b/src/infiniop/ops/sum/nvidia/sum_nvidia.cuh deleted file mode 100644 index fd44a0246..000000000 --- a/src/infiniop/ops/sum/nvidia/sum_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __SUM_NVIDIA_H__ -#define __SUM_NVIDIA_H__ - -#include "../sum_desc.h" - -DESCRIPTOR(nvidia); - -#endif // __SUM_CUDA_API_H__ diff --git a/src/infiniop/ops/sum/operator.cc b/src/infiniop/ops/sum/operator.cc deleted file mode 100644 index b6e1fa7f5..000000000 --- a/src/infiniop/ops/sum/operator.cc +++ /dev/null @@ -1,194 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/sum.h" -#include - -#ifdef ENABLE_CPU_API -#include "cpu/sum_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) -#include "nvidia/sum_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/sum_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/sum_kunlun.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/sum_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateSumDescriptor( - infiniopHandle_t handle, - infiniopSumDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool keepdim) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sum::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - input_desc, \ - dim, \ - dim_size, \ - keepdim) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetSumWorkspaceSize(infiniopSumDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopSum( - infiniopSumDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - size_t *dim, - size_t dim_size, - bool keepdim, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, input, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroySumDescriptor(infiniopSumDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/sum/sum_desc.h b/src/infiniop/ops/sum/sum_desc.h deleted file mode 100644 index 2477f9bec..000000000 --- a/src/infiniop/ops/sum/sum_desc.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef INFINIOP_SUM_DESCRIPTOR_H_ -#define INFINIOP_SUM_DESCRIPTOR_H_ -#include "../../../utils.h" -#include "../../operator.h" -#include "../../tensor.h" - -#include "info.h" - -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::sum::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - SumInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - SumInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t output_desc, \ - infiniopTensorDescriptor_t input_desc, \ - size_t *dim, \ - size_t dim_size, \ - bool keepdim); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - void *output, \ - const void *input, \ - void *stream) const; \ - }; \ - } - -#endif diff --git a/src/infiniop/ops/topk/cpu/topk_cpu.cc b/src/infiniop/ops/topk/cpu/topk_cpu.cc deleted file mode 100644 index 388cf2b05..000000000 --- a/src/infiniop/ops/topk/cpu/topk_cpu.cc +++ /dev/null @@ -1,130 +0,0 @@ -#include "topk_cpu.h" -#include "../../../../utils.h" -#include "../../../devices/cpu/common_cpu.h" -#include -#include -namespace op::topk::cpu { - -Descriptor::~Descriptor() {} -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted) { - auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted); - CHECK_RESULT(result); - - *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -template -infiniStatus_t calculateTopK( - const TopKInfo &info, - Tdata *values_output, - int32_t *indices_output, - const Tdata *input, - size_t k, - size_t dim, - bool largest, - bool sorted) { - if (k == 0) { - return INFINI_STATUS_SUCCESS; - } - for (size_t i = 0; i < info.n_iteration; i++) { - size_t index = i; - size_t input_start = 0; - size_t output_start = 0; - for (size_t j = info.ndim - 1; j >= 0; j--) { - if (j == dim) { - continue; - } - input_start += (index % info.input_shape[j]) * info.input_strides[j]; - output_start += (index % info.output_shape[j]) * info.output_strides[j]; - index /= info.input_shape[j]; - } - using elem_t = std::pair; - std::vector vi_queue(info.dim_elements); - for (size_t j = 0; j < info.dim_elements; j++) { - vi_queue[j].first = input[input_start + j * info.input_strides[dim]]; - vi_queue[j].second = j; - } - bool use_partial_sort = static_cast(k) * 64 <= info.dim_elements; - - if (use_partial_sort) { - if (largest) { - std::partial_sort(vi_queue.begin(), vi_queue.begin() + k, vi_queue.end(), - [](const elem_t &a, const elem_t &b) -> bool { - return utils::cast(a.first) > utils::cast(b.first); - }); - } else { - std::partial_sort(vi_queue.begin(), vi_queue.begin() + k, vi_queue.end(), - [](const elem_t &a, const elem_t &b) -> bool { - return utils::cast(a.first) < utils::cast(b.first); - }); - } - } else { - if (largest) { - std::nth_element(vi_queue.begin(), vi_queue.begin() + k - 1, vi_queue.end(), - [](const elem_t &a, const elem_t &b) -> bool { - return utils::cast(a.first) > utils::cast(b.first); - }); - if (sorted) { - std::sort(vi_queue.begin(), vi_queue.begin() + k, // 注意:PyTorch 这里是 k,不是 k-1 - [](const elem_t &a, const elem_t &b) -> bool { - return utils::cast(a.first) > utils::cast(b.first); - }); - } - } else { - std::nth_element(vi_queue.begin(), vi_queue.begin() + k - 1, vi_queue.end(), - [](const elem_t &a, const elem_t &b) -> bool { - return utils::cast(a.first) < utils::cast(b.first); - }); - if (sorted) { - std::sort(vi_queue.begin(), vi_queue.begin() + k, // 注意:PyTorch 这里是 k,不是 k-1 - [](const elem_t &a, const elem_t &b) -> bool { - return utils::cast(a.first) < utils::cast(b.first); - }); - } - } - } - for (size_t j = 0; j < k; j++) { - values_output[output_start + j * info.output_strides[dim]] = vi_queue[j].first; - indices_output[output_start + j * info.output_strides[dim]] = (int32_t)vi_queue[j].second; - } - } - return INFINI_STATUS_SUCCESS; -} -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *values_output, - void *indices_output, - const void *input, - size_t k, - size_t dim, - bool largest, - bool sorted, - void *stream) const { - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return calculateTopK(_info, (fp16_t *)values_output, (int32_t *)indices_output, reinterpret_cast(input), k, dim, largest, sorted); - case INFINI_DTYPE_F32: - return calculateTopK(_info, (float *)values_output, (int32_t *)indices_output, reinterpret_cast(input), k, dim, largest, sorted); - case INFINI_DTYPE_BF16: - return calculateTopK(_info, (bf16_t *)values_output, (int32_t *)indices_output, reinterpret_cast(input), k, dim, largest, sorted); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::topk::cpu diff --git a/src/infiniop/ops/topk/cpu/topk_cpu.h b/src/infiniop/ops/topk/cpu/topk_cpu.h deleted file mode 100644 index 57888f326..000000000 --- a/src/infiniop/ops/topk/cpu/topk_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_TOPK_CPU_H__ -#define __INFINIOP_TOPK_CPU_H__ - -#include "../topk_desc.h" - -DESCRIPTOR(cpu); - -#endif // __INFINIOP_TOPK_CPU_H__ diff --git a/src/infiniop/ops/topk/cuda/kernel.cuh b/src/infiniop/ops/topk/cuda/kernel.cuh deleted file mode 100644 index 13146b45f..000000000 --- a/src/infiniop/ops/topk/cuda/kernel.cuh +++ /dev/null @@ -1,253 +0,0 @@ -#ifndef __TOPK_CUDA_KERNEL_CUH__ -#define __TOPK_CUDA_KERNEL_CUH__ - -#include // NAN -#include -#include - -namespace op::topk::cuda { -__forceinline__ __device__ __host__ size_t baseOffsetExcludingDim( - size_t flat_row, - size_t ndim, - const size_t *shape, - const ptrdiff_t *strides, - size_t dim) { - size_t res = 0; - for (size_t i = ndim; i-- > 0;) { - if (i == dim) { - continue; - } - res += (flat_row % shape[i]) * strides[i]; - flat_row /= shape[i]; - } - return res; -} - -__forceinline__ __device__ __host__ size_t indexToOffset( - size_t flat_index, - size_t ndim, - const size_t *shape, - const ptrdiff_t *strides) { - size_t res = 0; - for (size_t i = ndim; i-- > 0;) { - res += (flat_index % shape[i]) * strides[i]; - flat_index /= shape[i]; - } - return res; -} - -template -__device__ __forceinline__ float to_float(Tdata v); - -template <> -__device__ __forceinline__ float to_float(float v) { return v; } - -template <> -__device__ __forceinline__ float to_float(half v) { return __half2float(v); } - -#if defined(ENABLE_MOORE_API) -using bf16_t = __mt_bfloat16; -#elif defined(ENABLE_METAX_API) -using bf16_t = __hpcc_bfloat16; -#else -// CUDA / NVIDIA / ILUVATAR -using bf16_t = __nv_bfloat16; -#endif -template <> -__device__ __forceinline__ float to_float(bf16_t v) { - return __bfloat162float(v); -} - -// float -> ordered uint32 -__device__ __forceinline__ uint32_t float_to_uint_ordered(float value) { - uint32_t bits = *reinterpret_cast(&value); - uint32_t mask = (uint32_t)(-((int32_t)bits >> 31)) | 0x80000000u; - return bits ^ mask; -} - -template -__global__ void gather_rowwise(const Tdata *input, uint32_t *cur_vals, int32_t *cur_idx, - size_t rows, size_t n, size_t ndim, size_t dim, const size_t *shape, const ptrdiff_t *strides) { - size_t row = blockIdx.y; - size_t i = threadIdx.x + blockIdx.x * blockDim.x; - if (row >= rows || i >= n) { - return; - } - size_t base = baseOffsetExcludingDim(row, ndim, shape, strides, dim); - size_t off = base + i * strides[dim]; - cur_vals[row * n + i] = float_to_uint_ordered(to_float(input[off])); - cur_idx[row * n + i] = i; -} - -__global__ void init_row_state(int32_t *cur_n, int32_t *rem_k, int32_t *out_pos, size_t rows, size_t n, size_t k) { - int32_t r = blockIdx.x * blockDim.x + threadIdx.x; - if (r < rows) { - cur_n[r] = n; - rem_k[r] = k; - out_pos[r] = 0; - } -} - -__global__ void zero_row_counters(int32_t *ones_count, int32_t *zeros_count, size_t rows) { - int r = blockIdx.x * blockDim.x + threadIdx.x; - if (r < rows) { - ones_count[r] = 0; - zeros_count[r] = 0; - } -} - -template -__global__ void partition_rowwise(const uint32_t *cur_vals, int32_t *cur_idx, uint32_t *ones_vals, int32_t *ones_idx, - uint32_t *zeros_vals, int32_t *zeros_idx, const int32_t *cur_n, size_t rows, size_t n, - int32_t bit_pos, bool largest, int32_t *ones_count, int32_t *zeros_count) { - int32_t row = blockIdx.y; - if (row >= rows) { - return; - } - - __shared__ uint32_t sh1_vals[BLOCK_SIZE]; - __shared__ int32_t sh1_idx[BLOCK_SIZE]; - __shared__ uint32_t sh0_vals[BLOCK_SIZE]; - __shared__ int32_t sh0_idx[BLOCK_SIZE]; - __shared__ int sh1_n, sh0_n; - __shared__ int32_t base1, base0; - - int32_t tid = threadIdx.x; - if (tid == 0) { - sh1_n = 0; - sh0_n = 0; - } - __syncthreads(); - - int32_t i = blockIdx.x * blockDim.x + tid; - int32_t cn = cur_n[row]; - if (i < cn) { - int32_t off = row * n + i; - int32_t idx = cur_idx[off]; - uint32_t key = cur_vals[off]; - uint32_t cmp_key = largest ? key : ~key; - int32_t b = (cmp_key >> bit_pos) & 1; - - if (b) { - int32_t p = atomicAdd(&sh1_n, 1); - sh1_vals[p] = key; - sh1_idx[p] = idx; - } else { - int32_t p = atomicAdd(&sh0_n, 1); - sh0_vals[p] = key; - sh0_idx[p] = idx; - } - } - __syncthreads(); - - if (tid == 0) { - base1 = atomicAdd(&ones_count[row], sh1_n); - base0 = atomicAdd(&zeros_count[row], sh0_n); - } - __syncthreads(); - - for (int32_t j = tid; j < sh1_n; j += blockDim.x) { - int32_t o = row * n + base1 + j; - ones_vals[o] = sh1_vals[j]; - ones_idx[o] = sh1_idx[j]; - } - for (int32_t j = tid; j < sh0_n; j += blockDim.x) { - int32_t o = row * n + base0 + j; - zeros_vals[o] = sh0_vals[j]; - zeros_idx[o] = sh0_idx[j]; - } -} - -template -__global__ void decide_and_compact(uint32_t *cur_vals, int32_t *cur_idx, const uint32_t *ones_vals, const int32_t *ones_idx, const uint32_t *zeros_vals, const int32_t *zeros_idx, - const int32_t *ones_count, const int32_t *zeros_count, int32_t *cur_n, int32_t *rem_k, int32_t *out_pos, - uint32_t *sel_vals, int32_t *sel_idx, size_t rows, size_t n, size_t k) { - int32_t row = blockIdx.x; - if (row >= rows) { - return; - } - int32_t tid = threadIdx.x; - int32_t rem = rem_k[row]; - if (rem <= 0) { - return; - } - int32_t oc = ones_count[row]; - int32_t zc = zeros_count[row]; - int32_t pos = out_pos[row]; - - bool keep_ones = (oc >= rem); - if (!keep_ones) { - for (int32_t j = tid; j < oc; j += blockDim.x) { - if (pos + j < k) { - int32_t o = row * n + j; - sel_vals[row * k + pos + j] = ones_vals[o]; - sel_idx[row * k + pos + j] = ones_idx[o]; - } - } - } - __syncthreads(); - if (tid == 0) { - if (keep_ones) { - cur_n[row] = oc; - } else { - out_pos[row] = pos + oc; - rem_k[row] = rem - oc; - cur_n[row] = zc; - } - } - __syncthreads(); - int32_t new_n = cur_n[row]; - for (int32_t j = tid; j < new_n; j += blockDim.x) { - int32_t o = row * n + j; - cur_vals[o] = keep_ones ? ones_vals[o] : zeros_vals[o]; - cur_idx[o] = keep_ones ? ones_idx[o] : zeros_idx[o]; - } -} - -template -__global__ void take_remaining(const uint32_t *cur_vals, const int32_t *cur_idx, const int32_t *cur_n, const int32_t *rem_k, const int32_t *out_pos, - uint32_t *sel_vals, int32_t *sel_idx, size_t rows, size_t n, size_t k) { - int32_t row = blockIdx.x; - int32_t tid = threadIdx.x; - if (row >= rows) { - return; - } - int32_t rem = rem_k[row]; - int32_t pos = out_pos[row]; - int32_t cn = cur_n[row]; - - int32_t take = rem; - if (take > cn) { - take = cn; - } - for (int32_t j = tid; j < take; j += blockDim.x) { - if (pos + j < k) { - int32_t o = row * k + pos + j; - sel_vals[o] = cur_vals[row * n + j]; - sel_idx[o] = cur_idx[row * n + j]; - } - } -} - -template -__global__ void scatter_to_output(const Tdata *input, const int32_t *sel_idx, Tdata *values_out, int32_t *indices_out, - size_t rows, size_t k, size_t ndim, size_t dim, const size_t *input_shape, const ptrdiff_t *input_strides, - const size_t *output_shape, const ptrdiff_t *output_strides) { - int32_t row = blockIdx.y; - int32_t j = blockIdx.x * blockDim.x + threadIdx.x; - if (row >= rows || j >= k) { - return; - } - - int32_t output_base = baseOffsetExcludingDim(row, ndim, output_shape, output_strides, dim); - int32_t output_off = output_base + j * output_strides[dim]; - int32_t input_base = baseOffsetExcludingDim(row, ndim, input_shape, input_strides, dim); - int32_t input_off = input_base + sel_idx[row * k + j] * input_strides[dim]; - - values_out[output_off] = input[input_off]; - indices_out[output_off] = sel_idx[row * k + j]; -} - -} // namespace op::topk::cuda - -#endif // __TOPK_CUDA_KERNEL_H__ diff --git a/src/infiniop/ops/topk/info.h b/src/infiniop/ops/topk/info.h deleted file mode 100644 index 4d73d0a5d..000000000 --- a/src/infiniop/ops/topk/info.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef __TOPK_INFO_H__ -#define __TOPK_INFO_H__ -#include "../../../utils.h" -#include "../../tensor.h" -#include -#include -#include - -namespace op::topk { -class TopKInfo { - TopKInfo() = default; - -public: - infiniDtype_t dtype; - std::vector input_shape; - std::vector output_shape; - std::vector input_strides; - std::vector output_strides; - size_t k; - size_t dim; - bool largest; - bool sorted; - size_t ndim; - size_t dim_elements; // processed dim elements - size_t n_iteration; // total number of topk iteration - static utils::Result create( - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted) { - auto input_shape = input_desc->shape(); - auto input_strides = input_desc->strides(); - size_t input_ndim = input_desc->ndim(); - size_t dim_elements = input_shape[dim]; - size_t n_iteration = 1; - for (size_t i = 0; i < input_ndim; i++) { - if (i != dim) { - n_iteration *= input_shape[i]; - } - } - return utils::Result(TopKInfo{input_desc->dtype(), - input_desc->shape(), - values_output_desc->shape(), - input_desc->strides(), - values_output_desc->strides(), - k, - dim, - largest, - sorted, - input_ndim, - dim_elements, - n_iteration}); - } -}; -} // namespace op::topk - -#endif diff --git a/src/infiniop/ops/topk/metax/topk_metax.h b/src/infiniop/ops/topk/metax/topk_metax.h deleted file mode 100644 index 04268bb66..000000000 --- a/src/infiniop/ops/topk/metax/topk_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __TOPK_METAX_H__ -#define __TOPK_METAX_H__ - -#include "../topk_desc.h" - -DESCRIPTOR(metax); - -#endif diff --git a/src/infiniop/ops/topk/metax/topk_metax.maca b/src/infiniop/ops/topk/metax/topk_metax.maca deleted file mode 100644 index 48cd5b97f..000000000 --- a/src/infiniop/ops/topk/metax/topk_metax.maca +++ /dev/null @@ -1,280 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "topk_metax.h" - -#include -#include - -namespace op::topk::metax { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted) { - auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - - workspace_size += (input_desc->ndim() + values_output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - size_t dim_elements = input_desc->shape()[dim]; - size_t n_iteration = 1; - for (size_t i = 0; i < input_desc->ndim(); i++) { - if (i != dim) { - n_iteration *= input_desc->shape()[i]; - } - } - size_t total = n_iteration * dim_elements; - - workspace_size += 3 * total * sizeof(uint32_t); - workspace_size += 3 * total * sizeof(int32_t); - workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t)); - if (sorted) { - workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t)); - } - workspace_size += 5 * n_iteration * sizeof(int32_t); - - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const TopKInfo &info, - Tdata *values_output, int32_t *indices_output, const Tdata *input, - size_t k, size_t dim, bool largest, bool sorted, - hcStream_t stream, void *workspace, size_t workspace_size) { - if (dim >= info.ndim) { - return INFINI_STATUS_BAD_PARAM; - } - if (k == 0) { - return INFINI_STATUS_SUCCESS; - } - if (k > info.dim_elements) { - return INFINI_STATUS_BAD_PARAM; - } - size_t input_ndim = info.ndim; - size_t output_ndim = input_ndim; - size_t n_iteration = info.n_iteration; - size_t dim_elements = info.dim_elements; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *input_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_hc = input_shape_hc + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_hc = input_strides_hc + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_METAX(hcMemcpyAsync(input_shape_hc, info.input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - - const int32_t total = n_iteration * dim_elements; - - uint32_t *cur_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - uint32_t *ones_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - uint32_t *zeros_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - - int32_t *cur_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - int32_t *ones_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - int32_t *zeros_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - - uint32_t *sel_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(uint32_t); - int32_t *sel_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(int32_t); - uint32_t *sel_sorted_vals = nullptr; - int32_t *sel_sorted_idx = nullptr; - if (sorted) { - sel_sorted_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(uint32_t); - sel_sorted_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(int32_t); - } - - int32_t *cur_n = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *rem_k = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *out_pos = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *ones_count = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *zeros_count = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - // init - { - size_t threads = 256; - size_t blocks = (n_iteration + threads - 1) / threads; - op::topk::cuda::init_row_state<<>>(cur_n, rem_k, out_pos, n_iteration, dim_elements, k); - } - // gather input -> cur - { - dim3 block(BLOCK_SIZE); - dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::gather_rowwise<<>>( - input, cur_vals, cur_idx, - n_iteration, dim_elements, - input_ndim, dim, - input_shape_hc, input_strides_hc); - } - // radix select/filter - for (int bit = 31; bit >= 0; --bit) { - { - size_t threads = 256; - size_t blocks = (n_iteration + threads - 1) / threads; - op::topk::cuda::zero_row_counters<<>>(ones_count, zeros_count, n_iteration); - } - - { - dim3 block(BLOCK_SIZE); - dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::partition_rowwise<<>>( - cur_vals, cur_idx, - ones_vals, ones_idx, - zeros_vals, zeros_idx, - cur_n, n_iteration, dim_elements, - bit, largest, - ones_count, zeros_count); - } - - { - op::topk::cuda::decide_and_compact<<>>( - cur_vals, cur_idx, - ones_vals, ones_idx, - zeros_vals, zeros_idx, - ones_count, zeros_count, - cur_n, rem_k, out_pos, - sel_vals, sel_idx, - n_iteration, dim_elements, k); - } - } - - // append remaining - - op::topk::cuda::take_remaining<<>>( - cur_vals, cur_idx, - cur_n, rem_k, out_pos, - sel_vals, sel_idx, - n_iteration, dim_elements, k); - - // sort (CUB block radix sort) - const int32_t *final_idx = sel_idx; - - if (sorted) { - std::vector h_offsets(n_iteration + 1); - for (size_t i = 0; i <= n_iteration; i++) { - h_offsets[i] = i * k; - } - int *d_offsets; - CHECK_METAX(hcMalloc(&d_offsets, (n_iteration + 1) * sizeof(int))); - CHECK_METAX(hcMemcpy(d_offsets, h_offsets.data(), (n_iteration + 1) * sizeof(int), hcMemcpyHostToDevice)); - - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - - if (!largest) { - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - hcMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - } else { - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - hcMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - } - CHECK_METAX(hcFree(d_offsets)); - CHECK_METAX(hcFree(d_temp_storage)); - final_idx = sel_sorted_idx; - } - - // scatter to output (strided write) - { - dim3 block(BLOCK_SIZE); - dim3 grid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::scatter_to_output<<>>( - input, final_idx, - values_output, indices_output, - n_iteration, k, - input_ndim, dim, - input_shape_hc, input_strides_hc, - output_shape_hc, output_strides_hc); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *values_output, - void *indices_output, - const void *input, - size_t k, - size_t dim, - bool largest, - bool sorted, - void *stream_) const { - - hcStream_t stream = (hcStream_t)stream_; - constexpr int ITEMS = 4; -#define CALCULATE_TOPK(BLOCK_SIZE, Tdata) \ - launchKernel( \ - _info, \ - (Tdata *)values_output, (int32_t *)indices_output, (const Tdata *)input, \ - k, dim, largest, sorted, \ - stream, workspace, workspace_size) - -#define CALCULATE_TOPK_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_TOPK(BLOCK_SIZE, __hpcc_bfloat16); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_TOPK(BLOCK_SIZE, half); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_TOPK(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_TOPK_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::topk::metax diff --git a/src/infiniop/ops/topk/moore/topk_moore.h b/src/infiniop/ops/topk/moore/topk_moore.h deleted file mode 100644 index 37753992f..000000000 --- a/src/infiniop/ops/topk/moore/topk_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __TOPK_MOORE_H__ -#define __TOPK_MOORE_H__ - -#include "../topk_desc.h" - -DESCRIPTOR(moore); - -#endif diff --git a/src/infiniop/ops/topk/moore/topk_moore.mu b/src/infiniop/ops/topk/moore/topk_moore.mu deleted file mode 100644 index b78b9fa0d..000000000 --- a/src/infiniop/ops/topk/moore/topk_moore.mu +++ /dev/null @@ -1,280 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "topk_moore.h" - -#include -#include - -namespace op::topk::moore { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted) { - auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - - workspace_size += (input_desc->ndim() + values_output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - size_t dim_elements = input_desc->shape()[dim]; - size_t n_iteration = 1; - for (size_t i = 0; i < input_desc->ndim(); i++) { - if (i != dim) { - n_iteration *= input_desc->shape()[i]; - } - } - size_t total = n_iteration * dim_elements; - - workspace_size += 3 * total * sizeof(uint32_t); - workspace_size += 3 * total * sizeof(int32_t); - workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t)); - if (sorted) { - workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t)); - } - workspace_size += 5 * n_iteration * sizeof(int32_t); - - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const TopKInfo &info, - Tdata *values_output, int32_t *indices_output, const Tdata *input, - size_t k, size_t dim, bool largest, bool sorted, - musaStream_t stream, void *workspace, size_t workspace_size) { - if (dim >= info.ndim) { - return INFINI_STATUS_BAD_PARAM; - } - if (k == 0) { - return INFINI_STATUS_SUCCESS; - } - if (k > info.dim_elements) { - return INFINI_STATUS_BAD_PARAM; - } - size_t input_ndim = info.ndim; - size_t output_ndim = input_ndim; - size_t n_iteration = info.n_iteration; - size_t dim_elements = info.dim_elements; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *input_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_musa = input_shape_musa + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_musa = input_strides_musa + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_MOORE(musaMemcpyAsync(input_shape_musa, info.input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - - const int32_t total = n_iteration * dim_elements; - - uint32_t *cur_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - uint32_t *ones_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - uint32_t *zeros_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - - int32_t *cur_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - int32_t *ones_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - int32_t *zeros_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - - uint32_t *sel_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(uint32_t); - int32_t *sel_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(int32_t); - uint32_t *sel_sorted_vals = nullptr; - int32_t *sel_sorted_idx = nullptr; - if (sorted) { - sel_sorted_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(uint32_t); - sel_sorted_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(int32_t); - } - - int32_t *cur_n = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *rem_k = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *out_pos = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *ones_count = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *zeros_count = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - // init - { - size_t threads = 256; - size_t blocks = (n_iteration + threads - 1) / threads; - op::topk::cuda::init_row_state<<>>(cur_n, rem_k, out_pos, n_iteration, dim_elements, k); - } - // gather input -> cur - { - dim3 block(BLOCK_SIZE); - dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::gather_rowwise<<>>( - input, cur_vals, cur_idx, - n_iteration, dim_elements, - input_ndim, dim, - input_shape_musa, input_strides_musa); - } - // radix select/filter - for (int bit = 31; bit >= 0; --bit) { - { - size_t threads = 256; - size_t blocks = (n_iteration + threads - 1) / threads; - op::topk::cuda::zero_row_counters<<>>(ones_count, zeros_count, n_iteration); - } - - { - dim3 block(BLOCK_SIZE); - dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::partition_rowwise<<>>( - cur_vals, cur_idx, - ones_vals, ones_idx, - zeros_vals, zeros_idx, - cur_n, n_iteration, dim_elements, - bit, largest, - ones_count, zeros_count); - } - - { - op::topk::cuda::decide_and_compact<<>>( - cur_vals, cur_idx, - ones_vals, ones_idx, - zeros_vals, zeros_idx, - ones_count, zeros_count, - cur_n, rem_k, out_pos, - sel_vals, sel_idx, - n_iteration, dim_elements, k); - } - } - - // append remaining - - op::topk::cuda::take_remaining<<>>( - cur_vals, cur_idx, - cur_n, rem_k, out_pos, - sel_vals, sel_idx, - n_iteration, dim_elements, k); - - // sort (CUB block radix sort) - const int32_t *final_idx = sel_idx; - - if (sorted) { - std::vector h_offsets(n_iteration + 1); - for (size_t i = 0; i <= n_iteration; i++) { - h_offsets[i] = i * k; - } - int *d_offsets; - CHECK_MOORE(musaMalloc(&d_offsets, (n_iteration + 1) * sizeof(int))); - CHECK_MOORE(musaMemcpy(d_offsets, h_offsets.data(), (n_iteration + 1) * sizeof(int), musaMemcpyHostToDevice)); - - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - - if (!largest) { - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - musaMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - } else { - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - musaMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - } - CHECK_MOORE(musaFree(d_offsets)); - CHECK_MOORE(musaFree(d_temp_storage)); - final_idx = sel_sorted_idx; - } - - // scatter to output (strided write) - { - dim3 block(BLOCK_SIZE); - dim3 grid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::scatter_to_output<<>>( - input, final_idx, - values_output, indices_output, - n_iteration, k, - input_ndim, dim, - input_shape_musa, input_strides_musa, - output_shape_musa, output_strides_musa); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *values_output, - void *indices_output, - const void *input, - size_t k, - size_t dim, - bool largest, - bool sorted, - void *stream_) const { - - musaStream_t stream = (musaStream_t)stream_; - constexpr int ITEMS = 4; -#define CALCULATE_TOPK(BLOCK_SIZE, Tdata) \ - launchKernel( \ - _info, \ - (Tdata *)values_output, (int32_t *)indices_output, (const Tdata *)input, \ - k, dim, largest, sorted, \ - stream, workspace, workspace_size) - -#define CALCULATE_TOPK_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_TOPK(BLOCK_SIZE, __mt_bfloat16); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_TOPK(BLOCK_SIZE, half); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_TOPK(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_TOPK_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::topk::moore diff --git a/src/infiniop/ops/topk/nvidia/topk_nvidia.cu b/src/infiniop/ops/topk/nvidia/topk_nvidia.cu deleted file mode 100644 index 0f73d4857..000000000 --- a/src/infiniop/ops/topk/nvidia/topk_nvidia.cu +++ /dev/null @@ -1,283 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "topk_nvidia.cuh" - -#include -#include - -namespace op::topk::nvidia { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted) { - auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - - workspace_size += (input_desc->ndim() + values_output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t)); - // 计算临时变量空间 - size_t dim_elements = input_desc->shape()[dim]; - size_t n_iteration = 1; - for (size_t i = 0; i < input_desc->ndim(); i++) { - if (i != dim) { - n_iteration *= input_desc->shape()[i]; - } - } - size_t total = n_iteration * dim_elements; - - workspace_size += 3 * total * sizeof(uint32_t); - workspace_size += 3 * total * sizeof(int32_t); - workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t)); - if (sorted) { - workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t)); - } - workspace_size += 5 * n_iteration * sizeof(int32_t); - - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { - -template -infiniStatus_t launchKernel( - const TopKInfo &info, - Tdata *values_output, int32_t *indices_output, const Tdata *input, - size_t k, size_t dim, bool largest, bool sorted, - cudaStream_t stream, void *workspace, size_t workspace_size) { - if (dim >= info.ndim) { - return INFINI_STATUS_BAD_PARAM; - } - if (k == 0) { - return INFINI_STATUS_SUCCESS; - } - if (k > info.dim_elements) { - return INFINI_STATUS_BAD_PARAM; - } - size_t input_ndim = info.ndim; - size_t output_ndim = input_ndim; - size_t n_iteration = info.n_iteration; - size_t dim_elements = info.dim_elements; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *input_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - size_t *output_shape_cuda = input_shape_cuda + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(size_t); - - ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - ptrdiff_t *output_strides_cuda = input_strides_cuda + input_ndim; - workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t); - - CHECK_CUDA(cudaMemcpyAsync(input_shape_cuda, info.input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - - const int32_t total = n_iteration * dim_elements; - - uint32_t *cur_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - uint32_t *ones_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - uint32_t *zeros_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(uint32_t); - - int32_t *cur_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - int32_t *ones_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - int32_t *zeros_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += total * sizeof(int32_t); - - uint32_t *sel_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(uint32_t); - int32_t *sel_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(int32_t); - uint32_t *sel_sorted_vals = nullptr; - int32_t *sel_sorted_idx = nullptr; - if (sorted) { - sel_sorted_vals = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(uint32_t); - sel_sorted_idx = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * k * sizeof(int32_t); - } - - int32_t *cur_n = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *rem_k = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *out_pos = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *ones_count = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - int32_t *zeros_count = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += n_iteration * sizeof(int32_t); - // init - { - size_t threads = 256; - size_t blocks = (n_iteration + threads - 1) / threads; - op::topk::cuda::init_row_state<<>>(cur_n, rem_k, out_pos, n_iteration, dim_elements, k); - } - // gather input -> cur - { - dim3 block(BLOCK_SIZE); - dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::gather_rowwise<<>>( - input, cur_vals, cur_idx, - n_iteration, dim_elements, - input_ndim, dim, - input_shape_cuda, input_strides_cuda); - } - // radix select/filter - for (int bit = 31; bit >= 0; --bit) { - { - size_t threads = 256; - size_t blocks = (n_iteration + threads - 1) / threads; - op::topk::cuda::zero_row_counters<<>>(ones_count, zeros_count, n_iteration); - } - - { - dim3 block(BLOCK_SIZE); - dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::partition_rowwise<<>>( - cur_vals, cur_idx, - ones_vals, ones_idx, - zeros_vals, zeros_idx, - cur_n, n_iteration, dim_elements, - bit, largest, - ones_count, zeros_count); - } - - { - op::topk::cuda::decide_and_compact<<>>( - cur_vals, cur_idx, - ones_vals, ones_idx, - zeros_vals, zeros_idx, - ones_count, zeros_count, - cur_n, rem_k, out_pos, - sel_vals, sel_idx, - n_iteration, dim_elements, k); - } - } - - // append remaining - - op::topk::cuda::take_remaining<<>>( - cur_vals, cur_idx, - cur_n, rem_k, out_pos, - sel_vals, sel_idx, - n_iteration, dim_elements, k); - - // sort (CUB block radix sort) - const int32_t *final_idx = sel_idx; - - if (sorted) { - std::vector h_offsets(n_iteration + 1); - for (size_t i = 0; i <= n_iteration; i++) { - h_offsets[i] = i * k; - } - int *d_offsets; - CHECK_CUDA(cudaMalloc(&d_offsets, (n_iteration + 1) * sizeof(int))); - CHECK_CUDA(cudaMemcpy(d_offsets, h_offsets.data(), (n_iteration + 1) * sizeof(int), cudaMemcpyHostToDevice)); - - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - - if (!largest) { - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - cudaMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - } else { - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - cudaMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx, - n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream); - } - CHECK_CUDA(cudaFree(d_offsets)); - CHECK_CUDA(cudaFree(d_temp_storage)); - final_idx = sel_sorted_idx; - } - - // scatter to output (strided write) - { - dim3 block(BLOCK_SIZE); - dim3 grid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration); - op::topk::cuda::scatter_to_output<<>>( - input, final_idx, - values_output, indices_output, - n_iteration, k, - input_ndim, dim, - input_shape_cuda, input_strides_cuda, - output_shape_cuda, output_strides_cuda); - } - - CHECK_CUDA(cudaGetLastError()); - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *values_output, - void *indices_output, - const void *input, - size_t k, - size_t dim, - bool largest, - bool sorted, - void *stream_) const { - - cudaStream_t stream = (cudaStream_t)stream_; - constexpr int ITEMS = 4; -#define CALCULATE_TOPK(BLOCK_SIZE, Tdata) \ - launchKernel( \ - _info, \ - (Tdata *)values_output, (int32_t *)indices_output, (const Tdata *)input, \ - k, dim, largest, sorted, \ - stream, workspace, workspace_size) - -#define CALCULATE_TOPK_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_TOPK(BLOCK_SIZE, __nv_bfloat16); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_TOPK(BLOCK_SIZE, half); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_TOPK(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_TOPK_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::topk::nvidia diff --git a/src/infiniop/ops/topk/nvidia/topk_nvidia.cuh b/src/infiniop/ops/topk/nvidia/topk_nvidia.cuh deleted file mode 100644 index dfeb2977b..000000000 --- a/src/infiniop/ops/topk/nvidia/topk_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __TOPK_NVIDIA_H__ -#define __TOPK_NVIDIA_H__ - -#include "../topk_desc.h" - -DESCRIPTOR(nvidia); - -#endif // __TOPK_NVIDIA_H__ diff --git a/src/infiniop/ops/topk/operator.cc b/src/infiniop/ops/topk/operator.cc deleted file mode 100644 index 08ec6d18b..000000000 --- a/src/infiniop/ops/topk/operator.cc +++ /dev/null @@ -1,200 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/topk.h" -#include - -#ifdef ENABLE_CPU_API -#include "cpu/topk_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) -#include "nvidia/topk_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/topk_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/topk_kunlun.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/topk_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateTopKDescriptor( - infiniopHandle_t handle, - infiniopTopKDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t values_output_desc, - infiniopTensorDescriptor_t indices_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t k, - size_t dim, - bool largest, - bool sorted) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::topk::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - values_output_desc, \ - indices_output_desc, \ - input_desc, \ - k, \ - dim, \ - largest, \ - sorted) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetTopKWorkspaceSize(infiniopTopKDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopTopK( - infiniopTopKDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *values_output, - void *indices_output, - const void *input, - size_t k, - size_t dim, - bool largest, - bool sorted, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, values_output, indices_output, input, k, dim, largest, sorted, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyTopKDescriptor(infiniopTopKDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/topk/topk_desc.h b/src/infiniop/ops/topk/topk_desc.h deleted file mode 100644 index 309ec939a..000000000 --- a/src/infiniop/ops/topk/topk_desc.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef INFINIOP_TOPK_DESCRIPTOR_H_ -#define INFINIOP_TOPK_DESCRIPTOR_H_ -#include "../../../utils.h" -#include "../../operator.h" -#include "../../tensor.h" - -#include "info.h" - -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::topk::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - TopKInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - TopKInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t values_output_desc, \ - infiniopTensorDescriptor_t indices_output_desc, \ - infiniopTensorDescriptor_t input_desc, \ - size_t k, \ - size_t dim, \ - bool largest, \ - bool sorted); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - void *values_output, \ - void *indices_output, \ - const void *input, \ - size_t k, \ - size_t dim, \ - bool largest, \ - bool sorted, \ - void *stream) const; \ - }; \ - } - -#endif diff --git a/src/infiniop/ops/var/cpu/var_cpu.cc b/src/infiniop/ops/var/cpu/var_cpu.cc deleted file mode 100644 index bd749a4ef..000000000 --- a/src/infiniop/ops/var/cpu/var_cpu.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "var_cpu.h" -#include "../../../../utils.h" -#include "../../../devices/cpu/common_cpu.h" -namespace op::var::cpu { - -Descriptor::~Descriptor() {} -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - - *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -// welford -namespace { -bool IsNanOut(const VarInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -// 直接用float计算 -template -void computeVarUsingWelfordCpu(const Tdata *input_ptr, float &var_output, size_t start, size_t end, const VarInfo &info) { - if (start >= end) { - return; - } - float old_mean = 0.0f; // previous mean - float mean = 0.0f; // new mean - float M2 = 0.0f; // variance sum - size_t count = 0; // element count of new sum - for (size_t idx = start; idx < end; ++idx) { - size_t input_offset = op::common_cpu::indexToOffset(idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data()); - ; - float value = utils::cast(input_ptr[input_offset]); - count++; - old_mean = mean; - mean += (value - mean) / count; - M2 += (value - old_mean) * (value - mean); - } - var_output = M2 / (info.unbiased_var ? (count - 1) : count); -} - -template -infiniStatus_t calculateVar( - const VarInfo &info, - Tdata *var_output, - const Tdata *input) { - Tdata nan_value = utils::cast(NAN); - bool is_scalar = (info.reduce_dim_size == info.permuted_input_shape.size()); - for (size_t i = 0; i < info.output_size; ++i) { - size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data()); - if (IsNanOut(info)) { - var_output[output_offset] = nan_value; - } else { - size_t start = is_scalar ? 0 : i * info.reduce_num; - size_t end = is_scalar ? info.input_size : (i + 1) * info.reduce_num; - float var = 0.0f; - computeVarUsingWelfordCpu(input, var, start, end, info); - var_output[output_offset] = utils::cast(var); - } - } - return INFINI_STATUS_SUCCESS; -} -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream) const { - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return calculateVar(_info, (fp16_t *)var_output, reinterpret_cast(input)); - case INFINI_DTYPE_F32: - return calculateVar(_info, (float *)var_output, reinterpret_cast(input)); - case INFINI_DTYPE_BF16: - return calculateVar(_info, (bf16_t *)var_output, reinterpret_cast(input)); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::var::cpu diff --git a/src/infiniop/ops/var/cpu/var_cpu.h b/src/infiniop/ops/var/cpu/var_cpu.h deleted file mode 100644 index 12f1b243c..000000000 --- a/src/infiniop/ops/var/cpu/var_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_VAR_CPU_H__ -#define __INFINIOP_VAR_CPU_H__ - -#include "../var_desc.h" - -DESCRIPTOR(cpu); - -#endif // __INFINIOP_VAR_CPU_H__ diff --git a/src/infiniop/ops/var/cuda/kernel.cuh b/src/infiniop/ops/var/cuda/kernel.cuh deleted file mode 100644 index 03df669b5..000000000 --- a/src/infiniop/ops/var/cuda/kernel.cuh +++ /dev/null @@ -1,370 +0,0 @@ -#ifndef __VAR_CUDA_H__ -#define __VAR_CUDA_H__ - -#include // NAN - -__forceinline__ __device__ __host__ size_t indexToOffset( - size_t flat_index, - size_t ndim, - const size_t *shape, - const ptrdiff_t *strides) { - size_t res = 0; - for (size_t i = ndim; i-- > 0;) { - res += (flat_index % shape[i]) * strides[i]; - flat_index /= shape[i]; - } - return res; -} - -namespace device { -namespace cuda { -template -__inline__ __device__ Tdata Nan(); -template <> -__inline__ __device__ float Nan() { - return NAN; -} -template <> -__inline__ __device__ double Nan() { - return NAN; -} -template <> -__inline__ __device__ half Nan() { - return __float2half(NAN); -} - -#if defined(ENABLE_MOORE_API) -using bf16_t = __mt_bfloat16; -#elif defined(ENABLE_METAX_API) -using bf16_t = __hpcc_bfloat16; -#else -using bf16_t = __nv_bfloat16; -#endif - -/* bf16 */ -template <> -__inline__ __device__ bf16_t Nan() { - return __float2bfloat16_rn(NAN); -} - -template -__inline__ __device__ Tdata Div(Tdata a, Tdata b); -template <> -__inline__ __device__ float Div(float a, float b) { -#ifdef OF_LAYER_NORM_USE_FAST_MATH - return __fdividef(a, b); -#else - return a / b; -#endif -} -template <> -__inline__ __device__ double Div(double a, double b) { - return a / b; -} -template <> -__inline__ __device__ half Div(half a, half b) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) - return __hdiv(a, b); -#else - return __float2half(__half2float(a) / __half2float(b)); -#endif -} -template <> -__inline__ __device__ bf16_t Div(bf16_t a, bf16_t b) { - -#if defined(ENABLE_NVIDIA_API) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) - return __hdiv(a, b); -#else - return __float2bfloat16_rn( - __bfloat162float(a) / __bfloat162float(b)); -#endif -} - -template -inline __device__ void WelfordReduce(const Tdata *input_ptr, ComputeType &mean, ComputeType &m2, ComputeType &count, - const size_t start, const size_t end, const size_t step, - const size_t ndim, const size_t *shape, const ptrdiff_t *strides) { - ComputeType old_mean = 0.0; - for (size_t i = start; i < end; i += step) { - ++count; - old_mean = mean; - size_t input_offset = indexToOffset(i, ndim, shape, strides); - ComputeType input_value = static_cast(input_ptr[input_offset]); - mean += (input_value - mean) / count; - m2 += (input_value - mean) - * (input_value - old_mean); - } -} - -template -inline __device__ void WelfordCombine(Tdata val, Tdata &mean, Tdata &m2, Tdata &count) { - count += 1; - Tdata delta1 = val - mean; - mean += Div(delta1, count); - Tdata delta2 = val - mean; - m2 += delta1 * delta2; -} - -template -inline __device__ void WelfordCombine(Tdata b_mean, Tdata b_m2, Tdata b_count, Tdata &mean, Tdata &m2, Tdata &count) { - if (b_count == 0) { - return; - } - Tdata new_count = count + b_count; // n1 + n2 - Tdata nb_over_n = Div(b_count, new_count); // n2 / (n1 + n2) - Tdata delta = b_mean - mean; // mean2 - mean1 - mean += delta * nb_over_n; // mean1 + n2 * (mean2 - mean1) / (n1 + n2) - m2 += b_m2 + delta * delta * count * nb_over_n; // m21 + m22 + n2 * (mean2 - mean1) ^ 2 / (n1 + n2) - count = new_count; -} - -template -inline __device__ void WelfordCombineLoop(const Tdata *b_mean, const Tdata *b_m2, const Tdata *b_count, - Tdata &mean, Tdata &m2, Tdata &count, - const size_t start, const size_t end, const size_t step) { - for (size_t i = start; i < end; i += step) { - WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count); - } -} - -template -__inline__ __device__ void WelfordWarpReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count, - Tdata &mean, Tdata &m2, Tdata &count) { - mean = thread_mean; - m2 = thread_m2; - count = thread_count; - for (int lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { - Tdata b_mean = __shfl_down_sync(0xffffffff, mean, lane_mask, thread_group_width); - Tdata b_m2 = __shfl_down_sync(0xffffffff, m2, lane_mask, thread_group_width); - Tdata b_count = __shfl_down_sync(0xffffffff, count, lane_mask, thread_group_width); - WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); - } -} - -template -__inline__ __device__ void WelfordBlockAllReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count, - Tdata &result_mean, Tdata &result_m2, Tdata &result_count) { - __shared__ Tdata mean_shared[kWarpSize]; - __shared__ Tdata m2_shared[kWarpSize]; - __shared__ Tdata count_shared[kWarpSize]; - __shared__ Tdata mean_result_broadcast; - __shared__ Tdata m2_result_broadcast; - __shared__ Tdata count_result_broadcast; - const int lid = threadIdx.x % kWarpSize; - const int wid = threadIdx.x / kWarpSize; - // warp内规约 - Tdata warp_mean = 0.0; - Tdata warp_m2 = 0.0; - Tdata warp_count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, warp_mean, warp_m2, warp_count); - __syncthreads(); - if (lid == 0) { // 每个warp内的的thread0 保存warp结果 - mean_shared[wid] = warp_mean; - m2_shared[wid] = warp_m2; - count_shared[wid] = warp_count; - } - __syncthreads(); - // warp间规约 - if (wid == 0) { - if (threadIdx.x < blockDim.x / kWarpSize) { - warp_mean = mean_shared[lid]; - warp_m2 = m2_shared[lid]; - warp_count = count_shared[lid]; - } else { - warp_mean = static_cast(0); - warp_m2 = static_cast(0); - warp_count = static_cast(0); - } - __syncwarp(); - Tdata block_mean = 0; - Tdata block_m2 = 0; - Tdata block_count = 0; - WelfordWarpReduce(warp_mean, warp_m2, warp_count, block_mean, block_m2, block_count); - if (lid == 0) { - mean_result_broadcast = block_mean; - m2_result_broadcast = block_m2; - count_result_broadcast = block_count; - } - } - __syncthreads(); - result_mean = mean_result_broadcast; - result_m2 = m2_result_broadcast; - result_count = count_result_broadcast; -} -} // namespace cuda -} // namespace device - -__device__ int32_t done_block_counts = 0; - -template -__global__ void ComputeVarScalarOut(const Tdata *input_ptr, Tdata *var_output_ptr, ComputeType *tmp_buffer_ptr, // Tdata *mean_output_ptr, - size_t input_size, size_t input_ndim, size_t *permuted_input_shape, ptrdiff_t *permuted_input_strides, - bool unbiased, bool is_nan) { - // 处理 NaN 情况 - if (is_nan) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - *var_output_ptr = device::cuda::Nan(); - } // mean_output_ptr[0] = (input_size == 0) ? device::cuda::Nan() : input_ptr[0];} - return; - } - - // 计算每个 block 和 thread 的工作量 - const size_t elems_per_block = input_size / gridDim.x; - const size_t elems_per_thread = elems_per_block / blockDim.x; - // 线程级 Welford 累积 - ComputeType thread_mean = 0.0, thread_m2 = 0.0, thread_count = 0; - - // 每个线程处理常规元素(stride 访问) - if (elems_per_thread > 0) { - const size_t block_start = blockIdx.x * elems_per_block; - const size_t regular_elems = elems_per_block - (elems_per_block % blockDim.x); - device::cuda::WelfordReduce(input_ptr, thread_mean, thread_m2, thread_count, - /*start=*/block_start + threadIdx.x, /*end=*/block_start + regular_elems, /*step=*/blockDim.x, - /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides); - } - - // thread 0 处理本 block 的尾部元素以及跨 block 的尾部元素(单个线程处理) - if (threadIdx.x == 0) { - size_t tail_count = elems_per_block % blockDim.x; - // 最后一个 block 还需要处理总元素数的尾部 - if (blockIdx.x == gridDim.x - 1) { - tail_count += input_size % gridDim.x; - } - if (tail_count > 0) { - const size_t tail_start = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread; - device::cuda::WelfordReduce(input_ptr, thread_mean, thread_m2, thread_count, - /*start=*/tail_start, /*end=*/tail_start + tail_count, /*step=*/1, - /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides); - } - } - - // Block 级规约 - ComputeType block_mean = 0.0, block_m2 = 0.0, block_count = 0; - device::cuda::WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, - block_mean, block_m2, block_count); - - // 单 block 情况:直接输出结果 - if (gridDim.x == 1) { - if (threadIdx.x == 0) { - ComputeType divisor = unbiased ? block_count - 1 : block_count; - var_output_ptr[0] = device::cuda::Div(block_m2, divisor); - } - return; - } - - // 多 block 情况:使用临时缓冲区 - ComputeType *tmp_mean_ptr = tmp_buffer_ptr; - ComputeType *tmp_m2_ptr = tmp_mean_ptr + gridDim.x; - ComputeType *tmp_count_ptr = tmp_m2_ptr + gridDim.x; - - // 保存本 block 的结果 - if (threadIdx.x == 0) { - tmp_mean_ptr[blockIdx.x] = block_mean; - tmp_m2_ptr[blockIdx.x] = block_m2; - tmp_count_ptr[blockIdx.x] = block_count; - } - - // 最后一个 block 负责最终规约 - __shared__ bool is_last_block; - if (threadIdx.x == 0) { - is_last_block = (atomicAdd(&done_block_counts, 1) == gridDim.x - 1); - } - __syncthreads(); - - if (is_last_block) { - // 每个线程合并一部分 block 的结果 - ComputeType final_thread_mean = 0.0, final_thread_m2 = 0.0, final_thread_count = 0; - const size_t blocks_per_thread = gridDim.x / blockDim.x; - const size_t regular_blocks = blocks_per_thread * blockDim.x; - - if (blocks_per_thread > 0) { - device::cuda::WelfordCombineLoop(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr, - final_thread_mean, final_thread_m2, final_thread_count, - /*start=*/threadIdx.x, /*end=*/regular_blocks, /*step=*/blockDim.x); - } - - // thread 0 处理尾部 block - if (threadIdx.x == 0 && regular_blocks < gridDim.x) { - device::cuda::WelfordCombineLoop(&tmp_mean_ptr[regular_blocks], &tmp_m2_ptr[regular_blocks], &tmp_count_ptr[regular_blocks], - final_thread_mean, final_thread_m2, final_thread_count, - /*start=*/0, /*end=*/gridDim.x - regular_blocks, /*step=*/1); - } - - // 最终 block 级规约并输出 - ComputeType final_mean = 0, final_m2 = 0, final_count = 0; - device::cuda::WelfordBlockAllReduce(final_thread_mean, final_thread_m2, final_thread_count, - final_mean, final_m2, final_count); - if (threadIdx.x == 0) { - ComputeType divisor = unbiased ? final_count - 1 : final_count; - var_output_ptr[0] = device::cuda::Div(final_m2, divisor); - done_block_counts = 0; // 重置计数器 - } - } -} - -// CUDA: grid stride looping -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \ - i += step) - -template -__forceinline__ __device__ __host__ void ComputeVarUsingWelford( - const Tdata *input_ptr, - size_t offset, - Tdata &var_output, - size_t reduce_num, - size_t input_ndim, - size_t *permuted_input_shape, - ptrdiff_t *permuted_input_strides, - bool unbiased) { - size_t count = 0; - ComputeType mean = 0.0; - ComputeType old_mean = 0.0; - ComputeType m2 = 0.0; - for (size_t i = 0; i < reduce_num; ++i) { - size_t input_offset = indexToOffset(offset + i, input_ndim, permuted_input_shape, permuted_input_strides); - count++; - old_mean = mean; - mean = old_mean + (static_cast(input_ptr[input_offset]) - old_mean) / count; - m2 += (static_cast(input_ptr[input_offset]) - old_mean) * (static_cast(input_ptr[input_offset]) - mean); - } - var_output = static_cast(m2 / (unbiased ? count - 1 : count)); -} - -template -__global__ void ComputeVarUsingWelfordWrapper( - const Tdata *input_ptr, Tdata *var_output_ptr, - size_t input_ndim, - size_t output_size, - size_t reduce_num, - size_t *permuted_input_shape, - ptrdiff_t *permuted_input_strides, - bool unbiased, - bool is_nan) { - if (is_nan) { - if (reduce_num == 0) { - CUDA_1D_KERNEL_LOOP(i, output_size) { - var_output_ptr[i] = device::cuda::Nan(); - } - } else { - CUDA_1D_KERNEL_LOOP(i, output_size) { - // const size_t input_offset = indexToOffset(i * reduce_num, input_ndim, permuted_input_shape, permuted_input_strides); - var_output_ptr[i] = device::cuda::Nan(); - } - } - } else { - CUDA_1D_KERNEL_LOOP(i, output_size) { - ComputeVarUsingWelford( - input_ptr, - i * reduce_num, - var_output_ptr[i], - reduce_num, - input_ndim, - permuted_input_shape, - permuted_input_strides, - unbiased); - } - } -} - -#endif // __VAR_CUDA_H__ diff --git a/src/infiniop/ops/var/info.h b/src/infiniop/ops/var/info.h deleted file mode 100644 index f89e1c0dc..000000000 --- a/src/infiniop/ops/var/info.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __VAR_INFO_H__ -#define __VAR_INFO_H__ -#include "../../../utils.h" -#include "../../tensor.h" -#include -#include -#include - -namespace op::var { -class VarInfo { - VarInfo() = default; - -public: - infiniDtype_t dtype; - std::vector permuted_input_shape; // need to permute - std::vector output_shape; - std::vector permuted_input_strides; // need to permute - std::vector output_strides; - size_t reduce_dim_size; // reduce dim size - size_t reduce_num; // number of elements to reduce for each output element - size_t input_size; // total number of input elements - size_t output_size; // total number of output elements - bool unbiased_var; - static utils::Result create( - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto input_shape = input_desc->shape(); - auto input_strides = input_desc->strides(); - size_t input_ndim = input_desc->ndim(); - size_t reduce_num = 1; - for (size_t i = 0; i < dim_size; i++) { - reduce_num *= input_shape[dim[i]]; - } - std::vector permute_order; - for (size_t i = 0; i < input_ndim; i++) { - if (std::find(dim, dim + dim_size, i) == dim + dim_size) { - permute_order.push_back(i); - } - } - for (size_t i = 0; i < dim_size; i++) { - permute_order.push_back(dim[i]); - } - std::vector permuted_input_shape; - std::vector permuted_input_strides; - for (size_t i = 0; i < permute_order.size(); i++) { - permuted_input_shape.push_back(input_shape[permute_order[i]]); - permuted_input_strides.push_back(input_strides[permute_order[i]]); - } - return utils::Result(VarInfo{input_desc->dtype(), - permuted_input_shape, - var_output_desc->shape(), - permuted_input_strides, - var_output_desc->strides(), - dim_size, - reduce_num, - input_desc->numel(), - var_output_desc->numel(), - unbiased}); - } -}; -} // namespace op::var - -#endif diff --git a/src/infiniop/ops/var/metax/var_metax.h b/src/infiniop/ops/var/metax/var_metax.h deleted file mode 100644 index 99edcee98..000000000 --- a/src/infiniop/ops/var/metax/var_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __VAR_METAX_H__ -#define __VAR_METAX_H__ - -#include "../var_desc.h" - -DESCRIPTOR(metax); - -#endif // __VAR_METAX_H__ diff --git a/src/infiniop/ops/var/metax/var_metax.maca b/src/infiniop/ops/var/metax/var_metax.maca deleted file mode 100644 index ae8218646..000000000 --- a/src/infiniop/ops/var/metax/var_metax.maca +++ /dev/null @@ -1,124 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "var_metax.h" - -namespace op::var::metax { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -bool IsNanOut(const VarInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -template -infiniStatus_t launchKernel( - const VarInfo &info, - Tdata *var_output, const Tdata *input, - bool unbiased, bool keepdim, - hcStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - size_t *permuted_input_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(ptrdiff_t); - - CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - bool is_nan = IsNanOut(info); - if (info.reduce_num == input_size) { // scalar output - ComputeType *tmp_buffer; - constexpr size_t MAX_GRID_SIZE = 128; - size_t grid_size = std::min(MAX_GRID_SIZE, - (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - CHECK_METAX(hcMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType))); - ComputeVarScalarOut<<>>( - input, var_output, tmp_buffer, input_size, input_ndim, - permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan); - CHECK_METAX(hcFree(tmp_buffer)); - } else { - size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - ComputeVarUsingWelfordWrapper<<>>( - input, var_output, input_ndim, output_size, reduce_num, - permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream_) const { - - hcStream_t stream = (hcStream_t)stream_; - -#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \ - launchKernel( \ - _info, \ - (Tdata *)var_output, (const Tdata *)input, \ - unbiased, keepdim, \ - stream, workspace, workspace_size) - -#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_VAR(BLOCK_SIZE, __hpcc_bfloat16, double); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_VAR(BLOCK_SIZE, half, double); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_VAR(BLOCK_SIZE, float, double); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_VAR_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::var::metax diff --git a/src/infiniop/ops/var/moore/var_moore.h b/src/infiniop/ops/var/moore/var_moore.h deleted file mode 100644 index 220912b5e..000000000 --- a/src/infiniop/ops/var/moore/var_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __VAR_MOORE_H__ -#define __VAR_MOORE_H__ - -#include "../var_desc.h" - -DESCRIPTOR(moore); - -#endif // __VAR_MOORE_H__ diff --git a/src/infiniop/ops/var/moore/var_moore.mu b/src/infiniop/ops/var/moore/var_moore.mu deleted file mode 100644 index 3e72da2b4..000000000 --- a/src/infiniop/ops/var/moore/var_moore.mu +++ /dev/null @@ -1,124 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "var_moore.h" - -namespace op::var::moore { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -bool IsNanOut(const VarInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -template -infiniStatus_t launchKernel( - const VarInfo &info, - Tdata *var_output, const Tdata *input, - bool unbiased, bool keepdim, - musaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - size_t *permuted_input_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(ptrdiff_t); - - CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - bool is_nan = IsNanOut(info); - if (info.reduce_num == input_size) { // scalar output - ComputeType *tmp_buffer; - constexpr size_t MAX_GRID_SIZE = 128; - size_t grid_size = std::min(MAX_GRID_SIZE, - (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType))); - ComputeVarScalarOut<<>>( - input, var_output, tmp_buffer, input_size, input_ndim, - permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan); - CHECK_MOORE(musaFree(tmp_buffer)); - } else { - size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - ComputeVarUsingWelfordWrapper<<>>( - input, var_output, input_ndim, output_size, reduce_num, - permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream_) const { - - musaStream_t stream = (musaStream_t)stream_; - -#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \ - launchKernel( \ - _info, \ - (Tdata *)var_output, (const Tdata *)input, \ - unbiased, keepdim, \ - stream, workspace, workspace_size) - -#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_VAR(BLOCK_SIZE, __mt_bfloat16, double); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_VAR(BLOCK_SIZE, half, double); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_VAR(BLOCK_SIZE, float, double); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_VAR_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::var::moore diff --git a/src/infiniop/ops/var/nvidia/var_nvidia.cu b/src/infiniop/ops/var/nvidia/var_nvidia.cu deleted file mode 100644 index a0166f804..000000000 --- a/src/infiniop/ops/var/nvidia/var_nvidia.cu +++ /dev/null @@ -1,124 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "var_nvidia.cuh" - -namespace op::var::nvidia { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -bool IsNanOut(const VarInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -template -infiniStatus_t launchKernel( - const VarInfo &info, - Tdata *var_output, const Tdata *input, - bool unbiased, bool keepdim, - cudaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - // size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - size_t *permuted_input_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(ptrdiff_t); - - CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - bool is_nan = IsNanOut(info); - if (info.reduce_num == input_size) { // scalar output - ComputeType *tmp_buffer; - constexpr size_t MAX_GRID_SIZE = 128; - size_t grid_size = std::min(MAX_GRID_SIZE, - (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType))); - ComputeVarScalarOut<<>>( - input, var_output, tmp_buffer, input_size, input_ndim, - permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan); - CHECK_CUDA(cudaFree(tmp_buffer)); - } else { - size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - ComputeVarUsingWelfordWrapper<<>>( - input, var_output, input_ndim, output_size, reduce_num, - permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream_) const { - - cudaStream_t stream = (cudaStream_t)stream_; - -#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \ - launchKernel( \ - _info, \ - (Tdata *)var_output, (const Tdata *)input, \ - unbiased, keepdim, \ - stream, workspace, workspace_size) - -#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_VAR(BLOCK_SIZE, __nv_bfloat16, double); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_VAR(BLOCK_SIZE, half, double); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_VAR(BLOCK_SIZE, float, double); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_VAR_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::var::nvidia diff --git a/src/infiniop/ops/var/nvidia/var_nvidia.cuh b/src/infiniop/ops/var/nvidia/var_nvidia.cuh deleted file mode 100644 index 8abfa87a0..000000000 --- a/src/infiniop/ops/var/nvidia/var_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __VAR_NVIDIA_H__ -#define __VAR_NVIDIA_H__ - -#include "../var_desc.h" - -DESCRIPTOR(nvidia); - -#endif // __VAR_NVIDIA_H__ diff --git a/src/infiniop/ops/var/operator.cc b/src/infiniop/ops/var/operator.cc deleted file mode 100644 index b963c0531..000000000 --- a/src/infiniop/ops/var/operator.cc +++ /dev/null @@ -1,197 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/var.h" -#include - -#ifdef ENABLE_CPU_API -#include "cpu/var_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) -#include "nvidia/var_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/var_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/var_kunlun.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/var_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateVarDescriptor( - infiniopHandle_t handle, - infiniopVarDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::var::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - var_output_desc, \ - input_desc, \ - dim, \ - dim_size, \ - unbiased, \ - keepdim) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetVarWorkspaceSize(infiniopVarDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopVar( - infiniopVarDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *var_output, - const void *input, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, var_output, input, unbiased, keepdim, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyVarDescriptor(infiniopVarDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/var/var_desc.h b/src/infiniop/ops/var/var_desc.h deleted file mode 100644 index e0cae2c89..000000000 --- a/src/infiniop/ops/var/var_desc.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef INFINIOP_VAR_DESCRIPTOR_H_ -#define INFINIOP_VAR_DESCRIPTOR_H_ -#include "../../../utils.h" -#include "../../operator.h" -#include "../../tensor.h" - -#include "info.h" - -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::var::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - VarInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - VarInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t var_output_desc, \ - infiniopTensorDescriptor_t input_desc, \ - size_t *dim, \ - size_t dim_size, \ - bool unbiased, \ - bool keepdim); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - void *var_output, \ - const void *input, \ - bool unbiased, \ - bool keepdim, \ - void *stream) const; \ - }; \ - } - -#endif diff --git a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc b/src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc deleted file mode 100644 index 0747b0c26..000000000 --- a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "var_mean_cpu.h" -#include "../../../../utils.h" -#include "../../../devices/cpu/common_cpu.h" -namespace op::var_mean::cpu { - -Descriptor::~Descriptor() {} -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t mean_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - - *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -// welford -namespace { -bool IsNanOut(const VarMeanInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -// 直接用float计算 -template -void computeVarMeanUsingWelfordCpu(const Tdata *input_ptr, float &var_output, float &mean_output, size_t start, size_t end, const VarMeanInfo &info) { - if (start >= end) { - return; - } - float old_mean = 0.0f; // previous mean - float mean = 0.0f; // new mean - float M2 = 0.0f; // variance sum - size_t count = 0; // element count of new sum - for (size_t idx = start; idx < end; ++idx) { - size_t input_offset = op::common_cpu::indexToOffset(idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data()); - ; - float value = utils::cast(input_ptr[input_offset]); - count++; - old_mean = mean; - mean += (value - mean) / count; - M2 += (value - old_mean) * (value - mean); - } - mean_output = mean; - var_output = M2 / (info.unbiased_var ? (count - 1) : count); -} - -template -infiniStatus_t calculateVarMean( - const VarMeanInfo &info, - Tdata *var_output, - Tdata *mean_output, - const Tdata *input) { - Tdata nan_value = utils::cast(NAN); - bool is_scalar = (info.reduce_dim_size == info.permuted_input_shape.size()); - // #pragma omp parallel for - for (size_t i = 0; i < info.output_size; ++i) { - size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data()); - if (IsNanOut(info)) { - var_output[output_offset] = nan_value; - if (info.reduce_num == 0) { - mean_output[output_offset] = nan_value; - } else { - size_t input_idx = is_scalar ? 0 : i * info.reduce_num; - size_t input_offset = op::common_cpu::indexToOffset(input_idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data()); - mean_output[output_offset] = input[input_offset]; - } - } else { - size_t start = is_scalar ? 0 : i * info.reduce_num; - size_t end = is_scalar ? info.input_size : (i + 1) * info.reduce_num; - float var = 0.0f, mean = 0.0f; - computeVarMeanUsingWelfordCpu(input, var, mean, start, end, info); - var_output[output_offset] = utils::cast(var); - mean_output[output_offset] = utils::cast(mean); - } - } - return INFINI_STATUS_SUCCESS; -} -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - void *mean_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream) const { - switch (_info.dtype) { - case INFINI_DTYPE_F16: - return calculateVarMean(_info, (fp16_t *)var_output, (fp16_t *)mean_output, reinterpret_cast(input)); - case INFINI_DTYPE_F32: - return calculateVarMean(_info, (float *)var_output, (float *)mean_output, reinterpret_cast(input)); - case INFINI_DTYPE_BF16: - return calculateVarMean(_info, (bf16_t *)var_output, (bf16_t *)mean_output, reinterpret_cast(input)); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} -} // namespace op::var_mean::cpu diff --git a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.h b/src/infiniop/ops/var_mean/cpu/var_mean_cpu.h deleted file mode 100644 index 205d02d14..000000000 --- a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_VAR_MEAN_CPU_H__ -#define __INFINIOP_VAR_MEAN_CPU_H__ - -#include "../var_mean_desc.h" - -DESCRIPTOR(cpu); - -#endif // __INFINIOP_VAR_MEAN_CPU_H__ diff --git a/src/infiniop/ops/var_mean/cuda/kernel.cuh b/src/infiniop/ops/var_mean/cuda/kernel.cuh deleted file mode 100644 index ed50c37e2..000000000 --- a/src/infiniop/ops/var_mean/cuda/kernel.cuh +++ /dev/null @@ -1,378 +0,0 @@ -#ifndef __VAR_MEAN_CUDA_H__ -#define __VAR_MEAN_CUDA_H__ - -#include // NAN - -__forceinline__ __device__ __host__ size_t indexToOffset( - size_t flat_index, - size_t ndim, - const size_t *shape, - const ptrdiff_t *strides) { - size_t res = 0; - for (size_t i = ndim; i-- > 0;) { - res += (flat_index % shape[i]) * strides[i]; - flat_index /= shape[i]; - } - return res; -} - -namespace device { -namespace cuda { -template -__inline__ __device__ Tdata Nan(); -template <> -__inline__ __device__ float Nan() { - return NAN; -} -template <> -__inline__ __device__ double Nan() { - return NAN; -} -template <> -__inline__ __device__ half Nan() { - return __float2half(NAN); -} - -#if defined(ENABLE_MOORE_API) -using bf16_t = __mt_bfloat16; -#elif defined(ENABLE_METAX_API) -using bf16_t = __hpcc_bfloat16; -#else -using bf16_t = __nv_bfloat16; -#endif - -/* bf16 */ -template <> -__inline__ __device__ bf16_t Nan() { - return __float2bfloat16_rn(NAN); -} - -template -__inline__ __device__ Tdata Div(Tdata a, Tdata b); -template <> -__inline__ __device__ float Div(float a, float b) { -#ifdef OF_LAYER_NORM_USE_FAST_MATH - return __fdividef(a, b); -#else - return a / b; -#endif -} -template <> -__inline__ __device__ double Div(double a, double b) { - return a / b; -} -template <> -__inline__ __device__ half Div(half a, half b) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) - return __hdiv(a, b); -#else - return __float2half(__half2float(a) / __half2float(b)); -#endif -} -template <> -__inline__ __device__ bf16_t Div(bf16_t a, bf16_t b) { - -#if defined(ENABLE_NVIDIA_API) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) - return __hdiv(a, b); -#else - return __float2bfloat16_rn( - __bfloat162float(a) / __bfloat162float(b)); -#endif -} - -template -inline __device__ void WelfordReduce(const Tdata *input_ptr, ComputeType &mean, ComputeType &m2, ComputeType &count, - const size_t start, const size_t end, const size_t step, - const size_t ndim, const size_t *shape, const ptrdiff_t *strides) { - ComputeType old_mean = 0.0; - for (size_t i = start; i < end; i += step) { - ++count; - old_mean = mean; - size_t input_offset = indexToOffset(i, ndim, shape, strides); - ComputeType input_value = static_cast(input_ptr[input_offset]); - mean += (input_value - mean) / count; - m2 += (input_value - mean) - * (input_value - old_mean); - } -} - -template -inline __device__ void WelfordCombine(Tdata val, Tdata &mean, Tdata &m2, Tdata &count) { - count += 1; - Tdata delta1 = val - mean; - mean += Div(delta1, count); - Tdata delta2 = val - mean; - m2 += delta1 * delta2; -} - -template -inline __device__ void WelfordCombine(Tdata b_mean, Tdata b_m2, Tdata b_count, Tdata &mean, Tdata &m2, Tdata &count) { - if (b_count == 0) { - return; - } - Tdata new_count = count + b_count; // n1 + n2 - Tdata nb_over_n = Div(b_count, new_count); // n2 / (n1 + n2) - Tdata delta = b_mean - mean; // mean2 - mean1 - mean += delta * nb_over_n; // mean1 + n2 * (mean2 - mean1) / (n1 + n2) - m2 += b_m2 + delta * delta * count * nb_over_n; // m21 + m22 + n2 * (mean2 - mean1) ^ 2 / (n1 + n2) - count = new_count; -} - -template -inline __device__ void WelfordCombineLoop(const Tdata *b_mean, const Tdata *b_m2, const Tdata *b_count, - Tdata &mean, Tdata &m2, Tdata &count, - const size_t start, const size_t end, const size_t step) { - for (size_t i = start; i < end; i += step) { - WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count); - } -} - -template -__inline__ __device__ void WelfordWarpReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count, - Tdata &mean, Tdata &m2, Tdata &count) { - mean = thread_mean; - m2 = thread_m2; - count = thread_count; - for (int lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { - Tdata b_mean = __shfl_down_sync(0xffffffff, mean, lane_mask, thread_group_width); - Tdata b_m2 = __shfl_down_sync(0xffffffff, m2, lane_mask, thread_group_width); - Tdata b_count = __shfl_down_sync(0xffffffff, count, lane_mask, thread_group_width); - WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); - } -} - -template -__inline__ __device__ void WelfordBlockAllReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count, - Tdata &result_mean, Tdata &result_m2, Tdata &result_count) { - __shared__ Tdata mean_shared[kWarpSize]; - __shared__ Tdata m2_shared[kWarpSize]; - __shared__ Tdata count_shared[kWarpSize]; - __shared__ Tdata mean_result_broadcast; - __shared__ Tdata m2_result_broadcast; - __shared__ Tdata count_result_broadcast; - const int lid = threadIdx.x % kWarpSize; - const int wid = threadIdx.x / kWarpSize; - // warp内规约 - Tdata warp_mean = 0.0; - Tdata warp_m2 = 0.0; - Tdata warp_count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, warp_mean, warp_m2, warp_count); - __syncthreads(); - if (lid == 0) { // 每个warp内的的thread0 保存warp结果 - mean_shared[wid] = warp_mean; - m2_shared[wid] = warp_m2; - count_shared[wid] = warp_count; - } - __syncthreads(); - // warp间规约 - if (wid == 0) { - if (threadIdx.x < blockDim.x / kWarpSize) { - warp_mean = mean_shared[lid]; - warp_m2 = m2_shared[lid]; - warp_count = count_shared[lid]; - } else { - warp_mean = static_cast(0); - warp_m2 = static_cast(0); - warp_count = static_cast(0); - } - __syncwarp(); - Tdata block_mean = 0; - Tdata block_m2 = 0; - Tdata block_count = 0; - WelfordWarpReduce(warp_mean, warp_m2, warp_count, block_mean, block_m2, block_count); - if (lid == 0) { - mean_result_broadcast = block_mean; - m2_result_broadcast = block_m2; - count_result_broadcast = block_count; - } - } - __syncthreads(); - result_mean = mean_result_broadcast; - result_m2 = m2_result_broadcast; - result_count = count_result_broadcast; -} -} // namespace cuda -} // namespace device - -__device__ int32_t done_block_count = 0; - -template -__global__ void ComputeVarScalarOut(const Tdata *input_ptr, Tdata *var_output_ptr, Tdata *mean_output_ptr, ComputeType *tmp_buffer_ptr, - size_t input_size, size_t input_ndim, size_t *permuted_input_shape, ptrdiff_t *permuted_input_strides, - bool unbiased, bool is_nan) { - // 处理 NaN 情况 - if (is_nan) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - *var_output_ptr = device::cuda::Nan(); - mean_output_ptr[0] = (input_size == 0) ? device::cuda::Nan() : input_ptr[0]; - } - return; - } - - // 计算每个 block 和 thread 的工作量 - const size_t elems_per_block = input_size / gridDim.x; - const size_t elems_per_thread = elems_per_block / blockDim.x; - // 线程级 Welford 累积 - ComputeType thread_mean = 0.0, thread_m2 = 0.0, thread_count = 0; - - // 每个线程处理常规元素(stride 访问) - if (elems_per_thread > 0) { - const size_t block_start = blockIdx.x * elems_per_block; - const size_t regular_elems = elems_per_block - (elems_per_block % blockDim.x); - device::cuda::WelfordReduce(input_ptr, thread_mean, thread_m2, thread_count, - /*start=*/block_start + threadIdx.x, /*end=*/block_start + regular_elems, /*step=*/blockDim.x, - /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides); - } - - // thread 0 处理本 block 的尾部元素以及跨 block 的尾部元素(单个线程处理) - if (threadIdx.x == 0) { - size_t tail_count = elems_per_block % blockDim.x; - // 最后一个 block 还需要处理总元素数的尾部 - if (blockIdx.x == gridDim.x - 1) { - tail_count += input_size % gridDim.x; - } - if (tail_count > 0) { - const size_t tail_start = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread; - device::cuda::WelfordReduce(input_ptr, thread_mean, thread_m2, thread_count, - /*start=*/tail_start, /*end=*/tail_start + tail_count, /*step=*/1, - /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides); - } - } - - // Block 级规约 - ComputeType block_mean = 0.0, block_m2 = 0.0, block_count = 0; - device::cuda::WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, - block_mean, block_m2, block_count); - - // 单 block 情况:直接输出结果 - if (gridDim.x == 1) { - if (threadIdx.x == 0) { - ComputeType divisor = unbiased ? block_count - 1 : block_count; - var_output_ptr[0] = device::cuda::Div(block_m2, divisor); - mean_output_ptr[0] = static_cast(block_mean); - } - return; - } - - // 多 block 情况:使用临时缓冲区 - ComputeType *tmp_mean_ptr = tmp_buffer_ptr; - ComputeType *tmp_m2_ptr = tmp_mean_ptr + gridDim.x; - ComputeType *tmp_count_ptr = tmp_m2_ptr + gridDim.x; - - // 保存本 block 的结果 - if (threadIdx.x == 0) { - tmp_mean_ptr[blockIdx.x] = block_mean; - tmp_m2_ptr[blockIdx.x] = block_m2; - tmp_count_ptr[blockIdx.x] = block_count; - } - - // 最后一个 block 负责最终规约 - __shared__ bool is_last_block; - if (threadIdx.x == 0) { - is_last_block = (atomicAdd(&done_block_count, 1) == gridDim.x - 1); - } - __syncthreads(); - - if (is_last_block) { - // 每个线程合并一部分 block 的结果 - ComputeType final_thread_mean = 0.0, final_thread_m2 = 0.0, final_thread_count = 0; - const size_t blocks_per_thread = gridDim.x / blockDim.x; - const size_t regular_blocks = blocks_per_thread * blockDim.x; - - if (blocks_per_thread > 0) { - device::cuda::WelfordCombineLoop(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr, - final_thread_mean, final_thread_m2, final_thread_count, - /*start=*/threadIdx.x, /*end=*/regular_blocks, /*step=*/blockDim.x); - } - - // thread 0 处理尾部 block - if (threadIdx.x == 0 && regular_blocks < gridDim.x) { - device::cuda::WelfordCombineLoop(&tmp_mean_ptr[regular_blocks], &tmp_m2_ptr[regular_blocks], &tmp_count_ptr[regular_blocks], - final_thread_mean, final_thread_m2, final_thread_count, - /*start=*/0, /*end=*/gridDim.x - regular_blocks, /*step=*/1); - } - - // 最终 block 级规约并输出 - ComputeType final_mean = 0, final_m2 = 0, final_count = 0; - device::cuda::WelfordBlockAllReduce(final_thread_mean, final_thread_m2, final_thread_count, - final_mean, final_m2, final_count); - if (threadIdx.x == 0) { - ComputeType divisor = unbiased ? final_count - 1 : final_count; - var_output_ptr[0] = device::cuda::Div(final_m2, divisor); - mean_output_ptr[0] = static_cast(final_mean); - done_block_count = 0; // 重置计数器 - } - } -} - -// CUDA: grid stride looping -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \ - i += step) - -template -__forceinline__ __device__ __host__ void ComputeVarMeanUsingWelford( - const Tdata *input_ptr, - size_t offset, - Tdata &var_output, - Tdata &mean_output, - size_t reduce_num, - size_t input_ndim, - size_t *permuted_input_shape, - ptrdiff_t *permuted_input_strides, - bool unbiased) { - size_t count = 0; - ComputeType mean = 0.0; - ComputeType old_mean = 0.0; - ComputeType m2 = 0.0; - for (size_t i = 0; i < reduce_num; ++i) { - size_t input_offset = indexToOffset(offset + i, input_ndim, permuted_input_shape, permuted_input_strides); - count++; - old_mean = mean; - mean = old_mean + (static_cast(input_ptr[input_offset]) - old_mean) / count; - m2 += (static_cast(input_ptr[input_offset]) - old_mean) * (static_cast(input_ptr[input_offset]) - mean); - } - var_output = static_cast(m2 / (unbiased ? count - 1 : count)); - mean_output = static_cast(mean); -} - -template -__global__ void ComputeVarMeanUsingWelfordWrapper( - const Tdata *input_ptr, Tdata *var_output_ptr, Tdata *mean_output_ptr, - size_t input_ndim, - size_t output_size, - size_t reduce_num, - size_t *permuted_input_shape, - ptrdiff_t *permuted_input_strides, - bool unbiased, - bool is_nan) { - if (is_nan) { - if (reduce_num == 0) { - CUDA_1D_KERNEL_LOOP(i, output_size) { - var_output_ptr[i] = device::cuda::Nan(); - mean_output_ptr[i] = device::cuda::Nan(); - } - } else { - CUDA_1D_KERNEL_LOOP(i, output_size) { - const size_t input_offset = indexToOffset(i * reduce_num, input_ndim, permuted_input_shape, permuted_input_strides); - var_output_ptr[i] = device::cuda::Nan(); - mean_output_ptr[i] = input_ptr[input_offset]; - } - } - } else { - CUDA_1D_KERNEL_LOOP(i, output_size) { - ComputeVarMeanUsingWelford( - input_ptr, - i * reduce_num, - var_output_ptr[i], - mean_output_ptr[i], - reduce_num, - input_ndim, - permuted_input_shape, - permuted_input_strides, - unbiased); - } - } -} - -#endif // __VAR_MEAN_CUDA_H__ diff --git a/src/infiniop/ops/var_mean/info.h b/src/infiniop/ops/var_mean/info.h deleted file mode 100644 index 38eb3d1b1..000000000 --- a/src/infiniop/ops/var_mean/info.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __VAR_MEAN_INFO_H__ -#define __VAR_MEAN_INFO_H__ -#include "../../../utils.h" -#include "../../tensor.h" -#include -#include -#include - -namespace op::var_mean { -class VarMeanInfo { - VarMeanInfo() = default; - -public: - infiniDtype_t dtype; - std::vector permuted_input_shape; // need to permute - std::vector output_shape; - std::vector permuted_input_strides; // need to permute - std::vector output_strides; - size_t reduce_dim_size; // reduce dim size - size_t reduce_num; // number of elements to reduce for each output element - size_t input_size; // total number of input elements - size_t output_size; // total number of output elements - bool unbiased_var; - static utils::Result create( - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto input_shape = input_desc->shape(); - auto input_strides = input_desc->strides(); - size_t input_ndim = input_desc->ndim(); - size_t reduce_num = 1; - for (size_t i = 0; i < dim_size; i++) { - reduce_num *= input_shape[dim[i]]; - } - std::vector permute_order; - for (size_t i = 0; i < input_ndim; i++) { - if (std::find(dim, dim + dim_size, i) == dim + dim_size) { - permute_order.push_back(i); - } - } - for (size_t i = 0; i < dim_size; i++) { - permute_order.push_back(dim[i]); - } - std::vector permuted_input_shape; - std::vector permuted_input_strides; - for (size_t i = 0; i < permute_order.size(); i++) { - permuted_input_shape.push_back(input_shape[permute_order[i]]); - permuted_input_strides.push_back(input_strides[permute_order[i]]); - } - return utils::Result(VarMeanInfo{input_desc->dtype(), - permuted_input_shape, - var_output_desc->shape(), - permuted_input_strides, - var_output_desc->strides(), - dim_size, - reduce_num, - input_desc->numel(), - var_output_desc->numel(), - unbiased}); - } -}; -} // namespace op::var_mean - -#endif diff --git a/src/infiniop/ops/var_mean/metax/var_mean_metax.h b/src/infiniop/ops/var_mean/metax/var_mean_metax.h deleted file mode 100644 index bc303987a..000000000 --- a/src/infiniop/ops/var_mean/metax/var_mean_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __VAR_MEAN_METAX_H__ -#define __VAR_MEAN_METAX_H__ - -#include "../var_mean_desc.h" - -DESCRIPTOR(metax); - -#endif // __VAR_MEAN_METAX_H__ diff --git a/src/infiniop/ops/var_mean/metax/var_mean_metax.maca b/src/infiniop/ops/var_mean/metax/var_mean_metax.maca deleted file mode 100644 index ac4c61114..000000000 --- a/src/infiniop/ops/var_mean/metax/var_mean_metax.maca +++ /dev/null @@ -1,126 +0,0 @@ -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "var_mean_metax.h" - -namespace op::var_mean::metax { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t mean_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -bool IsNanOut(const VarMeanInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -template -infiniStatus_t launchKernel( - const VarMeanInfo &info, - Tdata *var_output, Tdata *mean_output, const Tdata *input, - bool unbiased, bool keepdim, - hcStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - size_t *permuted_input_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(ptrdiff_t); - - CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); - CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); - bool is_nan = IsNanOut(info); - if (info.reduce_num == input_size) { // scalar output - ComputeType *tmp_buffer; - constexpr size_t MAX_GRID_SIZE = 128; - size_t grid_size = std::min(MAX_GRID_SIZE, - (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - CHECK_METAX(hcMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType))); - ComputeVarScalarOut<<>>( - input, var_output, mean_output, tmp_buffer, input_size, input_ndim, - permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan); - CHECK_METAX(hcFree(tmp_buffer)); - } else { - size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - ComputeVarMeanUsingWelfordWrapper<<>>( - input, var_output, mean_output, input_ndim, output_size, reduce_num, - permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - void *mean_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream_) const { - - hcStream_t stream = (hcStream_t)stream_; - -#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \ - launchKernel( \ - _info, \ - (Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \ - unbiased, keepdim, \ - stream, workspace, workspace_size) - -#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, __hpcc_bfloat16, double); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::var_mean::metax diff --git a/src/infiniop/ops/var_mean/moore/var_mean_moore.h b/src/infiniop/ops/var_mean/moore/var_mean_moore.h deleted file mode 100644 index 79f297e70..000000000 --- a/src/infiniop/ops/var_mean/moore/var_mean_moore.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __VAR_MEAN_MOORE_H__ -#define __VAR_MEAN_MOORE_H__ - -#include "../var_mean_desc.h" - -DESCRIPTOR(moore); - -#endif // __VAR_MEAN_MOORE_H__ diff --git a/src/infiniop/ops/var_mean/moore/var_mean_moore.mu b/src/infiniop/ops/var_mean/moore/var_mean_moore.mu deleted file mode 100644 index 62e44e3c8..000000000 --- a/src/infiniop/ops/var_mean/moore/var_mean_moore.mu +++ /dev/null @@ -1,125 +0,0 @@ -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_kernel_common.h" -#include "../cuda/kernel.cuh" -#include "var_mean_moore.h" - -namespace op::var_mean::moore { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t mean_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -bool IsNanOut(const VarMeanInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -template -infiniStatus_t launchKernel( - const VarMeanInfo &info, - Tdata *var_output, Tdata *mean_output, const Tdata *input, - bool unbiased, bool keepdim, - musaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - size_t *permuted_input_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(ptrdiff_t); - - CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); - CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); - bool is_nan = IsNanOut(info); - if (info.reduce_num == input_size) { // scalar output - ComputeType *tmp_buffer; - constexpr size_t MAX_GRID_SIZE = 128; - size_t grid_size = std::min(MAX_GRID_SIZE, - (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType))); - ComputeVarScalarOut<<>>( - input, var_output, mean_output, tmp_buffer, input_size, input_ndim, - permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan); - CHECK_MOORE(musaFree(tmp_buffer)); - } else { - size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - ComputeVarMeanUsingWelfordWrapper<<>>( - input, var_output, mean_output, input_ndim, output_size, reduce_num, - permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - void *mean_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream_) const { - - musaStream_t stream = (musaStream_t)stream_; - -#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \ - launchKernel( \ - _info, \ - (Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \ - unbiased, keepdim, \ - stream, workspace, workspace_size) - -#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, __mt_bfloat16, double); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::var_mean::moore diff --git a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cu b/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cu deleted file mode 100644 index 95352a106..000000000 --- a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cu +++ /dev/null @@ -1,126 +0,0 @@ -#include "../../../devices/nvidia/nvidia_common.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "../cuda/kernel.cuh" -#include "var_mean_nvidia.cuh" - -namespace op::var_mean::nvidia { -struct Descriptor::Opaque { - std::shared_ptr internal; -}; - -Descriptor::~Descriptor() { - delete _opaque; -} - -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t mean_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim); - CHECK_RESULT(result); - auto info = result.take(); - size_t workspace_size = 0; - workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides - *desc_ptr = new Descriptor( - new Opaque{reinterpret_cast(handle)->internal()}, - info, workspace_size, handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; -} - -namespace { -bool IsNanOut(const VarMeanInfo &info) { - return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true); -} -template -infiniStatus_t launchKernel( - const VarMeanInfo &info, - Tdata *var_output, Tdata *mean_output, const Tdata *input, - bool unbiased, bool keepdim, - cudaStream_t stream, void *workspace, size_t workspace_size) { - size_t input_ndim = info.permuted_input_shape.size(); - size_t output_ndim = info.output_shape.size(); - size_t input_size = info.input_size; - size_t output_size = info.output_size; - size_t reduce_num = info.reduce_num; - unsigned char *workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - size_t *permuted_input_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(size_t); - - ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); - workspace_offset += input_ndim * sizeof(ptrdiff_t); - - CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); - bool is_nan = IsNanOut(info); - if (info.reduce_num == input_size) { // scalar output - ComputeType *tmp_buffer; - constexpr size_t MAX_GRID_SIZE = 128; - size_t grid_size = std::min(MAX_GRID_SIZE, - (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType))); - ComputeVarScalarOut<<>>( - input, var_output, mean_output, tmp_buffer, input_size, input_ndim, - permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan); - CHECK_CUDA(cudaFree(tmp_buffer)); - } else { - size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); - grid_size = std::max(1UL, grid_size); - ComputeVarMeanUsingWelfordWrapper<<>>( - input, var_output, mean_output, input_ndim, output_size, reduce_num, - permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan); - } - - return INFINI_STATUS_SUCCESS; -} - -} // namespace - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *var_output, - void *mean_output, - const void *input, - bool unbiased, - bool keepdim, - void *stream_) const { - - cudaStream_t stream = (cudaStream_t)stream_; - -#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \ - launchKernel( \ - _info, \ - (Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \ - unbiased, keepdim, \ - stream, workspace, workspace_size) - -#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, __nv_bfloat16, double); \ - else if (_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \ - else if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } - - if (_opaque->internal->maxThreadsPerBlock() >= 256) { - CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256) - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; -} - -} // namespace op::var_mean::nvidia diff --git a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cuh b/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cuh deleted file mode 100644 index d8115883f..000000000 --- a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __VAR_MEAN_NVIDIA_H__ -#define __VAR_MEAN_NVIDIA_H__ - -#include "../var_mean_desc.h" - -DESCRIPTOR(nvidia); - -#endif // __VAR_MEAN_NVIDIA_H__ diff --git a/src/infiniop/ops/var_mean/operator.cc b/src/infiniop/ops/var_mean/operator.cc deleted file mode 100644 index 9b408ed23..000000000 --- a/src/infiniop/ops/var_mean/operator.cc +++ /dev/null @@ -1,200 +0,0 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/var_mean.h" -#include - -#ifdef ENABLE_CPU_API -#include "cpu/var_mean_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) -#include "nvidia/var_mean_nvidia.cuh" -#endif -#ifdef ENABLE_METAX_API -#include "metax/var_mean_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/var_mean_kunlun.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/var_mean_moore.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateVarMeanDescriptor( - infiniopHandle_t handle, - infiniopVarMeanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t var_output_desc, - infiniopTensorDescriptor_t mean_output_desc, - infiniopTensorDescriptor_t input_desc, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::var_mean::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - var_output_desc, \ - mean_output_desc, \ - input_desc, \ - dim, \ - dim_size, \ - unbiased, \ - keepdim) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__INFINI_C infiniStatus_t infiniopVarMean( - infiniopVarMeanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *var_output, - void *mean_output, - const void *input, - size_t *dim, - size_t dim_size, - bool unbiased, - bool keepdim, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, var_output, mean_output, input, unbiased, keepdim, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} diff --git a/src/infiniop/ops/var_mean/var_mean_desc.h b/src/infiniop/ops/var_mean/var_mean_desc.h deleted file mode 100644 index 71b76814f..000000000 --- a/src/infiniop/ops/var_mean/var_mean_desc.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef INFINIOP_VAR_MEAN_DESCRIPTOR_H_ -#define INFINIOP_VAR_MEAN_DESCRIPTOR_H_ -#include "../../../utils.h" -#include "../../operator.h" -#include "../../tensor.h" - -#include "info.h" - -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::var_mean::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - VarMeanInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - VarMeanInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t var_output_desc, \ - infiniopTensorDescriptor_t mean_output_desc, \ - infiniopTensorDescriptor_t input_desc, \ - size_t *dim, \ - size_t dim_size, \ - bool unbiased, \ - bool keepdim); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - void *var_output, \ - void *mean_output, \ - const void *input, \ - bool unbiased, \ - bool keepdim, \ - void *stream) const; \ - }; \ - } - -#endif diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h index 23be702ff..05a5c2fca 100644 --- a/src/utils/custom_types.h +++ b/src/utils/custom_types.h @@ -13,22 +13,6 @@ struct CustomBFloat16 { }; typedef struct CustomBFloat16 bf16_t; -inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { - return lhs._v == rhs._v; -} - -inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { - return !(lhs == rhs); -} - -inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { - return lhs._v == rhs._v; -} - -inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { - return !(lhs == rhs); -} - float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); diff --git a/test/infinicore/ops/all.py b/test/infinicore/ops/all.py index 2d295e262..cded59ee8 100644 --- a/test/infinicore/ops/all.py +++ b/test/infinicore/ops/all.py @@ -56,7 +56,7 @@ def parse_test_cases(): for data in _TEST_CASES_DATA: shape, strides, dim, keepdim, out_strides = data input_supports_inplace = not is_broadcast(strides) - # out_supports_inplace = not is_broadcast(out_strides) + out_supports_inplace = not is_broadcast(out_strides) for dtype in _TENSOR_DTYPES: tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0}) @@ -81,19 +81,19 @@ def parse_test_cases(): ) # explicit out when supported (create out tensor with computed shape) - # out_shape = _compute_out_shape(shape, dim, keepdim) - # out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool) - # if out_supports_inplace: - # test_cases.append( - # TestCase( - # inputs=[in_spec], - # kwargs=kwargs, - # output_spec=out_spec, - # comparison_target="out", - # tolerance=tol, - # description="All - INPLACE(out)", - # ) - # ) + out_shape = _compute_out_shape(shape, dim, keepdim) + out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool) + if out_supports_inplace: + test_cases.append( + TestCase( + inputs=[in_spec], + kwargs=kwargs, + output_spec=out_spec, + comparison_target="out", + tolerance=tol, + description="All - INPLACE(out)", + ) + ) return test_cases @@ -110,9 +110,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.all(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation (operator not yet available).""" - return infinicore.all(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.all(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/avg_pool1d.py b/test/infinicore/ops/avg_pool1d.py index 539951628..5a0318571 100644 --- a/test/infinicore/ops/avg_pool1d.py +++ b/test/infinicore/ops/avg_pool1d.py @@ -74,8 +74,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.avg_pool1d(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - return infinicore.nn.functional.avg_pool1d(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.nn.functional.avg_pool1d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/cross_entropy.py b/test/infinicore/ops/cross_entropy.py index 269216bc7..e71a30567 100644 --- a/test/infinicore/ops/cross_entropy.py +++ b/test/infinicore/ops/cross_entropy.py @@ -11,8 +11,6 @@ # Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None) # infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean') -# CrossEntropy kernel当前只支持逐元素loss且不带class weight/ignore_index。 -# 仍然保留原始配置,后续实现这些特性时只需放开过滤条件即可。 _TEST_CASES_DATA = [ ((4, 5), (4,), None, False, None), ((8, 10), (8,), None, True, -1), @@ -22,9 +20,6 @@ ((2, 2), (2,), None, True, -100), ] -_SUPPORT_WEIGHT = False -_SUPPORT_IGNORE_INDEX = False - _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, @@ -45,11 +40,6 @@ def parse_test_cases(): ) in _TEST_CASES_DATA: for dtype in _TENSOR_DTYPES: tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) - if weight_present and not _SUPPORT_WEIGHT: - continue - if ignore_index is not None and not _SUPPORT_IGNORE_INDEX: - continue - logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype) target = TensorSpec.from_tensor( target_shape, @@ -61,7 +51,7 @@ def parse_test_cases(): ) inputs = [logits, target] - kwargs = {"reduction": "none"} + kwargs = {} if weight_present: weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype) inputs.append(weight_spec) @@ -94,10 +84,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.cross_entropy(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation.""" - out = kwargs.pop("out", None) - return infinicore.cross_entropy(*args, out=out, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.nn.functional.cross_entropy(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/equal.py b/test/infinicore/ops/equal.py index fd5c37261..10aae3fcb 100644 --- a/test/infinicore/ops/equal.py +++ b/test/infinicore/ops/equal.py @@ -74,11 +74,8 @@ def parse_test_cases(): ) ) - # Equal 结果为 bool,无法安全复用浮点/整型输入作为输出缓冲区。 - # 只有当输入 dtype 本身为 bool 时才允许 inplace,这里提前留出开关。 - allow_input_inplace = dtype == infinicore.bool - - if allow_input_inplace and a_supports_inplace: + # in-place a + if a_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -90,7 +87,8 @@ def parse_test_cases(): ) ) - if allow_input_inplace and b_supports_inplace: + # in-place b + if b_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -117,8 +115,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.eq(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - return infinicore.equal(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.eq(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardswish.py b/test/infinicore/ops/hardswish.py index 5ab38d594..9f31cdc62 100644 --- a/test/infinicore/ops/hardswish.py +++ b/test/infinicore/ops/hardswish.py @@ -70,8 +70,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardswish(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - return infinicore.nn.functional.hardswish(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.nn.functional.hardswish(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardtanh.py b/test/infinicore/ops/hardtanh.py index a88ea6c8d..6861e464e 100644 --- a/test/infinicore/ops/hardtanh.py +++ b/test/infinicore/ops/hardtanh.py @@ -17,6 +17,7 @@ _TEST_CASES_DATA = [ ((13, 4), None, -1.0, 1.0), + ((13, 4), (10, 1), -0.5, 0.5), ((8, 8, 8), None, -2.0, 2.0), ] @@ -86,11 +87,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardtanh(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation.""" - import infinicore.nn.functional as F - - return F.hardtanh(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.nn.functional.hardtanh(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/sum.py b/test/infinicore/ops/sum.py index b22f77242..74d9e29fc 100644 --- a/test/infinicore/ops/sum.py +++ b/test/infinicore/ops/sum.py @@ -20,7 +20,7 @@ ((8, 8), None, None, None, None), ((8, 8), (16, 1), 1, False, None), ((2, 3, 4), None, 0, True, None), - ((1, 8), None, (0,), False, None), # tuple 导致 infini_list kwargs dim,[0] + ((1, 8), None, (0,), False, None), ((16, 64), (128, 1), None, None, None), ((4, 5, 6), (60, 12, 2), 2, True, None), ] @@ -61,6 +61,7 @@ def parse_test_cases(): description="Sum - OUT_OF_PLACE", ) ) + return test_cases @@ -76,11 +77,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.sum(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation (operator not yet available).""" - return infinicore.sum( - *args, **kwargs - ) # todo 找到具体对应的 python/infinicore/ops/sum.py + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.sum(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/topk.py b/test/infinicore/ops/topk.py index 50876b1b7..a0b9bdcd8 100644 --- a/test/infinicore/ops/topk.py +++ b/test/infinicore/ops/topk.py @@ -15,7 +15,7 @@ # Test cases format: (shape, input_strides, k, dim, largest, sorted) _TEST_CASES_DATA = [ - ((6, 8), None, 1, 1, False, True), + ((6, 8), None, 1, 1, True, True), ((8, 4), (16, 1), 2, 0, True, False), ((5, 5), None, 3, -1, False, True), ((3, 7), (14, 1), 2, 1, True, True), @@ -55,7 +55,6 @@ def parse_test_cases(): comparison_target=None, tolerance=tol, description=f"topk - OUT_OF_PLACE", - output_count=2, ) ) @@ -78,9 +77,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.topk(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation (operator not yet available).""" - return infinicore.topk(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.topk(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/var.py b/test/infinicore/ops/var.py index d441ed4ab..e0ce9f463 100644 --- a/test/infinicore/ops/var.py +++ b/test/infinicore/ops/var.py @@ -76,9 +76,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.var(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation (operator not yet available).""" - return infinicore.var(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.var(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/var_mean.py b/test/infinicore/ops/var_mean.py index b1de0bf90..18015d2cd 100644 --- a/test/infinicore/ops/var_mean.py +++ b/test/infinicore/ops/var_mean.py @@ -15,7 +15,7 @@ # Test cases format: (in_shape, in_strides_or_None, dim_or_None, unbiased_or_None, keepdim_or_None) # var_mean returns (var, mean) -# Changed in torch version 2.0: Previously this argument was called unbiased and was a boolean with True corresponding to correction=1 and False being correction=0. + _TEST_CASES_DATA = [ ((8, 8), None, None, None, None), ((8, 8), (16, 1), 1, True, False), @@ -27,7 +27,7 @@ _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-5, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, } _TENSOR_DTYPES = [infinicore.float16, infinicore.float32] @@ -47,8 +47,6 @@ def parse_test_cases(): kwargs["dim"] = dim if unbiased is not None: kwargs["unbiased"] = unbiased - # Changed in version 2.0: Previously this argument was called unbiased and was a boolean with True - # corresponding to correction=1 and False being correction=0. if keepdim is not None: kwargs["keepdim"] = keepdim @@ -78,9 +76,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.var_mean(*args, **kwargs) - def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation (operator not yet available).""" - return infinicore.var_mean(*args, **kwargs) + # def infinicore_operator(self, *args, **kwargs): + # """InfiniCore implementation (operator not yet available).""" + # return infinicore.var_mean(*args, **kwargs) def main(): diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py deleted file mode 100644 index dd9e771c0..000000000 --- a/test/infiniop/avg_pool1d.py +++ /dev/null @@ -1,183 +0,0 @@ -import ctypes -from ctypes import c_uint64 - -import torch - -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -_TEST_CASES = [ - # input_shape, x_stride, y_stride, kernel_size, stride, padding - ((2, 3, 16), None, None, 3, None, 0), - ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2), - ((2, 1, 32), None, (32, 16, 1), 2, 2, 0), - ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1), - ((4, 6, 31), None, None, 4, 2, 1), - ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0), -] - -# Data types used for testing -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, - InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2}, - InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def _effective_stride(stride, kernel_size): - if stride in (None, 0): - return kernel_size - return stride - - -def _compute_output_shape(input_shape, kernel_size, stride, padding): - stride = _effective_stride(stride, kernel_size) - width = input_shape[2] - out_width = (width + 2 * padding - kernel_size) // stride + 1 - return (input_shape[0], input_shape[1], out_width) - - -def avg_pool1d_ref(x, kernel_size, stride, padding): - stride = _effective_stride(stride, kernel_size) - out = torch.nn.functional.avg_pool1d( - x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding - ) - return out.to(x.dtype) - - -def test( - handle, - device, - input_shape, - x_stride, - y_stride, - kernel_size, - stride, - padding, - dtype=InfiniDtype.F16, - sync=None, -): - stride_value = _effective_stride(stride, kernel_size) - out_shape = _compute_output_shape( - input_shape, kernel_size, stride_value, padding - ) - print( - f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, " - f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, " - f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}" - ) - - x = TestTensor(input_shape, x_stride, dtype, device) - y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros") - - ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAvgPool1dDescriptor( - handle, - ctypes.byref(descriptor), - y.descriptor, - x.descriptor, - kernel_size, - stride_value, - padding, - ) - ) - - # Invalidate descriptors in tensors after creation to make sure kernels read from arguments - x.destroy_desc() - y.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, x.device) - - def lib_avg_pool1d(): - check_error( - LIBINFINIOP.infiniopAvgPool1d( - descriptor, - workspace.data(), - workspace.size(), - y.data(), - x.data(), - None, - ) - ) - - lib_avg_pool1d() - - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) - - if PROFILE: - # fmt: off - profile_operation( - "PyTorch", - lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding), - device, - NUM_PRERUN, - NUM_ITERATIONS, - ) - profile_operation( - " lib", - lambda: lib_avg_pool1d(), - device, - NUM_PRERUN, - NUM_ITERATIONS, - ) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") - diff --git a/test/infiniop/cross_entropy.py b/test/infiniop/cross_entropy.py deleted file mode 100644 index 987f2d11a..000000000 --- a/test/infiniop/cross_entropy.py +++ /dev/null @@ -1,106 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) - -# ------------------------------------------------------------ -# 用例配置 -# ------------------------------------------------------------ -_TEST_CASES_ = [ - ((2, 4, 10), None, None), # logits shape, x_stride, y_stride - ((1, 128, 32000), None, None), - ((4, 512, 1000), None, None), -] - -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2}, - InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, -} - -# ------------------------------------------------------------ -# PyTorch 参考实现 -# ------------------------------------------------------------ -def cross_entropy_ref(logits, target): - vocab = logits.shape[-1] - logits_flat = logits.reshape(-1, vocab).float() - target_flat = target.reshape(-1).long() - loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none") - return loss.view(target.shape).to(logits.dtype) - - -def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None): - logits_shape = shape - label_shape = shape[:-1] - vocab = shape[-1] - - print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}") - - x = TestTensor(logits_shape, x_stride, dtype, device) - target = TestTensor(label_shape, None, InfiniDtype.I64, device) - - # 生成有效标签 - tgt = target.torch_tensor() - tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device)) - target.actual_tensor().copy_(tgt) - - reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor()) - y = TestTensor(label_shape, y_stride, dtype, device) - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCrossEntropyDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor - ) - ) - - for tensor in [x, y, target]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size))) - workspace = TestWorkspace(workspace_size.value, x.device) - - def run(): - check_error( - LIBINFINIOP.infiniopCrossEntropy( - descriptor, - workspace.data(), - workspace.size(), - y.data(), - x.data(), - target.data(), - None, - ) - ) - - run() - if sync: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol) - - check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES) - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py deleted file mode 100644 index e333b94b3..000000000 --- a/test/infiniop/equal.py +++ /dev/null @@ -1,181 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - -# Equal 算子通常不支持 Inplace (输入Float vs 输出Bool,内存大小不同) -class Inplace(Enum): - OUT_OF_PLACE = auto() - -_INPLACE = [ - Inplace.OUT_OF_PLACE, -] - -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# 测试的输入数据类型 -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64] - -# 容差设置 (对于 Bool 比较,通常要求完全匹配) -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 0}, - InfiniDtype.F32: {"atol": 0, "rtol": 0}, - InfiniDtype.BF16: {"atol": 0, "rtol": 0}, - InfiniDtype.I32: {"atol": 0, "rtol": 0}, - InfiniDtype.I64: {"atol": 0, "rtol": 0}, - InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - -# PyTorch 标准实现 -def equal_func(c, a, b): - torch.eq(a, b, out=c) - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=torch.float16, - sync=None, -): - # 输入 Tensor 使用指定的 dtype (如 float16) - a = TestTensor(shape, a_stride, dtype, device) - b = TestTensor(shape, b_stride, dtype, device) - - # [关键修改] 输出 Tensor 强制使用 Bool 类型 - # 注意:这里 c_stride 如果是按字节计算的,对于 Bool 类型通常是 1 byte - c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device) - - if c.is_broadcast(): - return - - print( - f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL" - ) - - # 运行 PyTorch 对照组 - equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - - # [关键修改] 调用 Equal 的 Create 函数 - check_error( - LIBINFINIOP.infiniopCreateEqualDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, # Output (Bool) - a.descriptor, # Input A - b.descriptor, # Input B - ) - ) - - # Invalidate descriptors - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetEqualWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, c.device) - - def lib_equal(): - check_error( - LIBINFINIOP.infiniopEqual( - descriptor, - workspace.data(), - workspace.size(), - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_equal() - - # 使用 Bool 类型的容差 (实际上就是全等) - atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL) - - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - - # 验证结果 - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py deleted file mode 100644 index b60439d16..000000000 --- a/test/infiniop/hardswish.py +++ /dev/null @@ -1,171 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# 复用相同的测试用例配置,因为 HardSwish 也是逐元素操作 -_TEST_CASES_ = [ - # shape, input_stride, output_stride - ((13, 4), None, None), - ((13, 4), (10, 1), (10, 1)), - ((13, 4), (0, 1), None), - ((13, 4, 4), None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), None), - ((16, 5632), None, None), - ((16, 5632), (13312, 1), (13312, 1)), - ((4, 4, 5632), None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE = auto() - - -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE, -] - -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] - -_TOLERANCE_MAP = { - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, - InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def test( - handle, - device, - shape, - input_stride=None, - output_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=torch.float16, - sync=None, -): - input = TestTensor(shape, input_stride, dtype, device) - if inplace == Inplace.INPLACE: - if input_stride != output_stride: - return - output = input - else: - output = TestTensor(shape, output_stride, dtype, device, mode="ones") - - if output.is_broadcast(): - return - - print( - f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}" - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - new_output = torch.nn.functional.hardswish(input.torch_tensor()) - output.update_torch_tensor(new_output) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - - check_error( - LIBINFINIOP.infiniopCreateHardSwishDescriptor( - handle, - ctypes.byref(descriptor), - output.descriptor, - input.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [input, output]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetHardSwishWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, output.device) - - def lib_hardswish(): - check_error( - LIBINFINIOP.infiniopHardSwish( - descriptor, - workspace.data(), - workspace.size(), - output.data(), - input.data(), - None, - ) - ) - - lib_hardswish() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - assert torch.allclose( - output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol - ) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardtanh.py b/test/infiniop/hardtanh.py deleted file mode 100644 index 02549ed7d..000000000 --- a/test/infiniop/hardtanh.py +++ /dev/null @@ -1,169 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64, c_float -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration -# ============================================================================== -_TEST_CASES_ = [ - # shape, input_stride, output_stride - ((13, 4), None, None), - ((13, 4), (10, 1), (10, 1)), - ((16, 5632), None, None), - ((4, 4, 5632), None, None), -] - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE = auto() - -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE, -] - -# HardTanh 特有的参数测试组合 (min_val, max_val) -_PARAM_CASES = [ - (-1.0, 1.0), - (0.0, 6.0), # 类似于 ReLU6 - (-2.5, 2.5), -] - -# 组合所有测试用例:shape + inplace + params -_TEST_CASES = [ - test_case + (inplace_item, p_min, p_max) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE - for p_min, p_max in _PARAM_CASES -] - -_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] - -_TOLERANCE_MAP = { - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - -def test( - handle, - device, - shape, - input_stride=None, - output_stride=None, - inplace=Inplace.OUT_OF_PLACE, - min_val=-1.0, - max_val=1.0, - dtype=torch.float16, - sync=None, -): - input = TestTensor(shape, input_stride, dtype, device) - if inplace == Inplace.INPLACE: - if input_stride != output_stride: - return - output = input - else: - output = TestTensor(shape, output_stride, dtype, device, mode="ones") - - if output.is_broadcast(): - return - - print( - f"Testing HardTanh on {InfiniDeviceNames[device]} | shape:{shape} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace} range:[{min_val}, {max_val}]" - ) - - # 计算 PyTorch 真值 - new_output = torch.nn.functional.hardtanh(input.torch_tensor(), min_val=min_val, max_val=max_val) - output.update_torch_tensor(new_output) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - - check_error( - LIBINFINIOP.infiniopCreateHardTanhDescriptor( - handle, - ctypes.byref(descriptor), - output.descriptor, - input.descriptor, - c_float(min_val), - c_float(max_val), - ) - ) - - for tensor in [input, output]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetHardTanhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, output.device) - - def lib_hardtanh(): - check_error( - LIBINFINIOP.infiniopHardTanh( - descriptor, - workspace.data(), - workspace.size(), - output.data(), - input.data(), - None, - ) - ) - - lib_hardtanh() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - assert torch.allclose( - output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol - ) - - if PROFILE: - profile_operation("PyTorch", lambda: torch.nn.functional.hardtanh(input.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_hardtanh(), device, NUM_PRERUN, NUM_ITERATIONS) - - check_error(LIBINFINIOP.infiniopDestroyHardTanhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mHardTanh Test passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 8aeba0100..275689e78 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -54,54 +54,6 @@ def add_(lib): infiniopOperatorDescriptor_t, ] -@OpRegister.operator -def equal_(lib): - # ========================================================= - # 1. 注册 Create 函数 - # C函数签名: (handle, &desc, output_desc, input_a_desc, input_b_desc) - # ========================================================= - lib.infiniopCreateEqualDescriptor.restype = c_int32 - lib.infiniopCreateEqualDescriptor.argtypes = [ - infiniopHandle_t, # handle - POINTER(infiniopOperatorDescriptor_t),# desc_ptr (输出) - infiniopTensorDescriptor_t, # output (c) - infiniopTensorDescriptor_t, # input_a - infiniopTensorDescriptor_t, # input_b - ] - - # ========================================================= - # 2. 注册 GetWorkspaceSize 函数 - # C函数签名: (desc, &size) - # ========================================================= - lib.infiniopGetEqualWorkspaceSize.restype = c_int32 - lib.infiniopGetEqualWorkspaceSize.argtypes = [ - infiniopOperatorDescriptor_t, - POINTER(c_size_t), - ] - - # ========================================================= - # 3. 注册 Execute (计算) 函数 - # C函数签名: (desc, workspace, size, output_data, input_a_data, input_b_data, stream) - # ========================================================= - lib.infiniopEqual.restype = c_int32 - lib.infiniopEqual.argtypes = [ - infiniopOperatorDescriptor_t, # desc - c_void_p, # workspace ptr - c_size_t, # workspace size - c_void_p, # output data ptr - c_void_p, # input a data ptr - c_void_p, # input b data ptr - c_void_p, # stream - ] - - # ========================================================= - # 4. 注册 Destroy 函数 - # C函数签名: (desc) - # ========================================================= - lib.infiniopDestroyEqualDescriptor.restype = c_int32 - lib.infiniopDestroyEqualDescriptor.argtypes = [ - infiniopOperatorDescriptor_t, - ] @OpRegister.operator def attention_(lib): @@ -210,40 +162,6 @@ def clip_(lib): ] -@OpRegister.operator -def cross_entropy_(lib): - lib.infiniopCreateCrossEntropyDescriptor.restype = c_int32 - lib.infiniopCreateCrossEntropyDescriptor.argtypes = [ - infiniopHandle_t, - POINTER(infiniopOperatorDescriptor_t), - infiniopTensorDescriptor_t, - infiniopTensorDescriptor_t, - infiniopTensorDescriptor_t, - ] - - lib.infiniopGetCrossEntropyWorkspaceSize.restype = c_int32 - lib.infiniopGetCrossEntropyWorkspaceSize.argtypes = [ - infiniopOperatorDescriptor_t, - POINTER(c_size_t), - ] - - lib.infiniopCrossEntropy.restype = c_int32 - lib.infiniopCrossEntropy.argtypes = [ - infiniopOperatorDescriptor_t, - c_void_p, - c_size_t, - c_void_p, - c_void_p, - c_void_p, - c_void_p, - ] - - lib.infiniopDestroyCrossEntropyDescriptor.restype = c_int32 - lib.infiniopDestroyCrossEntropyDescriptor.argtypes = [ - infiniopOperatorDescriptor_t, - ] - - @OpRegister.operator def logsoftmax_(lib): lib.infiniopCreateLogSoftmaxDescriptor.restype = c_int32 @@ -991,112 +909,6 @@ def silu_(lib): infiniopOperatorDescriptor_t, ] -@OpRegister.operator -def hardtanh_(lib): - # 1. Create Descriptor - 注意增加了两个 c_float 参数 - lib.infiniopCreateHardTanhDescriptor.restype = c_int32 - lib.infiniopCreateHardTanhDescriptor.argtypes = [ - infiniopHandle_t, # handle - POINTER(infiniopOperatorDescriptor_t), # desc_ptr - infiniopTensorDescriptor_t, # output - infiniopTensorDescriptor_t, # input - c_float, # min_val - c_float, # max_val - ] - - # 2. Get Workspace Size - lib.infiniopGetHardTanhWorkspaceSize.restype = c_int32 - lib.infiniopGetHardTanhWorkspaceSize.argtypes = [ - infiniopOperatorDescriptor_t, # desc - POINTER(c_size_t), # size - ] - - # 3. Execute Operator - lib.infiniopHardTanh.restype = c_int32 - lib.infiniopHardTanh.argtypes = [ - infiniopOperatorDescriptor_t, # desc - c_void_p, # workspace - c_size_t, # workspace_size - c_void_p, # output - c_void_p, # input - c_void_p, # stream - ] - - # 4. Destroy Descriptor - lib.infiniopDestroyHardTanhDescriptor.restype = c_int32 - lib.infiniopDestroyHardTanhDescriptor.argtypes = [ - infiniopOperatorDescriptor_t, # desc - ] - -@OpRegister.operator -def hardswish_(lib): - lib.infiniopCreateHardSwishDescriptor.restype = c_int32 - lib.infiniopCreateHardSwishDescriptor.argtypes = [ - infiniopHandle_t, - POINTER(infiniopOperatorDescriptor_t), - infiniopTensorDescriptor_t, - infiniopTensorDescriptor_t, - ] - - lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32 - lib.infiniopGetHardSwishWorkspaceSize.argtypes = [ - infiniopOperatorDescriptor_t, - POINTER(c_size_t), - ] - - lib.infiniopHardSwish.restype = c_int32 - lib.infiniopHardSwish.argtypes = [ - infiniopOperatorDescriptor_t, - c_void_p, - c_size_t, - c_void_p, - c_void_p, - c_void_p, - ] - - lib.infiniopDestroyHardSwishDescriptor.restype = c_int32 - lib.infiniopDestroyHardSwishDescriptor.argtypes = [ - infiniopOperatorDescriptor_t, - ] - -@OpRegister.operator -def avg_pool1d_(lib): - # 1. Create 函数 - # C签名: (handle, *desc, y, x, kernel_size, stride, padding) - lib.infiniopCreateAvgPool1dDescriptor.restype = c_int32 - lib.infiniopCreateAvgPool1dDescriptor.argtypes = [ - infiniopHandle_t, - POINTER(infiniopOperatorDescriptor_t), - infiniopTensorDescriptor_t, # y_desc (Output) - infiniopTensorDescriptor_t, # x_desc (Input) - c_size_t, # kernel_size - c_size_t, # stride - c_size_t, # padding - ] - - # 2. GetWorkspaceSize 函数 - lib.infiniopGetAvgPool1dWorkspaceSize.restype = c_int32 - lib.infiniopGetAvgPool1dWorkspaceSize.argtypes = [ - infiniopOperatorDescriptor_t, - POINTER(c_size_t), - ] - - # 3. Execute 函数 - lib.infiniopAvgPool1d.restype = c_int32 - lib.infiniopAvgPool1d.argtypes = [ - infiniopOperatorDescriptor_t, - c_void_p, # workspace - c_size_t, # workspace_size - c_void_p, # y (output pointer) - c_void_p, # x (input pointer) - c_void_p, # stream - ] - - # 4. Destroy 函数 - lib.infiniopDestroyAvgPool1dDescriptor.restype = c_int32 - lib.infiniopDestroyAvgPool1dDescriptor.argtypes = [ - infiniopOperatorDescriptor_t, - ] @OpRegister.operator def layer_norm_(lib): diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index b690e74d4..ec8763a4e 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -83,12 +83,8 @@ def __init__( InfiniDtype.BYTE, InfiniDtype.BOOL, ]: - if dt == InfiniDtype.BOOL: - randint_low = 0 if randint_low is None else randint_low - randint_high = 2 if randint_high is None else randint_high - else: - randint_low = -2000000000 if randint_low is None else randint_low - randint_high = 2000000000 if randint_high is None else randint_high + randint_low = -2000000000 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high self._torch_tensor = torch.randint( randint_low, randint_high,