diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index cd7c5cb4a..53b3a2f10 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -3,13 +3,9 @@
 #include "ops/add.hpp"
 #include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
-#include "ops/avg_pool1d.hpp"
 #include "ops/causal_softmax.hpp"
-#include "ops/cross_entropy.hpp"
 #include "ops/embedding.hpp"
 #include "ops/flash_attention.hpp"
-#include "ops/hardswish.hpp"
-#include "ops/hardtanh.hpp"
 #include "ops/kv_caching.hpp"
 #include "ops/matmul.hpp"
 #include "ops/ones.hpp"
diff --git a/include/infinicore/ops/all.hpp b/include/infinicore/ops/all.hpp
deleted file mode 100644
index 50d76f2d7..000000000
--- a/include/infinicore/ops/all.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-#include <optional>
-#include <vector>
-namespace infinicore::op {
-class All {
-public:
-    using schema = void (*)(Tensor, Tensor, std::vector<size_t>, bool);
-    static void execute(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim = false);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor all(Tensor input, std::vector<size_t> dim, bool keepdim = false);
-void all_(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim = false);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/avg_pool1d.hpp b/include/infinicore/ops/avg_pool1d.hpp
deleted file mode 100644
index 4bf69bc2a..000000000
--- a/include/infinicore/ops/avg_pool1d.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-
-namespace infinicore::op {
-
-class AvgPool1d {
-public:
-    using schema = void (*)(Tensor, Tensor, size_t, size_t, size_t);
-    static void execute(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
-void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/cross_entropy.hpp b/include/infinicore/ops/cross_entropy.hpp
deleted file mode 100644
index 9a6d446d2..000000000
--- a/include/infinicore/ops/cross_entropy.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-
-namespace infinicore::op {
-
-class CrossEntropy {
-public:
-    // Schema 定义：函数指针类型
-    // CrossEntropy 需要接收三个 Tensor: Output (Loss), Input (Logits), Target (Labels)
-    using schema = void (*)(Tensor, Tensor, Tensor);
-
-    // 执行入口
-    static void execute(Tensor output, Tensor input, Tensor target);
-
-    // 分发器访问接口
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-// ==================================================================
-// 对外 Functional API
-// ==================================================================
-
-// 1. Out-of-place 接口：
-// 输入 Logits 和 Target，内部自动创建 Output Tensor 并返回
-Tensor cross_entropy(Tensor input, Tensor target);
-
-// 2. Explicit Output 接口 (类似于 In-place 风格)：
-// 用户显式提供 Output Tensor 用于存储结果
-// 注意：虽然命名带有下划线 _，但通常 CrossEntropy 无法真正原地修改 input，
-// 所以这里只是表示“写入指定的 output 内存”
-void cross_entropy_(Tensor output, Tensor input, Tensor target);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/equal.hpp b/include/infinicore/ops/equal.hpp
deleted file mode 100644
index 1a158bf1e..000000000
--- a/include/infinicore/ops/equal.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-
-namespace infinicore::op {
-
-class Equal {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor);
-
-    static void execute(Tensor out, Tensor a, Tensor b);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor equal(Tensor a, Tensor b);
-void equal_(Tensor out, Tensor a, Tensor b);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/hardswish.hpp b/include/infinicore/ops/hardswish.hpp
deleted file mode 100644
index 15313f461..000000000
--- a/include/infinicore/ops/hardswish.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-
-namespace infinicore::op {
-
-class Hardswish {
-public:
-    using schema = void (*)(Tensor, Tensor);
-    static void execute(Tensor output, Tensor input);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor hardswish(Tensor input);
-void hardswish_(Tensor output, Tensor input);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/hardtanh.hpp b/include/infinicore/ops/hardtanh.hpp
deleted file mode 100644
index 511408fee..000000000
--- a/include/infinicore/ops/hardtanh.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-
-namespace infinicore::op {
-
-class HardTanh {
-public:
-    using schema = void (*)(Tensor, Tensor, float, float);
-    static void execute(Tensor output, Tensor input, float min_val, float max_val);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor hardtanh(Tensor input, float min_val = -1.0f, float max_val = 1.0f);
-void hardtanh_(Tensor output, Tensor input, float min_val = -1.0f, float max_val = 1.0f);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/sum.hpp b/include/infinicore/ops/sum.hpp
deleted file mode 100644
index 0ead8de26..000000000
--- a/include/infinicore/ops/sum.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-#include <optional>
-#include <vector>
-
-namespace infinicore::op {
-class Sum {
-public:
-    using schema = void (*)(Tensor, Tensor, std::vector<size_t>, bool);
-    static void execute(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim = false);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor sum(Tensor input, std::vector<size_t> dim, bool keepdim = false);
-void sum_(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim = false);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/topk.hpp b/include/infinicore/ops/topk.hpp
deleted file mode 100644
index d8486112c..000000000
--- a/include/infinicore/ops/topk.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-namespace infinicore::op {
-class TopK {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor, size_t, size_t, bool, bool);
-    static void execute(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest = true, bool sorted = true);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-std::pair<Tensor, Tensor> topk(Tensor input, size_t k, size_t dim, bool largest = true, bool sorted = true);
-void topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest = true, bool sorted = true);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/var.hpp b/include/infinicore/ops/var.hpp
deleted file mode 100644
index d1e01e1bf..000000000
--- a/include/infinicore/ops/var.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-#include <optional>
-#include <utility>
-#include <vector>
-namespace infinicore::op {
-class Var {
-public:
-    using schema = void (*)(Tensor, Tensor, std::vector<size_t>, bool, bool); // var_output, input, dim, unbiased, keepdim
-    static void execute(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased = true, bool keepdim = false);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-Tensor var(Tensor input, std::vector<size_t> dim, bool unbiased = true, bool keepdim = false);
-void var_(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased = true, bool keepdim = false);
-
-} // namespace infinicore::op
diff --git a/include/infinicore/ops/var_mean.hpp b/include/infinicore/ops/var_mean.hpp
deleted file mode 100644
index a9679187c..000000000
--- a/include/infinicore/ops/var_mean.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include "../device.hpp"
-#include "common/op.hpp"
-#include <optional>
-#include <utility>
-#include <vector>
-namespace infinicore::op {
-class Var_Mean {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor, std::vector<size_t>, bool, bool); // var_output, mean_output, input, dim, unbiased, keepdim
-    static void execute(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased = true, bool keepdim = false);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
-std::pair<Tensor, Tensor> var_mean(Tensor input, std::vector<size_t> dim, bool unbiased = true, bool keepdim = false);
-void var_mean_(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased = true, bool keepdim = false);
-
-} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index f596a312b..11d42c1d1 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -4,7 +4,6 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
 #include "infiniop/ops/add_rms_norm.h"
-#include "infiniop/ops/all.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
@@ -36,21 +35,11 @@
 #include "infiniop/ops/softmax.h"
 #include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
-#include "infiniop/ops/sum.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/tanh.h"
-#include "infiniop/ops/topk.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
-#include "infiniop/ops/var.h"
-#include "infiniop/ops/var_mean.h"
 #include "infiniop/ops/zeros.h"
 #include "infiniop/tensor_descriptor.h"
 
-#include "infiniop/ops/cross_entropy.h"
-#include "infiniop/ops/hardswish.h"
-#include "infiniop/ops/avg_pool1d.h"
-#include "infiniop/ops/equal.h"
-#include "infiniop/ops/hardtanh.h"
-
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/all.h b/include/infiniop/ops/all.h
deleted file mode 100644
index 41d74cf9a..000000000
--- a/include/infiniop/ops/all.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef __INFINIOP_ALL_API_H__
-#define __INFINIOP_ALL_API_H__
-
-#include "../operator_descriptor.h"
-#include <cstddef>
-#include <vector>
-typedef struct InfiniopDescriptor *infiniopAllDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateAllDescriptor(infiniopHandle_t handle,
-                                                               infiniopAllDescriptor_t *desc_ptr,
-                                                               infiniopTensorDescriptor_t output_desc,
-                                                               infiniopTensorDescriptor_t input_desc,
-                                                               size_t *dim,
-                                                               size_t dim_size,
-                                                               bool keepdim);
-
-__INFINI_C __export infiniStatus_t infiniopGetAllWorkspaceSize(infiniopAllDescriptor_t desc, size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopAll(infiniopAllDescriptor_t desc,
-                                               void *workspace,
-                                               size_t workspace_size,
-                                               void *output,
-                                               const void *input,
-                                               size_t *dim,
-                                               size_t dim_size,
-                                               bool keepdim,
-                                               void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyAllDescriptor(infiniopAllDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/avg_pool1d.h b/include/infiniop/ops/avg_pool1d.h
deleted file mode 100644
index 81c489dd7..000000000
--- a/include/infiniop/ops/avg_pool1d.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_API_H__
-#define __INFINIOP_AVG_POOL1D_API_H__
-
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopAvgPool1dDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateAvgPool1dDescriptor(
-    infiniopHandle_t handle,
-    infiniopAvgPool1dDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t output,
-    infiniopTensorDescriptor_t input,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding);
-
-__INFINI_C __export infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(
-    infiniopAvgPool1dDescriptor_t desc,
-    size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopAvgPool1d(
-    infiniopAvgPool1dDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor(
-    infiniopAvgPool1dDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/cross_entropy.h b/include/infiniop/ops/cross_entropy.h
deleted file mode 100644
index 2ebd4b168..000000000
--- a/include/infiniop/ops/cross_entropy.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef __INFINIOP_CROSS_ENTROPY_API_H__
-#define __INFINIOP_CROSS_ENTROPY_API_H__
-
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopCrossEntropyDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateCrossEntropyDescriptor(
-    infiniopHandle_t handle,
-    infiniopCrossEntropyDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    infiniopTensorDescriptor_t target_desc);
-
-__INFINI_C __export infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
-    infiniopCrossEntropyDescriptor_t desc,
-    size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopCrossEntropy(
-    infiniopCrossEntropyDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    const void *target,
-    void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
-    infiniopCrossEntropyDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h
deleted file mode 100644
index 90c4f3386..000000000
--- a/include/infiniop/ops/equal.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef __INFINIOP_EQUAL_API_H__
-#define __INFINIOP_EQUAL_API_H__
-
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateEqualDescriptor(
-    infiniopHandle_t handle,
-    infiniopEqualDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t c,
-    infiniopTensorDescriptor_t a,
-    infiniopTensorDescriptor_t b);
-
-__INFINI_C __export infiniStatus_t infiniopGetEqualWorkspaceSize(
-    infiniopEqualDescriptor_t desc,
-    size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopEqual(
-    infiniopEqualDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *c,
-    const void *a,
-    const void *b,
-    void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyEqualDescriptor(
-    infiniopEqualDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h
deleted file mode 100644
index 1cdeecf67..000000000
--- a/include/infiniop/ops/hardswish.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __INFINIOP_HARDSWISH_API_H__
-#define __INFINIOP_HARDSWISH_API_H__
-
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateHardSwishDescriptor(
-    infiniopHandle_t handle,
-    infiniopHardSwishDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t output,
-    infiniopTensorDescriptor_t input);
-
-__INFINI_C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize(
-    infiniopHardSwishDescriptor_t desc,
-    size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopHardSwish(
-    infiniopHardSwishDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyHardSwishDescriptor(
-    infiniopHardSwishDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/hardtanh.h b/include/infiniop/ops/hardtanh.h
deleted file mode 100644
index d2f98cedd..000000000
--- a/include/infiniop/ops/hardtanh.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __INFINIOP_HARDTANH_API_H__
-#define __INFINIOP_HARDTANH_API_H__
-
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopHardTanhDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateHardTanhDescriptor(infiniopHandle_t handle,
-                                                                    infiniopHardTanhDescriptor_t *desc_ptr,
-                                                                    infiniopTensorDescriptor_t output,
-                                                                    infiniopTensorDescriptor_t input,
-                                                                    float min_val,
-                                                                    float max_val);
-
-__INFINI_C __export infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc,
-                                                                    size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopHardTanh(infiniopHardTanhDescriptor_t desc,
-                                                    void *workspace,
-                                                    size_t workspace_size,
-                                                    void *output,
-                                                    const void *input,
-                                                    void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/sum.h b/include/infiniop/ops/sum.h
deleted file mode 100644
index c97104c90..000000000
--- a/include/infiniop/ops/sum.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef __INFINIOP_SUM_API_H__
-#define __INFINIOP_SUM_API_H__
-
-#include "../operator_descriptor.h"
-#include <cstddef>
-#include <vector>
-typedef struct InfiniopDescriptor *infiniopSumDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateSumDescriptor(infiniopHandle_t handle,
-                                                               infiniopSumDescriptor_t *desc_ptr,
-                                                               infiniopTensorDescriptor_t output_desc,
-                                                               infiniopTensorDescriptor_t input_desc,
-                                                               size_t *dim,
-                                                               size_t dim_size,
-                                                               bool keepdim);
-
-__INFINI_C __export infiniStatus_t infiniopGetSumWorkspaceSize(infiniopSumDescriptor_t desc, size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopSum(infiniopSumDescriptor_t desc,
-                                               void *workspace,
-                                               size_t workspace_size,
-                                               void *output,
-                                               const void *input,
-                                               size_t *dim,
-                                               size_t dim_size,
-                                               bool keepdim,
-                                               void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroySumDescriptor(infiniopSumDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/topk.h b/include/infiniop/ops/topk.h
deleted file mode 100644
index 3eaf94289..000000000
--- a/include/infiniop/ops/topk.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef __INFINIOP_TOPK_API_H__
-#define __INFINIOP_TOPK_API_H__
-
-#include "../operator_descriptor.h"
-#include <cstddef>
-#include <vector>
-typedef struct InfiniopDescriptor *infiniopTopKDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateTopKDescriptor(infiniopHandle_t handle,
-                                                                infiniopTopKDescriptor_t *desc_ptr,
-                                                                infiniopTensorDescriptor_t values_output_desc,
-                                                                infiniopTensorDescriptor_t indices_output_desc,
-                                                                infiniopTensorDescriptor_t input_desc,
-                                                                size_t k,
-                                                                size_t dim,
-                                                                bool largest,
-                                                                bool sorted);
-
-__INFINI_C __export infiniStatus_t infiniopGetTopKWorkspaceSize(infiniopTopKDescriptor_t desc, size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopTopK(infiniopTopKDescriptor_t desc,
-                                                void *workspace,
-                                                size_t workspace_size,
-                                                void *values_output,
-                                                void *indices_output,
-                                                const void *input,
-                                                size_t k,
-                                                size_t dim,
-                                                bool largest,
-                                                bool sorted,
-                                                void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyTopKDescriptor(infiniopTopKDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/var.h b/include/infiniop/ops/var.h
deleted file mode 100644
index 7dc601a94..000000000
--- a/include/infiniop/ops/var.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __INFINIOP_VAR_API_H__
-#define __INFINIOP_VAR_API_H__
-
-#include "../operator_descriptor.h"
-#include <cstddef>
-#include <vector>
-typedef struct InfiniopDescriptor *infiniopVarDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateVarDescriptor(infiniopHandle_t handle,
-                                                               infiniopVarDescriptor_t *desc_ptr,
-                                                               infiniopTensorDescriptor_t var_output_desc,
-                                                               infiniopTensorDescriptor_t input_desc,
-                                                               size_t *dim,
-                                                               size_t dim_size,
-                                                               bool unbiased,
-                                                               bool keepdim);
-
-__INFINI_C __export infiniStatus_t infiniopGetVarWorkspaceSize(infiniopVarDescriptor_t desc, size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopVar(infiniopVarDescriptor_t desc,
-                                               void *workspace,
-                                               size_t workspace_size,
-                                               void *var_output,
-                                               const void *input,
-                                               size_t *dim,
-                                               size_t dim_size,
-                                               bool unbiased,
-                                               bool keepdim,
-                                               void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyVarDescriptor(infiniopVarDescriptor_t desc);
-
-#endif
diff --git a/include/infiniop/ops/var_mean.h b/include/infiniop/ops/var_mean.h
deleted file mode 100644
index 358a55636..000000000
--- a/include/infiniop/ops/var_mean.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef __INFINIOP_VAR_MEAN_API_H__
-#define __INFINIOP_VAR_MEAN_API_H__
-
-#include "../operator_descriptor.h"
-#include <cstddef>
-#include <vector>
-typedef struct InfiniopDescriptor *infiniopVarMeanDescriptor_t;
-
-__INFINI_C __export infiniStatus_t infiniopCreateVarMeanDescriptor(infiniopHandle_t handle,
-                                                                   infiniopVarMeanDescriptor_t *desc_ptr,
-                                                                   infiniopTensorDescriptor_t var_output_desc,
-                                                                   infiniopTensorDescriptor_t mean_output_desc,
-                                                                   infiniopTensorDescriptor_t input_desc,
-                                                                   size_t *dim,
-                                                                   size_t dim_size,
-                                                                   bool unbiased,
-                                                                   bool keepdim);
-
-__INFINI_C __export infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size);
-
-__INFINI_C __export infiniStatus_t infiniopVarMean(infiniopVarMeanDescriptor_t desc,
-                                                   void *workspace,
-                                                   size_t workspace_size,
-                                                   void *var_output,
-                                                   void *mean_output,
-                                                   const void *input,
-                                                   size_t *dim,
-                                                   size_t dim_size,
-                                                   bool unbiased,
-                                                   bool keepdim,
-                                                   void *stream);
-
-__INFINI_C __export infiniStatus_t infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc);
-
-#endif
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 229792b39..0b3eb9655 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -49,10 +49,7 @@
 )
 from infinicore.ops.add import add
 from infinicore.ops.add_rms_norm import add_rms_norm
-from infinicore.ops.all import all
 from infinicore.ops.attention import attention
-from infinicore.ops.cross_entropy import cross_entropy
-from infinicore.ops.equal import equal
 from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mha_kvcache import mha_kvcache
@@ -64,11 +61,7 @@
 from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
-from infinicore.ops.sum import sum
-from infinicore.ops.topk import topk
 from infinicore.ops.unsqueeze import unsqueeze
-from infinicore.ops.var import var
-from infinicore.ops.var_mean import var_mean
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -127,22 +120,16 @@
     "uint8",
     # Operators.
     "add",
-    "addcmul",
     "add_rms_norm",
     "add_rms_norm_",
-    "atanh",
     "attention",
-    "binary_cross_entropy_with_logits",
-    "cdist",
     "kv_caching",
     "matmul",
-    "equal",
     "mul",
     "narrow",
     "squeeze",
     "unsqueeze",
     "rearrange",
-    "cross_entropy",
     "empty",
     "empty_like",
     "from_blob",
@@ -155,15 +142,9 @@
     "paged_attention",
     "paged_attention_prefill",
     "ones",
-    "reciprocal",
     "strided_empty",
     "strided_from_blob",
     "zeros",
-    "sum",
-    "var_mean",
-    "var",
-    "topk",
-    "all",
 ]
 
 use_ntops = False
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 46ff04ae4..934930d56 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -1,9 +1,6 @@
-from .avg_pool1d import avg_pool1d
 from .causal_softmax import causal_softmax
 from .embedding import embedding
 from .flash_attention import flash_attention
-from .hardswish import hardswish
-from .hardtanh import hardtanh
 from .linear import linear
 from .linear_w8a8i8 import linear_w8a8i8
 from .random_sample import random_sample
@@ -23,9 +20,6 @@
     "RopeAlgo",
     "rope",
     "silu",
-    "hardswish",
-    "hardtanh",
-    "avg_pool1d",
     "swiglu",
     "linear_w8a8i8",
     "silu_and_mul",
diff --git a/python/infinicore/nn/functional/avg_pool1d.py b/python/infinicore/nn/functional/avg_pool1d.py
deleted file mode 100644
index 0cf4759ad..000000000
--- a/python/infinicore/nn/functional/avg_pool1d.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def avg_pool1d(
-    input: Tensor,
-    kernel_size: int,
-    stride: int | None = None,
-    padding: int = 0,
-    *,
-    out=None,
-) -> Tensor:
-    if stride is None:
-        stride = 0
-
-    if out is None:
-        return Tensor(
-            _infinicore.avg_pool1d(input._underlying, kernel_size, stride, padding)
-        )
-
-    _infinicore.avg_pool1d_(
-        out._underlying, input._underlying, kernel_size, stride, padding
-    )
-    return out
diff --git a/python/infinicore/nn/functional/hardswish.py b/python/infinicore/nn/functional/hardswish.py
deleted file mode 100644
index b054b8978..000000000
--- a/python/infinicore/nn/functional/hardswish.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import infinicore
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor:
-    r"""Apply the Hardswish activation function element-wise."""
-
-    if (
-        infinicore.use_ntops
-        and input.device.type in ("cuda", "musa")
-        and out is None
-        and hasattr(infinicore.ntops.torch, "hardswish")
-    ):
-        try:
-            return infinicore.ntops.torch.hardswish(input, inplace=inplace)
-        except AttributeError:
-            pass
-
-    if inplace:
-        _infinicore.hardswish_(input._underlying, input._underlying)
-        return input
-
-    if out is None:
-        return Tensor(_infinicore.hardswish(input._underlying))
-
-    _infinicore.hardswish_(out._underlying, input._underlying)
-    return out
diff --git a/python/infinicore/nn/functional/hardtanh.py b/python/infinicore/nn/functional/hardtanh.py
deleted file mode 100644
index 925de33d6..000000000
--- a/python/infinicore/nn/functional/hardtanh.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import infinicore
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def hardtanh(
-    input: Tensor,
-    min_val: float = -1.0,
-    max_val: float = 1.0,
-    inplace: bool = False,
-    *,
-    out=None,
-) -> Tensor:
-    """Clamp the input tensor to the range [min_val, max_val]."""
-
-    if min_val > max_val:
-        raise ValueError("min_val must be less than or equal to max_val")
-
-    if (
-        infinicore.use_ntops
-        and input.device.type in ("cuda", "musa")
-        and out is None
-        and hasattr(infinicore.ntops.torch, "hardtanh")
-    ):
-        try:
-            return infinicore.ntops.torch.hardtanh(
-                input, min_val=min_val, max_val=max_val, inplace=inplace
-            )
-        except AttributeError:
-            pass
-
-    if inplace:
-        _infinicore.hardtanh_(
-            input._underlying, input._underlying, float(min_val), float(max_val)
-        )
-        return input
-
-    if out is None:
-        return Tensor(
-            _infinicore.hardtanh(input._underlying, float(min_val), float(max_val))
-        )
-
-    _infinicore.hardtanh_(
-        out._underlying, input._underlying, float(min_val), float(max_val)
-    )
-    return out
diff --git a/python/infinicore/ops/all.py b/python/infinicore/ops/all.py
deleted file mode 100644
index 6aacd519d..000000000
--- a/python/infinicore/ops/all.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def all(input, dim=None, keepdim=False, out=None):
-    if out is None:
-        return Tensor(_infinicore.all(input._underlying, dim, keepdim))
-
-    _infinicore.all_(out._underlying, input._underlying, dim, keepdim)
-
-    return out
diff --git a/python/infinicore/ops/cross_entropy.py b/python/infinicore/ops/cross_entropy.py
deleted file mode 100644
index 5b47697b5..000000000
--- a/python/infinicore/ops/cross_entropy.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def cross_entropy(
-    logits,
-    target,
-    weight=None,
-    *,
-    ignore_index=None,
-    reduction="none",
-    out=None,
-):
-    """
-    Token-wise cross entropy without reduction. The output tensor has the same
-    shape as target and uses the logits dtype.
-    """
-    if weight is not None:
-        raise NotImplementedError("class weights are not supported yet.")
-    if ignore_index is not None:
-        raise NotImplementedError("ignore_index is not supported yet.")
-    if reduction not in (None, "none"):
-        raise NotImplementedError("Only reduction='none' is implemented.")
-
-    if out is None:
-        return Tensor(_infinicore.cross_entropy(logits._underlying, target._underlying))
-
-    _infinicore.cross_entropy_(
-        out._underlying,
-        logits._underlying,
-        target._underlying,
-    )
-    return out
diff --git a/python/infinicore/ops/equal.py b/python/infinicore/ops/equal.py
deleted file mode 100644
index 5a656ab30..000000000
--- a/python/infinicore/ops/equal.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def equal(input, other, *, out=None):
-    if out is None:
-        return Tensor(_infinicore.equal(input._underlying, other._underlying))
-
-    _infinicore.equal_(out._underlying, input._underlying, other._underlying)
-    return out
diff --git a/python/infinicore/ops/sum.py b/python/infinicore/ops/sum.py
deleted file mode 100644
index 5f264c24b..000000000
--- a/python/infinicore/ops/sum.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def sum(input, dim=None, keepdim=False, out=None):
-    """
-    Sum the elements of the input tensor along the given dimensions.
-
-    Args:
-        input (Tensor): The input tensor.
-        out (Tensor, optional): The output tensor.
-
-    Returns:
-        Tensor: The output tensor.
-
-    Example:
-        >>> import infinicore
-        >>> input = infinicore.tensor([[1, 2, 3], [4, 5, 6]])
-        >>> output = infinicore.sum(input)
-        >>> print(output)
-        tensor([15])
-    """
-    if out is None:
-        return Tensor(_infinicore.sum(input._underlying, dim, keepdim))
-
-    _infinicore.sum_(out._underlying, input._underlying, dim, keepdim)
-
-    return out
diff --git a/python/infinicore/ops/topk.py b/python/infinicore/ops/topk.py
deleted file mode 100644
index 86eb32ee6..000000000
--- a/python/infinicore/ops/topk.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def topk(input, k, dim, largest=True, sorted=True, out=None):
-    if out is None:
-        values, indices = _infinicore.topk(input._underlying, k, dim, largest, sorted)
-        return Tensor(values), Tensor(indices)
-
-    _infinicore.topk_(out._underlying, input._underlying, k, dim, largest, sorted)
-
-    return out
diff --git a/python/infinicore/ops/var.py b/python/infinicore/ops/var.py
deleted file mode 100644
index 71911ab10..000000000
--- a/python/infinicore/ops/var.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def var(input, dim=None, unbiased=True, keepdim=False, out=None):
-    if out is None:
-        var_tensor = _infinicore.var(input._underlying, dim, unbiased, keepdim)
-        return Tensor(var_tensor)
-    var_output = out
-    _infinicore.var_(var_output._underlying, input._underlying, dim, unbiased, keepdim)
-
-    return out
diff --git a/python/infinicore/ops/var_mean.py b/python/infinicore/ops/var_mean.py
deleted file mode 100644
index 0a9573938..000000000
--- a/python/infinicore/ops/var_mean.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
-
-
-def var_mean(input, dim=None, unbiased=True, keepdim=False, out=None):
-    if out is None:
-        var_tensor, mean_tensor = _infinicore.var_mean(
-            input._underlying, dim, unbiased, keepdim
-        )
-        return Tensor(var_tensor), Tensor(mean_tensor)
-    var_output, mean_output = out
-    _infinicore.var_mean_(
-        var_output._underlying,
-        mean_output._underlying,
-        input._underlying,
-        dim,
-        unbiased,
-        keepdim,
-    )
-
-    return out
diff --git a/python/infinicore/utils.py b/python/infinicore/utils.py
index e0019dc89..094b2230e 100644
--- a/python/infinicore/utils.py
+++ b/python/infinicore/utils.py
@@ -1,13 +1,9 @@
+import ml_dtypes
 import numpy as np
 import torch
 
 import infinicore
 
-try:
-    import ml_dtypes
-except ModuleNotFoundError:
-    ml_dtypes = None
-
 
 def to_torch_dtype(infini_dtype):
     """Convert infinicore data type to PyTorch data type"""
@@ -61,9 +57,7 @@ def numpy_to_infinicore_dtype(numpy_dtype):
         return infinicore.float64
     elif numpy_dtype == np.float16:
         return infinicore.float16
-    elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16:
-        return infinicore.bfloat16
-    elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16:
+    elif numpy_dtype == ml_dtypes.bfloat16:
         return infinicore.bfloat16
     elif numpy_dtype == np.int8:
         return infinicore.int8
@@ -92,13 +86,6 @@ def infinicore_to_numpy_dtype(infini_dtype):
     elif infini_dtype == infinicore.int16:
         return np.int16
     elif infini_dtype == infinicore.bfloat16:
-        if hasattr(np, "bfloat16"):
-            return np.bfloat16
-        if ml_dtypes is None:
-            raise ModuleNotFoundError(
-                "ml_dtypes is required for bfloat16 numpy conversion. "
-                "Please install ml_dtypes."
-            )
         return ml_dtypes.bfloat16
     elif infini_dtype == infinicore.int32:
         return np.int32
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 13b69a013..0bd8bc26d 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -17,12 +17,12 @@ def run_tests(args):
         "causal_softmax.py",
         "clip.py",
         "conv.py",
-        # "dequantize_awq.py",
+        #"dequantize_awq.py",
         "gelu.py",
         "gemm.py",
-        # "layer_norm.py",
+        #"layer_norm.py",
         "logsoftmax.py",
-        # "lp_norm.py",
+        #"lp_norm.py",
         "mul.py",
         "ones.py",
         "random_sample.py",
@@ -31,7 +31,7 @@ def run_tests(args):
         "rms_norm.py",
         "rope.py",
         "sigmoid.py",
-        # "softmax.py",
+        #"softmax.py",
         "softplus.py",
         "sub.py",
         "swiglu.py",
@@ -42,7 +42,6 @@ def run_tests(args):
         # "paged_attention.py",
         # "paged_caching.py",
         # "paged_attention_prefill.py"
-        "cross_entropy.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
diff --git a/src/infinicore/ops/all/al_infiniop.cc b/src/infinicore/ops/all/al_infiniop.cc
deleted file mode 100644
index 094716ba8..000000000
--- a/src/infinicore/ops/all/al_infiniop.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/all.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::all_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopAllDescriptor_t> caches(
-    100, // capacity
-    [](infiniopAllDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyAllDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
-    size_t seed = hash_combine(output, input, dim.size(), keepdim);
-
-    auto device_type = context::getDevice().getType();
-    auto device_index = context::getDevice().getIndex();
-
-    auto &cache = caches.getCache(device_type, device_index);
-
-    auto desc_opt = cache.get(seed);
-    infiniopAllDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateAllDescriptor(
-            context::getInfiniopHandle(output->device()), &desc,
-            output->desc(), input->desc(), dim.data(), dim.size(), keepdim));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetAllWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopAll(
-        desc, workspace->data(), workspace_size,
-        output->data(), input->data(), dim.data(), dim.size(), keepdim, context::getStream()));
-}
-
-static bool registered = []() {
-    All::dispatcher().registerDevice({Device::Type::CPU,
-                                      Device::Type::NVIDIA,
-                                      Device::Type::METAX,
-                                      Device::Type::MOORE,
-                                      Device::Type::ILUVATAR},
-                                     &calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::all_impl::infiniop
diff --git a/src/infinicore/ops/all/all.cc b/src/infinicore/ops/all/all.cc
deleted file mode 100644
index c695623b8..000000000
--- a/src/infinicore/ops/all/all.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "infinicore/ops/all.hpp"
-
-#include "../../utils.hpp"
-#include <iostream>
-#include <stdexcept>
-#include <vector>
-namespace infinicore::op {
-
-common::OpDispatcher<All::schema> &All::dispatcher() {
-    static common::OpDispatcher<All::schema> dispatcher_;
-    return dispatcher_;
-};
-void All::execute(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
-    infinicore::context::setDevice(input->device());
-    auto device_type = context::getDevice().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error("No All implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(output, input, dim, keepdim);
-}
-
-Tensor all(Tensor input, std::vector<size_t> dim, bool keepdim) {
-    auto in_shape = input->shape();
-    std::vector<size_t> out_shape;
-    if (dim.empty()) {
-        for (size_t i = 0; i < in_shape.size(); i++) {
-            dim.push_back(i);
-        }
-    }
-    std::sort(dim.begin(), dim.end());
-    if (dim.size() == in_shape.size() && !keepdim) {
-        out_shape = {};
-    } else {
-        if (keepdim) {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    out_shape.push_back(1);
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        } else {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        }
-    }
-    auto output = Tensor::empty(out_shape, DataType::BOOL, input->device());
-    all_(output, input, dim, keepdim);
-    return output;
-}
-
-void all_(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
-    All::execute(output, input, dim, keepdim);
-}
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc
deleted file mode 100644
index 907b25b00..000000000
--- a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-#include "infinicore/ops/avg_pool1d.hpp"
-
-#include "../../utils.hpp"
-
-#include <stdexcept>
-
-namespace infinicore::op {
-
-common::OpDispatcher<AvgPool1d::schema> &AvgPool1d::dispatcher() {
-    static common::OpDispatcher<AvgPool1d::schema> dispatcher_;
-    return dispatcher_;
-}
-
-void AvgPool1d::execute(
-    Tensor output,
-    Tensor input,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
-    if (stride == 0) {
-        stride = kernel_size;
-    }
-
-    infinicore::context::setDevice(output->device());
-    auto device_type = output->device().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error(
-            "No AvgPool1d implementation for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(output, input, kernel_size, stride, padding);
-}
-
-Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
-    if (stride == 0) {
-        stride = kernel_size;
-    }
-
-    const auto &shape = input->shape();
-    if (shape.size() != 3) {
-        throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
-    }
-
-    const size_t n = shape[0];
-    const size_t c = shape[1];
-    const size_t l_in = shape[2];
-
-    if (l_in + 2 * padding < kernel_size) {
-        throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
-    }
-
-    const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1;
-
-    Shape out_shape = {n, c, out_width};
-    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
-    avg_pool1d_(output, input, kernel_size, stride, padding);
-    return output;
-}
-
-void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
-    AvgPool1d::execute(output, input, kernel_size, stride, padding);
-}
-
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc
deleted file mode 100644
index df7ebda8d..000000000
--- a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/avg_pool1d.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::avg_pool1d_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
-    100,
-    [](infiniopAvgPool1dDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(
-    Tensor output,
-    Tensor input,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-    if (stride == 0) {
-        stride = kernel_size;
-    }
-
-    size_t seed = hash_combine(output, input, kernel_size, stride, padding);
-
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-
-    auto desc_opt = cache.get(seed);
-    infiniopAvgPool1dDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
-            context::getInfiniopHandle(device),
-            &desc,
-            output->desc(),
-            input->desc(),
-            kernel_size,
-            stride,
-            padding));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
-        desc,
-        workspace->data(),
-        workspace_size,
-        output->data(),
-        input->data(),
-        context::getStream()));
-}
-
-static bool registered = []() {
-    AvgPool1d::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::avg_pool1d_impl::infiniop
diff --git a/src/infinicore/ops/cross_entropy/cross_entropy.cc b/src/infinicore/ops/cross_entropy/cross_entropy.cc
deleted file mode 100644
index 84aebc1b1..000000000
--- a/src/infinicore/ops/cross_entropy/cross_entropy.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-#include "infinicore/ops/cross_entropy.hpp"
-
-#include "../../utils.hpp"
-
-#include <stdexcept>
-
-namespace infinicore::op {
-
-common::OpDispatcher<CrossEntropy::schema> &CrossEntropy::dispatcher() {
-    static common::OpDispatcher<CrossEntropy::schema> dispatcher_;
-    return dispatcher_;
-};
-
-void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) {
-
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target);
-
-    infinicore::context::setDevice(output->device());
-    auto device_type = output->device().getType();
-
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(output, input, target);
-}
-
-Tensor cross_entropy(Tensor input, Tensor target) {
-
-    Shape shape = target->shape();
-
-    auto output = Tensor::empty(shape, input->dtype(), input->device());
-
-    cross_entropy_(output, input, target);
-    return output;
-}
-
-void cross_entropy_(Tensor output, Tensor input, Tensor target) {
-    CrossEntropy::execute(output, input, target);
-}
-
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc
deleted file mode 100644
index 5fa7963d7..000000000
--- a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/cross_entropy.hpp"
-
-#include <infiniop.h>
-
-namespace infinicore::op::cross_entropy_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopCrossEntropyDescriptor_t> caches(
-    100,
-    [](infiniopCrossEntropyDescriptor_t &desc) {
-        if (desc != nullptr) {
-
-            INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor output, Tensor input, Tensor target) {
-
-    size_t seed = hash_combine(output, input, target);
-
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-
-    auto desc_opt = cache.get(seed);
-    infiniopCrossEntropyDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-
-        INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor(
-            context::getInfiniopHandle(device),
-            &desc,
-            output->desc(),
-            input->desc(),
-            target->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size));
-
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopCrossEntropy(
-        desc,
-        workspace->data(),
-        workspace_size,
-        output->data(),
-        input->data(),
-        target->data(),
-        context::getStream()));
-}
-
-static bool registered = []() {
-    CrossEntropy::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::cross_entropy_impl::infiniop
diff --git a/src/infinicore/ops/equal/equal.cc b/src/infinicore/ops/equal/equal.cc
deleted file mode 100644
index b6acc4d25..000000000
--- a/src/infinicore/ops/equal/equal.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "infinicore/ops/equal.hpp"
-
-#include "../../utils.hpp"
-
-namespace infinicore::op {
-
-common::OpDispatcher<Equal::schema> &Equal::dispatcher() {
-    static common::OpDispatcher<Equal::schema> dispatcher_;
-    return dispatcher_;
-};
-
-void Equal::execute(Tensor out, Tensor a, Tensor b) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b);
-    infinicore::context::setDevice(out->device());
-    dispatcher().lookup(out->device().getType())(out, a, b);
-}
-
-Tensor equal(Tensor a, Tensor b) {
-    auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device());
-    equal_(out, a, b);
-    return out;
-}
-
-void equal_(Tensor out, Tensor a, Tensor b) {
-    if (out->dtype() != DataType::BOOL) {
-        throw std::runtime_error("Equal expects bool output tensor.");
-    }
-    Equal::execute(out, a, b);
-}
-
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/equal/equal_infiniop.cc b/src/infinicore/ops/equal/equal_infiniop.cc
deleted file mode 100644
index 1b4e4cffa..000000000
--- a/src/infinicore/ops/equal/equal_infiniop.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/equal.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::equal_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopEqualDescriptor_t> caches(
-    100,
-    [](infiniopEqualDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor out, Tensor a, Tensor b) {
-    size_t seed = hash_combine(out, a, b);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-
-    infiniopEqualDescriptor_t desc = nullptr;
-    if (auto cached = cache.get(seed)) {
-        desc = *cached;
-    } else {
-        INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            out->desc(), a->desc(), b->desc()));
-        cache.put(seed, desc);
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace;
-    void *workspace_ptr = nullptr;
-    if (workspace_size != 0) {
-        workspace = context::allocateMemory(workspace_size);
-        workspace_ptr = workspace->data();
-    }
-
-    INFINICORE_CHECK_ERROR(infiniopEqual(
-        desc,
-        workspace_ptr,
-        workspace_size,
-        out->data(),
-        a->data(),
-        b->data(),
-        context::getStream()));
-}
-
-static bool registered = []() {
-    Equal::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::equal_impl::infiniop
diff --git a/src/infinicore/ops/hardswish/hardswish.cc b/src/infinicore/ops/hardswish/hardswish.cc
deleted file mode 100644
index ec8db75ff..000000000
--- a/src/infinicore/ops/hardswish/hardswish.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-#include "infinicore/ops/hardswish.hpp"
-
-#include "../../utils.hpp"
-
-#include <stdexcept>
-
-namespace infinicore::op {
-
-common::OpDispatcher<Hardswish::schema> &Hardswish::dispatcher() {
-    static common::OpDispatcher<Hardswish::schema> dispatcher_;
-    return dispatcher_;
-}
-
-void Hardswish::execute(Tensor output, Tensor input) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
-    infinicore::context::setDevice(output->device());
-    auto device_type = output->device().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error(
-            "No Hardswish implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(output, input);
-}
-
-Tensor hardswish(Tensor input) {
-    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
-    hardswish_(output, input);
-    return output;
-}
-
-void hardswish_(Tensor output, Tensor input) {
-    Hardswish::execute(output, input);
-}
-
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/hardswish/hardswish_infiniop.cc b/src/infinicore/ops/hardswish/hardswish_infiniop.cc
deleted file mode 100644
index 44d4054e8..000000000
--- a/src/infinicore/ops/hardswish/hardswish_infiniop.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/hardswish.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::hardswish_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopHardSwishDescriptor_t> caches(
-    100,
-    [](infiniopHardSwishDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor output, Tensor input) {
-    size_t seed = hash_combine(output, input);
-
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-
-    auto desc_opt = cache.get(seed);
-    infiniopHardSwishDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor(
-            context::getInfiniopHandle(device),
-            &desc,
-            output->desc(),
-            input->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace;
-    void *workspace_ptr = nullptr;
-    if (workspace_size != 0) {
-        workspace = context::allocateMemory(workspace_size);
-        workspace_ptr = workspace->data();
-    }
-
-    INFINICORE_CHECK_ERROR(infiniopHardSwish(
-        desc,
-        workspace_ptr,
-        workspace_size,
-        output->data(),
-        input->data(),
-        context::getStream()));
-}
-
-static bool registered = []() {
-    Hardswish::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::hardswish_impl::infiniop
diff --git a/src/infinicore/ops/hardtanh/hardtanh.cc b/src/infinicore/ops/hardtanh/hardtanh.cc
deleted file mode 100644
index 5a4df2142..000000000
--- a/src/infinicore/ops/hardtanh/hardtanh.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-#include "infinicore/ops/hardtanh.hpp"
-
-#include "../../utils.hpp"
-
-#include <stdexcept>
-
-namespace infinicore::op {
-
-common::OpDispatcher<HardTanh::schema> &HardTanh::dispatcher() {
-    static common::OpDispatcher<HardTanh::schema> dispatcher_;
-    return dispatcher_;
-}
-
-void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
-    infinicore::context::setDevice(output->device());
-
-    auto device_type = output->device().getType();
-    auto func = dispatcher().lookup(device_type);
-    if (func == nullptr) {
-        throw std::runtime_error(
-            "No HardTanh implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(output, input, min_val, max_val);
-}
-
-Tensor hardtanh(Tensor input, float min_val, float max_val) {
-    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
-    hardtanh_(output, input, min_val, max_val);
-    return output;
-}
-
-void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) {
-    HardTanh::execute(output, input, min_val, max_val);
-}
-
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc
deleted file mode 100644
index d8af439d8..000000000
--- a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/hardtanh.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::hardtanh_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopHardTanhDescriptor_t> caches(
-    100,
-    [](infiniopHardTanhDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor output, Tensor input, float min_val, float max_val) {
-    size_t seed = hash_combine(output, input, min_val, max_val);
-
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-
-    auto desc_opt = cache.get(seed);
-    infiniopHardTanhDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor(
-            context::getInfiniopHandle(device),
-            &desc,
-            output->desc(),
-            input->desc(),
-            min_val,
-            max_val));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace;
-    void *workspace_ptr = nullptr;
-    if (workspace_size != 0) {
-        workspace = context::allocateMemory(workspace_size);
-        workspace_ptr = workspace->data();
-    }
-
-    INFINICORE_CHECK_ERROR(infiniopHardTanh(
-        desc,
-        workspace_ptr,
-        workspace_size,
-        output->data(),
-        input->data(),
-        context::getStream()));
-}
-
-static bool registered = []() {
-    HardTanh::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::hardtanh_impl::infiniop
diff --git a/src/infinicore/ops/sum/sum.cc b/src/infinicore/ops/sum/sum.cc
deleted file mode 100644
index 5fcecda5e..000000000
--- a/src/infinicore/ops/sum/sum.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "infinicore/ops/sum.hpp"
-
-#include "../../utils.hpp"
-#include <stdexcept>
-#include <vector>
-
-namespace infinicore::op {
-
-common::OpDispatcher<Sum::schema> &Sum::dispatcher() {
-    static common::OpDispatcher<Sum::schema> dispatcher_;
-    return dispatcher_;
-};
-void Sum::execute(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
-    infinicore::context::setDevice(input->device());
-    auto device_type = context::getDevice().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error("No Sum implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(output, input, dim, keepdim);
-}
-
-Tensor sum(Tensor input, std::vector<size_t> dim, bool keepdim) {
-    auto in_shape = input->shape();
-    std::vector<size_t> out_shape;
-    if (dim.empty()) {
-        for (size_t i = 0; i < in_shape.size(); i++) {
-            dim.push_back(i);
-        }
-    }
-    std::sort(dim.begin(), dim.end());
-    if (dim.size() == in_shape.size() && !keepdim) {
-        out_shape = {};
-    } else {
-        if (keepdim) {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    out_shape.push_back(1);
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        } else {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        }
-    }
-    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
-    sum_(output, input, dim, keepdim);
-    return output;
-}
-
-void sum_(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
-    Sum::execute(output, input, dim, keepdim);
-}
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/sum/sum_infiniop.cc b/src/infinicore/ops/sum/sum_infiniop.cc
deleted file mode 100644
index 9a696a9b5..000000000
--- a/src/infinicore/ops/sum/sum_infiniop.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/sum.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::sum_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopSumDescriptor_t> caches(
-    100, // capacity
-    [](infiniopSumDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroySumDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
-    size_t seed = hash_combine(output, input, dim.size(), keepdim);
-
-    auto device_type = context::getDevice().getType();
-    auto device_index = context::getDevice().getIndex();
-
-    auto &cache = caches.getCache(device_type, device_index);
-
-    auto desc_opt = cache.get(seed);
-    infiniopSumDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateSumDescriptor(
-            context::getInfiniopHandle(output->device()), &desc,
-            output->desc(), input->desc(), dim.data(), dim.size(), keepdim));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetSumWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopSum(
-        desc, workspace->data(), workspace_size,
-        output->data(), input->data(), dim.data(), dim.size(), keepdim, context::getStream()));
-}
-
-static bool registered = []() {
-    Sum::dispatcher().registerDevice({Device::Type::CPU,
-                                      Device::Type::NVIDIA,
-                                      Device::Type::METAX,
-                                      Device::Type::MOORE,
-                                      Device::Type::ILUVATAR},
-                                     &calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::sum_impl::infiniop
diff --git a/src/infinicore/ops/topk/topk.cc b/src/infinicore/ops/topk/topk.cc
deleted file mode 100644
index a5b52fccf..000000000
--- a/src/infinicore/ops/topk/topk.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "infinicore/ops/topk.hpp"
-
-#include "../../utils.hpp"
-#include <stdexcept>
-#include <vector>
-
-namespace infinicore::op {
-
-common::OpDispatcher<TopK::schema> &TopK::dispatcher() {
-    static common::OpDispatcher<TopK::schema> dispatcher_;
-    return dispatcher_;
-};
-void TopK::execute(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(values_output, input);
-    infinicore::context::setDevice(input->device());
-    auto device_type = context::getDevice().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error("No Topk implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(values_output, indices_output, input, k, dim, largest, sorted);
-}
-
-std::pair<Tensor, Tensor> topk(Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
-    auto in_shape = input->shape();
-    std::vector<size_t> out_shape = in_shape;
-    out_shape[dim] = k;
-
-    auto values_output = Tensor::empty(out_shape, input->dtype(), input->device());
-    auto indices_output = Tensor::empty(out_shape, DataType::I32, input->device());
-    topk_(values_output, indices_output, input, k, dim, largest, sorted);
-    return {values_output, indices_output};
-}
-
-void topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
-    TopK::execute(values_output, indices_output, input, k, dim, largest, sorted);
-}
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/topk/topk_infiniop.cc b/src/infinicore/ops/topk/topk_infiniop.cc
deleted file mode 100644
index 5cc8d4d98..000000000
--- a/src/infinicore/ops/topk/topk_infiniop.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/topk.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::topk_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopTopKDescriptor_t> caches(
-    100, // capacity
-    [](infiniopTopKDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyTopKDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
-    size_t seed = hash_combine(values_output, indices_output, input, k, dim, largest, sorted);
-
-    auto device_type = context::getDevice().getType();
-    auto device_index = context::getDevice().getIndex();
-
-    auto &cache = caches.getCache(device_type, device_index);
-
-    auto desc_opt = cache.get(seed);
-    infiniopTopKDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateTopKDescriptor(
-            context::getInfiniopHandle(values_output->device()), &desc,
-            values_output->desc(), indices_output->desc(), input->desc(), k, dim, largest, sorted));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetTopKWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopTopK(
-        desc, workspace->data(), workspace_size,
-        values_output->data(), indices_output->data(), input->data(), k, dim, largest, sorted, context::getStream()));
-}
-
-static bool registered = []() {
-    TopK::dispatcher().registerDevice({Device::Type::CPU,
-                                       Device::Type::NVIDIA,
-                                       Device::Type::METAX,
-                                       Device::Type::MOORE,
-                                       Device::Type::ILUVATAR},
-                                      &calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::topk_impl::infiniop
diff --git a/src/infinicore/ops/var/var.cc b/src/infinicore/ops/var/var.cc
deleted file mode 100644
index bc0849e64..000000000
--- a/src/infinicore/ops/var/var.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-#include "infinicore/ops/var.hpp"
-
-#include "../../utils.hpp"
-#include <stdexcept>
-#include <vector>
-
-namespace infinicore::op {
-
-common::OpDispatcher<Var::schema> &Var::dispatcher() {
-    static common::OpDispatcher<Var::schema> dispatcher_;
-    return dispatcher_;
-};
-
-void Var::execute(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, input);
-    infinicore::context::setDevice(input->device());
-    auto device_type = context::getDevice().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error("No Var implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(var_output, input, dim, unbiased, keepdim);
-}
-
-Tensor var(Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    auto in_shape = input->shape();
-    std::vector<size_t> out_shape;
-    if (dim.empty()) {
-        for (size_t i = 0; i < in_shape.size(); i++) {
-            dim.push_back(i);
-        }
-    }
-    std::sort(dim.begin(), dim.end());
-    if (dim.size() == in_shape.size() && !keepdim) {
-        out_shape = {};
-    } else {
-        if (keepdim) {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    out_shape.push_back(1);
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        } else {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        }
-    }
-    auto var_output = Tensor::empty(out_shape, input->dtype(), input->device());
-    var_(var_output, input, dim, unbiased, keepdim);
-    return var_output;
-}
-
-void var_(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    Var::execute(var_output, input, dim, unbiased, keepdim);
-}
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/var/var_infiniop.cc b/src/infinicore/ops/var/var_infiniop.cc
deleted file mode 100644
index c74eb2628..000000000
--- a/src/infinicore/ops/var/var_infiniop.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/var.hpp"
-#include <infiniop.h>
-
-namespace infinicore::op::var_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopVarDescriptor_t> caches(
-    100, // capacity
-    [](infiniopVarDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyVarDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    size_t seed = hash_combine(var_output, input, dim.size(), unbiased, keepdim);
-
-    auto device_type = context::getDevice().getType();
-    auto device_index = context::getDevice().getIndex();
-
-    auto &cache = caches.getCache(device_type, device_index);
-
-    auto desc_opt = cache.get(seed);
-    infiniopVarDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateVarDescriptor(
-            context::getInfiniopHandle(var_output->device()), &desc,
-            var_output->desc(), input->desc(), dim.data(), dim.size(), unbiased, keepdim));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetVarWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopVar(
-        desc, workspace->data(), workspace_size,
-        var_output->data(), input->data(), dim.data(), dim.size(), unbiased, keepdim, context::getStream()));
-}
-
-static bool registered = []() {
-    Var::dispatcher().registerDevice({Device::Type::CPU,
-                                      Device::Type::NVIDIA,
-                                      Device::Type::METAX,
-                                      Device::Type::MOORE,
-                                      Device::Type::ILUVATAR},
-                                     &calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::var_impl::infiniop
diff --git a/src/infinicore/ops/var_mean/var_mean.cc b/src/infinicore/ops/var_mean/var_mean.cc
deleted file mode 100644
index 817be7bcf..000000000
--- a/src/infinicore/ops/var_mean/var_mean.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "infinicore/ops/var_mean.hpp"
-
-#include "../../utils.hpp"
-#include <stdexcept>
-#include <vector>
-
-namespace infinicore::op {
-
-common::OpDispatcher<Var_Mean::schema> &Var_Mean::dispatcher() {
-    static common::OpDispatcher<Var_Mean::schema> dispatcher_;
-    return dispatcher_;
-};
-
-void Var_Mean::execute(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, mean_output, input);
-    infinicore::context::setDevice(input->device());
-    auto device_type = context::getDevice().getType();
-    auto func = dispatcher().lookup(device_type);
-
-    if (func == nullptr) {
-        throw std::runtime_error("No Var_Mean implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
-    }
-
-    func(var_output, mean_output, input, dim, unbiased, keepdim);
-}
-
-std::pair<Tensor, Tensor> var_mean(Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    auto in_shape = input->shape();
-    std::vector<size_t> out_shape;
-    if (dim.empty()) {
-        for (size_t i = 0; i < in_shape.size(); i++) {
-            dim.push_back(i);
-        }
-    }
-    std::sort(dim.begin(), dim.end());
-    if (dim.size() == in_shape.size() && !keepdim) {
-        out_shape = {};
-    } else {
-        if (keepdim) {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    out_shape.push_back(1);
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        } else {
-            size_t j = 0;
-            for (size_t i = 0; i < in_shape.size(); i++) {
-                if (j < dim.size() && dim[j] == i) {
-                    j++;
-                } else {
-                    out_shape.push_back(in_shape[i]);
-                }
-            }
-        }
-    }
-    auto var_output = Tensor::empty(out_shape, input->dtype(), input->device());
-    auto mean_output = Tensor::empty(out_shape, input->dtype(), input->device());
-    var_mean_(var_output, mean_output, input, dim, unbiased, keepdim);
-    return {var_output, mean_output};
-}
-
-void var_mean_(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    Var_Mean::execute(var_output, mean_output, input, dim, unbiased, keepdim);
-}
-} // namespace infinicore::op
diff --git a/src/infinicore/ops/var_mean/var_mean_infiniop.cc b/src/infinicore/ops/var_mean/var_mean_infiniop.cc
deleted file mode 100644
index 89332d074..000000000
--- a/src/infinicore/ops/var_mean/var_mean_infiniop.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include "infinicore/ops/var_mean.hpp"
-#include <infiniop.h>
-
-// todo 实现需要修改calculate函数
-
-namespace infinicore::op::var_mean_impl::infiniop {
-
-thread_local common::OpCache<size_t, infiniopVarMeanDescriptor_t> caches(
-    100, // capacity
-    [](infiniopVarMeanDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyVarMeanDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
-    size_t seed = hash_combine(var_output, mean_output, input, dim.size(), unbiased, keepdim);
-
-    auto device_type = context::getDevice().getType();
-    auto device_index = context::getDevice().getIndex();
-
-    auto &cache = caches.getCache(device_type, device_index);
-
-    auto desc_opt = cache.get(seed);
-    infiniopVarMeanDescriptor_t desc = nullptr;
-
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateVarMeanDescriptor(
-            context::getInfiniopHandle(var_output->device()), &desc,
-            var_output->desc(), mean_output->desc(), input->desc(), dim.data(), dim.size(), unbiased, keepdim));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetVarMeanWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-
-    INFINICORE_CHECK_ERROR(infiniopVarMean(
-        desc, workspace->data(), workspace_size,
-        var_output->data(), mean_output->data(), input->data(), dim.data(), dim.size(), unbiased, keepdim, context::getStream()));
-}
-
-static bool registered = []() {
-    Var_Mean::dispatcher().registerDevice({Device::Type::CPU,
-                                           Device::Type::NVIDIA,
-                                           Device::Type::METAX,
-                                           Device::Type::MOORE,
-                                           Device::Type::ILUVATAR},
-                                          &calculate, false);
-    return true;
-}();
-
-} // namespace infinicore::op::var_mean_impl::infiniop
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 1c841961a..2eecb843f 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -4,16 +4,10 @@
 
 #include "ops/add.hpp"
 #include "ops/add_rms_norm.hpp"
-#include "ops/all.hpp"
 #include "ops/attention.hpp"
-#include "ops/avg_pool1d.hpp"
 #include "ops/causal_softmax.hpp"
-#include "ops/cross_entropy.hpp"
 #include "ops/embedding.hpp"
-#include "ops/equal.hpp"
 #include "ops/flash_attention.hpp"
-#include "ops/hardswish.hpp"
-#include "ops/hardtanh.hpp"
 #include "ops/kv_caching.hpp"
 #include "ops/linear.hpp"
 #include "ops/linear_w8a8i8.hpp"
@@ -30,11 +24,7 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/silu_and_mul.hpp"
-#include "ops/sum.hpp"
 #include "ops/swiglu.hpp"
-#include "ops/topk.hpp"
-#include "ops/var.hpp"
-#include "ops/var_mean.hpp"
 
 namespace py = pybind11;
 
@@ -52,28 +42,18 @@ inline void bind(py::module &m) {
     bind_mul(m);
     bind_mha_kvcache(m);
     bind_mha_varlen(m);
-    bind_hardswish(m);
-    bind_hardtanh(m);
     bind_paged_attention(m);
     bind_paged_attention_prefill(m);
     bind_paged_caching(m);
     bind_random_sample(m);
-    bind_cross_entropy(m);
     bind_rearrange(m);
     bind_rms_norm(m);
-    bind_avg_pool1d(m);
     bind_silu(m);
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
     bind_linear_w8a8i8(m);
     bind_silu_and_mul(m);
-    bind_sum(m);
-    bind_var_mean(m);
-    bind_var(m);
-    bind_topk(m);
-    bind_all(m);
-    bind_equal(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/all.hpp b/src/infinicore/pybind11/ops/all.hpp
deleted file mode 100644
index 4ccac685b..000000000
--- a/src/infinicore/pybind11/ops/all.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/all.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-Tensor py_all(Tensor input, py::object dim, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        return op::all(input, dim_vec, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        return op::all(input, dim.cast<std::vector<size_t>>(), keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        return op::all(input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a tuple or an integer");
-    }
-}
-
-void py_all_(Tensor output, Tensor input, py::object dim, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        op::all_(output, input, dim_vec, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        op::all_(output, input, dim.cast<std::vector<size_t>>(), keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        op::all_(output, input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a tuple or an integer");
-    }
-}
-
-inline void bind_all(py::module &m) {
-    m.def("all",
-          &py_all,
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("keepdim"),
-          R"doc(All of input tensor along the given dimensions.)doc");
-
-    m.def("all_",
-          &py_all_,
-          py::arg("output"),
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("keepdim"),
-          R"doc(In-place tensor all.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/avg_pool1d.hpp b/src/infinicore/pybind11/ops/avg_pool1d.hpp
deleted file mode 100644
index 32394552a..000000000
--- a/src/infinicore/pybind11/ops/avg_pool1d.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-#pragma once
-
-#include <optional>
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/avg_pool1d.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-inline void bind_avg_pool1d(py::module &m) {
-    m.def(
-        "avg_pool1d",
-        [](::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
-            return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding);
-        },
-        py::arg("input"),
-        py::arg("kernel_size"),
-        py::arg("stride") = py::none(),
-        py::arg("padding") = 0,
-        R"doc(AvgPool1d out-of-place.)doc");
-
-    m.def(
-        "avg_pool1d_",
-        [](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
-            op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding);
-        },
-        py::arg("output"),
-        py::arg("input"),
-        py::arg("kernel_size"),
-        py::arg("stride") = py::none(),
-        py::arg("padding") = 0,
-        R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/cross_entropy.hpp b/src/infinicore/pybind11/ops/cross_entropy.hpp
deleted file mode 100644
index 8105642a6..000000000
--- a/src/infinicore/pybind11/ops/cross_entropy.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/cross_entropy.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-inline void bind_cross_entropy(py::module &m) {
-    m.def("cross_entropy",
-          &op::cross_entropy,
-          py::arg("logits"),
-          py::arg("target"),
-          R"doc(Token-wise cross entropy loss without reduction.)doc");
-
-    m.def("cross_entropy_",
-          &op::cross_entropy_,
-          py::arg("loss"),
-          py::arg("logits"),
-          py::arg("target"),
-          R"doc(Write cross entropy loss into a provided tensor.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/equal.hpp b/src/infinicore/pybind11/ops/equal.hpp
deleted file mode 100644
index d14a6b61d..000000000
--- a/src/infinicore/pybind11/ops/equal.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/equal.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-inline void bind_equal(py::module &m) {
-    m.def("equal",
-          &op::equal,
-          py::arg("a"),
-          py::arg("b"),
-          R"doc(Elementwise equality returning a bool tensor.)doc");
-
-    m.def("equal_",
-          &op::equal_,
-          py::arg("out"),
-          py::arg("a"),
-          py::arg("b"),
-          R"doc(In-place elementwise equality writing into `out`.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/hardswish.hpp b/src/infinicore/pybind11/ops/hardswish.hpp
deleted file mode 100644
index daaccec62..000000000
--- a/src/infinicore/pybind11/ops/hardswish.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/hardswish.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-inline void bind_hardswish(py::module &m) {
-    m.def("hardswish",
-          &op::hardswish,
-          py::arg("input"),
-          R"doc(Out-of-place Hardswish activation.)doc");
-
-    m.def("hardswish_",
-          &op::hardswish_,
-          py::arg("output"),
-          py::arg("input"),
-          R"doc(In-place Hardswish activation.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/hardtanh.hpp b/src/infinicore/pybind11/ops/hardtanh.hpp
deleted file mode 100644
index ff9abb872..000000000
--- a/src/infinicore/pybind11/ops/hardtanh.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/hardtanh.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-inline void bind_hardtanh(py::module &m) {
-    m.def("hardtanh",
-          &op::hardtanh,
-          py::arg("input"),
-          py::arg("min_val") = -1.0f,
-          py::arg("max_val") = 1.0f,
-          R"doc(Apply the HardTanh activation.)doc");
-
-    m.def("hardtanh_",
-          &op::hardtanh_,
-          py::arg("output"),
-          py::arg("input"),
-          py::arg("min_val") = -1.0f,
-          py::arg("max_val") = 1.0f,
-          R"doc(In-place HardTanh activation.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/sum.hpp b/src/infinicore/pybind11/ops/sum.hpp
deleted file mode 100644
index 50fef7539..000000000
--- a/src/infinicore/pybind11/ops/sum.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/sum.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-Tensor py_sum(Tensor input, py::object dim, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        return op::sum(input, dim_vec, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        return op::sum(input, dim.cast<std::vector<size_t>>(), keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        return op::sum(input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a tuple or an integer");
-    }
-}
-
-void py_sum_(Tensor output, Tensor input, py::object dim, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        op::sum_(output, input, dim_vec, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        op::sum_(output, input, dim.cast<std::vector<size_t>>(), keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        op::sum_(output, input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a tuple or an integer");
-    }
-}
-
-inline void bind_sum(py::module &m) {
-    m.def("sum",
-          &py_sum,
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("keepdim"),
-          R"doc(Sum of input tensor along the given dimensions.)doc");
-
-    m.def("sum_",
-          &py_sum_,
-          py::arg("output"),
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("keepdim"),
-          R"doc(In-place tensor sum.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/topk.hpp b/src/infinicore/pybind11/ops/topk.hpp
deleted file mode 100644
index 1341f39fa..000000000
--- a/src/infinicore/pybind11/ops/topk.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h> // 添加这行
-
-#include "infinicore/ops/topk.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-std::pair<Tensor, Tensor> py_topk(Tensor input, size_t k, int dim, bool largest, bool sorted) {
-    if (dim == -1) {
-        return op::topk(input, k, input->ndim() - 1, largest, sorted);
-    } else if (dim >= 0) {
-        return op::topk(input, k, static_cast<size_t>(dim), largest, sorted);
-    } else {
-        throw std::invalid_argument("invalid argument: dim");
-    }
-}
-
-void py_topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, int dim, bool largest, bool sorted) {
-    if (dim == -1) {
-        op::topk_(values_output, indices_output, input, k, input->ndim() - 1, largest, sorted);
-    } else if (dim >= 0) {
-        op::topk_(values_output, indices_output, input, k, static_cast<size_t>(dim), largest, sorted);
-    } else {
-        throw std::invalid_argument("invalid argument: dim");
-    }
-}
-
-inline void bind_topk(py::module &m) {
-    m.def("topk",
-          &py_topk,
-          py::arg("input"),
-          py::arg("k"),
-          py::arg("dim"),
-          py::arg("largest"),
-          py::arg("sorted"),
-          R"doc(topk of input tensor along the given dimensions.)doc");
-
-    m.def("topk_",
-          &py_topk_,
-          py::arg("values_output"),
-          py::arg("indices_output"),
-          py::arg("input"),
-          py::arg("k"),
-          py::arg("dim"),
-          py::arg("largest"),
-          py::arg("sorted"),
-          R"doc(In-place tensor topk_.)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/var.hpp b/src/infinicore/pybind11/ops/var.hpp
deleted file mode 100644
index 9668fef5f..000000000
--- a/src/infinicore/pybind11/ops/var.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/var.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-Tensor py_var(Tensor input, py::object dim, bool unbiased, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        return op::var(input, dim_vec, unbiased, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        return op::var(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        return op::var(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a tuple or an integer");
-    }
-}
-
-void py_var_(Tensor var_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        op::var_(var_output, input, dim_vec, unbiased, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        op::var_(var_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        op::var_(var_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a list/tuple or an integer");
-    }
-}
-
-inline void bind_var(py::module &m) {
-    m.def("var",
-          &py_var,
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("unbiased"),
-          py::arg("keepdim"),
-          R"doc(Var of input tensor along the given dimensions.)doc");
-
-    m.def("var_",
-          &py_var_,
-          py::arg("var_output"),
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("unbiased"),
-          py::arg("keepdim"),
-          R"doc(In-place tensor Var .)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/var_mean.hpp b/src/infinicore/pybind11/ops/var_mean.hpp
deleted file mode 100644
index 986ec49f7..000000000
--- a/src/infinicore/pybind11/ops/var_mean.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "infinicore/ops/var_mean.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::ops {
-
-std::pair<Tensor, Tensor> py_var_mean(Tensor input, py::object dim, bool unbiased, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        return op::var_mean(input, dim_vec, unbiased, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        return op::var_mean(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        return op::var_mean(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a tuple or an integer");
-    }
-}
-
-void py_var_mean_(Tensor var_output, Tensor mean_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
-    if (dim.is_none()) {
-        std::vector<size_t> dim_vec;
-        for (int i = 0; i < input->shape().size(); i++) {
-            dim_vec.push_back(i);
-        }
-        op::var_mean_(var_output, mean_output, input, dim_vec, unbiased, keepdim);
-    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
-        op::var_mean_(var_output, mean_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
-    } else if (py::isinstance<py::int_>(dim)) {
-        op::var_mean_(var_output, mean_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
-    } else {
-        throw std::invalid_argument("dim must be a list/tuple or an integer");
-    }
-}
-
-inline void bind_var_mean(py::module &m) {
-    m.def("var_mean",
-          &py_var_mean,
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("unbiased"),
-          py::arg("keepdim"),
-          R"doc(Var & Mean of input tensor along the given dimensions.)doc");
-
-    m.def("var_mean_",
-          &py_var_mean_,
-          py::arg("var_output"),
-          py::arg("mean_output"),
-          py::arg("input"),
-          py::arg("dim"),
-          py::arg("unbiased"),
-          py::arg("keepdim"),
-          R"doc(In-place tensor Var & Mean .)doc");
-}
-
-} // namespace infinicore::ops
diff --git a/src/infiniop/ops/all/all_desc.h b/src/infiniop/ops/all/all_desc.h
deleted file mode 100644
index 9b7a1e0d6..000000000
--- a/src/infiniop/ops/all/all_desc.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef INFINIOP_ALL_DESCRIPTOR_H_
-#define INFINIOP_ALL_DESCRIPTOR_H_
-#include "../../../utils.h"
-#include "../../operator.h"
-#include "../../tensor.h"
-
-#include "info.h"
-
-#define DESCRIPTOR(NAMESPACE)                                    \
-                                                                 \
-    namespace op::all::NAMESPACE {                               \
-    class Descriptor final : public InfiniopDescriptor {         \
-        struct Opaque;                                           \
-        Opaque *_opaque;                                         \
-        AllInfo _info;                                           \
-        size_t _workspace_size;                                  \
-                                                                 \
-        Descriptor(                                              \
-            Opaque *opaque,                                      \
-            AllInfo info,                                        \
-            size_t workspace_size,                               \
-            infiniDevice_t device_type,                          \
-            int device_id)                                       \
-            : InfiniopDescriptor{device_type, device_id},        \
-              _opaque(opaque),                                   \
-              _info(info),                                       \
-              _workspace_size(workspace_size) {}                 \
-                                                                 \
-    public:                                                      \
-        ~Descriptor();                                           \
-        size_t workspaceSize() const { return _workspace_size; } \
-                                                                 \
-        static infiniStatus_t create(                            \
-            infiniopHandle_t handle,                             \
-            Descriptor **desc_ptr,                               \
-            infiniopTensorDescriptor_t output_desc,              \
-            infiniopTensorDescriptor_t input_desc,               \
-            size_t *dim,                                         \
-            size_t dim_size,                                     \
-            bool keepdim);                                       \
-                                                                 \
-        infiniStatus_t calculate(                                \
-            void *workspace, size_t workspace_size,              \
-            void *output,                                        \
-            const void *input,                                   \
-            size_t *dim,                                         \
-            size_t dim_size,                                     \
-            bool keepdim,                                        \
-            void *stream) const;                                 \
-    };                                                           \
-    }
-
-#endif
diff --git a/src/infiniop/ops/all/cpu/all_cpu.cc b/src/infiniop/ops/all/cpu/all_cpu.cc
deleted file mode 100644
index dbe03fc3b..000000000
--- a/src/infiniop/ops/all/cpu/all_cpu.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "all_cpu.h"
-#include "../../../../utils.h"
-#include "../../../devices/cpu/common_cpu.h"
-#include <iostream>
-namespace op::all::cpu {
-
-Descriptor::~Descriptor() {}
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-
-    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-template <typename Tdata>
-infiniStatus_t calculateAll(
-    const AllInfo &info,
-    bool *output,
-    const Tdata *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    if (info.reduce_dim_size == info.ndim) {
-        bool result = true;
-        for (size_t index = 0; index < info.input_size; index++) {
-            size_t input_offset = op::common_cpu::indexToOffset(index, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data());
-            result = result && input[input_offset];
-        }
-        output[0] = result;
-        return INFINI_STATUS_SUCCESS;
-    } else {
-        for (size_t i = info.output_size; i-- > 0;) {
-            size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data());
-            bool result = true;
-            for (size_t j = 0; j < info.reduce_num; j++) {
-                size_t input_flat = j + i * info.reduce_num;
-                size_t input_offset = op::common_cpu::indexToOffset(input_flat, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data());
-                Tdata input_val = input[input_offset];
-                bool bool_val = static_cast<bool>(input_val);
-                result = result && bool_val;
-            }
-            output[output_offset] = result;
-        }
-        return INFINI_STATUS_SUCCESS;
-    }
-}
-} // namespace
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim,
-    void *stream) const {
-    switch (_info.dtype) {
-    case INFINI_DTYPE_BOOL:
-        return calculateAll<bool>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const bool *>(input), dim, dim_size, keepdim);
-    case INFINI_DTYPE_U8:
-        return calculateAll<uint8_t>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const uint8_t *>(input), dim, dim_size, keepdim);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::all::cpu
diff --git a/src/infiniop/ops/all/cpu/all_cpu.h b/src/infiniop/ops/all/cpu/all_cpu.h
deleted file mode 100644
index 71fd83689..000000000
--- a/src/infiniop/ops/all/cpu/all_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_ALL_CPU_H__
-#define __INFINIOP_ALL_CPU_H__
-
-#include "../all_desc.h"
-
-DESCRIPTOR(cpu);
-
-#endif // __INFINIOP_ALL_CPU_H__
diff --git a/src/infiniop/ops/all/cuda/kernel.cuh b/src/infiniop/ops/all/cuda/kernel.cuh
deleted file mode 100644
index b32d1da23..000000000
--- a/src/infiniop/ops/all/cuda/kernel.cuh
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef __ALL_CUDA_H__
-#define __ALL_CUDA_H__
-
-__forceinline__ __device__ __host__ size_t
-indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
-template <size_t BLOCK_SIZE, typename Tdata>
-__global__ void allReduceTempKernel(
-    bool *temp_output,
-    const Tdata *input,
-    size_t input_size,
-    size_t permuted_input_shape_size,
-    size_t *permuted_input_shape,
-    ptrdiff_t *permuted_input_strides) {
-    __shared__ bool s_data[BLOCK_SIZE];
-    size_t tid = threadIdx.x;
-    size_t idx = tid + blockIdx.x * blockDim.x;
-    if (idx < input_size) {
-        size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
-        s_data[tid] = static_cast<bool>(input[input_offset]);
-    } else {
-        s_data[tid] = true;
-    }
-    __syncthreads();
-    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            s_data[tid] = s_data[tid] && s_data[tid + s];
-        }
-        __syncthreads();
-    }
-    if (tid == 0) {
-        temp_output[blockIdx.x] = s_data[0];
-    }
-}
-
-template <size_t BLOCK_SIZE>
-__global__ void finalAllReduceKernel(
-    bool *output,
-    const bool *block_results,
-    size_t num_blocks) {
-    __shared__ bool s_data[BLOCK_SIZE];
-    size_t tid = threadIdx.x;
-    bool thread_val = true;
-    for (size_t i = tid; i < num_blocks; i += blockDim.x) {
-        thread_val = thread_val && block_results[i];
-    }
-    s_data[tid] = thread_val;
-    __syncthreads();
-    for (size_t s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            s_data[tid] = s_data[tid] && s_data[tid + s];
-        }
-        __syncthreads();
-    }
-
-    if (tid == 0) {
-        *output = s_data[0];
-    }
-}
-
-template <size_t BLOCK_SIZE, typename Tdata>
-__global__ void allKernel(
-    bool *output,
-    const Tdata *input,
-    size_t permuted_input_shape_size,
-    size_t output_shape_size,
-    size_t output_size,
-    size_t reduce_num,
-    size_t *permuted_input_shape,
-    size_t *output_shape,
-    ptrdiff_t *permuted_input_strides,
-    ptrdiff_t *output_strides) {
-    size_t tid = threadIdx.x;
-    size_t idx = tid + blockIdx.x * blockDim.x;
-    if (idx >= output_size) {
-        return;
-    }
-    size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
-    bool tempRes = true;
-    for (size_t i = 0; i < reduce_num; i++) {
-        size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
-        tempRes = tempRes && static_cast<bool>(input[input_offset]);
-    }
-    output[output_index] = tempRes;
-}
-
-#endif // __ALL_CUDA_H__
diff --git a/src/infiniop/ops/all/info.h b/src/infiniop/ops/all/info.h
deleted file mode 100644
index f3f333fc8..000000000
--- a/src/infiniop/ops/all/info.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef __ALL_INFO_H__
-#define __ALL_INFO_H__
-#include "../../../utils.h"
-#include "../../tensor.h"
-#include <algorithm>
-#include <cstddef>
-#include <vector>
-
-namespace op::all {
-class AllInfo {
-    AllInfo() = default;
-
-public:
-    infiniDtype_t dtype;
-    std::vector<size_t> permuted_input_shape; // need to permute
-    std::vector<size_t> output_shape;
-    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
-    std::vector<ptrdiff_t> output_strides;
-    size_t reduce_dim_size; // reduce dim size
-    size_t reduce_num;      // number of elements to reduce for each output element
-    size_t input_size;      // total number of input elements
-    size_t output_size;     // total number of output elements
-    size_t ndim;            // number of dimensions
-    static utils::Result<AllInfo> create(
-        infiniopTensorDescriptor_t output_desc,
-        infiniopTensorDescriptor_t input_desc,
-        size_t *dim,
-        size_t dim_size,
-        bool keepdim) {
-        auto input_shape = input_desc->shape();
-        auto input_strides = input_desc->strides();
-        size_t input_ndim = input_desc->ndim();
-        size_t reduce_num = 1;
-        for (size_t i = 0; i < dim_size; i++) {
-            reduce_num *= input_shape[dim[i]];
-        }
-        std::vector<size_t> permute_order;
-        for (size_t i = 0; i < input_ndim; i++) {
-            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
-                permute_order.push_back(i);
-            }
-        }
-        for (size_t i = 0; i < dim_size; i++) {
-            permute_order.push_back(dim[i]);
-        }
-        std::vector<size_t> permuted_input_shape;
-        std::vector<ptrdiff_t> permuted_input_strides;
-        for (size_t i = 0; i < permute_order.size(); i++) {
-            permuted_input_shape.push_back(input_shape[permute_order[i]]);
-            permuted_input_strides.push_back(input_strides[permute_order[i]]);
-        }
-        return utils::Result<AllInfo>(AllInfo{input_desc->dtype(),
-                                              permuted_input_shape,
-                                              output_desc->shape(),
-                                              permuted_input_strides,
-                                              output_desc->strides(),
-                                              dim_size,
-                                              reduce_num,
-                                              input_desc->numel(),
-                                              output_desc->numel(),
-                                              input_ndim});
-    }
-};
-} // namespace op::all
-
-#endif
diff --git a/src/infiniop/ops/all/metax/all_metax.h b/src/infiniop/ops/all/metax/all_metax.h
deleted file mode 100644
index 0f0ecc742..000000000
--- a/src/infiniop/ops/all/metax/all_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __ALL_METAX_H__
-#define __ALL_METAX_H__
-
-#include "../all_desc.h"
-
-DESCRIPTOR(metax);
-
-#endif
diff --git a/src/infiniop/ops/all/metax/all_metax.maca b/src/infiniop/ops/all/metax/all_metax.maca
deleted file mode 100644
index b95936585..000000000
--- a/src/infiniop/ops/all/metax/all_metax.maca
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "all_metax.h"
-
-namespace op::all::metax {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, typename Tdata>
-infiniStatus_t launchKernel(
-    const AllInfo &info,
-    bool *output, const Tdata *input,
-    hcStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-
-    if (info.reduce_num == input_size) {
-        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        bool *temp_output;
-        CHECK_METAX(hcMalloc(&temp_output, grid_size * sizeof(bool)));
-        allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
-            temp_output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
-        finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output, temp_output, grid_size);
-        CHECK_METAX(hcFree(temp_output));
-    } else {
-        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            output, input, input_ndim, output_ndim, output_size, reduce_num,
-            permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim,
-    void *stream_) const {
-
-    hcStream_t stream = (hcStream_t)stream_;
-
-#define CALCULATE_ALL(BLOCK_SIZE, Tdata)      \
-    launchKernel<BLOCK_SIZE, Tdata>(          \
-        _info,                                \
-        (bool *)output, (const Tdata *)input, \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE)      \
-    {                                                  \
-        if (_info.dtype == INFINI_DTYPE_BOOL)          \
-            return CALCULATE_ALL(BLOCK_SIZE, bool);    \
-        else if (_info.dtype == INFINI_DTYPE_U8)       \
-            return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
-        else                                           \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;     \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_ALL_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::all::metax
diff --git a/src/infiniop/ops/all/moore/all_moore.h b/src/infiniop/ops/all/moore/all_moore.h
deleted file mode 100644
index d7dab5396..000000000
--- a/src/infiniop/ops/all/moore/all_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __ALL_MOORE_H__
-#define __ALL_MOORE_H__
-
-#include "../all_desc.h"
-
-DESCRIPTOR(moore);
-
-#endif
diff --git a/src/infiniop/ops/all/moore/all_moore.mu b/src/infiniop/ops/all/moore/all_moore.mu
deleted file mode 100644
index 624d47391..000000000
--- a/src/infiniop/ops/all/moore/all_moore.mu
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "../../../devices/moore/moore_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "all_moore.h"
-
-namespace op::all::moore {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, typename Tdata>
-infiniStatus_t launchKernel(
-    const AllInfo &info,
-    bool *output, const Tdata *input,
-    musaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_musa = permuted_input_shape_musa + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-
-    if (info.reduce_num == input_size) {
-        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        bool *temp_output;
-        CHECK_MOORE(musaMalloc(&temp_output, grid_size * sizeof(bool)));
-        allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
-            temp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
-        finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output, temp_output, grid_size);
-        CHECK_MOORE(musaFree(temp_output));
-    } else {
-        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            output, input, input_ndim, output_ndim, output_size, reduce_num,
-            permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim,
-    void *stream_) const {
-
-    musaStream_t stream = (musaStream_t)stream_;
-
-#define CALCULATE_ALL(BLOCK_SIZE, Tdata)      \
-    launchKernel<BLOCK_SIZE, Tdata>(          \
-        _info,                                \
-        (bool *)output, (const Tdata *)input, \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE)      \
-    {                                                  \
-        if (_info.dtype == INFINI_DTYPE_BOOL)          \
-            return CALCULATE_ALL(BLOCK_SIZE, bool);    \
-        else if (_info.dtype == INFINI_DTYPE_U8)       \
-            return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
-        else                                           \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;     \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_ALL_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::all::moore
diff --git a/src/infiniop/ops/all/nvidia/all_nvidia.cu b/src/infiniop/ops/all/nvidia/all_nvidia.cu
deleted file mode 100644
index f0858d2f7..000000000
--- a/src/infiniop/ops/all/nvidia/all_nvidia.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "all_nvidia.cuh"
-
-namespace op::all::nvidia {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, typename Tdata>
-infiniStatus_t launchKernel(
-    const AllInfo &info,
-    bool *output, const Tdata *input,
-    cudaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-
-    if (info.reduce_num == input_size) {
-        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        bool *temp_output;
-        CHECK_CUDA(cudaMalloc(&temp_output, grid_size * sizeof(bool)));
-        allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
-            temp_output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda);
-        finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output, temp_output, grid_size);
-        CHECK_CUDA(cudaFree(temp_output));
-    } else {
-        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            output, input, input_ndim, output_ndim, output_size, reduce_num,
-            permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim,
-    void *stream_) const {
-
-    cudaStream_t stream = (cudaStream_t)stream_;
-
-#define CALCULATE_ALL(BLOCK_SIZE, Tdata)      \
-    launchKernel<BLOCK_SIZE, Tdata>(          \
-        _info,                                \
-        (bool *)output, (const Tdata *)input, \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE)      \
-    {                                                  \
-        if (_info.dtype == INFINI_DTYPE_BOOL)          \
-            return CALCULATE_ALL(BLOCK_SIZE, bool);    \
-        else if (_info.dtype == INFINI_DTYPE_U8)       \
-            return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
-        else                                           \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;     \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_ALL_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::all::nvidia
diff --git a/src/infiniop/ops/all/nvidia/all_nvidia.cuh b/src/infiniop/ops/all/nvidia/all_nvidia.cuh
deleted file mode 100644
index 111e0816f..000000000
--- a/src/infiniop/ops/all/nvidia/all_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __ALL_NVIDIA_H__
-#define __ALL_NVIDIA_H__
-
-#include "../all_desc.h"
-
-DESCRIPTOR(nvidia);
-
-#endif // __ALL_CUDA_API_H__
diff --git a/src/infiniop/ops/all/operator.cc b/src/infiniop/ops/all/operator.cc
deleted file mode 100644
index c7e70caa7..000000000
--- a/src/infiniop/ops/all/operator.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/all.h"
-#include <vector>
-
-#ifdef ENABLE_CPU_API
-#include "cpu/all_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
-#include "nvidia/all_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/all_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/all_kunlun.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/all_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateAllDescriptor(
-    infiniopHandle_t handle,
-    infiniopAllDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-
-#define CREATE(CASE, NAMESPACE)                                            \
-    case CASE:                                                             \
-        return op::all::NAMESPACE::Descriptor::create(                     \
-            handle,                                                        \
-            reinterpret_cast<op::all::NAMESPACE::Descriptor **>(desc_ptr), \
-            output_desc,                                                   \
-            input_desc,                                                    \
-            dim,                                                           \
-            dim_size,                                                      \
-            keepdim)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetAllWorkspaceSize(infiniopAllDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                               \
-    case CASE:                                                                             \
-        *size = reinterpret_cast<op::all::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopAll(
-    infiniopAllDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                            \
-    case CASE:                                                                \
-        return reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, output, input, dim, dim_size, keepdim, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyAllDescriptor(infiniopAllDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                \
-    case CASE:                                                                 \
-        delete reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/avg_pool1d/avg_pool1d.h b/src/infiniop/ops/avg_pool1d/avg_pool1d.h
deleted file mode 100644
index a81f46464..000000000
--- a/src/infiniop/ops/avg_pool1d/avg_pool1d.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef __AVG_POOL1D_H__
-#define __AVG_POOL1D_H__
-
-#include "../../../utils.h"
-#include "../../operator.h"
-#include "../../tensor.h"
-#include "infiniop/ops/avg_pool1d.h"
-
-#define DESCRIPTOR(NAMESPACE)                                    \
-    namespace op::avg_pool1d::NAMESPACE {                        \
-    class Descriptor final : public InfiniopDescriptor {         \
-        struct Opaque;                                           \
-        Opaque *_opaque;                                         \
-        AvgPool1dInfo _info;                                     \
-        size_t _workspace_size;                                  \
-                                                                 \
-        Descriptor(                                              \
-            AvgPool1dInfo info,                                  \
-            size_t workspace_size_,                              \
-            Opaque *opaque,                                      \
-            infiniDevice_t device_type,                          \
-            int device_id)                                       \
-            : InfiniopDescriptor{device_type, device_id},        \
-              _opaque(opaque),                                   \
-              _info(info),                                       \
-              _workspace_size(workspace_size_) {}                \
-                                                                 \
-    public:                                                      \
-        ~Descriptor();                                           \
-                                                                 \
-        size_t workspaceSize() const { return _workspace_size; } \
-                                                                 \
-        static infiniStatus_t create(                            \
-            infiniopHandle_t handle,                             \
-            Descriptor **desc_ptr,                               \
-            infiniopTensorDescriptor_t y_desc,                   \
-            infiniopTensorDescriptor_t x_desc,                   \
-            size_t kernel_size,                                  \
-            size_t stride,                                       \
-            size_t padding);                                     \
-                                                                 \
-        infiniStatus_t calculate(                                \
-            void *workspace,                                     \
-            size_t workspace_size,                               \
-            void *y,                                             \
-            const void *x,                                       \
-            void *stream) const;                                 \
-    };                                                           \
-    }
-
-class AvgPool1dInfo {
-private:
-    AvgPool1dInfo() = default;
-
-public:
-    infiniDtype_t dtype;
-    size_t batch, channels, in_width, out_width;
-    size_t kernel_size, stride, padding;
-
-    ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
-    ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;
-
-    static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
-        infiniopTensorDescriptor_t y_desc,
-        infiniopTensorDescriptor_t x_desc,
-        size_t kernel_size,
-        size_t stride,
-        size_t padding) {
-
-        CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);
-
-        const infiniDtype_t dtype = y_desc->dtype();
-        CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
-        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-
-        CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);
-
-        size_t batch = x_desc->dim(0);
-        size_t channels = x_desc->dim(1);
-        size_t in_width = x_desc->dim(2);
-
-        CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
-        CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);
-
-        size_t padded_len = in_width + 2 * padding;
-
-        CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
-
-        size_t expected_out_width = (padded_len - kernel_size) / stride + 1;
-        CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
-
-        size_t out_width = expected_out_width;
-
-        return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
-            dtype,
-            batch, channels, in_width, out_width,
-            kernel_size, stride, padding,
-            y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
-            x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
-    }
-};
-
-#endif
diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc
deleted file mode 100644
index 67e5b6623..000000000
--- a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "avg_pool1d_cpu.h"
-#include "../../../devices/cpu/common_cpu.h"
-#include <algorithm>
-
-namespace op::avg_pool1d::cpu {
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-
-    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
-    CHECK_RESULT(info);
-
-    *desc_ptr = new Descriptor(
-        info.take(),
-        0,
-        nullptr,
-        handle->device,
-        handle->device_id);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename T>
-infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info,
-                                  T *y,
-                                  const T *x) {
-    const float inv_kernel = 1.0f / static_cast<float>(info.kernel_size);
-
-#pragma omp parallel for
-    for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) {
-
-        ptrdiff_t b = bc / info.channels;
-        ptrdiff_t c = bc % info.channels;
-
-        size_t y_base = b * info.y_stride_batch + c * info.y_stride_channel;
-        size_t x_base = b * info.x_stride_batch + c * info.x_stride_channel;
-
-        for (size_t ow = 0; ow < info.out_width; ++ow) {
-            size_t y_offset = y_base + ow * info.y_stride_width;
-
-            long long start_w = static_cast<long long>(ow * info.stride) - info.padding;
-            long long end_w = start_w + info.kernel_size;
-
-            long long valid_start = std::max(0LL, start_w);
-            long long valid_end = std::min(static_cast<long long>(info.in_width), end_w);
-
-            float sum = 0.0f;
-            for (long long iw = valid_start; iw < valid_end; ++iw) {
-                size_t x_offset = x_base + iw * info.x_stride_width;
-                sum += utils::cast<float>(x[x_offset]);
-            }
-
-            const float avg = sum * inv_kernel;
-            y[y_offset] = utils::cast<T>(avg);
-        }
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x)
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) const {
-
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return CALCULATE(fp16_t);
-    case INFINI_DTYPE_BF16:
-        return CALCULATE(bf16_t);
-    case INFINI_DTYPE_F32:
-        return CALCULATE(float);
-    case INFINI_DTYPE_F64:
-        return CALCULATE(double);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-#undef CALCULATE
-
-} // namespace op::avg_pool1d::cpu
diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h
deleted file mode 100644
index 0b9f6c666..000000000
--- a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_CPU_H__
-#define __INFINIOP_AVG_POOL1D_CPU_H__
-
-#include "../avg_pool1d.h"
-
-DESCRIPTOR(cpu)
-
-#endif
diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
deleted file mode 100644
index 36a11acfc..000000000
--- a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__
-#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__
-
-template <typename T>
-__device__ void avgPool1dKernel(
-    T *y,
-    const T *x,
-    size_t batch,
-    size_t channels,
-    size_t in_width,
-    size_t out_width,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding,
-
-    ptrdiff_t y_stride_batch,
-    ptrdiff_t y_stride_channel,
-    ptrdiff_t y_stride_width,
-    ptrdiff_t x_stride_batch,
-    ptrdiff_t x_stride_channel,
-    ptrdiff_t x_stride_width) {
-
-    size_t total_elements = batch * channels * out_width;
-
-    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-         idx < total_elements;
-         idx += gridDim.x * blockDim.x) {
-
-        size_t ow = idx % out_width;
-        size_t temp = idx / out_width;
-        size_t c = temp % channels;
-        size_t b = temp / channels;
-
-        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
-
-        long long start_w = static_cast<long long>(ow * stride) - padding;
-
-        T sum = 0;
-
-        for (size_t k = 0; k < kernel_size; ++k) {
-            long long iw = start_w + k;
-
-            if (iw >= 0 && iw < static_cast<long long>(in_width)) {
-                size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width;
-                sum += x[x_offset];
-            }
-        }
-
-#if defined(ENABLE_ILUVATAR_API)
-        // Iluvatar __half doesn't accept size_t directly.
-        y[y_offset] = sum / static_cast<T>(static_cast<double>(kernel_size));
-#else
-        y[y_offset] = sum / static_cast<T>(kernel_size);
-#endif
-    }
-}
-
-#endif
diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h
deleted file mode 100644
index 576da66de..000000000
--- a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_METAX_H__
-#define __INFINIOP_AVG_POOL1D_METAX_H__
-
-#include "../avg_pool1d.h"
-
-DESCRIPTOR(metax)
-
-#endif // __INFINIOP_AVG_POOL1D_METAX_H__
diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca
deleted file mode 100644
index 9b3f15b9a..000000000
--- a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca
+++ /dev/null
@@ -1,170 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "avg_pool1d_metax.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-
-#include <type_traits>
-
-namespace op::avg_pool1d::metax {
-
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-
-    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
-    CHECK_RESULT(info);
-
-    *desc_ptr = new Descriptor(
-        info.take(),
-        0,
-        new Opaque{handle->internal()},
-        handle->device,
-        handle->device_id);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename Tdata, typename Tcompute>
-__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
-    if constexpr (std::is_same_v<Tdata, half>) {
-        return __float2half(static_cast<float>(val));
-    } else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
-        return __float2bfloat16(static_cast<float>(val));
-    } else {
-        return static_cast<Tdata>(val);
-    }
-}
-
-template <typename Tdata, typename Tcompute>
-INFINIOP_METAX_KERNEL avgPool1dGlobalKernel(
-    Tdata *y,
-    const Tdata *x,
-    size_t batch,
-    size_t channels,
-    size_t in_width,
-    size_t out_width,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding,
-    ptrdiff_t y_stride_batch,
-    ptrdiff_t y_stride_channel,
-    ptrdiff_t y_stride_width,
-    ptrdiff_t x_stride_batch,
-    ptrdiff_t x_stride_channel,
-    ptrdiff_t x_stride_width) {
-
-    size_t total_elements = batch * channels * out_width;
-    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
-
-    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-         idx < total_elements;
-         idx += gridDim.x * blockDim.x) {
-
-        size_t ow = idx % out_width;
-        size_t temp = idx / out_width;
-        size_t c = temp % channels;
-        size_t b = temp / channels;
-
-        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
-        size_t x_base = b * x_stride_batch + c * x_stride_channel;
-
-        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
-        long long end_w = start_w + static_cast<long long>(kernel_size);
-        long long iw_start = start_w < 0 ? 0 : start_w;
-        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
-
-        Tcompute sum = Tcompute(0);
-        if (iw_start < iw_end) {
-            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
-            for (long long iw = iw_start; iw < iw_end; ++iw) {
-                sum += static_cast<Tcompute>(x[x_offset]);
-                x_offset += x_stride_width;
-            }
-        }
-
-        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
-    }
-}
-
-template <typename Tdata, typename Tcompute>
-infiniStatus_t calculateAvgPool1d(
-    const AvgPool1dInfo &info,
-    int max_threads_per_block,
-    Tdata *y,
-    const Tdata *x,
-    hcStream_t stream) {
-
-    size_t total_elements = info.batch * info.channels * info.out_width;
-
-    int block_size = 256;
-    if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
-        block_size = max_threads_per_block;
-    }
-
-    size_t grid_size = (total_elements + block_size - 1) / block_size;
-    if (grid_size > 65535) {
-        grid_size = 65535;
-    }
-
-    avgPool1dGlobalKernel<Tdata, Tcompute><<<grid_size, block_size, 0, stream>>>(
-        y, x,
-        info.batch, info.channels, info.in_width, info.out_width,
-        info.kernel_size, info.stride, info.padding,
-        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
-        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-#define CALCULATE(TDATA, TCOMPUTE) \
-    calculateAvgPool1d<TDATA, TCOMPUTE>( \
-        _info, \
-        _opaque->internal->maxThreadsPerBlock(), \
-        (TDATA *)y, \
-        (const TDATA *)x, \
-        (hcStream_t)stream)
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) const {
-
-    (void)workspace;
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return CALCULATE(half, float);
-    case INFINI_DTYPE_BF16:
-        return CALCULATE(cuda_bfloat16, float);
-    case INFINI_DTYPE_F32:
-        return CALCULATE(float, float);
-    case INFINI_DTYPE_F64:
-        return CALCULATE(double, double);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-#undef CALCULATE
-
-} // namespace op::avg_pool1d::metax
diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h
deleted file mode 100644
index 9034d7358..000000000
--- a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
-#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
-
-#include <type_traits>
-
-namespace op::avg_pool1d::moore {
-
-template <typename Tdata, typename Tcompute>
-__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
-    if constexpr (std::is_same_v<Tdata, half>) {
-        return __float2half(static_cast<float>(val));
-    } else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
-        return __float2bfloat16_rn(static_cast<float>(val));
-    } else {
-        return static_cast<Tdata>(val);
-    }
-}
-
-template <typename Tdata, typename Tcompute>
-__device__ void avgPool1dKernel(
-    Tdata *y,
-    const Tdata *x,
-    size_t batch,
-    size_t channels,
-    size_t in_width,
-    size_t out_width,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding,
-    ptrdiff_t y_stride_batch,
-    ptrdiff_t y_stride_channel,
-    ptrdiff_t y_stride_width,
-    ptrdiff_t x_stride_batch,
-    ptrdiff_t x_stride_channel,
-    ptrdiff_t x_stride_width) {
-
-    size_t total_elements = batch * channels * out_width;
-    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
-
-    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-         idx < total_elements;
-         idx += gridDim.x * blockDim.x) {
-
-        size_t ow = idx % out_width;
-        size_t temp = idx / out_width;
-        size_t c = temp % channels;
-        size_t b = temp / channels;
-
-        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
-        size_t x_base = b * x_stride_batch + c * x_stride_channel;
-
-        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
-        long long end_w = start_w + static_cast<long long>(kernel_size);
-        long long iw_start = start_w < 0 ? 0 : start_w;
-        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
-
-        Tcompute sum = Tcompute(0);
-        if (iw_start < iw_end) {
-            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
-            for (long long iw = iw_start; iw < iw_end; ++iw) {
-                sum += static_cast<Tcompute>(x[x_offset]);
-                x_offset += x_stride_width;
-            }
-        }
-
-        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
-    }
-}
-
-} // namespace op::avg_pool1d::moore
-
-#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h
deleted file mode 100644
index 604d06012..000000000
--- a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__
-#define __INFINIOP_AVG_POOL1D_MOORE_H__
-
-#include "../avg_pool1d.h"
-
-DESCRIPTOR(moore)
-
-#endif // __INFINIOP_AVG_POOL1D_MOORE_H__
diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu
deleted file mode 100644
index 518d249b9..000000000
--- a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu
+++ /dev/null
@@ -1,135 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "avg_pool1d_moore.h"
-
-#include "../../../devices/moore/moore_kernel_common.h"
-
-#include "avg_pool1d_kernel.h"
-
-namespace op::avg_pool1d::moore {
-
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
-
-    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
-    CHECK_RESULT(info);
-
-    *desc_ptr = new Descriptor(
-        info.take(),
-        0,
-        new Opaque{handle->internal()},
-        handle->device,
-        handle->device_id);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename Tdata, typename Tcompute>
-INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel(
-    Tdata *y,
-    const Tdata *x,
-    size_t batch,
-    size_t channels,
-    size_t in_width,
-    size_t out_width,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding,
-    ptrdiff_t y_stride_batch,
-    ptrdiff_t y_stride_channel,
-    ptrdiff_t y_stride_width,
-    ptrdiff_t x_stride_batch,
-    ptrdiff_t x_stride_channel,
-    ptrdiff_t x_stride_width) {
-
-    avgPool1dKernel<Tdata, Tcompute>(
-        y, x,
-        batch, channels, in_width, out_width,
-        kernel_size, stride, padding,
-        y_stride_batch, y_stride_channel, y_stride_width,
-        x_stride_batch, x_stride_channel, x_stride_width);
-}
-
-template <typename Tdata, typename Tcompute>
-infiniStatus_t calculateAvgPool1d(
-    const AvgPool1dInfo &info,
-    int max_threads_per_block,
-    Tdata *y,
-    const Tdata *x,
-    musaStream_t stream) {
-
-    size_t total_elements = info.batch * info.channels * info.out_width;
-
-    int block_size = 256;
-    if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
-        block_size = max_threads_per_block;
-    }
-
-    size_t grid_size = (total_elements + block_size - 1) / block_size;
-    if (grid_size > 65535) {
-        grid_size = 65535;
-    }
-
-    avgPool1dGlobalKernel<Tdata, Tcompute><<<grid_size, block_size, 0, stream>>>(
-        y, x,
-        info.batch, info.channels, info.in_width, info.out_width,
-        info.kernel_size, info.stride, info.padding,
-        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
-        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-#define CALCULATE(TDATA, TCOMPUTE) \
-    calculateAvgPool1d<TDATA, TCOMPUTE>(\
-        _info,\
-        _opaque->internal->maxThreadsPerBlock(),\
-        (TDATA *)y,\
-        (const TDATA *)x,\
-        (musaStream_t)stream)
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) const {
-
-    (void)workspace;
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return CALCULATE(half, float);
-    case INFINI_DTYPE_BF16:
-        return CALCULATE(cuda_bfloat16, float);
-    case INFINI_DTYPE_F32:
-        return CALCULATE(float, float);
-    case INFINI_DTYPE_F64:
-        return CALCULATE(double, double);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-#undef CALCULATE
-
-} // namespace op::avg_pool1d::moore
diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu
deleted file mode 100644
index 634ce9018..000000000
--- a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "avg_pool1d_nvidia.cuh"
-
-template <typename T>
-__global__ void avgPool1dGlobalKernel(
-    T *y,
-    const T *x,
-    size_t batch,
-    size_t channels,
-    size_t in_width,
-    size_t out_width,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding,
-    ptrdiff_t y_stride_batch,
-    ptrdiff_t y_stride_channel,
-    ptrdiff_t y_stride_width,
-    ptrdiff_t x_stride_batch,
-    ptrdiff_t x_stride_channel,
-    ptrdiff_t x_stride_width) {
-
-    avgPool1dKernel<T>(
-        y, x,
-        batch, channels, in_width, out_width,
-        kernel_size, stride, padding,
-        y_stride_batch, y_stride_channel, y_stride_width,
-        x_stride_batch, x_stride_channel, x_stride_width);
-}
-
-namespace op::avg_pool1d::nvidia {
-
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-
-    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
-    CHECK_RESULT(info);
-
-    *desc_ptr = new Descriptor(
-        info.take(),
-        0,
-        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-        handle->device,
-        handle->device_id);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename T>
-infiniStatus_t calculateAvgPool1d(
-    const AvgPool1dInfo &info,
-    int max_threads_per_block,
-    T *y,
-    const T *x,
-    cudaStream_t stream) {
-
-    size_t total_elements = info.batch * info.channels * info.out_width;
-
-    int block_size = 256;
-    if (max_threads_per_block > 0 && max_threads_per_block < 256) {
-        block_size = max_threads_per_block;
-    }
-
-    size_t grid_size = (total_elements + block_size - 1) / block_size;
-    if (grid_size > 65535) {
-        grid_size = 65535;
-    }
-
-    avgPool1dGlobalKernel<T><<<grid_size, block_size, 0, stream>>>(
-        y, x,
-        info.batch, info.channels, info.in_width, info.out_width,
-        info.kernel_size, info.stride, info.padding,
-        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
-        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-#define CALCULATE(TDATA)                                        \
-    calculateAvgPool1d(_info,                                   \
-                       _opaque->internal->maxThreadsPerBlock(), \
-                       (TDATA *)y,                              \
-                       (const TDATA *)x,                        \
-                       (cudaStream_t)stream)
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) const {
-
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return CALCULATE(half);
-    case INFINI_DTYPE_BF16:
-        return CALCULATE(cuda_bfloat16);
-    case INFINI_DTYPE_F32:
-        return CALCULATE(float);
-    case INFINI_DTYPE_F64:
-        return CALCULATE(double);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-#undef CALCULATE
-
-} // namespace op::avg_pool1d::nvidia
diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh
deleted file mode 100644
index 1019ce354..000000000
--- a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__
-#define __INFINIOP_AVG_POOL1D_CUDA_H__
-
-#include "../avg_pool1d.h"
-
-DESCRIPTOR(nvidia)
-
-#endif
diff --git a/src/infiniop/ops/avg_pool1d/operator.cc b/src/infiniop/ops/avg_pool1d/operator.cc
deleted file mode 100644
index c3696daa1..000000000
--- a/src/infiniop/ops/avg_pool1d/operator.cc
+++ /dev/null
@@ -1,225 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/avg_pool1d.h"
-
-#ifdef ENABLE_CPU_API
-#include "cpu/avg_pool1d_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-#include "nvidia/avg_pool1d_nvidia.cuh"
-#endif
-#ifdef ENABLE_ASCEND_API
-#include "ascend/avg_pool1d_ascend.h"
-#endif
-#ifdef ENABLE_CAMBRICON_API
-#include "bang/avg_pool1d_bang.h"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/avg_pool1d_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/avg_pool1d_kunlun.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/avg_pool1d_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor(
-    infiniopHandle_t handle,
-    infiniopAvgPool1dDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y,
-    infiniopTensorDescriptor_t x,
-    size_t kernel_size,
-    size_t stride,
-    size_t padding) {
-
-#define CREATE(CASE, NAMESPACE)                                                   \
-    case CASE:                                                                    \
-        return op::avg_pool1d::NAMESPACE::Descriptor::create(                     \
-            handle,                                                               \
-            reinterpret_cast<op::avg_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
-            y,                                                                    \
-            x,                                                                    \
-            kernel_size,                                                          \
-            stride,                                                               \
-            padding)
-
-    switch (handle->device) {
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        CREATE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc,
-                                                            size_t *size) {
-#define GET(CASE, NAMESPACE)                                                                            \
-    case CASE:                                                                                          \
-        *size = reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        GET(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        GET(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef GET
-}
-
-__INFINI_C infiniStatus_t infiniopAvgPool1d(
-    infiniopAvgPool1dDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                                   \
-    case CASE:                                                                       \
-        return reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, x, stream)
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                       \
-    case CASE:                                                                        \
-        delete reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        DELETE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        DELETE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        DELETE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc
deleted file mode 100644
index c1098f3ee..000000000
--- a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-#include "cross_entropy_cpu.h"
-#include "../../../devices/cpu/common_cpu.h"
-#include "../../../reduce/cpu/reduce.h"
-#include <algorithm>
-#include <cmath>
-
-namespace op::cross_entropy::cpu {
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    infiniopTensorDescriptor_t target_desc) {
-
-    auto x_dtype = x_desc->dtype();
-    auto t_dtype = target_desc->dtype();
-
-    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
-    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
-
-    CrossEntropyInfo info{};
-    info.dtype = x_dtype;
-    info.target_dtype = t_dtype;
-
-    info.outer_size = target_desc->numel();
-
-    info.vocab_size = x_desc->shape().back();
-
-    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
-
-    *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename T, typename Tidx>
-infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info,
-                                    T *y, const T *x, const void *target) {
-    const Tidx *label = reinterpret_cast<const Tidx *>(target);
-
-#pragma omp parallel for
-    for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) {
-        const T *row = x + i * info->x_stride;
-        Tidx idx = label[i];
-
-        if (idx < 0 || static_cast<size_t>(idx) >= info->vocab_size) {
-            y[i] = utils::cast<T>(0.f);
-            continue;
-        }
-
-        float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1);
-
-        float sum_exp = 0.f;
-        for (size_t j = 0; j < info->vocab_size; ++j) {
-            sum_exp += std::exp(utils::cast<float>(row[j]) - max_val);
-        }
-
-        float log_term = std::log(sum_exp) + max_val;
-        float target_logit = utils::cast<float>(row[idx]);
-        y[i] = utils::cast<T>(log_term - target_logit);
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename T>
-infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info,
-                                    T *y, const T *x, const void *target) {
-
-    if (info->target_dtype == INFINI_DTYPE_I32) {
-        return cross_entropy_kernel<T, int32_t>(info, y, x, target);
-    } else if (info->target_dtype == INFINI_DTYPE_I64) {
-        return cross_entropy_kernel<T, int64_t>(info, y, x, target);
-    }
-    return INFINI_STATUS_BAD_TENSOR_DTYPE;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    const void *target,
-    void *stream) const {
-
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target);
-    case INFINI_DTYPE_BF16:
-        return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target);
-    case INFINI_DTYPE_F32:
-        return dispatch_target_type(&_info, (float *)y, (const float *)x, target);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-} // namespace op::cross_entropy::cpu
diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h
deleted file mode 100644
index e274efc9d..000000000
--- a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CROSS_ENTROPY_CPU_H__
-#define __CROSS_ENTROPY_CPU_H__
-
-#include "../cross_entropy.h"
-
-DESCRIPTOR(cpu)
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/cross_entropy.h b/src/infiniop/ops/cross_entropy/cross_entropy.h
deleted file mode 100644
index 075b17142..000000000
--- a/src/infiniop/ops/cross_entropy/cross_entropy.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef CROSS_ENTROPY_H
-#define CROSS_ENTROPY_H
-
-#include "../../operator.h"
-#include "info.h"
-
-#define DESCRIPTOR(NAMESPACE)                                                 \
-    namespace op::cross_entropy::NAMESPACE {                                  \
-    class Descriptor final : public InfiniopDescriptor {                      \
-        struct Opaque;                                                        \
-        Opaque *_opaque;                                                      \
-        CrossEntropyInfo _info;                                               \
-        size_t _workspace_size;                                               \
-                                                                              \
-        Descriptor(Opaque *opaque,                                            \
-                   CrossEntropyInfo info,                                     \
-                   size_t workspace_size,                                     \
-                   infiniDevice_t device_type,                                \
-                   int device_id)                                             \
-            : InfiniopDescriptor{device_type, device_id},                     \
-              _opaque(opaque),                                                \
-              _info(info),                                                    \
-              _workspace_size(workspace_size) {}                              \
-                                                                              \
-    public:                                                                   \
-        ~Descriptor();                                                        \
-        size_t workspaceSize() const { return _workspace_size; }              \
-        static infiniStatus_t create(infiniopHandle_t handle,                 \
-                                     Descriptor **desc_ptr,                   \
-                                     infiniopTensorDescriptor_t y_desc,       \
-                                     infiniopTensorDescriptor_t x_desc,       \
-                                     infiniopTensorDescriptor_t target_desc); \
-        infiniStatus_t calculate(void *workspace,                             \
-                                 size_t workspace_size,                       \
-                                 void *y,                                     \
-                                 const void *x,                               \
-                                 const void *target,                          \
-                                 void *stream) const;                         \
-    };                                                                        \
-    }
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh
deleted file mode 100644
index c048c1233..000000000
--- a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef __CROSS_ENTROPY_KERNEL_CUH__
-#define __CROSS_ENTROPY_KERNEL_CUH__
-
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../reduce/cuda/reduce.cuh"
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
-__device__ void crossEntropyKernel(
-    Tdata *y_,
-    const Tdata *x_,
-    const void *target_,
-    size_t outer_size,
-    size_t vocab_size,
-    ptrdiff_t x_stride) {
-
-    size_t row_idx = blockIdx.x;
-    if (row_idx >= outer_size) {
-        return;
-    }
-
-    const Tdata *x = x_ + row_idx * x_stride;
-    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
-
-    Tidx label = target[row_idx];
-
-    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
-    __shared__ Tcompute max_val_shared;
-    if (threadIdx.x == 0) {
-        max_val_shared = static_cast<Tcompute>(max_val_raw);
-    }
-    __syncthreads();
-    Tcompute max_val = max_val_shared;
-
-    Tcompute thread_sum = 0.0f;
-    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
-        Tcompute val = static_cast<Tcompute>(x[col]);
-        thread_sum += expf(val - max_val);
-    }
-
-    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-        thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset);
-    }
-
-    static __shared__ Tcompute shared_sum[32];
-    int lane = threadIdx.x % warpSize;
-    int warp = threadIdx.x / warpSize;
-
-    if (lane == 0) {
-        shared_sum[warp] = thread_sum;
-    }
-    __syncthreads();
-
-    Tcompute block_sum = 0.0f;
-    if (warp == 0) {
-
-        if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) {
-            block_sum = shared_sum[lane];
-        }
-        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-            block_sum += __shfl_down_sync(0xffffffff, block_sum, offset);
-        }
-    }
-
-    if (threadIdx.x == 0) {
-        Tcompute log_term = logf(block_sum) + max_val;
-
-        Tcompute target_logit = 0.0f;
-
-        if (label >= 0 && static_cast<size_t>(label) < vocab_size) {
-            target_logit = static_cast<Tcompute>(x[label]);
-        } else {
-
-            log_term = 0.0f;
-        }
-
-        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
-    }
-}
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/info.h b/src/infiniop/ops/cross_entropy/info.h
deleted file mode 100644
index a915a4fe4..000000000
--- a/src/infiniop/ops/cross_entropy/info.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CROSS_ENTROPY_INFO_H
-#define CROSS_ENTROPY_INFO_H
-#include "../../../utils.h"
-#include "../../tensor.h"
-#include <vector>
-
-#include <cstddef>
-
-struct CrossEntropyInfo {
-    int dtype;
-    int target_dtype;
-    size_t outer_size;
-    size_t vocab_size;
-    ptrdiff_t x_stride;
-};
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h
deleted file mode 100644
index 57bccea91..000000000
--- a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CROSS_ENTROPY_METAX_H__
-#define __CROSS_ENTROPY_METAX_H__
-
-#include "../cross_entropy.h"
-
-DESCRIPTOR(metax)
-
-#endif // __CROSS_ENTROPY_METAX_H__
diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca
deleted file mode 100644
index efd791183..000000000
--- a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca
+++ /dev/null
@@ -1,188 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "cross_entropy_metax.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-
-#include <cub/block/block_reduce.cuh>
-
-#include "../../../reduce/cuda/reduce.cuh"
-
-#include <cmath>
-
-namespace {
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
-__device__ void crossEntropyKernel(
-    Tdata *y_,
-    const Tdata *x_,
-    const void *target_,
-    size_t outer_size,
-    size_t vocab_size,
-    ptrdiff_t x_stride) {
-
-    size_t row_idx = blockIdx.x;
-    if (row_idx >= outer_size) {
-        return;
-    }
-
-    const Tdata *x = x_ + row_idx * x_stride;
-    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
-
-    Tidx label = target[row_idx];
-
-    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
-    __shared__ Tcompute max_val_shared;
-    if (threadIdx.x == 0) {
-        max_val_shared = static_cast<Tcompute>(max_val_raw);
-    }
-    __syncthreads();
-
-    Tcompute max_val = max_val_shared;
-
-    Tcompute thread_sum = Tcompute(0);
-    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
-        Tcompute val = static_cast<Tcompute>(x[col]);
-        thread_sum += expf(val - max_val);
-    }
-
-    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
-
-    if (threadIdx.x == 0) {
-        if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
-            y_[row_idx] = static_cast<Tdata>(0.0f);
-            return;
-        }
-        Tcompute log_term = logf(block_sum) + max_val;
-        Tcompute target_logit = static_cast<Tcompute>(x[label]);
-        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
-    }
-}
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
-INFINIOP_METAX_KERNEL crossEntropy(
-    Tdata *y, const Tdata *x, const void *target,
-    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
-    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
-        y, x, target, outer_size, vocab_size, x_stride);
-}
-
-} // namespace
-
-namespace op::cross_entropy::metax {
-
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    infiniopTensorDescriptor_t target_desc) {
-
-    (void)y_desc;
-
-    auto x_dtype = x_desc->dtype();
-    auto t_dtype = target_desc->dtype();
-
-    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
-    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
-
-    CrossEntropyInfo info{};
-    info.dtype = x_dtype;
-    info.target_dtype = t_dtype;
-    info.vocab_size = x_desc->shape().back();
-    info.outer_size = target_desc->numel();
-    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
-
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
-        info, 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <unsigned int BLOCK_SIZE>
-infiniStatus_t launchKernel(void *y, const void *x, const void *target,
-                            const CrossEntropyInfo &info, hcStream_t stream) {
-    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
-
-    if (info.target_dtype == INFINI_DTYPE_I64) {
-        if (info.dtype == INFINI_DTYPE_F16) {
-            crossEntropy<BLOCK_SIZE, half, int64_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (half *)y, (const half *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_BF16) {
-            crossEntropy<BLOCK_SIZE, cuda_bfloat16, int64_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_F32) {
-            crossEntropy<BLOCK_SIZE, float, int64_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (float *)y, (const float *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-    } else if (info.target_dtype == INFINI_DTYPE_I32) {
-        if (info.dtype == INFINI_DTYPE_F16) {
-            crossEntropy<BLOCK_SIZE, half, int32_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (half *)y, (const half *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_BF16) {
-            crossEntropy<BLOCK_SIZE, cuda_bfloat16, int32_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_F32) {
-            crossEntropy<BLOCK_SIZE, float, int32_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (float *)y, (const float *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-    } else {
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    const void *target,
-    void *stream_) const {
-
-    (void)workspace;
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    auto stream = reinterpret_cast<hcStream_t>(stream_);
-    int max_threads = _opaque->internal->maxThreadsPerBlock();
-
-    if (max_threads >= METAX_BLOCK_SIZE_1024) {
-        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(y, x, target, _info, stream));
-    } else if (max_threads >= METAX_BLOCK_SIZE_512) {
-        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>(y, x, target, _info, stream));
-    } else {
-        CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::cross_entropy::metax
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h
deleted file mode 100644
index 6648b0e32..000000000
--- a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef __CROSS_ENTROPY_KERNEL_CUH__
-#define __CROSS_ENTROPY_KERNEL_CUH__
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
-__device__ void crossEntropyKernel(
-    Tdata *y_,
-    const Tdata *x_,
-    const void *target_,
-    size_t outer_size,
-    size_t vocab_size,
-    ptrdiff_t x_stride) {
-
-    size_t row_idx = blockIdx.x;
-    if (row_idx >= outer_size) {
-        return;
-    }
-
-    const Tdata *x = x_ + row_idx * x_stride;
-    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
-
-    Tidx label = target[row_idx];
-
-    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
-    __shared__ Tcompute max_val_shared;
-    if (threadIdx.x == 0) {
-        max_val_shared = static_cast<Tcompute>(max_val_raw);
-    }
-    __syncthreads();
-
-    Tcompute max_val = max_val_shared;
-
-    Tcompute thread_sum = Tcompute(0);
-    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
-        Tcompute val = static_cast<Tcompute>(x[col]);
-        thread_sum += expf(val - max_val);
-    }
-
-    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
-
-    if (threadIdx.x == 0) {
-        if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
-            y_[row_idx] = static_cast<Tdata>(0.0f);
-            return;
-        }
-        Tcompute log_term = logf(block_sum) + max_val;
-        Tcompute target_logit = static_cast<Tcompute>(x[label]);
-        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
-    }
-}
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h
deleted file mode 100644
index 454b14617..000000000
--- a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CROSS_ENTROPY_MOORE_H__
-#define __CROSS_ENTROPY_MOORE_H__
-
-#include "../cross_entropy.h"
-
-DESCRIPTOR(moore)
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
deleted file mode 100644
index 2535679dd..000000000
--- a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
+++ /dev/null
@@ -1,129 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "cross_entropy_moore.h"
-
-#include <cub/block/block_reduce.cuh>
-#include "../../../devices/moore/moore_kernel_common.h"
-
-#include "../../../reduce/cuda/reduce.cuh"
-
-#include "cross_entropy_kernel.h"
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
-INFINIOP_MOORE_KERNEL crossEntropy(
-    Tdata *y, const Tdata *x, const void *target,
-    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
-    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
-        y, x, target, outer_size, vocab_size, x_stride);
-}
-
-namespace op::cross_entropy::moore {
-
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    infiniopTensorDescriptor_t target_desc) {
-
-    (void)y_desc;
-
-    auto x_dtype = x_desc->dtype();
-    auto t_dtype = target_desc->dtype();
-
-    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
-    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
-
-    CrossEntropyInfo info{};
-    info.dtype = x_dtype;
-    info.target_dtype = t_dtype;
-    info.vocab_size = x_desc->shape().back();
-    info.outer_size = target_desc->numel();
-    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
-
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-        info, 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <unsigned int BLOCK_SIZE>
-infiniStatus_t launchKernel(void *y, const void *x, const void *target,
-                            const CrossEntropyInfo &info, musaStream_t stream) {
-    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
-
-    if (info.target_dtype == INFINI_DTYPE_I64) {
-        if (info.dtype == INFINI_DTYPE_F16) {
-            crossEntropy<BLOCK_SIZE, half, int64_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (half *)y, (const half *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_BF16) {
-            crossEntropy<BLOCK_SIZE, __mt_bfloat16, int64_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_F32) {
-            crossEntropy<BLOCK_SIZE, float, int64_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (float *)y, (const float *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-    } else if (info.target_dtype == INFINI_DTYPE_I32) {
-        if (info.dtype == INFINI_DTYPE_F16) {
-            crossEntropy<BLOCK_SIZE, half, int32_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (half *)y, (const half *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_BF16) {
-            crossEntropy<BLOCK_SIZE, __mt_bfloat16, int32_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_F32) {
-            crossEntropy<BLOCK_SIZE, float, int32_t, float>
-                <<<grid, BLOCK_SIZE, 0, stream>>>(
-                    (float *)y, (const float *)x, target,
-                    info.outer_size, info.vocab_size, info.x_stride);
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-    } else {
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
-                                     void *y,
-                                     const void *x,
-                                     const void *target,
-                                     void *stream_) const {
-    musaStream_t stream = (musaStream_t)stream_;
-    (void)workspace;
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
-        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(y, x, target, _info, stream));
-    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
-        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(y, x, target, _info, stream));
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::cross_entropy::moore
diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
deleted file mode 100644
index 77e3d2d58..000000000
--- a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
+++ /dev/null
@@ -1,107 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "cross_entropy_nvidia.cuh"
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
-INFINIOP_CUDA_KERNEL crossEntropy(
-    Tdata *y, const Tdata *x, const void *target,
-    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
-
-    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
-        y, x, target, outer_size, vocab_size, x_stride);
-}
-
-namespace op::cross_entropy::nvidia {
-
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    infiniopTensorDescriptor_t target_desc) {
-
-    auto x_dtype = x_desc->dtype();
-    auto t_dtype = target_desc->dtype();
-
-    CrossEntropyInfo info;
-    info.dtype = x_dtype;
-    info.target_dtype = t_dtype;
-
-    info.vocab_size = x_desc->shape().back();
-    info.outer_size = target_desc->numel();
-    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
-
-    auto internal = reinterpret_cast<device::nvidia::Handle *>(handle)->internal();
-
-    *desc_ptr = new Descriptor(
-        new Opaque{internal},
-        info, 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <unsigned int BLOCK_SIZE>
-infiniStatus_t launchKernel(void *y, const void *x, const void *target,
-                            const CrossEntropyInfo &info, cudaStream_t stream) {
-
-    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
-
-    if (info.target_dtype == INFINI_DTYPE_I64) {
-        if (info.dtype == INFINI_DTYPE_F16) {
-            crossEntropy<BLOCK_SIZE, half, int64_t>
-                <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_BF16) {
-            crossEntropy<BLOCK_SIZE, __nv_bfloat16, int64_t>
-                <<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_F32) {
-            crossEntropy<BLOCK_SIZE, float, int64_t>
-                <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride);
-        }
-    } else if (info.target_dtype == INFINI_DTYPE_I32) {
-
-        if (info.dtype == INFINI_DTYPE_F16) {
-            crossEntropy<BLOCK_SIZE, half, int32_t>
-                <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_BF16) {
-            crossEntropy<BLOCK_SIZE, __nv_bfloat16, int32_t>
-                <<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride);
-        } else if (info.dtype == INFINI_DTYPE_F32) {
-            crossEntropy<BLOCK_SIZE, float, int32_t>
-                <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride);
-        }
-    } else {
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
-                                     void *y,
-                                     const void *x,
-                                     const void *target,
-                                     void *stream_) const {
-    cudaStream_t stream = (cudaStream_t)stream_;
-
-    int max_threads = _opaque->internal->maxThreadsPerBlock();
-
-    if (max_threads >= 1024) {
-        CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream));
-    } else if (max_threads >= 512) {
-        CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream));
-    } else {
-        CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::cross_entropy::nvidia
diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
deleted file mode 100644
index 786e9d88f..000000000
--- a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CROSS_ENTROPY_NVIDIA_H__
-#define __CROSS_ENTROPY_NVIDIA_H__
-
-#include "../cross_entropy.h"
-
-DESCRIPTOR(nvidia)
-
-#endif
diff --git a/src/infiniop/ops/cross_entropy/operator.cc b/src/infiniop/ops/cross_entropy/operator.cc
deleted file mode 100644
index 75f35fcb7..000000000
--- a/src/infiniop/ops/cross_entropy/operator.cc
+++ /dev/null
@@ -1,174 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/cross_entropy.h"
-
-#ifdef ENABLE_CPU_API
-#include "cpu/cross_entropy_cpu.h"
-#endif
-
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-#include "nvidia/cross_entropy_nvidia.cuh"
-#endif
-
-#ifdef ENABLE_MOORE_API
-#include "moore/cross_entropy_moore.h"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/cross_entropy_metax.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor(
-    infiniopHandle_t handle,
-    infiniopCrossEntropyDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc,
-    infiniopTensorDescriptor_t target_desc) {
-
-#define CREATE(CASE, NAMESPACE)                                                      \
-    case CASE:                                                                       \
-        return op::cross_entropy::NAMESPACE::Descriptor::create(                     \
-            handle,                                                                  \
-            reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, x_desc, target_desc);
-
-    switch (handle->device) {
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia)
-#endif
-#ifdef ENABLE_HYGON_API
-        CREATE(INFINI_DEVICE_HYGON, nvidia)
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore)
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
-    infiniopCrossEntropyDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                         \
-    case CASE:                                                                                       \
-        *size = reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-#ifdef ENABLE_HYGON_API
-        GET(INFINI_DEVICE_HYGON, nvidia)
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore)
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-}
-
-__INFINI_C infiniStatus_t infiniopCrossEntropy(
-    infiniopCrossEntropyDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    const void *target,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                                \
-    case CASE:                                                                    \
-        return reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, x, target, stream);
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia)
-#endif
-#ifdef ENABLE_HYGON_API
-        CALCULATE(INFINI_DEVICE_HYGON, nvidia)
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore)
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
-    infiniopCrossEntropyDescriptor_t desc) {
-
-#define DESTROY(CASE, NAMESPACE)                                                   \
-    case CASE:                                                                     \
-        delete reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        DESTROY(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        DESTROY(INFINI_DEVICE_QY, nvidia)
-#endif
-#ifdef ENABLE_HYGON_API
-        DESTROY(INFINI_DEVICE_HYGON, nvidia)
-#endif
-#ifdef ENABLE_MOORE_API
-        DESTROY(INFINI_DEVICE_MOORE, moore)
-#endif
-#ifdef ENABLE_METAX_API
-        DESTROY(INFINI_DEVICE_METAX, metax)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef DESTROY
-}
diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc
deleted file mode 100644
index ff8ebe395..000000000
--- a/src/infiniop/ops/equal/cpu/equal_cpu.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <cstdint>
-#include <type_traits>
-
-#include "equal_cpu.h"
-
-namespace op::equal::cpu {
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-
-    const auto &a_desc = input_desc_vec.at(0);
-    const auto &b_desc = input_desc_vec.at(1);
-    auto compute_dtype = a_desc->dtype();
-    auto out_dtype = out_desc->dtype();
-
-    if (compute_dtype != b_desc->dtype()) {
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
-
-    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
-                INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
-
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<EqualOp, bool, fp16_t, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<EqualOp, bool, float, float>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<EqualOp, bool, double, double>(_info, output, inputs, stream);
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<EqualOp, bool, bf16_t, bf16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_I32:
-        return _device_info->calculate<EqualOp, bool, int32_t, int32_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_I64:
-        return _device_info->calculate<EqualOp, bool, int64_t, int64_t>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::equal::cpu
diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h
deleted file mode 100644
index fd811f4b0..000000000
--- a/src/infiniop/ops/equal/cpu/equal_cpu.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __EQUAL_CPU_H__
-#define __EQUAL_CPU_H__
-
-#include <type_traits>
-
-#include "../../../elementwise/cpu/elementwise_cpu.h"
-
-ELEMENTWISE_DESCRIPTOR(equal, cpu)
-
-namespace op::equal::cpu {
-
-typedef struct EqualOp {
-public:
-    static constexpr size_t num_inputs = 2;
-
-    template <typename Tout, typename Tin0, typename Tin1>
-    bool operator()(const Tin0 &a, const Tin1 &b) {
-        if constexpr (std::is_same_v<Tin0, Tin1>) {
-            return a == b;
-        } else {
-            return false;
-        }
-    }
-} EqualOp;
-
-} // namespace op::equal::cpu
-
-#endif
diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh
deleted file mode 100644
index 11ad5981e..000000000
--- a/src/infiniop/ops/equal/cuda/kernel.cuh
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __EQUAL_CUDA_H__
-#define __EQUAL_CUDA_H__
-
-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-#include <type_traits>
-
-namespace op::equal::cuda {
-
-typedef struct EqualOp {
-public:
-    static constexpr size_t num_inputs = 2;
-
-    template <typename Tout, typename Tin0, typename Tin1>
-    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
-        if constexpr (std::is_same_v<Tin0, Tin1>) {
-            if constexpr (std::is_same_v<Tin0, half2>) {
-                static_assert(!std::is_same_v<Tin0, half2>, "half2 is not supported for mixed output dtype");
-            } else if constexpr (std::is_same_v<Tin0, half>) {
-                return static_cast<Tout>(__heq(a, b));
-            } else {
-                return static_cast<Tout>(a == b);
-            }
-        } else {
-            return false;
-        }
-    }
-} EqualOp;
-
-} // namespace op::equal::cuda
-
-#endif
diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h
deleted file mode 100644
index 6e4cd64b9..000000000
--- a/src/infiniop/ops/equal/metax/equal_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __EQUAL_METAX_API_H__
-#define __EQUAL_METAX_API_H__
-
-#include "../../../elementwise/metax/elementwise_metax_api.h"
-
-ELEMENTWISE_DESCRIPTOR(equal, metax)
-
-#endif // __EQUAL_METAX_API_H__
diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca
deleted file mode 100644
index 265e5b5a6..000000000
--- a/src/infiniop/ops/equal/metax/equal_metax.maca
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "equal_metax.h"
-
-#include "../../../elementwise/metax/elementwise_metax.h"
-
-#include "../cuda/kernel.cuh"
-
-namespace op::equal::metax {
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-
-    const auto &a_desc = input_desc_vec.at(0);
-    auto compute_dtype = a_desc->dtype();
-    auto out_dtype = out_desc->dtype();
-
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
-                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
-
-    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_I32:
-        return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_I64:
-        return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-} // namespace op::equal::metax
diff --git a/src/infiniop/ops/equal/moore/equal_moore.h b/src/infiniop/ops/equal/moore/equal_moore.h
deleted file mode 100644
index 2fed1bb40..000000000
--- a/src/infiniop/ops/equal/moore/equal_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __EQUAL_MOORE_API_H__
-#define __EQUAL_MOORE_API_H__
-
-#include "../../../elementwise/moore/elementwise_moore_api.h"
-
-ELEMENTWISE_DESCRIPTOR(equal, moore)
-
-#endif // __EQUAL_MOORE_API_H__
diff --git a/src/infiniop/ops/equal/moore/equal_moore.mu b/src/infiniop/ops/equal/moore/equal_moore.mu
deleted file mode 100644
index d0eb8395d..000000000
--- a/src/infiniop/ops/equal/moore/equal_moore.mu
+++ /dev/null
@@ -1,140 +0,0 @@
-#include "equal_moore.h"
-
-#include "../../../elementwise/moore/elementwise_moore.h"
-
-#include "equal_moore_kernel.h"
-
-namespace op::equal::moore {
-namespace {
-
-inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
-    if (!info.isOutputContiguous()) {
-        return false;
-    }
-    const bool *input_contiguous = info.getInputContiguous();
-    const bool *input_broadcasted = info.getInputBroadcasted();
-    for (size_t i = 0; i < 2; ++i) {
-        if (!input_contiguous[i] || input_broadcasted[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
-template <typename Tout, typename Tin>
-INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) {
-    const auto op = op::equal::moore::EqualOp{};
-    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t stride = blockDim.x * gridDim.x;
-    for (; idx < numel; idx += stride) {
-        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
-    }
-}
-
-template <typename Tout, typename Tin>
-infiniStatus_t launch_fast_path(size_t numel,
-                                void *output,
-                                const std::vector<const void *> &inputs,
-                                void *stream) {
-    if (numel == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-
-    constexpr int kBlockSize = 256;
-    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
-    if (grid > 65535) {
-        grid = 65535;
-    }
-
-    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
-    equal_contiguous_kernel<Tout, Tin><<<grid, kBlockSize, 0, musa_stream>>>(
-        numel,
-        reinterpret_cast<Tout *>(output),
-        reinterpret_cast<const Tin *>(inputs[0]),
-        reinterpret_cast<const Tin *>(inputs[1]));
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
-
-    const auto &a_desc = input_desc_vec.at(0);
-    auto compute_dtype = a_desc->dtype();
-    auto out_dtype = out_desc->dtype();
-
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
-                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
-
-    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    // create MOORE elementwise descriptor
-    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (can_use_contiguous_fast_path(_info)) {
-        size_t numel = _info.getOutputSize();
-        switch (_dtype) {
-        case INFINI_DTYPE_F16:
-            return launch_fast_path<bool, half>(numel, output, inputs, stream);
-        case INFINI_DTYPE_BF16:
-            return launch_fast_path<bool, cuda_bfloat16>(numel, output, inputs, stream);
-        case INFINI_DTYPE_F32:
-            return launch_fast_path<bool, float>(numel, output, inputs, stream);
-        case INFINI_DTYPE_I32:
-            return launch_fast_path<bool, int32_t>(numel, output, inputs, stream);
-        case INFINI_DTYPE_I64:
-            return launch_fast_path<bool, int64_t>(numel, output, inputs, stream);
-        case INFINI_DTYPE_F64:
-            return launch_fast_path<bool, double>(numel, output, inputs, stream);
-        default:
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-    }
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_I32:
-        return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_I64:
-        return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-} // namespace op::equal::moore
diff --git a/src/infiniop/ops/equal/moore/equal_moore_kernel.h b/src/infiniop/ops/equal/moore/equal_moore_kernel.h
deleted file mode 100644
index a4e32880b..000000000
--- a/src/infiniop/ops/equal/moore/equal_moore_kernel.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef __EQUAL_MOORE_KERNEL_H__
-#define __EQUAL_MOORE_KERNEL_H__
-
-#include <type_traits>
-
-namespace op::equal::moore {
-
-typedef struct EqualOp {
-public:
-    static constexpr size_t num_inputs = 2;
-
-    template <typename Tout, typename Tin0, typename Tin1>
-    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
-        if constexpr (std::is_same_v<Tin0, Tin1>) {
-            if constexpr (std::is_same_v<Tin0, half>) {
-                return __half2float(a) == __half2float(b);
-            } else if constexpr (std::is_same_v<Tin0, cuda_bfloat16>) {
-                return __bfloat162float(a) == __bfloat162float(b);
-            } else {
-                return a == b;
-            }
-        } else {
-            return false;
-        }
-    }
-} EqualOp;
-
-} // namespace op::equal::moore
-
-#endif // __EQUAL_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu
deleted file mode 100644
index 5bdf92e6c..000000000
--- a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu
+++ /dev/null
@@ -1,137 +0,0 @@
-#include <algorithm>
-#include <cstdint>
-#include <type_traits>
-
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-
-#include "../cuda/kernel.cuh"
-#include "equal_nvidia.cuh"
-
-namespace {
-
-template <typename Tout, typename Tin>
-INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) {
-    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t stride = blockDim.x * gridDim.x;
-    op::equal::cuda::EqualOp op{};
-    for (; idx < n; idx += stride) {
-        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
-    }
-}
-
-template <typename Tout, typename Tin>
-infiniStatus_t launchFastEqualKernel(size_t numel,
-                                     void *output,
-                                     const std::vector<const void *> &inputs,
-                                     void *stream) {
-    if (numel == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-    constexpr int block = 256;
-    int grid = static_cast<int>((numel + block - 1) / block);
-    grid = std::min(grid, 65535);
-    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
-    FastEqualKernel<Tout, Tin><<<grid, block, 0, cuda_stream>>>(
-        numel,
-        reinterpret_cast<Tout *>(output),
-        reinterpret_cast<const Tin *>(inputs[0]),
-        reinterpret_cast<const Tin *>(inputs[1]));
-    auto err = cudaGetLastError();
-    return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
-}
-
-} // namespace
-
-namespace op::equal::nvidia {
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-
-    const auto &a_desc = input_desc_vec.at(0);
-    auto compute_dtype = a_desc->dtype();
-    auto out_dtype = out_desc->dtype();
-
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
-                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
-
-    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    bool fast_path = _info.isOutputContiguous();
-    if (fast_path) {
-        const bool *input_contiguous = _info.getInputContiguous();
-        const bool *input_broadcasted = _info.getInputBroadcasted();
-        for (size_t i = 0; i < 2; ++i) {
-            fast_path &= input_contiguous[i] && !input_broadcasted[i];
-        }
-    }
-
-    if (fast_path) {
-        size_t numel = _info.getOutputSize();
-        switch (_dtype) {
-        case INFINI_DTYPE_F16:
-            return launchFastEqualKernel<bool, half>(numel, output, inputs, stream);
-        case INFINI_DTYPE_BF16:
-            return launchFastEqualKernel<bool, cuda_bfloat16>(numel, output, inputs, stream);
-        case INFINI_DTYPE_F32:
-            return launchFastEqualKernel<bool, float>(numel, output, inputs, stream);
-        case INFINI_DTYPE_I32:
-            return launchFastEqualKernel<bool, int32_t>(numel, output, inputs, stream);
-        case INFINI_DTYPE_I64:
-            return launchFastEqualKernel<bool, int64_t>(numel, output, inputs, stream);
-        case INFINI_DTYPE_F64:
-            return launchFastEqualKernel<bool, double>(numel, output, inputs, stream);
-        default:
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-    }
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_I32:
-        return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_I64:
-        return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::equal::nvidia
diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
deleted file mode 100644
index 6565a80b5..000000000
--- a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __EQUAL_CUDA_API_H__
-#define __EQUAL_CUDA_API_H__
-
-#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
-
-ELEMENTWISE_DESCRIPTOR(equal, nvidia)
-
-#endif
diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc
deleted file mode 100644
index 80da07e01..000000000
--- a/src/infiniop/ops/equal/operator.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/equal.h"
-
-#ifdef ENABLE_CPU_API
-#include "cpu/equal_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
-#include "nvidia/equal_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/equal_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/equal_kunlun.h"
-#endif
-#ifdef ENABLE_CAMBRICON_API
-#include "bang/equal_bang.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/equal_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor(
-    infiniopHandle_t handle,
-    infiniopEqualDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t c_desc,
-    infiniopTensorDescriptor_t a_desc,
-    infiniopTensorDescriptor_t b_desc) {
-
-#define CREATE(CASE, NAMESPACE)                                              \
-    case CASE:                                                               \
-        return op::equal::NAMESPACE::Descriptor::create(                     \
-            handle,                                                          \
-            reinterpret_cast<op::equal::NAMESPACE::Descriptor **>(desc_ptr), \
-            c_desc,                                                          \
-            {a_desc, b_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                 \
-    case CASE:                                                                               \
-        *size = reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        GET(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopEqual(
-    infiniopEqualDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *c,
-    const void *a,
-    const void *b,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                              \
-    case CASE:                                                                  \
-        return reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, c, {a, b}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                  \
-    case CASE:                                                                   \
-        delete reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        DELETE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
deleted file mode 100644
index f47198580..000000000
--- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-#include "hardswish_cpu.h"
-
-#include <cstddef>
-
-namespace op::hardswish::cpu {
-namespace {
-
-inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
-    return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
-}
-
-template <typename T>
-infiniStatus_t launch_contiguous_cpu(const op::elementwise::ElementwiseInfo &info,
-                                     void *output,
-                                     const std::vector<const void *> &inputs) {
-    const T *in = reinterpret_cast<const T *>(inputs[0]);
-    T *out = reinterpret_cast<T *>(output);
-    const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
-
-#pragma omp parallel for if (size > 1024)
-    for (ptrdiff_t i = 0; i < size; ++i) {
-        out[i] = HardSwishOp{}(in[i]);
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    const bool fast_path = can_use_contiguous_fast_path(_info);
-    if (fast_path) {
-        switch (_dtype) {
-        case INFINI_DTYPE_BF16:
-            return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
-        case INFINI_DTYPE_F16:
-            return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
-        case INFINI_DTYPE_F32:
-            return launch_contiguous_cpu<float>(_info, output, inputs);
-        case INFINI_DTYPE_F64:
-            return launch_contiguous_cpu<double>(_info, output, inputs);
-        default:
-            break;
-        }
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::hardswish::cpu
diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
deleted file mode 100644
index b853663aa..000000000
--- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef __HARDSWISH_CPU_H__
-#define __HARDSWISH_CPU_H__
-
-#include "../../../elementwise/cpu/elementwise_cpu.h"
-
-ELEMENTWISE_DESCRIPTOR(hardswish, cpu)
-
-#include <algorithm>
-#include <cmath>
-
-namespace op::hardswish::cpu {
-
-typedef struct HardSwishOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        const float x_f = utils::cast<float>(x);
-        const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
-        const float result = x_f * clamped * (1.0f / 6.0f);
-        return utils::cast<T>(result);
-    }
-} HardSwishOp;
-
-typedef struct HardSwishContiguousOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-
-        T three = static_cast<T>(3);
-        T zero = static_cast<T>(0);
-        T six = static_cast<T>(6);
-
-        T scale = static_cast<T>(0.16666667f);
-
-        T val = x + three;
-
-        val = std::max(zero, val);
-        val = std::min(six, val);
-
-        return x * val * scale;
-    }
-} HardSwishContiguousOp;
-
-} // namespace op::hardswish::cpu
-
-#endif
diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh
deleted file mode 100644
index 21b6a5f8d..000000000
--- a/src/infiniop/ops/hardswish/cuda/kernel.cuh
+++ /dev/null
@@ -1,86 +0,0 @@
-#ifndef __HARDSWISH_CUDA_H__
-#define __HARDSWISH_CUDA_H__
-
-#include <cmath>
-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-
-namespace op::hardswish::cuda {
-
-typedef struct HardSwishOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-
-        if constexpr (std::is_same_v<T, half2>) {
-
-            const half2 three = __float2half2_rn(3.0f);
-            const half2 scale = __float2half2_rn(0.16666667f);
-
-            half2 val = __hadd2(x, three);
-
-#if defined(ENABLE_ILUVATAR_API)
-
-            float2 val_f = __half22float2(val);
-            val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
-            val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
-            val = __floats2half2_rn(val_f.x, val_f.y);
-#else
-
-            const half2 zero = __float2half2_rn(0.0f);
-            const half2 six = __float2half2_rn(6.0f);
-
-#if __CUDA_ARCH__ >= 800
-
-            val = __hmin2(__hmax2(val, zero), six);
-#else
-
-            val = __hmax2(val, zero);
-            val = __hmin2(val, six);
-#endif
-#endif
-
-            return __hmul2(__hmul2(x, val), scale);
-
-        }
-
-        else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-
-            const float x_f = __bfloat162float(x);
-
-            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
-            return __float2bfloat16(x_f * val * 0.16666667f);
-
-        }
-
-        else if constexpr (std::is_same_v<T, half>) {
-            const float x_f = __half2float(x);
-            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
-            return __float2half(x_f * val * 0.16666667f);
-
-        }
-
-        else if constexpr (std::is_same_v<T, float>) {
-
-            const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
-            return x * val * 0.16666667f;
-
-        }
-
-        else if constexpr (std::is_same_v<T, double>) {
-            const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
-            return x * val * (1.0 / 6.0);
-        }
-    }
-} HardSwishOp;
-
-} // namespace op::hardswish::cuda
-
-#endif
diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h
deleted file mode 100644
index 16b131aa9..000000000
--- a/src/infiniop/ops/hardswish/metax/hardswish_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __HARDSWISH_METAX_API_H__
-#define __HARDSWISH_METAX_API_H__
-
-#include "../../../elementwise/metax/elementwise_metax_api.h"
-
-ELEMENTWISE_DESCRIPTOR(hardswish, metax)
-
-#endif // __HARDSWISH_METAX_API_H__
diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca
deleted file mode 100644
index fc57a9b20..000000000
--- a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "hardswish_metax.h"
-
-#include "../../../elementwise/metax/elementwise_metax.h"
-
-#include "../cuda/kernel.cuh"
-
-namespace op::hardswish::metax {
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-} // namespace op::hardswish::metax
diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.h b/src/infiniop/ops/hardswish/moore/hardswish_moore.h
deleted file mode 100644
index e5861a158..000000000
--- a/src/infiniop/ops/hardswish/moore/hardswish_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __HARDSWISH_MOORE_API_H__
-#define __HARDSWISH_MOORE_API_H__
-
-#include "../../../elementwise/moore/elementwise_moore_api.h"
-
-ELEMENTWISE_DESCRIPTOR(hardswish, moore)
-
-#endif // __HARDSWISH_MOORE_API_H__
diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu
deleted file mode 100644
index 3a1290b35..000000000
--- a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu
+++ /dev/null
@@ -1,118 +0,0 @@
-#include "hardswish_moore.h"
-
-#include "../../../elementwise/moore/elementwise_moore.h"
-
-#include "hardswish_moore_kernel.h"
-
-namespace op::hardswish::moore {
-namespace {
-
-inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
-    return info.isOutputContiguous() && info.getInputSize() == 1 &&
-           info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
-}
-
-template <typename T>
-INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
-    const auto op = op::hardswish::moore::HardSwishOp{};
-    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t stride = blockDim.x * gridDim.x;
-    for (; idx < numel; idx += stride) {
-        out[idx] = op(in[idx]);
-    }
-}
-
-template <typename T>
-infiniStatus_t launch_fast_path(size_t numel,
-                                void *output,
-                                const std::vector<const void *> &inputs,
-                                void *stream) {
-    if (numel == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-
-    constexpr int kBlockSize = 256;
-    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
-    if (grid > 65535) {
-        grid = 65535;
-    }
-
-    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
-    hardswish_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
-        numel,
-        reinterpret_cast<T *>(output),
-        reinterpret_cast<const T *>(inputs[0]));
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    // create MOORE elementwise descriptor
-    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    const bool fast_path = can_use_contiguous_fast_path(_info);
-    if (fast_path) {
-        switch (_dtype) {
-        case INFINI_DTYPE_BF16:
-            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
-        case INFINI_DTYPE_F16:
-            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
-        case INFINI_DTYPE_F32:
-            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
-        case INFINI_DTYPE_F64:
-            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
-        default:
-            break;
-        }
-    }
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::hardswish::moore
diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h
deleted file mode 100644
index 60e3dbc60..000000000
--- a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __HARDSWISH_MOORE_KERNEL_H__
-#define __HARDSWISH_MOORE_KERNEL_H__
-
-#include <cmath>
-#include <type_traits>
-
-namespace op::hardswish::moore {
-
-typedef struct HardSwishOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half>) {
-            float x_f = __half2float(x);
-            float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
-            return __float2half(x_f * val * 0.16666667f);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float x_f = __bfloat162float(x);
-            float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
-            return __float2bfloat16_rn(x_f * val * 0.16666667f);
-        } else if constexpr (std::is_same_v<T, float>) {
-            float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
-            return x * val * 0.16666667f;
-        } else if constexpr (std::is_same_v<T, double>) {
-            double val = fmin(fmax(x + 3.0, 0.0), 6.0);
-            return x * val * (1.0 / 6.0);
-        } else {
-            float x_f = static_cast<float>(x);
-            float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
-            return static_cast<T>(x_f * val * 0.16666667f);
-        }
-    }
-} HardSwishOp;
-
-} // namespace op::hardswish::moore
-
-#endif // __HARDSWISH_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu
deleted file mode 100644
index f7736a7fd..000000000
--- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-
-#include "../cuda/kernel.cuh"
-#include "hardswish_nvidia.cuh"
-
-#include <cuda_runtime.h>
-
-namespace op::hardswish::nvidia {
-namespace {
-
-inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
-    return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
-}
-
-template <typename T>
-__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
-    const auto op = op::hardswish::cuda::HardSwishOp{};
-    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    while (idx < numel) {
-        out[idx] = op(in[idx]);
-        idx += blockDim.x * gridDim.x;
-    }
-}
-
-template <typename T>
-infiniStatus_t launch_fast_path(size_t numel,
-                                void *output,
-                                const std::vector<const void *> &inputs,
-                                void *stream) {
-    if (numel == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-
-    constexpr int BLOCK_SIZE = 256;
-    int grid = static_cast<int>((numel + BLOCK_SIZE - 1) / BLOCK_SIZE);
-    grid = std::min(grid, 65535);
-
-    auto *out_ptr = reinterpret_cast<T *>(output);
-    auto *in_ptr = reinterpret_cast<const T *>(inputs[0]);
-    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
-
-    hardswish_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(numel, out_ptr, in_ptr);
-    cudaError_t err = cudaGetLastError();
-    return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
-}
-
-} // namespace
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    const bool fast_path = can_use_contiguous_fast_path(_info);
-    if (fast_path) {
-        switch (_dtype) {
-        case INFINI_DTYPE_BF16:
-            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
-        case INFINI_DTYPE_F16:
-            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
-        case INFINI_DTYPE_F32:
-            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
-        case INFINI_DTYPE_F64:
-            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
-        default:
-            break;
-        }
-    }
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::hardswish::nvidia
diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh
deleted file mode 100644
index eac0dd994..000000000
--- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __HARDSWISH_CUDA_API_H__
-#define __HARDSWISH_CUDA_API_H__
-
-#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
-
-ELEMENTWISE_DESCRIPTOR(hardswish, nvidia)
-
-#endif
diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc
deleted file mode 100644
index ddce97f16..000000000
--- a/src/infiniop/ops/hardswish/operator.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/hardswish.h"
-
-#ifdef ENABLE_CPU_API
-#include "cpu/hardswish_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
-#include "nvidia/hardswish_nvidia.cuh"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/hardswish_moore.h"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/hardswish_metax.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor(
-    infiniopHandle_t handle,
-    infiniopHardSwishDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc) {
-
-#define CREATE(CASE, NAMESPACE)                                                  \
-    case CASE:                                                                   \
-        return op::hardswish::NAMESPACE::Descriptor::create(                     \
-            handle,                                                              \
-            reinterpret_cast<op::hardswish::NAMESPACE::Descriptor **>(desc_ptr), \
-            output_desc,                                                         \
-            {input_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                     \
-    case CASE:                                                                                   \
-        *size = reinterpret_cast<op::hardswish::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopHardSwish(
-    infiniopHardSwishDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                                  \
-    case CASE:                                                                      \
-        return reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, output, {input}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                      \
-    case CASE:                                                                       \
-        delete reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc
deleted file mode 100644
index 1bd276308..000000000
--- a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-#include "hardtanh_cpu.h"
-
-#include <type_traits>
-
-namespace op::hardtanh::cpu {
-
-Descriptor::Descriptor(infiniDtype_t dtype,
-                       op::elementwise::ElementwiseInfo info,
-                       size_t workspace_size,
-                       infiniDevice_t device_type,
-                       int device_id,
-                       float min_val,
-                       float max_val)
-    : InfiniopDescriptor{device_type, device_id},
-      _dtype(dtype),
-      _info(std::move(info)),
-      _workspace_size(workspace_size),
-      _min_val(min_val),
-      _max_val(max_val) {}
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-    float min_val,
-    float max_val) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
-    CHECK_RESULT(info_result);
-
-    *desc_ptr = new Descriptor(
-        dtype,
-        info_result.take(),
-        0,
-        handle->device,
-        handle->device_id,
-        min_val,
-        max_val);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-template <typename T>
-static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info,
-                                        void *output,
-                                        const std::vector<const void *> &inputs,
-                                        float min_val,
-                                        float max_val) {
-    if (inputs.empty()) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-
-    T *out = reinterpret_cast<T *>(output);
-    const T *in = reinterpret_cast<const T *>(inputs[0]);
-    const auto ndim = info.getNdim();
-    const auto *output_shape = info.getOutputShape();
-    const auto *output_strides = info.getOutputStrides();
-    const auto *input_shape = info.getInputShape(0);
-    const auto *input_strides = info.getInputStrides(0);
-    const auto *input_contiguous = info.getInputContiguous();
-    ptrdiff_t output_size = info.getOutputSize();
-
-#pragma omp parallel for if (output_size > 1024)
-    for (ptrdiff_t i = 0; i < output_size; ++i) {
-        const size_t out_idx = info.isOutputContiguous()
-                                 ? static_cast<size_t>(i)
-                                 : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides);
-        const size_t in_idx = input_contiguous[0]
-                                ? static_cast<size_t>(i)
-                                : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides);
-
-        if constexpr (std::is_same_v<T, fp16_t> || std::is_same_v<T, bf16_t>) {
-            float value = utils::cast<float>(in[in_idx]);
-            float clamped = HardTanhOp{}(value, min_val, max_val);
-            out[out_idx] = utils::cast<T>(clamped);
-        } else {
-            out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val);
-        }
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    (void)workspace;
-    (void)workspace_size;
-    (void)stream;
-
-    if (inputs.size() != 1) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return launchCpuHardTanh<bf16_t>(_info, output, inputs, _min_val, _max_val);
-    case INFINI_DTYPE_F16:
-        return launchCpuHardTanh<fp16_t>(_info, output, inputs, _min_val, _max_val);
-    case INFINI_DTYPE_F32:
-        return launchCpuHardTanh<float>(_info, output, inputs, _min_val, _max_val);
-    case INFINI_DTYPE_F64:
-        return launchCpuHardTanh<double>(_info, output, inputs, _min_val, _max_val);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-} // namespace op::hardtanh::cpu
diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h
deleted file mode 100644
index 09bfb340c..000000000
--- a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef __HARDTANH_CPU_H__
-#define __HARDTANH_CPU_H__
-
-#include "../../../elementwise/cpu/elementwise_cpu.h"
-#include <algorithm>
-
-namespace op::hardtanh::cpu {
-
-class Descriptor final : public InfiniopDescriptor {
-    infiniDtype_t _dtype;
-    op::elementwise::ElementwiseInfo _info;
-    size_t _workspace_size;
-    float _min_val;
-    float _max_val;
-
-    Descriptor(infiniDtype_t dtype,
-               op::elementwise::ElementwiseInfo info,
-               size_t workspace_size,
-               infiniDevice_t device_type,
-               int device_id,
-               float min_val,
-               float max_val);
-
-public:
-    ~Descriptor();
-
-    size_t workspaceSize() const { return _workspace_size; }
-
-    static infiniStatus_t create(
-        infiniopHandle_t handle,
-        Descriptor **desc_ptr,
-        infiniopTensorDescriptor_t out_desc,
-        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-        float min_val,
-        float max_val);
-
-    infiniStatus_t calculate(
-        void *workspace,
-        size_t workspace_size,
-        void *output,
-        std::vector<const void *> inputs,
-        void *stream) const;
-
-    float minVal() const { return _min_val; }
-    float maxVal() const { return _max_val; }
-};
-
-typedef struct HardTanhOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x, float min_val, float max_val) const {
-        T low = static_cast<T>(min_val);
-        T high = static_cast<T>(max_val);
-        T val = x < low ? low : x;
-        return val > high ? high : val;
-    }
-} HardTanhOp;
-
-} // namespace op::hardtanh::cpu
-
-#endif
diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh
deleted file mode 100644
index 28987f82c..000000000
--- a/src/infiniop/ops/hardtanh/cuda/kernel.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __HARDTANH_CUDA_H__
-#define __HARDTANH_CUDA_H__
-
-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
-#include <type_traits>
-
-namespace op::hardtanh::cuda {
-
-typedef struct HardTanhOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
-        if constexpr (std::is_same_v<T, half2>) {
-
-            float2 x_f2 = __half22float2(x);
-            x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x));
-            x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y));
-            return __float22half2_rn(x_f2);
-
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-
-            float x_f = __bfloat162float(x);
-            return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f)));
-
-        } else if constexpr (std::is_same_v<T, half>) {
-
-            float x_f = __half2float(x);
-            return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
-
-        } else if constexpr (std::is_same_v<T, float>) {
-
-            return fminf(max_val, fmaxf(min_val, x));
-
-        } else if constexpr (std::is_same_v<T, double>) {
-
-            return fmin((double)max_val, fmax((double)min_val, x));
-        }
-    }
-} HardTanhOp;
-
-} // namespace op::hardtanh::cuda
-
-#endif
diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h
deleted file mode 100644
index 182157116..000000000
--- a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef __HARDTANH_METAX_API_H__
-#define __HARDTANH_METAX_API_H__
-
-#include "../../../elementwise/metax/elementwise_metax_api.h"
-
-namespace op::hardtanh::metax {
-
-class Descriptor final : public InfiniopDescriptor {
-    infiniDtype_t _dtype;
-    op::elementwise::ElementwiseInfo _info;
-    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
-    size_t _workspace_size;
-    float _min_val;
-    float _max_val;
-
-    Descriptor(infiniDtype_t dtype,
-               op::elementwise::ElementwiseInfo info,
-               op::elementwise::metax::DeviceImpl *device_info,
-               size_t workspace_size,
-               infiniDevice_t device_type,
-               int device_id,
-               float min_val,
-               float max_val);
-
-public:
-    ~Descriptor();
-
-    size_t workspaceSize() const { return _workspace_size; }
-
-    static infiniStatus_t create(
-        infiniopHandle_t handle,
-        Descriptor **desc_ptr,
-        infiniopTensorDescriptor_t out_desc,
-        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-        float min_val,
-        float max_val);
-
-    infiniStatus_t calculate(
-        void *workspace,
-        size_t workspace_size,
-        void *output,
-        std::vector<const void *> inputs,
-        void *stream) const;
-};
-
-} // namespace op::hardtanh::metax
-
-#endif // __HARDTANH_METAX_API_H__
diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca
deleted file mode 100644
index 596316e23..000000000
--- a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca
+++ /dev/null
@@ -1,95 +0,0 @@
-#include "hardtanh_metax.h"
-
-#include "../../../elementwise/metax/elementwise_metax.h"
-
-#include "../cuda/kernel.cuh"
-
-namespace op::hardtanh::metax {
-
-Descriptor::Descriptor(infiniDtype_t dtype,
-                       op::elementwise::ElementwiseInfo info,
-                       op::elementwise::metax::DeviceImpl *device_info,
-                       size_t workspace_size,
-                       infiniDevice_t device_type,
-                       int device_id,
-                       float min_val,
-                       float max_val)
-    : InfiniopDescriptor{device_type, device_id},
-      _dtype(dtype),
-      _info(std::move(info)),
-      _device_info(device_info),
-      _workspace_size(workspace_size),
-      _min_val(min_val),
-      _max_val(max_val) {}
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-    float min_val,
-    float max_val) {
-
-    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
-    CHECK_RESULT(info_result);
-    auto info = info_result.take();
-    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
-
-    auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal());
-    CHECK_RESULT(device_impl_result);
-
-    *desc_ptr = new Descriptor(
-        dtype,
-        std::move(info),
-        device_impl_result.take(),
-        workspace_size,
-        handle->device,
-        handle->device_id,
-        min_val,
-        max_val);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(
-            _info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::HardTanhOp, half>(
-            _info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::HardTanhOp, float>(
-            _info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::HardTanhOp, double>(
-            _info, workspace, output, inputs, stream, _min_val, _max_val);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-}
-
-} // namespace op::hardtanh::metax
diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h
deleted file mode 100644
index 470790d52..000000000
--- a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __HARDTANH_MOORE_API_H__
-#define __HARDTANH_MOORE_API_H__
-
-#include "../../../elementwise/moore/elementwise_moore_api.h"
-
-namespace op::hardtanh::moore {
-
-class Descriptor final : public InfiniopDescriptor {
-    infiniDtype_t _dtype;
-    op::elementwise::ElementwiseInfo _info;
-    std::unique_ptr<op::elementwise::moore::DeviceImpl> _device_info;
-    size_t _workspace_size;
-    float _min_val;
-    float _max_val;
-
-    Descriptor(infiniDtype_t dtype,
-               op::elementwise::ElementwiseInfo info,
-               op::elementwise::moore::DeviceImpl *device_info,
-               size_t workspace_size,
-               infiniDevice_t device_type,
-               int device_id,
-               float min_val,
-               float max_val);
-
-public:
-    ~Descriptor();
-
-    size_t workspaceSize() const { return _workspace_size; }
-
-    static infiniStatus_t create(
-        infiniopHandle_t handle,
-        Descriptor **desc_ptr,
-        infiniopTensorDescriptor_t out_desc,
-        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-        float min_val,
-        float max_val);
-
-    infiniStatus_t calculate(
-        void *workspace,
-        size_t workspace_size,
-        void *output,
-        std::vector<const void *> inputs,
-        void *stream) const;
-
-    float minVal() const { return _min_val; }
-    float maxVal() const { return _max_val; }
-};
-
-} // namespace op::hardtanh::moore
-
-#endif // __HARDTANH_MOORE_API_H__
diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu
deleted file mode 100644
index 40e3dbe41..000000000
--- a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu
+++ /dev/null
@@ -1,158 +0,0 @@
-#include "hardtanh_moore.h"
-
-#include "../../../elementwise/moore/elementwise_moore.h"
-
-#include "hardtanh_moore_kernel.h"
-
-namespace op::hardtanh::moore {
-namespace {
-
-inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
-    return info.isOutputContiguous() && info.getInputSize() == 1 &&
-           info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
-}
-
-template <typename T>
-INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel,
-                                                 T *out,
-                                                 const T *in,
-                                                 float min_val,
-                                                 float max_val) {
-    const auto op = op::hardtanh::moore::HardTanhOp{};
-    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t stride = blockDim.x * gridDim.x;
-    for (; idx < numel; idx += stride) {
-        out[idx] = op(in[idx], min_val, max_val);
-    }
-}
-
-template <typename T>
-infiniStatus_t launch_fast_path(size_t numel,
-                                void *output,
-                                const std::vector<const void *> &inputs,
-                                void *stream,
-                                float min_val,
-                                float max_val) {
-    if (numel == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-
-    constexpr int kBlockSize = 256;
-    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
-    if (grid > 65535) {
-        grid = 65535;
-    }
-
-    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
-    hardtanh_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
-        numel,
-        reinterpret_cast<T *>(output),
-        reinterpret_cast<const T *>(inputs[0]),
-        min_val,
-        max_val);
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-Descriptor::Descriptor(infiniDtype_t dtype,
-                       op::elementwise::ElementwiseInfo info,
-                       op::elementwise::moore::DeviceImpl *device_info,
-                       size_t workspace_size,
-                       infiniDevice_t device_type,
-                       int device_id,
-                       float min_val,
-                       float max_val)
-    : InfiniopDescriptor{device_type, device_id},
-      _dtype(dtype),
-      _info(std::move(info)),
-      _device_info(device_info),
-      _workspace_size(workspace_size),
-      _min_val(min_val),
-      _max_val(max_val) {}
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-    float min_val,
-    float max_val) {
-
-    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
-    CHECK_RESULT(info_result);
-    auto info = info_result.take();
-    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
-
-    auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal());
-    CHECK_RESULT(device_impl_result);
-
-    *desc_ptr = new Descriptor(
-        dtype,
-        std::move(info),
-        device_impl_result.take(),
-        workspace_size,
-        handle->device,
-        handle->device_id,
-        min_val,
-        max_val);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    const bool fast_path = can_use_contiguous_fast_path(_info);
-    if (fast_path) {
-        switch (_dtype) {
-        case INFINI_DTYPE_BF16:
-            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        case INFINI_DTYPE_F16:
-            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        case INFINI_DTYPE_F32:
-            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        case INFINI_DTYPE_F64:
-            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        default:
-            break;
-        }
-    }
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::hardtanh::moore
diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h
deleted file mode 100644
index db0a3c024..000000000
--- a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __HARDTANH_MOORE_KERNEL_H__
-#define __HARDTANH_MOORE_KERNEL_H__
-
-#include <cmath>
-#include <type_traits>
-
-namespace op::hardtanh::moore {
-
-typedef struct HardTanhOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
-        if constexpr (std::is_same_v<T, half>) {
-            float x_f = __half2float(x);
-            return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float x_f = __bfloat162float(x);
-            return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return fminf(max_val, fmaxf(min_val, x));
-        } else if constexpr (std::is_same_v<T, double>) {
-            return fmin((double)max_val, fmax((double)min_val, x));
-        } else {
-            float x_f = static_cast<float>(x);
-            return static_cast<T>(fminf(max_val, fmaxf(min_val, x_f)));
-        }
-    }
-} HardTanhOp;
-
-} // namespace op::hardtanh::moore
-
-#endif // __HARDTANH_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu
deleted file mode 100644
index 31ba489ab..000000000
--- a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu
+++ /dev/null
@@ -1,150 +0,0 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-
-#include "../cuda/kernel.cuh"
-#include "hardtanh_nvidia.cuh"
-
-#include <cuda_runtime.h>
-
-namespace op::hardtanh::nvidia {
-namespace {
-
-inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
-    return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
-}
-
-template <typename T>
-__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in, float min_val, float max_val) {
-    const auto op = op::hardtanh::cuda::HardTanhOp{};
-    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    while (idx < numel) {
-        out[idx] = op(in[idx], min_val, max_val);
-        idx += blockDim.x * gridDim.x;
-    }
-}
-
-template <typename T>
-infiniStatus_t launch_fast_path(size_t numel,
-                                void *output,
-                                const std::vector<const void *> &inputs,
-                                void *stream,
-                                float min_val,
-                                float max_val) {
-    if (numel == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-
-    constexpr int BLOCK_SIZE = 256;
-    int grid = static_cast<int>((numel + BLOCK_SIZE - 1) / BLOCK_SIZE);
-    grid = std::min(grid, 65535);
-
-    auto *out_ptr = reinterpret_cast<T *>(output);
-    auto *in_ptr = reinterpret_cast<const T *>(inputs[0]);
-    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
-
-    hardtanh_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(numel, out_ptr, in_ptr, min_val, max_val);
-    cudaError_t err = cudaGetLastError();
-    return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
-}
-
-} // namespace
-
-Descriptor::Descriptor(infiniDtype_t dtype,
-                       op::elementwise::ElementwiseInfo info,
-                       op::elementwise::nvidia::DeviceImpl *device_info,
-                       size_t workspace_size,
-                       infiniDevice_t device_type,
-                       int device_id,
-                       float min_val,
-                       float max_val)
-    : InfiniopDescriptor{device_type, device_id},
-      _dtype(dtype),
-      _info(std::move(info)),
-      _device_info(device_info),
-      _workspace_size(workspace_size),
-      _min_val(min_val),
-      _max_val(max_val) {}
-
-Descriptor::~Descriptor() = default;
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-    float min_val,
-    float max_val) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
-    CHECK_RESULT(info_result);
-    auto info = info_result.take();
-    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
-
-    auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal());
-    CHECK_RESULT(device_impl_result);
-
-    *desc_ptr = new Descriptor(
-        dtype,
-        std::move(info),
-        device_impl_result.take(),
-        workspace_size,
-        handle->device,
-        handle->device_id,
-        min_val,
-        max_val);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    const bool fast_path = can_use_contiguous_fast_path(_info);
-    if (fast_path) {
-        switch (_dtype) {
-        case INFINI_DTYPE_BF16:
-            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        case INFINI_DTYPE_F16:
-            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        case INFINI_DTYPE_F32:
-            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        case INFINI_DTYPE_F64:
-            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
-        default:
-            break;
-        }
-    }
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::hardtanh::nvidia
diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh
deleted file mode 100644
index ebd27d80e..000000000
--- a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __HARDTANH_CUDA_API_H__
-#define __HARDTANH_CUDA_API_H__
-
-#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
-
-namespace op::hardtanh::nvidia {
-
-class Descriptor final : public InfiniopDescriptor {
-    infiniDtype_t _dtype;
-    op::elementwise::ElementwiseInfo _info;
-    std::unique_ptr<op::elementwise::nvidia::DeviceImpl> _device_info;
-    size_t _workspace_size;
-    float _min_val;
-    float _max_val;
-
-    Descriptor(infiniDtype_t dtype,
-               op::elementwise::ElementwiseInfo info,
-               op::elementwise::nvidia::DeviceImpl *device_info,
-               size_t workspace_size,
-               infiniDevice_t device_type,
-               int device_id,
-               float min_val,
-               float max_val);
-
-public:
-    ~Descriptor();
-
-    size_t workspaceSize() const { return _workspace_size; }
-
-    static infiniStatus_t create(
-        infiniopHandle_t handle,
-        Descriptor **desc_ptr,
-        infiniopTensorDescriptor_t out_desc,
-        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
-        float min_val,
-        float max_val);
-
-    infiniStatus_t calculate(
-        void *workspace,
-        size_t workspace_size,
-        void *output,
-        std::vector<const void *> inputs,
-        void *stream) const;
-
-    float minVal() const { return _min_val; }
-    float maxVal() const { return _max_val; }
-};
-
-} // namespace op::hardtanh::nvidia
-
-#endif
diff --git a/src/infiniop/ops/hardtanh/operator.cc b/src/infiniop/ops/hardtanh/operator.cc
deleted file mode 100644
index f3c782224..000000000
--- a/src/infiniop/ops/hardtanh/operator.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/hardtanh.h"
-
-#ifdef ENABLE_CPU_API
-#include "cpu/hardtanh_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
-#include "nvidia/hardtanh_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/hardtanh_metax.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/hardtanh_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor(
-    infiniopHandle_t handle,
-    infiniopHardTanhDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    float min_val,
-    float max_val) {
-
-#define CREATE(CASE, NAMESPACE)                                                 \
-    case CASE:                                                                  \
-        return op::hardtanh::NAMESPACE::Descriptor::create(                     \
-            handle,                                                             \
-            reinterpret_cast<op::hardtanh::NAMESPACE::Descriptor **>(desc_ptr), \
-            output_desc,                                                        \
-            {input_desc},                                                       \
-            min_val,                                                            \
-            max_val)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                    \
-    case CASE:                                                                                  \
-        *size = reinterpret_cast<op::hardtanh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopHardTanh(
-    infiniopHardTanhDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                                 \
-    case CASE:                                                                     \
-        return reinterpret_cast<const op::hardtanh::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, output, {input}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                     \
-    case CASE:                                                                      \
-        delete reinterpret_cast<const op::hardtanh::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/sum/cpu/sum_cpu.cc b/src/infiniop/ops/sum/cpu/sum_cpu.cc
deleted file mode 100644
index cbc9c6fe0..000000000
--- a/src/infiniop/ops/sum/cpu/sum_cpu.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-#include "sum_cpu.h"
-#include "../../../../utils.h"
-#include "../../../devices/cpu/common_cpu.h"
-namespace op::sum::cpu {
-
-Descriptor::~Descriptor() {}
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-
-    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-template <typename T>
-infiniStatus_t calculateSum(
-    const SumInfo *info,
-    T *output,
-    const T *input) {
-    if (info->reduce_dim_size == info->permuted_input_shape.size()) { // 规约到标量
-        float tempSum = 0.;
-        for (size_t index = 0; index < info->input_size; index++) {
-            size_t input_offset = op::common_cpu::indexToOffset(index, info->permuted_input_shape.size(), info->permuted_input_shape.data(), info->permuted_input_strides.data());
-            tempSum += utils::cast<float>(input[input_offset]);
-        }
-        output[0] = utils::cast<T>(tempSum);
-        return INFINI_STATUS_SUCCESS;
-    } else {
-        for (size_t i = 0; i < info->output_size; i++) {
-            size_t output_offset = op::common_cpu::indexToOffset(i, info->output_shape.size(), info->output_shape.data(), info->output_strides.data());
-            float tempSum = 0.;
-            for (size_t j = 0; j < info->reduce_num; j++) {
-                size_t input_offset = op::common_cpu::indexToOffset(j + i * info->reduce_num, info->permuted_input_shape.size(), info->permuted_input_shape.data(), info->permuted_input_strides.data());
-                tempSum += utils::cast<float>(input[input_offset]);
-            }
-            output[output_offset] = utils::cast<T>(tempSum);
-        }
-        return INFINI_STATUS_SUCCESS;
-    }
-}
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream) const {
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return calculateSum<fp16_t>(&_info, (fp16_t *)output, reinterpret_cast<const fp16_t *>(input));
-    case INFINI_DTYPE_F32:
-        return calculateSum<float>(&_info, (float *)output, reinterpret_cast<const float *>(input));
-    case INFINI_DTYPE_BF16:
-        return calculateSum<bf16_t>(&_info, (bf16_t *)output, reinterpret_cast<const bf16_t *>(input));
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::sum::cpu
diff --git a/src/infiniop/ops/sum/cpu/sum_cpu.h b/src/infiniop/ops/sum/cpu/sum_cpu.h
deleted file mode 100644
index 26d6789d1..000000000
--- a/src/infiniop/ops/sum/cpu/sum_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_SUM_CPU_H__
-#define __INFINIOP_SUM_CPU_H__
-
-#include "../sum_desc.h"
-
-DESCRIPTOR(cpu);
-
-#endif // __INFINIOP_SUM_CPU_H__
diff --git a/src/infiniop/ops/sum/cuda/kernel.cuh b/src/infiniop/ops/sum/cuda/kernel.cuh
deleted file mode 100644
index 5808446b4..000000000
--- a/src/infiniop/ops/sum/cuda/kernel.cuh
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef __SUM_CUDA_H__
-#define __SUM_CUDA_H__
-
-__forceinline__ __device__ __host__ size_t
-indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
-template <size_t BLOCK_SIZE, typename Tdata, typename Tcompute>
-__global__ void sumAllKernel(
-    Tcompute *output,
-    const Tdata *input,
-    size_t input_size,
-    size_t permuted_input_shape_size,
-    size_t *permuted_input_shape,
-    ptrdiff_t *permuted_input_strides) {
-    __shared__ Tcompute s_data[BLOCK_SIZE];
-    size_t tid = threadIdx.x;
-    size_t idx = tid + blockIdx.x * blockDim.x;
-    if (idx < input_size) {
-        size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
-        s_data[tid] = static_cast<Tcompute>(input[input_offset]);
-    } else {
-        s_data[tid] = static_cast<Tcompute>(0.f);
-    }
-    __syncthreads();
-    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            s_data[tid] += s_data[tid + s];
-        }
-        __syncthreads();
-    }
-
-    if (tid == 0) {
-        atomicAdd(output, s_data[0]);
-    }
-}
-
-template <size_t BLOCK_SIZE, typename T>
-__global__ void sumKernel(
-    T *output,
-    const T *input,
-    size_t permuted_input_shape_size,
-    size_t output_shape_size,
-    size_t output_size,
-    size_t reduce_num,
-    size_t *permuted_input_shape,
-    size_t *output_shape,
-    ptrdiff_t *permuted_input_strides,
-    ptrdiff_t *output_strides) {
-    size_t tid = threadIdx.x;
-    size_t idx = tid + blockIdx.x * blockDim.x;
-    if (idx >= output_size) {
-        return;
-    }
-    size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
-    float tempSum = static_cast<float>(0.f);
-    for (size_t i = 0; i < reduce_num; i++) {
-        size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
-        tempSum += static_cast<float>(input[input_offset]);
-    }
-    output[output_index] = static_cast<T>(tempSum);
-}
-
-#endif // __SUM_CUDA_H__
diff --git a/src/infiniop/ops/sum/info.h b/src/infiniop/ops/sum/info.h
deleted file mode 100644
index a69af8b44..000000000
--- a/src/infiniop/ops/sum/info.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef __SUM_INFO_H__
-#define __SUM_INFO_H__
-#include "../../../utils.h"
-#include "../../tensor.h"
-#include <algorithm>
-#include <cstddef>
-#include <vector>
-
-namespace op::sum {
-class SumInfo {
-    SumInfo() = default;
-
-public:
-    infiniDtype_t dtype;
-    std::vector<size_t> permuted_input_shape; // need to permute
-    std::vector<size_t> output_shape;
-    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
-    std::vector<ptrdiff_t> output_strides;
-    size_t reduce_dim_size; // reduce dim size
-    size_t reduce_num;      // number of elements to reduce for each output element
-    size_t input_size;      // total number of input elements
-    size_t output_size;     // total number of output elements
-    static utils::Result<SumInfo> create(
-        infiniopTensorDescriptor_t output_desc,
-        infiniopTensorDescriptor_t input_desc,
-        size_t *dim,
-        size_t dim_size,
-        bool keepdim) {
-        auto input_shape = input_desc->shape();
-        auto input_strides = input_desc->strides();
-        size_t input_ndim = input_desc->ndim();
-        size_t reduce_num = 1;
-        for (size_t i = 0; i < dim_size; i++) {
-            reduce_num *= input_shape[dim[i]];
-        }
-        std::vector<size_t> permute_order;
-        for (size_t i = 0; i < input_ndim; i++) {
-            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
-                permute_order.push_back(i);
-            }
-        }
-        for (size_t i = 0; i < dim_size; i++) {
-            permute_order.push_back(dim[i]);
-        }
-        std::vector<size_t> permuted_input_shape;
-        std::vector<ptrdiff_t> permuted_input_strides;
-        for (size_t i = 0; i < permute_order.size(); i++) {
-            permuted_input_shape.push_back(input_shape[permute_order[i]]);
-            permuted_input_strides.push_back(input_strides[permute_order[i]]);
-        }
-        return utils::Result<SumInfo>(SumInfo{input_desc->dtype(),
-                                              permuted_input_shape,
-                                              output_desc->shape(),
-                                              permuted_input_strides,
-                                              output_desc->strides(),
-                                              dim_size,
-                                              reduce_num,
-                                              input_desc->numel(),
-                                              output_desc->numel()});
-    }
-};
-} // namespace op::sum
-
-#endif
diff --git a/src/infiniop/ops/sum/metax/sum_metax.h b/src/infiniop/ops/sum/metax/sum_metax.h
deleted file mode 100644
index 5e8e6754c..000000000
--- a/src/infiniop/ops/sum/metax/sum_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __SUM_METAX_H__
-#define __SUM_METAX_H__
-
-#include "../sum_desc.h"
-
-DESCRIPTOR(metax);
-
-#endif
diff --git a/src/infiniop/ops/sum/metax/sum_metax.maca b/src/infiniop/ops/sum/metax/sum_metax.maca
deleted file mode 100644
index 5affe779f..000000000
--- a/src/infiniop/ops/sum/metax/sum_metax.maca
+++ /dev/null
@@ -1,116 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "sum_metax.h"
-
-namespace op::sum::metax {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, typename T>
-infiniStatus_t launchKernel(
-    const SumInfo &info,
-    T *output, const T *input,
-    hcStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-
-    if (info.reduce_num == input_size) {
-        T zero = static_cast<T>(0.0f);
-        CHECK_METAX(hcMemcpyAsync(output, &zero, sizeof(T), hcMemcpyHostToDevice, stream));
-        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        sumAllKernel<BLOCK_SIZE, T, T><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(T), stream>>>(
-            output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
-    } else {
-        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        sumKernel<BLOCK_SIZE, T><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            output, input, input_ndim, output_ndim, output_size, reduce_num,
-            permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream_) const {
-
-    hcStream_t stream = (hcStream_t)stream_;
-
-#define CALCULATE_SUM(BLOCK_SIZE, T)   \
-    launchKernel<BLOCK_SIZE, T>(       \
-        _info,                         \
-        (T *)output, (const T *)input, \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE)              \
-    {                                                          \
-        if (_info.dtype == INFINI_DTYPE_BF16)                  \
-            return CALCULATE_SUM(BLOCK_SIZE, __hpcc_bfloat16); \
-        else if (_info.dtype == INFINI_DTYPE_F16)              \
-            return CALCULATE_SUM(BLOCK_SIZE, half);            \
-        else if (_info.dtype == INFINI_DTYPE_F32)              \
-            return CALCULATE_SUM(BLOCK_SIZE, float);           \
-        else                                                   \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;             \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024)
-    } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::sum::metax
diff --git a/src/infiniop/ops/sum/moore/sum_moore.h b/src/infiniop/ops/sum/moore/sum_moore.h
deleted file mode 100644
index ca7e18aa3..000000000
--- a/src/infiniop/ops/sum/moore/sum_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __SUM_MOORE_H__
-#define __SUM_MOORE_H__
-
-#include "../sum_desc.h"
-
-DESCRIPTOR(moore);
-
-#endif
diff --git a/src/infiniop/ops/sum/moore/sum_moore.mu b/src/infiniop/ops/sum/moore/sum_moore.mu
deleted file mode 100644
index 8c465460e..000000000
--- a/src/infiniop/ops/sum/moore/sum_moore.mu
+++ /dev/null
@@ -1,133 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "../../../devices/moore/moore_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "sum_moore.h"
-
-namespace op::sum::moore {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, typename T>
-infiniStatus_t launchKernel(
-    const SumInfo &info,
-    T *output, const T *input,
-    musaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_musa = permuted_input_shape_musa + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-
-    if (info.reduce_num == input_size) {
-        if constexpr (std::is_same_v<T, __mt_bfloat16>) {
-            // 需要解决 moore不支持bf16的atomic add的问题
-            float zero = 0.0f;
-            float *tmp_output;
-            CHECK_MOORE(musaMalloc(&tmp_output, sizeof(float)));
-            CHECK_MOORE(musaMemcpyAsync(tmp_output, &zero, sizeof(float), musaMemcpyHostToDevice, stream));
-            size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-            sumAllKernel<BLOCK_SIZE, T, float><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), stream>>>(
-                tmp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
-            // 可以自定义 kernel，将 float -> T，这里直接memcpy了
-            float host_val;
-            CHECK_MOORE(musaMemcpy(&host_val, tmp_output, sizeof(float), musaMemcpyDeviceToHost));
-            T out_val = static_cast<T>(host_val);
-            CHECK_MOORE(musaMemcpyAsync(output, &out_val, sizeof(T), musaMemcpyHostToDevice, stream));
-            CHECK_MOORE(musaFree(tmp_output));
-        } else {
-            T zero = static_cast<T>(0.0f);
-            CHECK_MOORE(musaMemcpyAsync(output, &zero, sizeof(T), musaMemcpyHostToDevice, stream));
-            size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-            sumAllKernel<BLOCK_SIZE, T, T><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(T), stream>>>(
-                output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
-        }
-    } else {
-        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        sumKernel<BLOCK_SIZE, T><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            output, input, input_ndim, output_ndim, output_size, reduce_num,
-            permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream_) const {
-
-    musaStream_t stream = (musaStream_t)stream_;
-
-#define CALCULATE_SUM(BLOCK_SIZE, T)   \
-    launchKernel<BLOCK_SIZE, T>(       \
-        _info,                         \
-        (T *)output, (const T *)input, \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE)            \
-    {                                                        \
-        if (_info.dtype == INFINI_DTYPE_BF16)                \
-            return CALCULATE_SUM(BLOCK_SIZE, __mt_bfloat16); \
-        else if (_info.dtype == INFINI_DTYPE_F16)            \
-            return CALCULATE_SUM(BLOCK_SIZE, half);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)            \
-            return CALCULATE_SUM(BLOCK_SIZE, float);         \
-        else                                                 \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;           \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024)
-    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::sum::moore
diff --git a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu b/src/infiniop/ops/sum/nvidia/sum_nvidia.cu
deleted file mode 100644
index af052be0a..000000000
--- a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu
+++ /dev/null
@@ -1,118 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "sum_nvidia.cuh"
-
-namespace op::sum::nvidia {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-    auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, typename T>
-infiniStatus_t launchKernel(
-    const SumInfo &info,
-    T *output, const T *input,
-    cudaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-
-    if (info.reduce_num == input_size) {
-        T zero = static_cast<T>(0.0f);
-        CHECK_CUDA(cudaMemcpyAsync(output, &zero, sizeof(T), cudaMemcpyHostToDevice, stream));
-        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        sumAllKernel<BLOCK_SIZE, T, T><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(T), stream>>>(
-            output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda);
-    } else {
-        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        sumKernel<BLOCK_SIZE, T><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            output, input, input_ndim, output_ndim, output_size, reduce_num,
-            permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    void *stream_) const {
-
-    cudaStream_t stream = (cudaStream_t)stream_;
-
-#define CALCULATE_SUM(BLOCK_SIZE, T)   \
-    launchKernel<BLOCK_SIZE, T>(       \
-        _info,                         \
-        (T *)output, (const T *)input, \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE)            \
-    {                                                        \
-        if (_info.dtype == INFINI_DTYPE_BF16)                \
-            return CALCULATE_SUM(BLOCK_SIZE, __nv_bfloat16); \
-        else if (_info.dtype == INFINI_DTYPE_F16)            \
-            return CALCULATE_SUM(BLOCK_SIZE, half);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)            \
-            return CALCULATE_SUM(BLOCK_SIZE, float);         \
-        else                                                 \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;           \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-        CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::sum::nvidia
diff --git a/src/infiniop/ops/sum/nvidia/sum_nvidia.cuh b/src/infiniop/ops/sum/nvidia/sum_nvidia.cuh
deleted file mode 100644
index fd44a0246..000000000
--- a/src/infiniop/ops/sum/nvidia/sum_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __SUM_NVIDIA_H__
-#define __SUM_NVIDIA_H__
-
-#include "../sum_desc.h"
-
-DESCRIPTOR(nvidia);
-
-#endif // __SUM_CUDA_API_H__
diff --git a/src/infiniop/ops/sum/operator.cc b/src/infiniop/ops/sum/operator.cc
deleted file mode 100644
index b6e1fa7f5..000000000
--- a/src/infiniop/ops/sum/operator.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/sum.h"
-#include <vector>
-
-#ifdef ENABLE_CPU_API
-#include "cpu/sum_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
-#include "nvidia/sum_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/sum_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/sum_kunlun.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/sum_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateSumDescriptor(
-    infiniopHandle_t handle,
-    infiniopSumDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim) {
-
-#define CREATE(CASE, NAMESPACE)                                            \
-    case CASE:                                                             \
-        return op::sum::NAMESPACE::Descriptor::create(                     \
-            handle,                                                        \
-            reinterpret_cast<op::sum::NAMESPACE::Descriptor **>(desc_ptr), \
-            output_desc,                                                   \
-            input_desc,                                                    \
-            dim,                                                           \
-            dim_size,                                                      \
-            keepdim)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetSumWorkspaceSize(infiniopSumDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                               \
-    case CASE:                                                                             \
-        *size = reinterpret_cast<op::sum::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopSum(
-    infiniopSumDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool keepdim,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                            \
-    case CASE:                                                                \
-        return reinterpret_cast<const op::sum::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, output, input, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroySumDescriptor(infiniopSumDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                \
-    case CASE:                                                                 \
-        delete reinterpret_cast<const op::sum::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/sum/sum_desc.h b/src/infiniop/ops/sum/sum_desc.h
deleted file mode 100644
index 2477f9bec..000000000
--- a/src/infiniop/ops/sum/sum_desc.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef INFINIOP_SUM_DESCRIPTOR_H_
-#define INFINIOP_SUM_DESCRIPTOR_H_
-#include "../../../utils.h"
-#include "../../operator.h"
-#include "../../tensor.h"
-
-#include "info.h"
-
-#define DESCRIPTOR(NAMESPACE)                                    \
-                                                                 \
-    namespace op::sum::NAMESPACE {                               \
-    class Descriptor final : public InfiniopDescriptor {         \
-        struct Opaque;                                           \
-        Opaque *_opaque;                                         \
-        SumInfo _info;                                           \
-        size_t _workspace_size;                                  \
-                                                                 \
-        Descriptor(                                              \
-            Opaque *opaque,                                      \
-            SumInfo info,                                        \
-            size_t workspace_size,                               \
-            infiniDevice_t device_type,                          \
-            int device_id)                                       \
-            : InfiniopDescriptor{device_type, device_id},        \
-              _opaque(opaque),                                   \
-              _info(info),                                       \
-              _workspace_size(workspace_size) {}                 \
-                                                                 \
-    public:                                                      \
-        ~Descriptor();                                           \
-        size_t workspaceSize() const { return _workspace_size; } \
-                                                                 \
-        static infiniStatus_t create(                            \
-            infiniopHandle_t handle,                             \
-            Descriptor **desc_ptr,                               \
-            infiniopTensorDescriptor_t output_desc,              \
-            infiniopTensorDescriptor_t input_desc,               \
-            size_t *dim,                                         \
-            size_t dim_size,                                     \
-            bool keepdim);                                       \
-                                                                 \
-        infiniStatus_t calculate(                                \
-            void *workspace, size_t workspace_size,              \
-            void *output,                                        \
-            const void *input,                                   \
-            void *stream) const;                                 \
-    };                                                           \
-    }
-
-#endif
diff --git a/src/infiniop/ops/topk/cpu/topk_cpu.cc b/src/infiniop/ops/topk/cpu/topk_cpu.cc
deleted file mode 100644
index 388cf2b05..000000000
--- a/src/infiniop/ops/topk/cpu/topk_cpu.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "topk_cpu.h"
-#include "../../../../utils.h"
-#include "../../../devices/cpu/common_cpu.h"
-#include <algorithm>
-#include <vector>
-namespace op::topk::cpu {
-
-Descriptor::~Descriptor() {}
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t values_output_desc,
-    infiniopTensorDescriptor_t indices_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted) {
-    auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted);
-    CHECK_RESULT(result);
-
-    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-template <typename Tdata>
-infiniStatus_t calculateTopK(
-    const TopKInfo &info,
-    Tdata *values_output,
-    int32_t *indices_output,
-    const Tdata *input,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted) {
-    if (k == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-    for (size_t i = 0; i < info.n_iteration; i++) {
-        size_t index = i;
-        size_t input_start = 0;
-        size_t output_start = 0;
-        for (size_t j = info.ndim - 1; j >= 0; j--) {
-            if (j == dim) {
-                continue;
-            }
-            input_start += (index % info.input_shape[j]) * info.input_strides[j];
-            output_start += (index % info.output_shape[j]) * info.output_strides[j];
-            index /= info.input_shape[j];
-        }
-        using elem_t = std::pair<Tdata, size_t>;
-        std::vector<elem_t> vi_queue(info.dim_elements);
-        for (size_t j = 0; j < info.dim_elements; j++) {
-            vi_queue[j].first = input[input_start + j * info.input_strides[dim]];
-            vi_queue[j].second = j;
-        }
-        bool use_partial_sort = static_cast<size_t>(k) * 64 <= info.dim_elements;
-
-        if (use_partial_sort) {
-            if (largest) {
-                std::partial_sort(vi_queue.begin(), vi_queue.begin() + k, vi_queue.end(),
-                                  [](const elem_t &a, const elem_t &b) -> bool {
-                                      return utils::cast<float>(a.first) > utils::cast<float>(b.first);
-                                  });
-            } else {
-                std::partial_sort(vi_queue.begin(), vi_queue.begin() + k, vi_queue.end(),
-                                  [](const elem_t &a, const elem_t &b) -> bool {
-                                      return utils::cast<float>(a.first) < utils::cast<float>(b.first);
-                                  });
-            }
-        } else {
-            if (largest) {
-                std::nth_element(vi_queue.begin(), vi_queue.begin() + k - 1, vi_queue.end(),
-                                 [](const elem_t &a, const elem_t &b) -> bool {
-                                     return utils::cast<float>(a.first) > utils::cast<float>(b.first);
-                                 });
-                if (sorted) {
-                    std::sort(vi_queue.begin(), vi_queue.begin() + k, // 注意：PyTorch 这里是 k，不是 k-1
-                              [](const elem_t &a, const elem_t &b) -> bool {
-                                  return utils::cast<float>(a.first) > utils::cast<float>(b.first);
-                              });
-                }
-            } else {
-                std::nth_element(vi_queue.begin(), vi_queue.begin() + k - 1, vi_queue.end(),
-                                 [](const elem_t &a, const elem_t &b) -> bool {
-                                     return utils::cast<float>(a.first) < utils::cast<float>(b.first);
-                                 });
-                if (sorted) {
-                    std::sort(vi_queue.begin(), vi_queue.begin() + k, // 注意：PyTorch 这里是 k，不是 k-1
-                              [](const elem_t &a, const elem_t &b) -> bool {
-                                  return utils::cast<float>(a.first) < utils::cast<float>(b.first);
-                              });
-                }
-            }
-        }
-        for (size_t j = 0; j < k; j++) {
-            values_output[output_start + j * info.output_strides[dim]] = vi_queue[j].first;
-            indices_output[output_start + j * info.output_strides[dim]] = (int32_t)vi_queue[j].second;
-        }
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *values_output,
-    void *indices_output,
-    const void *input,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted,
-    void *stream) const {
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return calculateTopK<fp16_t>(_info, (fp16_t *)values_output, (int32_t *)indices_output, reinterpret_cast<const fp16_t *>(input), k, dim, largest, sorted);
-    case INFINI_DTYPE_F32:
-        return calculateTopK<float>(_info, (float *)values_output, (int32_t *)indices_output, reinterpret_cast<const float *>(input), k, dim, largest, sorted);
-    case INFINI_DTYPE_BF16:
-        return calculateTopK<bf16_t>(_info, (bf16_t *)values_output, (int32_t *)indices_output, reinterpret_cast<const bf16_t *>(input), k, dim, largest, sorted);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::topk::cpu
diff --git a/src/infiniop/ops/topk/cpu/topk_cpu.h b/src/infiniop/ops/topk/cpu/topk_cpu.h
deleted file mode 100644
index 57888f326..000000000
--- a/src/infiniop/ops/topk/cpu/topk_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_TOPK_CPU_H__
-#define __INFINIOP_TOPK_CPU_H__
-
-#include "../topk_desc.h"
-
-DESCRIPTOR(cpu);
-
-#endif // __INFINIOP_TOPK_CPU_H__
diff --git a/src/infiniop/ops/topk/cuda/kernel.cuh b/src/infiniop/ops/topk/cuda/kernel.cuh
deleted file mode 100644
index 13146b45f..000000000
--- a/src/infiniop/ops/topk/cuda/kernel.cuh
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef __TOPK_CUDA_KERNEL_CUH__
-#define __TOPK_CUDA_KERNEL_CUH__
-
-#include <cmath> // NAN
-#include <cub/block/block_radix_sort.cuh>
-#include <stdint.h>
-
-namespace op::topk::cuda {
-__forceinline__ __device__ __host__ size_t baseOffsetExcludingDim(
-    size_t flat_row,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides,
-    size_t dim) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        if (i == dim) {
-            continue;
-        }
-        res += (flat_row % shape[i]) * strides[i];
-        flat_row /= shape[i];
-    }
-    return res;
-}
-
-__forceinline__ __device__ __host__ size_t indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
-template <typename Tdata>
-__device__ __forceinline__ float to_float(Tdata v);
-
-template <>
-__device__ __forceinline__ float to_float<float>(float v) { return v; }
-
-template <>
-__device__ __forceinline__ float to_float<half>(half v) { return __half2float(v); }
-
-#if defined(ENABLE_MOORE_API)
-using bf16_t = __mt_bfloat16;
-#elif defined(ENABLE_METAX_API)
-using bf16_t = __hpcc_bfloat16;
-#else
-// CUDA / NVIDIA / ILUVATAR
-using bf16_t = __nv_bfloat16;
-#endif
-template <>
-__device__ __forceinline__ float to_float<bf16_t>(bf16_t v) {
-    return __bfloat162float(v);
-}
-
-// float -> ordered uint32
-__device__ __forceinline__ uint32_t float_to_uint_ordered(float value) {
-    uint32_t bits = *reinterpret_cast<uint32_t *>(&value);
-    uint32_t mask = (uint32_t)(-((int32_t)bits >> 31)) | 0x80000000u;
-    return bits ^ mask;
-}
-
-template <typename Tdata>
-__global__ void gather_rowwise(const Tdata *input, uint32_t *cur_vals, int32_t *cur_idx,
-                               size_t rows, size_t n, size_t ndim, size_t dim, const size_t *shape, const ptrdiff_t *strides) {
-    size_t row = blockIdx.y;
-    size_t i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (row >= rows || i >= n) {
-        return;
-    }
-    size_t base = baseOffsetExcludingDim(row, ndim, shape, strides, dim);
-    size_t off = base + i * strides[dim];
-    cur_vals[row * n + i] = float_to_uint_ordered(to_float<Tdata>(input[off]));
-    cur_idx[row * n + i] = i;
-}
-
-__global__ void init_row_state(int32_t *cur_n, int32_t *rem_k, int32_t *out_pos, size_t rows, size_t n, size_t k) {
-    int32_t r = blockIdx.x * blockDim.x + threadIdx.x;
-    if (r < rows) {
-        cur_n[r] = n;
-        rem_k[r] = k;
-        out_pos[r] = 0;
-    }
-}
-
-__global__ void zero_row_counters(int32_t *ones_count, int32_t *zeros_count, size_t rows) {
-    int r = blockIdx.x * blockDim.x + threadIdx.x;
-    if (r < rows) {
-        ones_count[r] = 0;
-        zeros_count[r] = 0;
-    }
-}
-
-template <size_t BLOCK_SIZE>
-__global__ void partition_rowwise(const uint32_t *cur_vals, int32_t *cur_idx, uint32_t *ones_vals, int32_t *ones_idx,
-                                  uint32_t *zeros_vals, int32_t *zeros_idx, const int32_t *cur_n, size_t rows, size_t n,
-                                  int32_t bit_pos, bool largest, int32_t *ones_count, int32_t *zeros_count) {
-    int32_t row = blockIdx.y;
-    if (row >= rows) {
-        return;
-    }
-
-    __shared__ uint32_t sh1_vals[BLOCK_SIZE];
-    __shared__ int32_t sh1_idx[BLOCK_SIZE];
-    __shared__ uint32_t sh0_vals[BLOCK_SIZE];
-    __shared__ int32_t sh0_idx[BLOCK_SIZE];
-    __shared__ int sh1_n, sh0_n;
-    __shared__ int32_t base1, base0;
-
-    int32_t tid = threadIdx.x;
-    if (tid == 0) {
-        sh1_n = 0;
-        sh0_n = 0;
-    }
-    __syncthreads();
-
-    int32_t i = blockIdx.x * blockDim.x + tid;
-    int32_t cn = cur_n[row];
-    if (i < cn) {
-        int32_t off = row * n + i;
-        int32_t idx = cur_idx[off];
-        uint32_t key = cur_vals[off];
-        uint32_t cmp_key = largest ? key : ~key;
-        int32_t b = (cmp_key >> bit_pos) & 1;
-
-        if (b) {
-            int32_t p = atomicAdd(&sh1_n, 1);
-            sh1_vals[p] = key;
-            sh1_idx[p] = idx;
-        } else {
-            int32_t p = atomicAdd(&sh0_n, 1);
-            sh0_vals[p] = key;
-            sh0_idx[p] = idx;
-        }
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-        base1 = atomicAdd(&ones_count[row], sh1_n);
-        base0 = atomicAdd(&zeros_count[row], sh0_n);
-    }
-    __syncthreads();
-
-    for (int32_t j = tid; j < sh1_n; j += blockDim.x) {
-        int32_t o = row * n + base1 + j;
-        ones_vals[o] = sh1_vals[j];
-        ones_idx[o] = sh1_idx[j];
-    }
-    for (int32_t j = tid; j < sh0_n; j += blockDim.x) {
-        int32_t o = row * n + base0 + j;
-        zeros_vals[o] = sh0_vals[j];
-        zeros_idx[o] = sh0_idx[j];
-    }
-}
-
-template <size_t BLOCK_SIZE>
-__global__ void decide_and_compact(uint32_t *cur_vals, int32_t *cur_idx, const uint32_t *ones_vals, const int32_t *ones_idx, const uint32_t *zeros_vals, const int32_t *zeros_idx,
-                                   const int32_t *ones_count, const int32_t *zeros_count, int32_t *cur_n, int32_t *rem_k, int32_t *out_pos,
-                                   uint32_t *sel_vals, int32_t *sel_idx, size_t rows, size_t n, size_t k) {
-    int32_t row = blockIdx.x;
-    if (row >= rows) {
-        return;
-    }
-    int32_t tid = threadIdx.x;
-    int32_t rem = rem_k[row];
-    if (rem <= 0) {
-        return;
-    }
-    int32_t oc = ones_count[row];
-    int32_t zc = zeros_count[row];
-    int32_t pos = out_pos[row];
-
-    bool keep_ones = (oc >= rem);
-    if (!keep_ones) {
-        for (int32_t j = tid; j < oc; j += blockDim.x) {
-            if (pos + j < k) {
-                int32_t o = row * n + j;
-                sel_vals[row * k + pos + j] = ones_vals[o];
-                sel_idx[row * k + pos + j] = ones_idx[o];
-            }
-        }
-    }
-    __syncthreads();
-    if (tid == 0) {
-        if (keep_ones) {
-            cur_n[row] = oc;
-        } else {
-            out_pos[row] = pos + oc;
-            rem_k[row] = rem - oc;
-            cur_n[row] = zc;
-        }
-    }
-    __syncthreads();
-    int32_t new_n = cur_n[row];
-    for (int32_t j = tid; j < new_n; j += blockDim.x) {
-        int32_t o = row * n + j;
-        cur_vals[o] = keep_ones ? ones_vals[o] : zeros_vals[o];
-        cur_idx[o] = keep_ones ? ones_idx[o] : zeros_idx[o];
-    }
-}
-
-template <size_t BLOCK_SIZE>
-__global__ void take_remaining(const uint32_t *cur_vals, const int32_t *cur_idx, const int32_t *cur_n, const int32_t *rem_k, const int32_t *out_pos,
-                               uint32_t *sel_vals, int32_t *sel_idx, size_t rows, size_t n, size_t k) {
-    int32_t row = blockIdx.x;
-    int32_t tid = threadIdx.x;
-    if (row >= rows) {
-        return;
-    }
-    int32_t rem = rem_k[row];
-    int32_t pos = out_pos[row];
-    int32_t cn = cur_n[row];
-
-    int32_t take = rem;
-    if (take > cn) {
-        take = cn;
-    }
-    for (int32_t j = tid; j < take; j += blockDim.x) {
-        if (pos + j < k) {
-            int32_t o = row * k + pos + j;
-            sel_vals[o] = cur_vals[row * n + j];
-            sel_idx[o] = cur_idx[row * n + j];
-        }
-    }
-}
-
-template <typename Tdata>
-__global__ void scatter_to_output(const Tdata *input, const int32_t *sel_idx, Tdata *values_out, int32_t *indices_out,
-                                  size_t rows, size_t k, size_t ndim, size_t dim, const size_t *input_shape, const ptrdiff_t *input_strides,
-                                  const size_t *output_shape, const ptrdiff_t *output_strides) {
-    int32_t row = blockIdx.y;
-    int32_t j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= rows || j >= k) {
-        return;
-    }
-
-    int32_t output_base = baseOffsetExcludingDim(row, ndim, output_shape, output_strides, dim);
-    int32_t output_off = output_base + j * output_strides[dim];
-    int32_t input_base = baseOffsetExcludingDim(row, ndim, input_shape, input_strides, dim);
-    int32_t input_off = input_base + sel_idx[row * k + j] * input_strides[dim];
-
-    values_out[output_off] = input[input_off];
-    indices_out[output_off] = sel_idx[row * k + j];
-}
-
-} // namespace op::topk::cuda
-
-#endif // __TOPK_CUDA_KERNEL_H__
diff --git a/src/infiniop/ops/topk/info.h b/src/infiniop/ops/topk/info.h
deleted file mode 100644
index 4d73d0a5d..000000000
--- a/src/infiniop/ops/topk/info.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef __TOPK_INFO_H__
-#define __TOPK_INFO_H__
-#include "../../../utils.h"
-#include "../../tensor.h"
-#include <algorithm>
-#include <cstddef>
-#include <vector>
-
-namespace op::topk {
-class TopKInfo {
-    TopKInfo() = default;
-
-public:
-    infiniDtype_t dtype;
-    std::vector<size_t> input_shape;
-    std::vector<size_t> output_shape;
-    std::vector<ptrdiff_t> input_strides;
-    std::vector<ptrdiff_t> output_strides;
-    size_t k;
-    size_t dim;
-    bool largest;
-    bool sorted;
-    size_t ndim;
-    size_t dim_elements; // processed dim elements
-    size_t n_iteration;  // total number of topk iteration
-    static utils::Result<TopKInfo> create(
-        infiniopTensorDescriptor_t values_output_desc,
-        infiniopTensorDescriptor_t indices_output_desc,
-        infiniopTensorDescriptor_t input_desc,
-        size_t k,
-        size_t dim,
-        bool largest,
-        bool sorted) {
-        auto input_shape = input_desc->shape();
-        auto input_strides = input_desc->strides();
-        size_t input_ndim = input_desc->ndim();
-        size_t dim_elements = input_shape[dim];
-        size_t n_iteration = 1;
-        for (size_t i = 0; i < input_ndim; i++) {
-            if (i != dim) {
-                n_iteration *= input_shape[i];
-            }
-        }
-        return utils::Result<TopKInfo>(TopKInfo{input_desc->dtype(),
-                                                input_desc->shape(),
-                                                values_output_desc->shape(),
-                                                input_desc->strides(),
-                                                values_output_desc->strides(),
-                                                k,
-                                                dim,
-                                                largest,
-                                                sorted,
-                                                input_ndim,
-                                                dim_elements,
-                                                n_iteration});
-    }
-};
-} // namespace op::topk
-
-#endif
diff --git a/src/infiniop/ops/topk/metax/topk_metax.h b/src/infiniop/ops/topk/metax/topk_metax.h
deleted file mode 100644
index 04268bb66..000000000
--- a/src/infiniop/ops/topk/metax/topk_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __TOPK_METAX_H__
-#define __TOPK_METAX_H__
-
-#include "../topk_desc.h"
-
-DESCRIPTOR(metax);
-
-#endif
diff --git a/src/infiniop/ops/topk/metax/topk_metax.maca b/src/infiniop/ops/topk/metax/topk_metax.maca
deleted file mode 100644
index 48cd5b97f..000000000
--- a/src/infiniop/ops/topk/metax/topk_metax.maca
+++ /dev/null
@@ -1,280 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "topk_metax.h"
-
-#include <cub/block/block_radix_sort.cuh>
-#include <cub/cub.cuh>
-
-namespace op::topk::metax {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t values_output_desc,
-    infiniopTensorDescriptor_t indices_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted) {
-    auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-
-    workspace_size += (input_desc->ndim() + values_output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    size_t dim_elements = input_desc->shape()[dim];
-    size_t n_iteration = 1;
-    for (size_t i = 0; i < input_desc->ndim(); i++) {
-        if (i != dim) {
-            n_iteration *= input_desc->shape()[i];
-        }
-    }
-    size_t total = n_iteration * dim_elements;
-
-    workspace_size += 3 * total * sizeof(uint32_t);
-    workspace_size += 3 * total * sizeof(int32_t);
-    workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t));
-    if (sorted) {
-        workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t));
-    }
-    workspace_size += 5 * n_iteration * sizeof(int32_t);
-
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, int32_t SORT_ITEMS_PER_THREAD, typename Tdata>
-infiniStatus_t launchKernel(
-    const TopKInfo &info,
-    Tdata *values_output, int32_t *indices_output, const Tdata *input,
-    size_t k, size_t dim, bool largest, bool sorted,
-    hcStream_t stream, void *workspace, size_t workspace_size) {
-    if (dim >= info.ndim) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-    if (k == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-    if (k > info.dim_elements) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-    size_t input_ndim = info.ndim;
-    size_t output_ndim = input_ndim;
-    size_t n_iteration = info.n_iteration;
-    size_t dim_elements = info.dim_elements;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_hc = input_shape_hc + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_hc = input_strides_hc + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_METAX(hcMemcpyAsync(input_shape_hc, info.input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-
-    const int32_t total = n_iteration * dim_elements;
-
-    uint32_t *cur_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-    uint32_t *ones_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-    uint32_t *zeros_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-
-    int32_t *cur_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-    int32_t *ones_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-    int32_t *zeros_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-
-    uint32_t *sel_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * k * sizeof(uint32_t);
-    int32_t *sel_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * k * sizeof(int32_t);
-    uint32_t *sel_sorted_vals = nullptr;
-    int32_t *sel_sorted_idx = nullptr;
-    if (sorted) {
-        sel_sorted_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-        workspace_offset += n_iteration * k * sizeof(uint32_t);
-        sel_sorted_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-        workspace_offset += n_iteration * k * sizeof(int32_t);
-    }
-
-    int32_t *cur_n = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *rem_k = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *out_pos = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *ones_count = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *zeros_count = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    // init
-    {
-        size_t threads = 256;
-        size_t blocks = (n_iteration + threads - 1) / threads;
-        op::topk::cuda::init_row_state<<<blocks, threads, 0, stream>>>(cur_n, rem_k, out_pos, n_iteration, dim_elements, k);
-    }
-    // gather input -> cur
-    {
-        dim3 block(BLOCK_SIZE);
-        dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-        op::topk::cuda::gather_rowwise<Tdata><<<grid, block, 0, stream>>>(
-            input, cur_vals, cur_idx,
-            n_iteration, dim_elements,
-            input_ndim, dim,
-            input_shape_hc, input_strides_hc);
-    }
-    // radix select/filter
-    for (int bit = 31; bit >= 0; --bit) {
-        {
-            size_t threads = 256;
-            size_t blocks = (n_iteration + threads - 1) / threads;
-            op::topk::cuda::zero_row_counters<<<blocks, threads, 0, stream>>>(ones_count, zeros_count, n_iteration);
-        }
-
-        {
-            dim3 block(BLOCK_SIZE);
-            dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-            op::topk::cuda::partition_rowwise<BLOCK_SIZE><<<grid, block, 0, stream>>>(
-                cur_vals, cur_idx,
-                ones_vals, ones_idx,
-                zeros_vals, zeros_idx,
-                cur_n, n_iteration, dim_elements,
-                bit, largest,
-                ones_count, zeros_count);
-        }
-
-        {
-            op::topk::cuda::decide_and_compact<BLOCK_SIZE><<<n_iteration, BLOCK_SIZE, 0, stream>>>(
-                cur_vals, cur_idx,
-                ones_vals, ones_idx,
-                zeros_vals, zeros_idx,
-                ones_count, zeros_count,
-                cur_n, rem_k, out_pos,
-                sel_vals, sel_idx,
-                n_iteration, dim_elements, k);
-        }
-    }
-
-    // append remaining
-
-    op::topk::cuda::take_remaining<BLOCK_SIZE><<<n_iteration, BLOCK_SIZE, 0, stream>>>(
-        cur_vals, cur_idx,
-        cur_n, rem_k, out_pos,
-        sel_vals, sel_idx,
-        n_iteration, dim_elements, k);
-
-    // sort (CUB block radix sort)
-    const int32_t *final_idx = sel_idx;
-
-    if (sorted) {
-        std::vector<int> h_offsets(n_iteration + 1);
-        for (size_t i = 0; i <= n_iteration; i++) {
-            h_offsets[i] = i * k;
-        }
-        int *d_offsets;
-        CHECK_METAX(hcMalloc(&d_offsets, (n_iteration + 1) * sizeof(int)));
-        CHECK_METAX(hcMemcpy(d_offsets, h_offsets.data(), (n_iteration + 1) * sizeof(int), hcMemcpyHostToDevice));
-
-        void *d_temp_storage = nullptr;
-        size_t temp_storage_bytes = 0;
-
-        if (!largest) {
-            cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                     n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-            hcMalloc(&d_temp_storage, temp_storage_bytes);
-            cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                     n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-        } else {
-            cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                               n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-            hcMalloc(&d_temp_storage, temp_storage_bytes);
-            cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                               n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-        }
-        CHECK_METAX(hcFree(d_offsets));
-        CHECK_METAX(hcFree(d_temp_storage));
-        final_idx = sel_sorted_idx;
-    }
-
-    // scatter to output (strided write)
-    {
-        dim3 block(BLOCK_SIZE);
-        dim3 grid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-        op::topk::cuda::scatter_to_output<Tdata><<<grid, block, 0, stream>>>(
-            input, final_idx,
-            values_output, indices_output,
-            n_iteration, k,
-            input_ndim, dim,
-            input_shape_hc, input_strides_hc,
-            output_shape_hc, output_strides_hc);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *values_output,
-    void *indices_output,
-    const void *input,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted,
-    void *stream_) const {
-
-    hcStream_t stream = (hcStream_t)stream_;
-    constexpr int ITEMS = 4;
-#define CALCULATE_TOPK(BLOCK_SIZE, Tdata)                                        \
-    launchKernel<BLOCK_SIZE, ITEMS, Tdata>(                                      \
-        _info,                                                                   \
-        (Tdata *)values_output, (int32_t *)indices_output, (const Tdata *)input, \
-        k, dim, largest, sorted,                                                 \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_TOPK_WITH_BLOCK_SIZE(BLOCK_SIZE)              \
-    {                                                           \
-        if (_info.dtype == INFINI_DTYPE_BF16)                   \
-            return CALCULATE_TOPK(BLOCK_SIZE, __hpcc_bfloat16); \
-        else if (_info.dtype == INFINI_DTYPE_F16)               \
-            return CALCULATE_TOPK(BLOCK_SIZE, half);            \
-        else if (_info.dtype == INFINI_DTYPE_F32)               \
-            return CALCULATE_TOPK(BLOCK_SIZE, float);           \
-        else                                                    \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;              \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_TOPK_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::topk::metax
diff --git a/src/infiniop/ops/topk/moore/topk_moore.h b/src/infiniop/ops/topk/moore/topk_moore.h
deleted file mode 100644
index 37753992f..000000000
--- a/src/infiniop/ops/topk/moore/topk_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __TOPK_MOORE_H__
-#define __TOPK_MOORE_H__
-
-#include "../topk_desc.h"
-
-DESCRIPTOR(moore);
-
-#endif
diff --git a/src/infiniop/ops/topk/moore/topk_moore.mu b/src/infiniop/ops/topk/moore/topk_moore.mu
deleted file mode 100644
index b78b9fa0d..000000000
--- a/src/infiniop/ops/topk/moore/topk_moore.mu
+++ /dev/null
@@ -1,280 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "../../../devices/moore/moore_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "topk_moore.h"
-
-#include <cub/block/block_radix_sort.cuh>
-#include <cub/cub.cuh>
-
-namespace op::topk::moore {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t values_output_desc,
-    infiniopTensorDescriptor_t indices_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted) {
-    auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-
-    workspace_size += (input_desc->ndim() + values_output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    size_t dim_elements = input_desc->shape()[dim];
-    size_t n_iteration = 1;
-    for (size_t i = 0; i < input_desc->ndim(); i++) {
-        if (i != dim) {
-            n_iteration *= input_desc->shape()[i];
-        }
-    }
-    size_t total = n_iteration * dim_elements;
-
-    workspace_size += 3 * total * sizeof(uint32_t);
-    workspace_size += 3 * total * sizeof(int32_t);
-    workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t));
-    if (sorted) {
-        workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t));
-    }
-    workspace_size += 5 * n_iteration * sizeof(int32_t);
-
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, int32_t SORT_ITEMS_PER_THREAD, typename Tdata>
-infiniStatus_t launchKernel(
-    const TopKInfo &info,
-    Tdata *values_output, int32_t *indices_output, const Tdata *input,
-    size_t k, size_t dim, bool largest, bool sorted,
-    musaStream_t stream, void *workspace, size_t workspace_size) {
-    if (dim >= info.ndim) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-    if (k == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-    if (k > info.dim_elements) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-    size_t input_ndim = info.ndim;
-    size_t output_ndim = input_ndim;
-    size_t n_iteration = info.n_iteration;
-    size_t dim_elements = info.dim_elements;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_musa = input_shape_musa + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_musa = input_strides_musa + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_MOORE(musaMemcpyAsync(input_shape_musa, info.input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-
-    const int32_t total = n_iteration * dim_elements;
-
-    uint32_t *cur_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-    uint32_t *ones_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-    uint32_t *zeros_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-
-    int32_t *cur_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-    int32_t *ones_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-    int32_t *zeros_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-
-    uint32_t *sel_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * k * sizeof(uint32_t);
-    int32_t *sel_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * k * sizeof(int32_t);
-    uint32_t *sel_sorted_vals = nullptr;
-    int32_t *sel_sorted_idx = nullptr;
-    if (sorted) {
-        sel_sorted_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-        workspace_offset += n_iteration * k * sizeof(uint32_t);
-        sel_sorted_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-        workspace_offset += n_iteration * k * sizeof(int32_t);
-    }
-
-    int32_t *cur_n = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *rem_k = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *out_pos = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *ones_count = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *zeros_count = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    // init
-    {
-        size_t threads = 256;
-        size_t blocks = (n_iteration + threads - 1) / threads;
-        op::topk::cuda::init_row_state<<<blocks, threads, 0, stream>>>(cur_n, rem_k, out_pos, n_iteration, dim_elements, k);
-    }
-    // gather input -> cur
-    {
-        dim3 block(BLOCK_SIZE);
-        dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-        op::topk::cuda::gather_rowwise<Tdata><<<grid, block, 0, stream>>>(
-            input, cur_vals, cur_idx,
-            n_iteration, dim_elements,
-            input_ndim, dim,
-            input_shape_musa, input_strides_musa);
-    }
-    // radix select/filter
-    for (int bit = 31; bit >= 0; --bit) {
-        {
-            size_t threads = 256;
-            size_t blocks = (n_iteration + threads - 1) / threads;
-            op::topk::cuda::zero_row_counters<<<blocks, threads, 0, stream>>>(ones_count, zeros_count, n_iteration);
-        }
-
-        {
-            dim3 block(BLOCK_SIZE);
-            dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-            op::topk::cuda::partition_rowwise<BLOCK_SIZE><<<grid, block, 0, stream>>>(
-                cur_vals, cur_idx,
-                ones_vals, ones_idx,
-                zeros_vals, zeros_idx,
-                cur_n, n_iteration, dim_elements,
-                bit, largest,
-                ones_count, zeros_count);
-        }
-
-        {
-            op::topk::cuda::decide_and_compact<BLOCK_SIZE><<<n_iteration, BLOCK_SIZE, 0, stream>>>(
-                cur_vals, cur_idx,
-                ones_vals, ones_idx,
-                zeros_vals, zeros_idx,
-                ones_count, zeros_count,
-                cur_n, rem_k, out_pos,
-                sel_vals, sel_idx,
-                n_iteration, dim_elements, k);
-        }
-    }
-
-    // append remaining
-
-    op::topk::cuda::take_remaining<BLOCK_SIZE><<<n_iteration, BLOCK_SIZE, 0, stream>>>(
-        cur_vals, cur_idx,
-        cur_n, rem_k, out_pos,
-        sel_vals, sel_idx,
-        n_iteration, dim_elements, k);
-
-    // sort (CUB block radix sort)
-    const int32_t *final_idx = sel_idx;
-
-    if (sorted) {
-        std::vector<int> h_offsets(n_iteration + 1);
-        for (size_t i = 0; i <= n_iteration; i++) {
-            h_offsets[i] = i * k;
-        }
-        int *d_offsets;
-        CHECK_MOORE(musaMalloc(&d_offsets, (n_iteration + 1) * sizeof(int)));
-        CHECK_MOORE(musaMemcpy(d_offsets, h_offsets.data(), (n_iteration + 1) * sizeof(int), musaMemcpyHostToDevice));
-
-        void *d_temp_storage = nullptr;
-        size_t temp_storage_bytes = 0;
-
-        if (!largest) {
-            cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                     n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-            musaMalloc(&d_temp_storage, temp_storage_bytes);
-            cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                     n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-        } else {
-            cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                               n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-            musaMalloc(&d_temp_storage, temp_storage_bytes);
-            cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                               n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-        }
-        CHECK_MOORE(musaFree(d_offsets));
-        CHECK_MOORE(musaFree(d_temp_storage));
-        final_idx = sel_sorted_idx;
-    }
-
-    // scatter to output (strided write)
-    {
-        dim3 block(BLOCK_SIZE);
-        dim3 grid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-        op::topk::cuda::scatter_to_output<Tdata><<<grid, block, 0, stream>>>(
-            input, final_idx,
-            values_output, indices_output,
-            n_iteration, k,
-            input_ndim, dim,
-            input_shape_musa, input_strides_musa,
-            output_shape_musa, output_strides_musa);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *values_output,
-    void *indices_output,
-    const void *input,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted,
-    void *stream_) const {
-
-    musaStream_t stream = (musaStream_t)stream_;
-    constexpr int ITEMS = 4;
-#define CALCULATE_TOPK(BLOCK_SIZE, Tdata)                                        \
-    launchKernel<BLOCK_SIZE, ITEMS, Tdata>(                                      \
-        _info,                                                                   \
-        (Tdata *)values_output, (int32_t *)indices_output, (const Tdata *)input, \
-        k, dim, largest, sorted,                                                 \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_TOPK_WITH_BLOCK_SIZE(BLOCK_SIZE)            \
-    {                                                         \
-        if (_info.dtype == INFINI_DTYPE_BF16)                 \
-            return CALCULATE_TOPK(BLOCK_SIZE, __mt_bfloat16); \
-        else if (_info.dtype == INFINI_DTYPE_F16)             \
-            return CALCULATE_TOPK(BLOCK_SIZE, half);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)             \
-            return CALCULATE_TOPK(BLOCK_SIZE, float);         \
-        else                                                  \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;            \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_TOPK_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::topk::moore
diff --git a/src/infiniop/ops/topk/nvidia/topk_nvidia.cu b/src/infiniop/ops/topk/nvidia/topk_nvidia.cu
deleted file mode 100644
index 0f73d4857..000000000
--- a/src/infiniop/ops/topk/nvidia/topk_nvidia.cu
+++ /dev/null
@@ -1,283 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "topk_nvidia.cuh"
-
-#include <cub/block/block_radix_sort.cuh>
-#include <cub/cub.cuh>
-
-namespace op::topk::nvidia {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t values_output_desc,
-    infiniopTensorDescriptor_t indices_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted) {
-    auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-
-    workspace_size += (input_desc->ndim() + values_output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
-    // 计算临时变量空间
-    size_t dim_elements = input_desc->shape()[dim];
-    size_t n_iteration = 1;
-    for (size_t i = 0; i < input_desc->ndim(); i++) {
-        if (i != dim) {
-            n_iteration *= input_desc->shape()[i];
-        }
-    }
-    size_t total = n_iteration * dim_elements;
-
-    workspace_size += 3 * total * sizeof(uint32_t);
-    workspace_size += 3 * total * sizeof(int32_t);
-    workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t));
-    if (sorted) {
-        workspace_size += n_iteration * k * (sizeof(uint32_t) + sizeof(int32_t));
-    }
-    workspace_size += 5 * n_iteration * sizeof(int32_t);
-
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-
-template <size_t BLOCK_SIZE, int32_t SORT_ITEMS_PER_THREAD, typename Tdata>
-infiniStatus_t launchKernel(
-    const TopKInfo &info,
-    Tdata *values_output, int32_t *indices_output, const Tdata *input,
-    size_t k, size_t dim, bool largest, bool sorted,
-    cudaStream_t stream, void *workspace, size_t workspace_size) {
-    if (dim >= info.ndim) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-    if (k == 0) {
-        return INFINI_STATUS_SUCCESS;
-    }
-    if (k > info.dim_elements) {
-        return INFINI_STATUS_BAD_PARAM;
-    }
-    size_t input_ndim = info.ndim;
-    size_t output_ndim = input_ndim;
-    size_t n_iteration = info.n_iteration;
-    size_t dim_elements = info.dim_elements;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    size_t *output_shape_cuda = input_shape_cuda + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
-
-    ptrdiff_t *input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    ptrdiff_t *output_strides_cuda = input_strides_cuda + input_ndim;
-    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
-
-    CHECK_CUDA(cudaMemcpyAsync(input_shape_cuda, info.input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-
-    const int32_t total = n_iteration * dim_elements;
-
-    uint32_t *cur_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-    uint32_t *ones_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-    uint32_t *zeros_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(uint32_t);
-
-    int32_t *cur_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-    int32_t *ones_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-    int32_t *zeros_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += total * sizeof(int32_t);
-
-    uint32_t *sel_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * k * sizeof(uint32_t);
-    int32_t *sel_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * k * sizeof(int32_t);
-    uint32_t *sel_sorted_vals = nullptr;
-    int32_t *sel_sorted_idx = nullptr;
-    if (sorted) {
-        sel_sorted_vals = reinterpret_cast<uint32_t *>(workspace_ptr + workspace_offset);
-        workspace_offset += n_iteration * k * sizeof(uint32_t);
-        sel_sorted_idx = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-        workspace_offset += n_iteration * k * sizeof(int32_t);
-    }
-
-    int32_t *cur_n = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *rem_k = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *out_pos = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *ones_count = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    int32_t *zeros_count = reinterpret_cast<int32_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += n_iteration * sizeof(int32_t);
-    // init
-    {
-        size_t threads = 256;
-        size_t blocks = (n_iteration + threads - 1) / threads;
-        op::topk::cuda::init_row_state<<<blocks, threads, 0, stream>>>(cur_n, rem_k, out_pos, n_iteration, dim_elements, k);
-    }
-    // gather input -> cur
-    {
-        dim3 block(BLOCK_SIZE);
-        dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-        op::topk::cuda::gather_rowwise<Tdata><<<grid, block, 0, stream>>>(
-            input, cur_vals, cur_idx,
-            n_iteration, dim_elements,
-            input_ndim, dim,
-            input_shape_cuda, input_strides_cuda);
-    }
-    // radix select/filter
-    for (int bit = 31; bit >= 0; --bit) {
-        {
-            size_t threads = 256;
-            size_t blocks = (n_iteration + threads - 1) / threads;
-            op::topk::cuda::zero_row_counters<<<blocks, threads, 0, stream>>>(ones_count, zeros_count, n_iteration);
-        }
-
-        {
-            dim3 block(BLOCK_SIZE);
-            dim3 grid((dim_elements + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-            op::topk::cuda::partition_rowwise<BLOCK_SIZE><<<grid, block, 0, stream>>>(
-                cur_vals, cur_idx,
-                ones_vals, ones_idx,
-                zeros_vals, zeros_idx,
-                cur_n, n_iteration, dim_elements,
-                bit, largest,
-                ones_count, zeros_count);
-        }
-
-        {
-            op::topk::cuda::decide_and_compact<BLOCK_SIZE><<<n_iteration, BLOCK_SIZE, 0, stream>>>(
-                cur_vals, cur_idx,
-                ones_vals, ones_idx,
-                zeros_vals, zeros_idx,
-                ones_count, zeros_count,
-                cur_n, rem_k, out_pos,
-                sel_vals, sel_idx,
-                n_iteration, dim_elements, k);
-        }
-    }
-
-    // append remaining
-
-    op::topk::cuda::take_remaining<BLOCK_SIZE><<<n_iteration, BLOCK_SIZE, 0, stream>>>(
-        cur_vals, cur_idx,
-        cur_n, rem_k, out_pos,
-        sel_vals, sel_idx,
-        n_iteration, dim_elements, k);
-
-    // sort (CUB block radix sort)
-    const int32_t *final_idx = sel_idx;
-
-    if (sorted) {
-        std::vector<int> h_offsets(n_iteration + 1);
-        for (size_t i = 0; i <= n_iteration; i++) {
-            h_offsets[i] = i * k;
-        }
-        int *d_offsets;
-        CHECK_CUDA(cudaMalloc(&d_offsets, (n_iteration + 1) * sizeof(int)));
-        CHECK_CUDA(cudaMemcpy(d_offsets, h_offsets.data(), (n_iteration + 1) * sizeof(int), cudaMemcpyHostToDevice));
-
-        void *d_temp_storage = nullptr;
-        size_t temp_storage_bytes = 0;
-
-        if (!largest) {
-            cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                     n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-            cudaMalloc(&d_temp_storage, temp_storage_bytes);
-            cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                     n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-        } else {
-            cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                               n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-            cudaMalloc(&d_temp_storage, temp_storage_bytes);
-            cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, sel_vals, sel_sorted_vals, sel_idx, sel_sorted_idx,
-                                                               n_iteration * k, n_iteration, d_offsets, d_offsets + 1, 0, sizeof(uint32_t) * 8, stream);
-        }
-        CHECK_CUDA(cudaFree(d_offsets));
-        CHECK_CUDA(cudaFree(d_temp_storage));
-        final_idx = sel_sorted_idx;
-    }
-
-    // scatter to output (strided write)
-    {
-        dim3 block(BLOCK_SIZE);
-        dim3 grid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, n_iteration);
-        op::topk::cuda::scatter_to_output<Tdata><<<grid, block, 0, stream>>>(
-            input, final_idx,
-            values_output, indices_output,
-            n_iteration, k,
-            input_ndim, dim,
-            input_shape_cuda, input_strides_cuda,
-            output_shape_cuda, output_strides_cuda);
-    }
-
-    CHECK_CUDA(cudaGetLastError());
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *values_output,
-    void *indices_output,
-    const void *input,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted,
-    void *stream_) const {
-
-    cudaStream_t stream = (cudaStream_t)stream_;
-    constexpr int ITEMS = 4;
-#define CALCULATE_TOPK(BLOCK_SIZE, Tdata)                                        \
-    launchKernel<BLOCK_SIZE, ITEMS, Tdata>(                                      \
-        _info,                                                                   \
-        (Tdata *)values_output, (int32_t *)indices_output, (const Tdata *)input, \
-        k, dim, largest, sorted,                                                 \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_TOPK_WITH_BLOCK_SIZE(BLOCK_SIZE)            \
-    {                                                         \
-        if (_info.dtype == INFINI_DTYPE_BF16)                 \
-            return CALCULATE_TOPK(BLOCK_SIZE, __nv_bfloat16); \
-        else if (_info.dtype == INFINI_DTYPE_F16)             \
-            return CALCULATE_TOPK(BLOCK_SIZE, half);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)             \
-            return CALCULATE_TOPK(BLOCK_SIZE, float);         \
-        else                                                  \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;            \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_TOPK_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::topk::nvidia
diff --git a/src/infiniop/ops/topk/nvidia/topk_nvidia.cuh b/src/infiniop/ops/topk/nvidia/topk_nvidia.cuh
deleted file mode 100644
index dfeb2977b..000000000
--- a/src/infiniop/ops/topk/nvidia/topk_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __TOPK_NVIDIA_H__
-#define __TOPK_NVIDIA_H__
-
-#include "../topk_desc.h"
-
-DESCRIPTOR(nvidia);
-
-#endif // __TOPK_NVIDIA_H__
diff --git a/src/infiniop/ops/topk/operator.cc b/src/infiniop/ops/topk/operator.cc
deleted file mode 100644
index 08ec6d18b..000000000
--- a/src/infiniop/ops/topk/operator.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/topk.h"
-#include <vector>
-
-#ifdef ENABLE_CPU_API
-#include "cpu/topk_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
-#include "nvidia/topk_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/topk_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/topk_kunlun.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/topk_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateTopKDescriptor(
-    infiniopHandle_t handle,
-    infiniopTopKDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t values_output_desc,
-    infiniopTensorDescriptor_t indices_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted) {
-
-#define CREATE(CASE, NAMESPACE)                                             \
-    case CASE:                                                              \
-        return op::topk::NAMESPACE::Descriptor::create(                     \
-            handle,                                                         \
-            reinterpret_cast<op::topk::NAMESPACE::Descriptor **>(desc_ptr), \
-            values_output_desc,                                             \
-            indices_output_desc,                                            \
-            input_desc,                                                     \
-            k,                                                              \
-            dim,                                                            \
-            largest,                                                        \
-            sorted)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetTopKWorkspaceSize(infiniopTopKDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                \
-    case CASE:                                                                              \
-        *size = reinterpret_cast<op::topk::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopTopK(
-    infiniopTopKDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *values_output,
-    void *indices_output,
-    const void *input,
-    size_t k,
-    size_t dim,
-    bool largest,
-    bool sorted,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                             \
-    case CASE:                                                                 \
-        return reinterpret_cast<const op::topk::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, values_output, indices_output, input, k, dim, largest, sorted, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyTopKDescriptor(infiniopTopKDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                 \
-    case CASE:                                                                  \
-        delete reinterpret_cast<const op::topk::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/topk/topk_desc.h b/src/infiniop/ops/topk/topk_desc.h
deleted file mode 100644
index 309ec939a..000000000
--- a/src/infiniop/ops/topk/topk_desc.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef INFINIOP_TOPK_DESCRIPTOR_H_
-#define INFINIOP_TOPK_DESCRIPTOR_H_
-#include "../../../utils.h"
-#include "../../operator.h"
-#include "../../tensor.h"
-
-#include "info.h"
-
-#define DESCRIPTOR(NAMESPACE)                                    \
-                                                                 \
-    namespace op::topk::NAMESPACE {                              \
-    class Descriptor final : public InfiniopDescriptor {         \
-        struct Opaque;                                           \
-        Opaque *_opaque;                                         \
-        TopKInfo _info;                                          \
-        size_t _workspace_size;                                  \
-                                                                 \
-        Descriptor(                                              \
-            Opaque *opaque,                                      \
-            TopKInfo info,                                       \
-            size_t workspace_size,                               \
-            infiniDevice_t device_type,                          \
-            int device_id)                                       \
-            : InfiniopDescriptor{device_type, device_id},        \
-              _opaque(opaque),                                   \
-              _info(info),                                       \
-              _workspace_size(workspace_size) {}                 \
-                                                                 \
-    public:                                                      \
-        ~Descriptor();                                           \
-        size_t workspaceSize() const { return _workspace_size; } \
-                                                                 \
-        static infiniStatus_t create(                            \
-            infiniopHandle_t handle,                             \
-            Descriptor **desc_ptr,                               \
-            infiniopTensorDescriptor_t values_output_desc,       \
-            infiniopTensorDescriptor_t indices_output_desc,      \
-            infiniopTensorDescriptor_t input_desc,               \
-            size_t k,                                            \
-            size_t dim,                                          \
-            bool largest,                                        \
-            bool sorted);                                        \
-                                                                 \
-        infiniStatus_t calculate(                                \
-            void *workspace, size_t workspace_size,              \
-            void *values_output,                                 \
-            void *indices_output,                                \
-            const void *input,                                   \
-            size_t k,                                            \
-            size_t dim,                                          \
-            bool largest,                                        \
-            bool sorted,                                         \
-            void *stream) const;                                 \
-    };                                                           \
-    }
-
-#endif
diff --git a/src/infiniop/ops/var/cpu/var_cpu.cc b/src/infiniop/ops/var/cpu/var_cpu.cc
deleted file mode 100644
index bd749a4ef..000000000
--- a/src/infiniop/ops/var/cpu/var_cpu.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "var_cpu.h"
-#include "../../../../utils.h"
-#include "../../../devices/cpu/common_cpu.h"
-namespace op::var::cpu {
-
-Descriptor::~Descriptor() {}
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-
-    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-// welford
-namespace {
-bool IsNanOut(const VarInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-// 直接用float计算
-template <typename Tdata>
-void computeVarUsingWelfordCpu(const Tdata *input_ptr, float &var_output, size_t start, size_t end, const VarInfo &info) {
-    if (start >= end) {
-        return;
-    }
-    float old_mean = 0.0f; // previous mean
-    float mean = 0.0f;     // new mean
-    float M2 = 0.0f;       // variance sum
-    size_t count = 0;      // element count of new sum
-    for (size_t idx = start; idx < end; ++idx) {
-        size_t input_offset = op::common_cpu::indexToOffset(idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data());
-        ;
-        float value = utils::cast<float>(input_ptr[input_offset]);
-        count++;
-        old_mean = mean;
-        mean += (value - mean) / count;
-        M2 += (value - old_mean) * (value - mean);
-    }
-    var_output = M2 / (info.unbiased_var ? (count - 1) : count);
-}
-
-template <typename Tdata>
-infiniStatus_t calculateVar(
-    const VarInfo &info,
-    Tdata *var_output,
-    const Tdata *input) {
-    Tdata nan_value = utils::cast<Tdata>(NAN);
-    bool is_scalar = (info.reduce_dim_size == info.permuted_input_shape.size());
-    for (size_t i = 0; i < info.output_size; ++i) {
-        size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data());
-        if (IsNanOut(info)) {
-            var_output[output_offset] = nan_value;
-        } else {
-            size_t start = is_scalar ? 0 : i * info.reduce_num;
-            size_t end = is_scalar ? info.input_size : (i + 1) * info.reduce_num;
-            float var = 0.0f;
-            computeVarUsingWelfordCpu(input, var, start, end, info);
-            var_output[output_offset] = utils::cast<Tdata>(var);
-        }
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream) const {
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return calculateVar<fp16_t>(_info, (fp16_t *)var_output, reinterpret_cast<const fp16_t *>(input));
-    case INFINI_DTYPE_F32:
-        return calculateVar<float>(_info, (float *)var_output, reinterpret_cast<const float *>(input));
-    case INFINI_DTYPE_BF16:
-        return calculateVar<bf16_t>(_info, (bf16_t *)var_output, reinterpret_cast<const bf16_t *>(input));
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::var::cpu
diff --git a/src/infiniop/ops/var/cpu/var_cpu.h b/src/infiniop/ops/var/cpu/var_cpu.h
deleted file mode 100644
index 12f1b243c..000000000
--- a/src/infiniop/ops/var/cpu/var_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_VAR_CPU_H__
-#define __INFINIOP_VAR_CPU_H__
-
-#include "../var_desc.h"
-
-DESCRIPTOR(cpu);
-
-#endif // __INFINIOP_VAR_CPU_H__
diff --git a/src/infiniop/ops/var/cuda/kernel.cuh b/src/infiniop/ops/var/cuda/kernel.cuh
deleted file mode 100644
index 03df669b5..000000000
--- a/src/infiniop/ops/var/cuda/kernel.cuh
+++ /dev/null
@@ -1,370 +0,0 @@
-#ifndef __VAR_CUDA_H__
-#define __VAR_CUDA_H__
-
-#include <cmath> // NAN
-
-__forceinline__ __device__ __host__ size_t indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
-namespace device {
-namespace cuda {
-template <typename Tdata>
-__inline__ __device__ Tdata Nan();
-template <>
-__inline__ __device__ float Nan<float>() {
-    return NAN;
-}
-template <>
-__inline__ __device__ double Nan<double>() {
-    return NAN;
-}
-template <>
-__inline__ __device__ half Nan<half>() {
-    return __float2half(NAN);
-}
-
-#if defined(ENABLE_MOORE_API)
-using bf16_t = __mt_bfloat16;
-#elif defined(ENABLE_METAX_API)
-using bf16_t = __hpcc_bfloat16;
-#else
-using bf16_t = __nv_bfloat16;
-#endif
-
-/* bf16 */
-template <>
-__inline__ __device__ bf16_t Nan<bf16_t>() {
-    return __float2bfloat16_rn(NAN);
-}
-
-template <typename Tdata>
-__inline__ __device__ Tdata Div(Tdata a, Tdata b);
-template <>
-__inline__ __device__ float Div<float>(float a, float b) {
-#ifdef OF_LAYER_NORM_USE_FAST_MATH
-    return __fdividef(a, b);
-#else
-    return a / b;
-#endif
-}
-template <>
-__inline__ __device__ double Div<double>(double a, double b) {
-    return a / b;
-}
-template <>
-__inline__ __device__ half Div<half>(half a, half b) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return __hdiv(a, b);
-#else
-    return __float2half(__half2float(a) / __half2float(b));
-#endif
-}
-template <>
-__inline__ __device__ bf16_t Div<bf16_t>(bf16_t a, bf16_t b) {
-
-#if defined(ENABLE_NVIDIA_API) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    return __hdiv(a, b);
-#else
-    return __float2bfloat16_rn(
-        __bfloat162float(a) / __bfloat162float(b));
-#endif
-}
-
-template <typename Tdata, typename ComputeType>
-inline __device__ void WelfordReduce(const Tdata *input_ptr, ComputeType &mean, ComputeType &m2, ComputeType &count,
-                                     const size_t start, const size_t end, const size_t step,
-                                     const size_t ndim, const size_t *shape, const ptrdiff_t *strides) {
-    ComputeType old_mean = 0.0;
-    for (size_t i = start; i < end; i += step) {
-        ++count;
-        old_mean = mean;
-        size_t input_offset = indexToOffset(i, ndim, shape, strides);
-        ComputeType input_value = static_cast<ComputeType>(input_ptr[input_offset]);
-        mean += (input_value - mean) / count;
-        m2 += (input_value - mean)
-            * (input_value - old_mean);
-    }
-}
-
-template <typename Tdata>
-inline __device__ void WelfordCombine(Tdata val, Tdata &mean, Tdata &m2, Tdata &count) {
-    count += 1;
-    Tdata delta1 = val - mean;
-    mean += Div(delta1, count);
-    Tdata delta2 = val - mean;
-    m2 += delta1 * delta2;
-}
-
-template <typename Tdata>
-inline __device__ void WelfordCombine(Tdata b_mean, Tdata b_m2, Tdata b_count, Tdata &mean, Tdata &m2, Tdata &count) {
-    if (b_count == 0) {
-        return;
-    }
-    Tdata new_count = count + b_count;              // n1 + n2
-    Tdata nb_over_n = Div(b_count, new_count);      // n2 / (n1 + n2)
-    Tdata delta = b_mean - mean;                    // mean2 - mean1
-    mean += delta * nb_over_n;                      // mean1 + n2 * (mean2 - mean1) / (n1 + n2)
-    m2 += b_m2 + delta * delta * count * nb_over_n; // m21 + m22 + n2 * (mean2 - mean1) ^ 2 / (n1 + n2)
-    count = new_count;
-}
-
-template <typename Tdata>
-inline __device__ void WelfordCombineLoop(const Tdata *b_mean, const Tdata *b_m2, const Tdata *b_count,
-                                          Tdata &mean, Tdata &m2, Tdata &count,
-                                          const size_t start, const size_t end, const size_t step) {
-    for (size_t i = start; i < end; i += step) {
-        WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count);
-    }
-}
-
-template <typename Tdata, int thread_group_width = 32>
-__inline__ __device__ void WelfordWarpReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count,
-                                             Tdata &mean, Tdata &m2, Tdata &count) {
-    mean = thread_mean;
-    m2 = thread_m2;
-    count = thread_count;
-    for (int lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) {
-        Tdata b_mean = __shfl_down_sync(0xffffffff, mean, lane_mask, thread_group_width);
-        Tdata b_m2 = __shfl_down_sync(0xffffffff, m2, lane_mask, thread_group_width);
-        Tdata b_count = __shfl_down_sync(0xffffffff, count, lane_mask, thread_group_width);
-        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
-    }
-}
-
-template <typename Tdata, size_t kWarpSize = 32>
-__inline__ __device__ void WelfordBlockAllReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count,
-                                                 Tdata &result_mean, Tdata &result_m2, Tdata &result_count) {
-    __shared__ Tdata mean_shared[kWarpSize];
-    __shared__ Tdata m2_shared[kWarpSize];
-    __shared__ Tdata count_shared[kWarpSize];
-    __shared__ Tdata mean_result_broadcast;
-    __shared__ Tdata m2_result_broadcast;
-    __shared__ Tdata count_result_broadcast;
-    const int lid = threadIdx.x % kWarpSize;
-    const int wid = threadIdx.x / kWarpSize;
-    // warp内规约
-    Tdata warp_mean = 0.0;
-    Tdata warp_m2 = 0.0;
-    Tdata warp_count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, warp_mean, warp_m2, warp_count);
-    __syncthreads();
-    if (lid == 0) { // 每个warp内的的thread0 保存warp结果
-        mean_shared[wid] = warp_mean;
-        m2_shared[wid] = warp_m2;
-        count_shared[wid] = warp_count;
-    }
-    __syncthreads();
-    // warp间规约
-    if (wid == 0) {
-        if (threadIdx.x < blockDim.x / kWarpSize) {
-            warp_mean = mean_shared[lid];
-            warp_m2 = m2_shared[lid];
-            warp_count = count_shared[lid];
-        } else {
-            warp_mean = static_cast<Tdata>(0);
-            warp_m2 = static_cast<Tdata>(0);
-            warp_count = static_cast<Tdata>(0);
-        }
-        __syncwarp();
-        Tdata block_mean = 0;
-        Tdata block_m2 = 0;
-        Tdata block_count = 0;
-        WelfordWarpReduce(warp_mean, warp_m2, warp_count, block_mean, block_m2, block_count);
-        if (lid == 0) {
-            mean_result_broadcast = block_mean;
-            m2_result_broadcast = block_m2;
-            count_result_broadcast = block_count;
-        }
-    }
-    __syncthreads();
-    result_mean = mean_result_broadcast;
-    result_m2 = m2_result_broadcast;
-    result_count = count_result_broadcast;
-}
-} // namespace cuda
-} // namespace device
-
-__device__ int32_t done_block_counts = 0;
-
-template <typename Tdata, typename ComputeType>
-__global__ void ComputeVarScalarOut(const Tdata *input_ptr, Tdata *var_output_ptr, ComputeType *tmp_buffer_ptr, // Tdata *mean_output_ptr,
-                                    size_t input_size, size_t input_ndim, size_t *permuted_input_shape, ptrdiff_t *permuted_input_strides,
-                                    bool unbiased, bool is_nan) {
-    // 处理 NaN 情况
-    if (is_nan) {
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
-            *var_output_ptr = device::cuda::Nan<Tdata>();
-        } // mean_output_ptr[0] = (input_size == 0) ? device::cuda::Nan<Tdata>() : input_ptr[0];}
-        return;
-    }
-
-    // 计算每个 block 和 thread 的工作量
-    const size_t elems_per_block = input_size / gridDim.x;
-    const size_t elems_per_thread = elems_per_block / blockDim.x;
-    // 线程级 Welford 累积
-    ComputeType thread_mean = 0.0, thread_m2 = 0.0, thread_count = 0;
-
-    // 每个线程处理常规元素（stride 访问）
-    if (elems_per_thread > 0) {
-        const size_t block_start = blockIdx.x * elems_per_block;
-        const size_t regular_elems = elems_per_block - (elems_per_block % blockDim.x);
-        device::cuda::WelfordReduce<Tdata, ComputeType>(input_ptr, thread_mean, thread_m2, thread_count,
-                                                        /*start=*/block_start + threadIdx.x, /*end=*/block_start + regular_elems, /*step=*/blockDim.x,
-                                                        /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides);
-    }
-
-    // thread 0 处理本 block 的尾部元素以及跨 block 的尾部元素（单个线程处理）
-    if (threadIdx.x == 0) {
-        size_t tail_count = elems_per_block % blockDim.x;
-        // 最后一个 block 还需要处理总元素数的尾部
-        if (blockIdx.x == gridDim.x - 1) {
-            tail_count += input_size % gridDim.x;
-        }
-        if (tail_count > 0) {
-            const size_t tail_start = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread;
-            device::cuda::WelfordReduce<Tdata, ComputeType>(input_ptr, thread_mean, thread_m2, thread_count,
-                                                            /*start=*/tail_start, /*end=*/tail_start + tail_count, /*step=*/1,
-                                                            /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides);
-        }
-    }
-
-    // Block 级规约
-    ComputeType block_mean = 0.0, block_m2 = 0.0, block_count = 0;
-    device::cuda::WelfordBlockAllReduce<ComputeType>(thread_mean, thread_m2, thread_count,
-                                                     block_mean, block_m2, block_count);
-
-    // 单 block 情况：直接输出结果
-    if (gridDim.x == 1) {
-        if (threadIdx.x == 0) {
-            ComputeType divisor = unbiased ? block_count - 1 : block_count;
-            var_output_ptr[0] = device::cuda::Div(block_m2, divisor);
-        }
-        return;
-    }
-
-    // 多 block 情况：使用临时缓冲区
-    ComputeType *tmp_mean_ptr = tmp_buffer_ptr;
-    ComputeType *tmp_m2_ptr = tmp_mean_ptr + gridDim.x;
-    ComputeType *tmp_count_ptr = tmp_m2_ptr + gridDim.x;
-
-    // 保存本 block 的结果
-    if (threadIdx.x == 0) {
-        tmp_mean_ptr[blockIdx.x] = block_mean;
-        tmp_m2_ptr[blockIdx.x] = block_m2;
-        tmp_count_ptr[blockIdx.x] = block_count;
-    }
-
-    // 最后一个 block 负责最终规约
-    __shared__ bool is_last_block;
-    if (threadIdx.x == 0) {
-        is_last_block = (atomicAdd(&done_block_counts, 1) == gridDim.x - 1);
-    }
-    __syncthreads();
-
-    if (is_last_block) {
-        // 每个线程合并一部分 block 的结果
-        ComputeType final_thread_mean = 0.0, final_thread_m2 = 0.0, final_thread_count = 0;
-        const size_t blocks_per_thread = gridDim.x / blockDim.x;
-        const size_t regular_blocks = blocks_per_thread * blockDim.x;
-
-        if (blocks_per_thread > 0) {
-            device::cuda::WelfordCombineLoop(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr,
-                                             final_thread_mean, final_thread_m2, final_thread_count,
-                                             /*start=*/threadIdx.x, /*end=*/regular_blocks, /*step=*/blockDim.x);
-        }
-
-        // thread 0 处理尾部 block
-        if (threadIdx.x == 0 && regular_blocks < gridDim.x) {
-            device::cuda::WelfordCombineLoop(&tmp_mean_ptr[regular_blocks], &tmp_m2_ptr[regular_blocks], &tmp_count_ptr[regular_blocks],
-                                             final_thread_mean, final_thread_m2, final_thread_count,
-                                             /*start=*/0, /*end=*/gridDim.x - regular_blocks, /*step=*/1);
-        }
-
-        // 最终 block 级规约并输出
-        ComputeType final_mean = 0, final_m2 = 0, final_count = 0;
-        device::cuda::WelfordBlockAllReduce<ComputeType>(final_thread_mean, final_thread_m2, final_thread_count,
-                                                         final_mean, final_m2, final_count);
-        if (threadIdx.x == 0) {
-            ComputeType divisor = unbiased ? final_count - 1 : final_count;
-            var_output_ptr[0] = device::cuda::Div(final_m2, divisor);
-            done_block_counts = 0; // 重置计数器
-        }
-    }
-}
-
-// CUDA: grid stride looping
-#define CUDA_1D_KERNEL_LOOP(i, n)                                                                  \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
-         i += step)
-
-template <typename Tdata, typename ComputeType>
-__forceinline__ __device__ __host__ void ComputeVarUsingWelford(
-    const Tdata *input_ptr,
-    size_t offset,
-    Tdata &var_output,
-    size_t reduce_num,
-    size_t input_ndim,
-    size_t *permuted_input_shape,
-    ptrdiff_t *permuted_input_strides,
-    bool unbiased) {
-    size_t count = 0;
-    ComputeType mean = 0.0;
-    ComputeType old_mean = 0.0;
-    ComputeType m2 = 0.0;
-    for (size_t i = 0; i < reduce_num; ++i) {
-        size_t input_offset = indexToOffset(offset + i, input_ndim, permuted_input_shape, permuted_input_strides);
-        count++;
-        old_mean = mean;
-        mean = old_mean + (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) / count;
-        m2 += (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) * (static_cast<ComputeType>(input_ptr[input_offset]) - mean);
-    }
-    var_output = static_cast<Tdata>(m2 / (unbiased ? count - 1 : count));
-}
-
-template <typename Tdata, typename ComputeType>
-__global__ void ComputeVarUsingWelfordWrapper(
-    const Tdata *input_ptr, Tdata *var_output_ptr,
-    size_t input_ndim,
-    size_t output_size,
-    size_t reduce_num,
-    size_t *permuted_input_shape,
-    ptrdiff_t *permuted_input_strides,
-    bool unbiased,
-    bool is_nan) {
-    if (is_nan) {
-        if (reduce_num == 0) {
-            CUDA_1D_KERNEL_LOOP(i, output_size) {
-                var_output_ptr[i] = device::cuda::Nan<Tdata>();
-            }
-        } else {
-            CUDA_1D_KERNEL_LOOP(i, output_size) {
-                // const size_t input_offset = indexToOffset(i * reduce_num, input_ndim, permuted_input_shape, permuted_input_strides);
-                var_output_ptr[i] = device::cuda::Nan<Tdata>();
-            }
-        }
-    } else {
-        CUDA_1D_KERNEL_LOOP(i, output_size) {
-            ComputeVarUsingWelford<Tdata, ComputeType>(
-                input_ptr,
-                i * reduce_num,
-                var_output_ptr[i],
-                reduce_num,
-                input_ndim,
-                permuted_input_shape,
-                permuted_input_strides,
-                unbiased);
-        }
-    }
-}
-
-#endif // __VAR_CUDA_H__
diff --git a/src/infiniop/ops/var/info.h b/src/infiniop/ops/var/info.h
deleted file mode 100644
index f89e1c0dc..000000000
--- a/src/infiniop/ops/var/info.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef __VAR_INFO_H__
-#define __VAR_INFO_H__
-#include "../../../utils.h"
-#include "../../tensor.h"
-#include <algorithm>
-#include <cstddef>
-#include <vector>
-
-namespace op::var {
-class VarInfo {
-    VarInfo() = default;
-
-public:
-    infiniDtype_t dtype;
-    std::vector<size_t> permuted_input_shape; // need to permute
-    std::vector<size_t> output_shape;
-    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
-    std::vector<ptrdiff_t> output_strides;
-    size_t reduce_dim_size; // reduce dim size
-    size_t reduce_num;      // number of elements to reduce for each output element
-    size_t input_size;      // total number of input elements
-    size_t output_size;     // total number of output elements
-    bool unbiased_var;
-    static utils::Result<VarInfo> create(
-        infiniopTensorDescriptor_t var_output_desc,
-        infiniopTensorDescriptor_t input_desc,
-        size_t *dim,
-        size_t dim_size,
-        bool unbiased,
-        bool keepdim) {
-        auto input_shape = input_desc->shape();
-        auto input_strides = input_desc->strides();
-        size_t input_ndim = input_desc->ndim();
-        size_t reduce_num = 1;
-        for (size_t i = 0; i < dim_size; i++) {
-            reduce_num *= input_shape[dim[i]];
-        }
-        std::vector<size_t> permute_order;
-        for (size_t i = 0; i < input_ndim; i++) {
-            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
-                permute_order.push_back(i);
-            }
-        }
-        for (size_t i = 0; i < dim_size; i++) {
-            permute_order.push_back(dim[i]);
-        }
-        std::vector<size_t> permuted_input_shape;
-        std::vector<ptrdiff_t> permuted_input_strides;
-        for (size_t i = 0; i < permute_order.size(); i++) {
-            permuted_input_shape.push_back(input_shape[permute_order[i]]);
-            permuted_input_strides.push_back(input_strides[permute_order[i]]);
-        }
-        return utils::Result<VarInfo>(VarInfo{input_desc->dtype(),
-                                              permuted_input_shape,
-                                              var_output_desc->shape(),
-                                              permuted_input_strides,
-                                              var_output_desc->strides(),
-                                              dim_size,
-                                              reduce_num,
-                                              input_desc->numel(),
-                                              var_output_desc->numel(),
-                                              unbiased});
-    }
-};
-} // namespace op::var
-
-#endif
diff --git a/src/infiniop/ops/var/metax/var_metax.h b/src/infiniop/ops/var/metax/var_metax.h
deleted file mode 100644
index 99edcee98..000000000
--- a/src/infiniop/ops/var/metax/var_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __VAR_METAX_H__
-#define __VAR_METAX_H__
-
-#include "../var_desc.h"
-
-DESCRIPTOR(metax);
-
-#endif // __VAR_METAX_H__
diff --git a/src/infiniop/ops/var/metax/var_metax.maca b/src/infiniop/ops/var/metax/var_metax.maca
deleted file mode 100644
index ae8218646..000000000
--- a/src/infiniop/ops/var/metax/var_metax.maca
+++ /dev/null
@@ -1,124 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "var_metax.h"
-
-namespace op::var::metax {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-bool IsNanOut(const VarInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
-infiniStatus_t launchKernel(
-    const VarInfo &info,
-    Tdata *var_output, const Tdata *input,
-    bool unbiased, bool keepdim,
-    hcStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-
-    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(ptrdiff_t);
-
-    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-    bool is_nan = IsNanOut(info);
-    if (info.reduce_num == input_size) { // scalar output
-        ComputeType *tmp_buffer;
-        constexpr size_t MAX_GRID_SIZE = 128;
-        size_t grid_size = std::min(MAX_GRID_SIZE,
-                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        CHECK_METAX(hcMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
-        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, tmp_buffer, input_size, input_ndim,
-            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
-        CHECK_METAX(hcFree(tmp_buffer));
-    } else {
-        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        ComputeVarUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, input_ndim, output_size, reduce_num,
-            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream_) const {
-
-    hcStream_t stream = (hcStream_t)stream_;
-
-#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \
-    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(     \
-        _info,                                        \
-        (Tdata *)var_output, (const Tdata *)input,    \
-        unbiased, keepdim,                            \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE)                      \
-    {                                                                  \
-        if (_info.dtype == INFINI_DTYPE_BF16)                          \
-            return CALCULATE_VAR(BLOCK_SIZE, __hpcc_bfloat16, double); \
-        else if (_info.dtype == INFINI_DTYPE_F16)                      \
-            return CALCULATE_VAR(BLOCK_SIZE, half, double);            \
-        else if (_info.dtype == INFINI_DTYPE_F32)                      \
-            return CALCULATE_VAR(BLOCK_SIZE, float, double);           \
-        else                                                           \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;                     \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_VAR_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::var::metax
diff --git a/src/infiniop/ops/var/moore/var_moore.h b/src/infiniop/ops/var/moore/var_moore.h
deleted file mode 100644
index 220912b5e..000000000
--- a/src/infiniop/ops/var/moore/var_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __VAR_MOORE_H__
-#define __VAR_MOORE_H__
-
-#include "../var_desc.h"
-
-DESCRIPTOR(moore);
-
-#endif // __VAR_MOORE_H__
diff --git a/src/infiniop/ops/var/moore/var_moore.mu b/src/infiniop/ops/var/moore/var_moore.mu
deleted file mode 100644
index 3e72da2b4..000000000
--- a/src/infiniop/ops/var/moore/var_moore.mu
+++ /dev/null
@@ -1,124 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "../../../devices/moore/moore_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "var_moore.h"
-
-namespace op::var::moore {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-bool IsNanOut(const VarInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
-infiniStatus_t launchKernel(
-    const VarInfo &info,
-    Tdata *var_output, const Tdata *input,
-    bool unbiased, bool keepdim,
-    musaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-
-    size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(ptrdiff_t);
-
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-    bool is_nan = IsNanOut(info);
-    if (info.reduce_num == input_size) { // scalar output
-        ComputeType *tmp_buffer;
-        constexpr size_t MAX_GRID_SIZE = 128;
-        size_t grid_size = std::min(MAX_GRID_SIZE,
-                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
-        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, tmp_buffer, input_size, input_ndim,
-            permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
-        CHECK_MOORE(musaFree(tmp_buffer));
-    } else {
-        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        ComputeVarUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, input_ndim, output_size, reduce_num,
-            permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream_) const {
-
-    musaStream_t stream = (musaStream_t)stream_;
-
-#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \
-    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(     \
-        _info,                                        \
-        (Tdata *)var_output, (const Tdata *)input,    \
-        unbiased, keepdim,                            \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE)                    \
-    {                                                                \
-        if (_info.dtype == INFINI_DTYPE_BF16)                        \
-            return CALCULATE_VAR(BLOCK_SIZE, __mt_bfloat16, double); \
-        else if (_info.dtype == INFINI_DTYPE_F16)                    \
-            return CALCULATE_VAR(BLOCK_SIZE, half, double);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)                    \
-            return CALCULATE_VAR(BLOCK_SIZE, float, double);         \
-        else                                                         \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;                   \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_VAR_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::var::moore
diff --git a/src/infiniop/ops/var/nvidia/var_nvidia.cu b/src/infiniop/ops/var/nvidia/var_nvidia.cu
deleted file mode 100644
index a0166f804..000000000
--- a/src/infiniop/ops/var/nvidia/var_nvidia.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "var_nvidia.cuh"
-
-namespace op::var::nvidia {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-bool IsNanOut(const VarInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
-infiniStatus_t launchKernel(
-    const VarInfo &info,
-    Tdata *var_output, const Tdata *input,
-    bool unbiased, bool keepdim,
-    cudaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    // size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-
-    size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(ptrdiff_t);
-
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-    bool is_nan = IsNanOut(info);
-    if (info.reduce_num == input_size) { // scalar output
-        ComputeType *tmp_buffer;
-        constexpr size_t MAX_GRID_SIZE = 128;
-        size_t grid_size = std::min(MAX_GRID_SIZE,
-                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
-        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, tmp_buffer, input_size, input_ndim,
-            permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
-        CHECK_CUDA(cudaFree(tmp_buffer));
-    } else {
-        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        ComputeVarUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, input_ndim, output_size, reduce_num,
-            permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream_) const {
-
-    cudaStream_t stream = (cudaStream_t)stream_;
-
-#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \
-    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(     \
-        _info,                                        \
-        (Tdata *)var_output, (const Tdata *)input,    \
-        unbiased, keepdim,                            \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE)                    \
-    {                                                                \
-        if (_info.dtype == INFINI_DTYPE_BF16)                        \
-            return CALCULATE_VAR(BLOCK_SIZE, __nv_bfloat16, double); \
-        else if (_info.dtype == INFINI_DTYPE_F16)                    \
-            return CALCULATE_VAR(BLOCK_SIZE, half, double);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)                    \
-            return CALCULATE_VAR(BLOCK_SIZE, float, double);         \
-        else                                                         \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;                   \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_VAR_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::var::nvidia
diff --git a/src/infiniop/ops/var/nvidia/var_nvidia.cuh b/src/infiniop/ops/var/nvidia/var_nvidia.cuh
deleted file mode 100644
index 8abfa87a0..000000000
--- a/src/infiniop/ops/var/nvidia/var_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __VAR_NVIDIA_H__
-#define __VAR_NVIDIA_H__
-
-#include "../var_desc.h"
-
-DESCRIPTOR(nvidia);
-
-#endif // __VAR_NVIDIA_H__
diff --git a/src/infiniop/ops/var/operator.cc b/src/infiniop/ops/var/operator.cc
deleted file mode 100644
index b963c0531..000000000
--- a/src/infiniop/ops/var/operator.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/var.h"
-#include <vector>
-
-#ifdef ENABLE_CPU_API
-#include "cpu/var_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
-#include "nvidia/var_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/var_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/var_kunlun.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/var_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateVarDescriptor(
-    infiniopHandle_t handle,
-    infiniopVarDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-
-#define CREATE(CASE, NAMESPACE)                                            \
-    case CASE:                                                             \
-        return op::var::NAMESPACE::Descriptor::create(                     \
-            handle,                                                        \
-            reinterpret_cast<op::var::NAMESPACE::Descriptor **>(desc_ptr), \
-            var_output_desc,                                               \
-            input_desc,                                                    \
-            dim,                                                           \
-            dim_size,                                                      \
-            unbiased,                                                      \
-            keepdim)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetVarWorkspaceSize(infiniopVarDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                               \
-    case CASE:                                                                             \
-        *size = reinterpret_cast<op::var::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopVar(
-    infiniopVarDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                            \
-    case CASE:                                                                \
-        return reinterpret_cast<const op::var::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, var_output, input, unbiased, keepdim, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyVarDescriptor(infiniopVarDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                \
-    case CASE:                                                                 \
-        delete reinterpret_cast<const op::var::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/var/var_desc.h b/src/infiniop/ops/var/var_desc.h
deleted file mode 100644
index e0cae2c89..000000000
--- a/src/infiniop/ops/var/var_desc.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef INFINIOP_VAR_DESCRIPTOR_H_
-#define INFINIOP_VAR_DESCRIPTOR_H_
-#include "../../../utils.h"
-#include "../../operator.h"
-#include "../../tensor.h"
-
-#include "info.h"
-
-#define DESCRIPTOR(NAMESPACE)                                    \
-                                                                 \
-    namespace op::var::NAMESPACE {                               \
-    class Descriptor final : public InfiniopDescriptor {         \
-        struct Opaque;                                           \
-        Opaque *_opaque;                                         \
-        VarInfo _info;                                           \
-        size_t _workspace_size;                                  \
-                                                                 \
-        Descriptor(                                              \
-            Opaque *opaque,                                      \
-            VarInfo info,                                        \
-            size_t workspace_size,                               \
-            infiniDevice_t device_type,                          \
-            int device_id)                                       \
-            : InfiniopDescriptor{device_type, device_id},        \
-              _opaque(opaque),                                   \
-              _info(info),                                       \
-              _workspace_size(workspace_size) {}                 \
-                                                                 \
-    public:                                                      \
-        ~Descriptor();                                           \
-        size_t workspaceSize() const { return _workspace_size; } \
-                                                                 \
-        static infiniStatus_t create(                            \
-            infiniopHandle_t handle,                             \
-            Descriptor **desc_ptr,                               \
-            infiniopTensorDescriptor_t var_output_desc,          \
-            infiniopTensorDescriptor_t input_desc,               \
-            size_t *dim,                                         \
-            size_t dim_size,                                     \
-            bool unbiased,                                       \
-            bool keepdim);                                       \
-                                                                 \
-        infiniStatus_t calculate(                                \
-            void *workspace, size_t workspace_size,              \
-            void *var_output,                                    \
-            const void *input,                                   \
-            bool unbiased,                                       \
-            bool keepdim,                                        \
-            void *stream) const;                                 \
-    };                                                           \
-    }
-
-#endif
diff --git a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc b/src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc
deleted file mode 100644
index 0747b0c26..000000000
--- a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-#include "var_mean_cpu.h"
-#include "../../../../utils.h"
-#include "../../../devices/cpu/common_cpu.h"
-namespace op::var_mean::cpu {
-
-Descriptor::~Descriptor() {}
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t mean_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-
-    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-// welford
-namespace {
-bool IsNanOut(const VarMeanInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-// 直接用float计算
-template <typename Tdata>
-void computeVarMeanUsingWelfordCpu(const Tdata *input_ptr, float &var_output, float &mean_output, size_t start, size_t end, const VarMeanInfo &info) {
-    if (start >= end) {
-        return;
-    }
-    float old_mean = 0.0f; // previous mean
-    float mean = 0.0f;     // new mean
-    float M2 = 0.0f;       // variance sum
-    size_t count = 0;      // element count of new sum
-    for (size_t idx = start; idx < end; ++idx) {
-        size_t input_offset = op::common_cpu::indexToOffset(idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data());
-        ;
-        float value = utils::cast<float>(input_ptr[input_offset]);
-        count++;
-        old_mean = mean;
-        mean += (value - mean) / count;
-        M2 += (value - old_mean) * (value - mean);
-    }
-    mean_output = mean;
-    var_output = M2 / (info.unbiased_var ? (count - 1) : count);
-}
-
-template <typename Tdata>
-infiniStatus_t calculateVarMean(
-    const VarMeanInfo &info,
-    Tdata *var_output,
-    Tdata *mean_output,
-    const Tdata *input) {
-    Tdata nan_value = utils::cast<Tdata>(NAN);
-    bool is_scalar = (info.reduce_dim_size == info.permuted_input_shape.size());
-    // #pragma omp parallel for
-    for (size_t i = 0; i < info.output_size; ++i) {
-        size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data());
-        if (IsNanOut(info)) {
-            var_output[output_offset] = nan_value;
-            if (info.reduce_num == 0) {
-                mean_output[output_offset] = nan_value;
-            } else {
-                size_t input_idx = is_scalar ? 0 : i * info.reduce_num;
-                size_t input_offset = op::common_cpu::indexToOffset(input_idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data());
-                mean_output[output_offset] = input[input_offset];
-            }
-        } else {
-            size_t start = is_scalar ? 0 : i * info.reduce_num;
-            size_t end = is_scalar ? info.input_size : (i + 1) * info.reduce_num;
-            float var = 0.0f, mean = 0.0f;
-            computeVarMeanUsingWelfordCpu(input, var, mean, start, end, info);
-            var_output[output_offset] = utils::cast<Tdata>(var);
-            mean_output[output_offset] = utils::cast<Tdata>(mean);
-        }
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    void *mean_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream) const {
-    switch (_info.dtype) {
-    case INFINI_DTYPE_F16:
-        return calculateVarMean<fp16_t>(_info, (fp16_t *)var_output, (fp16_t *)mean_output, reinterpret_cast<const fp16_t *>(input));
-    case INFINI_DTYPE_F32:
-        return calculateVarMean<float>(_info, (float *)var_output, (float *)mean_output, reinterpret_cast<const float *>(input));
-    case INFINI_DTYPE_BF16:
-        return calculateVarMean<bf16_t>(_info, (bf16_t *)var_output, (bf16_t *)mean_output, reinterpret_cast<const bf16_t *>(input));
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-} // namespace op::var_mean::cpu
diff --git a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.h b/src/infiniop/ops/var_mean/cpu/var_mean_cpu.h
deleted file mode 100644
index 205d02d14..000000000
--- a/src/infiniop/ops/var_mean/cpu/var_mean_cpu.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __INFINIOP_VAR_MEAN_CPU_H__
-#define __INFINIOP_VAR_MEAN_CPU_H__
-
-#include "../var_mean_desc.h"
-
-DESCRIPTOR(cpu);
-
-#endif // __INFINIOP_VAR_MEAN_CPU_H__
diff --git a/src/infiniop/ops/var_mean/cuda/kernel.cuh b/src/infiniop/ops/var_mean/cuda/kernel.cuh
deleted file mode 100644
index ed50c37e2..000000000
--- a/src/infiniop/ops/var_mean/cuda/kernel.cuh
+++ /dev/null
@@ -1,378 +0,0 @@
-#ifndef __VAR_MEAN_CUDA_H__
-#define __VAR_MEAN_CUDA_H__
-
-#include <cmath> // NAN
-
-__forceinline__ __device__ __host__ size_t indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
-namespace device {
-namespace cuda {
-template <typename Tdata>
-__inline__ __device__ Tdata Nan();
-template <>
-__inline__ __device__ float Nan<float>() {
-    return NAN;
-}
-template <>
-__inline__ __device__ double Nan<double>() {
-    return NAN;
-}
-template <>
-__inline__ __device__ half Nan<half>() {
-    return __float2half(NAN);
-}
-
-#if defined(ENABLE_MOORE_API)
-using bf16_t = __mt_bfloat16;
-#elif defined(ENABLE_METAX_API)
-using bf16_t = __hpcc_bfloat16;
-#else
-using bf16_t = __nv_bfloat16;
-#endif
-
-/* bf16 */
-template <>
-__inline__ __device__ bf16_t Nan<bf16_t>() {
-    return __float2bfloat16_rn(NAN);
-}
-
-template <typename Tdata>
-__inline__ __device__ Tdata Div(Tdata a, Tdata b);
-template <>
-__inline__ __device__ float Div<float>(float a, float b) {
-#ifdef OF_LAYER_NORM_USE_FAST_MATH
-    return __fdividef(a, b);
-#else
-    return a / b;
-#endif
-}
-template <>
-__inline__ __device__ double Div<double>(double a, double b) {
-    return a / b;
-}
-template <>
-__inline__ __device__ half Div<half>(half a, half b) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return __hdiv(a, b);
-#else
-    return __float2half(__half2float(a) / __half2float(b));
-#endif
-}
-template <>
-__inline__ __device__ bf16_t Div<bf16_t>(bf16_t a, bf16_t b) {
-
-#if defined(ENABLE_NVIDIA_API) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    return __hdiv(a, b);
-#else
-    return __float2bfloat16_rn(
-        __bfloat162float(a) / __bfloat162float(b));
-#endif
-}
-
-template <typename Tdata, typename ComputeType>
-inline __device__ void WelfordReduce(const Tdata *input_ptr, ComputeType &mean, ComputeType &m2, ComputeType &count,
-                                     const size_t start, const size_t end, const size_t step,
-                                     const size_t ndim, const size_t *shape, const ptrdiff_t *strides) {
-    ComputeType old_mean = 0.0;
-    for (size_t i = start; i < end; i += step) {
-        ++count;
-        old_mean = mean;
-        size_t input_offset = indexToOffset(i, ndim, shape, strides);
-        ComputeType input_value = static_cast<ComputeType>(input_ptr[input_offset]);
-        mean += (input_value - mean) / count;
-        m2 += (input_value - mean)
-            * (input_value - old_mean);
-    }
-}
-
-template <typename Tdata>
-inline __device__ void WelfordCombine(Tdata val, Tdata &mean, Tdata &m2, Tdata &count) {
-    count += 1;
-    Tdata delta1 = val - mean;
-    mean += Div(delta1, count);
-    Tdata delta2 = val - mean;
-    m2 += delta1 * delta2;
-}
-
-template <typename Tdata>
-inline __device__ void WelfordCombine(Tdata b_mean, Tdata b_m2, Tdata b_count, Tdata &mean, Tdata &m2, Tdata &count) {
-    if (b_count == 0) {
-        return;
-    }
-    Tdata new_count = count + b_count;              // n1 + n2
-    Tdata nb_over_n = Div(b_count, new_count);      // n2 / (n1 + n2)
-    Tdata delta = b_mean - mean;                    // mean2 - mean1
-    mean += delta * nb_over_n;                      // mean1 + n2 * (mean2 - mean1) / (n1 + n2)
-    m2 += b_m2 + delta * delta * count * nb_over_n; // m21 + m22 + n2 * (mean2 - mean1) ^ 2 / (n1 + n2)
-    count = new_count;
-}
-
-template <typename Tdata>
-inline __device__ void WelfordCombineLoop(const Tdata *b_mean, const Tdata *b_m2, const Tdata *b_count,
-                                          Tdata &mean, Tdata &m2, Tdata &count,
-                                          const size_t start, const size_t end, const size_t step) {
-    for (size_t i = start; i < end; i += step) {
-        WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count);
-    }
-}
-
-template <typename Tdata, int thread_group_width = 32>
-__inline__ __device__ void WelfordWarpReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count,
-                                             Tdata &mean, Tdata &m2, Tdata &count) {
-    mean = thread_mean;
-    m2 = thread_m2;
-    count = thread_count;
-    for (int lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) {
-        Tdata b_mean = __shfl_down_sync(0xffffffff, mean, lane_mask, thread_group_width);
-        Tdata b_m2 = __shfl_down_sync(0xffffffff, m2, lane_mask, thread_group_width);
-        Tdata b_count = __shfl_down_sync(0xffffffff, count, lane_mask, thread_group_width);
-        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
-    }
-}
-
-template <typename Tdata, size_t kWarpSize = 32>
-__inline__ __device__ void WelfordBlockAllReduce(Tdata thread_mean, Tdata thread_m2, Tdata thread_count,
-                                                 Tdata &result_mean, Tdata &result_m2, Tdata &result_count) {
-    __shared__ Tdata mean_shared[kWarpSize];
-    __shared__ Tdata m2_shared[kWarpSize];
-    __shared__ Tdata count_shared[kWarpSize];
-    __shared__ Tdata mean_result_broadcast;
-    __shared__ Tdata m2_result_broadcast;
-    __shared__ Tdata count_result_broadcast;
-    const int lid = threadIdx.x % kWarpSize;
-    const int wid = threadIdx.x / kWarpSize;
-    // warp内规约
-    Tdata warp_mean = 0.0;
-    Tdata warp_m2 = 0.0;
-    Tdata warp_count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, warp_mean, warp_m2, warp_count);
-    __syncthreads();
-    if (lid == 0) { // 每个warp内的的thread0 保存warp结果
-        mean_shared[wid] = warp_mean;
-        m2_shared[wid] = warp_m2;
-        count_shared[wid] = warp_count;
-    }
-    __syncthreads();
-    // warp间规约
-    if (wid == 0) {
-        if (threadIdx.x < blockDim.x / kWarpSize) {
-            warp_mean = mean_shared[lid];
-            warp_m2 = m2_shared[lid];
-            warp_count = count_shared[lid];
-        } else {
-            warp_mean = static_cast<Tdata>(0);
-            warp_m2 = static_cast<Tdata>(0);
-            warp_count = static_cast<Tdata>(0);
-        }
-        __syncwarp();
-        Tdata block_mean = 0;
-        Tdata block_m2 = 0;
-        Tdata block_count = 0;
-        WelfordWarpReduce(warp_mean, warp_m2, warp_count, block_mean, block_m2, block_count);
-        if (lid == 0) {
-            mean_result_broadcast = block_mean;
-            m2_result_broadcast = block_m2;
-            count_result_broadcast = block_count;
-        }
-    }
-    __syncthreads();
-    result_mean = mean_result_broadcast;
-    result_m2 = m2_result_broadcast;
-    result_count = count_result_broadcast;
-}
-} // namespace cuda
-} // namespace device
-
-__device__ int32_t done_block_count = 0;
-
-template <typename Tdata, typename ComputeType>
-__global__ void ComputeVarScalarOut(const Tdata *input_ptr, Tdata *var_output_ptr, Tdata *mean_output_ptr, ComputeType *tmp_buffer_ptr,
-                                    size_t input_size, size_t input_ndim, size_t *permuted_input_shape, ptrdiff_t *permuted_input_strides,
-                                    bool unbiased, bool is_nan) {
-    // 处理 NaN 情况
-    if (is_nan) {
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
-            *var_output_ptr = device::cuda::Nan<Tdata>();
-            mean_output_ptr[0] = (input_size == 0) ? device::cuda::Nan<Tdata>() : input_ptr[0];
-        }
-        return;
-    }
-
-    // 计算每个 block 和 thread 的工作量
-    const size_t elems_per_block = input_size / gridDim.x;
-    const size_t elems_per_thread = elems_per_block / blockDim.x;
-    // 线程级 Welford 累积
-    ComputeType thread_mean = 0.0, thread_m2 = 0.0, thread_count = 0;
-
-    // 每个线程处理常规元素（stride 访问）
-    if (elems_per_thread > 0) {
-        const size_t block_start = blockIdx.x * elems_per_block;
-        const size_t regular_elems = elems_per_block - (elems_per_block % blockDim.x);
-        device::cuda::WelfordReduce<Tdata, ComputeType>(input_ptr, thread_mean, thread_m2, thread_count,
-                                                        /*start=*/block_start + threadIdx.x, /*end=*/block_start + regular_elems, /*step=*/blockDim.x,
-                                                        /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides);
-    }
-
-    // thread 0 处理本 block 的尾部元素以及跨 block 的尾部元素（单个线程处理）
-    if (threadIdx.x == 0) {
-        size_t tail_count = elems_per_block % blockDim.x;
-        // 最后一个 block 还需要处理总元素数的尾部
-        if (blockIdx.x == gridDim.x - 1) {
-            tail_count += input_size % gridDim.x;
-        }
-        if (tail_count > 0) {
-            const size_t tail_start = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread;
-            device::cuda::WelfordReduce<Tdata, ComputeType>(input_ptr, thread_mean, thread_m2, thread_count,
-                                                            /*start=*/tail_start, /*end=*/tail_start + tail_count, /*step=*/1,
-                                                            /*ndim=*/input_ndim, /*shape=*/permuted_input_shape, /*strides=*/permuted_input_strides);
-        }
-    }
-
-    // Block 级规约
-    ComputeType block_mean = 0.0, block_m2 = 0.0, block_count = 0;
-    device::cuda::WelfordBlockAllReduce<ComputeType>(thread_mean, thread_m2, thread_count,
-                                                     block_mean, block_m2, block_count);
-
-    // 单 block 情况：直接输出结果
-    if (gridDim.x == 1) {
-        if (threadIdx.x == 0) {
-            ComputeType divisor = unbiased ? block_count - 1 : block_count;
-            var_output_ptr[0] = device::cuda::Div(block_m2, divisor);
-            mean_output_ptr[0] = static_cast<Tdata>(block_mean);
-        }
-        return;
-    }
-
-    // 多 block 情况：使用临时缓冲区
-    ComputeType *tmp_mean_ptr = tmp_buffer_ptr;
-    ComputeType *tmp_m2_ptr = tmp_mean_ptr + gridDim.x;
-    ComputeType *tmp_count_ptr = tmp_m2_ptr + gridDim.x;
-
-    // 保存本 block 的结果
-    if (threadIdx.x == 0) {
-        tmp_mean_ptr[blockIdx.x] = block_mean;
-        tmp_m2_ptr[blockIdx.x] = block_m2;
-        tmp_count_ptr[blockIdx.x] = block_count;
-    }
-
-    // 最后一个 block 负责最终规约
-    __shared__ bool is_last_block;
-    if (threadIdx.x == 0) {
-        is_last_block = (atomicAdd(&done_block_count, 1) == gridDim.x - 1);
-    }
-    __syncthreads();
-
-    if (is_last_block) {
-        // 每个线程合并一部分 block 的结果
-        ComputeType final_thread_mean = 0.0, final_thread_m2 = 0.0, final_thread_count = 0;
-        const size_t blocks_per_thread = gridDim.x / blockDim.x;
-        const size_t regular_blocks = blocks_per_thread * blockDim.x;
-
-        if (blocks_per_thread > 0) {
-            device::cuda::WelfordCombineLoop(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr,
-                                             final_thread_mean, final_thread_m2, final_thread_count,
-                                             /*start=*/threadIdx.x, /*end=*/regular_blocks, /*step=*/blockDim.x);
-        }
-
-        // thread 0 处理尾部 block
-        if (threadIdx.x == 0 && regular_blocks < gridDim.x) {
-            device::cuda::WelfordCombineLoop(&tmp_mean_ptr[regular_blocks], &tmp_m2_ptr[regular_blocks], &tmp_count_ptr[regular_blocks],
-                                             final_thread_mean, final_thread_m2, final_thread_count,
-                                             /*start=*/0, /*end=*/gridDim.x - regular_blocks, /*step=*/1);
-        }
-
-        // 最终 block 级规约并输出
-        ComputeType final_mean = 0, final_m2 = 0, final_count = 0;
-        device::cuda::WelfordBlockAllReduce<ComputeType>(final_thread_mean, final_thread_m2, final_thread_count,
-                                                         final_mean, final_m2, final_count);
-        if (threadIdx.x == 0) {
-            ComputeType divisor = unbiased ? final_count - 1 : final_count;
-            var_output_ptr[0] = device::cuda::Div(final_m2, divisor);
-            mean_output_ptr[0] = static_cast<Tdata>(final_mean);
-            done_block_count = 0; // 重置计数器
-        }
-    }
-}
-
-// CUDA: grid stride looping
-#define CUDA_1D_KERNEL_LOOP(i, n)                                                                  \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
-         i += step)
-
-template <typename Tdata, typename ComputeType>
-__forceinline__ __device__ __host__ void ComputeVarMeanUsingWelford(
-    const Tdata *input_ptr,
-    size_t offset,
-    Tdata &var_output,
-    Tdata &mean_output,
-    size_t reduce_num,
-    size_t input_ndim,
-    size_t *permuted_input_shape,
-    ptrdiff_t *permuted_input_strides,
-    bool unbiased) {
-    size_t count = 0;
-    ComputeType mean = 0.0;
-    ComputeType old_mean = 0.0;
-    ComputeType m2 = 0.0;
-    for (size_t i = 0; i < reduce_num; ++i) {
-        size_t input_offset = indexToOffset(offset + i, input_ndim, permuted_input_shape, permuted_input_strides);
-        count++;
-        old_mean = mean;
-        mean = old_mean + (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) / count;
-        m2 += (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) * (static_cast<ComputeType>(input_ptr[input_offset]) - mean);
-    }
-    var_output = static_cast<Tdata>(m2 / (unbiased ? count - 1 : count));
-    mean_output = static_cast<Tdata>(mean);
-}
-
-template <typename Tdata, typename ComputeType>
-__global__ void ComputeVarMeanUsingWelfordWrapper(
-    const Tdata *input_ptr, Tdata *var_output_ptr, Tdata *mean_output_ptr,
-    size_t input_ndim,
-    size_t output_size,
-    size_t reduce_num,
-    size_t *permuted_input_shape,
-    ptrdiff_t *permuted_input_strides,
-    bool unbiased,
-    bool is_nan) {
-    if (is_nan) {
-        if (reduce_num == 0) {
-            CUDA_1D_KERNEL_LOOP(i, output_size) {
-                var_output_ptr[i] = device::cuda::Nan<Tdata>();
-                mean_output_ptr[i] = device::cuda::Nan<Tdata>();
-            }
-        } else {
-            CUDA_1D_KERNEL_LOOP(i, output_size) {
-                const size_t input_offset = indexToOffset(i * reduce_num, input_ndim, permuted_input_shape, permuted_input_strides);
-                var_output_ptr[i] = device::cuda::Nan<Tdata>();
-                mean_output_ptr[i] = input_ptr[input_offset];
-            }
-        }
-    } else {
-        CUDA_1D_KERNEL_LOOP(i, output_size) {
-            ComputeVarMeanUsingWelford<Tdata, ComputeType>(
-                input_ptr,
-                i * reduce_num,
-                var_output_ptr[i],
-                mean_output_ptr[i],
-                reduce_num,
-                input_ndim,
-                permuted_input_shape,
-                permuted_input_strides,
-                unbiased);
-        }
-    }
-}
-
-#endif // __VAR_MEAN_CUDA_H__
diff --git a/src/infiniop/ops/var_mean/info.h b/src/infiniop/ops/var_mean/info.h
deleted file mode 100644
index 38eb3d1b1..000000000
--- a/src/infiniop/ops/var_mean/info.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef __VAR_MEAN_INFO_H__
-#define __VAR_MEAN_INFO_H__
-#include "../../../utils.h"
-#include "../../tensor.h"
-#include <algorithm>
-#include <cstddef>
-#include <vector>
-
-namespace op::var_mean {
-class VarMeanInfo {
-    VarMeanInfo() = default;
-
-public:
-    infiniDtype_t dtype;
-    std::vector<size_t> permuted_input_shape; // need to permute
-    std::vector<size_t> output_shape;
-    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
-    std::vector<ptrdiff_t> output_strides;
-    size_t reduce_dim_size; // reduce dim size
-    size_t reduce_num;      // number of elements to reduce for each output element
-    size_t input_size;      // total number of input elements
-    size_t output_size;     // total number of output elements
-    bool unbiased_var;
-    static utils::Result<VarMeanInfo> create(
-        infiniopTensorDescriptor_t var_output_desc,
-        infiniopTensorDescriptor_t input_desc,
-        size_t *dim,
-        size_t dim_size,
-        bool unbiased,
-        bool keepdim) {
-        auto input_shape = input_desc->shape();
-        auto input_strides = input_desc->strides();
-        size_t input_ndim = input_desc->ndim();
-        size_t reduce_num = 1;
-        for (size_t i = 0; i < dim_size; i++) {
-            reduce_num *= input_shape[dim[i]];
-        }
-        std::vector<size_t> permute_order;
-        for (size_t i = 0; i < input_ndim; i++) {
-            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
-                permute_order.push_back(i);
-            }
-        }
-        for (size_t i = 0; i < dim_size; i++) {
-            permute_order.push_back(dim[i]);
-        }
-        std::vector<size_t> permuted_input_shape;
-        std::vector<ptrdiff_t> permuted_input_strides;
-        for (size_t i = 0; i < permute_order.size(); i++) {
-            permuted_input_shape.push_back(input_shape[permute_order[i]]);
-            permuted_input_strides.push_back(input_strides[permute_order[i]]);
-        }
-        return utils::Result<VarMeanInfo>(VarMeanInfo{input_desc->dtype(),
-                                                      permuted_input_shape,
-                                                      var_output_desc->shape(),
-                                                      permuted_input_strides,
-                                                      var_output_desc->strides(),
-                                                      dim_size,
-                                                      reduce_num,
-                                                      input_desc->numel(),
-                                                      var_output_desc->numel(),
-                                                      unbiased});
-    }
-};
-} // namespace op::var_mean
-
-#endif
diff --git a/src/infiniop/ops/var_mean/metax/var_mean_metax.h b/src/infiniop/ops/var_mean/metax/var_mean_metax.h
deleted file mode 100644
index bc303987a..000000000
--- a/src/infiniop/ops/var_mean/metax/var_mean_metax.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __VAR_MEAN_METAX_H__
-#define __VAR_MEAN_METAX_H__
-
-#include "../var_mean_desc.h"
-
-DESCRIPTOR(metax);
-
-#endif // __VAR_MEAN_METAX_H__
diff --git a/src/infiniop/ops/var_mean/metax/var_mean_metax.maca b/src/infiniop/ops/var_mean/metax/var_mean_metax.maca
deleted file mode 100644
index ac4c61114..000000000
--- a/src/infiniop/ops/var_mean/metax/var_mean_metax.maca
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "../../../devices/metax/metax_common.h"
-#include "../../../devices/metax/metax_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "var_mean_metax.h"
-
-namespace op::var_mean::metax {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::metax::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t mean_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-bool IsNanOut(const VarMeanInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
-infiniStatus_t launchKernel(
-    const VarMeanInfo &info,
-    Tdata *var_output, Tdata *mean_output, const Tdata *input,
-    bool unbiased, bool keepdim,
-    hcStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-
-    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(ptrdiff_t);
-
-    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
-    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
-    bool is_nan = IsNanOut(info);
-    if (info.reduce_num == input_size) { // scalar output
-        ComputeType *tmp_buffer;
-        constexpr size_t MAX_GRID_SIZE = 128;
-        size_t grid_size = std::min(MAX_GRID_SIZE,
-                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        CHECK_METAX(hcMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
-        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
-            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
-        CHECK_METAX(hcFree(tmp_buffer));
-    } else {
-        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, mean_output, input_ndim, output_size, reduce_num,
-            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    void *mean_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream_) const {
-
-    hcStream_t stream = (hcStream_t)stream_;
-
-#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType)               \
-    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(                        \
-        _info,                                                           \
-        (Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
-        unbiased, keepdim,                                               \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE)                      \
-    {                                                                       \
-        if (_info.dtype == INFINI_DTYPE_BF16)                               \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, __hpcc_bfloat16, double); \
-        else if (_info.dtype == INFINI_DTYPE_F16)                           \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double);            \
-        else if (_info.dtype == INFINI_DTYPE_F32)                           \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double);           \
-        else                                                                \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;                          \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::var_mean::metax
diff --git a/src/infiniop/ops/var_mean/moore/var_mean_moore.h b/src/infiniop/ops/var_mean/moore/var_mean_moore.h
deleted file mode 100644
index 79f297e70..000000000
--- a/src/infiniop/ops/var_mean/moore/var_mean_moore.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __VAR_MEAN_MOORE_H__
-#define __VAR_MEAN_MOORE_H__
-
-#include "../var_mean_desc.h"
-
-DESCRIPTOR(moore);
-
-#endif // __VAR_MEAN_MOORE_H__
diff --git a/src/infiniop/ops/var_mean/moore/var_mean_moore.mu b/src/infiniop/ops/var_mean/moore/var_mean_moore.mu
deleted file mode 100644
index 62e44e3c8..000000000
--- a/src/infiniop/ops/var_mean/moore/var_mean_moore.mu
+++ /dev/null
@@ -1,125 +0,0 @@
-#include "../../../devices/moore/moore_common.h"
-#include "../../../devices/moore/moore_kernel_common.h"
-#include "../cuda/kernel.cuh"
-#include "var_mean_moore.h"
-
-namespace op::var_mean::moore {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::moore::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t mean_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-bool IsNanOut(const VarMeanInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
-infiniStatus_t launchKernel(
-    const VarMeanInfo &info,
-    Tdata *var_output, Tdata *mean_output, const Tdata *input,
-    bool unbiased, bool keepdim,
-    musaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-    size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(ptrdiff_t);
-
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
-    CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
-    bool is_nan = IsNanOut(info);
-    if (info.reduce_num == input_size) { // scalar output
-        ComputeType *tmp_buffer;
-        constexpr size_t MAX_GRID_SIZE = 128;
-        size_t grid_size = std::min(MAX_GRID_SIZE,
-                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
-        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
-            permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
-        CHECK_MOORE(musaFree(tmp_buffer));
-    } else {
-        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, mean_output, input_ndim, output_size, reduce_num,
-            permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    void *mean_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream_) const {
-
-    musaStream_t stream = (musaStream_t)stream_;
-
-#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType)               \
-    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(                        \
-        _info,                                                           \
-        (Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
-        unbiased, keepdim,                                               \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE)                    \
-    {                                                                     \
-        if (_info.dtype == INFINI_DTYPE_BF16)                             \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, __mt_bfloat16, double); \
-        else if (_info.dtype == INFINI_DTYPE_F16)                         \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)                         \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double);         \
-        else                                                              \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;                        \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::var_mean::moore
diff --git a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cu b/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cu
deleted file mode 100644
index 95352a106..000000000
--- a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cu
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "../../../devices/nvidia/nvidia_common.cuh"
-#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "../cuda/kernel.cuh"
-#include "var_mean_nvidia.cuh"
-
-namespace op::var_mean::nvidia {
-struct Descriptor::Opaque {
-    std::shared_ptr<device::nvidia::Handle::Internal> internal;
-};
-
-Descriptor::~Descriptor() {
-    delete _opaque;
-}
-
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t mean_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-    auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
-    CHECK_RESULT(result);
-    auto info = result.take();
-    size_t workspace_size = 0;
-    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
-    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-        info, workspace_size, handle->device, handle->device_id);
-    return INFINI_STATUS_SUCCESS;
-}
-
-namespace {
-bool IsNanOut(const VarMeanInfo &info) {
-    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
-}
-template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
-infiniStatus_t launchKernel(
-    const VarMeanInfo &info,
-    Tdata *var_output, Tdata *mean_output, const Tdata *input,
-    bool unbiased, bool keepdim,
-    cudaStream_t stream, void *workspace, size_t workspace_size) {
-    size_t input_ndim = info.permuted_input_shape.size();
-    size_t output_ndim = info.output_shape.size();
-    size_t input_size = info.input_size;
-    size_t output_size = info.output_size;
-    size_t reduce_num = info.reduce_num;
-    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
-    size_t workspace_offset = 0;
-
-    size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(size_t);
-
-    ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
-    workspace_offset += input_ndim * sizeof(ptrdiff_t);
-
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
-    CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
-    bool is_nan = IsNanOut(info);
-    if (info.reduce_num == input_size) { // scalar output
-        ComputeType *tmp_buffer;
-        constexpr size_t MAX_GRID_SIZE = 128;
-        size_t grid_size = std::min(MAX_GRID_SIZE,
-                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
-        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
-            permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
-        CHECK_CUDA(cudaFree(tmp_buffer));
-    } else {
-        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
-        grid_size = std::max(1UL, grid_size);
-        ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
-            input, var_output, mean_output, input_ndim, output_size, reduce_num,
-            permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    void *mean_output,
-    const void *input,
-    bool unbiased,
-    bool keepdim,
-    void *stream_) const {
-
-    cudaStream_t stream = (cudaStream_t)stream_;
-
-#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType)               \
-    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(                        \
-        _info,                                                           \
-        (Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
-        unbiased, keepdim,                                               \
-        stream, workspace, workspace_size)
-
-#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE)                    \
-    {                                                                     \
-        if (_info.dtype == INFINI_DTYPE_BF16)                             \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, __nv_bfloat16, double); \
-        else if (_info.dtype == INFINI_DTYPE_F16)                         \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double);          \
-        else if (_info.dtype == INFINI_DTYPE_F32)                         \
-            return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double);         \
-        else                                                              \
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;                        \
-    }
-
-    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
-        CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
-    } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    }
-    return INFINI_STATUS_SUCCESS;
-}
-
-} // namespace op::var_mean::nvidia
diff --git a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cuh b/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cuh
deleted file mode 100644
index d8115883f..000000000
--- a/src/infiniop/ops/var_mean/nvidia/var_mean_nvidia.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __VAR_MEAN_NVIDIA_H__
-#define __VAR_MEAN_NVIDIA_H__
-
-#include "../var_mean_desc.h"
-
-DESCRIPTOR(nvidia);
-
-#endif // __VAR_MEAN_NVIDIA_H__
diff --git a/src/infiniop/ops/var_mean/operator.cc b/src/infiniop/ops/var_mean/operator.cc
deleted file mode 100644
index 9b408ed23..000000000
--- a/src/infiniop/ops/var_mean/operator.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/var_mean.h"
-#include <vector>
-
-#ifdef ENABLE_CPU_API
-#include "cpu/var_mean_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
-#include "nvidia/var_mean_nvidia.cuh"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/var_mean_metax.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/var_mean_kunlun.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/var_mean_moore.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateVarMeanDescriptor(
-    infiniopHandle_t handle,
-    infiniopVarMeanDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t var_output_desc,
-    infiniopTensorDescriptor_t mean_output_desc,
-    infiniopTensorDescriptor_t input_desc,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim) {
-
-#define CREATE(CASE, NAMESPACE)                                                 \
-    case CASE:                                                                  \
-        return op::var_mean::NAMESPACE::Descriptor::create(                     \
-            handle,                                                             \
-            reinterpret_cast<op::var_mean::NAMESPACE::Descriptor **>(desc_ptr), \
-            var_output_desc,                                                    \
-            mean_output_desc,                                                   \
-            input_desc,                                                         \
-            dim,                                                                \
-            dim_size,                                                           \
-            unbiased,                                                           \
-            keepdim)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                    \
-    case CASE:                                                                                  \
-        *size = reinterpret_cast<op::var_mean::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__INFINI_C infiniStatus_t infiniopVarMean(
-    infiniopVarMeanDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *var_output,
-    void *mean_output,
-    const void *input,
-    size_t *dim,
-    size_t dim_size,
-    bool unbiased,
-    bool keepdim,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                                 \
-    case CASE:                                                                     \
-        return reinterpret_cast<const op::var_mean::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, var_output, mean_output, input, unbiased, keepdim, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                     \
-    case CASE:                                                                      \
-        delete reinterpret_cast<const op::var_mean::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
diff --git a/src/infiniop/ops/var_mean/var_mean_desc.h b/src/infiniop/ops/var_mean/var_mean_desc.h
deleted file mode 100644
index 71b76814f..000000000
--- a/src/infiniop/ops/var_mean/var_mean_desc.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef INFINIOP_VAR_MEAN_DESCRIPTOR_H_
-#define INFINIOP_VAR_MEAN_DESCRIPTOR_H_
-#include "../../../utils.h"
-#include "../../operator.h"
-#include "../../tensor.h"
-
-#include "info.h"
-
-#define DESCRIPTOR(NAMESPACE)                                    \
-                                                                 \
-    namespace op::var_mean::NAMESPACE {                          \
-    class Descriptor final : public InfiniopDescriptor {         \
-        struct Opaque;                                           \
-        Opaque *_opaque;                                         \
-        VarMeanInfo _info;                                       \
-        size_t _workspace_size;                                  \
-                                                                 \
-        Descriptor(                                              \
-            Opaque *opaque,                                      \
-            VarMeanInfo info,                                    \
-            size_t workspace_size,                               \
-            infiniDevice_t device_type,                          \
-            int device_id)                                       \
-            : InfiniopDescriptor{device_type, device_id},        \
-              _opaque(opaque),                                   \
-              _info(info),                                       \
-              _workspace_size(workspace_size) {}                 \
-                                                                 \
-    public:                                                      \
-        ~Descriptor();                                           \
-        size_t workspaceSize() const { return _workspace_size; } \
-                                                                 \
-        static infiniStatus_t create(                            \
-            infiniopHandle_t handle,                             \
-            Descriptor **desc_ptr,                               \
-            infiniopTensorDescriptor_t var_output_desc,          \
-            infiniopTensorDescriptor_t mean_output_desc,         \
-            infiniopTensorDescriptor_t input_desc,               \
-            size_t *dim,                                         \
-            size_t dim_size,                                     \
-            bool unbiased,                                       \
-            bool keepdim);                                       \
-                                                                 \
-        infiniStatus_t calculate(                                \
-            void *workspace, size_t workspace_size,              \
-            void *var_output,                                    \
-            void *mean_output,                                   \
-            const void *input,                                   \
-            bool unbiased,                                       \
-            bool keepdim,                                        \
-            void *stream) const;                                 \
-    };                                                           \
-    }
-
-#endif
diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h
index 23be702ff..05a5c2fca 100644
--- a/src/utils/custom_types.h
+++ b/src/utils/custom_types.h
@@ -13,22 +13,6 @@ struct CustomBFloat16 {
 };
 typedef struct CustomBFloat16 bf16_t;
 
-inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
-    return lhs._v == rhs._v;
-}
-
-inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
-    return !(lhs == rhs);
-}
-
-inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
-    return lhs._v == rhs._v;
-}
-
-inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
-    return !(lhs == rhs);
-}
-
 float _f16_to_f32(fp16_t val);
 fp16_t _f32_to_f16(float val);
 
diff --git a/test/infinicore/ops/all.py b/test/infinicore/ops/all.py
index 2d295e262..cded59ee8 100644
--- a/test/infinicore/ops/all.py
+++ b/test/infinicore/ops/all.py
@@ -56,7 +56,7 @@ def parse_test_cases():
     for data in _TEST_CASES_DATA:
         shape, strides, dim, keepdim, out_strides = data
         input_supports_inplace = not is_broadcast(strides)
-        # out_supports_inplace = not is_broadcast(out_strides)
+        out_supports_inplace = not is_broadcast(out_strides)
 
         for dtype in _TENSOR_DTYPES:
             tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
@@ -81,19 +81,19 @@ def parse_test_cases():
             )
 
             # explicit out when supported (create out tensor with computed shape)
-            # out_shape = _compute_out_shape(shape, dim, keepdim)
-            # out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool)
-            # if out_supports_inplace:
-            #     test_cases.append(
-            #         TestCase(
-            #             inputs=[in_spec],
-            #             kwargs=kwargs,
-            #             output_spec=out_spec,
-            #             comparison_target="out",
-            #             tolerance=tol,
-            #             description="All - INPLACE(out)",
-            #         )
-            #     )
+            out_shape = _compute_out_shape(shape, dim, keepdim)
+            out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool)
+            if out_supports_inplace:
+                test_cases.append(
+                    TestCase(
+                        inputs=[in_spec],
+                        kwargs=kwargs,
+                        output_spec=out_spec,
+                        comparison_target="out",
+                        tolerance=tol,
+                        description="All - INPLACE(out)",
+                    )
+                )
 
     return test_cases
 
@@ -110,9 +110,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.all(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation (operator not yet available)."""
-        return infinicore.all(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.all(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/avg_pool1d.py b/test/infinicore/ops/avg_pool1d.py
index 539951628..5a0318571 100644
--- a/test/infinicore/ops/avg_pool1d.py
+++ b/test/infinicore/ops/avg_pool1d.py
@@ -74,8 +74,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.avg_pool1d(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/cross_entropy.py b/test/infinicore/ops/cross_entropy.py
index 269216bc7..e71a30567 100644
--- a/test/infinicore/ops/cross_entropy.py
+++ b/test/infinicore/ops/cross_entropy.py
@@ -11,8 +11,6 @@
 # Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None)
 # infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean')
 
-# CrossEntropy kernel当前只支持逐元素loss且不带class weight/ignore_index。
-# 仍然保留原始配置，后续实现这些特性时只需放开过滤条件即可。
 _TEST_CASES_DATA = [
     ((4, 5), (4,), None, False, None),
     ((8, 10), (8,), None, True, -1),
@@ -22,9 +20,6 @@
     ((2, 2), (2,), None, True, -100),
 ]
 
-_SUPPORT_WEIGHT = False
-_SUPPORT_IGNORE_INDEX = False
-
 _TOLERANCE_MAP = {
     infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
     infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
@@ -45,11 +40,6 @@ def parse_test_cases():
     ) in _TEST_CASES_DATA:
         for dtype in _TENSOR_DTYPES:
             tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
-            if weight_present and not _SUPPORT_WEIGHT:
-                continue
-            if ignore_index is not None and not _SUPPORT_IGNORE_INDEX:
-                continue
-
             logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype)
             target = TensorSpec.from_tensor(
                 target_shape,
@@ -61,7 +51,7 @@ def parse_test_cases():
             )
 
             inputs = [logits, target]
-            kwargs = {"reduction": "none"}
+            kwargs = {}
             if weight_present:
                 weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype)
                 inputs.append(weight_spec)
@@ -94,10 +84,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.cross_entropy(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation."""
-        out = kwargs.pop("out", None)
-        return infinicore.cross_entropy(*args, out=out, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.nn.functional.cross_entropy(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/equal.py b/test/infinicore/ops/equal.py
index fd5c37261..10aae3fcb 100644
--- a/test/infinicore/ops/equal.py
+++ b/test/infinicore/ops/equal.py
@@ -74,11 +74,8 @@ def parse_test_cases():
                     )
                 )
 
-            # Equal 结果为 bool，无法安全复用浮点/整型输入作为输出缓冲区。
-            # 只有当输入 dtype 本身为 bool 时才允许 inplace，这里提前留出开关。
-            allow_input_inplace = dtype == infinicore.bool
-
-            if allow_input_inplace and a_supports_inplace:
+            # in-place a
+            if a_supports_inplace:
                 test_cases.append(
                     TestCase(
                         inputs=[a_spec, b_spec],
@@ -90,7 +87,8 @@ def parse_test_cases():
                     )
                 )
 
-            if allow_input_inplace and b_supports_inplace:
+            # in-place b
+            if b_supports_inplace:
                 test_cases.append(
                     TestCase(
                         inputs=[a_spec, b_spec],
@@ -117,8 +115,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.eq(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        return infinicore.equal(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.eq(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/hardswish.py b/test/infinicore/ops/hardswish.py
index 5ab38d594..9f31cdc62 100644
--- a/test/infinicore/ops/hardswish.py
+++ b/test/infinicore/ops/hardswish.py
@@ -70,8 +70,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.hardswish(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        return infinicore.nn.functional.hardswish(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.nn.functional.hardswish(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/hardtanh.py b/test/infinicore/ops/hardtanh.py
index a88ea6c8d..6861e464e 100644
--- a/test/infinicore/ops/hardtanh.py
+++ b/test/infinicore/ops/hardtanh.py
@@ -17,6 +17,7 @@
 
 _TEST_CASES_DATA = [
     ((13, 4), None, -1.0, 1.0),
+    ((13, 4), (10, 1), -0.5, 0.5),
     ((8, 8, 8), None, -2.0, 2.0),
 ]
 
@@ -86,11 +87,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.hardtanh(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation."""
-        import infinicore.nn.functional as F
-
-        return F.hardtanh(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.nn.functional.hardtanh(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/sum.py b/test/infinicore/ops/sum.py
index b22f77242..74d9e29fc 100644
--- a/test/infinicore/ops/sum.py
+++ b/test/infinicore/ops/sum.py
@@ -20,7 +20,7 @@
     ((8, 8), None, None, None, None),
     ((8, 8), (16, 1), 1, False, None),
     ((2, 3, 4), None, 0, True, None),
-    ((1, 8), None, (0,), False, None),  # tuple 导致 infini_list  kwargs dim,[0]
+    ((1, 8), None, (0,), False, None),
     ((16, 64), (128, 1), None, None, None),
     ((4, 5, 6), (60, 12, 2), 2, True, None),
 ]
@@ -61,6 +61,7 @@ def parse_test_cases():
                     description="Sum - OUT_OF_PLACE",
                 )
             )
+
     return test_cases
 
 
@@ -76,11 +77,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.sum(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation (operator not yet available)."""
-        return infinicore.sum(
-            *args, **kwargs
-        )  # todo 找到具体对应的 python/infinicore/ops/sum.py
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.sum(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/topk.py b/test/infinicore/ops/topk.py
index 50876b1b7..a0b9bdcd8 100644
--- a/test/infinicore/ops/topk.py
+++ b/test/infinicore/ops/topk.py
@@ -15,7 +15,7 @@
 
 # Test cases format: (shape, input_strides, k, dim, largest, sorted)
 _TEST_CASES_DATA = [
-    ((6, 8), None, 1, 1, False, True),
+    ((6, 8), None, 1, 1, True, True),
     ((8, 4), (16, 1), 2, 0, True, False),
     ((5, 5), None, 3, -1, False, True),
     ((3, 7), (14, 1), 2, 1, True, True),
@@ -55,7 +55,6 @@ def parse_test_cases():
                     comparison_target=None,
                     tolerance=tol,
                     description=f"topk - OUT_OF_PLACE",
-                    output_count=2,
                 )
             )
 
@@ -78,9 +77,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.topk(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation (operator not yet available)."""
-        return infinicore.topk(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.topk(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/var.py b/test/infinicore/ops/var.py
index d441ed4ab..e0ce9f463 100644
--- a/test/infinicore/ops/var.py
+++ b/test/infinicore/ops/var.py
@@ -76,9 +76,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.var(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation (operator not yet available)."""
-        return infinicore.var(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.var(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/var_mean.py b/test/infinicore/ops/var_mean.py
index b1de0bf90..18015d2cd 100644
--- a/test/infinicore/ops/var_mean.py
+++ b/test/infinicore/ops/var_mean.py
@@ -15,7 +15,7 @@
 
 # Test cases format: (in_shape, in_strides_or_None, dim_or_None, unbiased_or_None, keepdim_or_None)
 # var_mean returns (var, mean)
-# Changed in torch version 2.0: Previously this argument was called unbiased and was a boolean with True corresponding to correction=1 and False being correction=0.
+
 _TEST_CASES_DATA = [
     ((8, 8), None, None, None, None),
     ((8, 8), (16, 1), 1, True, False),
@@ -27,7 +27,7 @@
 
 _TOLERANCE_MAP = {
     infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
-    infinicore.float32: {"atol": 1e-5, "rtol": 1e-3},
+    infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
 }
 
 _TENSOR_DTYPES = [infinicore.float16, infinicore.float32]
@@ -47,8 +47,6 @@ def parse_test_cases():
                 kwargs["dim"] = dim
             if unbiased is not None:
                 kwargs["unbiased"] = unbiased
-                # Changed in version 2.0: Previously this argument was called unbiased and was a boolean with True
-                # corresponding to correction=1 and False being correction=0.
             if keepdim is not None:
                 kwargs["keepdim"] = keepdim
 
@@ -78,9 +76,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.var_mean(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore implementation (operator not yet available)."""
-        return infinicore.var_mean(*args, **kwargs)
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.var_mean(*args, **kwargs)
 
 
 def main():
diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py
deleted file mode 100644
index dd9e771c0..000000000
--- a/test/infiniop/avg_pool1d.py
+++ /dev/null
@@ -1,183 +0,0 @@
-import ctypes
-from ctypes import c_uint64
-
-import torch
-
-from libinfiniop import (
-    LIBINFINIOP,
-    InfiniDeviceNames,
-    InfiniDtype,
-    InfiniDtypeNames,
-    TestTensor,
-    TestWorkspace,
-    check_error,
-    debug,
-    get_args,
-    get_test_devices,
-    get_tolerance,
-    infiniopOperatorDescriptor_t,
-    profile_operation,
-    test_operator,
-)
-
-# ==============================================================================
-#  Configuration (Internal Use Only)
-# ==============================================================================
-_TEST_CASES = [
-    # input_shape, x_stride, y_stride, kernel_size, stride, padding
-    ((2, 3, 16), None, None, 3, None, 0),
-    ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2),
-    ((2, 1, 32), None, (32, 16, 1), 2, 2, 0),
-    ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1),
-    ((4, 6, 31), None, None, 4, 2, 1),
-    ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0),
-]
-
-# Data types used for testing
-_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
-
-# Tolerance map for different data types
-_TOLERANCE_MAP = {
-    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
-    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
-    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
-}
-
-DEBUG = False
-PROFILE = False
-NUM_PRERUN = 10
-NUM_ITERATIONS = 1000
-
-
-def _effective_stride(stride, kernel_size):
-    if stride in (None, 0):
-        return kernel_size
-    return stride
-
-
-def _compute_output_shape(input_shape, kernel_size, stride, padding):
-    stride = _effective_stride(stride, kernel_size)
-    width = input_shape[2]
-    out_width = (width + 2 * padding - kernel_size) // stride + 1
-    return (input_shape[0], input_shape[1], out_width)
-
-
-def avg_pool1d_ref(x, kernel_size, stride, padding):
-    stride = _effective_stride(stride, kernel_size)
-    out = torch.nn.functional.avg_pool1d(
-        x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding
-    )
-    return out.to(x.dtype)
-
-
-def test(
-    handle,
-    device,
-    input_shape,
-    x_stride,
-    y_stride,
-    kernel_size,
-    stride,
-    padding,
-    dtype=InfiniDtype.F16,
-    sync=None,
-):
-    stride_value = _effective_stride(stride, kernel_size)
-    out_shape = _compute_output_shape(
-        input_shape, kernel_size, stride_value, padding
-    )
-    print(
-        f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, "
-        f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, "
-        f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}"
-    )
-
-    x = TestTensor(input_shape, x_stride, dtype, device)
-    y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros")
-
-    ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding)
-
-    if sync is not None:
-        sync()
-
-    descriptor = infiniopOperatorDescriptor_t()
-    check_error(
-        LIBINFINIOP.infiniopCreateAvgPool1dDescriptor(
-            handle,
-            ctypes.byref(descriptor),
-            y.descriptor,
-            x.descriptor,
-            kernel_size,
-            stride_value,
-            padding,
-        )
-    )
-
-    # Invalidate descriptors in tensors after creation to make sure kernels read from arguments
-    x.destroy_desc()
-    y.destroy_desc()
-
-    workspace_size = c_uint64(0)
-    check_error(
-        LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
-    )
-    workspace = TestWorkspace(workspace_size.value, x.device)
-
-    def lib_avg_pool1d():
-        check_error(
-            LIBINFINIOP.infiniopAvgPool1d(
-                descriptor,
-                workspace.data(),
-                workspace.size(),
-                y.data(),
-                x.data(),
-                None,
-            )
-        )
-
-    lib_avg_pool1d()
-
-    if sync is not None:
-        sync()
-
-    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
-    if DEBUG:
-        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
-    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
-
-    if PROFILE:
-        # fmt: off
-        profile_operation(
-            "PyTorch",
-            lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding),
-            device,
-            NUM_PRERUN,
-            NUM_ITERATIONS,
-        )
-        profile_operation(
-            "    lib",
-            lambda: lib_avg_pool1d(),
-            device,
-            NUM_PRERUN,
-            NUM_ITERATIONS,
-        )
-        # fmt: on
-
-    check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor))
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    DEBUG = args.debug
-    PROFILE = args.profile
-    NUM_PRERUN = args.num_prerun
-    NUM_ITERATIONS = args.num_iterations
-
-    for device in get_test_devices(args):
-        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
-
-    print("\033[92mTest passed!\033[0m")
-
diff --git a/test/infiniop/cross_entropy.py b/test/infiniop/cross_entropy.py
deleted file mode 100644
index 987f2d11a..000000000
--- a/test/infiniop/cross_entropy.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import torch
-import ctypes
-from ctypes import c_uint64
-from libinfiniop import (
-    LIBINFINIOP,
-    TestTensor,
-    get_test_devices,
-    check_error,
-    test_operator,
-    get_args,
-    get_tolerance,
-    profile_operation,
-    TestWorkspace,
-    InfiniDtype,
-    InfiniDtypeNames,
-    InfiniDeviceNames,
-    infiniopOperatorDescriptor_t,
-)
-
-# ------------------------------------------------------------
-# 用例配置
-# ------------------------------------------------------------
-_TEST_CASES_ = [
-    ((2, 4, 10), None, None),        # logits shape, x_stride, y_stride
-    ((1, 128, 32000), None, None),
-    ((4, 512, 1000), None, None),
-]
-
-_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
-_TOLERANCE_MAP = {
-    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
-    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2},
-    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
-}
-
-# ------------------------------------------------------------
-# PyTorch 参考实现
-# ------------------------------------------------------------
-def cross_entropy_ref(logits, target):
-    vocab = logits.shape[-1]
-    logits_flat = logits.reshape(-1, vocab).float()
-    target_flat = target.reshape(-1).long()
-    loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none")
-    return loss.view(target.shape).to(logits.dtype)
-
-
-def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None):
-    logits_shape = shape
-    label_shape = shape[:-1]
-    vocab = shape[-1]
-
-    print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}")
-
-    x = TestTensor(logits_shape, x_stride, dtype, device)
-    target = TestTensor(label_shape, None, InfiniDtype.I64, device)
-
-    # 生成有效标签
-    tgt = target.torch_tensor()
-    tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device))
-    target.actual_tensor().copy_(tgt)
-
-    reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor())
-    y = TestTensor(label_shape, y_stride, dtype, device)
-
-    descriptor = infiniopOperatorDescriptor_t()
-    check_error(
-        LIBINFINIOP.infiniopCreateCrossEntropyDescriptor(
-            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor
-        )
-    )
-
-    for tensor in [x, y, target]:
-        tensor.destroy_desc()
-
-    workspace_size = c_uint64(0)
-    check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
-    workspace = TestWorkspace(workspace_size.value, x.device)
-
-    def run():
-        check_error(
-            LIBINFINIOP.infiniopCrossEntropy(
-                descriptor,
-                workspace.data(),
-                workspace.size(),
-                y.data(),
-                x.data(),
-                target.data(),
-                None,
-            )
-        )
-
-    run()
-    if sync:
-        sync()
-
-    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
-    assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol)
-
-    check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor))
-
-
-if __name__ == "__main__":
-    args = get_args()
-    for device in get_test_devices(args):
-        test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
-    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py
deleted file mode 100644
index e333b94b3..000000000
--- a/test/infiniop/equal.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import torch
-import ctypes
-from ctypes import c_uint64
-from libinfiniop import (
-    LIBINFINIOP,
-    TestTensor,
-    get_test_devices,
-    check_error,
-    test_operator,
-    get_args,
-    debug,
-    get_tolerance,
-    profile_operation,
-    TestWorkspace,
-    InfiniDtype,
-    InfiniDtypeNames,
-    InfiniDeviceNames,
-    infiniopOperatorDescriptor_t,
-)
-from enum import Enum, auto
-
-# ==============================================================================
-#  Configuration (Internal Use Only)
-# ==============================================================================
-_TEST_CASES_ = [
-    # shape, a_stride, b_stride, c_stride
-    ((13, 4), None, None, None),
-    ((13, 4), (10, 1), (10, 1), (10, 1)),
-    ((13, 4), (0, 1), None, None),
-    ((13, 4, 4), None, None, None),
-    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
-    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
-    ((16, 5632), None, None, None),
-    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
-    ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
-    ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
-    ((4, 4, 5632), None, None, None),
-    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
-]
-
-# Equal 算子通常不支持 Inplace (输入Float vs 输出Bool，内存大小不同)
-class Inplace(Enum):
-    OUT_OF_PLACE = auto()
-
-_INPLACE = [
-    Inplace.OUT_OF_PLACE,
-]
-
-_TEST_CASES = [
-    test_case + (inplace_item,)
-    for test_case in _TEST_CASES_
-    for inplace_item in _INPLACE
-]
-
-# 测试的输入数据类型
-_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
-
-# 容差设置 (对于 Bool 比较，通常要求完全匹配)
-_TOLERANCE_MAP = {
-    InfiniDtype.F16: {"atol": 0, "rtol": 0},
-    InfiniDtype.F32: {"atol": 0, "rtol": 0},
-    InfiniDtype.BF16: {"atol": 0, "rtol": 0},
-    InfiniDtype.I32: {"atol": 0, "rtol": 0},
-    InfiniDtype.I64: {"atol": 0, "rtol": 0},
-    InfiniDtype.BOOL: {"atol": 0, "rtol": 0},
-}
-
-DEBUG = False
-PROFILE = False
-NUM_PRERUN = 10
-NUM_ITERATIONS = 1000
-
-# PyTorch 标准实现
-def equal_func(c, a, b):
-    torch.eq(a, b, out=c)
-
-def test(
-    handle,
-    device,
-    shape,
-    a_stride=None,
-    b_stride=None,
-    c_stride=None,
-    inplace=Inplace.OUT_OF_PLACE,
-    dtype=torch.float16,
-    sync=None,
-):
-    # 输入 Tensor 使用指定的 dtype (如 float16)
-    a = TestTensor(shape, a_stride, dtype, device)
-    b = TestTensor(shape, b_stride, dtype, device)
-    
-    # [关键修改] 输出 Tensor 强制使用 Bool 类型
-    # 注意：这里 c_stride 如果是按字节计算的，对于 Bool 类型通常是 1 byte
-    c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device)
-
-    if c.is_broadcast():
-        return
-
-    print(
-        f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
-        f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL"
-    )
-
-    # 运行 PyTorch 对照组
-    equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
-
-    if sync is not None:
-        sync()
-
-    descriptor = infiniopOperatorDescriptor_t()
-    
-    # [关键修改] 调用 Equal 的 Create 函数
-    check_error(
-        LIBINFINIOP.infiniopCreateEqualDescriptor(
-            handle,
-            ctypes.byref(descriptor),
-            c.descriptor, # Output (Bool)
-            a.descriptor, # Input A
-            b.descriptor, # Input B
-        )
-    )
-
-    # Invalidate descriptors
-    for tensor in [a, b, c]:
-        tensor.destroy_desc()
-
-    workspace_size = c_uint64(0)
-    check_error(
-        LIBINFINIOP.infiniopGetEqualWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
-    )
-    workspace = TestWorkspace(workspace_size.value, c.device)
-
-    def lib_equal():
-        check_error(
-            LIBINFINIOP.infiniopEqual(
-                descriptor,
-                workspace.data(),
-                workspace.size(),
-                c.data(),
-                a.data(),
-                b.data(),
-                None,
-            )
-        )
-
-    lib_equal()
-
-    # 使用 Bool 类型的容差 (实际上就是全等)
-    atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL)
-    
-    if DEBUG:
-        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
-    
-    # 验证结果
-    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
-
-    # Profiling workflow
-    if PROFILE:
-        # fmt: off
-        profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
-        profile_operation("    lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
-        # fmt: on
-        
-    check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor))
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    # Configure testing options
-    DEBUG = args.debug
-    PROFILE = args.profile
-    NUM_PRERUN = args.num_prerun
-    NUM_ITERATIONS = args.num_iterations
-
-    for device in get_test_devices(args):
-        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
-
-    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py
deleted file mode 100644
index b60439d16..000000000
--- a/test/infiniop/hardswish.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import torch
-import ctypes
-from ctypes import c_uint64
-from libinfiniop import (
-    LIBINFINIOP,
-    TestTensor,
-    get_test_devices,
-    check_error,
-    test_operator,
-    get_args,
-    debug,
-    get_tolerance,
-    profile_operation,
-    TestWorkspace,
-    InfiniDtype,
-    InfiniDtypeNames,
-    InfiniDeviceNames,
-    infiniopOperatorDescriptor_t,
-)
-from enum import Enum, auto
-
-# ==============================================================================
-#  Configuration (Internal Use Only)
-# ==============================================================================
-# 复用相同的测试用例配置，因为 HardSwish 也是逐元素操作
-_TEST_CASES_ = [
-    # shape, input_stride, output_stride
-    ((13, 4), None, None),
-    ((13, 4), (10, 1), (10, 1)),
-    ((13, 4), (0, 1), None),
-    ((13, 4, 4), None, None),
-    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
-    ((13, 4, 4), (4, 0, 1), None),
-    ((16, 5632), None, None),
-    ((16, 5632), (13312, 1), (13312, 1)),
-    ((4, 4, 5632), None, None),
-    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
-]
-
-
-class Inplace(Enum):
-    OUT_OF_PLACE = auto()
-    INPLACE = auto()
-
-
-_INPLACE = [
-    Inplace.OUT_OF_PLACE,
-    Inplace.INPLACE,
-]
-
-_TEST_CASES = [
-    test_case + (inplace_item,)
-    for test_case in _TEST_CASES_
-    for inplace_item in _INPLACE
-]
-
-_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
-
-_TOLERANCE_MAP = {
-    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
-    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
-    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
-    InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15},
-}
-
-DEBUG = False
-PROFILE = False
-NUM_PRERUN = 10
-NUM_ITERATIONS = 1000
-
-
-def test(
-    handle,
-    device,
-    shape,
-    input_stride=None,
-    output_stride=None,
-    inplace=Inplace.OUT_OF_PLACE,
-    dtype=torch.float16,
-    sync=None,
-):
-    input = TestTensor(shape, input_stride, dtype, device)
-    if inplace == Inplace.INPLACE:
-        if input_stride != output_stride:
-            return
-        output = input
-    else:
-        output = TestTensor(shape, output_stride, dtype, device, mode="ones")
-
-    if output.is_broadcast():
-        return
-
-    print(
-        f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}"
-        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
-    )
-
-    new_output = torch.nn.functional.hardswish(input.torch_tensor())
-    output.update_torch_tensor(new_output)
-
-    if sync is not None:
-        sync()
-
-    descriptor = infiniopOperatorDescriptor_t()
-    
-    check_error(
-        LIBINFINIOP.infiniopCreateHardSwishDescriptor(
-            handle,
-            ctypes.byref(descriptor),
-            output.descriptor,
-            input.descriptor,
-        )
-    )
-
-    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
-    for tensor in [input, output]:
-        tensor.destroy_desc()
-
-    workspace_size = c_uint64(0)
-    check_error(
-        LIBINFINIOP.infiniopGetHardSwishWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
-    )
-    workspace = TestWorkspace(workspace_size.value, output.device)
-
-    def lib_hardswish():
-        check_error(
-            LIBINFINIOP.infiniopHardSwish(
-                descriptor,
-                workspace.data(),
-                workspace.size(),
-                output.data(),
-                input.data(),
-                None,
-            )
-        )
-
-    lib_hardswish()
-
-    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
-    if DEBUG:
-        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
-    
-    assert torch.allclose(
-        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
-    )
-
-    # Profiling workflow
-    if PROFILE:
-        # fmt: off
-        profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
-        profile_operation("    lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS)
-        # fmt: on
-    
-    check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor))
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    # Configure testing options
-    DEBUG = args.debug
-    PROFILE = args.profile
-    NUM_PRERUN = args.num_prerun
-    NUM_ITERATIONS = args.num_iterations
-
-    for device in get_test_devices(args):
-        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
-
-    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/hardtanh.py b/test/infiniop/hardtanh.py
deleted file mode 100644
index 02549ed7d..000000000
--- a/test/infiniop/hardtanh.py
+++ /dev/null
@@ -1,169 +0,0 @@
-import torch
-import ctypes
-from ctypes import c_uint64, c_float
-from libinfiniop import (
-    LIBINFINIOP,
-    TestTensor,
-    get_test_devices,
-    check_error,
-    test_operator,
-    get_args,
-    debug,
-    get_tolerance,
-    profile_operation,
-    TestWorkspace,
-    InfiniDtype,
-    InfiniDtypeNames,
-    InfiniDeviceNames,
-    infiniopOperatorDescriptor_t,
-)
-from enum import Enum, auto
-
-# ==============================================================================
-# Configuration
-# ==============================================================================
-_TEST_CASES_ = [
-    # shape, input_stride, output_stride
-    ((13, 4), None, None),
-    ((13, 4), (10, 1), (10, 1)),
-    ((16, 5632), None, None),
-    ((4, 4, 5632), None, None),
-]
-
-class Inplace(Enum):
-    OUT_OF_PLACE = auto()
-    INPLACE = auto()
-
-_INPLACE = [
-    Inplace.OUT_OF_PLACE,
-    Inplace.INPLACE,
-]
-
-# HardTanh 特有的参数测试组合 (min_val, max_val)
-_PARAM_CASES = [
-    (-1.0, 1.0),
-    (0.0, 6.0), # 类似于 ReLU6
-    (-2.5, 2.5),
-]
-
-# 组合所有测试用例：shape + inplace + params
-_TEST_CASES = [
-    test_case + (inplace_item, p_min, p_max)
-    for test_case in _TEST_CASES_
-    for inplace_item in _INPLACE
-    for p_min, p_max in _PARAM_CASES
-]
-
-_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
-
-_TOLERANCE_MAP = {
-    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
-    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
-    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
-}
-
-DEBUG = False
-PROFILE = False
-NUM_PRERUN = 10
-NUM_ITERATIONS = 1000
-
-def test(
-    handle,
-    device,
-    shape,
-    input_stride=None,
-    output_stride=None,
-    inplace=Inplace.OUT_OF_PLACE,
-    min_val=-1.0,
-    max_val=1.0,
-    dtype=torch.float16,
-    sync=None,
-):
-    input = TestTensor(shape, input_stride, dtype, device)
-    if inplace == Inplace.INPLACE:
-        if input_stride != output_stride:
-            return
-        output = input
-    else:
-        output = TestTensor(shape, output_stride, dtype, device, mode="ones")
-
-    if output.is_broadcast():
-        return
-
-    print(
-        f"Testing HardTanh on {InfiniDeviceNames[device]} | shape:{shape} "
-        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace} range:[{min_val}, {max_val}]"
-    )
-
-    # 计算 PyTorch 真值
-    new_output = torch.nn.functional.hardtanh(input.torch_tensor(), min_val=min_val, max_val=max_val)
-    output.update_torch_tensor(new_output)
-
-    if sync is not None:
-        sync()
-
-    descriptor = infiniopOperatorDescriptor_t()
-
-    check_error(
-        LIBINFINIOP.infiniopCreateHardTanhDescriptor(
-            handle,
-            ctypes.byref(descriptor),
-            output.descriptor,
-            input.descriptor,
-            c_float(min_val),
-            c_float(max_val),
-        )
-    )
-
-    for tensor in [input, output]:
-        tensor.destroy_desc()
-
-    workspace_size = c_uint64(0)
-    check_error(
-        LIBINFINIOP.infiniopGetHardTanhWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
-    )
-    workspace = TestWorkspace(workspace_size.value, output.device)
-
-    def lib_hardtanh():
-        check_error(
-            LIBINFINIOP.infiniopHardTanh(
-                descriptor,
-                workspace.data(),
-                workspace.size(),
-                output.data(),
-                input.data(),
-                None,
-            )
-        )
-
-    lib_hardtanh()
-
-    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
-    if DEBUG:
-        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
-    
-    assert torch.allclose(
-        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
-    )
-
-    if PROFILE:
-        profile_operation("PyTorch", lambda: torch.nn.functional.hardtanh(input.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
-        profile_operation("   lib", lambda: lib_hardtanh(), device, NUM_PRERUN, NUM_ITERATIONS)
-        
-    check_error(LIBINFINIOP.infiniopDestroyHardTanhDescriptor(descriptor))
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    DEBUG = args.debug
-    PROFILE = args.profile
-    NUM_PRERUN = args.num_prerun
-    NUM_ITERATIONS = args.num_iterations
-
-    for device in get_test_devices(args):
-        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
-
-    print("\033[92mHardTanh Test passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 8aeba0100..275689e78 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -54,54 +54,6 @@ def add_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
-@OpRegister.operator
-def equal_(lib):
-    # =========================================================
-    # 1. 注册 Create 函数
-    # C函数签名: (handle, &desc, output_desc, input_a_desc, input_b_desc)
-    # =========================================================
-    lib.infiniopCreateEqualDescriptor.restype = c_int32
-    lib.infiniopCreateEqualDescriptor.argtypes = [
-        infiniopHandle_t,                     # handle
-        POINTER(infiniopOperatorDescriptor_t),# desc_ptr (输出)
-        infiniopTensorDescriptor_t,           # output (c)
-        infiniopTensorDescriptor_t,           # input_a
-        infiniopTensorDescriptor_t,           # input_b
-    ]
-
-    # =========================================================
-    # 2. 注册 GetWorkspaceSize 函数
-    # C函数签名: (desc, &size)
-    # =========================================================
-    lib.infiniopGetEqualWorkspaceSize.restype = c_int32
-    lib.infiniopGetEqualWorkspaceSize.argtypes = [
-        infiniopOperatorDescriptor_t,
-        POINTER(c_size_t),
-    ]
-
-    # =========================================================
-    # 3. 注册 Execute (计算) 函数
-    # C函数签名: (desc, workspace, size, output_data, input_a_data, input_b_data, stream)
-    # =========================================================
-    lib.infiniopEqual.restype = c_int32
-    lib.infiniopEqual.argtypes = [
-        infiniopOperatorDescriptor_t, # desc
-        c_void_p,                     # workspace ptr
-        c_size_t,                     # workspace size
-        c_void_p,                     # output data ptr
-        c_void_p,                     # input a data ptr
-        c_void_p,                     # input b data ptr
-        c_void_p,                     # stream
-    ]
-
-    # =========================================================
-    # 4. 注册 Destroy 函数
-    # C函数签名: (desc)
-    # =========================================================
-    lib.infiniopDestroyEqualDescriptor.restype = c_int32
-    lib.infiniopDestroyEqualDescriptor.argtypes = [
-        infiniopOperatorDescriptor_t,
-    ]
 
 @OpRegister.operator
 def attention_(lib):
@@ -210,40 +162,6 @@ def clip_(lib):
     ]
 
 
-@OpRegister.operator
-def cross_entropy_(lib):
-    lib.infiniopCreateCrossEntropyDescriptor.restype = c_int32
-    lib.infiniopCreateCrossEntropyDescriptor.argtypes = [
-        infiniopHandle_t,
-        POINTER(infiniopOperatorDescriptor_t),
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-    ]
-
-    lib.infiniopGetCrossEntropyWorkspaceSize.restype = c_int32
-    lib.infiniopGetCrossEntropyWorkspaceSize.argtypes = [
-        infiniopOperatorDescriptor_t,
-        POINTER(c_size_t),
-    ]
-
-    lib.infiniopCrossEntropy.restype = c_int32
-    lib.infiniopCrossEntropy.argtypes = [
-        infiniopOperatorDescriptor_t,
-        c_void_p,
-        c_size_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-
-    lib.infiniopDestroyCrossEntropyDescriptor.restype = c_int32
-    lib.infiniopDestroyCrossEntropyDescriptor.argtypes = [
-        infiniopOperatorDescriptor_t,
-    ]
-
-
 @OpRegister.operator
 def logsoftmax_(lib):
     lib.infiniopCreateLogSoftmaxDescriptor.restype = c_int32
@@ -991,112 +909,6 @@ def silu_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
-@OpRegister.operator
-def hardtanh_(lib):
-    # 1. Create Descriptor - 注意增加了两个 c_float 参数
-    lib.infiniopCreateHardTanhDescriptor.restype = c_int32
-    lib.infiniopCreateHardTanhDescriptor.argtypes = [
-        infiniopHandle_t,               # handle
-        POINTER(infiniopOperatorDescriptor_t), # desc_ptr
-        infiniopTensorDescriptor_t,     # output
-        infiniopTensorDescriptor_t,     # input
-        c_float,                        # min_val
-        c_float,                        # max_val
-    ]
-
-    # 2. Get Workspace Size
-    lib.infiniopGetHardTanhWorkspaceSize.restype = c_int32
-    lib.infiniopGetHardTanhWorkspaceSize.argtypes = [
-        infiniopOperatorDescriptor_t,   # desc
-        POINTER(c_size_t),              # size
-    ]
-
-    # 3. Execute Operator
-    lib.infiniopHardTanh.restype = c_int32
-    lib.infiniopHardTanh.argtypes = [
-        infiniopOperatorDescriptor_t,   # desc
-        c_void_p,                       # workspace
-        c_size_t,                       # workspace_size
-        c_void_p,                       # output
-        c_void_p,                       # input
-        c_void_p,                       # stream
-    ]
-
-    # 4. Destroy Descriptor
-    lib.infiniopDestroyHardTanhDescriptor.restype = c_int32
-    lib.infiniopDestroyHardTanhDescriptor.argtypes = [
-        infiniopOperatorDescriptor_t,   # desc
-    ]
-
-@OpRegister.operator
-def hardswish_(lib):
-    lib.infiniopCreateHardSwishDescriptor.restype = c_int32
-    lib.infiniopCreateHardSwishDescriptor.argtypes = [
-        infiniopHandle_t,
-        POINTER(infiniopOperatorDescriptor_t),
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-    ]
-
-    lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32
-    lib.infiniopGetHardSwishWorkspaceSize.argtypes = [
-        infiniopOperatorDescriptor_t,
-        POINTER(c_size_t),
-    ]
-
-    lib.infiniopHardSwish.restype = c_int32
-    lib.infiniopHardSwish.argtypes = [
-        infiniopOperatorDescriptor_t,
-        c_void_p,
-        c_size_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-
-    lib.infiniopDestroyHardSwishDescriptor.restype = c_int32
-    lib.infiniopDestroyHardSwishDescriptor.argtypes = [
-        infiniopOperatorDescriptor_t,
-    ]
-
-@OpRegister.operator
-def avg_pool1d_(lib):
-    # 1. Create 函数
-    # C签名: (handle, *desc, y, x, kernel_size, stride, padding)
-    lib.infiniopCreateAvgPool1dDescriptor.restype = c_int32
-    lib.infiniopCreateAvgPool1dDescriptor.argtypes = [
-        infiniopHandle_t,
-        POINTER(infiniopOperatorDescriptor_t),
-        infiniopTensorDescriptor_t,  # y_desc (Output)
-        infiniopTensorDescriptor_t,  # x_desc (Input)
-        c_size_t,                    # kernel_size
-        c_size_t,                    # stride
-        c_size_t,                    # padding
-    ]
-
-    # 2. GetWorkspaceSize 函数
-    lib.infiniopGetAvgPool1dWorkspaceSize.restype = c_int32
-    lib.infiniopGetAvgPool1dWorkspaceSize.argtypes = [
-        infiniopOperatorDescriptor_t,
-        POINTER(c_size_t),
-    ]
-
-    # 3. Execute 函数
-    lib.infiniopAvgPool1d.restype = c_int32
-    lib.infiniopAvgPool1d.argtypes = [
-        infiniopOperatorDescriptor_t,
-        c_void_p,  # workspace
-        c_size_t,  # workspace_size
-        c_void_p,  # y (output pointer)
-        c_void_p,  # x (input pointer)
-        c_void_p,  # stream
-    ]
-
-    # 4. Destroy 函数
-    lib.infiniopDestroyAvgPool1dDescriptor.restype = c_int32
-    lib.infiniopDestroyAvgPool1dDescriptor.argtypes = [
-        infiniopOperatorDescriptor_t,
-    ]
 
 @OpRegister.operator
 def layer_norm_(lib):
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index b690e74d4..ec8763a4e 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -83,12 +83,8 @@ def __init__(
                 InfiniDtype.BYTE,
                 InfiniDtype.BOOL,
             ]:
-                if dt == InfiniDtype.BOOL:
-                    randint_low = 0 if randint_low is None else randint_low
-                    randint_high = 2 if randint_high is None else randint_high
-                else:
-                    randint_low = -2000000000 if randint_low is None else randint_low
-                    randint_high = 2000000000 if randint_high is None else randint_high
+                randint_low = -2000000000 if randint_low is None else randint_low
+                randint_high = 2000000000 if randint_high is None else randint_high
                 self._torch_tensor = torch.randint(
                     randint_low,
                     randint_high,