From 6082f936b7ee581c00bfd60647fddc451f7409d0 Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 29 Jan 2021 12:39:52 +0800 Subject: [PATCH 1/5] using pattern trasform_scan to merge two kenerls into one --- k2/csrc/benchmark/array_ops_benchmark.cu | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/k2/csrc/benchmark/array_ops_benchmark.cu b/k2/csrc/benchmark/array_ops_benchmark.cu index ece59eec9..22a803846 100644 --- a/k2/csrc/benchmark/array_ops_benchmark.cu +++ b/k2/csrc/benchmark/array_ops_benchmark.cu @@ -9,6 +9,7 @@ #include "k2/csrc/array_ops.h" #include "k2/csrc/benchmark/benchmark.h" #include "k2/csrc/test_utils.h" +#include "moderngpu/kernel_scan.hxx" namespace k2 { @@ -268,6 +269,60 @@ static BenchmarkStat BenchmarkSizesToMergeMap(int32_t num_src, return stat; } +void TransExclusiveSumOld(const Array1 src, Array1 *ans) { + ContextPtr c = src.Context(); + int32_t dim = src.Dim(); + K2_CHECK_EQ(dim + 1, ans->Dim()); + const int32_t *src_data = src.Data(); + int32_t *ans_data = ans->Data(); + K2_EVAL( + c, dim, lambda_multiple2, + (int32_t i)->void { ans_data[i] = src_data[i] * 2; }); + ExclusiveSum(*ans, ans); +} + +void TransExclusiveSumNew(const Array1 src, Array1 *ans) { + ContextPtr c = src.Context(); + int32_t dim = src.Dim(); + K2_CHECK_EQ(dim + 1, ans->Dim()); + const int32_t *src_data = src.Data(); + int32_t *ans_data = ans->Data(); + K2_TRANS_EXCSUM( + c, dim, ans_data, lambda_multiple2, + (int32_t i)->int32_t { return src_data[i] * 2; }); +} + +static BenchmarkStat BenchmarkTransExclusiveSum(int32_t dim, + DeviceType device_type) { + ContextPtr context; + if (device_type == kCpu) { + context = GetCpuContext(); + } else { + K2_CHECK_EQ(device_type, kCuda); + context = GetCudaContext(); + } + + std::vector vec(dim); + std::iota(vec.begin(), vec.end(), dim); + Array1 src(context, vec); + Array1 ans(context, src.Dim() + 1); + + BenchmarkStat stat; + stat.op_name = "TransExclusiveSum_New_" + std::to_string(dim); + int32_t num_iter = 20; + stat.num_iter = num_iter; + stat.problem_size = dim; + stat.device_type = device_type; + + stat.eplased_per_iter = + BenchmarkOp(num_iter, context, + (void (*)(const Array1 &, Array1 *))( + &TransExclusiveSumNew), + src, &ans); + stat.eplased_per_iter *= 1e6; // from seconds to microseconds + return stat; +} + template static void RegisterBenchmarkExclusiveSum(DeviceType device_type) { std::vector problems_sizes = {100, 500, 1000, 2000, @@ -351,6 +406,20 @@ static void RegisterBenchmarkSizesToMergeMap(DeviceType device_type) { } } +static void RegisterBenchmarkTransExclusiveSum(DeviceType device_type) { + std::vector problems_sizes = {100, 200, 500, 1000, 2000, + 5000, 10000, 20000, 50000, 100000, + 500000, 1000000, 5000000}; + for (auto s : problems_sizes) { + std::string name = + GenerateBenchmarkName("TransExclusiveSum", device_type) + "_" + + std::to_string(s); + RegisterBenchmark(name, [s, device_type]() -> BenchmarkStat { + return BenchmarkTransExclusiveSum(s, device_type); + }); + } +} + static void RunArrayOpsBenchmark() { PrintEnvironmentInfo(); @@ -369,6 +438,8 @@ static void RunArrayOpsBenchmark() { RegisterBenchmarkSizesToMergeMap(kCuda); + RegisterBenchmarkTransExclusiveSum(kCuda); + // Users can set a regular expression via environment // variable `K2_BENCHMARK_FILTER` such that only benchmarks // with name matching the pattern are candidates to run. 
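The patch above measures the effect of fusing the element-wise transform (ans[i] = 2 * src[i]) and the exclusive sum into a single kernel launch, instead of the K2_EVAL-plus-ExclusiveSum pair in TransExclusiveSumOld(). The K2_TRANS_EXCSUM macro itself is not defined anywhere in this series; the following is only a minimal sketch of the fused pattern it presumably wraps, written against moderngpu's transform_scan from the kernel_scan.hxx header the patch includes. The overload transform_scan(f, count, output, op, reduction, context), its exclusive-scan default, and the visibility of GetModernGpuAllocator() (used later in ragged_ops.cu) in this translation unit are assumptions, not taken from the patch.

// Hedged sketch only -- not the actual K2_TRANS_EXCSUM definition, which this
// patch series does not show.  CUDA path only; the CPU path would still need
// an ordinary loop or the two-step K2_EVAL + ExclusiveSum() route.
void TransExclusiveSumFusedSketch(const Array1<int32_t> &src,
                                  Array1<int32_t> *ans) {
  ContextPtr c = src.Context();
  int32_t dim = src.Dim();
  K2_CHECK_EQ(dim + 1, ans->Dim());
  K2_CHECK_EQ(c->GetDeviceType(), kCuda);
  const int32_t *src_data = src.Data();
  int32_t *ans_data = ans->Data();
  mgpu::context_t *mgpu_context = GetModernGpuAllocator(c);
  // One launch: ans_data[0..dim-1] receives the exclusive prefix sums of
  // 2 * src_data[i]; the grand total (the extra trailing element that
  // ExclusiveSum() would write) goes through the reduction iterator,
  // i.e. to ans_data + dim.
  mgpu::transform_scan(
      [=] __device__(int32_t i) -> int32_t { return src_data[i] * 2; }, dim,
      ans_data, mgpu::plus_t<int32_t>(), ans_data + dim, *mgpu_context);
}

Compared with the two-kernel version, the fused form avoids one full write and re-read of the dim-element intermediate array plus one kernel-launch overhead, which is what the benchmark above is intended to quantify.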
From 414798550aba348a777c92241c6e02870571b5cb Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 29 Jan 2021 13:02:42 +0800 Subject: [PATCH 2/5] delete benchmark code as they should not be merged --- k2/csrc/benchmark/array_ops_benchmark.cu | 70 ------------------------ 1 file changed, 70 deletions(-) diff --git a/k2/csrc/benchmark/array_ops_benchmark.cu b/k2/csrc/benchmark/array_ops_benchmark.cu index 22a803846..4f6dfae99 100644 --- a/k2/csrc/benchmark/array_ops_benchmark.cu +++ b/k2/csrc/benchmark/array_ops_benchmark.cu @@ -269,60 +269,6 @@ static BenchmarkStat BenchmarkSizesToMergeMap(int32_t num_src, return stat; } -void TransExclusiveSumOld(const Array1 src, Array1 *ans) { - ContextPtr c = src.Context(); - int32_t dim = src.Dim(); - K2_CHECK_EQ(dim + 1, ans->Dim()); - const int32_t *src_data = src.Data(); - int32_t *ans_data = ans->Data(); - K2_EVAL( - c, dim, lambda_multiple2, - (int32_t i)->void { ans_data[i] = src_data[i] * 2; }); - ExclusiveSum(*ans, ans); -} - -void TransExclusiveSumNew(const Array1 src, Array1 *ans) { - ContextPtr c = src.Context(); - int32_t dim = src.Dim(); - K2_CHECK_EQ(dim + 1, ans->Dim()); - const int32_t *src_data = src.Data(); - int32_t *ans_data = ans->Data(); - K2_TRANS_EXCSUM( - c, dim, ans_data, lambda_multiple2, - (int32_t i)->int32_t { return src_data[i] * 2; }); -} - -static BenchmarkStat BenchmarkTransExclusiveSum(int32_t dim, - DeviceType device_type) { - ContextPtr context; - if (device_type == kCpu) { - context = GetCpuContext(); - } else { - K2_CHECK_EQ(device_type, kCuda); - context = GetCudaContext(); - } - - std::vector vec(dim); - std::iota(vec.begin(), vec.end(), dim); - Array1 src(context, vec); - Array1 ans(context, src.Dim() + 1); - - BenchmarkStat stat; - stat.op_name = "TransExclusiveSum_New_" + std::to_string(dim); - int32_t num_iter = 20; - stat.num_iter = num_iter; - stat.problem_size = dim; - stat.device_type = device_type; - - stat.eplased_per_iter = - BenchmarkOp(num_iter, context, - (void (*)(const Array1 &, Array1 *))( - &TransExclusiveSumNew), - src, &ans); - stat.eplased_per_iter *= 1e6; // from seconds to microseconds - return stat; -} - template static void RegisterBenchmarkExclusiveSum(DeviceType device_type) { std::vector problems_sizes = {100, 500, 1000, 2000, @@ -406,20 +352,6 @@ static void RegisterBenchmarkSizesToMergeMap(DeviceType device_type) { } } -static void RegisterBenchmarkTransExclusiveSum(DeviceType device_type) { - std::vector problems_sizes = {100, 200, 500, 1000, 2000, - 5000, 10000, 20000, 50000, 100000, - 500000, 1000000, 5000000}; - for (auto s : problems_sizes) { - std::string name = - GenerateBenchmarkName("TransExclusiveSum", device_type) + "_" + - std::to_string(s); - RegisterBenchmark(name, [s, device_type]() -> BenchmarkStat { - return BenchmarkTransExclusiveSum(s, device_type); - }); - } -} - static void RunArrayOpsBenchmark() { PrintEnvironmentInfo(); @@ -438,8 +370,6 @@ static void RunArrayOpsBenchmark() { RegisterBenchmarkSizesToMergeMap(kCuda); - RegisterBenchmarkTransExclusiveSum(kCuda); - // Users can set a regular expression via environment // variable `K2_BENCHMARK_FILTER` such that only benchmarks // with name matching the pattern are candidates to run. 
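Patches 4 and 5 below apply the same single-kernel idea to IndexAxis0(), replacing the per-axis, per-job kernels with one mgpu::transform_lbs (load-balancing search) launch over all output elements. As a reference for reading that code, here is a hedged sketch of how transform_lbs decomposes a flat element index into the (seg, rank) pair the new kernel relies on. It reuses the k2 helpers those patches already call (Array1, GetModernGpuAllocator, K2_CUDA_SAFE_CALL) and the row_splits {0, 2, 5, 6} from the tests in patch 4; it assumes no extra includes are needed beyond what those patches already pull in.

// Hedged sketch, CUDA only: with segment descriptor (row_splits) {0, 2, 5, 6}
// and 6 elements, transform_lbs calls the lambda with
//   (index, seg, rank) = (0,0,0) (1,0,1) (2,1,0) (3,1,1) (4,1,2) (5,2,0),
// i.e. index is an idx01, seg its row (idx0) and rank its position within
// that row (idx1).  Writing `seg` at `index` therefore recovers the row_ids
// {0, 0, 1, 1, 1, 2} in one kernel launch; IndexAxis0New() additionally uses
// `rank` to offset into the source row_splits/row_ids.
void TransformLbsRowIdsSketch(ContextPtr c /* must be a CUDA context */) {
  Array1<int32_t> row_splits(c, std::vector<int32_t>{0, 2, 5, 6});
  int32_t num_rows = row_splits.Dim() - 1;
  int32_t num_elems = 6;  // == row_splits.Back()
  Array1<int32_t> row_ids(c, num_elems);
  int32_t *row_ids_data = row_ids.Data();
  auto lambda_write_row_ids = [=] __device__(int32_t index, int32_t seg,
                                             int32_t rank) {
    (void)rank;  // unused in this toy example
    row_ids_data[index] = seg;
  };
  mgpu::context_t *mgpu_context = GetModernGpuAllocator(c);
  K2_CUDA_SAFE_CALL(mgpu::transform_lbs(lambda_write_row_ids, num_elems,
                                        row_splits.Data(), num_rows,
                                        *mgpu_context));
}

The call mirrors the pattern used in the patches below: count is the total number of elements (new_offsets.Back() there), the segment descriptor is an exclusive-sum/row_splits array, and num_segments is its dimension minus one.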
From c8d516e1fa6fa00bbd8bf26c6e10f1b206705b8e Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 29 Jan 2021 13:03:49 +0800 Subject: [PATCH 3/5] delete useless header --- k2/csrc/benchmark/array_ops_benchmark.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/k2/csrc/benchmark/array_ops_benchmark.cu b/k2/csrc/benchmark/array_ops_benchmark.cu index 4f6dfae99..ece59eec9 100644 --- a/k2/csrc/benchmark/array_ops_benchmark.cu +++ b/k2/csrc/benchmark/array_ops_benchmark.cu @@ -9,7 +9,6 @@ #include "k2/csrc/array_ops.h" #include "k2/csrc/benchmark/benchmark.h" #include "k2/csrc/test_utils.h" -#include "moderngpu/kernel_scan.hxx" namespace k2 { From bbe1d9ee3f500ca9d2179ba5b6d19a600fbdf964 Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 19 Feb 2021 11:05:35 +0800 Subject: [PATCH 4/5] try to implement the loop on axis with one kernel for IndexAxis0 --- k2/csrc/benchmark/ragged_ops_benchmark.cu | 56 +++++ k2/csrc/ragged_ops.cu | 259 ++++++++++++++++++---- k2/csrc/ragged_ops.h | 22 +- k2/csrc/ragged_test.cu | 135 ++++++++++- 4 files changed, 414 insertions(+), 58 deletions(-) diff --git a/k2/csrc/benchmark/ragged_ops_benchmark.cu b/k2/csrc/benchmark/ragged_ops_benchmark.cu index 7276c6f1f..68d7afb7c 100644 --- a/k2/csrc/benchmark/ragged_ops_benchmark.cu +++ b/k2/csrc/benchmark/ragged_ops_benchmark.cu @@ -114,6 +114,60 @@ static void RegisterBenchmarkSegmentedExclusiveSum(DeviceType device_type) { } } +static BenchmarkStat BenchmarkIndexAxis0(int32_t dim, DeviceType device_type) { + ContextPtr context; + if (device_type == kCpu) { + context = GetCpuContext(); + } else { + K2_CHECK_EQ(device_type, kCuda); + context = GetCudaContext(); + } + + int32_t num_iter = 20; + int32_t min_num_elems = dim * 10; + int32_t max_num_elems = dim * 20; + + int32_t num_axes = 4; + RaggedShape shape = + RandomRaggedShape(true, num_axes, num_axes, min_num_elems, max_num_elems) + .To(context); + int32_t dim0 = shape.Dim0(), result_dim0 = RandInt(0, dim0); + if (dim0 == 0) result_dim0 = 0; + std::vector new2old_vec(result_dim0); + for (int i = 0; i < result_dim0; i++) new2old_vec[i] = RandInt(-1, dim0 - 1); + Array1 new2old(context, new2old_vec); + int32_t num_elems = shape.NumElements(); + Array1 value_indexes; + + BenchmarkStat stat; + stat.op_name = "IndexAxis0New_" + std::to_string(num_axes) + "_" + + std::to_string(shape.Dim0()) + "_" + std::to_string(num_elems); + stat.num_iter = num_iter; + stat.problem_size = dim; + stat.dtype_name = TraitsOf(DtypeOf::dtype).Name(); + stat.device_type = device_type; + + stat.eplased_per_iter = + BenchmarkOp(num_iter, context, + (RaggedShape(*)(RaggedShape &, const Array1 &, + Array1 *))(&IndexAxis0New), + shape, new2old, &value_indexes); + stat.eplased_per_iter *= 1e6; // from seconds to microseconds + return stat; +} + +static void RegisterBenchmarkIndexAxis0(DeviceType device_type) { + std::vector problems_sizes = {50, 100, 200, 500, 1000, + 10000, 50000, 100000, 200000}; + for (auto s : problems_sizes) { + std::string name = + GenerateBenchmarkName("IndexAxis0", device_type); + RegisterBenchmark(name, [s, device_type]() -> BenchmarkStat { + return BenchmarkIndexAxis0(s, device_type); + }); + } +} + static void RunRaggedOpsBenchmark() { PrintEnvironmentInfo(); @@ -122,6 +176,8 @@ static void RunRaggedOpsBenchmark() { RegisterBenchmarkSegmentedExclusiveSum(kCpu); RegisterBenchmarkSegmentedExclusiveSum(kCuda); + RegisterBenchmarkIndexAxis0(kCuda); + // Users can set a regular expression via environment // variable `K2_BENCHMARK_FILTER` such that only benchmarks // with name matching 
the pattern are candidates to run. diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu index 8bb48fe95..bb29f99d8 100644 --- a/k2/csrc/ragged_ops.cu +++ b/k2/csrc/ragged_ops.cu @@ -358,13 +358,12 @@ std::vector UnsqueezeParallel(int32_t num_srcs, RaggedShape **src, Note: `ans` is the result of Index(), with ans.Dim0() == new2old.Dim(). */ -inline void GetOldAndNewOffsets(RaggedShape &src, - const Array1 &new2old, - Array2 *old_offsets, - Array2 *new_offsets) { +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array2 *old_offsets, + Array2 *new_offsets) { NVTX_RANGE(K2_FUNC); K2_CHECK_GT(src.NumAxes(), 1); - ContextPtr &c = src.Context(); + ContextPtr c = GetContext(src, new2old); int32_t num_axes = src.NumAxes(), ans_dim0 = new2old.Dim(); // max 5 layers. @@ -380,16 +379,14 @@ inline void GetOldAndNewOffsets(RaggedShape &src, K2_EVAL( c, ans_dim0, lambda_set_offsets, (int32_t i)->void { // 0 <= i < ans_dim0 - int32_t old_offset = new2old_data[i], - old_offset_next = old_offset + 1, - offset_diff = 1; + int32_t old_offset = new2old_data[i], old_offset_next = old_offset + 1, + offset_diff = 1; // The following is a special case that interprets -1 as referring to an // empty list. In this case, old_offset == old_offset_next == 0. // The specific value 0 is not necessary; they could be equal // and have any value in [0, src.Dim0() - 1] and still refer to // the empty list. - if (old_offset == -1) - old_offset = 0; + if (old_offset == -1) old_offset = 0; for (int32_t axis = 0;; axis++) { old_offsets_acc(axis, i) = old_offset; // Below, 'new_offsets_acc' currently contains the size rather @@ -404,8 +401,54 @@ inline void GetOldAndNewOffsets(RaggedShape &src, ExclusiveSum(*new_offsets, new_offsets); } -static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, - Array1 *elem_indexes /*=nullptr*/) { +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array1 *old_offsets, + Array1 *new_offsets) { + NVTX_RANGE(K2_FUNC); + K2_CHECK_GT(src.NumAxes(), 1); + ContextPtr c = GetContext(src, new2old); + int32_t num_axes = src.NumAxes(), ans_dim0 = new2old.Dim(); + + // max 5 layers. + RowSplitsAccessor<5> row_splits_acc(src); + + const int32_t *new2old_data = new2old.Data(); + *old_offsets = Array1(c, (num_axes - 1) * ans_dim0); + *new_offsets = Array1(c, (num_axes - 1) * ans_dim0 + 1); + int32_t *old_offsets_data = old_offsets->Data(), + *new_offsets_data = new_offsets->Data(); + // Set old_offsets; and for now, set new_offsets to the corresponding + // sizes of the output slices. + K2_EVAL( + c, ans_dim0, lambda_set_offsets, (int32_t i)->void { + // 0 <= i < ans_dim0 + int32_t old_offset = new2old_data[i], old_offset_next = old_offset + 1, + offset_diff = 1; + // The following is a special case that interprets -1 as referring to an + // empty list. In this case, old_offset == old_offset_next == 0. + // The specific value 0 is not necessary; they could be equal + // and have any value in [0, src.Dim0() - 1] and still refer to + // the empty list. + if (old_offset == -1) old_offset = 0; + old_offset = row_splits_acc(0)[old_offset]; + old_offset_next = row_splits_acc(0)[old_offset_next]; + offset_diff = old_offset_next - old_offset; + for (int32_t axis = 1;; axis++) { + old_offsets_data[(axis - 1) * ans_dim0 + i] = old_offset; + // Below, 'new_offsets_acc' currently contains the size rather + // than the offset; we need to do exclusive-sum. 
+ new_offsets_data[(axis - 1) * ans_dim0 + i] = offset_diff; + if (axis + 1 == num_axes) return; + old_offset = row_splits_acc(axis)[old_offset]; + old_offset_next = row_splits_acc(axis)[old_offset_next]; + offset_diff = old_offset_next - old_offset; + } + }); + ExclusiveSum(*new_offsets, new_offsets); +} + +RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/) { ContextPtr &c = src.Context(); bool is_cpu = (c->GetDeviceType() == kCpu); K2_CHECK(IsCompatible(src, new2old)); @@ -416,7 +459,6 @@ static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, return EmptyRaggedShape(c, num_axes); } - Array2 old_offsets, // num_axes by ans_dim0 new_offsets; // num_axes by (ans_dim0 + 1). GetOldAndNewOffsets(src, new2old, &old_offsets, &new_offsets); @@ -438,7 +480,6 @@ static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, Array2 task_redirects(c, num_axes, num_jobs); auto task_redirects_acc = task_redirects.Accessor(); - ans.Layers()[0].row_splits = new_offsets.Row(1); for (int32_t axis = 0; axis < num_axes; ++axis) { @@ -560,6 +601,157 @@ static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, return ans; } +RaggedShape IndexAxis0New(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/) { + ContextPtr c = GetContext(src, new2old); + int32_t num_axes = src.NumAxes(), src_dim0 = src.Dim0(), + ans_dim0 = new2old.Dim(); + if (ans_dim0 == 0) { + if (elem_indexes) *elem_indexes = Array1(c, 0); + return EmptyRaggedShape(c, num_axes); + } + + Array1 old_offsets, // (num_axes-1) by ans_dim0 + new_offsets; // ((num_axes-1) by ans_dim0) + 1. + GetOldAndNewOffsets(src, new2old, &old_offsets, &new_offsets); + const int32_t *old_offsets_data = old_offsets.Data(), + *new_offsets_data = new_offsets.Data(); + + if (num_axes == 2) { + if (c->GetDeviceType() == kCpu) { + } else { + K2_CHECK_EQ(c->GetDeviceType(), kCuda); + int32_t ans_num_elems = new_offsets.Back(); + Array1 ans_row_splits(c, ans_num_elems); + int32_t *ans_row_splits_data = ans_row_splits.Data(); + if (elem_indexes != nullptr) + *elem_indexes = Array1(c, ans_num_elems); + int32_t *elem_indexes_data = + elem_indexes != nullptr ? elem_indexes->Data() : nullptr; + if (ans_num_elems == 0) return RaggedShape2(&new_offsets, nullptr, 0); + + mgpu::context_t *mgpu_context = GetModernGpuAllocator(c); + auto lambda_set_ans = [=] __device__(int32_t index, int32_t seg, + int32_t rank) { + ans_row_splits_data[new_offsets_data[seg] + rank] = seg; + if (elem_indexes_data != nullptr) { + elem_indexes_data[new_offsets_data[seg] + rank] = + old_offsets_data[seg] + rank; + } + }; + K2_CUDA_SAFE_CALL(mgpu::transform_lbs( + lambda_set_ans, new_offsets.Back(), new_offsets.Data(), + new_offsets.Dim() - 1, *mgpu_context)); + RaggedShape ans = + RaggedShape2(&new_offsets, &ans_row_splits, ans_num_elems); + return ans; + } + } + + Array1 tot_sizes_out(c, num_axes); + int32_t *tot_sizes_out_data = tot_sizes_out.Data(); + K2_EVAL( + c, num_axes, lambda_set_tot_sizes, (int32_t i)->void { + if (i == 0) + tot_sizes_out_data[0] = ans_dim0; + else { + tot_sizes_out_data[i] = new_offsets_data[ans_dim0 * i] - + new_offsets_data[ans_dim0 * (i - 1)]; + } + }); + tot_sizes_out = tot_sizes_out.To(GetCpuContext()); + + if (elem_indexes != nullptr) + *elem_indexes = Array1(c, tot_sizes_out.Back()); + int32_t *elem_indexes_data = + elem_indexes != nullptr ? 
elem_indexes->Data() : nullptr; + RaggedShape ans = RaggedShapeFromTotSizes(c, num_axes, tot_sizes_out.Data()); + + RowSplitsAccessor<5> ans_row_splits_acc(ans); + RowIdsAccessor<5> ans_row_ids_acc(ans); + + RowSplitsAccessor<5> src_row_splits_acc(src); + RowIdsAccessor<5> src_row_ids_acc(src); + + ans.Layers()[0].row_splits = new_offsets.Arange(0, ans_dim0 + 1); + + if (c->GetDeviceType() == kCpu) { + } else { + K2_CHECK_EQ(c->GetDeviceType(), kCuda); + + int32_t tot_elems = new_offsets.Back(); + if (tot_elems == 0) { + for (int32_t i = 0; i != num_axes - 1; ++i) { + if (i != 0) + ans.Layers()[i].row_splits = + new_offsets.Arange(ans_dim0 + i, ans_dim0 + i + 1); + ans.Layers()[i].row_ids = Array1(c, 0); + } + return ans; + } + mgpu::context_t *mgpu_context = GetModernGpuAllocator(c); + auto lambda_set_ans = [=] __device__(int32_t index, int32_t seg, + int32_t rank) { + // There are (num_axes-1) * ans_dim0 segments totally, each segment + // handles the corresponding row_ids for axis `axis` and + // row_splits for axis `axis + 1`. + if (index == 0) { + for (int32_t i = 1; i != num_axes - 1; ++i) { + ans_row_splits_acc(i)[0] = 0; + } + } + // TODO(Haowen): share below variables (including new_offset_prev, + // new_offset_next, etc.) in a segment, or at least we can create a kernel + // like things in IndexAxis0 to share those variables in 4/8 threads. + int32_t axis = seg / ans_dim0; + int32_t new_offset_curr = + new_offsets_data[seg] - new_offsets_data[axis * ans_dim0], + old_offset_curr = old_offsets_data[seg]; + if (axis != num_axes - 2) { + // set row_splits for the next axis + + int32_t old_offset_next = old_offsets_data[seg + ans_dim0], + new_offset_next = new_offsets_data[seg + ans_dim0] - + new_offsets_data[(axis + 1) * ans_dim0]; + int32_t value_offset = new_offset_next - old_offset_next; + // noted row_splits_acc(axis) is RowSplits(axis + 1); + int32_t *ans_row_splits_next_axis = ans_row_splits_acc(axis + 1); + const int32_t *src_row_splits_next_axis = src_row_splits_acc(axis + 1); + ans_row_splits_next_axis[new_offset_curr + rank] = + value_offset + src_row_splits_next_axis[old_offset_curr + rank]; + // so that we can always set the last element of row_splits for the next + // axis even though there are empty rows in current axis. 
+ ans_row_splits_next_axis[new_offset_curr + rank + 1] = + value_offset + src_row_splits_next_axis[old_offset_curr + rank + 1]; + } + { + // set row_id for the current axis + if (axis == 0) { + ans_row_ids_acc(axis)[new_offset_curr + rank] = seg; + } else { + int32_t new_offset_prev = new_offsets_data[seg - ans_dim0] - + new_offsets_data[(axis - 1) * ans_dim0], + old_offset_prev = old_offsets_data[seg - ans_dim0]; + int32_t value_offset = new_offset_prev - old_offset_prev; + ans_row_ids_acc(axis)[new_offset_curr + rank] = + value_offset + src_row_ids_acc(axis)[old_offset_curr + rank]; + } + } + if (axis == num_axes - 2 && elem_indexes_data != nullptr) { + elem_indexes_data[new_offset_curr + rank] = old_offset_curr + rank; + } + }; + K2_CUDA_SAFE_CALL( + mgpu::transform_lbs(lambda_set_ans, tot_elems, new_offsets.Data(), + new_offsets.Dim() - 1, *mgpu_context)); + } + +#if !defined(NDEBUG) + ans.Check(); +#endif + return ans; +} + RaggedShape Index(RaggedShape &src, int32_t axis, const Array1 &indexes, Array1 *elem_indexes /*=nullptr*/) { @@ -580,8 +772,7 @@ RaggedShape Index(RaggedShape &src, int32_t axis, Array1 last_row_splits(last_row_ids.Context(), src.TotSize(num_axes - 2) + 1); RowIdsToRowSplits(last_row_ids, &last_row_splits); - if (elem_indexes) - *elem_indexes = indexes; + if (elem_indexes) *elem_indexes = indexes; std::vector axes = src.Layers(); axes.back().row_splits = last_row_splits; @@ -594,7 +785,7 @@ RaggedShape Index(RaggedShape &src, int32_t axis, DecomposeRaggedShape(src, axis, &top, &bottom); RaggedShape top_indexed = Index(top, axis, indexes, nullptr), - bottom_indexed = IndexAxis0(bottom, indexes, elem_indexes); + bottom_indexed = IndexAxis0(bottom, indexes, elem_indexes); return ComposeRaggedShapes(top_indexed, bottom_indexed); } } @@ -1198,10 +1389,9 @@ static Array1 GetTransposeReorderingThreeAxesCuda(Ragged &src, return ans; } - /* -// Checks the result of GetTranspoeReordering(), in debug mode and dies if it is wrong. -static void CheckGetTransposeReordering(Ragged &src, +// Checks the result of GetTranspoeReordering(), in debug mode and dies if it is +wrong. static void CheckGetTransposeReordering(Ragged &src, Array1 &ans) { if (!internal::kDisableDebug && !internal::DisableChecks()) { K2_CHECK(IsPermutation(ans)); @@ -2063,8 +2253,6 @@ RaggedShape RaggedShapeAxis0Splitter::GetElement(int32_t i, return RaggedShape(out); } - - namespace hash_internal { // Utilities for hashing strings (actually: sequences of int32_t). @@ -2099,7 +2287,8 @@ struct Hash { template struct HashInputIterator { - explicit __host__ __device__ __forceinline__ HashInputIterator(const int32_t *i) // NOLINT + explicit __host__ __device__ __forceinline__ + HashInputIterator(const int32_t *i) // NOLINT : i_(i) {} __device__ __forceinline__ Hash operator[](int32_t idx) const { return Hash{i_[idx], i_[idx], 31, 167}; @@ -2115,8 +2304,7 @@ struct HashOutputIteratorDeref { // this is what you get when you dereference // HashOutputIterator, it pretends to be a // Hash but really only stores the `idx` // member. 
- explicit __device__ __forceinline__ HashOutputIteratorDeref(T *t) - : t_(t) {} + explicit __device__ __forceinline__ HashOutputIteratorDeref(T *t) : t_(t) {} __device__ __forceinline__ HashOutputIteratorDeref &operator=( const Hash &h) { *t_ = h.hash1 + 13 * h.product1 + 104729 * h.hash2 + @@ -2144,8 +2332,7 @@ struct HashCombineOp { __device__ __forceinline__ Hash operator()(const Hash &a, const Hash &b) const { return Hash{a.hash1 * b.product1 + b.hash1, - a.hash2 * b.product2 + b.hash2, - a.product1 * b.product1, + a.hash2 * b.product2 + b.hash2, a.product1 * b.product1, a.product2 * b.product2}; } }; @@ -2199,7 +2386,7 @@ Array1 ComputeHash(Ragged &src) { hash_internal::HashInputIterator input_iter(values_data); hash_internal::HashOutputIterator output_iter(output_data); hash_internal::HashCombineOp op; - hash_internal::Hash initial_hash{ 0, 0, 1, 1 }; + hash_internal::Hash initial_hash{0, 0, 1, 1}; // This code is based on the example here: // https://nvlabs.github.io/cub/structcub_1_1_device_segmented_reduce.html @@ -2218,7 +2405,6 @@ Array1 ComputeHash(Ragged &src) { return ans; } - Ragged UniqueSequences(Ragged &src) { ContextPtr &c = src.Context(); if (src.NumAxes() == 2) { @@ -2235,11 +2421,12 @@ Ragged UniqueSequences(Ragged &src) { Ragged ragged_hashes(GetLayer(src.shape, src.shape.NumLayers() - 2), hashes); - SortSublists >(&ragged_hashes, &order); + SortSublists>(&ragged_hashes, &order); Renumbering renumber_lists(c, hashes.Dim()); const int32_t *ragged_hashes_row_ids_data = ragged_hashes.RowIds(1).Data(), - *ragged_hashes_row_splits_data = ragged_hashes.RowSplits(1).Data(); + *ragged_hashes_row_splits_data = + ragged_hashes.RowSplits(1).Data(); const int64_t *ragged_hashes_data = ragged_hashes.values.Data(); char *keep_list_data = renumber_lists.Keep().Data(); K2_EVAL( @@ -2254,16 +2441,12 @@ Ragged UniqueSequences(Ragged &src) { keep_list_data[i] = keep; }); Array1 new2old = renumber_lists.New2Old(), - new2unsorted = order[new2old]; + new2unsorted = order[new2old]; return Index(src, src.NumAxes() - 2, new2unsorted); } - // Instantiate template for int64 and int32. -template -Array1 ComputeHash(Ragged &src); -template -Array1 ComputeHash(Ragged &src); - +template Array1 ComputeHash(Ragged &src); +template Array1 ComputeHash(Ragged &src); } // namespace k2 diff --git a/k2/csrc/ragged_ops.h b/k2/csrc/ragged_ops.h index 1a9090e07..fc613a96f 100644 --- a/k2/csrc/ragged_ops.h +++ b/k2/csrc/ragged_ops.h @@ -132,7 +132,6 @@ void OrPerSublist(Ragged &src, T initial_value, Array1 *or_values) { SegmentedReduce>(src, initial_value, or_values); } - /* Stack a list of RaggedShape to create a RaggedShape with one more axis. Similar to TF/PyTorch's Stack. The result will have Dim0 == src_size. @@ -228,6 +227,17 @@ RaggedShape Prefix(RaggedShape &src, int32_t n); std::vector GetPrefixes(RaggedShape &src, const std::vector &sizes); +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array2 *old_offsets, + Array2 *new_offsets); +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array1 *old_offsets, + Array1 *new_offsets); + +RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/); +RaggedShape IndexAxis0New(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/); /* This object splits a ragged shape on its axis 0, giving you efficient axis to the sub-parts of it for each index into its axis0. 
@@ -416,7 +426,8 @@ RaggedShape Unsqueeze(const RaggedShape &src, int32_t axis); Version of Unsqueeze() above, that works for ragged tensors. Note: the opposite of this is not Squeeze(); it is ans.RemoveAxis(axis). */ -template Ragged Unsqueeze(const Ragged &src, int32_t axis) { +template +Ragged Unsqueeze(const Ragged &src, int32_t axis) { return Ragged(Unsqueeze(src.shape, axis), src.values); } @@ -1027,7 +1038,6 @@ inline Ragged RaggedFromTotSizes(ContextPtr &c, Array1(c, tot_sizes.back())); } - /* Transpose a ragged tensor as if it were the index information of a CSR-format sparse matrix (but with possibly repeated elements!). This is easiest to @@ -1146,7 +1156,6 @@ RaggedShape Index(RaggedShape &src, int32_t axis, const Array1 &indexes, Array1 *elem_indexes = nullptr); - /* Index ragged tensor with array, return ragged tensor. @@ -1172,8 +1181,7 @@ RaggedShape Index(RaggedShape &src, int32_t axis, */ template -Ragged Index(Ragged &src, int32_t axis, - const Array1 &indexes, +Ragged Index(Ragged &src, int32_t axis, const Array1 &indexes, Array1 *value_indexes_out = nullptr) { Array1 value_indexes; RaggedShape ans_shape = Index(src.shape, axis, indexes, &value_indexes); @@ -1327,7 +1335,6 @@ Array1 CoveringShapeForwardMap(RaggedShape &src, template Array1 ComputeHash(Ragged &src); - /* If `src` has two axes, this will return the unique sub-lists (in a possibly different order, but without repeats). If `src` has 3 axes, it will @@ -1348,7 +1355,6 @@ Array1 ComputeHash(Ragged &src); */ Ragged UniqueSequences(Ragged &src); - /* Compute exclusive sum per sub-list. * * @param [in] src The input ragged tensor. The exclusive sum is computed diff --git a/k2/csrc/ragged_test.cu b/k2/csrc/ragged_test.cu index 5214e2d15..ca4b4d20e 100644 --- a/k2/csrc/ragged_test.cu +++ b/k2/csrc/ragged_test.cu @@ -64,6 +64,119 @@ class RaggedShapeOpsSuiteTest : public ::testing::Test { RaggedShape random_shape_; }; +TEST(RaggedShapeOpsTest, TestIndex0) { + for (auto &context : {GetCudaContext()}) { + ContextPtr cpu = GetCpuContext(); // will be used to copy data + { + // simple case + const std::vector row_splits1 = {0, 2, 5, 6}; + const std::vector row_ids1 = {0, 0, 1, 1, 1, 2}; + const std::vector row_splits2 = {0, 2, 3, 4, 6, 7, 10}; + const std::vector row_ids2 = {0, 0, 1, 2, 3, 3, 4, 5, 5, 5}; + + Array1 splits1(context, row_splits1); + Array1 ids1(context, row_ids1); + Array1 splits2(context, row_splits2); + Array1 ids2(context, row_ids2); + RaggedShape shape = RaggedShape3(&splits1, &ids1, ids1.Dim(), &splits2, + &ids2, ids2.Dim()); + + std::vector new2old_vec = {2, 1}; + Array1 new2old(context, new2old_vec); + Array1 value_indexes_out; + RaggedShape result = IndexAxis0New(shape, new2old, &value_indexes_out); + // fsa 2, state_idx01 {5}, arc_idx012 {7, 8, 9} + // fsa 1, state_idx01 {2, 3, 4}, arc_idx012 {{3},{4, 5}, {6}} + CheckArrayData(value_indexes_out, + std::vector{7, 8, 9, 3, 4, 5, 6}); + } + + { + // simple case 1 + const std::vector row_splits1 = {0, 2, 5, 6}; + const std::vector row_ids1 = {0, 0, 1, 1, 1, 2}; + + Array1 splits1(context, row_splits1); + Array1 ids1(context, row_ids1); + RaggedShape shape = RaggedShape2(&splits1, &ids1, ids1.Dim()); + + std::vector new2old_vec = {2, 1}; + Array1 new2old(context, new2old_vec); + Array1 value_indexes_out; + RaggedShape result = IndexAxis0New(shape, new2old, &value_indexes_out); + CheckArrayData(value_indexes_out, std::vector{5, 2, 3, 4}); + } + // test with random large size + for (int32_t i = 0; i < 50; ++i) { + int32_t num_axes = RandInt(2, 4); + 
RaggedShape shape = + RandomRaggedShape(true, num_axes, num_axes, 0, 1000).To(context); + int32_t dim0 = shape.Dim0(), result_dim0 = RandInt(0, 10); + if (dim0 == 0) result_dim0 = 0; + std::vector new2old_vec(result_dim0); + for (int i = 0; i < result_dim0; i++) + new2old_vec[i] = RandInt(-1, dim0 - 1); + Array1 new2old(context, new2old_vec); + K2_LOG(INFO) << "new2old=" << new2old; + + for (int32_t i = 0; i != shape.NumAxes() - 1; ++i) { + K2_LOG(INFO) << "row_splits=" << shape.RowSplits(i + 1); + K2_LOG(INFO) << "row_ids=" << shape.RowIds(i + 1); + } + + Array1 value_indexes_out_old; + RaggedShape result_old = + IndexAxis0(shape, new2old, &value_indexes_out_old); + Array1 value_indexes_out; + RaggedShape result = IndexAxis0New(shape, new2old, &value_indexes_out); + + ASSERT_TRUE(Equal(result_old, result)); + CheckArrayData(value_indexes_out_old, value_indexes_out); + } + } + K2_LOG(FATAL) << "end.........."; +} + +TEST(RaggedShapeOpsTest, GetOldAndNewOffsets) { + for (int32_t i = 0; i < 1; i++) { + for (auto &context : {GetCpuContext(), GetCudaContext()}) { + RaggedShape random = RandomRaggedShape(false, 2, 4, 0, 2000).To(context); + int32_t dim0 = random.Dim0(); + int32_t dim = RandInt(0, dim0); + K2_LOG(INFO) << "shape=" << random; + for (int32_t n = 1; n != random.NumAxes(); ++n) + K2_LOG(INFO) << "RowSplits=" << random.RowSplits(n); + Array1 indexes = RandUniformArray1(context, dim, -1, dim0 - 1); + K2_LOG(INFO) << "index=" << indexes; + { + Array2 old_offsets, // num_axes by ans_dim0 + new_offsets; // num_axes by (ans_dim0 + 1). + GetOldAndNewOffsets(random, indexes, &old_offsets, &new_offsets); + K2_LOG(INFO) << "old_offsets=" << old_offsets; + K2_LOG(INFO) << "new_offsets=" << new_offsets; + int32_t num_axes = random.NumAxes(); + std::vector> arrays_vec(num_axes); + std::vector *> arrays(num_axes); + for (int32_t j = 0; j != num_axes; ++j) { + Array1 row = new_offsets.Row(j); + arrays_vec[j] = row; + arrays[j] = &arrays_vec[j]; + } + const Array1 **src = arrays.data(); + Array1 dst = SpliceRowSplits(num_axes, src); + K2_LOG(INFO) << "splice_offsets=" << dst; + } + { + Array1 old_offsets, // num_axes by ans_dim0 + new_offsets; // num_axes by (ans_dim0 + 1). 
+ GetOldAndNewOffsets(random, indexes, &old_offsets, &new_offsets); + K2_LOG(INFO) << "old_offsets=" << old_offsets; + K2_LOG(INFO) << "new_offsets=" << new_offsets; + } + } + } +} + TEST(RaggedShapeTest, TestConstructFromString) { RaggedShape rs(" [ [ x x ] [x] ]"); Array1 row_splits1(GetCpuContext(), std::vector{0, 2, 3}); @@ -1401,13 +1514,16 @@ TEST(RaggedShapeOpsTest, TestIndex) { } } - TEST(RaggedShapeOpsTest, TestIndexAxis1) { for (auto &context : {GetCpuContext(), GetCudaContext()}) { { - Ragged input = Ragged(" [ [ 1 2 ] [ 3 4 5 ] [ 6 7 ] [ ] ]").To(context); // NOLINT + Ragged input = + Ragged(" [ [ 1 2 ] [ 3 4 5 ] [ 6 7 ] [ ] ]") + .To(context); // NOLINT Array1 indexes = Array1(" [ 1 0 4 2 6 5 ]").To(context); - Ragged output = Ragged(" [ [ 2 1 ] [ 5 3 ] [ 7 6 ] [ ] ]").To(context); // NOLINT + Ragged output = + Ragged(" [ [ 2 1 ] [ 5 3 ] [ 7 6 ] [ ] ]") + .To(context); // NOLINT Ragged indexed = Index(input, 1, indexes); EXPECT_EQ(Equal(output, indexed), true); @@ -1415,8 +1531,6 @@ TEST(RaggedShapeOpsTest, TestIndexAxis1) { } } - - TEST(GetTransposeReordering, NoDuplicates) { // col0 col1 col2 col3 col4 col5 // row0 a0 b1 @@ -2581,8 +2695,6 @@ TEST(RaggedOpsTest, TestComputeHash) { } } - - TEST(RaggedOpsTest, TestUniqueSequences) { for (int32_t i = 0; i < 20; i++) { for (auto &c : {GetCpuContext(), GetCudaContext()}) { @@ -2597,7 +2709,7 @@ TEST(RaggedOpsTest, TestUniqueSequences) { ContextPtr cpu = GetCpuContext(); Array1 hash_src = ComputeHash(src).To(cpu), - hash_unique = ComputeHash(unique).To(cpu); + hash_unique = ComputeHash(unique).To(cpu); RaggedShape src_hash_shape = RemoveAxis(src.shape, src.NumAxes() - 1).To(cpu); @@ -2611,9 +2723,10 @@ TEST(RaggedOpsTest, TestUniqueSequences) { K2_CHECK_EQ(src_hash_shape.Dim0(), unique_hash_shape.Dim0()); const int32_t *src_hash_row_splits = src_hash_shape.RowSplits(1).Data(), - *unique_hash_row_splits = unique_hash_shape.RowSplits(1).Data(); + *unique_hash_row_splits = + unique_hash_shape.RowSplits(1).Data(); const int32_t *src_hash_data = hash_src.Data(), - *unique_hash_data = hash_unique.Data(); + *unique_hash_data = hash_unique.Data(); for (int32_t r = 0; r < src_hash_shape.Dim0(); r++) { int32_t src_begin = src_hash_row_splits[r], @@ -2644,7 +2757,6 @@ TEST(RaggedIntTest, TestCreateRagged2Int) { K2_CHECK(Equal(r, r2)); } - TEST(RaggedFloatTest, TestCreateRagged2Float) { std::vector> vecs{{1.2, 2.3}, {}, {3.4, 5.6}}; std::vector expected_values{1.2, 2.3, 3.4, 5.6}; @@ -2656,5 +2768,4 @@ TEST(RaggedFloatTest, TestCreateRagged2Float) { CheckArrayData(r.values, expected_values); } - } // namespace k2 From 14383d7a6fe80047fa3c54d6c43fd602e6669f21 Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 19 Feb 2021 14:17:20 +0800 Subject: [PATCH 5/5] add comments for transform_lbs --- k2/csrc/ragged_ops.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu index bb29f99d8..08808b6e6 100644 --- a/k2/csrc/ragged_ops.cu +++ b/k2/csrc/ragged_ops.cu @@ -690,6 +690,13 @@ RaggedShape IndexAxis0New(RaggedShape &src, const Array1 &new2old, return ans; } mgpu::context_t *mgpu_context = GetModernGpuAllocator(c); + // For a lambda called in `mgpu::transform_lbs` + // `index` is the index of element (i.e. idx01) + // `seg` is the row id (i.e. idx0), + // `rank` is the index in current `seg`/row (i.e. idx1) + // In the below calling code, `new_offsets` is the segment/row descriptor + // (i.e. row_splits for this call), the number of elements is + // new_offsets.Back(). 
auto lambda_set_ans = [=] __device__(int32_t index, int32_t seg, int32_t rank) { // There are (num_axes-1) * ans_dim0 segments totally, each segment