From 6082f936b7ee581c00bfd60647fddc451f7409d0 Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 29 Jan 2021 12:39:52 +0800 Subject: [PATCH 1/5] using pattern trasform_scan to merge two kenerls into one --- k2/csrc/benchmark/array_ops_benchmark.cu | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/k2/csrc/benchmark/array_ops_benchmark.cu b/k2/csrc/benchmark/array_ops_benchmark.cu index ece59eec9..22a803846 100644 --- a/k2/csrc/benchmark/array_ops_benchmark.cu +++ b/k2/csrc/benchmark/array_ops_benchmark.cu @@ -9,6 +9,7 @@ #include "k2/csrc/array_ops.h" #include "k2/csrc/benchmark/benchmark.h" #include "k2/csrc/test_utils.h" +#include "moderngpu/kernel_scan.hxx" namespace k2 { @@ -268,6 +269,60 @@ static BenchmarkStat BenchmarkSizesToMergeMap(int32_t num_src, return stat; } +void TransExclusiveSumOld(const Array1 src, Array1 *ans) { + ContextPtr c = src.Context(); + int32_t dim = src.Dim(); + K2_CHECK_EQ(dim + 1, ans->Dim()); + const int32_t *src_data = src.Data(); + int32_t *ans_data = ans->Data(); + K2_EVAL( + c, dim, lambda_multiple2, + (int32_t i)->void { ans_data[i] = src_data[i] * 2; }); + ExclusiveSum(*ans, ans); +} + +void TransExclusiveSumNew(const Array1 src, Array1 *ans) { + ContextPtr c = src.Context(); + int32_t dim = src.Dim(); + K2_CHECK_EQ(dim + 1, ans->Dim()); + const int32_t *src_data = src.Data(); + int32_t *ans_data = ans->Data(); + K2_TRANS_EXCSUM( + c, dim, ans_data, lambda_multiple2, + (int32_t i)->int32_t { return src_data[i] * 2; }); +} + +static BenchmarkStat BenchmarkTransExclusiveSum(int32_t dim, + DeviceType device_type) { + ContextPtr context; + if (device_type == kCpu) { + context = GetCpuContext(); + } else { + K2_CHECK_EQ(device_type, kCuda); + context = GetCudaContext(); + } + + std::vector vec(dim); + std::iota(vec.begin(), vec.end(), dim); + Array1 src(context, vec); + Array1 ans(context, src.Dim() + 1); + + BenchmarkStat stat; + stat.op_name = "TransExclusiveSum_New_" + std::to_string(dim); + int32_t num_iter = 20; + stat.num_iter = num_iter; + stat.problem_size = dim; + stat.device_type = device_type; + + stat.eplased_per_iter = + BenchmarkOp(num_iter, context, + (void (*)(const Array1 &, Array1 *))( + &TransExclusiveSumNew), + src, &ans); + stat.eplased_per_iter *= 1e6; // from seconds to microseconds + return stat; +} + template static void RegisterBenchmarkExclusiveSum(DeviceType device_type) { std::vector problems_sizes = {100, 500, 1000, 2000, @@ -351,6 +406,20 @@ static void RegisterBenchmarkSizesToMergeMap(DeviceType device_type) { } } +static void RegisterBenchmarkTransExclusiveSum(DeviceType device_type) { + std::vector problems_sizes = {100, 200, 500, 1000, 2000, + 5000, 10000, 20000, 50000, 100000, + 500000, 1000000, 5000000}; + for (auto s : problems_sizes) { + std::string name = + GenerateBenchmarkName("TransExclusiveSum", device_type) + "_" + + std::to_string(s); + RegisterBenchmark(name, [s, device_type]() -> BenchmarkStat { + return BenchmarkTransExclusiveSum(s, device_type); + }); + } +} + static void RunArrayOpsBenchmark() { PrintEnvironmentInfo(); @@ -369,6 +438,8 @@ static void RunArrayOpsBenchmark() { RegisterBenchmarkSizesToMergeMap(kCuda); + RegisterBenchmarkTransExclusiveSum(kCuda); + // Users can set a regular expression via environment // variable `K2_BENCHMARK_FILTER` such that only benchmarks // with name matching the pattern are candidates to run. 
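The patch above measures the effect of fusing the element-wise transform (ans[i] = 2 * src[i]) and the exclusive sum into a single kernel launch, instead of the K2_EVAL-plus-ExclusiveSum pair in TransExclusiveSumOld(). The K2_TRANS_EXCSUM macro itself is not defined anywhere in this series; the following is only a minimal sketch of the fused pattern it presumably wraps, written against moderngpu's transform_scan from the kernel_scan.hxx header the patch includes. The overload transform_scan(f, count, output, op, reduction, context), its exclusive-scan default, and the visibility of GetModernGpuAllocator() (used later in ragged_ops.cu) in this translation unit are assumptions, not taken from the patch.

// Hedged sketch only -- not the actual K2_TRANS_EXCSUM definition, which this
// patch series does not show.  CUDA path only; the CPU path would still need
// an ordinary loop or the two-step K2_EVAL + ExclusiveSum() route.
void TransExclusiveSumFusedSketch(const Array1<int32_t> &src,
                                  Array1<int32_t> *ans) {
  ContextPtr c = src.Context();
  int32_t dim = src.Dim();
  K2_CHECK_EQ(dim + 1, ans->Dim());
  K2_CHECK_EQ(c->GetDeviceType(), kCuda);
  const int32_t *src_data = src.Data();
  int32_t *ans_data = ans->Data();
  mgpu::context_t *mgpu_context = GetModernGpuAllocator(c);
  // One launch: ans_data[0..dim-1] receives the exclusive prefix sums of
  // 2 * src_data[i]; the grand total (the extra trailing element that
  // ExclusiveSum() would write) goes through the reduction iterator,
  // i.e. to ans_data + dim.
  mgpu::transform_scan(
      [=] __device__(int32_t i) -> int32_t { return src_data[i] * 2; }, dim,
      ans_data, mgpu::plus_t<int32_t>(), ans_data + dim, *mgpu_context);
}

Compared with the two-kernel version, the fused form avoids one full write and re-read of the dim-element intermediate array plus one kernel-launch overhead, which is what the benchmark above is intended to quantify.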
From 414798550aba348a777c92241c6e02870571b5cb Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 29 Jan 2021 13:02:42 +0800 Subject: [PATCH 2/5] delete benchmark code as they should not be merged --- k2/csrc/benchmark/array_ops_benchmark.cu | 70 ------------------------ 1 file changed, 70 deletions(-) diff --git a/k2/csrc/benchmark/array_ops_benchmark.cu b/k2/csrc/benchmark/array_ops_benchmark.cu index 22a803846..4f6dfae99 100644 --- a/k2/csrc/benchmark/array_ops_benchmark.cu +++ b/k2/csrc/benchmark/array_ops_benchmark.cu @@ -269,60 +269,6 @@ static BenchmarkStat BenchmarkSizesToMergeMap(int32_t num_src, return stat; } -void TransExclusiveSumOld(const Array1 src, Array1 *ans) { - ContextPtr c = src.Context(); - int32_t dim = src.Dim(); - K2_CHECK_EQ(dim + 1, ans->Dim()); - const int32_t *src_data = src.Data(); - int32_t *ans_data = ans->Data(); - K2_EVAL( - c, dim, lambda_multiple2, - (int32_t i)->void { ans_data[i] = src_data[i] * 2; }); - ExclusiveSum(*ans, ans); -} - -void TransExclusiveSumNew(const Array1 src, Array1 *ans) { - ContextPtr c = src.Context(); - int32_t dim = src.Dim(); - K2_CHECK_EQ(dim + 1, ans->Dim()); - const int32_t *src_data = src.Data(); - int32_t *ans_data = ans->Data(); - K2_TRANS_EXCSUM( - c, dim, ans_data, lambda_multiple2, - (int32_t i)->int32_t { return src_data[i] * 2; }); -} - -static BenchmarkStat BenchmarkTransExclusiveSum(int32_t dim, - DeviceType device_type) { - ContextPtr context; - if (device_type == kCpu) { - context = GetCpuContext(); - } else { - K2_CHECK_EQ(device_type, kCuda); - context = GetCudaContext(); - } - - std::vector vec(dim); - std::iota(vec.begin(), vec.end(), dim); - Array1 src(context, vec); - Array1 ans(context, src.Dim() + 1); - - BenchmarkStat stat; - stat.op_name = "TransExclusiveSum_New_" + std::to_string(dim); - int32_t num_iter = 20; - stat.num_iter = num_iter; - stat.problem_size = dim; - stat.device_type = device_type; - - stat.eplased_per_iter = - BenchmarkOp(num_iter, context, - (void (*)(const Array1 &, Array1 *))( - &TransExclusiveSumNew), - src, &ans); - stat.eplased_per_iter *= 1e6; // from seconds to microseconds - return stat; -} - template static void RegisterBenchmarkExclusiveSum(DeviceType device_type) { std::vector problems_sizes = {100, 500, 1000, 2000, @@ -406,20 +352,6 @@ static void RegisterBenchmarkSizesToMergeMap(DeviceType device_type) { } } -static void RegisterBenchmarkTransExclusiveSum(DeviceType device_type) { - std::vector problems_sizes = {100, 200, 500, 1000, 2000, - 5000, 10000, 20000, 50000, 100000, - 500000, 1000000, 5000000}; - for (auto s : problems_sizes) { - std::string name = - GenerateBenchmarkName("TransExclusiveSum", device_type) + "_" + - std::to_string(s); - RegisterBenchmark(name, [s, device_type]() -> BenchmarkStat { - return BenchmarkTransExclusiveSum(s, device_type); - }); - } -} - static void RunArrayOpsBenchmark() { PrintEnvironmentInfo(); @@ -438,8 +370,6 @@ static void RunArrayOpsBenchmark() { RegisterBenchmarkSizesToMergeMap(kCuda); - RegisterBenchmarkTransExclusiveSum(kCuda); - // Users can set a regular expression via environment // variable `K2_BENCHMARK_FILTER` such that only benchmarks // with name matching the pattern are candidates to run. 
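Patches 4 and 5 below apply the same single-kernel idea to IndexAxis0(), replacing the per-axis, per-job kernels with one mgpu::transform_lbs (load-balancing search) launch over all output elements. As a reference for reading that code, here is a hedged sketch of how transform_lbs decomposes a flat element index into the (seg, rank) pair the new kernel relies on. It reuses the k2 helpers those patches already call (Array1, GetModernGpuAllocator, K2_CUDA_SAFE_CALL) and the row_splits {0, 2, 5, 6} from the tests in patch 4; it assumes no extra includes are needed beyond what those patches already pull in.

// Hedged sketch, CUDA only: with segment descriptor (row_splits) {0, 2, 5, 6}
// and 6 elements, transform_lbs calls the lambda with
//   (index, seg, rank) = (0,0,0) (1,0,1) (2,1,0) (3,1,1) (4,1,2) (5,2,0),
// i.e. index is an idx01, seg its row (idx0) and rank its position within
// that row (idx1).  Writing `seg` at `index` therefore recovers the row_ids
// {0, 0, 1, 1, 1, 2} in one kernel launch; IndexAxis0New() additionally uses
// `rank` to offset into the source row_splits/row_ids.
void TransformLbsRowIdsSketch(ContextPtr c /* must be a CUDA context */) {
  Array1<int32_t> row_splits(c, std::vector<int32_t>{0, 2, 5, 6});
  int32_t num_rows = row_splits.Dim() - 1;
  int32_t num_elems = 6;  // == row_splits.Back()
  Array1<int32_t> row_ids(c, num_elems);
  int32_t *row_ids_data = row_ids.Data();
  auto lambda_write_row_ids = [=] __device__(int32_t index, int32_t seg,
                                             int32_t rank) {
    (void)rank;  // unused in this toy example
    row_ids_data[index] = seg;
  };
  mgpu::context_t *mgpu_context = GetModernGpuAllocator(c);
  K2_CUDA_SAFE_CALL(mgpu::transform_lbs(lambda_write_row_ids, num_elems,
                                        row_splits.Data(), num_rows,
                                        *mgpu_context));
}

The call mirrors the pattern used in the patches below: count is the total number of elements (new_offsets.Back() there), the segment descriptor is an exclusive-sum/row_splits array, and num_segments is its dimension minus one.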
From c8d516e1fa6fa00bbd8bf26c6e10f1b206705b8e Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 29 Jan 2021 13:03:49 +0800 Subject: [PATCH 3/5] delete useless header --- k2/csrc/benchmark/array_ops_benchmark.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/k2/csrc/benchmark/array_ops_benchmark.cu b/k2/csrc/benchmark/array_ops_benchmark.cu index 4f6dfae99..ece59eec9 100644 --- a/k2/csrc/benchmark/array_ops_benchmark.cu +++ b/k2/csrc/benchmark/array_ops_benchmark.cu @@ -9,7 +9,6 @@ #include "k2/csrc/array_ops.h" #include "k2/csrc/benchmark/benchmark.h" #include "k2/csrc/test_utils.h" -#include "moderngpu/kernel_scan.hxx" namespace k2 { From bbe1d9ee3f500ca9d2179ba5b6d19a600fbdf964 Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 19 Feb 2021 11:05:35 +0800 Subject: [PATCH 4/5] try to implement the loop on axis with one kernel for IndexAxis0 --- k2/csrc/benchmark/ragged_ops_benchmark.cu | 56 +++++ k2/csrc/ragged_ops.cu | 259 ++++++++++++++++++---- k2/csrc/ragged_ops.h | 22 +- k2/csrc/ragged_test.cu | 135 ++++++++++- 4 files changed, 414 insertions(+), 58 deletions(-) diff --git a/k2/csrc/benchmark/ragged_ops_benchmark.cu b/k2/csrc/benchmark/ragged_ops_benchmark.cu index 7276c6f1f..68d7afb7c 100644 --- a/k2/csrc/benchmark/ragged_ops_benchmark.cu +++ b/k2/csrc/benchmark/ragged_ops_benchmark.cu @@ -114,6 +114,60 @@ static void RegisterBenchmarkSegmentedExclusiveSum(DeviceType device_type) { } } +static BenchmarkStat BenchmarkIndexAxis0(int32_t dim, DeviceType device_type) { + ContextPtr context; + if (device_type == kCpu) { + context = GetCpuContext(); + } else { + K2_CHECK_EQ(device_type, kCuda); + context = GetCudaContext(); + } + + int32_t num_iter = 20; + int32_t min_num_elems = dim * 10; + int32_t max_num_elems = dim * 20; + + int32_t num_axes = 4; + RaggedShape shape = + RandomRaggedShape(true, num_axes, num_axes, min_num_elems, max_num_elems) + .To(context); + int32_t dim0 = shape.Dim0(), result_dim0 = RandInt(0, dim0); + if (dim0 == 0) result_dim0 = 0; + std::vector new2old_vec(result_dim0); + for (int i = 0; i < result_dim0; i++) new2old_vec[i] = RandInt(-1, dim0 - 1); + Array1 new2old(context, new2old_vec); + int32_t num_elems = shape.NumElements(); + Array1 value_indexes; + + BenchmarkStat stat; + stat.op_name = "IndexAxis0New_" + std::to_string(num_axes) + "_" + + std::to_string(shape.Dim0()) + "_" + std::to_string(num_elems); + stat.num_iter = num_iter; + stat.problem_size = dim; + stat.dtype_name = TraitsOf(DtypeOf::dtype).Name(); + stat.device_type = device_type; + + stat.eplased_per_iter = + BenchmarkOp(num_iter, context, + (RaggedShape(*)(RaggedShape &, const Array1 &, + Array1 *))(&IndexAxis0New), + shape, new2old, &value_indexes); + stat.eplased_per_iter *= 1e6; // from seconds to microseconds + return stat; +} + +static void RegisterBenchmarkIndexAxis0(DeviceType device_type) { + std::vector problems_sizes = {50, 100, 200, 500, 1000, + 10000, 50000, 100000, 200000}; + for (auto s : problems_sizes) { + std::string name = + GenerateBenchmarkName("IndexAxis0", device_type); + RegisterBenchmark(name, [s, device_type]() -> BenchmarkStat { + return BenchmarkIndexAxis0(s, device_type); + }); + } +} + static void RunRaggedOpsBenchmark() { PrintEnvironmentInfo(); @@ -122,6 +176,8 @@ static void RunRaggedOpsBenchmark() { RegisterBenchmarkSegmentedExclusiveSum(kCpu); RegisterBenchmarkSegmentedExclusiveSum(kCuda); + RegisterBenchmarkIndexAxis0(kCuda); + // Users can set a regular expression via environment // variable `K2_BENCHMARK_FILTER` such that only benchmarks // with name matching 
the pattern are candidates to run. diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu index 8bb48fe95..bb29f99d8 100644 --- a/k2/csrc/ragged_ops.cu +++ b/k2/csrc/ragged_ops.cu @@ -358,13 +358,12 @@ std::vector UnsqueezeParallel(int32_t num_srcs, RaggedShape **src, Note: `ans` is the result of Index(), with ans.Dim0() == new2old.Dim(). */ -inline void GetOldAndNewOffsets(RaggedShape &src, - const Array1 &new2old, - Array2 *old_offsets, - Array2 *new_offsets) { +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array2 *old_offsets, + Array2 *new_offsets) { NVTX_RANGE(K2_FUNC); K2_CHECK_GT(src.NumAxes(), 1); - ContextPtr &c = src.Context(); + ContextPtr c = GetContext(src, new2old); int32_t num_axes = src.NumAxes(), ans_dim0 = new2old.Dim(); // max 5 layers. @@ -380,16 +379,14 @@ inline void GetOldAndNewOffsets(RaggedShape &src, K2_EVAL( c, ans_dim0, lambda_set_offsets, (int32_t i)->void { // 0 <= i < ans_dim0 - int32_t old_offset = new2old_data[i], - old_offset_next = old_offset + 1, - offset_diff = 1; + int32_t old_offset = new2old_data[i], old_offset_next = old_offset + 1, + offset_diff = 1; // The following is a special case that interprets -1 as referring to an // empty list. In this case, old_offset == old_offset_next == 0. // The specific value 0 is not necessary; they could be equal // and have any value in [0, src.Dim0() - 1] and still refer to // the empty list. - if (old_offset == -1) - old_offset = 0; + if (old_offset == -1) old_offset = 0; for (int32_t axis = 0;; axis++) { old_offsets_acc(axis, i) = old_offset; // Below, 'new_offsets_acc' currently contains the size rather @@ -404,8 +401,54 @@ inline void GetOldAndNewOffsets(RaggedShape &src, ExclusiveSum(*new_offsets, new_offsets); } -static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, - Array1 *elem_indexes /*=nullptr*/) { +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array1 *old_offsets, + Array1 *new_offsets) { + NVTX_RANGE(K2_FUNC); + K2_CHECK_GT(src.NumAxes(), 1); + ContextPtr c = GetContext(src, new2old); + int32_t num_axes = src.NumAxes(), ans_dim0 = new2old.Dim(); + + // max 5 layers. + RowSplitsAccessor<5> row_splits_acc(src); + + const int32_t *new2old_data = new2old.Data(); + *old_offsets = Array1(c, (num_axes - 1) * ans_dim0); + *new_offsets = Array1(c, (num_axes - 1) * ans_dim0 + 1); + int32_t *old_offsets_data = old_offsets->Data(), + *new_offsets_data = new_offsets->Data(); + // Set old_offsets; and for now, set new_offsets to the corresponding + // sizes of the output slices. + K2_EVAL( + c, ans_dim0, lambda_set_offsets, (int32_t i)->void { + // 0 <= i < ans_dim0 + int32_t old_offset = new2old_data[i], old_offset_next = old_offset + 1, + offset_diff = 1; + // The following is a special case that interprets -1 as referring to an + // empty list. In this case, old_offset == old_offset_next == 0. + // The specific value 0 is not necessary; they could be equal + // and have any value in [0, src.Dim0() - 1] and still refer to + // the empty list. + if (old_offset == -1) old_offset = 0; + old_offset = row_splits_acc(0)[old_offset]; + old_offset_next = row_splits_acc(0)[old_offset_next]; + offset_diff = old_offset_next - old_offset; + for (int32_t axis = 1;; axis++) { + old_offsets_data[(axis - 1) * ans_dim0 + i] = old_offset; + // Below, 'new_offsets_acc' currently contains the size rather + // than the offset; we need to do exclusive-sum. 
+ new_offsets_data[(axis - 1) * ans_dim0 + i] = offset_diff; + if (axis + 1 == num_axes) return; + old_offset = row_splits_acc(axis)[old_offset]; + old_offset_next = row_splits_acc(axis)[old_offset_next]; + offset_diff = old_offset_next - old_offset; + } + }); + ExclusiveSum(*new_offsets, new_offsets); +} + +RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/) { ContextPtr &c = src.Context(); bool is_cpu = (c->GetDeviceType() == kCpu); K2_CHECK(IsCompatible(src, new2old)); @@ -416,7 +459,6 @@ static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, return EmptyRaggedShape(c, num_axes); } - Array2 old_offsets, // num_axes by ans_dim0 new_offsets; // num_axes by (ans_dim0 + 1). GetOldAndNewOffsets(src, new2old, &old_offsets, &new_offsets); @@ -438,7 +480,6 @@ static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, Array2 task_redirects(c, num_axes, num_jobs); auto task_redirects_acc = task_redirects.Accessor(); - ans.Layers()[0].row_splits = new_offsets.Row(1); for (int32_t axis = 0; axis < num_axes; ++axis) { @@ -560,6 +601,157 @@ static RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, return ans; } +RaggedShape IndexAxis0New(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/) { + ContextPtr c = GetContext(src, new2old); + int32_t num_axes = src.NumAxes(), src_dim0 = src.Dim0(), + ans_dim0 = new2old.Dim(); + if (ans_dim0 == 0) { + if (elem_indexes) *elem_indexes = Array1(c, 0); + return EmptyRaggedShape(c, num_axes); + } + + Array1 old_offsets, // (num_axes-1) by ans_dim0 + new_offsets; // ((num_axes-1) by ans_dim0) + 1. + GetOldAndNewOffsets(src, new2old, &old_offsets, &new_offsets); + const int32_t *old_offsets_data = old_offsets.Data(), + *new_offsets_data = new_offsets.Data(); + + if (num_axes == 2) { + if (c->GetDeviceType() == kCpu) { + } else { + K2_CHECK_EQ(c->GetDeviceType(), kCuda); + int32_t ans_num_elems = new_offsets.Back(); + Array1 ans_row_splits(c, ans_num_elems); + int32_t *ans_row_splits_data = ans_row_splits.Data(); + if (elem_indexes != nullptr) + *elem_indexes = Array1(c, ans_num_elems); + int32_t *elem_indexes_data = + elem_indexes != nullptr ? elem_indexes->Data() : nullptr; + if (ans_num_elems == 0) return RaggedShape2(&new_offsets, nullptr, 0); + + mgpu::context_t *mgpu_context = GetModernGpuAllocator(c); + auto lambda_set_ans = [=] __device__(int32_t index, int32_t seg, + int32_t rank) { + ans_row_splits_data[new_offsets_data[seg] + rank] = seg; + if (elem_indexes_data != nullptr) { + elem_indexes_data[new_offsets_data[seg] + rank] = + old_offsets_data[seg] + rank; + } + }; + K2_CUDA_SAFE_CALL(mgpu::transform_lbs( + lambda_set_ans, new_offsets.Back(), new_offsets.Data(), + new_offsets.Dim() - 1, *mgpu_context)); + RaggedShape ans = + RaggedShape2(&new_offsets, &ans_row_splits, ans_num_elems); + return ans; + } + } + + Array1 tot_sizes_out(c, num_axes); + int32_t *tot_sizes_out_data = tot_sizes_out.Data(); + K2_EVAL( + c, num_axes, lambda_set_tot_sizes, (int32_t i)->void { + if (i == 0) + tot_sizes_out_data[0] = ans_dim0; + else { + tot_sizes_out_data[i] = new_offsets_data[ans_dim0 * i] - + new_offsets_data[ans_dim0 * (i - 1)]; + } + }); + tot_sizes_out = tot_sizes_out.To(GetCpuContext()); + + if (elem_indexes != nullptr) + *elem_indexes = Array1(c, tot_sizes_out.Back()); + int32_t *elem_indexes_data = + elem_indexes != nullptr ? 
elem_indexes->Data() : nullptr; + RaggedShape ans = RaggedShapeFromTotSizes(c, num_axes, tot_sizes_out.Data()); + + RowSplitsAccessor<5> ans_row_splits_acc(ans); + RowIdsAccessor<5> ans_row_ids_acc(ans); + + RowSplitsAccessor<5> src_row_splits_acc(src); + RowIdsAccessor<5> src_row_ids_acc(src); + + ans.Layers()[0].row_splits = new_offsets.Arange(0, ans_dim0 + 1); + + if (c->GetDeviceType() == kCpu) { + } else { + K2_CHECK_EQ(c->GetDeviceType(), kCuda); + + int32_t tot_elems = new_offsets.Back(); + if (tot_elems == 0) { + for (int32_t i = 0; i != num_axes - 1; ++i) { + if (i != 0) + ans.Layers()[i].row_splits = + new_offsets.Arange(ans_dim0 + i, ans_dim0 + i + 1); + ans.Layers()[i].row_ids = Array1(c, 0); + } + return ans; + } + mgpu::context_t *mgpu_context = GetModernGpuAllocator(c); + auto lambda_set_ans = [=] __device__(int32_t index, int32_t seg, + int32_t rank) { + // There are (num_axes-1) * ans_dim0 segments totally, each segment + // handles the corresponding row_ids for axis `axis` and + // row_splits for axis `axis + 1`. + if (index == 0) { + for (int32_t i = 1; i != num_axes - 1; ++i) { + ans_row_splits_acc(i)[0] = 0; + } + } + // TODO(Haowen): share below variables (including new_offset_prev, + // new_offset_next, etc.) in a segment, or at least we can create a kernel + // like things in IndexAxis0 to share those variables in 4/8 threads. + int32_t axis = seg / ans_dim0; + int32_t new_offset_curr = + new_offsets_data[seg] - new_offsets_data[axis * ans_dim0], + old_offset_curr = old_offsets_data[seg]; + if (axis != num_axes - 2) { + // set row_splits for the next axis + + int32_t old_offset_next = old_offsets_data[seg + ans_dim0], + new_offset_next = new_offsets_data[seg + ans_dim0] - + new_offsets_data[(axis + 1) * ans_dim0]; + int32_t value_offset = new_offset_next - old_offset_next; + // noted row_splits_acc(axis) is RowSplits(axis + 1); + int32_t *ans_row_splits_next_axis = ans_row_splits_acc(axis + 1); + const int32_t *src_row_splits_next_axis = src_row_splits_acc(axis + 1); + ans_row_splits_next_axis[new_offset_curr + rank] = + value_offset + src_row_splits_next_axis[old_offset_curr + rank]; + // so that we can always set the last element of row_splits for the next + // axis even though there are empty rows in current axis. 
+ ans_row_splits_next_axis[new_offset_curr + rank + 1] = + value_offset + src_row_splits_next_axis[old_offset_curr + rank + 1]; + } + { + // set row_id for the current axis + if (axis == 0) { + ans_row_ids_acc(axis)[new_offset_curr + rank] = seg; + } else { + int32_t new_offset_prev = new_offsets_data[seg - ans_dim0] - + new_offsets_data[(axis - 1) * ans_dim0], + old_offset_prev = old_offsets_data[seg - ans_dim0]; + int32_t value_offset = new_offset_prev - old_offset_prev; + ans_row_ids_acc(axis)[new_offset_curr + rank] = + value_offset + src_row_ids_acc(axis)[old_offset_curr + rank]; + } + } + if (axis == num_axes - 2 && elem_indexes_data != nullptr) { + elem_indexes_data[new_offset_curr + rank] = old_offset_curr + rank; + } + }; + K2_CUDA_SAFE_CALL( + mgpu::transform_lbs(lambda_set_ans, tot_elems, new_offsets.Data(), + new_offsets.Dim() - 1, *mgpu_context)); + } + +#if !defined(NDEBUG) + ans.Check(); +#endif + return ans; +} + RaggedShape Index(RaggedShape &src, int32_t axis, const Array1 &indexes, Array1 *elem_indexes /*=nullptr*/) { @@ -580,8 +772,7 @@ RaggedShape Index(RaggedShape &src, int32_t axis, Array1 last_row_splits(last_row_ids.Context(), src.TotSize(num_axes - 2) + 1); RowIdsToRowSplits(last_row_ids, &last_row_splits); - if (elem_indexes) - *elem_indexes = indexes; + if (elem_indexes) *elem_indexes = indexes; std::vector axes = src.Layers(); axes.back().row_splits = last_row_splits; @@ -594,7 +785,7 @@ RaggedShape Index(RaggedShape &src, int32_t axis, DecomposeRaggedShape(src, axis, &top, &bottom); RaggedShape top_indexed = Index(top, axis, indexes, nullptr), - bottom_indexed = IndexAxis0(bottom, indexes, elem_indexes); + bottom_indexed = IndexAxis0(bottom, indexes, elem_indexes); return ComposeRaggedShapes(top_indexed, bottom_indexed); } } @@ -1198,10 +1389,9 @@ static Array1 GetTransposeReorderingThreeAxesCuda(Ragged &src, return ans; } - /* -// Checks the result of GetTranspoeReordering(), in debug mode and dies if it is wrong. -static void CheckGetTransposeReordering(Ragged &src, +// Checks the result of GetTranspoeReordering(), in debug mode and dies if it is +wrong. static void CheckGetTransposeReordering(Ragged &src, Array1 &ans) { if (!internal::kDisableDebug && !internal::DisableChecks()) { K2_CHECK(IsPermutation(ans)); @@ -2063,8 +2253,6 @@ RaggedShape RaggedShapeAxis0Splitter::GetElement(int32_t i, return RaggedShape(out); } - - namespace hash_internal { // Utilities for hashing strings (actually: sequences of int32_t). @@ -2099,7 +2287,8 @@ struct Hash { template struct HashInputIterator { - explicit __host__ __device__ __forceinline__ HashInputIterator(const int32_t *i) // NOLINT + explicit __host__ __device__ __forceinline__ + HashInputIterator(const int32_t *i) // NOLINT : i_(i) {} __device__ __forceinline__ Hash operator[](int32_t idx) const { return Hash{i_[idx], i_[idx], 31, 167}; @@ -2115,8 +2304,7 @@ struct HashOutputIteratorDeref { // this is what you get when you dereference // HashOutputIterator, it pretends to be a // Hash but really only stores the `idx` // member. 
- explicit __device__ __forceinline__ HashOutputIteratorDeref(T *t) - : t_(t) {} + explicit __device__ __forceinline__ HashOutputIteratorDeref(T *t) : t_(t) {} __device__ __forceinline__ HashOutputIteratorDeref &operator=( const Hash &h) { *t_ = h.hash1 + 13 * h.product1 + 104729 * h.hash2 + @@ -2144,8 +2332,7 @@ struct HashCombineOp { __device__ __forceinline__ Hash operator()(const Hash &a, const Hash &b) const { return Hash{a.hash1 * b.product1 + b.hash1, - a.hash2 * b.product2 + b.hash2, - a.product1 * b.product1, + a.hash2 * b.product2 + b.hash2, a.product1 * b.product1, a.product2 * b.product2}; } }; @@ -2199,7 +2386,7 @@ Array1 ComputeHash(Ragged &src) { hash_internal::HashInputIterator input_iter(values_data); hash_internal::HashOutputIterator output_iter(output_data); hash_internal::HashCombineOp op; - hash_internal::Hash initial_hash{ 0, 0, 1, 1 }; + hash_internal::Hash initial_hash{0, 0, 1, 1}; // This code is based on the example here: // https://nvlabs.github.io/cub/structcub_1_1_device_segmented_reduce.html @@ -2218,7 +2405,6 @@ Array1 ComputeHash(Ragged &src) { return ans; } - Ragged UniqueSequences(Ragged &src) { ContextPtr &c = src.Context(); if (src.NumAxes() == 2) { @@ -2235,11 +2421,12 @@ Ragged UniqueSequences(Ragged &src) { Ragged ragged_hashes(GetLayer(src.shape, src.shape.NumLayers() - 2), hashes); - SortSublists >(&ragged_hashes, &order); + SortSublists>(&ragged_hashes, &order); Renumbering renumber_lists(c, hashes.Dim()); const int32_t *ragged_hashes_row_ids_data = ragged_hashes.RowIds(1).Data(), - *ragged_hashes_row_splits_data = ragged_hashes.RowSplits(1).Data(); + *ragged_hashes_row_splits_data = + ragged_hashes.RowSplits(1).Data(); const int64_t *ragged_hashes_data = ragged_hashes.values.Data(); char *keep_list_data = renumber_lists.Keep().Data(); K2_EVAL( @@ -2254,16 +2441,12 @@ Ragged UniqueSequences(Ragged &src) { keep_list_data[i] = keep; }); Array1 new2old = renumber_lists.New2Old(), - new2unsorted = order[new2old]; + new2unsorted = order[new2old]; return Index(src, src.NumAxes() - 2, new2unsorted); } - // Instantiate template for int64 and int32. -template -Array1 ComputeHash(Ragged &src); -template -Array1 ComputeHash(Ragged &src); - +template Array1 ComputeHash(Ragged &src); +template Array1 ComputeHash(Ragged &src); } // namespace k2 diff --git a/k2/csrc/ragged_ops.h b/k2/csrc/ragged_ops.h index 1a9090e07..fc613a96f 100644 --- a/k2/csrc/ragged_ops.h +++ b/k2/csrc/ragged_ops.h @@ -132,7 +132,6 @@ void OrPerSublist(Ragged &src, T initial_value, Array1 *or_values) { SegmentedReduce>(src, initial_value, or_values); } - /* Stack a list of RaggedShape to create a RaggedShape with one more axis. Similar to TF/PyTorch's Stack. The result will have Dim0 == src_size. @@ -228,6 +227,17 @@ RaggedShape Prefix(RaggedShape &src, int32_t n); std::vector GetPrefixes(RaggedShape &src, const std::vector &sizes); +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array2 *old_offsets, + Array2 *new_offsets); +void GetOldAndNewOffsets(RaggedShape &src, const Array1 &new2old, + Array1 *old_offsets, + Array1 *new_offsets); + +RaggedShape IndexAxis0(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/); +RaggedShape IndexAxis0New(RaggedShape &src, const Array1 &new2old, + Array1 *elem_indexes /*=nullptr*/); /* This object splits a ragged shape on its axis 0, giving you efficient axis to the sub-parts of it for each index into its axis0. 
@@ -416,7 +426,8 @@ RaggedShape Unsqueeze(const RaggedShape &src, int32_t axis); Version of Unsqueeze() above, that works for ragged tensors. Note: the opposite of this is not Squeeze(); it is ans.RemoveAxis(axis). */ -template Ragged Unsqueeze(const Ragged &src, int32_t axis) { +template +Ragged Unsqueeze(const Ragged &src, int32_t axis) { return Ragged(Unsqueeze(src.shape, axis), src.values); } @@ -1027,7 +1038,6 @@ inline Ragged RaggedFromTotSizes(ContextPtr &c, Array1(c, tot_sizes.back())); } - /* Transpose a ragged tensor as if it were the index information of a CSR-format sparse matrix (but with possibly repeated elements!). This is easiest to @@ -1146,7 +1156,6 @@ RaggedShape Index(RaggedShape &src, int32_t axis, const Array1 &indexes, Array1 *elem_indexes = nullptr); - /* Index ragged tensor with array, return ragged tensor. @@ -1172,8 +1181,7 @@ RaggedShape Index(RaggedShape &src, int32_t axis, */ template -Ragged Index(Ragged &src, int32_t axis, - const Array1 &indexes, +Ragged Index(Ragged &src, int32_t axis, const Array1 &indexes, Array1 *value_indexes_out = nullptr) { Array1 value_indexes; RaggedShape ans_shape = Index(src.shape, axis, indexes, &value_indexes); @@ -1327,7 +1335,6 @@ Array1 CoveringShapeForwardMap(RaggedShape &src, template Array1 ComputeHash(Ragged &src); - /* If `src` has two axes, this will return the unique sub-lists (in a possibly different order, but without repeats). If `src` has 3 axes, it will @@ -1348,7 +1355,6 @@ Array1 ComputeHash(Ragged &src); */ Ragged UniqueSequences(Ragged &src); - /* Compute exclusive sum per sub-list. * * @param [in] src The input ragged tensor. The exclusive sum is computed diff --git a/k2/csrc/ragged_test.cu b/k2/csrc/ragged_test.cu index 5214e2d15..ca4b4d20e 100644 --- a/k2/csrc/ragged_test.cu +++ b/k2/csrc/ragged_test.cu @@ -64,6 +64,119 @@ class RaggedShapeOpsSuiteTest : public ::testing::Test { RaggedShape random_shape_; }; +TEST(RaggedShapeOpsTest, TestIndex0) { + for (auto &context : {GetCudaContext()}) { + ContextPtr cpu = GetCpuContext(); // will be used to copy data + { + // simple case + const std::vector row_splits1 = {0, 2, 5, 6}; + const std::vector row_ids1 = {0, 0, 1, 1, 1, 2}; + const std::vector row_splits2 = {0, 2, 3, 4, 6, 7, 10}; + const std::vector row_ids2 = {0, 0, 1, 2, 3, 3, 4, 5, 5, 5}; + + Array1 splits1(context, row_splits1); + Array1 ids1(context, row_ids1); + Array1 splits2(context, row_splits2); + Array1 ids2(context, row_ids2); + RaggedShape shape = RaggedShape3(&splits1, &ids1, ids1.Dim(), &splits2, + &ids2, ids2.Dim()); + + std::vector new2old_vec = {2, 1}; + Array1 new2old(context, new2old_vec); + Array1 value_indexes_out; + RaggedShape result = IndexAxis0New(shape, new2old, &value_indexes_out); + // fsa 2, state_idx01 {5}, arc_idx012 {7, 8, 9} + // fsa 1, state_idx01 {2, 3, 4}, arc_idx012 {{3},{4, 5}, {6}} + CheckArrayData(value_indexes_out, + std::vector{7, 8, 9, 3, 4, 5, 6}); + } + + { + // simple case 1 + const std::vector row_splits1 = {0, 2, 5, 6}; + const std::vector row_ids1 = {0, 0, 1, 1, 1, 2}; + + Array1 splits1(context, row_splits1); + Array1 ids1(context, row_ids1); + RaggedShape shape = RaggedShape2(&splits1, &ids1, ids1.Dim()); + + std::vector new2old_vec = {2, 1}; + Array1 new2old(context, new2old_vec); + Array1 value_indexes_out; + RaggedShape result = IndexAxis0New(shape, new2old, &value_indexes_out); + CheckArrayData(value_indexes_out, std::vector{5, 2, 3, 4}); + } + // test with random large size + for (int32_t i = 0; i < 50; ++i) { + int32_t num_axes = RandInt(2, 4); + 
RaggedShape shape = + RandomRaggedShape(true, num_axes, num_axes, 0, 1000).To(context); + int32_t dim0 = shape.Dim0(), result_dim0 = RandInt(0, 10); + if (dim0 == 0) result_dim0 = 0; + std::vector new2old_vec(result_dim0); + for (int i = 0; i < result_dim0; i++) + new2old_vec[i] = RandInt(-1, dim0 - 1); + Array1 new2old(context, new2old_vec); + K2_LOG(INFO) << "new2old=" << new2old; + + for (int32_t i = 0; i != shape.NumAxes() - 1; ++i) { + K2_LOG(INFO) << "row_splits=" << shape.RowSplits(i + 1); + K2_LOG(INFO) << "row_ids=" << shape.RowIds(i + 1); + } + + Array1 value_indexes_out_old; + RaggedShape result_old = + IndexAxis0(shape, new2old, &value_indexes_out_old); + Array1 value_indexes_out; + RaggedShape result = IndexAxis0New(shape, new2old, &value_indexes_out); + + ASSERT_TRUE(Equal(result_old, result)); + CheckArrayData(value_indexes_out_old, value_indexes_out); + } + } + K2_LOG(FATAL) << "end.........."; +} + +TEST(RaggedShapeOpsTest, GetOldAndNewOffsets) { + for (int32_t i = 0; i < 1; i++) { + for (auto &context : {GetCpuContext(), GetCudaContext()}) { + RaggedShape random = RandomRaggedShape(false, 2, 4, 0, 2000).To(context); + int32_t dim0 = random.Dim0(); + int32_t dim = RandInt(0, dim0); + K2_LOG(INFO) << "shape=" << random; + for (int32_t n = 1; n != random.NumAxes(); ++n) + K2_LOG(INFO) << "RowSplits=" << random.RowSplits(n); + Array1 indexes = RandUniformArray1(context, dim, -1, dim0 - 1); + K2_LOG(INFO) << "index=" << indexes; + { + Array2 old_offsets, // num_axes by ans_dim0 + new_offsets; // num_axes by (ans_dim0 + 1). + GetOldAndNewOffsets(random, indexes, &old_offsets, &new_offsets); + K2_LOG(INFO) << "old_offsets=" << old_offsets; + K2_LOG(INFO) << "new_offsets=" << new_offsets; + int32_t num_axes = random.NumAxes(); + std::vector> arrays_vec(num_axes); + std::vector *> arrays(num_axes); + for (int32_t j = 0; j != num_axes; ++j) { + Array1 row = new_offsets.Row(j); + arrays_vec[j] = row; + arrays[j] = &arrays_vec[j]; + } + const Array1 **src = arrays.data(); + Array1 dst = SpliceRowSplits(num_axes, src); + K2_LOG(INFO) << "splice_offsets=" << dst; + } + { + Array1 old_offsets, // num_axes by ans_dim0 + new_offsets; // num_axes by (ans_dim0 + 1). 
+ GetOldAndNewOffsets(random, indexes, &old_offsets, &new_offsets); + K2_LOG(INFO) << "old_offsets=" << old_offsets; + K2_LOG(INFO) << "new_offsets=" << new_offsets; + } + } + } +} + TEST(RaggedShapeTest, TestConstructFromString) { RaggedShape rs(" [ [ x x ] [x] ]"); Array1 row_splits1(GetCpuContext(), std::vector{0, 2, 3}); @@ -1401,13 +1514,16 @@ TEST(RaggedShapeOpsTest, TestIndex) { } } - TEST(RaggedShapeOpsTest, TestIndexAxis1) { for (auto &context : {GetCpuContext(), GetCudaContext()}) { { - Ragged input = Ragged(" [ [ 1 2 ] [ 3 4 5 ] [ 6 7 ] [ ] ]").To(context); // NOLINT + Ragged input = + Ragged(" [ [ 1 2 ] [ 3 4 5 ] [ 6 7 ] [ ] ]") + .To(context); // NOLINT Array1 indexes = Array1(" [ 1 0 4 2 6 5 ]").To(context); - Ragged output = Ragged(" [ [ 2 1 ] [ 5 3 ] [ 7 6 ] [ ] ]").To(context); // NOLINT + Ragged output = + Ragged(" [ [ 2 1 ] [ 5 3 ] [ 7 6 ] [ ] ]") + .To(context); // NOLINT Ragged indexed = Index(input, 1, indexes); EXPECT_EQ(Equal(output, indexed), true); @@ -1415,8 +1531,6 @@ TEST(RaggedShapeOpsTest, TestIndexAxis1) { } } - - TEST(GetTransposeReordering, NoDuplicates) { // col0 col1 col2 col3 col4 col5 // row0 a0 b1 @@ -2581,8 +2695,6 @@ TEST(RaggedOpsTest, TestComputeHash) { } } - - TEST(RaggedOpsTest, TestUniqueSequences) { for (int32_t i = 0; i < 20; i++) { for (auto &c : {GetCpuContext(), GetCudaContext()}) { @@ -2597,7 +2709,7 @@ TEST(RaggedOpsTest, TestUniqueSequences) { ContextPtr cpu = GetCpuContext(); Array1 hash_src = ComputeHash(src).To(cpu), - hash_unique = ComputeHash(unique).To(cpu); + hash_unique = ComputeHash(unique).To(cpu); RaggedShape src_hash_shape = RemoveAxis(src.shape, src.NumAxes() - 1).To(cpu); @@ -2611,9 +2723,10 @@ TEST(RaggedOpsTest, TestUniqueSequences) { K2_CHECK_EQ(src_hash_shape.Dim0(), unique_hash_shape.Dim0()); const int32_t *src_hash_row_splits = src_hash_shape.RowSplits(1).Data(), - *unique_hash_row_splits = unique_hash_shape.RowSplits(1).Data(); + *unique_hash_row_splits = + unique_hash_shape.RowSplits(1).Data(); const int32_t *src_hash_data = hash_src.Data(), - *unique_hash_data = hash_unique.Data(); + *unique_hash_data = hash_unique.Data(); for (int32_t r = 0; r < src_hash_shape.Dim0(); r++) { int32_t src_begin = src_hash_row_splits[r], @@ -2644,7 +2757,6 @@ TEST(RaggedIntTest, TestCreateRagged2Int) { K2_CHECK(Equal(r, r2)); } - TEST(RaggedFloatTest, TestCreateRagged2Float) { std::vector> vecs{{1.2, 2.3}, {}, {3.4, 5.6}}; std::vector expected_values{1.2, 2.3, 3.4, 5.6}; @@ -2656,5 +2768,4 @@ TEST(RaggedFloatTest, TestCreateRagged2Float) { CheckArrayData(r.values, expected_values); } - } // namespace k2 From 14383d7a6fe80047fa3c54d6c43fd602e6669f21 Mon Sep 17 00:00:00 2001 From: Haowen Date: Fri, 19 Feb 2021 14:17:20 +0800 Subject: [PATCH 5/5] add comments for transform_lbs --- k2/csrc/ragged_ops.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu index bb29f99d8..08808b6e6 100644 --- a/k2/csrc/ragged_ops.cu +++ b/k2/csrc/ragged_ops.cu @@ -690,6 +690,13 @@ RaggedShape IndexAxis0New(RaggedShape &src, const Array1 &new2old, return ans; } mgpu::context_t *mgpu_context = GetModernGpuAllocator(c); + // For a lambda called in `mgpu::transform_lbs` + // `index` is the index of element (i.e. idx01) + // `seg` is the row id (i.e. idx0), + // `rank` is the index in current `seg`/row (i.e. idx1) + // In the below calling code, `new_offsets` is the segment/row descriptor + // (i.e. row_splits for this call), the number of elements is + // new_offsets.Back(). 
auto lambda_set_ans = [=] __device__(int32_t index, int32_t seg, int32_t rank) { // There are (num_axes-1) * ans_dim0 segments totally, each segment