From 0b9f6f4291acceaefbb7931e3150d32786a0b265 Mon Sep 17 00:00:00 2001 From: Manasvi Goyal <55101825+ManasviGoyal@users.noreply.github.com> Date: Wed, 29 May 2024 22:27:14 +0200 Subject: [PATCH] fix: use grid-stride looping for kernels with variable-length loops (#3130) * fix: use grid-stride looping * feat: add awkward_ListArray_getitem_next_range_carrylength kernel * feat: add awkward_ListArray_getitem_next_range kernel * test: add integration tests * ignore 'Jitify is performing a one-time only warm-up' messages --------- Co-authored-by: Jim Pivarski --- .../awkward_ListArray_getitem_next_range.cpp | 29 +- dev/generate-kernel-signatures.py | 2 + dev/generate-tests.py | 2 + kernel-specification.yml | 46 +- kernel-test-data.json | 195 +++++ pyproject.toml | 1 + src/awkward/_connect/cuda/__init__.py | 2 + .../awkward_IndexedArray_getitem_nextcarry.cu | 4 +- ...kward_IndexedArray_ranges_carry_next_64.cu | 4 +- .../awkward_IndexedArray_ranges_next_64.cu | 6 +- .../awkward_ListArray_broadcast_tooffsets.cu | 4 +- .../awkward_ListArray_combinations_length.cu | 2 +- ...kward_ListArray_getitem_jagged_numvalid.cu | 2 +- .../awkward_ListArray_getitem_next_range.cu | 90 +++ ...istArray_getitem_next_range_carrylength.cu | 70 ++ ...ard_ListArray_getitem_next_range_counts.cu | 10 +- ...Array_getitem_next_range_spreadadvanced.cu | 2 +- .../awkward_ListArray_localindex.cu | 2 +- .../awkward_ListArray_rpad_axis1.cu | 8 +- ...kward_ListOffsetArray_drop_none_indexes.cu | 2 +- ...OffsetArray_reduce_local_nextparents_64.cu | 6 +- .../awkward_ListOffsetArray_rpad_axis1.cu | 4 +- .../awkward_NumpyArray_pad_zero_to_length.cu | 4 +- .../awkward_NumpyArray_rearrange_shifted.cu | 2 +- .../awkward_UnionArray_flatten_combine.cu | 2 +- ...wkward_UnionArray_nestedfill_tags_index.cu | 4 +- .../_connect/cuda/cuda_kernels/cuda_common.cu | 135 +--- .../test_3130_cuda_listarray_getitem_next.py | 687 ++++++++++++++++++ 28 files changed, 1137 insertions(+), 190 deletions(-) create mode 100644 src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range.cu create mode 100644 src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_carrylength.cu create mode 100644 tests-cuda/test_3130_cuda_listarray_getitem_next.py diff --git a/awkward-cpp/src/cpu-kernels/awkward_ListArray_getitem_next_range.cpp b/awkward-cpp/src/cpu-kernels/awkward_ListArray_getitem_next_range.cpp index f066a66aec..817a1e318f 100644 --- a/awkward-cpp/src/cpu-kernels/awkward_ListArray_getitem_next_range.cpp +++ b/awkward-cpp/src/cpu-kernels/awkward_ListArray_getitem_next_range.cpp @@ -17,35 +17,26 @@ ERROR awkward_ListArray_getitem_next_range( int64_t step) { int64_t k = 0; tooffsets[0] = 0; - if (step > 0) { - for (int64_t i = 0; i < lenstarts; i++) { - int64_t length = fromstops[i] - fromstarts[i]; - int64_t regular_start = start; - int64_t regular_stop = stop; - awkward_regularize_rangeslice(®ular_start, ®ular_stop, step > 0, - start != kSliceNone, stop != kSliceNone, - length); + for (int64_t i = 0; i < lenstarts; i++) { + int64_t length = fromstops[i] - fromstarts[i]; + int64_t regular_start = start; + int64_t regular_stop = stop; + awkward_regularize_rangeslice(®ular_start, ®ular_stop, step > 0, + start != kSliceNone, stop != kSliceNone, + length); + if (step > 0) { for (int64_t j = regular_start; j < regular_stop; j += step) { tocarry[k] = fromstarts[i] + j; k++; } - tooffsets[i + 1] = (C)k; } - } - else { - for (int64_t i = 0; i < lenstarts; i++) { - int64_t length = fromstops[i] - fromstarts[i]; - int64_t regular_start = start; - int64_t regular_stop = stop; - awkward_regularize_rangeslice(®ular_start, ®ular_stop, step > 0, - start != kSliceNone, stop != kSliceNone, - length); + else { for (int64_t j = regular_start; j > regular_stop; j += step) { tocarry[k] = fromstarts[i] + j; k++; } - tooffsets[i + 1] = (C)k; } + tooffsets[i + 1] = (C)k; } return success(); } diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index b64946626c..7b1c55c8e9 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -60,6 +60,8 @@ "awkward_ListArray_getitem_next_array_advanced", "awkward_ListArray_getitem_next_array", "awkward_ListArray_getitem_next_at", + "awkward_ListArray_getitem_next_range", + "awkward_ListArray_getitem_next_range_carrylength", "awkward_ListArray_getitem_next_range_counts", "awkward_ListArray_rpad_and_clip_length_axis1", "awkward_ListArray_rpad_axis1", diff --git a/dev/generate-tests.py b/dev/generate-tests.py index 37dc859b9a..5e544ff36d 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -845,6 +845,8 @@ def gencpuunittests(specdict): "awkward_ListArray_getitem_next_array_advanced", "awkward_ListArray_getitem_next_array", "awkward_ListArray_getitem_next_at", + "awkward_ListArray_getitem_next_range", + "awkward_ListArray_getitem_next_range_carrylength", "awkward_ListArray_getitem_next_range_counts", "awkward_ListArray_rpad_and_clip_length_axis1", "awkward_ListArray_rpad_axis1", diff --git a/kernel-specification.yml b/kernel-specification.yml index c58bc4d287..93c0859444 100644 --- a/kernel-specification.yml +++ b/kernel-specification.yml @@ -1762,44 +1762,30 @@ kernels: ): k = 0 tooffsets[0] = 0 - if step > 0: - for i in range(lenstarts): - length = fromstops[i] - fromstarts[i] - regular_start = start - regular_stop = stop - regular_start, regular_stop = awkward_regularize_rangeslice( - regular_start, - regular_stop, - step > 0, - start != kSliceNone, - stop != kSliceNone, - length, - ) - j = regular_start + for i in range(lenstarts): + length = fromstops[i] - fromstarts[i] + regular_start = start + regular_stop = stop + regular_start, regular_stop = awkward_regularize_rangeslice( + regular_start, + regular_stop, + step > 0, + start != kSliceNone, + stop != kSliceNone, + length, + ) + j = regular_start + if step > 0: while j < regular_stop: tocarry[k] = fromstarts[i] + j k = k + 1 j += step - tooffsets[i + 1] = k - else: - for i in range(lenstarts): - length = fromstops[i] - fromstarts[i] - regular_start = start - regular_stop = stop - regular_start, regular_stop = awkward_regularize_rangeslice( - regular_start, - regular_stop, - step > 0, - start != kSliceNone, - stop != kSliceNone, - length, - ) - j = regular_start + else: while j > regular_stop: tocarry[k] = fromstarts[i] + j k = k + 1 j += step - tooffsets[i + 1] = k + tooffsets[i + 1] = k automatic-tests: false - name: awkward_ListArray_getitem_next_range_carrylength diff --git a/kernel-test-data.json b/kernel-test-data.json index fde02211fa..fffc631108 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -13806,6 +13806,21 @@ "carrylength": [7] } }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 7, + "stop": 0, + "step": -1 + }, + "outputs": { + "carrylength": [3] + } + }, { "error": false, "message": "", @@ -13820,6 +13835,186 @@ "outputs": { "carrylength": [0] } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 0, + "stop": 6, + "step": 2 + }, + "outputs": { + "carrylength": [4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 7, + "stop": 0, + "step": -2 + }, + "outputs": { + "carrylength": [3] + } + } + ] + }, + { + "name": "awkward_ListArray_getitem_next_range", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [], + "fromstops": [], + "lenstarts": 0, + "start": 0, + "stop": 0, + "step": 0 + }, + "outputs": { + "tooffsets": [0], + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 0, + "stop": 3, + "step": 1 + }, + "outputs": { + "tooffsets": [0, 2, 2, 3, 5, 7], + "tocarry": [0, 1, 2, 3, 4, 5, 6] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 7, + "stop": 0, + "step": -1 + }, + "outputs": { + "tooffsets": [0, 1, 1, 1, 2, 3], + "tocarry": [1, 4, 6] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 0, + "stop": 2, + "step": 0 + }, + "outputs": { + "tooffsets": [0, 0, 0, 0, 0, 0], + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 3, 3, 5, 7, 9], + "fromstops": [3, 3, 5, 7, 9, 11], + "lenstarts": 6, + "start": 0, + "stop": 6, + "step": 2 + }, + "outputs": { + "tooffsets": [0, 2, 2, 3, 4, 5, 6], + "tocarry": [0, 2, 3, 5, 7, 9] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 3, 3, 5, 7, 9], + "fromstops": [3, 3, 5, 7, 9, 11], + "lenstarts": 6, + "start": 2, + "stop": 6, + "step": 2 + }, + "outputs": { + "tooffsets": [0, 1, 1, 1, 1, 1, 1], + "tocarry": [2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 3, 3, 5, 7, 9], + "fromstops": [3, 3, 5, 7, 9, 11], + "lenstarts": 6, + "start": 6, + "stop": 2, + "step": -2 + }, + "outputs": { + "tooffsets": [0, 0, 0, 0, 0, 0, 0], + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 3, 3, 5, 7, 9], + "fromstops": [3, 3, 5, 7, 9, 11], + "lenstarts": 6, + "start": 0, + "stop": 6, + "step": -2 + }, + "outputs": { + "tooffsets": [0, 0, 0, 0, 0, 0, 0], + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 2, 2, 3, 5], + "fromstops": [2, 2, 3, 5, 7], + "lenstarts": 5, + "start": 7, + "stop": 0, + "step": -2 + }, + "outputs": { + "tooffsets": [0, 1, 1, 1, 2, 3], + "tocarry": [1, 4, 6] + } } ] }, diff --git a/pyproject.toml b/pyproject.toml index 997e601dc5..533fe66396 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,7 @@ filterwarnings = [ "ignore:The NumPy module was reloaded:UserWarning", "ignore:.*np\\.MachAr.*:DeprecationWarning", "ignore:module 'sre_.*' is deprecated:DeprecationWarning", + "ignore:Jitify is performing a one-time only warm-up", ] log_cli_level = "info" diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py index 354fdcd217..d48aab5ccd 100644 --- a/src/awkward/_connect/cuda/__init__.py +++ b/src/awkward/_connect/cuda/__init__.py @@ -99,6 +99,8 @@ def fetch_template_specializations(kernel_dict): "awkward_ListArray_getitem_jagged_carrylen", "awkward_ListArray_getitem_jagged_descend", "awkward_ListArray_getitem_jagged_numvalid", + "awkward_ListArray_getitem_next_range", + "awkward_ListArray_getitem_next_range_carrylength", "awkward_ListArray_min_range", "awkward_ListArray_rpad_and_clip_length_axis1", "awkward_ListArray_rpad_axis1", diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu index d7cd5302d5..90bac2c9c5 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu @@ -31,7 +31,7 @@ awkward_IndexedArray_getitem_nextcarry_a( C j = fromindex[thread_id]; if (j < 0 || j >= lencontent) { RAISE_ERROR(INDEXEDARRAY_GETITEM_NEXTCARRY_ERRORS::IND_OUT_OF_RANGE) - } else if (j >= 0) { + } else { scan_in_array[thread_id] = 1; } } @@ -55,7 +55,7 @@ awkward_IndexedArray_getitem_nextcarry_b( C j = fromindex[thread_id]; if (j < 0 || j >= lencontent) { RAISE_ERROR(INDEXEDARRAY_GETITEM_NEXTCARRY_ERRORS::IND_OUT_OF_RANGE) - } else if (j >= 0) { + } else { tocarry[scan_in_array[thread_id] - 1] = j; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_carry_next_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_carry_next_64.cu index 9f40ca9963..a25ca62834 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_carry_next_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_carry_next_64.cu @@ -28,7 +28,7 @@ awkward_IndexedArray_ranges_carry_next_64_a( if (thread_id < length) { stride = fromstops[thread_id] - fromstarts[thread_id]; - for (int64_t j = 0; j < stride; j++) { + for (int64_t j = threadIdx.y; j < stride; j += blockDim.y) { if (!(index[fromstarts[thread_id] + j] < 0)) { scan_in_array[fromstarts[thread_id] + j] = 1; } @@ -54,7 +54,7 @@ awkward_IndexedArray_ranges_carry_next_64_b( if (thread_id < length) { stride = fromstops[thread_id] - fromstarts[thread_id]; - for (int64_t j = 0; j < stride; j++) { + for (int64_t j = threadIdx.y; j < stride; j += blockDim.y) { if (!(index[fromstarts[thread_id] + j] < 0)) { tocarry[scan_in_array[fromstarts[thread_id] + j] - 1] = index[fromstarts[thread_id] + j]; } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_next_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_next_64.cu index 998a4b54ab..b7332e41b9 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_next_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_ranges_next_64.cu @@ -30,7 +30,7 @@ awkward_IndexedArray_ranges_next_64_a( if (thread_id < length) { stride = fromstops[thread_id] - fromstarts[thread_id]; - for (int64_t j = 0; j < stride; j++) { + for (int64_t j = threadIdx.y; j < stride; j += blockDim.y) { if (!(index[fromstarts[thread_id] + j] < 0)) { scan_in_array[fromstarts[thread_id] + j] = 1; } @@ -61,10 +61,6 @@ awkward_IndexedArray_ranges_next_64_b( if (thread_id < length) { stride = fromstops[thread_id] - fromstarts[thread_id]; tostarts[thread_id] = scan_in_array[fromstarts[thread_id] - 1]; - for (int64_t j = 0; j < stride; j++) { - if (!(index[fromstarts[thread_id] + j] < 0)) { - } - } tostops[thread_id] = scan_in_array[fromstops[thread_id] - 1]; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_broadcast_tooffsets.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_broadcast_tooffsets.cu index e93ead5a00..d8eb677703 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_broadcast_tooffsets.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_broadcast_tooffsets.cu @@ -50,7 +50,7 @@ awkward_ListArray_broadcast_tooffsets_a( if (stop - start != count) { RAISE_ERROR(LISTARRAY_BROADCAST_TOOFFSETS_ERRORS::NESTED_ERR) } - for (int64_t j = start; j < stop; j++) { + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { scan_in_array[fromoffsets[thread_id] + j - start] = 1; } } @@ -87,7 +87,7 @@ awkward_ListArray_broadcast_tooffsets_b( RAISE_ERROR(LISTARRAY_BROADCAST_TOOFFSETS_ERRORS::NESTED_ERR) } - for (int64_t j = start; j < stop; j++) { + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { tocarry[scan_in_array[fromoffsets[thread_id] + j - start] - 1] = (T)j; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations_length.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations_length.cu index c36027d2fa..5282578ca3 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations_length.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations_length.cu @@ -48,7 +48,7 @@ awkward_ListArray_combinations_length_a( thisn = size - thisn; } combinationslen = size; - for (int64_t j = 2; j <= thisn; j++) { + for (int64_t j = 2 + threadIdx.y; j <= thisn; j += blockDim.y) { combinationslen *= (size - j + 1); combinationslen /= j; } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_numvalid.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_numvalid.cu index 2c9d1e5f7e..4fd0787415 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_numvalid.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_numvalid.cu @@ -42,7 +42,7 @@ awkward_ListArray_getitem_jagged_numvalid_a( if (slicestop > missinglength) { RAISE_ERROR(LISTARRAY_GETITEM_JAGGED_NUMVALID_ERRORS::OFF_GET_CON) } - for (int64_t j = slicestart; j < slicestop; j++) { + for (int64_t j = slicestart + threadIdx.y; j < slicestop; j += blockDim.y) { scan_in_array[j] = missing[j] >= 0 ? 1 : 0; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range.cu new file mode 100644 index 0000000000..fcd0605425 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range.cu @@ -0,0 +1,90 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (tooffsets, tocarry, fromstarts, fromstops, lenstarts, start, stop, step, invocation_index, err_code) = args +// scan_in_array = cupy.zeros(lenstarts + 1, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_a", tooffsets.dtype, tocarry.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (tooffsets, tocarry, fromstarts, fromstops, lenstarts, start, stop, step, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_b", tooffsets.dtype, tocarry.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (tooffsets, tocarry, fromstarts, fromstops, lenstarts, start, stop, step, scan_in_array, invocation_index, err_code)) +// out["awkward_ListArray_getitem_next_range_a", {dtype_specializations}] = None +// out["awkward_ListArray_getitem_next_range_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListArray_getitem_next_range_a( + T* tooffsets, + C* tocarry, + const U* fromstarts, + const V* fromstops, + int64_t lenstarts, + int64_t start, + int64_t stop, + int64_t step, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < lenstarts) { + scan_in_array[0] = 0; + int64_t length = fromstops[thread_id] - fromstarts[thread_id]; + int64_t regular_start = start; + int64_t regular_stop = stop; + awkward_regularize_rangeslice(®ular_start, ®ular_stop, step > 0, + start != kSliceNone, stop != kSliceNone, + length); + + if (step != 0) { + scan_in_array[thread_id + 1] = ceil((float)(regular_stop - regular_start) / step); + } + else { + scan_in_array[thread_id + 1] = 0; + } + } + } +} + +template +__global__ void +awkward_ListArray_getitem_next_range_b( + T* tooffsets, + C* tocarry, + const U* fromstarts, + const V* fromstops, + int64_t lenstarts, + int64_t start, + int64_t stop, + int64_t step, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + tooffsets[0] = 0; + int64_t k = 0; + if (thread_id < lenstarts) { + int64_t length = fromstops[thread_id] - fromstarts[thread_id]; + int64_t regular_start = start; + int64_t regular_stop = stop; + awkward_regularize_rangeslice(®ular_start, ®ular_stop, step > 0, + start != kSliceNone, stop != kSliceNone, + length); + + if (step > 0) { + for (int64_t j = regular_start + step * threadIdx.y; j < regular_stop; j += step * blockDim.y) { + tocarry[scan_in_array[thread_id] + k] = fromstarts[thread_id] + j; + k++; + } + } + else { + for (int64_t j = regular_start - step * threadIdx.y; j > regular_stop; j += step * blockDim.y) { + tocarry[scan_in_array[thread_id] + k] = fromstarts[thread_id] + j; + k++; + } + } + tooffsets[thread_id + 1] = (T)scan_in_array[thread_id + 1]; + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_carrylength.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_carrylength.cu new file mode 100644 index 0000000000..dde19b6e8b --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_carrylength.cu @@ -0,0 +1,70 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (carrylength, fromstarts, fromstops, lenstarts, start, stop, step, invocation_index, err_code) = args +// scan_in_array_carrylength = cupy.zeros(lenstarts, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_carrylength_a", carrylength.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (carrylength, fromstarts, fromstops, lenstarts, start, stop, step, scan_in_array_carrylength, invocation_index, err_code)) +// scan_in_array_carrylength = cupy.cumsum(scan_in_array_carrylength) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_carrylength_b", carrylength.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (carrylength, fromstarts, fromstops, lenstarts, start, stop, step, scan_in_array_carrylength, invocation_index, err_code)) +// out["awkward_ListArray_getitem_next_range_carrylength_a", {dtype_specializations}] = None +// out["awkward_ListArray_getitem_next_range_carrylength_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListArray_getitem_next_range_carrylength_a( + T* carrylength, + const C* fromstarts, + const U* fromstops, + int64_t lenstarts, + int64_t start, + int64_t stop, + int64_t step, + int64_t* scan_in_array_carrylength, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t kMaxInt64 = 9223372036854775806; // 2**63 - 2: see below + const int64_t kSliceNone = kMaxInt64 + 1; // for Slice::none() + if (thread_id < lenstarts) { + int64_t length = fromstops[thread_id] - fromstarts[thread_id]; + int64_t regular_start = start; + int64_t regular_stop = stop; + awkward_regularize_rangeslice(®ular_start, ®ular_stop, step > 0, + start != kSliceNone, stop != kSliceNone, + length); + int64_t carrylen = 0; + if (step > 0) { + for (int64_t j = regular_start + step * threadIdx.y; j < regular_stop; j += step * blockDim.y) { + carrylen += 1; + } + } + else { + for (int64_t j = regular_start - step * threadIdx.y; j > regular_stop; j += step * blockDim.y) { + carrylen += 1; + } + } + scan_in_array_carrylength[thread_id] = carrylen; + } + } +} + +template +__global__ void +awkward_ListArray_getitem_next_range_carrylength_b( + T* carrylength, + const C* fromstarts, + const U* fromstops, + int64_t lenstarts, + int64_t start, + int64_t stop, + int64_t step, + int64_t* scan_in_array_carrylength, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *carrylength = lenstarts > 0 ? scan_in_array_carrylength[lenstarts - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu index 725b9468ca..c95d050550 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu @@ -2,11 +2,11 @@ // BEGIN PYTHON // def f(grid, block, args): -// (total, fromoffsets, lenstarts, invocation_total, err_code) = args +// (total, fromoffsets, lenstarts, invocation_index, err_code) = args // scan_in_array = cupy.zeros(lenstarts, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_counts_a", total.dtype, fromoffsets.dtype]))(grid, block, (total, fromoffsets, lenstarts, scan_in_array, invocation_total, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_counts_a", total.dtype, fromoffsets.dtype]))(grid, block, (total, fromoffsets, lenstarts, scan_in_array, invocation_index, err_code)) // scan_in_array = cupy.cumsum(scan_in_array) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_counts_b", total.dtype, fromoffsets.dtype]))(grid, block, (total, fromoffsets, lenstarts, scan_in_array, invocation_total, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_counts_b", total.dtype, fromoffsets.dtype]))(grid, block, (total, fromoffsets, lenstarts, scan_in_array, invocation_index, err_code)) // out["awkward_ListArray_getitem_next_range_counts_a", {dtype_specializations}] = None // out["awkward_ListArray_getitem_next_range_counts_b", {dtype_specializations}] = None // END PYTHON @@ -18,7 +18,7 @@ awkward_ListArray_getitem_next_range_counts_a( const C* fromoffsets, int64_t lenstarts, int64_t* scan_in_array, - uint64_t invocation_total, + uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -36,7 +36,7 @@ awkward_ListArray_getitem_next_range_counts_b( const C* fromoffsets, int64_t lenstarts, int64_t* scan_in_array, - uint64_t invocation_total, + uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { *total = lenstarts > 0 ? scan_in_array[lenstarts - 1] : 0; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_spreadadvanced.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_spreadadvanced.cu index e43586dc00..3ca97960e5 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_spreadadvanced.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_spreadadvanced.cu @@ -13,7 +13,7 @@ awkward_ListArray_getitem_next_range_spreadadvanced( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < lenstarts) { C count = fromoffsets[thread_id + 1] - fromoffsets[thread_id]; - for (int64_t j = 0; j < count; j++) { + for (int64_t j = threadIdx.y; j < count; j += blockDim.y) { toadvanced[fromoffsets[thread_id] + j] = fromadvanced[thread_id]; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_localindex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_localindex.cu index 7ee9dabf00..0958cec481 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_localindex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_localindex.cu @@ -13,7 +13,7 @@ awkward_ListArray_localindex( if (thread_id < length) { int64_t start = (int64_t)offsets[thread_id]; int64_t stop = (int64_t)offsets[thread_id + 1]; - for (int64_t j = start; j < stop; j++) { + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { toindex[j] = j - start; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_axis1.cu index 7b8ca6ba67..7a63f6d078 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_axis1.cu @@ -58,12 +58,12 @@ awkward_ListArray_rpad_axis1_b( } tostarts[thread_id] = offset; int64_t rangeval = fromstops[thread_id] - fromstarts[thread_id]; - for (int64_t j = 0; j < rangeval; j++) { + for (int64_t j = threadIdx.y; j < rangeval; j += blockDim.y) { toindex[offset + j] = fromstarts[thread_id] + j; - } - for (int64_t j = rangeval; j < target; j++) { + } + for (int64_t j = rangeval + threadIdx.y; j < target; j += blockDim.y) { toindex[offset + j] = -1; - } + } tostops[thread_id] = scan_in_array[thread_id]; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_drop_none_indexes.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_drop_none_indexes.cu index 0a3ec51d86..72dd0ad04e 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_drop_none_indexes.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_drop_none_indexes.cu @@ -31,7 +31,7 @@ awkward_ListOffsetArray_drop_none_indexes_a( int64_t offset1 = fromoffsets[thread_id - 1]; } int64_t offset2 = fromoffsets[thread_id]; - for (int j = offset1; j < offset2; j++) { + for (int j = offset1 + threadIdx.y; j < offset2; j += blockDim.y) { if (noneindexes[j] < 0) { scan_in_array[j] = 1; } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_nextparents_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_nextparents_64.cu index 03de70aea8..b68268f447 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_nextparents_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_nextparents_64.cu @@ -13,9 +13,9 @@ awkward_ListOffsetArray_reduce_local_nextparents_64( if (thread_id < length) { int64_t initialoffset = (int64_t)(offsets[0]); - for (int64_t j = (int64_t)(offsets[thread_id]) - initialoffset; - j < offsets[thread_id + 1] - initialoffset; - j++) { + int64_t start = (int64_t)(offsets[thread_id]) - initialoffset; + int64_t stop = (int64_t)offsets[thread_id + 1] - initialoffset; + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { nextparents[j] = thread_id; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu index da5911e21d..47a99dbca6 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu @@ -32,10 +32,10 @@ awkward_ListOffsetArray_rpad_axis1( index = scan_in_array[thread_id - 1]; } - for (int64_t j = 0; j < rangeval; j++) { + for (int64_t j = threadIdx.y; j < rangeval; j += blockDim.y) { toindex[index + j] = (T)fromoffsets[thread_id] + j; } - for (int64_t j = rangeval; j < target; j++) { + for (int64_t j = rangeval + threadIdx.y; j < target; j += blockDim.y) { toindex[index + j] = -1; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_pad_zero_to_length.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_pad_zero_to_length.cu index 555774b087..30b12be1ac 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_pad_zero_to_length.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_pad_zero_to_length.cu @@ -32,12 +32,12 @@ awkward_NumpyArray_pad_zero_to_length( l_to_char = scan_in_array[thread_id - 1]; } // Copy from src to dst - for (int64_t j_from_char = fromoffsets[thread_id]; j_from_char < fromoffsets[thread_id + 1]; j_from_char++) { + for (int64_t j_from_char = fromoffsets[thread_id] + threadIdx.y; j_from_char < fromoffsets[thread_id + 1]; j_from_char += blockDim.y) { toptr[l_to_char++] = fromptr[j_from_char]; } // Pad to remaining width auto n_to_pad = target - (fromoffsets[thread_id + 1] - fromoffsets[thread_id]); - for (int64_t j_from_char = 0; j_from_char < n_to_pad; j_from_char++){ + for (int64_t j_from_char = threadIdx.y; j_from_char < n_to_pad; j_from_char += blockDim.y){ toptr[l_to_char++] = 0; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_rearrange_shifted.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_rearrange_shifted.cu index 28eb039552..7a183a902b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_rearrange_shifted.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_rearrange_shifted.cu @@ -26,7 +26,7 @@ awkward_NumpyArray_rearrange_shifted_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < offsetslength - 1) { - for (int64_t j = 0; j < fromoffsets[thread_id + 1] - fromoffsets[thread_id]; j++) { + for (int64_t j = threadIdx.y; j < fromoffsets[thread_id + 1] - fromoffsets[thread_id]; j += blockDim.y) { int64_t idx = fromoffsets[thread_id] + j; toptr[idx] = toptr[idx] + fromoffsets[thread_id]; } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_flatten_combine.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_flatten_combine.cu index 18dab536c4..d70957736d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_flatten_combine.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_flatten_combine.cu @@ -57,7 +57,7 @@ awkward_UnionArray_flatten_combine_b( X start = offsetsraws[tag][idx]; X stop = offsetsraws[tag][idx + 1]; int64_t k = scan_in_array_tooffsets[thread_id]; - for (int64_t j = start; j < stop; j++) { + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { totags[k] = tag; toindex[k] = j; k++; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_nestedfill_tags_index.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_nestedfill_tags_index.cu index d27ecba5e0..4317ee6158 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_nestedfill_tags_index.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_nestedfill_tags_index.cu @@ -31,7 +31,7 @@ awkward_UnionArray_nestedfill_tags_index_a( if (thread_id < length) { U start = tmpstarts[thread_id]; V stop = start + fromcounts[thread_id]; - for (int64_t j = start; j < stop; j++) { + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { scan_in_array[j] += 1; } } @@ -55,7 +55,7 @@ awkward_UnionArray_nestedfill_tags_index_b( if (thread_id < length) { U start = tmpstarts[thread_id]; V stop = start + fromcounts[thread_id]; - for (int64_t j = start; j < stop; j++) { + for (int64_t j = start + threadIdx.y; j < stop; j += blockDim.y) { totags[j] = tag; toindex[j] = (C)(scan_in_array[j] - 1); } diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index 8a02094f34..e101a7f3d8 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -32,115 +32,40 @@ typedef unsigned long long uintmax_t; atomicMin(err_code, \ invocation_index*(1 << ERROR_BITS) + (int)(ERROR_KERNEL_CODE)); -// BEGIN PYTHON -// def inclusive_scan(grid, block, args): -// (d_in, invocation_index, err_code) = args -// import math -// d_out = cupy.empty(len(d_in), dtype=cupy.int64) -// d_final = cupy.empty(len(d_in), dtype=cupy.int64) -// stride = 1 -// total_steps = math.ceil(math.log2(len(d_in))) -// for curr_step in range(1, total_steps + 1): -// in_out_flag = (curr_step % 2) != 0 -// cuda_kernel_templates.get_function(fetch_specialization(['inclusive_scan_kernel', cupy.int64]))(grid, block, (d_in, d_out, d_final, curr_step, total_steps, stride, in_out_flag, len(d_in), invocation_index, err_code)) -// stride = stride * 2 -// return d_final -// out['inclusive_scan_kernel', cupy.int64] = inclusive_scan +const int64_t kMaxInt64 = 9223372036854775806; // 2**63 - 2: see below +const int64_t kSliceNone = kMaxInt64 + 1; // for Slice::none() -// def exclusive_scan(grid, block, args): -// print(args) -// (d_in, invocation_index, err_code) = args -// import math -// d_out = cupy.empty(len(d_in), dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(['exclusive_scan_kernel', cupy.int64]))(grid, block, (d_in, d_out, len(d_in), invocation_index, err_code)) -// print(d_out) -// print("\n") -// return d_out -// out['exclusive_scan_kernel', cupy.int64] = exclusive_scan -// END PYTHON +void +awkward_regularize_rangeslice( + int64_t* start, + int64_t* stop, + bool posstep, + bool hasstart, + bool hasstop, + int64_t length) { + if (posstep) { + if (!hasstart) *start = 0; + else if (*start < 0) *start += length; + if (*start < 0) *start = 0; + if (*start > length) *start = length; -template -__global__ void -inclusive_scan_kernel(T* d_in, - T* d_out, - T* d_final, - int64_t curr_step, - int64_t total_steps, - int64_t stride, - bool in_out_flag, - int64_t length, - uint64_t* invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t block_id = blockIdx.x + blockIdx.y * gridDim.x + - gridDim.x * gridDim.y * blockIdx.z; - - int64_t thread_id = block_id * blockDim.x + threadIdx.x; - - if (thread_id < length) { - if (!in_out_flag) { - if (thread_id < stride) { - d_in[thread_id] = d_out[thread_id]; - } else { - d_in[thread_id] = d_out[thread_id] + d_out[thread_id - stride]; - } - } else { - if (thread_id < stride) { - d_out[thread_id] = d_in[thread_id]; - } else { - d_out[thread_id] = d_in[thread_id] + d_in[thread_id - stride]; - } - } - - if (curr_step == total_steps) { - d_final[thread_id] = in_out_flag ? d_out[thread_id] : d_in[thread_id]; - } - } - } -} - - -template -__global__ void -exclusive_scan_kernel(T* input, - T* output, - int64_t n, - uint64_t* invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - extern __shared__ int temp[1024*2]; - int tid = threadIdx.x; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - temp[tid] = input[idx]; - } else { - temp[tid] = 0; - } - __syncthreads(); - - for (int stride = 1; stride <= 1024; stride *= 2) { - int index = (tid + 1) * stride * 2 - 1; - if (index < 2 * 1024) { - temp[index] += temp[index - stride]; - } - __syncthreads(); + if (!hasstop) *stop = length; + else if (*stop < 0) *stop += length; + if (*stop < 0) *stop = 0; + if (*stop > length) *stop = length; + if (*stop < *start) *stop = *start; } - if (tid == 0) { - temp[2 * 1024 - 1] = 0; - } - __syncthreads(); - - for (int stride = 1024; stride > 0; stride /= 2) { - int index = (tid + 1) * stride * 2 - 1; - if (index + stride < 2 * 1024) { - temp[index + stride] += temp[index]; - } - __syncthreads(); - } + else { + if (!hasstart) *start = length - 1; + else if (*start < 0) *start += length; + if (*start < -1) *start = -1; + if (*start > length - 1) *start = length - 1; - if (idx < n) { - output[idx] = temp[tid]; + if (!hasstop) *stop = -1; + else if (*stop < 0) *stop += length; + if (*stop < -1) *stop = -1; + if (*stop > length - 1) *stop = length - 1; + if (*stop > *start) *stop = *start; } } -} diff --git a/tests-cuda/test_3130_cuda_listarray_getitem_next.py b/tests-cuda/test_3130_cuda_listarray_getitem_next.py new file mode 100644 index 0000000000..c26c8f9319 --- /dev/null +++ b/tests-cuda/test_3130_cuda_listarray_getitem_next.py @@ -0,0 +1,687 @@ +from __future__ import annotations + +import cupy as cp +import numpy as np +import pytest + +import awkward as ak + +to_list = ak.operations.to_list + +content = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9]) +) +starts1 = ak.index.IndexU32(np.array([0, 3, 3, 5, 6], np.uint32)) +stops1 = ak.index.IndexU32(np.array([3, 3, 5, 6, 9], np.uint32)) +offsets1 = ak.index.IndexU32(np.array([0, 3, 3, 5, 6, 9], np.uint32)) +starts2 = ak.index.IndexU32(np.array([0, 2, 3, 3], np.uint32)) +stops2 = ak.index.IndexU32(np.array([2, 3, 3, 5], np.uint32)) +offsets2 = ak.index.IndexU32(np.array([0, 2, 3, 3, 5], np.uint32)) + + +def tests_0020_support_unsigned_indexes_listarray_ellipsis(): + array1 = ak.contents.ListArray(starts1, stops1, content) + array2 = ak.contents.ListArray(starts2, stops2, array1) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + cuda_array2 = ak.to_backend(array2, "cuda", highlevel=False) + + assert to_list(cuda_array1[Ellipsis, 1:]) == [ + [2.2, 3.3], + [], + [5.5], + [], + [8.8, 9.9], + ] + assert ( + cuda_array1.to_typetracer()[Ellipsis, 1:].form == cuda_array1[Ellipsis, 1:].form + ) + assert to_list(cuda_array2[Ellipsis, 1:]) == [ + [[2.2, 3.3], []], + [[5.5]], + [], + [[], [8.8, 9.9]], + ] + assert ( + cuda_array2.to_typetracer()[Ellipsis, 1:].form == cuda_array2[Ellipsis, 1:].form + ) + + +def tests_0020_support_unsigned_indexes_listoffsetarray_ellipsis(): + array1 = ak.contents.ListOffsetArray(offsets1, content) + array2 = ak.contents.ListOffsetArray(offsets2, array1) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + cuda_array2 = ak.to_backend(array2, "cuda", highlevel=False) + + assert to_list(cuda_array1[Ellipsis, 1:]) == [ + [2.2, 3.3], + [], + [5.5], + [], + [8.8, 9.9], + ] + assert ( + cuda_array1.to_typetracer()[Ellipsis, 1:].form == cuda_array1[Ellipsis, 1:].form + ) + assert to_list(cuda_array2[Ellipsis, 1:]) == [ + [[2.2, 3.3], []], + [[5.5]], + [], + [[], [8.8, 9.9]], + ] + assert ( + cuda_array2.to_typetracer()[Ellipsis, 1:].form == cuda_array2[Ellipsis, 1:].form + ) + + +def tests_0020_support_unsigned_indexes_listarray_array_slice(): + array1 = ak.contents.ListArray(starts1, stops1, content) + array2 = ak.contents.ListArray(starts2, stops2, array1) + + cuda_array2 = ak.to_backend(array2, "cuda", highlevel=False) + + assert to_list(cuda_array2[[0, 0, 1, 1, 1, 0]]) == [ + [[1.1, 2.2, 3.3], []], + [[1.1, 2.2, 3.3], []], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[1.1, 2.2, 3.3], []], + ] + assert ( + cuda_array2.to_typetracer()[[0, 0, 1, 1, 1, 0]].form + == cuda_array2[[0, 0, 1, 1, 1, 0]].form + ) + + assert to_list(cuda_array2[[0, 0, 1, 1, 1, 0], :]) == [ + [[1.1, 2.2, 3.3], []], + [[1.1, 2.2, 3.3], []], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[1.1, 2.2, 3.3], []], + ] + assert ( + cuda_array2.to_typetracer()[[0, 0, 1, 1, 1, 0], :].form + == cuda_array2[[0, 0, 1, 1, 1, 0], :].form + ) + assert to_list(cuda_array2[[0, 0, 1, 1, 1, 0], :, 1:]) == [ + [[2.2, 3.3], []], + [[2.2, 3.3], []], + [[5.5]], + [[5.5]], + [[5.5]], + [[2.2, 3.3], []], + ] + assert ( + cuda_array2.to_typetracer()[[0, 0, 1, 1, 1, 0], :, 1:].form + == cuda_array2[[0, 0, 1, 1, 1, 0], :, 1:].form + ) + + +def tests_0020_support_unsigned_indexes_listoffsetarray_array_slice(): + array1 = ak.contents.ListOffsetArray(offsets1, content) + array2 = ak.contents.ListOffsetArray(offsets2, array1) + + cuda_array2 = ak.to_backend(array2, "cuda", highlevel=False) + + assert to_list(cuda_array2[[0, 0, 1, 1, 1, 0]]) == [ + [[1.1, 2.2, 3.3], []], + [[1.1, 2.2, 3.3], []], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[1.1, 2.2, 3.3], []], + ] + assert ( + cuda_array2.to_typetracer()[[0, 0, 1, 1, 1, 0]].form + == cuda_array2[[0, 0, 1, 1, 1, 0]].form + ) + assert to_list(cuda_array2[[0, 0, 1, 1, 1, 0], :]) == [ + [[1.1, 2.2, 3.3], []], + [[1.1, 2.2, 3.3], []], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[4.4, 5.5]], + [[1.1, 2.2, 3.3], []], + ] + assert ( + cuda_array2.to_typetracer()[[0, 0, 1, 1, 1, 0], :].form + == cuda_array2[[0, 0, 1, 1, 1, 0], :].form + ) + assert to_list(cuda_array2[[0, 0, 1, 1, 1, 0], :, 1:]) == [ + [[2.2, 3.3], []], + [[2.2, 3.3], []], + [[5.5]], + [[5.5]], + [[5.5]], + [[2.2, 3.3], []], + ] + assert ( + cuda_array2.to_typetracer()[[0, 0, 1, 1, 1, 0], :, 1:].form + == cuda_array2[[0, 0, 1, 1, 1, 0], :, 1:].form + ) + + +def tests_0020_support_unsigned_indexes_listarray_array(): + array1 = ak.contents.ListArray(starts1, stops1, content) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + + assert to_list(cuda_array1[cp.array([2, 0, 0, 1, -1])]) == [ + [4.4, 5.5], + [1.1, 2.2, 3.3], + [1.1, 2.2, 3.3], + [], + [7.7, 8.8, 9.9], + ] + assert ( + cuda_array1.to_typetracer()[cp.array([2, 0, 0, 1, -1])].form + == cuda_array1[cp.array([2, 0, 0, 1, -1])].form + ) + assert to_list(cuda_array1[cp.array([2, 0, 0, -1]), cp.array([1, 1, 0, 0])]) == [ + 5.5, + 2.2, + 1.1, + 7.7, + ] + assert ( + cuda_array1.to_typetracer()[ + cp.array([2, 0, 0, -1]), cp.array([1, 1, 0, 0]) + ].form + == cuda_array1[cp.array([2, 0, 0, -1]), cp.array([1, 1, 0, 0])].form + ) + + +def tests_0020_support_unsigned_indexes_listoffsetarray_array(): + array1 = ak.contents.ListOffsetArray(offsets1, content) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + + assert to_list(cuda_array1[cp.array([2, 0, 0, 1, -1])]) == [ + [4.4, 5.5], + [1.1, 2.2, 3.3], + [1.1, 2.2, 3.3], + [], + [7.7, 8.8, 9.9], + ] + assert ( + cuda_array1.to_typetracer()[cp.array([2, 0, 0, 1, -1])].form + == cuda_array1[cp.array([2, 0, 0, 1, -1])].form + ) + assert to_list(cuda_array1[cp.array([2, 0, 0, -1]), cp.array([1, 1, 0, 0])]) == [ + 5.5, + 2.2, + 1.1, + 7.7, + ] + assert ( + cuda_array1.to_typetracer()[ + cp.array([2, 0, 0, -1]), cp.array([1, 1, 0, 0]) + ].form + == cuda_array1[cp.array([2, 0, 0, -1]), cp.array([1, 1, 0, 0])].form + ) + + +def tests_0020_support_unsigned_indexes_listarray_slice(): + array1 = ak.contents.ListArray(starts1, stops1, content) + array2 = ak.contents.ListArray(starts2, stops2, array1) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + cuda_array2 = ak.to_backend(array2, "cuda", highlevel=False) + + assert to_list(cuda_array1[1:-1]) == [[], [4.4, 5.5], [6.6]] + assert cuda_array1.to_typetracer()[1:-1].form == cuda_array1[1:-1].form + assert to_list(cuda_array1[1:-1,]) == [[], [4.4, 5.5], [6.6]] + assert cuda_array1.to_typetracer()[1:-1,].form == cuda_array1[1:-1,].form + assert to_list(cuda_array2[1:-1]) == [[[4.4, 5.5]], []] + assert cuda_array2.to_typetracer()[1:-1].form == cuda_array2[1:-1].form + assert to_list(cuda_array2[1:-1,]) == [[[4.4, 5.5]], []] + assert cuda_array2.to_typetracer()[1:-1,].form == cuda_array2[1:-1,].form + + +def tests_0020_support_unsigned_indexes_listoffsetarray_slice(): + array1 = ak.contents.ListOffsetArray(offsets1, content) + array2 = ak.contents.ListOffsetArray(offsets2, array1) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + cuda_array2 = ak.to_backend(array2, "cuda", highlevel=False) + + assert to_list(cuda_array1[1:-1]) == [[], [4.4, 5.5], [6.6]] + assert cuda_array1.to_typetracer()[1:-1].form == cuda_array1[1:-1].form + assert to_list(cuda_array1[1:-1,]) == [[], [4.4, 5.5], [6.6]] + assert cuda_array1.to_typetracer()[1:-1,].form == cuda_array1[1:-1,].form + assert to_list(cuda_array2[1:-1]) == [[[4.4, 5.5]], []] + assert cuda_array2.to_typetracer()[1:-1].form == cuda_array2[1:-1].form + assert to_list(cuda_array2[1:-1,]) == [[[4.4, 5.5]], []] + assert cuda_array2.to_typetracer()[1:-1,].form == cuda_array2[1:-1,].form + + +def tests_0020_support_unsigned_indexes_listarray_slice_slice(): + array1 = ak.contents.ListArray(starts1, stops1, content) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + + assert to_list(cuda_array1[2:]) == [[4.4, 5.5], [6.6], [7.7, 8.8, 9.9]] + assert cuda_array1.to_typetracer()[2:].form == cuda_array1[2:].form + assert to_list(cuda_array1[2:, 1:]) == [[5.5], [], [8.8, 9.9]] + assert cuda_array1.to_typetracer()[2:, 1:].form == cuda_array1[2:, 1:].form + assert to_list(cuda_array1[2:, :-1]) == [[4.4], [], [7.7, 8.8]] + assert cuda_array1.to_typetracer()[2:, :-1].form == cuda_array1[2:, :-1].form + + +def tests_0020_support_unsigned_indexes_listoffsetarray_slice_slice(): + array1 = ak.contents.ListOffsetArray(offsets1, content) + + cuda_array1 = ak.to_backend(array1, "cuda", highlevel=False) + + assert to_list(cuda_array1[2:]) == [[4.4, 5.5], [6.6], [7.7, 8.8, 9.9]] + assert cuda_array1.to_typetracer()[2:].form == cuda_array1[2:].form + assert to_list(cuda_array1[2:, 1:]) == [[5.5], [], [8.8, 9.9]] + assert cuda_array1.to_typetracer()[2:, 1:].form == cuda_array1[2:, 1:].form + assert to_list(cuda_array1[2:, :-1]) == [[4.4], [], [7.7, 8.8]] + assert cuda_array1.to_typetracer()[2:, :-1].form == cuda_array1[2:, :-1].form + + +class ReversibleArray(ak.Array): + def reversed(self): + return self[..., ::-1] + + +def test_2549_list_nominal_type_class(): + behavior = {"reversible": ReversibleArray} + reversible_array = ak.with_parameter( + [[1, 2, 3], [4, 5, 6, 7], [8, 9]], "__list__", "reversible", behavior=behavior + ) + + cuda_reversible_array = ak.to_backend(reversible_array, "cuda") + + assert isinstance(cuda_reversible_array, ReversibleArray) + assert cuda_reversible_array.to_list() == [[1, 2, 3], [4, 5, 6, 7], [8, 9]] + assert cuda_reversible_array.reversed().to_list() == [ + [3, 2, 1], + [7, 6, 5, 4], + [9, 8], + ] + + +def test_2549_list_nominal_type_deep_class(): + behavior = {"reversible": ReversibleArray, ("*", "reversible"): ReversibleArray} + reversible_array = ak.with_parameter( + [[1, 2, 3], [4, 5, 6, 7], [8, 9]], "__list__", "reversible", behavior=behavior + ) + outer_array = ak.Array( + ak.contents.ListOffsetArray( + ak.index.Index64([0, 2, 3, 3]), reversible_array.layout + ), + behavior=behavior, + ) + + cuda_outer_array = ak.to_backend(outer_array, "cuda") + + assert isinstance(cuda_outer_array, ReversibleArray) + assert cuda_outer_array.to_list() == [[[1, 2, 3], [4, 5, 6, 7]], [[8, 9]], []] + assert cuda_outer_array.reversed().to_list() == [ + [[3, 2, 1], [7, 6, 5, 4]], + [[9, 8]], + [], + ] + + +def test_2549_list_nominal_type_ufunc(): + behavior = {"reversible": ReversibleArray} + reversible_array = ak.with_parameter( + [[1, 2, 3], [4, 5, 6, 7], [8, 9]], "__list__", "reversible", behavior=behavior + ) + cuda_reversible_array = ak.to_backend(reversible_array, "cuda") + + assert isinstance(cuda_reversible_array, ReversibleArray) + + def reversible_add(x, y): + return ak.with_parameter(x.reversed(), "__list__", None) + ak.with_parameter( + y.reversed(), "__list__", None + ) + + ak.behavior[np.add, "reversible", "reversible"] = reversible_add + + assert (cuda_reversible_array + cuda_reversible_array).to_list() == [ + [6, 4, 2], + [14, 12, 10, 8], + [18, 16], + ] + with pytest.raises(TypeError, match=r"overloads for custom types"): + cuda_reversible_array + ak.with_parameter( + cuda_reversible_array, "__list__", "non-reversible" + ) + + +def test_0150_flatten_ListOffsetArray(): + array = ak.highlevel.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array)) == [ + 1.1, + 2.2, + 3.3, + 4.4, + 5.5, + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:])) == [4.4, 5.5] + + array = ak.highlevel.Array( + [[[0.0, 1.1, 2.2], [], [3.3, 4.4]], [], [[5.5]], [[], [6.6, 7.7, 8.8, 9.9]]] + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array)) == [ + [0.0, 1.1, 2.2], + [], + [3.3, 4.4], + [5.5], + [], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:])) == [ + [5.5], + [], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[:, 1:])) == [ + [], + [3.3, 4.4], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) == [ + [0.0, 1.1, 2.2, 3.3, 4.4], + [], + [5.5], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:], axis=2)) == [ + [], + [5.5], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[:, 1:], axis=2)) == [ + [3.3, 4.4], + [], + [], + [6.6, 7.7, 8.8, 9.9], + ] + + array = ak.highlevel.Array( + cp.arange(2 * 3 * 5 * 7).reshape(2, 3, 5, 7).tolist() + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=1)) + == cp.arange(2 * 3 * 5 * 7).reshape(2 * 3, 5, 7).tolist() + ) + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) + == cp.arange(2 * 3 * 5 * 7).reshape(2, 3 * 5, 7).tolist() + ) + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=3)) + == cp.arange(2 * 3 * 5 * 7).reshape(2, 3, 5 * 7).tolist() + ) + + array = ak.highlevel.Array( + ak.operations.from_iter( + cp.arange(2 * 3 * 5 * 7).reshape(2, 3, 5, 7).tolist(), highlevel=False + ) + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=1)) + == cp.arange(2 * 3 * 5 * 7).reshape(2 * 3, 5, 7).tolist() + ) + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) + == cp.arange(2 * 3 * 5 * 7).reshape(2, 3 * 5, 7).tolist() + ) + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=3)) + == cp.arange(2 * 3 * 5 * 7).reshape(2, 3, 5 * 7).tolist() + ) + + array = ak.highlevel.Array(cp.arange(2 * 3 * 5 * 7).reshape(2, 3, 5, 7)).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=1)) + == cp.arange(2 * 3 * 5 * 7).reshape(2 * 3, 5, 7).tolist() + ) + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) + == cp.arange(2 * 3 * 5 * 7).reshape(2, 3 * 5, 7).tolist() + ) + assert ( + ak.operations.to_list(ak.operations.flatten(cuda_array, axis=3)) + == cp.arange(2 * 3 * 5 * 7).reshape(2, 3, 5 * 7).tolist() + ) + + +def test_0150_flatten_IndexedArray(): + array = ak.highlevel.Array( + [[1.1, 2.2, None, 3.3], None, [], None, [4.4, 5.5], None] + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array)) == [ + 1.1, + 2.2, + None, + 3.3, + 4.4, + 5.5, + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:])) == [4.4, 5.5] + + array = ak.highlevel.Array( + [ + [[0.0, 1.1, 2.2], None, None, [3.3, 4.4]], + [], + [[5.5]], + [[], [6.6, 7.7, 8.8, 9.9]], + ] + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) == [ + [0.0, 1.1, 2.2, 3.3, 4.4], + [], + [5.5], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:], axis=2)) == [ + [], + [5.5], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[:, 1:], axis=2)) == [ + [3.3, 4.4], + [], + [], + [6.6, 7.7, 8.8, 9.9], + ] + + array = ak.highlevel.Array( + [ + [[0.0, 1.1, 2.2], [3.3, 4.4]], + [], + [[5.5]], + None, + None, + [[], [6.6, 7.7, 8.8, 9.9]], + ] + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) == [ + [0.0, 1.1, 2.2, 3.3, 4.4], + [], + [5.5], + None, + None, + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:], axis=2)) == [ + [], + [5.5], + None, + None, + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[:, 1:], axis=2)) == [ + [3.3, 4.4], + [], + [], + None, + None, + [6.6, 7.7, 8.8, 9.9], + ] + + array = ak.highlevel.Array( + [ + [[0.0, 1.1, None, 2.2], None, [], None, [3.3, 4.4]], + None, + [], + [[5.5]], + None, + [[], [6.6, None, 7.7, 8.8, 9.9], None], + ] + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array)) == [ + [0.0, 1.1, None, 2.2], + None, + [], + None, + [3.3, 4.4], + [5.5], + [], + [6.6, None, 7.7, 8.8, 9.9], + None, + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) == [ + [0.0, 1.1, None, 2.2, 3.3, 4.4], + None, + [], + [5.5], + None, + [6.6, None, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:], axis=2)) == [ + None, + [], + [5.5], + None, + [6.6, None, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[:, 1:], axis=2)) == [ + [3.3, 4.4], + None, + [], + [], + None, + [6.6, None, 7.7, 8.8, 9.9], + ] + + content = ak.operations.from_iter( + [[0.0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]], highlevel=False + ) + index = ak.index.Index64(np.array([2, 1, 0, 3, 3, 4], dtype=np.int64)) + array = ak.contents.IndexedArray(index, content) + + cuda_array = ak.to_backend(array, "cuda") + + assert to_list(cuda_array) == [ + [3.3, 4.4], + [], + [0.0, 1.1, 2.2], + [5.5], + [5.5], + [6.6, 7.7, 8.8, 9.9], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array)) == [ + 3.3, + 4.4, + 0.0, + 1.1, + 2.2, + 5.5, + 5.5, + 6.6, + 7.7, + 8.8, + 9.9, + ] + + content = ak.operations.from_iter( + [[[0.0, 1.1, 2.2], [], [3.3, 4.4]], [], [[5.5]], [[], [6.6, 7.7, 8.8, 9.9]]], + highlevel=False, + ) + index = ak.index.Index64(np.array([2, 2, 1, 0, 3], dtype=np.int64)) + array = ak.contents.IndexedArray(index, content) + + cuda_array = ak.to_backend(array, "cuda") + + assert to_list(cuda_array) == [ + [[5.5]], + [[5.5]], + [], + [[0.0, 1.1, 2.2], [], [3.3, 4.4]], + [[], [6.6, 7.7, 8.8, 9.9]], + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) == [ + [5.5], + [5.5], + [], + [0.0, 1.1, 2.2, 3.3, 4.4], + [6.6, 7.7, 8.8, 9.9], + ] + + +def test_0150_flatten_RecordArray(): + array = ak.highlevel.Array( + [ + {"x": [], "y": [[3, 3, 3]]}, + {"x": [[1]], "y": [[2, 2]]}, + {"x": [[2], [2]], "y": [[1]]}, + {"x": [[3], [3], [3]], "y": [[]]}, + ] + ).layout + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.to_list(ak.operations.flatten(cuda_array, axis=2)) == [ + {"x": [], "y": [3, 3, 3]}, + {"x": [1], "y": [2, 2]}, + {"x": [2, 2], "y": [1]}, + {"x": [3, 3, 3], "y": []}, + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[1:], axis=2)) == [ + {"x": [1], "y": [2, 2]}, + {"x": [2, 2], "y": [1]}, + {"x": [3, 3, 3], "y": []}, + ] + assert ak.operations.to_list(ak.operations.flatten(cuda_array[:, 1:], axis=2)) == [ + {"x": [], "y": []}, + {"x": [], "y": []}, + {"x": [2], "y": []}, + {"x": [3, 3], "y": []}, + ]