diff --git a/awkward-cpp/include/awkward/unicode.h b/awkward-cpp/include/awkward/unicode.h new file mode 100644 index 0000000000..ac4d5a4ba1 --- /dev/null +++ b/awkward-cpp/include/awkward/unicode.h @@ -0,0 +1,23 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +#include +#include + +#ifndef AWKWARD_UNICODE_H_ +#define AWKWARD_UNICODE_H_ + + +#define UTF8_ONE_BYTE_MASK 0x80 +#define UTF8_ONE_BYTE_BITS 0 +#define UTF8_TWO_BYTES_MASK 0xE0 +#define UTF8_TWO_BYTES_BITS 0xC0 +#define UTF8_THREE_BYTES_MASK 0xF0 +#define UTF8_THREE_BYTES_BITS 0xE0 +#define UTF8_FOUR_BYTES_MASK 0xF8 +#define UTF8_FOUR_BYTES_BITS 0xF0 +#define UTF8_CONTINUATION_MASK 0xC0 +#define UTF8_CONTINUATION_BITS 0x80 + + +size_t utf8_codepoint_size(const uint8_t byte); + +#endif // AWKWARD_UNICODE_H_ diff --git a/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp new file mode 100644 index 0000000000..a55ac66310 --- /dev/null +++ b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp @@ -0,0 +1,45 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp", line) + +#include "awkward/kernels.h" + + +template +ERROR awkward_NumpyArray_pad_zero_to_length( + const T* fromptr, + const int64_t* fromoffsets, + int64_t offsetslength, + int64_t target, + T* toptr) { + int64_t l_to_char = 0; + + // For each sublist + for (auto k_sublist = 0; k_sublist < offsetslength-1; k_sublist++) { + // Copy from src to dst + for (int64_t j_from_char=fromoffsets[k_sublist]; j_from_char( + fromptr, + fromoffsets, + offsetslength, + target, + toptr); +} diff --git a/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp new file mode 100644 index 0000000000..36a160db2e --- /dev/null +++ b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp @@ -0,0 +1,40 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp", line) + +#include "awkward/kernels.h" +#include "awkward/unicode.h" + + +ERROR awkward_NumpyArray_prepare_utf8_to_utf32_padded( + const uint8_t *fromptr, + const int64_t *fromoffsets, + int64_t offsetslength, + int64_t *outmaxcodepoints) { + + *outmaxcodepoints = 0; + int64_t i_code_unit = fromoffsets[0]; + int64_t code_point_width; + + // For each sublist of code units + for (auto k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) { + auto n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist]; + auto n_code_point_sublist = 0; + + // Repeat until we exhaust the code units within this sublist + for (auto j_code_unit_last = i_code_unit + n_code_units; i_code_unit < j_code_unit_last;) { + code_point_width = utf8_codepoint_size(fromptr[i_code_unit]); + + // Shift the code-unit start index + i_code_unit += code_point_width; + + // Increment the code-point counter for this sublist + n_code_point_sublist += 1; + } + + // Set largest substring length (in code points) + *outmaxcodepoints = ( *outmaxcodepoints < n_code_point_sublist) ? n_code_point_sublist : *outmaxcodepoints; + } + + return success(); +} diff --git a/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp new file mode 100644 index 0000000000..435fcc6c76 --- /dev/null +++ b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp @@ -0,0 +1,73 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp", line) + +#include "awkward/kernels.h" +#include "awkward/unicode.h" + + +ERROR awkward_NumpyArray_utf8_to_utf32_padded( + const uint8_t *fromptr, + const int64_t *fromoffsets, + int64_t offsetslength, + int64_t maxcodepoints, + uint32_t *toptr) { + + int64_t i_code_unit = fromoffsets[0]; + int64_t code_point_width; + int64_t n_code_point = 0; + + // For each sublist of code units + for (auto k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) { + auto n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist]; + int64_t n_code_point_sublist = 0; + + // Repeat until we exhaust the code units within this sublist + for (auto j_code_unit_last = i_code_unit + n_code_units; i_code_unit < j_code_unit_last;) { + // Parse a single codepoint + code_point_width = utf8_codepoint_size(fromptr[i_code_unit]); + switch (code_point_width) { + case 1: + toptr[n_code_point] = ((uint32_t) fromptr[i_code_unit] & ~UTF8_ONE_BYTE_MASK); + break; + case 2: + toptr[n_code_point] = + ((uint32_t) fromptr[i_code_unit] & ~UTF8_TWO_BYTES_MASK) << 6 | + ((uint32_t) fromptr[i_code_unit + 1] & ~UTF8_CONTINUATION_MASK); + break; + case 3: + toptr[n_code_point] = + ((uint32_t) fromptr[i_code_unit] & ~UTF8_THREE_BYTES_MASK) << 12 | + ((uint32_t) fromptr[i_code_unit + 1] & ~UTF8_CONTINUATION_MASK) << 6 | + ((uint32_t) fromptr[i_code_unit + 2] & ~UTF8_CONTINUATION_MASK); + + break; + case 4: + toptr[n_code_point] = + ((uint32_t) fromptr[i_code_unit] & ~UTF8_FOUR_BYTES_MASK) << 18 | + ((uint32_t) fromptr[i_code_unit + 1] & ~UTF8_CONTINUATION_MASK) << 12 | + ((uint32_t) fromptr[i_code_unit + 2] & ~UTF8_CONTINUATION_MASK) << 6 | + ((uint32_t) fromptr[i_code_unit + 3] & ~UTF8_CONTINUATION_MASK); + break; + default: + return failure("could not convert UTF8 code point to UTF32: invalid byte in UTF8 string", kSliceNone, fromptr[i_code_unit], FILENAME(__LINE__)); + } + // Increment the code-point counter + n_code_point++; + + // Shift the code-unit start index + i_code_unit += code_point_width; + + // Increment the code-point counter for this sublist + n_code_point_sublist += 1; + } + + // Zero pad the remaining characters + int64_t n_pad_code_points = maxcodepoints - n_code_point_sublist; + for (auto j = 0; j < n_pad_code_points; j++) { + toptr[n_code_point++] = 0; + } + } + + return success(); +} diff --git a/awkward-cpp/src/cpu-kernels/unicode.cpp b/awkward-cpp/src/cpu-kernels/unicode.cpp new file mode 100644 index 0000000000..625167a21d --- /dev/null +++ b/awkward-cpp/src/cpu-kernels/unicode.cpp @@ -0,0 +1,26 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/unicode.cpp", line) + +#include "awkward/unicode.h" + + +size_t utf8_codepoint_size(const uint8_t byte) { + if ((byte & UTF8_ONE_BYTE_MASK) == UTF8_ONE_BYTE_BITS) { + return 1; + } + + if ((byte & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES_BITS) { + return 2; + } + + if ((byte & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES_BITS) { + return 3; + } + + if ((byte & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES_BITS) { + return 4; + } + + return 0; +} diff --git a/kernel-specification.yml b/kernel-specification.yml index 437a22da63..2a120e505b 100644 --- a/kernel-specification.yml +++ b/kernel-specification.yml @@ -3357,6 +3357,50 @@ kernels: automatic-tests: false manual-tests: [] + - name: awkward_NumpyArray_prepare_utf8_to_utf32_padded + specializations: + - name: awkward_NumpyArray_prepare_utf8_to_utf32_padded + args: + - {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default} + - {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets} + - {name: offsetslength, type: "int64_t", dir: in, role: default} + - {name: outmaxcodepoints, type: "List[int64_t]", dir: out} + description: null + definition: | + Insert Python definition here + automatic-tests: false + manual-tests: [] + + - name: awkward_NumpyArray_utf8_to_utf32_padded + specializations: + - name: awkward_NumpyArray_utf8_to_utf32_padded + args: + - {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default} + - {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets} + - {name: offsetslength, type: "int64_t", dir: in, role: default} + - {name: maxcodepoints, type: "int64_t", dir: in, role: default} + - {name: toptr, type: "List[uint32_t]", dir: out} + description: null + definition: | + Insert Python definition here + automatic-tests: false + manual-tests: [] + + - name: awkward_NumpyArray_pad_zero_to_length + specializations: + - name: awkward_NumpyArray_pad_zero_to_length_uint8 + args: + - {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default} + - {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets} + - {name: offsetslength, type: "int64_t", dir: in, role: default} + - {name: target, type: "int64_t", dir: in, role: default} + - {name: toptr, type: "List[uint8_t]", dir: out} + description: null + definition: | + Insert Python definition here + automatic-tests: false + manual-tests: [] + - name: awkward_NumpyArray_subrange_equal specializations: - name: awkward_NumpyArray_subrange_equal_bool diff --git a/src/awkward/contents/listarray.py b/src/awkward/contents/listarray.py index ece4404487..036687b780 100644 --- a/src/awkward/contents/listarray.py +++ b/src/awkward/contents/listarray.py @@ -1488,9 +1488,9 @@ def _to_arrow(self, pyarrow, mask_node, validbytes, length, options): def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") if array_param in {"bytestring", "string"}: - # As our array-of-strings _may_ be empty, we should pass the dtype - dtype = np.str_ if array_param == "string" else np.bytes_ - return backend.nplike.asarray(self.to_list(), dtype=dtype) + return self.to_ListOffsetArray64(False)._to_backend_array( + allow_missing, backend + ) else: return self.to_RegularArray()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/listoffsetarray.py b/src/awkward/contents/listoffsetarray.py index cdaf7ab9a4..6104d51d54 100644 --- a/src/awkward/contents/listoffsetarray.py +++ b/src/awkward/contents/listoffsetarray.py @@ -1986,10 +1986,73 @@ def _to_arrow(self, pyarrow, mask_node, validbytes, length, options): def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") - if array_param in {"bytestring", "string"}: - # As our array-of-strings _may_ be empty, we should pass the dtype - dtype = np.str_ if array_param == "string" else np.bytes_ - return backend.nplike.asarray(self.to_list(), dtype=dtype) + if array_param == "string": + # Determine the widest string (in code points) + _max_code_points = backend.index_nplike.empty(1, dtype=np.int64) + backend[ + "awkward_NumpyArray_prepare_utf8_to_utf32_padded", + self._content.dtype.type, + self._offsets.dtype.type, + _max_code_points.dtype.type, + ]( + self._content.data, + self._offsets.data, + self._offsets.length, + _max_code_points, + ) + max_code_points = backend.index_nplike.index_as_shape_item( + _max_code_points[0] + ) + # Ensure that we have at-least length-1 bytestrings + if max_code_points is not unknown_length: + max_code_points = max(1, max_code_points) + + # Allocate the correct size buffer + total_code_points = max_code_points * self.length + buffer = backend.nplike.empty(total_code_points, dtype=np.uint32) + + # Fill buffer with new uint32_t + self.backend[ + "awkward_NumpyArray_utf8_to_utf32_padded", + self._content.dtype.type, + self._offsets.dtype.type, + buffer.dtype.type, + ]( + self._content.data, + self._offsets.data, + self._offsets.length, + max_code_points, + buffer, + ) + return buffer.view(np.dtype(("U", max_code_points))) + elif array_param == "bytestring": + # Handle length=0 case + if self.starts.length is not unknown_length and self.starts.length == 0: + max_count = 0 + else: + max_count = backend.index_nplike.index_as_shape_item( + backend.index_nplike.max(self.stops.data - self.starts.data) + ) + + # Ensure that we have at-least length-1 bytestrings + if max_count is not unknown_length: + max_count = max(1, max_count) + + buffer = backend.nplike.empty(max_count * self.length, dtype=np.uint8) + + self.backend[ + "awkward_NumpyArray_pad_zero_to_length", + self._content.dtype.type, + self._offsets.dtype.type, + buffer.dtype.type, + ]( + self._content.data, + self._offsets.data, + self._offsets.length, + max_count, + buffer, + ) + return buffer.view(np.dtype(("S", max_count))) else: return self.to_RegularArray()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/regulararray.py b/src/awkward/contents/regulararray.py index be180ef9ab..b508f096a0 100644 --- a/src/awkward/contents/regulararray.py +++ b/src/awkward/contents/regulararray.py @@ -1230,10 +1230,55 @@ def _pad_none(self, target, axis, depth, clip): def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") - if array_param in {"bytestring", "string"}: - # As our array-of-strings _may_ be empty, we should pass the dtype - dtype = np.str_ if array_param == "string" else np.bytes_ - return backend.nplike.asarray(self.to_list(), dtype=dtype) + if array_param == "string": + offsets = self._compact_offsets64(True) + # Determine the widest string (in code points) + _max_code_points = backend.index_nplike.empty(1, dtype=np.int64) + backend[ + "awkward_NumpyArray_prepare_utf8_to_utf32_padded", + self._content.dtype.type, + offsets.dtype.type, + _max_code_points.dtype.type, + ]( + self._content.data, + offsets.data, + offsets.length, + _max_code_points, + ) + max_code_points = backend.index_nplike.index_as_shape_item( + _max_code_points[0] + ) + # Ensure that we have at-least length-1 bytestrings + if max_code_points is not unknown_length: + max_code_points = max(1, max_code_points) + + # Allocate the correct size buffer + total_code_points = max_code_points * self.length + buffer = backend.nplike.empty(total_code_points, dtype=np.uint32) + + # Fill buffer with new uint32_t + self.backend[ + "awkward_NumpyArray_utf8_to_utf32_padded", + self._content.dtype.type, + offsets.dtype.type, + buffer.dtype.type, + ]( + self._content.data, + offsets.data, + offsets.length, + max_code_points, + buffer, + ) + return buffer.view(np.dtype(("U", max_code_points))) + elif array_param == "bytestring": + # Ensure that we have at-least length-1 bytestrings + if self._size is not unknown_length and self._size == 0: + # Create new empty-buffer + return backend.nplike.zeros(self.length, dtype=np.uint8).view( + np.dtype(("S", 1)) + ) + else: + return self._content.data.view(np.dtype(("S", self._size))) else: out = self._content._to_backend_array(allow_missing, backend) shape = (self._length, self._size) + out.shape[1:] diff --git a/tests/test_2631_vectorised_to_numpy_strings.py b/tests/test_2631_vectorised_to_numpy_strings.py new file mode 100644 index 0000000000..196995fd93 --- /dev/null +++ b/tests/test_2631_vectorised_to_numpy_strings.py @@ -0,0 +1,21 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import numpy as np + +import awkward as ak + + +def test_string(): + source = ak.Array(["abc$¢€", "d¢#", "€e¢", "💰💰"]) + result = source.to_numpy(False) + expected = np.array(["abc$¢€", "d¢#", "€e¢", "💰💰"]) + assert result.dtype == expected.dtype + np.testing.assert_equal(result, expected) + + +def test_bytestring(): + source = ak.Array([b"foo", b"bar", b"catastrophic", b"\x03\x07"]) + result = source.to_numpy(False) + expected = np.array([b"foo", b"bar", b"catastrophic", b"\x03\x07"]) + assert result.dtype == expected.dtype + np.testing.assert_equal(result, expected)