feat: add CPU kernel for to_numpy support for strings/bytestrings (#…

…2631)
scikit-hep · Aug 11, 2023 · 2d21296 · 2d21296
1 parent 8ba3e30
commit 2d21296
Show file tree

Hide file tree

Showing 10 changed files with 391 additions and 11 deletions.
diff --git a/awkward-cpp/include/awkward/unicode.h b/awkward-cpp/include/awkward/unicode.h
@@ -0,0 +1,23 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+#include <cstddef>
+#include <cstdint>
+
+#ifndef AWKWARD_UNICODE_H_
+#define AWKWARD_UNICODE_H_
+
+
+#define UTF8_ONE_BYTE_MASK 0x80  // a lead byte starts a 1-byte sequence when (byte & MASK) == BITS, i.e. 0xxxxxxx
+#define UTF8_ONE_BYTE_BITS 0
+#define UTF8_TWO_BYTES_MASK 0xE0  // lead byte of a 2-byte sequence: 110xxxxx
+#define UTF8_TWO_BYTES_BITS 0xC0
+#define UTF8_THREE_BYTES_MASK 0xF0  // lead byte of a 3-byte sequence: 1110xxxx
+#define UTF8_THREE_BYTES_BITS 0xE0
+#define UTF8_FOUR_BYTES_MASK 0xF8  // lead byte of a 4-byte sequence: 11110xxx
+#define UTF8_FOUR_BYTES_BITS 0xF0
+#define UTF8_CONTINUATION_MASK 0xC0  // trailing byte: 10xxxxxx; `& ~MASK` keeps its 6 payload bits
+#define UTF8_CONTINUATION_BITS 0x80
+// Returns the length in bytes (1-4) of the UTF-8 sequence introduced by the lead byte `byte`,
+// or 0 when `byte` matches none of the lead-byte patterns above (callers must handle 0).
+size_t utf8_codepoint_size(const uint8_t byte);
+
+#endif // AWKWARD_UNICODE_H_
diff --git a/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp
@@ -0,0 +1,45 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp", line)
+
+#include "awkward/kernels.h"
+
+
+// Copies every sublist of `fromptr` (sublist k spans fromoffsets[k] .. fromoffsets[k+1])
+// into `toptr` back to back, appending zeros after each one until it occupies `target`
+// elements.
+//
+//   fromptr       source elements
+//   fromoffsets   `offsetslength` offsets delimiting `offsetslength - 1` sublists
+//   target        padded width of each output sublist, in elements
+//   toptr         destination; written sequentially starting at index 0
+//
+// A sublist longer than `target` is copied in full and receives no padding, so the
+// caller is expected to pass a `target` at least as large as the longest sublist.
+// Always returns success().
+template <typename T>
+ERROR awkward_NumpyArray_pad_zero_to_length(
+    const T* fromptr,
+    const int64_t* fromoffsets,
+    int64_t offsetslength,
+    int64_t target,
+    T* toptr) {
+    int64_t l_to_char = 0;
+
+    // The index is int64_t on purpose: `auto k = 0` deduces int, which overflows
+    // before reaching an int64_t bound larger than INT_MAX.
+    for (int64_t k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) {
+        const int64_t start = fromoffsets[k_sublist];
+        const int64_t stop = fromoffsets[k_sublist + 1];
+
+        // Copy the sublist contents
+        for (int64_t j_from_char = start; j_from_char < stop; j_from_char++) {
+            toptr[l_to_char++] = fromptr[j_from_char];
+        }
+
+        // Zero-fill up to the target width (no-op when the sublist is already that long)
+        const int64_t n_to_pad = target - (stop - start);
+        for (int64_t j_pad = 0; j_pad < n_to_pad; j_pad++) {
+            toptr[l_to_char++] = 0;
+        }
+    }
+
+    return success();
+}
+
+// uint8_t entry point for the pad-to-length kernel: forwards all arguments unchanged
+// to the awkward_NumpyArray_pad_zero_to_length<uint8_t> template and returns its result.
+ERROR awkward_NumpyArray_pad_zero_to_length_uint8(
+    const uint8_t* fromptr,
+    const int64_t* fromoffsets,
+    int64_t offsetslength,
+    int64_t target,
+    uint8_t* toptr) {
+  return awkward_NumpyArray_pad_zero_to_length<uint8_t>(
+    fromptr, fromoffsets, offsetslength, target, toptr);
+}
diff --git a/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp
@@ -0,0 +1,40 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp", line)
+
+#include "awkward/kernels.h"
+#include "awkward/unicode.h"
+
+
+// Scans UTF-8 encoded sublists and reports the largest number of code points found in
+// any single sublist.
+//
+//   fromptr            UTF-8 code units (bytes)
+//   fromoffsets        `offsetslength` offsets delimiting `offsetslength - 1` sublists
+//   outmaxcodepoints   output: *outmaxcodepoints receives the maximum code-point count
+//                      (0 when there are no sublists)
+//
+// Only lead bytes are inspected; each one is classified by utf8_codepoint_size() and the
+// scan jumps ahead by the reported width. Returns failure when a byte at a code-point
+// boundary is not a valid UTF-8 lead byte, in which case *outmaxcodepoints holds the
+// maximum over the sublists completed so far.
+ERROR awkward_NumpyArray_prepare_utf8_to_utf32_padded(
+  const uint8_t *fromptr,
+    const int64_t *fromoffsets,
+      int64_t offsetslength,
+      int64_t *outmaxcodepoints) {
+
+  *outmaxcodepoints = 0;
+  // Do not touch fromoffsets when it is empty
+  int64_t i_code_unit = (offsetslength > 0) ? fromoffsets[0] : 0;
+
+  // For each sublist of code units (int64_t index: `auto k = 0` would deduce int)
+  for (int64_t k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) {
+    const int64_t n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist];
+    const int64_t j_code_unit_last = i_code_unit + n_code_units;
+    int64_t n_code_point_sublist = 0;
+
+    // Repeat until we exhaust the code units within this sublist
+    while (i_code_unit < j_code_unit_last) {
+      const int64_t code_point_width =
+        static_cast<int64_t>(utf8_codepoint_size(fromptr[i_code_unit]));
+
+      // A width of 0 means "not a lead byte"; advancing by 0 would never terminate,
+      // so report the same error as the conversion kernel instead.
+      if (code_point_width == 0) {
+        return failure("could not convert UTF8 code point to UTF32: invalid byte in UTF8 string", kSliceNone, fromptr[i_code_unit], FILENAME(__LINE__));
+      }
+
+      // Shift the code-unit start index and count the code point
+      i_code_unit += code_point_width;
+      n_code_point_sublist++;
+    }
+
+    // Track the largest sublist length (in code points)
+    if (*outmaxcodepoints < n_code_point_sublist) {
+      *outmaxcodepoints = n_code_point_sublist;
+    }
+  }
+
+  return success();
+}
diff --git a/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp b/awkward-cpp/src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp
@@ -0,0 +1,73 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp", line)
+
+#include "awkward/kernels.h"
+#include "awkward/unicode.h"
+
+
+// Decodes UTF-8 encoded sublists into UTF-32 code points, writing the sublists to
+// `toptr` back to back and zero-padding each one to `maxcodepoints` entries.
+//
+//   fromptr         UTF-8 code units (bytes)
+//   fromoffsets     `offsetslength` offsets delimiting `offsetslength - 1` sublists
+//   maxcodepoints   padded width of every output sublist, in code points; expected to
+//                   be at least the longest sublist's code-point count (a longer
+//                   sublist is written in full and receives no padding)
+//   toptr           destination; written sequentially starting at index 0
+//
+// Returns failure when a byte at a code-point boundary is not a valid UTF-8 lead byte,
+// or when a multi-byte sequence would extend past the end of its sublist. Continuation
+// bytes are not validated; only their low 6 bits are used.
+ERROR awkward_NumpyArray_utf8_to_utf32_padded(
+  const uint8_t *fromptr,
+    const int64_t *fromoffsets,
+      int64_t offsetslength,
+      int64_t maxcodepoints,
+      uint32_t *toptr) {
+
+  // Do not touch fromoffsets when it is empty
+  int64_t i_code_unit = (offsetslength > 0) ? fromoffsets[0] : 0;
+  int64_t n_code_point = 0;
+
+  // For each sublist of code units (int64_t index: `auto k = 0` would deduce int)
+  for (int64_t k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) {
+    const int64_t n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist];
+    const int64_t j_code_unit_last = i_code_unit + n_code_units;
+    int64_t n_code_point_sublist = 0;
+
+    // Repeat until we exhaust the code units within this sublist
+    while (i_code_unit < j_code_unit_last) {
+      const int64_t code_point_width =
+        static_cast<int64_t>(utf8_codepoint_size(fromptr[i_code_unit]));
+
+      // Refuse to read continuation bytes that lie beyond this sublist: the original
+      // data may end right here, so fromptr[i_code_unit + 1..3] could be out of bounds.
+      if (i_code_unit + code_point_width > j_code_unit_last) {
+        return failure("could not convert UTF8 code point to UTF32: truncated multi-byte sequence in UTF8 string", kSliceNone, fromptr[i_code_unit], FILENAME(__LINE__));
+      }
+
+      // Parse a single code point: payload bits of the lead byte, followed by the
+      // 6 payload bits of each continuation byte
+      const uint8_t *unit = fromptr + i_code_unit;
+      switch (code_point_width) {
+      case 1:
+        toptr[n_code_point] = (static_cast<uint32_t>(unit[0]) & ~UTF8_ONE_BYTE_MASK);
+        break;
+      case 2:
+        toptr[n_code_point] =
+          (static_cast<uint32_t>(unit[0]) & ~UTF8_TWO_BYTES_MASK) << 6 |
+          (static_cast<uint32_t>(unit[1]) & ~UTF8_CONTINUATION_MASK);
+        break;
+      case 3:
+        toptr[n_code_point] =
+          (static_cast<uint32_t>(unit[0]) & ~UTF8_THREE_BYTES_MASK) << 12 |
+          (static_cast<uint32_t>(unit[1]) & ~UTF8_CONTINUATION_MASK) << 6 |
+          (static_cast<uint32_t>(unit[2]) & ~UTF8_CONTINUATION_MASK);
+        break;
+      case 4:
+        toptr[n_code_point] =
+          (static_cast<uint32_t>(unit[0]) & ~UTF8_FOUR_BYTES_MASK) << 18 |
+          (static_cast<uint32_t>(unit[1]) & ~UTF8_CONTINUATION_MASK) << 12 |
+          (static_cast<uint32_t>(unit[2]) & ~UTF8_CONTINUATION_MASK) << 6 |
+          (static_cast<uint32_t>(unit[3]) & ~UTF8_CONTINUATION_MASK);
+        break;
+      default:
+        // Width 0: the byte is not a valid UTF-8 lead byte
+        return failure("could not convert UTF8 code point to UTF32: invalid byte in UTF8 string", kSliceNone, fromptr[i_code_unit], FILENAME(__LINE__));
+      }
+
+      // Advance the output cursor, the input cursor and the per-sublist counter
+      n_code_point++;
+      i_code_unit += code_point_width;
+      n_code_point_sublist++;
+    }
+
+    // Zero pad the remaining characters
+    const int64_t n_pad_code_points = maxcodepoints - n_code_point_sublist;
+    for (int64_t j_pad = 0; j_pad < n_pad_code_points; j_pad++) {
+      toptr[n_code_point++] = 0;
+    }
+  }
+
+  return success();
+}
diff --git a/awkward-cpp/src/cpu-kernels/unicode.cpp b/awkward-cpp/src/cpu-kernels/unicode.cpp
@@ -0,0 +1,26 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/unicode.cpp", line)
+
+#include "awkward/unicode.h"
+
+
+// Classifies `byte` as the lead byte of a UTF-8 sequence and returns the length of that
+// sequence in bytes (1-4). Returns 0 when `byte` matches none of the lead-byte patterns.
+size_t utf8_codepoint_size(const uint8_t byte) {
+	// Row i holds {mask, expected bits} for a sequence of i + 1 bytes
+	static const uint8_t lead_patterns[4][2] = {
+		{UTF8_ONE_BYTE_MASK, UTF8_ONE_BYTE_BITS},
+		{UTF8_TWO_BYTES_MASK, UTF8_TWO_BYTES_BITS},
+		{UTF8_THREE_BYTES_MASK, UTF8_THREE_BYTES_BITS},
+		{UTF8_FOUR_BYTES_MASK, UTF8_FOUR_BYTES_BITS},
+	};
+
+	for (size_t n_bytes = 1; n_bytes <= 4; n_bytes++) {
+		const uint8_t* pattern = lead_patterns[n_bytes - 1];
+		if ((byte & pattern[0]) == pattern[1]) {
+			return n_bytes;
+		}
+	}
+
+	// Not a lead byte
+	return 0;
+}
diff --git a/kernel-specification.yml b/kernel-specification.yml
@@ -3357,6 +3357,50 @@ kernels:
     automatic-tests: false
     manual-tests: []
 
+  - name: awkward_NumpyArray_prepare_utf8_to_utf32_padded
+    specializations:
+      - name: awkward_NumpyArray_prepare_utf8_to_utf32_padded
+        args:
+          - {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default}
+          - {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets}
+          - {name: offsetslength, type: "int64_t", dir: in, role: default}
+          - {name: outmaxcodepoints, type: "List[int64_t]", dir: out}
+    description: null  # kernel writes to outmaxcodepoints[0] the largest UTF-8 code-point count found in any sublist
+    definition: |
+      Insert Python definition here
+    automatic-tests: false
+    manual-tests: []
+
+  - name: awkward_NumpyArray_utf8_to_utf32_padded
+    specializations:
+      - name: awkward_NumpyArray_utf8_to_utf32_padded
+        args:
+          - {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default}
+          - {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets}
+          - {name: offsetslength, type: "int64_t", dir: in, role: default}
+          - {name: maxcodepoints, type: "int64_t", dir: in,  role: default}
+          - {name: toptr, type: "List[uint32_t]", dir: out}
+    description: null  # kernel decodes each UTF-8 sublist into UTF-32 code points in toptr, zero-padding each sublist to maxcodepoints entries
+    definition: |
+      Insert Python definition here
+    automatic-tests: false
+    manual-tests: []
+
+  - name: awkward_NumpyArray_pad_zero_to_length
+    specializations:
+      - name: awkward_NumpyArray_pad_zero_to_length_uint8
+        args:
+          - {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default}
+          - {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets}
+          - {name: offsetslength, type: "int64_t", dir: in, role: default}
+          - {name: target, type: "int64_t", dir: in,  role: default}
+          - {name: toptr, type: "List[uint8_t]", dir: out}
+    description: null  # kernel copies each sublist into toptr and appends zeros until the sublist occupies target elements
+    definition: |
+      Insert Python definition here
+    automatic-tests: false
+    manual-tests: []
+
   - name: awkward_NumpyArray_subrange_equal
     specializations:
       - name: awkward_NumpyArray_subrange_equal_bool

diff --git a/src/awkward/contents/listarray.py b/src/awkward/contents/listarray.py
@@ -1488,9 +1488,9 @@ def _to_arrow(self, pyarrow, mask_node, validbytes, length, options):
     def _to_backend_array(self, allow_missing, backend):
         array_param = self.parameter("__array__")
         if array_param in {"bytestring", "string"}:
-            # As our array-of-strings _may_ be empty, we should pass the dtype
-            dtype = np.str_ if array_param == "string" else np.bytes_
-            return backend.nplike.asarray(self.to_list(), dtype=dtype)
+            return self.to_ListOffsetArray64(False)._to_backend_array(  # strings/bytestrings: convert to ListOffsetArray64 and reuse its implementation
+                allow_missing, backend
+            )
         else:
             return self.to_RegularArray()._to_backend_array(allow_missing, backend)
 

diff --git a/src/awkward/contents/listoffsetarray.py b/src/awkward/contents/listoffsetarray.py
@@ -1986,10 +1986,73 @@ def _to_arrow(self, pyarrow, mask_node, validbytes, length, options):
 
     def _to_backend_array(self, allow_missing, backend):
         array_param = self.parameter("__array__")
-        if array_param in {"bytestring", "string"}:
-            # As our array-of-strings _may_ be empty, we should pass the dtype
-            dtype = np.str_ if array_param == "string" else np.bytes_
-            return backend.nplike.asarray(self.to_list(), dtype=dtype)
+        if array_param == "string":
+            # Determine the widest string (in code points); the kernel writes it into this 1-element buffer
+            _max_code_points = backend.index_nplike.empty(1, dtype=np.int64)
+            backend[  # NOTE(review): the kernel's return value is not inspected here - confirm errors cannot be lost
+                "awkward_NumpyArray_prepare_utf8_to_utf32_padded",
+                self._content.dtype.type,
+                self._offsets.dtype.type,
+                _max_code_points.dtype.type,
+            ](
+                self._content.data,
+                self._offsets.data,
+                self._offsets.length,
+                _max_code_points,
+            )
+            max_code_points = backend.index_nplike.index_as_shape_item(
+                _max_code_points[0]
+            )
+            # Ensure that we have at-least length-1 strings
+            if max_code_points is not unknown_length:
+                max_code_points = max(1, max_code_points)
+
+            # Allocate the correct size buffer: one uint32 per code point, max_code_points per string
+            total_code_points = max_code_points * self.length
+            buffer = backend.nplike.empty(total_code_points, dtype=np.uint32)
+
+            # Fill buffer with new uint32_t (decoded code points, zero-padded per string)
+            self.backend[  # NOTE(review): uses self.backend while the call above uses the `backend` argument - confirm which is intended
+                "awkward_NumpyArray_utf8_to_utf32_padded",
+                self._content.dtype.type,
+                self._offsets.dtype.type,
+                buffer.dtype.type,
+            ](
+                self._content.data,
+                self._offsets.data,
+                self._offsets.length,
+                max_code_points,
+                buffer,
+            )
+            return buffer.view(np.dtype(("U", max_code_points)))  # reinterpret the uint32 buffer as fixed-width unicode strings
+        elif array_param == "bytestring":
+            # Handle length=0 case (skips the max() over an empty array)
+            if self.starts.length is not unknown_length and self.starts.length == 0:
+                max_count = 0
+            else:
+                max_count = backend.index_nplike.index_as_shape_item(
+                    backend.index_nplike.max(self.stops.data - self.starts.data)
+                )
+
+            # Ensure that we have at-least length-1 bytestrings
+            if max_count is not unknown_length:
+                max_count = max(1, max_count)
+
+            buffer = backend.nplike.empty(max_count * self.length, dtype=np.uint8)
+
+            self.backend[  # NOTE(review): uses self.backend rather than the `backend` argument - confirm which is intended
+                "awkward_NumpyArray_pad_zero_to_length",
+                self._content.dtype.type,
+                self._offsets.dtype.type,
+                buffer.dtype.type,
+            ](
+                self._content.data,
+                self._offsets.data,
+                self._offsets.length,
+                max_count,
+                buffer,
+            )
+            return buffer.view(np.dtype(("S", max_count)))  # reinterpret the uint8 buffer as fixed-width byte strings
         else:
             return self.to_RegularArray()._to_backend_array(allow_missing, backend)