Skip to content

Commit

Permalink
feat: add CPU kernel for to_numpy support for strings/bytestrings (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
agoose77 authored Aug 11, 2023
1 parent 8ba3e30 commit 2d21296
Show file tree
Hide file tree
Showing 10 changed files with 391 additions and 11 deletions.
23 changes: 23 additions & 0 deletions awkward-cpp/include/awkward/unicode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
#include <cstddef>
#include <cstdint>

#ifndef AWKWARD_UNICODE_H_
#define AWKWARD_UNICODE_H_


// UTF-8 byte classification constants.
//
// Each *_MASK / *_BITS pair describes one byte pattern: a byte matches when
// `(byte & MASK) == BITS`. The bits cleared in MASK are the payload bits the
// byte contributes to the code point, so `byte & ~MASK` extracts them.

// 0xxxxxxx: a complete single-byte sequence (7 payload bits)
#define UTF8_ONE_BYTE_MASK 0x80
#define UTF8_ONE_BYTE_BITS 0
// 110xxxxx: lead byte of a two-byte sequence (5 payload bits)
#define UTF8_TWO_BYTES_MASK 0xE0
#define UTF8_TWO_BYTES_BITS 0xC0
// 1110xxxx: lead byte of a three-byte sequence (4 payload bits)
#define UTF8_THREE_BYTES_MASK 0xF0
#define UTF8_THREE_BYTES_BITS 0xE0
// 11110xxx: lead byte of a four-byte sequence (3 payload bits)
#define UTF8_FOUR_BYTES_MASK 0xF8
#define UTF8_FOUR_BYTES_BITS 0xF0
// 10xxxxxx: continuation byte following a lead byte (6 payload bits)
#define UTF8_CONTINUATION_MASK 0xC0
#define UTF8_CONTINUATION_BITS 0x80


// Returns the number of bytes (1-4) in the UTF-8 sequence that starts with
// `byte`, judged from the lead-byte patterns above only. Returns 0 when
// `byte` matches none of them (e.g. a continuation byte), so callers must
// treat 0 as "invalid" rather than use it to advance through a buffer.
size_t utf8_codepoint_size(const uint8_t byte);

#endif // AWKWARD_UNICODE_H_
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp", line)

#include "awkward/kernels.h"


// Copy every sublist of `fromptr` into `toptr`, appending zeros so that each
// sublist occupies `target` elements in the output.
//
//   fromptr       : flat source buffer
//   fromoffsets   : `offsetslength` offsets; sublist k spans
//                   [fromoffsets[k], fromoffsets[k + 1])
//   offsetslength : number of entries in `fromoffsets`
//   target        : padded width of each output sublist
//   toptr         : destination, written sequentially from index 0
//
// A sublist longer than `target` is copied in full and receives no padding,
// so the caller is expected to pass a `target` at least as large as the
// longest sublist and to size `toptr` accordingly.
template <typename T>
ERROR awkward_NumpyArray_pad_zero_to_length(
    const T* fromptr,
    const int64_t* fromoffsets,
    int64_t offsetslength,
    int64_t target,
    T* toptr) {
  int64_t l_to_char = 0;

  // The sublist index must be 64-bit: `offsetslength` is int64_t, and a
  // plain `int` counter would overflow for more than INT_MAX sublists.
  for (int64_t k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) {
    const int64_t start = fromoffsets[k_sublist];
    const int64_t stop = fromoffsets[k_sublist + 1];

    // Copy from src to dst
    for (int64_t j_from_char = start; j_from_char < stop; j_from_char++) {
      toptr[l_to_char++] = fromptr[j_from_char];
    }

    // Pad to remaining width (no-op when the sublist already fills it)
    const int64_t n_to_pad = target - (stop - start);
    for (int64_t j_pad = 0; j_pad < n_to_pad; j_pad++) {
      toptr[l_to_char++] = 0;
    }
  }

  return success();
}

// uint8_t entry point for the zero-padding kernel: forwards all arguments
// unchanged to the generic implementation.
ERROR awkward_NumpyArray_pad_zero_to_length_uint8(
    const uint8_t* fromptr,
    const int64_t* fromoffsets,
    int64_t offsetslength,
    int64_t target,
    uint8_t* toptr) {
  // The element type is deduced as uint8_t from `fromptr` and `toptr`.
  return awkward_NumpyArray_pad_zero_to_length(
      fromptr, fromoffsets, offsetslength, target, toptr);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp", line)

#include "awkward/kernels.h"
#include "awkward/unicode.h"


// Pre-pass for the UTF-8 -> UTF-32 conversion: finds the largest number of
// code points held by any sublist.
//
//   fromptr          : UTF-8 code units
//   fromoffsets      : `offsetslength` offsets delimiting the sublists
//   offsetslength    : number of entries in `fromoffsets`
//   outmaxcodepoints : receives the maximum per-sublist code-point count
//                      (0 when there are no sublists)
//
// Returns a failure if a byte that should start a code point is not a valid
// UTF-8 lead byte. `*outmaxcodepoints` is updated after every code point, so
// even on failure it covers everything scanned up to the offending byte.
//
// NOTE(review): a multi-byte sequence cut off by the end of its sublist is
// counted as one code point and the running index then overshoots the
// sublist, as in the original traversal. Confirm whether such input should
// be rejected instead.
ERROR awkward_NumpyArray_prepare_utf8_to_utf32_padded(
    const uint8_t *fromptr,
    const int64_t *fromoffsets,
    int64_t offsetslength,
    int64_t *outmaxcodepoints) {

  *outmaxcodepoints = 0;

  // Without any offsets there is nothing to scan; this also avoids reading
  // fromoffsets[0] from an empty array.
  if (offsetslength < 1) {
    return success();
  }

  int64_t i_code_unit = fromoffsets[0];

  // For each sublist of code units (64-bit index to match offsetslength)
  for (int64_t k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) {
    const int64_t n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist];
    const int64_t j_code_unit_last = i_code_unit + n_code_units;
    int64_t n_code_point_sublist = 0;

    // Repeat until we exhaust the code units within this sublist
    while (i_code_unit < j_code_unit_last) {
      const int64_t code_point_width =
          static_cast<int64_t>(utf8_codepoint_size(fromptr[i_code_unit]));

      // A width of 0 means "not a lead byte". Advancing by 0 would never
      // terminate, so report the invalid byte instead.
      if (code_point_width == 0) {
        return failure("could not convert UTF8 code point to UTF32: invalid byte in UTF8 string", kSliceNone, fromptr[i_code_unit], FILENAME(__LINE__));
      }

      // Shift the code-unit start index
      i_code_unit += code_point_width;

      // Increment the code-point counter for this sublist
      n_code_point_sublist++;

      // Track the largest sublist length (in code points) seen so far
      if (*outmaxcodepoints < n_code_point_sublist) {
        *outmaxcodepoints = n_code_point_sublist;
      }
    }
  }

  return success();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp", line)

#include "awkward/kernels.h"
#include "awkward/unicode.h"


// Decode sublists of UTF-8 code units into UTF-32 code points, zero-padding
// each decoded sublist to `maxcodepoints` entries.
//
//   fromptr       : UTF-8 code units
//   fromoffsets   : `offsetslength` offsets delimiting the sublists
//   offsetslength : number of entries in `fromoffsets`
//   maxcodepoints : padded width (in code points) of each output sublist
//   toptr         : destination, written sequentially from index 0
//
// Returns a failure when a byte that should start a code point is not a
// valid lead byte, or when a multi-byte sequence would extend past the end
// of its sublist. Continuation bytes are not otherwise validated.
// A sublist holding more than `maxcodepoints` code points is written in full
// without padding, so the caller must pass a sufficiently large width.
ERROR awkward_NumpyArray_utf8_to_utf32_padded(
    const uint8_t *fromptr,
    const int64_t *fromoffsets,
    int64_t offsetslength,
    int64_t maxcodepoints,
    uint32_t *toptr) {

  // Without any offsets there is nothing to convert; this also avoids
  // reading fromoffsets[0] from an empty array.
  if (offsetslength < 1) {
    return success();
  }

  int64_t i_code_unit = fromoffsets[0];
  int64_t n_code_point = 0;

  // For each sublist of code units (64-bit index to match offsetslength)
  for (int64_t k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) {
    const int64_t n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist];
    const int64_t j_code_unit_last = i_code_unit + n_code_units;
    int64_t n_code_point_sublist = 0;

    // Repeat until we exhaust the code units within this sublist
    while (i_code_unit < j_code_unit_last) {
      const uint8_t lead = fromptr[i_code_unit];
      const int64_t code_point_width =
          static_cast<int64_t>(utf8_codepoint_size(lead));

      // Refuse to read continuation bytes that lie beyond this sublist
      // (and possibly beyond the end of the buffer).
      if (code_point_width > j_code_unit_last - i_code_unit) {
        return failure("could not convert UTF8 code point to UTF32: truncated multi-byte sequence in UTF8 string", kSliceNone, lead, FILENAME(__LINE__));
      }

      // Payload bits of the continuation byte `offset` places after the lead
      const auto continuation = [fromptr, i_code_unit](int64_t offset) {
        return static_cast<uint32_t>(fromptr[i_code_unit + offset]) & ~UTF8_CONTINUATION_MASK;
      };

      // Parse a single codepoint
      switch (code_point_width) {
        case 1:
          toptr[n_code_point] = static_cast<uint32_t>(lead) & ~UTF8_ONE_BYTE_MASK;
          break;
        case 2:
          toptr[n_code_point] =
              ((static_cast<uint32_t>(lead) & ~UTF8_TWO_BYTES_MASK) << 6) |
              continuation(1);
          break;
        case 3:
          toptr[n_code_point] =
              ((static_cast<uint32_t>(lead) & ~UTF8_THREE_BYTES_MASK) << 12) |
              (continuation(1) << 6) |
              continuation(2);
          break;
        case 4:
          toptr[n_code_point] =
              ((static_cast<uint32_t>(lead) & ~UTF8_FOUR_BYTES_MASK) << 18) |
              (continuation(1) << 12) |
              (continuation(2) << 6) |
              continuation(3);
          break;
        default:
          return failure("could not convert UTF8 code point to UTF32: invalid byte in UTF8 string", kSliceNone, lead, FILENAME(__LINE__));
      }

      // Increment the global and per-sublist code-point counters
      n_code_point++;
      n_code_point_sublist++;

      // Shift the code-unit start index
      i_code_unit += code_point_width;
    }

    // Zero pad the remaining characters
    const int64_t n_pad_code_points = maxcodepoints - n_code_point_sublist;
    for (int64_t j = 0; j < n_pad_code_points; j++) {
      toptr[n_code_point++] = 0;
    }
  }

  return success();
}
26 changes: 26 additions & 0 deletions awkward-cpp/src/cpu-kernels/unicode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/unicode.cpp", line)

#include "awkward/unicode.h"


// Classify `byte` as the first byte of a UTF-8 sequence and report how many
// bytes that sequence occupies (1-4). A byte that matches none of the
// lead-byte patterns yields 0.
size_t utf8_codepoint_size(const uint8_t byte) {
  // True when the bits selected by `mask` equal `bits`.
  const auto has_prefix = [byte](int mask, int bits) {
    return (byte & mask) == bits;
  };

  size_t width = 0;
  if (has_prefix(UTF8_ONE_BYTE_MASK, UTF8_ONE_BYTE_BITS)) {
    width = 1;
  } else if (has_prefix(UTF8_TWO_BYTES_MASK, UTF8_TWO_BYTES_BITS)) {
    width = 2;
  } else if (has_prefix(UTF8_THREE_BYTES_MASK, UTF8_THREE_BYTES_BITS)) {
    width = 3;
  } else if (has_prefix(UTF8_FOUR_BYTES_MASK, UTF8_FOUR_BYTES_BITS)) {
    width = 4;
  }
  return width;
}
44 changes: 44 additions & 0 deletions kernel-specification.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3357,6 +3357,50 @@ kernels:
automatic-tests: false
manual-tests: []

# Pre-pass for awkward_NumpyArray_utf8_to_utf32_padded: walks each sublist of
# UTF-8 code units delimited by `fromoffsets` and stores the largest
# per-sublist code-point count in outmaxcodepoints[0].
# NOTE(review): `definition` is still the template placeholder and automatic
# tests are disabled for this kernel.
- name: awkward_NumpyArray_prepare_utf8_to_utf32_padded
specializations:
- name: awkward_NumpyArray_prepare_utf8_to_utf32_padded
args:
- {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default}
- {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets}
- {name: offsetslength, type: "int64_t", dir: in, role: default}
- {name: outmaxcodepoints, type: "List[int64_t]", dir: out}
description: null
definition: |
Insert Python definition here
automatic-tests: false
manual-tests: []

# Decodes each sublist of UTF-8 code units into UTF-32 code points written
# sequentially to `toptr`, zero-padding every sublist to `maxcodepoints`
# entries. Fails on a byte that is not a valid UTF-8 lead byte.
# NOTE(review): `definition` is still the template placeholder and automatic
# tests are disabled for this kernel.
- name: awkward_NumpyArray_utf8_to_utf32_padded
specializations:
- name: awkward_NumpyArray_utf8_to_utf32_padded
args:
- {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default}
- {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets}
- {name: offsetslength, type: "int64_t", dir: in, role: default}
- {name: maxcodepoints, type: "int64_t", dir: in, role: default}
- {name: toptr, type: "List[uint32_t]", dir: out}
description: null
definition: |
Insert Python definition here
automatic-tests: false
manual-tests: []

# Copies each sublist of `fromptr` (delimited by `fromoffsets`) into `toptr`
# and appends zeros until the sublist occupies `target` elements.
# NOTE(review): `definition` is still the template placeholder and automatic
# tests are disabled for this kernel.
- name: awkward_NumpyArray_pad_zero_to_length
specializations:
- name: awkward_NumpyArray_pad_zero_to_length_uint8
args:
- {name: fromptr, type: "Const[List[uint8_t]]", dir: in, role: default}
- {name: fromoffsets, type: "Const[List[int64_t]]", dir: in, role: ListOffsetArray-offsets}
- {name: offsetslength, type: "int64_t", dir: in, role: default}
- {name: target, type: "int64_t", dir: in, role: default}
- {name: toptr, type: "List[uint8_t]", dir: out}
description: null
definition: |
Insert Python definition here
automatic-tests: false
manual-tests: []

- name: awkward_NumpyArray_subrange_equal
specializations:
- name: awkward_NumpyArray_subrange_equal_bool
Expand Down
6 changes: 3 additions & 3 deletions src/awkward/contents/listarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1488,9 +1488,9 @@ def _to_arrow(self, pyarrow, mask_node, validbytes, length, options):
def _to_backend_array(self, allow_missing, backend):
array_param = self.parameter("__array__")
if array_param in {"bytestring", "string"}:
# As our array-of-strings _may_ be empty, we should pass the dtype
dtype = np.str_ if array_param == "string" else np.bytes_
return backend.nplike.asarray(self.to_list(), dtype=dtype)
return self.to_ListOffsetArray64(False)._to_backend_array(
allow_missing, backend
)
else:
return self.to_RegularArray()._to_backend_array(allow_missing, backend)

Expand Down
71 changes: 67 additions & 4 deletions src/awkward/contents/listoffsetarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1986,10 +1986,73 @@ def _to_arrow(self, pyarrow, mask_node, validbytes, length, options):

def _to_backend_array(self, allow_missing, backend):
array_param = self.parameter("__array__")
if array_param in {"bytestring", "string"}:
# As our array-of-strings _may_ be empty, we should pass the dtype
dtype = np.str_ if array_param == "string" else np.bytes_
return backend.nplike.asarray(self.to_list(), dtype=dtype)
if array_param == "string":
# Determine the widest string (in code points)
_max_code_points = backend.index_nplike.empty(1, dtype=np.int64)
backend[
"awkward_NumpyArray_prepare_utf8_to_utf32_padded",
self._content.dtype.type,
self._offsets.dtype.type,
_max_code_points.dtype.type,
](
self._content.data,
self._offsets.data,
self._offsets.length,
_max_code_points,
)
max_code_points = backend.index_nplike.index_as_shape_item(
_max_code_points[0]
)
# Ensure that we have at-least length-1 bytestrings
if max_code_points is not unknown_length:
max_code_points = max(1, max_code_points)

# Allocate the correct size buffer
total_code_points = max_code_points * self.length
buffer = backend.nplike.empty(total_code_points, dtype=np.uint32)

# Fill buffer with new uint32_t
self.backend[
"awkward_NumpyArray_utf8_to_utf32_padded",
self._content.dtype.type,
self._offsets.dtype.type,
buffer.dtype.type,
](
self._content.data,
self._offsets.data,
self._offsets.length,
max_code_points,
buffer,
)
return buffer.view(np.dtype(("U", max_code_points)))
elif array_param == "bytestring":
# Handle length=0 case
if self.starts.length is not unknown_length and self.starts.length == 0:
max_count = 0
else:
max_count = backend.index_nplike.index_as_shape_item(
backend.index_nplike.max(self.stops.data - self.starts.data)
)

# Ensure that we have at-least length-1 bytestrings
if max_count is not unknown_length:
max_count = max(1, max_count)

buffer = backend.nplike.empty(max_count * self.length, dtype=np.uint8)

self.backend[
"awkward_NumpyArray_pad_zero_to_length",
self._content.dtype.type,
self._offsets.dtype.type,
buffer.dtype.type,
](
self._content.data,
self._offsets.data,
self._offsets.length,
max_count,
buffer,
)
return buffer.view(np.dtype(("S", max_count)))
else:
return self.to_RegularArray()._to_backend_array(allow_missing, backend)

Expand Down
Loading

0 comments on commit 2d21296

Please sign in to comment.