-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add CPU kernel for `to_numpy` support for strings/bytestrings (#…
- Loading branch information
Showing 10 changed files with 391 additions and 11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE | ||
#include <cstddef> | ||
#include <cstdint> | ||
|
||
#ifndef AWKWARD_UNICODE_H_ | ||
#define AWKWARD_UNICODE_H_ | ||
|
||
|
||
// Bit patterns for classifying UTF-8 bytes. A byte `b` matches a pattern when
// (b & <NAME>_MASK) == <NAME>_BITS; `b & ~<NAME>_MASK` keeps the remaining
// (payload) bits of that byte.

// 0xxxxxxx: lead byte of a one-byte sequence.
#define UTF8_ONE_BYTE_MASK 0x80 | ||
#define UTF8_ONE_BYTE_BITS 0 | ||
// 110xxxxx: lead byte of a two-byte sequence.
#define UTF8_TWO_BYTES_MASK 0xE0 | ||
#define UTF8_TWO_BYTES_BITS 0xC0 | ||
// 1110xxxx: lead byte of a three-byte sequence.
#define UTF8_THREE_BYTES_MASK 0xF0 | ||
#define UTF8_THREE_BYTES_BITS 0xE0 | ||
// 11110xxx: lead byte of a four-byte sequence.
#define UTF8_FOUR_BYTES_MASK 0xF8 | ||
#define UTF8_FOUR_BYTES_BITS 0xF0 | ||
// 10xxxxxx: continuation byte (second and later bytes of a sequence).
#define UTF8_CONTINUATION_MASK 0xC0 | ||
#define UTF8_CONTINUATION_BITS 0x80 | ||
|
||
|
||
// Returns the length in bytes (1 to 4) of the UTF-8 sequence introduced by the
// lead byte `byte`, or 0 if `byte` matches none of the lead-byte patterns
// above (for example, a continuation byte).
size_t utf8_codepoint_size(const uint8_t byte); | ||
|
||
#endif // AWKWARD_UNICODE_H_ |
45 changes: 45 additions & 0 deletions
45
awkward-cpp/src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE | ||
|
||
#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_pad_zero_to_length.cpp", line) | ||
|
||
#include "awkward/kernels.h" | ||
|
||
|
||
template <typename T> | ||
ERROR awkward_NumpyArray_pad_zero_to_length( | ||
const T* fromptr, | ||
const int64_t* fromoffsets, | ||
int64_t offsetslength, | ||
int64_t target, | ||
T* toptr) { | ||
int64_t l_to_char = 0; | ||
|
||
// For each sublist | ||
for (auto k_sublist = 0; k_sublist < offsetslength-1; k_sublist++) { | ||
// Copy from src to dst | ||
for (int64_t j_from_char=fromoffsets[k_sublist]; j_from_char<fromoffsets[k_sublist+1]; j_from_char++) { | ||
toptr[l_to_char++] = fromptr[j_from_char]; | ||
} | ||
// Pad to remaining width | ||
auto n_to_pad = target - (fromoffsets[k_sublist+1] - fromoffsets[k_sublist]); | ||
for (int64_t j_from_char=0; j_from_char<n_to_pad; j_from_char++){ | ||
toptr[l_to_char++] = 0; | ||
} | ||
} | ||
|
||
return success(); | ||
} | ||
|
||
ERROR awkward_NumpyArray_pad_zero_to_length_uint8( | ||
const uint8_t* fromptr, | ||
const int64_t* fromoffsets, | ||
int64_t offsetslength, | ||
int64_t target, | ||
uint8_t* toptr) { | ||
return awkward_NumpyArray_pad_zero_to_length<uint8_t>( | ||
fromptr, | ||
fromoffsets, | ||
offsetslength, | ||
target, | ||
toptr); | ||
} |
40 changes: 40 additions & 0 deletions
40
awkward-cpp/src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE | ||
|
||
#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_prepare_utf8_to_utf32_padded.cpp", line) | ||
|
||
#include "awkward/kernels.h" | ||
#include "awkward/unicode.h" | ||
|
||
|
||
ERROR awkward_NumpyArray_prepare_utf8_to_utf32_padded( | ||
const uint8_t *fromptr, | ||
const int64_t *fromoffsets, | ||
int64_t offsetslength, | ||
int64_t *outmaxcodepoints) { | ||
|
||
*outmaxcodepoints = 0; | ||
int64_t i_code_unit = fromoffsets[0]; | ||
int64_t code_point_width; | ||
|
||
// For each sublist of code units | ||
for (auto k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) { | ||
auto n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist]; | ||
auto n_code_point_sublist = 0; | ||
|
||
// Repeat until we exhaust the code units within this sublist | ||
for (auto j_code_unit_last = i_code_unit + n_code_units; i_code_unit < j_code_unit_last;) { | ||
code_point_width = utf8_codepoint_size(fromptr[i_code_unit]); | ||
|
||
// Shift the code-unit start index | ||
i_code_unit += code_point_width; | ||
|
||
// Increment the code-point counter for this sublist | ||
n_code_point_sublist += 1; | ||
} | ||
|
||
// Set largest substring length (in code points) | ||
*outmaxcodepoints = ( *outmaxcodepoints < n_code_point_sublist) ? n_code_point_sublist : *outmaxcodepoints; | ||
} | ||
|
||
return success(); | ||
} |
73 changes: 73 additions & 0 deletions
73
awkward-cpp/src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE | ||
|
||
#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/awkward_NumpyArray_utf8_to_utf32_padded.cpp", line) | ||
|
||
#include "awkward/kernels.h" | ||
#include "awkward/unicode.h" | ||
|
||
|
||
ERROR awkward_NumpyArray_utf8_to_utf32_padded( | ||
const uint8_t *fromptr, | ||
const int64_t *fromoffsets, | ||
int64_t offsetslength, | ||
int64_t maxcodepoints, | ||
uint32_t *toptr) { | ||
|
||
int64_t i_code_unit = fromoffsets[0]; | ||
int64_t code_point_width; | ||
int64_t n_code_point = 0; | ||
|
||
// For each sublist of code units | ||
for (auto k_sublist = 0; k_sublist < offsetslength - 1; k_sublist++) { | ||
auto n_code_units = fromoffsets[k_sublist + 1] - fromoffsets[k_sublist]; | ||
int64_t n_code_point_sublist = 0; | ||
|
||
// Repeat until we exhaust the code units within this sublist | ||
for (auto j_code_unit_last = i_code_unit + n_code_units; i_code_unit < j_code_unit_last;) { | ||
// Parse a single codepoint | ||
code_point_width = utf8_codepoint_size(fromptr[i_code_unit]); | ||
switch (code_point_width) { | ||
case 1: | ||
toptr[n_code_point] = ((uint32_t) fromptr[i_code_unit] & ~UTF8_ONE_BYTE_MASK); | ||
break; | ||
case 2: | ||
toptr[n_code_point] = | ||
((uint32_t) fromptr[i_code_unit] & ~UTF8_TWO_BYTES_MASK) << 6 | | ||
((uint32_t) fromptr[i_code_unit + 1] & ~UTF8_CONTINUATION_MASK); | ||
break; | ||
case 3: | ||
toptr[n_code_point] = | ||
((uint32_t) fromptr[i_code_unit] & ~UTF8_THREE_BYTES_MASK) << 12 | | ||
((uint32_t) fromptr[i_code_unit + 1] & ~UTF8_CONTINUATION_MASK) << 6 | | ||
((uint32_t) fromptr[i_code_unit + 2] & ~UTF8_CONTINUATION_MASK); | ||
|
||
break; | ||
case 4: | ||
toptr[n_code_point] = | ||
((uint32_t) fromptr[i_code_unit] & ~UTF8_FOUR_BYTES_MASK) << 18 | | ||
((uint32_t) fromptr[i_code_unit + 1] & ~UTF8_CONTINUATION_MASK) << 12 | | ||
((uint32_t) fromptr[i_code_unit + 2] & ~UTF8_CONTINUATION_MASK) << 6 | | ||
((uint32_t) fromptr[i_code_unit + 3] & ~UTF8_CONTINUATION_MASK); | ||
break; | ||
default: | ||
return failure("could not convert UTF8 code point to UTF32: invalid byte in UTF8 string", kSliceNone, fromptr[i_code_unit], FILENAME(__LINE__)); | ||
} | ||
// Increment the code-point counter | ||
n_code_point++; | ||
|
||
// Shift the code-unit start index | ||
i_code_unit += code_point_width; | ||
|
||
// Increment the code-point counter for this sublist | ||
n_code_point_sublist += 1; | ||
} | ||
|
||
// Zero pad the remaining characters | ||
int64_t n_pad_code_points = maxcodepoints - n_code_point_sublist; | ||
for (auto j = 0; j < n_pad_code_points; j++) { | ||
toptr[n_code_point++] = 0; | ||
} | ||
} | ||
|
||
return success(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE | ||
|
||
#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_C("src/cpu-kernels/unicode.cpp", line) | ||
|
||
#include "awkward/unicode.h" | ||
|
||
|
||
size_t utf8_codepoint_size(const uint8_t byte) { | ||
if ((byte & UTF8_ONE_BYTE_MASK) == UTF8_ONE_BYTE_BITS) { | ||
return 1; | ||
} | ||
|
||
if ((byte & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES_BITS) { | ||
return 2; | ||
} | ||
|
||
if ((byte & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES_BITS) { | ||
return 3; | ||
} | ||
|
||
if ((byte & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES_BITS) { | ||
return 4; | ||
} | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.