diff --git a/docs/prepare_docstrings.py b/docs/prepare_docstrings.py index 963a3cb408..35756f516a 100644 --- a/docs/prepare_docstrings.py +++ b/docs/prepare_docstrings.py @@ -303,6 +303,7 @@ def dofunction(link, linelink, shortname, name, astfcn): .replace(".behaviors.string", "") ) shortname = re.sub(r"\.operations\.ak_\w+", "", shortname) + shortname = re.sub(r"\.operations\.str\.akstr_\w+", ".str", shortname) shortname = re.sub(r"\.(contents|types|forms)\.\w+", r".\1", shortname) if ( diff --git a/docs/reference/toctree.txt b/docs/reference/toctree.txt index f442d9cb2c..2304bba695 100644 --- a/docs/reference/toctree.txt +++ b/docs/reference/toctree.txt @@ -145,6 +145,79 @@ generated/ak.argcartesian generated/ak.argcombinations +.. toctree:: + :caption: String predicates + + generated/ak.str.is_alnum + generated/ak.str.is_alpha + generated/ak.str.is_ascii + generated/ak.str.is_decimal + generated/ak.str.is_digit + generated/ak.str.is_lower + generated/ak.str.is_numeric + generated/ak.str.is_printable + generated/ak.str.is_space + generated/ak.str.is_title + generated/ak.str.is_upper + +.. toctree:: + :caption: String transforms + + generated/ak.str.capitalize + generated/ak.str.length + generated/ak.str.lower + generated/ak.str.repeat + generated/ak.str.replace_slice + generated/ak.str.replace_substring + generated/ak.str.replace_substring_regex + generated/ak.str.reverse + generated/ak.str.swapcase + generated/ak.str.title + generated/ak.str.upper + +.. toctree:: + :caption: String padding and trimming + + generated/ak.str.center + generated/ak.str.lpad + generated/ak.str.rpad + generated/ak.str.ltrim + generated/ak.str.ltrim_whitespace + generated/ak.str.rtrim + generated/ak.str.rtrim_whitespace + generated/ak.str.trim + generated/ak.str.trim_whitespace + +.. 
toctree:: + :caption: String splitting and joining + + generated/ak.str.split_pattern + generated/ak.str.split_pattern_regex + generated/ak.str.split_whitespace + generated/ak.str.join + generated/ak.str.join_element_wise + +.. toctree:: + :caption: String slicing and decomposition + + generated/ak.str.slice + generated/ak.str.extract_regex + +.. toctree:: + :caption: String containment tests + + generated/ak.str.count_substring + generated/ak.str.count_substring_regex + generated/ak.str.ends_with + generated/ak.str.find_substring + generated/ak.str.find_substring_regex + generated/ak.str.index_in + generated/ak.str.is_in + generated/ak.str.match_like + generated/ak.str.match_substring + generated/ak.str.match_substring_regex + generated/ak.str.starts_with + .. toctree:: :caption: Value and type conversions diff --git a/pyproject.toml b/pyproject.toml index 87a1b29507..24a6fed90a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -309,7 +309,8 @@ mccabe.max-complexity = 100 "src/awkward/_connect/*" = ["TID251"] "src/awkward/__init__.py" = ["E402", "F401", "F403", "I001"] "src/awkward/_ext.py" = ["F401"] -"src/awkward/operations/__init__.py" = ["F403"] +"src/awkward/operations/__init__.py" = ["F401", "F403"] +"src/awkward/operations/str/__init__.py" = ["F401", "F403", "I001"] "src/awkward/_nplikes/*" = ["TID251"] "src/awkward/_operators.py" = ["TID251"] "tests*/*" = ["T20", "TID251"] diff --git a/src/awkward/_connect/pyarrow.py b/src/awkward/_connect/pyarrow.py index 54cae0ca92..b98c17975b 100644 --- a/src/awkward/_connect/pyarrow.py +++ b/src/awkward/_connect/pyarrow.py @@ -1,7 +1,9 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +from __future__ import annotations import json from collections.abc import Iterable, Sized +from types import ModuleType from packaging.version import parse as parse_version @@ -36,13 +38,13 @@ error_message = "pyarrow 7.0.0 or later required for {0}" -def import_pyarrow(name): +def 
import_pyarrow(name: str) -> ModuleType: if pyarrow is None: raise ImportError(error_message.format(name)) return pyarrow -def import_pyarrow_parquet(name): +def import_pyarrow_parquet(name: str) -> ModuleType: if pyarrow is None: raise ImportError(error_message.format(name)) @@ -51,7 +53,16 @@ def import_pyarrow_parquet(name): return out -def import_fsspec(name): +def import_pyarrow_compute(name: str) -> ModuleType: + if pyarrow is None: + raise ImportError(error_message.format(name)) + + import pyarrow.compute as out + + return out + + +def import_fsspec(name: str) -> ModuleType: try: import fsspec diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py index 804bf02c7b..4431eb6cb6 100644 --- a/src/awkward/contents/unmaskedarray.py +++ b/src/awkward/contents/unmaskedarray.py @@ -491,7 +491,7 @@ def _remove_structure(self, backend, options): return [self] def _drop_none(self) -> Content: - return self.to_ByteMaskedArray(True)._drop_none() + return self.content def _recursively_apply( self, action, behavior, depth, depth_context, lateral_context, options diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py index 450e4679de..f378a12dc7 100644 --- a/src/awkward/operations/__init__.py +++ b/src/awkward/operations/__init__.py @@ -1,6 +1,6 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE -# ruff: noqa: F401 +import awkward.operations.str from awkward.operations.ak_all import * from awkward.operations.ak_almost_equal import * from awkward.operations.ak_any import * diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py new file mode 100644 index 0000000000..610a99de4b --- /dev/null +++ b/src/awkward/operations/str/__init__.py @@ -0,0 +1,205 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +# https://arrow.apache.org/docs/python/api/compute.html#string-predicates + +# string 
predicates +from awkward.operations.str.akstr_is_alnum import * +from awkward.operations.str.akstr_is_alpha import * +from awkward.operations.str.akstr_is_decimal import * +from awkward.operations.str.akstr_is_digit import * +from awkward.operations.str.akstr_is_lower import * +from awkward.operations.str.akstr_is_numeric import * +from awkward.operations.str.akstr_is_printable import * +from awkward.operations.str.akstr_is_space import * +from awkward.operations.str.akstr_is_upper import * +from awkward.operations.str.akstr_is_title import * +from awkward.operations.str.akstr_is_ascii import * + +# string transforms +from awkward.operations.str.akstr_capitalize import * +from awkward.operations.str.akstr_length import * +from awkward.operations.str.akstr_lower import * +from awkward.operations.str.akstr_swapcase import * +from awkward.operations.str.akstr_title import * +from awkward.operations.str.akstr_upper import * +from awkward.operations.str.akstr_repeat import * +from awkward.operations.str.akstr_replace_slice import * +from awkward.operations.str.akstr_reverse import * +from awkward.operations.str.akstr_replace_substring import * +from awkward.operations.str.akstr_replace_substring_regex import * + +# string padding +from awkward.operations.str.akstr_center import * +from awkward.operations.str.akstr_lpad import * +from awkward.operations.str.akstr_rpad import * + +# string trimming +from awkward.operations.str.akstr_ltrim import * +from awkward.operations.str.akstr_ltrim_whitespace import * +from awkward.operations.str.akstr_rtrim import * +from awkward.operations.str.akstr_rtrim_whitespace import * +from awkward.operations.str.akstr_trim import * +from awkward.operations.str.akstr_trim_whitespace import * + +# string splitting +from awkward.operations.str.akstr_split_whitespace import * +from awkward.operations.str.akstr_split_pattern import * +from awkward.operations.str.akstr_split_pattern_regex import * + +# string component extraction + +from 
awkward.operations.str.akstr_extract_regex import * + +# string joining + +from awkward.operations.str.akstr_join import * +from awkward.operations.str.akstr_join_element_wise import * + +# string slicing + +from awkward.operations.str.akstr_slice import * + +# containment tests + +from awkward.operations.str.akstr_count_substring import * +from awkward.operations.str.akstr_count_substring_regex import * +from awkward.operations.str.akstr_ends_with import * +from awkward.operations.str.akstr_find_substring import * +from awkward.operations.str.akstr_find_substring_regex import * +from awkward.operations.str.akstr_index_in import * +from awkward.operations.str.akstr_is_in import * +from awkward.operations.str.akstr_match_like import * +from awkward.operations.str.akstr_match_substring import * +from awkward.operations.str.akstr_match_substring_regex import * +from awkward.operations.str.akstr_starts_with import * + + +def _get_ufunc_action( + utf8_function, + ascii_function, + *args, + bytestring_to_string=False, + **kwargs, +): + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + def action(layout, **absorb): + if layout.is_list and layout.parameter("__array__") == "string": + return from_arrow( + utf8_function(to_arrow(layout, extensionarray=False), *args, **kwargs), + highlevel=False, + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + if bytestring_to_string: + out = from_arrow( + ascii_function( + to_arrow( + layout.copy( + content=layout.content.copy( + parameters={"__array__": "char"} + ), + parameters={"__array__": "string"}, + ), + extensionarray=False, + ), + *args, + **kwargs, + ), + highlevel=False, + ) + if out.is_list and out.parameter("__array__") == "string": + out = out.copy( + content=out.content.copy(parameters={"__array__": "byte"}), + parameters={"__array__": "bytestring"}, + ) + return out + + else: + return from_arrow( + ascii_function( + 
to_arrow(layout, extensionarray=False), *args, **kwargs + ), + highlevel=False, + ) + + return action + + +def _erase_list_option(layout): + from awkward.contents.unmaskedarray import UnmaskedArray + + assert layout.is_list + if layout.content.is_option: + assert isinstance(layout.content, UnmaskedArray) + return layout.copy(content=layout.content.content) + else: + return layout + + +def _get_split_action( + utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs +): + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + def action(layout, **absorb): + if layout.is_list and layout.parameter("__array__") == "string": + return _erase_list_option( + from_arrow( + utf8_function( + to_arrow(layout, extensionarray=False), + *args, + **kwargs, + ), + highlevel=False, + ) + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + if bytestring_to_string: + out = _erase_list_option( + from_arrow( + ascii_function( + to_arrow( + layout.copy( + content=layout.content.copy( + parameters={"__array__": "char"} + ), + parameters={"__array__": "string"}, + ), + extensionarray=False, + ), + *args, + **kwargs, + ), + highlevel=False, + ) + ) + assert out.is_list + + assert ( + out.content.is_list + and out.content.parameter("__array__") == "string" + ) + return out.copy( + content=out.content.copy( + content=out.content.content.copy( + parameters={"__array__": "byte"} + ), + parameters={"__array__": "bytestring"}, + ), + ) + + else: + return _erase_list_option( + from_arrow( + ascii_function( + to_arrow(layout, extensionarray=False), *args, **kwargs + ), + highlevel=False, + ) + ) + + return action diff --git a/src/awkward/operations/str/akstr_capitalize.py b/src/awkward/operations/str/akstr_capitalize.py new file mode 100644 index 0000000000..1c33e480f8 --- /dev/null +++ b/src/awkward/operations/str/akstr_capitalize.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see 
https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("capitalize",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def capitalize(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a capitalized version + (correctly transforming Unicode characters), with the first character + uppercased and the others lowercased. + + Replaces any bytestring-valued data with a capitalized version + (transforming ASCII characters only). + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_capitalize](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_capitalize.html) + or + [pyarrow.compute.ascii_capitalize](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_capitalize.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.capitalize") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_capitalize, pc.ascii_capitalize, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_center.py b/src/awkward/operations/str/akstr_center.py new file mode 100644 index 0000000000..d7d1801136 --- /dev/null +++ b/src/awkward/operations/str/akstr_center.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("center",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def center(array, width, padding=" ", *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + width (int): Desired string length. + padding (str or bytes): What to pad the string with. Should be one + codepoint or byte. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string or bytestring-valued data with centered + strings/bytestrings of a given `width`, padding both sides with the given + `padding` codepoint or byte. + + If the data are strings, `width` is measured in codepoints and `padding` + must be one codepoint. + + If the data are bytestrings, `width` is measured in bytes and `padding` + must be one byte. 
+ + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_center](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_center.html) + or + [pyarrow.compute.ascii_center](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_center.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, width, padding, highlevel, behavior) + + +def _impl(array, width, padding, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.center") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_center, pc.ascii_center, width, padding, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_count_substring.py b/src/awkward/operations/str/akstr_count_substring.py new file mode 100644 index 0000000000..8bbc44bcd7 --- /dev/null +++ b/src/awkward/operations/str/akstr_count_substring.py @@ -0,0 +1,62 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("count_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def count_substring( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Substring pattern to count for each string in + `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Counts the number of occurrences of the given literal `pattern` in every + string in `array`. Depending upon the value of `ignore_case`, the matching + function will be case-insensitive. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + + See also: #ak.str.count_substring_regex. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.count_substring") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.count_substring, + pc.count_substring, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_count_substring_regex.py b/src/awkward/operations/str/akstr_count_substring_regex.py new file mode 100644 index 0000000000..4cd7f3fe8a --- /dev/null +++ b/src/awkward/operations/str/akstr_count_substring_regex.py @@ -0,0 +1,62 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("count_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout 
import wrap_layout + + +@high_level_function(module="ak.str") +def count_substring_regex( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression that matches substrings to + count for each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Counts the number of occurrences of the given regular expression `pattern` + in every string in `array`. Depending upon the value of `ignore_case`, the + matching function will be case-insensitive. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.count_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring_regex.html). + + See also: #ak.str.count_substring. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.count_substring_regex") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.count_substring_regex, + pc.count_substring_regex, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_ends_with.py b/src/awkward/operations/str/akstr_ends_with.py new file mode 100644 index 0000000000..ed68476a1f --- /dev/null +++ b/src/awkward/operations/str/akstr_ends_with.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("ends_with",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Substring pattern to test against the ending + of each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Returns True for every string in `array` if it ends with the given literal + suffix `pattern`. 
Depending upon the value of `ignore_case`, the matching + function will be case-insensitive. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.ends_with](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ends_with.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.ends_with") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.ends_with, + pc.ends_with, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_extract_regex.py b/src/awkward/operations/str/akstr_extract_regex.py new file mode 100644 index 0000000000..2592ba1268 --- /dev/null +++ b/src/awkward/operations/str/akstr_extract_regex.py @@ -0,0 +1,76 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("extract_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def extract_regex(array, pattern, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression with named capture fields. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. 
+ behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Returns None for every string in `array` if it does not match `pattern`; + otherwise, a record whose fields are named capture groups and whose + contents are the substrings they've captured. + + Uses [Google RE2](https://github.com/google/re2/wiki/Syntax), and `pattern` must + contain named groups. The syntax for a named group is `(?P<...>...)` in which + the first `...` is a name and the last `...` is a regular expression. + + For example, + + >>> array = ak.Array([["one1", "two2", "three3"], [], ["four4", "five5"]]) + >>> result = ak.str.extract_regex(array, "(?P<vowel>[aeiou])(?P<number>[0-9]+)") + >>> result.show(type=True) + type: 3 * var * ?{ + vowel: ?string, + number: ?string + } + [[{vowel: 'e', number: '1'}, {vowel: 'o', number: '2'}, {vowel: 'e', number: '3'}], + [], + [None, {vowel: 'e', number: '5'}]] + + (The string `"four4"` does not match because the vowel is not immediately before + the number.) + + Regular expressions with unnamed groups or features not implemented by RE2 raise an error. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.extract_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.extract_regex.html). 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, highlevel, behavior) + + +def _impl(array, pattern, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.extract_regex") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.extract_regex, pc.extract_regex, pattern, bytestring_to_string=False + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_find_substring.py b/src/awkward/operations/str/akstr_find_substring.py new file mode 100644 index 0000000000..f936bdac86 --- /dev/null +++ b/src/awkward/operations/str/akstr_find_substring.py @@ -0,0 +1,59 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("find_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Substring pattern to find inside each string + in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Returns the index of the first occurrence of the given literal `pattern` + for each string in `array`. If the literal pattern is not found inside the + string, the index is taken to be -1. 
+ + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). + + See also: #ak.str.find_substring_regex. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.find_substring") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.find_substring, + pc.find_substring, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_find_substring_regex.py b/src/awkward/operations/str/akstr_find_substring_regex.py new file mode 100644 index 0000000000..e5059f846a --- /dev/null +++ b/src/awkward/operations/str/akstr_find_substring_regex.py @@ -0,0 +1,61 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("find_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def find_substring_regex( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression that matches substrings to + find inside each string in `array`. 
+ ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Returns the index of the first occurrence of the given regular expression + `pattern` for each string in `array`. If the regular expression does not + match inside the string, the index is taken to be -1. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.find_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring_regex.html). + + See also: #ak.str.find_substring. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.find_substring_regex") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.find_substring_regex, + pc.find_substring_regex, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_index_in.py b/src/awkward/operations/str/akstr_index_in.py new file mode 100644 index 0000000000..3c71e0a281 --- /dev/null +++ b/src/awkward/operations/str/akstr_index_in.py @@ -0,0 +1,78 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("index_in",) + + +import awkward as ak +from awkward._behavior import behavior_of +from 
awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def index_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + value_set: Array-like data (anything #ak.to_layout recognizes), set of + values to search for in `array`. + skip_nones (bool): If True, None values in `array` are not matched + against `value_set`; otherwise, None is considered a legal value. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Returns the index of the first pattern in `value_set` that each string in + `array` matches. If the string is not found within `value_set`, then the + index is set to None. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.index_in](https://arrow.apache.org/docs/python/generated/pyarrow.compute.index_in.html). 
+ """ + # Dispatch + yield (array, value_set) + + # Implementation + return _impl(array, value_set, skip_nones, highlevel, behavior) + + +def _is_maybe_optional_list_of_string(layout): + if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}: + return True + elif layout.is_option or layout.is_indexed: + return _is_maybe_optional_list_of_string(layout.content) + else: + return False + + +def _impl(array, value_set, skip_nones, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.index_in")  # raises a descriptive ImportError when pyarrow is unavailable + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + value_set_layout = ak.to_layout(value_set, allow_record=False, allow_other=True) + + if not _is_maybe_optional_list_of_string(value_set_layout): + raise TypeError("`value_set` must be 1D array of (possibly missing) strings") + + behavior = behavior_of(array, value_set, behavior=behavior) + + def apply(layout, **kwargs): + if _is_maybe_optional_list_of_string(layout): + return ak.from_arrow( + pc.index_in( + ak.to_arrow(layout, extensionarray=False), + ak.to_arrow(value_set_layout, extensionarray=False), + skip_nulls=skip_nones, + ), + highlevel=False, + ) + + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_is_alnum.py b/src/awkward/operations/str/akstr_is_alnum.py new file mode 100644 index 0000000000..d18d42f2a9 --- /dev/null +++ b/src/awkward/operations/str/akstr_is_alnum.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_alnum",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def is_alnum(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything 
@high_level_function(module="ak.str")
def is_alnum(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of alphanumeric Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of alphanumeric ASCII characters, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_alnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alnum.html)
    or
    [pyarrow.compute.ascii_is_alnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alnum.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_alnum")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_alnum,
        compute.ascii_is_alnum,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_alpha(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of alphabetic Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of alphabetic ASCII characters, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_alpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alpha.html)
    or
    [pyarrow.compute.ascii_is_alpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alpha.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_alpha")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_alpha,
        compute.ascii_is_alpha,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_ascii(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string or bytestring in the data is replaced by True iff it consists
    only of ASCII characters, and by False otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.string_is_ascii](https://arrow.apache.org/docs/python/generated/pyarrow.compute.string_is_ascii.html)
    on both strings and bytestrings.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_ascii")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # The same kernel is used for strings and for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.string_is_ascii,
        compute.string_is_ascii,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_decimal(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of decimal Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of decimal ASCII characters, and by False otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_decimal](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_decimal.html)
    or
    [pyarrow.compute.ascii_is_decimal](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_decimal.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_decimal")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_decimal,
        compute.ascii_is_decimal,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_digit(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string or bytestring in the data is replaced by True when it is
    non-empty and made up exclusively of Unicode digits, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_digit](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html)
    on both strings and bytestrings.

    (Arrow's compute module does not have an `ascii_is_digit`.)
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_digit")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # The utf8 kernel serves bytestrings too
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_digit,
        compute.utf8_is_digit,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        value_set: Array-like data (anything #ak.to_layout recognizes), set of
            values to search for in `array`.
        skip_nones (bool): If True, None values in `array` are not matched
            against `value_set`; otherwise, None is considered a legal value.
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Returns True for each string in `array` if it is equal to any entry of
    `value_set` (entries are compared as values, not interpreted as
    patterns); otherwise, returns False.

    Raises a TypeError if `value_set` is not an array of (possibly missing)
    strings or bytestrings.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.is_in](https://arrow.apache.org/docs/python/generated/pyarrow.compute.is_in.html).
    """
    # Dispatch
    yield (array, value_set)

    # Implementation
    return _impl(array, value_set, skip_nones, highlevel, behavior)


def _is_maybe_optional_list_of_string(layout):
    """Return True if `layout` is a string/bytestring list, possibly wrapped
    in any number of option-type or indexed nodes."""
    if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}:
        return True
    elif layout.is_option or layout.is_indexed:
        # Look through the wrapper at what it contains.
        return _is_maybe_optional_list_of_string(layout.content)
    else:
        return False


def _impl(array, value_set, skip_nones, highlevel, behavior):
    # Go through the shared helper (as the other ak.str functions do) so that
    # a missing pyarrow is reported with an error naming this function.
    from awkward._connect.pyarrow import import_pyarrow_compute

    pc = import_pyarrow_compute("ak.str.is_in")

    layout = ak.to_layout(array, allow_record=False, allow_other=True)
    value_set_layout = ak.to_layout(value_set, allow_record=False, allow_other=True)

    if not _is_maybe_optional_list_of_string(value_set_layout):
        raise TypeError("`value_set` must be 1D array of (possibly missing) strings")

    behavior = behavior_of(array, value_set, behavior=behavior)

    def apply(layout, **kwargs):
        # Only string-like nodes are replaced; returning None for anything
        # else leaves the node to recursively_apply.
        if _is_maybe_optional_list_of_string(layout):
            return ak.from_arrow(
                pc.is_in(
                    ak.to_arrow(layout, extensionarray=False),
                    ak.to_arrow(value_set_layout, extensionarray=False),
                    skip_nulls=skip_nones,
                ),
                highlevel=False,
            )
        return None

    out = ak._do.recursively_apply(layout, apply, behavior=behavior)

    return wrap_layout(out, highlevel=highlevel, behavior=behavior)
@high_level_function(module="ak.str")
def is_lower(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of lowercase Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of lowercase ASCII characters, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_lower.html)
    or
    [pyarrow.compute.ascii_is_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_lower.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_lower")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_lower,
        compute.ascii_is_lower,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_numeric(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string or bytestring in the data is replaced by True when it is
    non-empty and made up exclusively of numeric Unicode characters, and by
    False otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_numeric](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html)
    on both strings and bytestrings.

    (Arrow's compute module does not have an `ascii_is_numeric`.)
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_numeric")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # The utf8 kernel serves bytestrings too
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_numeric,
        compute.utf8_is_numeric,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_printable(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of printable Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of printable ASCII characters, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_printable](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_printable.html)
    or
    [pyarrow.compute.ascii_is_printable](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_printable.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_printable")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_printable,
        compute.ascii_is_printable,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_space(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of whitespace Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of whitespace ASCII characters, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_space](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_space.html)
    or
    [pyarrow.compute.ascii_is_space](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_space.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_space")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_space,
        compute.ascii_is_space,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_title(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string or bytestring in the data is replaced by True when it is
    title-cased, and by False otherwise. A title-cased string has at least
    one cased character, every uppercase character in it follows an uncased
    character, and every lowercase character in it follows an uppercase
    character.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_title.html)
    or
    [pyarrow.compute.ascii_is_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_title.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_title")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # One kernel for strings, one for bytestrings
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_title,
        compute.ascii_is_title,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def is_upper(array, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Each string in the data is replaced by True when it is non-empty and made
    up exclusively of uppercase Unicode characters, and by False otherwise.

    Each bytestring in the data is replaced by True when it is non-empty and
    made up exclusively of uppercase ASCII characters, and by False
    otherwise.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.utf8_is_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_upper.html)
    or
    [pyarrow.compute.ascii_is_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_upper.html)
    on strings and bytestrings, respectively.
    """
    # Hand the arguments to the dispatch mechanism
    yield (array,)

    # No overload took over: run the default implementation
    return _impl(array, highlevel, behavior)


def _impl(array, highlevel, behavior):
    from awkward._connect.pyarrow import import_pyarrow_compute

    compute = import_pyarrow_compute("ak.str.is_upper")
    behavior = behavior_of(array, behavior=behavior)

    layout = ak.operations.to_layout(array)
    # pc.ascii_is_upper is defined on binary, but bytestring_to_string=True is
    # kept for consistency with is_lower and is_title.
    action = ak.operations.str._get_ufunc_action(
        compute.utf8_is_upper,
        compute.ascii_is_upper,
        bytestring_to_string=True,
    )
    result = ak._do.recursively_apply(layout, action, behavior)

    return wrap_layout(result, behavior, highlevel)
@high_level_function(module="ak.str")
def join(array, separator, *, highlevel=True, behavior=None):
    """
    Args:
        array: Array-like data (anything #ak.to_layout recognizes).
        separator (str, bytes, or array of them to broadcast): separator to
            insert between strings. If array-like, `separator` is broadcast
            against `array`.
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Concatenate the strings in each list of strings in `array`. The
    `separator` is inserted between each string. If array-like, `separator`
    is broadcast against `array`, which permits a unique separator for each
    list of strings in `array`.

    Raises a TypeError if an array-like `separator` does not contain
    (possibly missing) strings or bytestrings.

    Note: this function does not raise an error if the `array` does not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.binary_join](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join.html).

    See also: #ak.str.join_element_wise.
    """
    # Dispatch
    yield (array, separator)

    # Implementation
    return _impl(array, separator, highlevel, behavior)


def _is_maybe_optional_list_of_string(layout):
    """Return True if `layout` is a string/bytestring list, possibly wrapped
    in any number of option-type or indexed nodes."""
    if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}:
        return True
    elif layout.is_option or layout.is_indexed:
        # Look through the wrapper at what it contains.
        return _is_maybe_optional_list_of_string(layout.content)
    else:
        return False


def _impl(array, separator, highlevel, behavior):
    # Go through the shared helper (as the other ak.str functions do) so that
    # a missing pyarrow is reported with an error naming this function.
    from awkward._connect.pyarrow import import_pyarrow_compute
    from awkward.operations.ak_from_arrow import from_arrow
    from awkward.operations.ak_to_arrow import to_arrow

    pc = import_pyarrow_compute("ak.str.join")

    def is_list_of_strings(layout):
        # A list whose items are (maybe option/indexed type wrapping) strings
        return layout.is_list and _is_maybe_optional_list_of_string(layout.content)

    def lists_to_arrow(layout):
        return to_arrow(
            # Arrow needs an option type here
            layout.copy(content=ak.contents.UnmaskedArray.simplified(layout.content)),
            extensionarray=False,
            # This kernel requires non-large string/bytestrings
            string_to32=True,
            bytestring_to32=True,
        )

    def apply_unary(layout, **kwargs):
        # Scalar separator: only lists of strings are replaced.
        if not is_list_of_strings(layout):
            return None

        return from_arrow(
            pc.binary_join(lists_to_arrow(layout), separator),
            highlevel=False,
        )

    def apply_binary(layouts, **kwargs):
        # Array separator: `layouts` is the broadcast (array, separator) pair.
        layout, separator_layout = layouts
        if not is_list_of_strings(layout):
            return None

        if not _is_maybe_optional_list_of_string(separator_layout):
            raise TypeError(
                f"`separator` must be a list of (possibly missing) strings, not {ak.type(separator_layout)}"
            )

        layout_arrow = lists_to_arrow(layout)
        separator_arrow = to_arrow(
            separator_layout,
            extensionarray=False,
            # This kernel requires non-large string/bytestrings
            string_to32=True,
            bytestring_to32=True,
        )
        return (
            from_arrow(
                pc.binary_join(layout_arrow, separator_arrow),
                highlevel=False,
            ),
        )

    layout = ak.to_layout(array, allow_record=False, allow_other=True)
    behavior = behavior_of(array, separator, behavior=behavior)
    if isinstance(separator, (bytes, str)):
        out = ak._do.recursively_apply(layout, apply_unary, behavior=behavior)
    else:
        separator_layout = ak.to_layout(separator, allow_record=False, allow_other=True)
        (out,) = ak._broadcasting.broadcast_and_apply(
            (layout, separator_layout), apply_binary, behavior
        )

    return wrap_layout(out, highlevel=highlevel, behavior=behavior)
@high_level_function(module="ak.str")
def join_element_wise(*arrays, highlevel=True, behavior=None):
    """
    Args:
        arrays: Array-like data (anything #ak.to_layout recognizes).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Broadcasts and concatenates all but the last array of strings in `arrays`;
    the last is used as a separator.

    Raises a TypeError if no arrays are given.

    Note: this function does not raise an error if the `arrays` do not
    contain any string or bytestring data.

    Requires the pyarrow library and calls
    [pyarrow.compute.binary_join_element_wise](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join_element_wise.html).

    Unlike Arrow's `binary_join_element_wise`, this function has no `null_handling`
    and `null_replacement` arguments. This function's behavior is like
    `null_handling="emit_null"` (Arrow's default). The other cases can be implemented
    with Awkward slices, #ak.drop_none, and #ak.fill_none.

    See also: #ak.str.join.
    """
    # Dispatch
    yield arrays

    # Implementation
    return _impl(arrays, highlevel, behavior)


def _impl(arrays, highlevel, behavior):
    # Go through the shared helper (as the other ak.str functions do) so that
    # a missing pyarrow is reported with an error naming this function.
    from awkward._connect.pyarrow import import_pyarrow_compute
    from awkward.operations.ak_from_arrow import from_arrow
    from awkward.operations.ak_to_arrow import to_arrow

    pc = import_pyarrow_compute("ak.str.join_element_wise")

    if len(arrays) < 1:
        raise TypeError("at least one array is required")

    layouts = [ak.to_layout(x) for x in arrays]
    behavior = behavior_of(*arrays, behavior=behavior)

    def action(layouts, **kwargs):
        # Act only once every broadcast input is at a string/bytestring node;
        # returning None otherwise leaves the nodes to broadcast_and_apply.
        if all(
            x.is_list and x.parameter("__array__") in ("string", "bytestring")
            for x in layouts
        ):
            return (
                from_arrow(
                    pc.binary_join_element_wise(
                        *[to_arrow(x, extensionarray=False) for x in layouts]
                    ),
                    highlevel=False,
                ),
            )
        return None

    (out,) = ak._broadcasting.broadcast_and_apply(layouts, action, behavior)

    return wrap_layout(out, highlevel=highlevel, behavior=behavior)
index 0000000000..700dbe534c --- /dev/null +++ b/src/awkward/operations/str/akstr_length.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("length",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def length(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with its length in Unicode characters + (not its length in bytes). + + Replaces any bytestring-valued data with its length of bytes. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_length](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_length.html) + or + [pyarrow.compute.binary_length](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_length.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.length") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_length, pc.binary_length, bytestring_to_string=False + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_lower.py b/src/awkward/operations/str/akstr_lower.py new file mode 100644 index 0000000000..ade17e10ac --- /dev/null +++ b/src/awkward/operations/str/akstr_lower.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("lower",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def lower(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a lowercase version (correctly + transforming Unicode characters). + + Replaces any bytestring-valued data with a lowercase version (transforming ASCII characters only). + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_lower.html) + or + [pyarrow.compute.ascii_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_lower.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.lower") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_lower, pc.ascii_lower, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_lpad.py b/src/awkward/operations/str/akstr_lpad.py new file mode 100644 index 0000000000..431557d086 --- /dev/null +++ b/src/awkward/operations/str/akstr_lpad.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("lpad",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + width (int): Desired string length. + padding (str or bytes): What to pad the string with. Should be one + codepoint or byte. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + Replaces any string or bytestring-valued data with right-aligned + strings/bytestrings of a given `width`, padding the left side with the + given `padding` codepoint or byte. + + If the data are strings, `width` is measured in codepoints and `padding` + must be one codepoint. + + If the data are bytestrings, `width` is measured in bytes and `padding` + must be one byte. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_lpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_lpad.html) + or + [pyarrow.compute.ascii_lpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_lpad.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, width, padding, highlevel, behavior) + + +def _impl(array, width, padding, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.lpad") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_lpad, pc.ascii_lpad, width, padding, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_ltrim.py b/src/awkward/operations/str/akstr_ltrim.py new file mode 100644 index 0000000000..a6904c1d11 --- /dev/null +++ b/src/awkward/operations/str/akstr_ltrim.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("ltrim",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def ltrim(array, characters, *, highlevel=True, behavior=None): 
+ """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + characters (str or bytes): Individual characters to be trimmed + from the string. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading characters of `characters` from any string or + bytestring-valued data. + + If the data are strings, `characters` are interpreted as unordered, + individual codepoints. + + If the data are bytestrings, `characters` are interpreted as unordered, + individual bytes. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_ltrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_ltrim.html) + or + [pyarrow.compute.ascii_ltrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_ltrim.html) + on strings and bytestrings, respectively. + + See also: #ak.str.ltrim_whitespace. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, characters, highlevel, behavior) + + +def _impl(array, characters, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.ltrim") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_ltrim, pc.ascii_ltrim, characters, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_ltrim_whitespace.py b/src/awkward/operations/str/akstr_ltrim_whitespace.py new file mode 100644 index 0000000000..060af89288 --- /dev/null +++ b/src/awkward/operations/str/akstr_ltrim_whitespace.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("ltrim_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def ltrim_whitespace(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading whitespace from any string or bytestring-valued data. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_ltrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_ltrim_whitespace.html) + or + [pyarrow.compute.ascii_ltrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_ltrim_whitespace.html) + on strings and bytestrings, respectively. + + See also: #ak.str.ltrim. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.ltrim_whitespace") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_ltrim_whitespace, + pc.ascii_ltrim_whitespace, + bytestring_to_string=True, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_match_like.py b/src/awkward/operations/str/akstr_match_like.py new file mode 100644 index 0000000000..6cc83443bb --- /dev/null +++ b/src/awkward/operations/str/akstr_match_like.py @@ -0,0 +1,62 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("match_like",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def match_like(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): SQL-style LIKE pattern to match against + strings in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine whether it matches the given + SQL-style LIKE pattern, which obeys the following rules: + + - '%' matches any number of characters. + - '_' matches exactly one character. + - Any other character matches itself. + - To match a literal '%', '_', or "'", the character must be preceded + with a backslash. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.match_like](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_like.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.match_like") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.match_like, + pc.match_like, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_match_substring.py b/src/awkward/operations/str/akstr_match_substring.py new file mode 100644 index 0000000000..29778364bb --- /dev/null +++ b/src/awkward/operations/str/akstr_match_substring.py @@ -0,0 +1,59 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("match_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from 
awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def match_substring( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Substring pattern to look for inside `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine whether it contains the given + literal `pattern`. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.match_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring.html). + + See also: #ak.str.match_substring_regex. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.match_substring") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.match_substring, + pc.match_substring, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_match_substring_regex.py b/src/awkward/operations/str/akstr_match_substring_regex.py new file mode 100644 index 0000000000..85bbe38eb5 --- /dev/null +++ b/src/awkward/operations/str/akstr_match_substring_regex.py @@ -0,0 +1,60 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("match_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def match_substring_regex( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression to search for inside `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + For each string in the array, determine whether any substring matches the + given regular expression `pattern` + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.match_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring_regex.html). + + See also: #ak.str.match_substring. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.match_substring_regex") + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.match_substring_regex, + pc.match_substring_regex, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_repeat.py b/src/awkward/operations/str/akstr_repeat.py new file mode 100644 index 0000000000..3d0edaa755 --- /dev/null +++ b/src/awkward/operations/str/akstr_repeat.py @@ -0,0 +1,108 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("repeat",) + +import numbers + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout +from awkward._nplikes.numpylike import NumpyMetadata + +np = NumpyMetadata.instance() + + +@high_level_function(module="ak.str") +def repeat(array, num_repeats, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ num_repeats: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued or bytestring-valued data with the same value + repeated `num_repeats` times, which can be a scalar integer or a + (broadcasted) array of integers. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.binary_repeat](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_repeat.html) + or + [pyarrow.compute.binary_repeat](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_repeat.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array, num_repeats) + + # Implementation + return _impl(array, num_repeats, highlevel, behavior) + + +def _impl(array, num_repeats, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + import pyarrow.compute as pc + + layout = ak.operations.to_layout(array) + behavior = behavior_of(array, behavior=behavior) + + num_repeats_layout = ak.operations.to_layout(num_repeats, allow_other=True) + + if not isinstance(num_repeats_layout, ak.contents.Content): + if not isinstance(num_repeats, numbers.Integral): + raise TypeError( + "num_repeats must be an integer or broadcastable to integers" + ) + + def action(layout, **kwargs): + if layout.is_list and layout.parameter("__array__") in ( + "string", + "bytestring", + ): + return from_arrow( + pc.binary_repeat( + to_arrow(layout, extensionarray=False), num_repeats + ), + highlevel=False, + ) + + out = ak._do.recursively_apply(layout, action, behavior) + + else: + + def action(inputs, **kwargs): + 
if inputs[0].is_list and inputs[0].parameter("__array__") in ( + "string", + "bytestring", + ): + if not inputs[1].is_numpy or not issubclass( + inputs[1].dtype.type, np.integer + ): + raise TypeError( + "num_repeats must be an integer or broadcastable to integers" + ) + return ( + from_arrow( + pc.binary_repeat( + to_arrow(inputs[0], extensionarray=False), + to_arrow(inputs[1], extensionarray=False), + ), + highlevel=False, + ), + ) + + out = ak._broadcasting.broadcast_and_apply( + (layout, num_repeats_layout), action, behavior + ) + assert isinstance(out, tuple) and len(out) == 1 + out = out[0] + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_replace_slice.py b/src/awkward/operations/str/akstr_replace_slice.py new file mode 100644 index 0000000000..fc0668daf9 --- /dev/null +++ b/src/awkward/operations/str/akstr_replace_slice.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("replace_slice",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + start (int): Index to start slicing at (inclusive). + stop (int): Index to stop slicing at (exclusive). + replacement (str or bytes): What to replace the slice with. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces slices of any string or bytestring-valued data with `replacement` + between `start` and `stop` indexes; `start` is inclusive and `stop` is + exclusive and both are 0-indexed. 
+ + For strings, `start` and `stop` are measured in Unicode characters; for + bytestrings, `start` and `stop` are measured in bytes. + + The `start`, `stop`, and `replacement` are scalars; they cannot be + different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_replace_slice](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_replace_slice.html) + or + [pyarrow.compute.binary_replace_slice](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_replace_slice.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, start, stop, replacement, highlevel, behavior) + + +def _impl(array, start, stop, replacement, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.replace_slice") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_replace_slice, pc.binary_replace_slice, start, stop, replacement + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_replace_substring.py b/src/awkward/operations/str/akstr_replace_substring.py new file mode 100644 index 0000000000..38f00cbe42 --- /dev/null +++ b/src/awkward/operations/str/akstr_replace_substring.py @@ -0,0 +1,71 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("replace_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def replace_substring( + array, pattern, replacement, *, 
max_replacements=None, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str): Substring pattern to look for inside input values. + replacement (str or bytes): What to replace the pattern with. + max_replacements (None or int): If not None and not -1, limits the + maximum number of replacements per string/bytestring, counting from + the left. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces non-overlapping subsequences of any string or bytestring-valued + data that match a literal `pattern` with `replacement`. + + The `pattern` and `replacement` are scalars; they cannot be different for + each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.replace_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring.html) + or + [pyarrow.compute.replace_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring.html) + on strings and bytestrings, respectively. + + See also: #ak.str.replace_substring_regex. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) + + +def _impl(array, pattern, replacement, max_replacements, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.replace_substring") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.replace_substring, + pc.replace_substring, + pattern, + replacement, + max_replacements=max_replacements, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_replace_substring_regex.py b/src/awkward/operations/str/akstr_replace_substring_regex.py new file mode 100644 index 0000000000..832bf83552 --- /dev/null +++ b/src/awkward/operations/str/akstr_replace_substring_regex.py @@ -0,0 +1,71 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("replace_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def replace_substring_regex( + array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str): Regular expression pattern to look for inside input values. + replacement (str or bytes): What to replace the pattern with. + max_replacements (None or int): If not None and not -1, limits the + maximum number of replacements per string/bytestring, counting from + the left. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + Replaces non-overlapping subsequences of any string or bytestring-valued + data that match a regular expression `pattern` with `replacement`. + + The `pattern` and `replacement` are scalars; they cannot be different + for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.replace_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring_regex.html) + or + [pyarrow.compute.replace_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring_regex.html) + on strings and bytestrings, respectively. + + See also: #ak.str.replace_substring_regex. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) + + +def _impl(array, pattern, replacement, max_replacements, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.replace_substring_regex") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.replace_substring_regex, + pc.replace_substring_regex, + pattern, + replacement, + max_replacements=max_replacements, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_reverse.py b/src/awkward/operations/str/akstr_reverse.py new file mode 100644 index 0000000000..bd5e6c79f2 --- /dev/null +++ b/src/awkward/operations/str/akstr_reverse.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("reverse",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import 
wrap_layout + + +@high_level_function(module="ak.str") +def reverse(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Reverses the order of Unicode characters in any string-valued data. + (This function operates on Unicode codepoints, not grapheme clusters. + Hence, it will not correctly reverse grapheme clusters composed of + multiple codepoints.) + + Reverses the order of bytes in any bytestring-valued data. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_reverse](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_reverse.html) + or + [pyarrow.compute.binary_reverse](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_reverse.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.reverse") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_reverse, pc.binary_reverse, bytestring_to_string=False + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_rpad.py b/src/awkward/operations/str/akstr_rpad.py new file mode 100644 index 0000000000..99fe323d60 --- /dev/null +++ b/src/awkward/operations/str/akstr_rpad.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("rpad",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + width (int): Desired string length. + padding (str or bytes): What to pad the string with. Should be one + codepoint or byte. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string or bytestring-valued data with left-aligned + strings/bytestrings of a given `width`, padding the right side with the + given `padding` codepoint or byte. + + If the data are strings, `width` is measured in codepoints and `padding` + must be one codepoint. + + If the data are bytestrings, `width` is measured in bytes and `padding` + must be one byte. 
+ + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_rpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rpad.html) + or + [pyarrow.compute.ascii_rpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rpad.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, width, padding, highlevel, behavior) + + +def _impl(array, width, padding, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.rpad") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_rpad, pc.ascii_rpad, width, padding, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_rtrim.py b/src/awkward/operations/str/akstr_rtrim.py new file mode 100644 index 0000000000..816605de40 --- /dev/null +++ b/src/awkward/operations/str/akstr_rtrim.py @@ -0,0 +1,64 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("rtrim",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def rtrim(array, characters, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + characters (str or bytes): Individual characters to be trimmed from the string. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + Removes any trailing characters of `characters` from any string or + bytestring-valued data. + + If the data are strings, `characters` are interpreted as unordered, + individual codepoints. + + If the data are bytestrings, `characters` are interpreted as unordered, + individual bytes. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_rtrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rtrim.html) + or + [pyarrow.compute.ascii_rtrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rtrim.html) + on strings and bytestrings, respectively. + + See also: #ak.str.rtrim_whitespace. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, characters, highlevel, behavior) + + +def _impl(array, characters, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.rtrim") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_rtrim, pc.ascii_rtrim, characters, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_rtrim_whitespace.py b/src/awkward/operations/str/akstr_rtrim_whitespace.py new file mode 100644 index 0000000000..19c18677b0 --- /dev/null +++ b/src/awkward/operations/str/akstr_rtrim_whitespace.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("rtrim_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def rtrim_whitespace(array, *, highlevel=True, behavior=None): + """ + 
Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any trailing whitespace from any string or bytestring-valued data. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_rtrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rtrim_whitespace.html) + or + [pyarrow.compute.ascii_rtrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rtrim_whitespace.html) + on strings and bytestrings, respectively. + + See also: #ak.str.rtrim. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.rtrim_whitespace") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_rtrim_whitespace, + pc.ascii_rtrim_whitespace, + bytestring_to_string=True, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_slice.py b/src/awkward/operations/str/akstr_slice.py new file mode 100644 index 0000000000..89385c1a03 --- /dev/null +++ b/src/awkward/operations/str/akstr_slice.py @@ -0,0 +1,73 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("slice",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def slice(array, start, 
stop=None, step=1, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + start (int): Index to start slicing at (inclusive). + stop (None or int): Index to stop slicing at (exclusive). If not given, + slicing will stop at the end. + step (int): Slice step. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string or bytestring-valued data with a slice between `start` + and `stop` indexes; `start` is inclusive and `stop` is exclusive and both + are 0-indexed. + + For strings, `start` and `stop` are measured in Unicode characters; for + bytestrings, `start` and `stop` are measured in bytes. + + The `start`, `stop`, and `step` are scalars; they cannot be + different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_slice_codeunits](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_slice_codeunits.html) + or performs a literal slice on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, start, stop, step, highlevel, behavior) + + +def _impl(array, start, stop, step, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + def action(layout, **absorb): + if layout.is_list and layout.parameter("__array__") == "string": + return from_arrow( + pc.utf8_slice_codeunits( + to_arrow(layout, extensionarray=False), start, stop, step + ), + highlevel=False, + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + return layout[:, start:stop:step] + + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_split_pattern.py b/src/awkward/operations/str/akstr_split_pattern.py new file mode 100644 index 0000000000..e311ade93d --- /dev/null +++ b/src/awkward/operations/str/akstr_split_pattern.py @@ -0,0 +1,63 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("split_pattern",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def split_pattern( + array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Pattern of characters/bytes to split on. + max_splits (None or int): Maximum number of splits for each input + value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input + value; otherwise, start splitting from the beginning of each + value. 
This flag only has an effect if `max_splits` is not None. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Splits any string or bytestring-valued data into a list of substrings + according to the given separator. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). + + See also: #ak.str.split_whitespace, #ak.str.split_pattern_regex. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, max_splits, reverse, highlevel, behavior) + + +def _impl(array, pattern, max_splits, reverse, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.split_pattern") + behavior = behavior_of(array, behavior=behavior) + action = ak.operations.str._get_split_action( + pc.split_pattern, + pc.split_pattern, + pattern=pattern, + max_splits=max_splits, + reverse=reverse, + bytestring_to_string=False, + ) + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_split_pattern_regex.py b/src/awkward/operations/str/akstr_split_pattern_regex.py new file mode 100644 index 0000000000..dd71e8b9b5 --- /dev/null +++ b/src/awkward/operations/str/akstr_split_pattern_regex.py @@ -0,0 +1,64 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("split_pattern_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + 
+@high_level_function(module="ak.str") +def split_pattern_regex( + array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression of characters/bytes to + split on. + max_splits (None or int): Maximum number of splits for each input + value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input + value; otherwise, start splitting from the beginning of each + value. This flag only has an effect if `max_splits` is not None. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Splits any string or bytestring-valued data into a list of substrings + according to the given regular expression. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.split_pattern_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern_regex.html). + + See also: #ak.str.split_whitespace, #ak.str.split_pattern. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, max_splits, reverse, highlevel, behavior) + + +def _impl(array, pattern, max_splits, reverse, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.split_pattern_regex") + behavior = behavior_of(array, behavior=behavior) + action = ak.operations.str._get_split_action( + pc.split_pattern_regex, + pc.split_pattern_regex, + pattern=pattern, + max_splits=max_splits, + reverse=reverse, + bytestring_to_string=False, + ) + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_split_whitespace.py b/src/awkward/operations/str/akstr_split_whitespace.py new file mode 100644 index 0000000000..5bfb9e77a8 --- /dev/null +++ b/src/awkward/operations/str/akstr_split_whitespace.py @@ -0,0 +1,71 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("split_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def split_whitespace( + array, *, max_splits=None, reverse=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + max_splits (None or int): Maximum number of splits for each input + value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input + value; otherwise, start splitting from the beginning of each + value. This flag only has an effect if `max_splits` is not None. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + Splits any string or bytestring-valued data into a list of substrings + according to any non-zero length sequence of + whitespace characters. + + For strings, a split is performed for every sequence of Unicode whitespace + characters; for bytestrings, splitting is performed for sequences of ASCII + whitespace characters. + + The `max_splits` and `reverse` arguments are scalars; they cannot be + different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_split_whitespace.html) + or [pyarrow.compute.ascii_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_split_whitespace.html) + on strings and bytestrings, respectively. + + See also: #ak.str.split_pattern, #ak.str.split_pattern_regex. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, max_splits, reverse, highlevel, behavior) + + +def _impl(array, max_splits, reverse, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.split_whitespace") + behavior = behavior_of(array, behavior=behavior) + action = ak.operations.str._get_split_action( + pc.utf8_split_whitespace, + pc.ascii_split_whitespace, + max_splits=max_splits, + reverse=reverse, + bytestring_to_string=True, + ) + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_starts_with.py b/src/awkward/operations/str/akstr_starts_with.py new file mode 100644 index 0000000000..e035c53920 --- /dev/null +++ b/src/awkward/operations/str/akstr_starts_with.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE 
+ +__all__ = ("starts_with",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Substring pattern to test against the start + of each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Returns True for every string in `array` if it starts with the given literal + prefix `pattern`. Depending upon the value of `ignore_case`, the matching + function will be case-insensitive. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.starts_with](https://arrow.apache.org/docs/python/generated/pyarrow.compute.starts_with.html). 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.starts_with") + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.starts_with, + pc.starts_with, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/akstr_swapcase.py b/src/awkward/operations/str/akstr_swapcase.py new file mode 100644 index 0000000000..1629c65fdc --- /dev/null +++ b/src/awkward/operations/str/akstr_swapcase.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("swapcase",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def swapcase(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with uppercase characters transformed to + lowercase and vice-versa (correctly transforming Unicode characters). + + Replaces any bytestring-valued data with uppercase characters transformed + to lowercase and vice-versa (transforming ASCII characters only). 
+ + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_swapcase](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_swapcase.html) + or + [pyarrow.compute.ascii_swapcase](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_swapcase.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.swapcase") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_swapcase, pc.ascii_swapcase, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_title.py b/src/awkward/operations/str/akstr_title.py new file mode 100644 index 0000000000..8c7d0361b4 --- /dev/null +++ b/src/awkward/operations/str/akstr_title.py @@ -0,0 +1,59 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("title",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def title(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a titlecase version (correctly + transforming Unicode characters). 
Each word in the output will start with + an uppercase character and its remaining characters will be lowercase. + + Replaces any bytestring-valued data with a titlecase version (transforming + ASCII characters only). Each word in the output will start with an + uppercase character and its remaining characters will be lowercase. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_title.html) + or + [pyarrow.compute.ascii_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_title.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.title") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_title, pc.ascii_title, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_trim.py b/src/awkward/operations/str/akstr_trim.py new file mode 100644 index 0000000000..aa5352cd5e --- /dev/null +++ b/src/awkward/operations/str/akstr_trim.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("trim",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def trim(array, characters, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ characters (str or bytes): Individual characters to be trimmed from + the string. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading or trailing characters of `characters` from any string + or bytestring-valued data. + + If the data are strings, `characters` are interpreted as unordered, + individual codepoints. + + If the data are bytestrings, `characters` are interpreted as unordered, + individual bytes. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_trim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_trim.html) + or + [pyarrow.compute.ascii_trim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_trim.html) + on strings and bytestrings, respectively. + + See also: #ak.str.trim_whitespace. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, characters, highlevel, behavior) + + +def _impl(array, characters, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.trim") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_trim, pc.ascii_trim, characters, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_trim_whitespace.py b/src/awkward/operations/str/akstr_trim_whitespace.py new file mode 100644 index 0000000000..200118fdb7 --- /dev/null +++ b/src/awkward/operations/str/akstr_trim_whitespace.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("trim_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def trim_whitespace(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading or trailing whitespace from any string or + bytestring-valued data. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_trim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_trim_whitespace.html) + or + [pyarrow.compute.ascii_trim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_trim_whitespace.html) + on strings and bytestrings, respectively. + + See also: #ak.str.trim. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.trim_whitespace") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_trim_whitespace, pc.ascii_trim_whitespace, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/akstr_upper.py b/src/awkward/operations/str/akstr_upper.py new file mode 100644 index 0000000000..4f0a8bf920 --- /dev/null +++ b/src/awkward/operations/str/akstr_upper.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("upper",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function(module="ak.str") +def upper(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with an uppercase version (correctly + transforming Unicode characters). 
+ + Replaces any bytestring-valued data with an uppercase version (transforming + ASCII characters only). + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_upper.html) + or + [pyarrow.compute.ascii_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_upper.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + from awkward._connect.pyarrow import import_pyarrow_compute + + pc = import_pyarrow_compute("ak.str.upper") + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.utf8_upper, pc.ascii_upper, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py new file mode 100644 index 0000000000..79ddfb4d82 --- /dev/null +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -0,0 +1,1155 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import pytest + +import awkward as ak + +pyarrow = pytest.importorskip("pyarrow") + +string = ak.Array( + [ + ["\u03b1\u03b2\u03b3", ""], + [], + ["\u2192\u03b4\u03b5\u2190", "\u03b6z z\u03b6", "abc"], + ] +) +bytestring = ak.Array( + [ + ["\u03b1\u03b2\u03b3".encode(), b""], + [], + ["\u2192\u03b4\u03b5\u2190".encode(), "\u03b6z z\u03b6".encode(), b"abc"], + ] +) + +string_padded = ak.Array( + [ + [" αβγ ", " "], + [], + [" →δε← ", " ζz zζ ", " abc "], + ] +) +bytestring_padded = ak.Array( + [ + [b" \xce\xb1\xce\xb2\xce\xb3 ", b" "], + [], + [ + b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 
", + b" \xce\xb6z z\xce\xb6 ", + b" abc ", + ], + ] +) + +string_repeats = ak.Array( + [["foo123bar123baz", "foo", "bar"], ["123foo", "456bar", "foo123456bar"], []] +) + +bytestring_repeats = ak.Array( + [[b"foo123bar123baz", b"foo", b"bar"], [b"123foo", b"456bar", b"foo123456bar"], []] +) + + +def test_is_alnum(): + assert ak.str.is_alnum(string).tolist() == [ + [True, False], + [], + [False, False, True], + ] + assert ak.str.is_alnum(bytestring).tolist() == [ + [False, False], + [], + [False, False, True], + ] + + +def test_is_alpha(): + assert ak.str.is_alpha(string).tolist() == [ + [True, False], + [], + [False, False, True], + ] + assert ak.str.is_alpha(bytestring).tolist() == [ + [False, False], + [], + [False, False, True], + ] + + +def test_is_decimal(): + assert ak.str.is_decimal(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_decimal(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_digit(): + assert ak.str.is_digit(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_digit(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_lower(): + assert ak.str.is_lower(string).tolist() == [ + [True, False], + [], + [True, True, True], + ] + assert ak.str.is_lower(bytestring).tolist() == [ + [False, False], + [], + [False, True, True], + ] + + +def test_is_numeric(): + assert ak.str.is_numeric(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_numeric(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_printable(): + assert ak.str.is_printable(string).tolist() == [ + [True, True], + [], + [True, True, True], + ] + assert ak.str.is_printable(bytestring).tolist() == [ + [False, True], + [], + [False, False, True], + ] + + +def test_is_space(): + assert ak.str.is_space(string).tolist() == [ + [False, False], + 
[], + [False, False, False], + ] + assert ak.str.is_space(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_upper(): + assert ak.str.is_upper(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_upper(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_title(): + assert ak.str.is_title(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_title(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_ascii(): + assert ak.str.is_ascii(string).tolist() == [ + [False, True], + [], + [False, False, True], + ] + assert ak.str.is_ascii(bytestring).tolist() == [ + [False, True], + [], + [False, False, True], + ] + + +def test_capitalize(): + assert ak.str.capitalize(string).tolist() == [ + ["Αβγ", ""], + [], + ["→δε←", "Ζz zζ", "Abc"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.capitalize(bytestring).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"Abc"], + ] + + +def test_length(): + assert ak.str.length(string).tolist() == [ + [3, 0], + [], + [4, 5, 3], + ] + assert ak.str.length(bytestring).tolist() == [ + [6, 0], + [], + [10, 7, 3], + ] + + +def test_lower(): + assert ak.str.lower(string).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.lower(bytestring).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + +def test_swapcase(): + assert ak.str.swapcase(string).tolist() == [ + ["ΑΒΓ", ""], + [], + ["→ΔΕ←", "ΖZ ZΖ", "ABC"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.swapcase(bytestring).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζZ Zζ".encode(), b"ABC"], + ] + + +def test_title(): + assert ak.str.title(string).tolist() == [ + ["Αβγ", ""], + [], + ["→Δε←", "Ζz Zζ", "Abc"], # noqa: 
RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.title(bytestring).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζZ Zζ".encode(), b"Abc"], + ] + + +def test_upper(): + assert ak.str.upper(string).tolist() == [ + ["ΑΒΓ", ""], + [], + ["→ΔΕ←", "ΖZ ZΖ", "ABC"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.upper(bytestring).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζZ Zζ".encode(), b"ABC"], + ] + + +def test_repeat(): + assert ak.str.repeat(string, 3).tolist() == [ + ["αβγαβγαβγ", ""], + [], + ["→δε←→δε←→δε←", "ζz zζζz zζζz zζ", "abcabcabc"], + ] + assert ak.str.repeat(bytestring, 3).tolist() == [ + ["αβγαβγαβγ".encode(), b""], + [], + ["→δε←→δε←→δε←".encode(), "ζz zζζz zζζz zζ".encode(), b"abcabcabc"], + ] + + assert ak.str.repeat(string, [[3, 3], [], [2, 0, 1]]).tolist() == [ + ["αβγαβγαβγ", ""], + [], + ["→δε←→δε←", "", "abc"], + ] + assert ak.str.repeat(bytestring, [[3, 3], [], [2, 0, 1]]).tolist() == [ + ["αβγαβγαβγ".encode(), b""], + [], + ["→δε←→δε←".encode(), b"", b"abc"], + ] + + +def test_replace_slice(): + assert ak.str.replace_slice(string[:, :1], 1, 2, "qj").tolist() == [ + ["αqjγ"], # noqa: RUF001 + [], + ["→qjε←"], + ] + assert ak.str.replace_slice(bytestring[:, :1], 1, 2, b"qj").tolist() == [ + [b"\xceqj\xce\xb2\xce\xb3"], + [], + [b"\xe2qj\x92\xce\xb4\xce\xb5\xe2\x86\x90"], + ] + + +def test_reverse(): + assert ak.str.reverse(string).tolist() == [ + ["αβγ"[::-1], ""], + [], + ["→δε←"[::-1], "ζz zζ"[::-1], "abc"[::-1]], + ] + assert ak.str.reverse(bytestring).tolist() == [ + ["αβγ".encode()[::-1], b""], + [], + ["→δε←".encode()[::-1], "ζz zζ".encode()[::-1], b"abc"[::-1]], + ] + + +def test_replace_substring(): + assert ak.str.replace_substring(string, "βγ", "HELLO").tolist() == [ + ["αHELLO", ""], # noqa: RUF001 + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring(bytestring, "βγ".encode(), b"HELLO").tolist() == [ + ["αHELLO".encode(), b""], # noqa: RUF001 + 
[], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + assert ak.str.replace_substring( + string, "βγ", "HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring( + bytestring, "βγ".encode(), b"HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + +def test_replace_substring_regex(): + assert ak.str.replace_substring_regex(string, "βγ", "HELLO").tolist() == [ + ["αHELLO", ""], # noqa: RUF001 + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring_regex( + bytestring, "βγ".encode(), b"HELLO" + ).tolist() == [ + ["αHELLO".encode(), b""], # noqa: RUF001 + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + assert ak.str.replace_substring_regex( + string, "βγ", "HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring_regex( + bytestring, "βγ".encode(), b"HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + +def test_center(): + assert ak.str.center(string, 15, " ").tolist() == [ + [" αβγ ", " "], + [], + [" →δε← ", " ζz zζ ", " abc "], + ] + + print(ak.str.center(bytestring, 15, " ").tolist()) + + assert ak.str.center(bytestring, 15, b" ").tolist() == [ + [b" \xce\xb1\xce\xb2\xce\xb3 ", b" "], + [], + [ + b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", + b" \xce\xb6z z\xce\xb6 ", + b" abc ", + ], + ] + + +def test_lpad(): + assert ak.str.lpad(string, 15, " ").tolist() == [ + [" αβγ", " "], + [], + [" →δε←", " ζz zζ", " abc"], + ] + + print(ak.str.lpad(bytestring, 15, " ").tolist()) + + assert ak.str.lpad(bytestring, 15, b" ").tolist() == [ + [b" \xce\xb1\xce\xb2\xce\xb3", b" "], + [], + [ + b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90", + b" \xce\xb6z z\xce\xb6", + b" abc", + ], + ] + + +def test_rpad(): + assert ak.str.rpad(string, 15, " 
").tolist() == [ + ["αβγ ", " "], + [], + ["→δε← ", "ζz zζ ", "abc "], + ] + + print(ak.str.rpad(bytestring, 15, " ").tolist()) + + assert ak.str.rpad(bytestring, 15, b" ").tolist() == [ + [b"\xce\xb1\xce\xb2\xce\xb3 ", b" "], + [], + [ + b"\xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", + b"\xce\xb6z z\xce\xb6 ", + b"abc ", + ], + ] + + +def test_ltrim(): + assert ak.str.ltrim(string_padded, " ").tolist() == [ + ["αβγ ", ""], + [], + ["→δε← ", "ζz zζ ", "abc "], + ] + assert ak.str.ltrim(bytestring_padded, b" ").tolist() == [ + ["αβγ ".encode(), b""], + [], + ["→δε← ".encode(), "ζz zζ ".encode(), b"abc "], + ] + + +def test_ltrim_whitespace(): + assert ak.str.ltrim_whitespace(string_padded).tolist() == [ + ["αβγ ", ""], + [], + ["→δε← ", "ζz zζ ", "abc "], + ] + assert ak.str.ltrim_whitespace(bytestring_padded).tolist() == [ + ["αβγ ".encode(), b""], + [], + ["→δε← ".encode(), "ζz zζ ".encode(), b"abc "], + ] + + +def test_rtrim(): + assert ak.str.rtrim(string_padded, " ").tolist() == [ + [" αβγ", ""], + [], + [" →δε←", " ζz zζ", " abc"], + ] + assert ak.str.rtrim(bytestring_padded, b" ").tolist() == [ + [" αβγ".encode(), b""], + [], + [" →δε←".encode(), " ζz zζ".encode(), b" abc"], + ] + + +def test_rtrim_whitespace(): + assert ak.str.rtrim_whitespace(string_padded).tolist() == [ + [" αβγ", ""], + [], + [" →δε←", " ζz zζ", " abc"], + ] + assert ak.str.rtrim_whitespace(bytestring_padded).tolist() == [ + [" αβγ".encode(), b""], + [], + [" →δε←".encode(), " ζz zζ".encode(), b" abc"], + ] + + +def test_trim(): + assert ak.str.trim(string_padded, " ").tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.trim(bytestring_padded, b" ").tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + +def test_trim_whitespace(): + assert ak.str.trim_whitespace(string_padded).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.trim_whitespace(bytestring_padded).tolist() == [ + 
["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + +def test_slice(): + assert ak.str.slice(string, 1, 3).tolist() == [ + ["αβγ"[1:3], ""[1:3]], + [], + ["→δε←"[1:3], "ζz zζ"[1:3], "abc"[1:3]], + ] + assert ak.str.slice(bytestring, 1, 3).tolist() == [ + ["αβγ".encode()[1:3], b""[1:3]], + [], + ["→δε←".encode()[1:3], "ζz zζ".encode()[1:3], b"abc"[1:3]], + ] + + # ArrowInvalid: Negative buffer resize: -40 (looks like an Arrow bug) + # assert ak.str.slice(string, 1).tolist() == [ + # ["αβγ"[1:], ""[1:]], + # [], + # ["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]], + # ] + assert ak.str.slice(bytestring, 1).tolist() == [ + ["αβγ".encode()[1:], b""[1:]], + [], + ["→δε←".encode()[1:], "ζz zζ".encode()[1:], b"abc"[1:]], + ] + + +def test_split_whitespace(): + assert ak.str.split_whitespace(string_padded, max_splits=1).tolist() == [ + [["", "αβγ "], ["", " "]], + [], + [["", "→δε← "], ["", "ζz zζ "], ["", "abc "]], + ] + assert ak.str.split_whitespace( + string_padded, max_splits=1, reverse=True + ).tolist() == [ + [[" αβγ", ""], [" ", ""]], + [], + [[" →δε←", ""], [" ζz zζ", ""], [" abc", ""]], + ] + assert ak.str.split_whitespace(string_padded, max_splits=None).tolist() == [ + [["", "αβγ", "", ""], ["", "", ""]], + [], + [["", "→δε←", "", ""], ["", "ζz", "zζ", "", ""], ["", "abc", "", ""]], + ] + + # Bytestrings + assert ak.str.split_whitespace(bytestring_padded, max_splits=1).tolist() == [ + [[b"", "αβγ ".encode()], [b"", b""]], + [], + [ + [b"", "→δε← ".encode()], + [b"", "ζz zζ ".encode()], + [b"", b"abc "], + ], + ] + assert ak.str.split_whitespace( + bytestring_padded, max_splits=1, reverse=True + ).tolist() == [ + [[" αβγ".encode(), b""], [b"", b""]], + [], + [ + [" →δε←".encode(), b""], + [" ζz zζ".encode(), b""], + [b" abc", b""], + ], + ] + assert ak.str.split_whitespace(bytestring_padded, max_splits=None).tolist() == [ + [[b"", "αβγ".encode(), b""], [b"", b""]], + [], + [ + [b"", "→δε←".encode(), b""], + [b"", "ζz".encode(), "zζ".encode(), 
b""], + [b"", b"abc", b""], + ], + ] + + +def test_split_pattern(): + assert ak.str.split_pattern(string_repeats, "123", max_splits=1).tolist() == [ + [["foo", "bar123baz"], ["foo"], ["bar"]], + [["", "foo"], ["456bar"], ["foo", "456bar"]], + [], + ] + assert ak.str.split_pattern( + string_repeats, "123", max_splits=1, reverse=True + ).tolist() == [ + [["foo123bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["456bar"], ["foo", "456bar"]], + [], + ] + assert ak.str.split_pattern(string_repeats, "123", max_splits=None).tolist() == [ + [["foo", "bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["456bar"], ["foo", "456bar"]], + [], + ] + + # Bytestrings + assert ak.str.split_pattern(bytestring_repeats, b"123", max_splits=1).tolist() == [ + [[b"foo", b"bar123baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"456bar"], [b"foo", b"456bar"]], + [], + ] + assert ak.str.split_pattern( + bytestring_repeats, b"123", max_splits=1, reverse=True + ).tolist() == [ + [[b"foo123bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"456bar"], [b"foo", b"456bar"]], + [], + ] + assert ak.str.split_pattern( + bytestring_repeats, b"123", max_splits=None + ).tolist() == [ + [[b"foo", b"bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"456bar"], [b"foo", b"456bar"]], + [], + ] + + +def test_split_pattern_regex(): + assert ak.str.split_pattern_regex( + string_repeats, r"\d{3}", max_splits=1 + ).tolist() == [ + [["foo", "bar123baz"], ["foo"], ["bar"]], + [["", "foo"], ["", "bar"], ["foo", "456bar"]], + [], + ] + with pytest.raises( + pyarrow.ArrowNotImplementedError, match=r"split in reverse with regex" + ): + assert ak.str.split_pattern_regex( + string_repeats, r"\d{3}", max_splits=1, reverse=True + ).tolist() == [ + [["foo123bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["", "bar"], ["foo", "456bar"]], + [], + ] + assert ak.str.split_pattern_regex( + string_repeats, r"\d{3}", max_splits=None + ).tolist() == [ + [["foo", "bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["", 
"bar"], ["foo", "", "bar"]], + [], + ] + + # Bytestrings + assert ak.str.split_pattern_regex( + bytestring_repeats, rb"\d{3}", max_splits=1 + ).tolist() == [ + [[b"foo", b"bar123baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"", b"bar"], [b"foo", b"456bar"]], + [], + ] + with pytest.raises( + pyarrow.ArrowNotImplementedError, match=r"split in reverse with regex" + ): + assert ak.str.split_pattern_regex( + bytestring_repeats, rb"\d{3}", max_splits=1, reverse=True + ).tolist() == [ + [[b"foo123bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"", b"bar"], [b"foo", b"456bar"]], + [], + ] + assert ak.str.split_pattern_regex( + bytestring_repeats, rb"\d{3}", max_splits=None + ).tolist() == [ + [[b"foo", b"bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"", b"bar"], [b"foo", b"", b"bar"]], + [], + ] + + +def test_extract_regex(): + assert ak.str.extract_regex( + ak.Array([["one1", "two2", "three3"], [], ["four4", "five5"]]), + "(?P<vowel>[aeiou])(?P<number>[0-9]+)", + ).tolist() == [ + [ + {"vowel": "e", "number": "1"}, + {"vowel": "o", "number": "2"}, + {"vowel": "e", "number": "3"}, + ], + [], + [None, {"vowel": "e", "number": "5"}], + ] + + assert ak.str.extract_regex( + ak.Array([[b"one1", b"two2", b"three3"], [], [b"four4", b"five5"]]), + b"(?P<vowel>[aeiou])(?P<number>[0-9]+)", + ).tolist() == [ + [ + {"vowel": b"e", "number": b"1"}, + {"vowel": b"o", "number": b"2"}, + {"vowel": b"e", "number": b"3"}, + ], + [], + [None, {"vowel": b"e", "number": b"5"}], + ] + + +def test_join(): + array1 = ak.Array( + [ + ["this", "that"], + [], + ["foo", "bar", "baz"], + ] + ) + assert ak.str.join(array1, "-").tolist() == ["this-that", "", "foo-bar-baz"] + + separator = ak.Array(["→", "↑", "←"]) + assert ak.str.join(array1, separator).tolist() == ["this→that", "", "foo←bar←baz"] + + array2 = ak.Array( + [ + [b"this", b"that"], + [], + [b"foo", b"bar", b"baz"], + ] + ) + assert ak.str.join(array2, b"-").tolist() == [b"this-that", b"", b"foo-bar-baz"] + + separator = ak.Array(["→".encode(),
"↑".encode(), "←".encode()]) + assert ak.str.join(array2, separator).tolist() == [ + "this→that".encode(), + b"", + "foo←bar←baz".encode(), + ] + + +def test_join_element_wise(): + array1 = ak.Array([["one", "two", "three"], [], ["four", "five"]]) + array2 = ak.Array([["111", "222", "333"], [], ["444", "555"]]) + separator = ak.Array(["→", "↑", "←"]) + + assert ak.str.join_element_wise(array1, array2, separator).tolist() == [ + ["one→111", "two→222", "three→333"], + [], + ["four←444", "five←555"], + ] + + array1 = ak.Array([[b"one", b"two", b"three"], [], [b"four", b"five"]]) + array2 = ak.Array([[b"111", b"222", b"333"], [], [b"444", b"555"]]) + separator = ak.Array(["→".encode(), "↑".encode(), "←".encode()]) + + assert ak.str.join_element_wise(array1, array2, separator).tolist() == [ + ["one→111".encode(), "two→222".encode(), "three→333".encode()], + [], + ["four←444".encode(), "five←555".encode()], + ] + + +def test_count_substring(): + assert ak.str.count_substring(string_repeats, "BA").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring(string_repeats, "BA", ignore_case=True).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + + # Bytestrings + assert ak.str.count_substring(bytestring_repeats, b"BA").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring( + bytestring_repeats, b"BA", ignore_case=True + ).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + + +def test_count_substring_regex(): + assert ak.str.count_substring_regex(string_repeats, r"BA\d*").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring_regex( + string_repeats, r"BA\d*", ignore_case=True + ).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + assert ak.str.count_substring_regex(string_repeats, r"\d{1,}").tolist() == [ + [2, 0, 0], + [1, 1, 1], + [], + ] + + # Bytestrings + assert ak.str.count_substring_regex(bytestring_repeats, rb"BA\d*").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert 
ak.str.count_substring_regex( + bytestring_repeats, rb"BA\d*", ignore_case=True + ).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + assert ak.str.count_substring_regex(bytestring_repeats, rb"\d{1,}").tolist() == [ + [2, 0, 0], + [1, 1, 1], + [], + ] + + +def test_ends_with(): + assert ak.str.ends_with(string_repeats, "BAR").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.ends_with(string_repeats, "BAR", ignore_case=True).tolist() == [ + [False, False, True], + [False, True, True], + [], + ] + + # Bytestrings + assert ak.str.ends_with(bytestring_repeats, b"BAR").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.ends_with(bytestring_repeats, b"BAR", ignore_case=True).tolist() == [ + [False, False, True], + [False, True, True], + [], + ] + + +def test_starts_with(): + assert ak.str.starts_with(string_repeats, "FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.starts_with(string_repeats, "FOO", ignore_case=True).tolist() == [ + [True, True, False], + [False, False, True], + [], + ] + + # Bytestrings + assert ak.str.starts_with(bytestring_repeats, b"FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.starts_with( + bytestring_repeats, b"FOO", ignore_case=True + ).tolist() == [ + [True, True, False], + [False, False, True], + [], + ] + + +def test_find_substring(): + assert ak.str.find_substring(string_repeats, "FOO").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring(string_repeats, "FOO", ignore_case=True).tolist() == [ + [0, 0, -1], + [3, -1, 0], + [], + ] + + # Bytestrings + assert ak.str.find_substring(bytestring_repeats, b"FOO").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring( + bytestring_repeats, b"FOO", ignore_case=True + ).tolist() == [ + [0, 0, -1], + [3, -1, 0], + [], + ] + + +def test_find_substring_regex(): + 
assert ak.str.find_substring_regex(string_repeats, r"FOO\d+").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring_regex( + string_repeats, r"FOO\d+", ignore_case=True + ).tolist() == [ + [0, -1, -1], + [-1, -1, 0], + [], + ] + + # Bytestrings + assert ak.str.find_substring_regex(bytestring_repeats, rb"FOO\d+").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring_regex( + bytestring_repeats, rb"FOO\d+", ignore_case=True + ).tolist() == [ + [0, -1, -1], + [-1, -1, 0], + [], + ] + + +def test_match_like(): + assert ak.str.match_like(string_repeats, "FOO%BA%").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_like(string_repeats, "FOO%BA%", ignore_case=True).tolist() == [ + [True, False, False], + [False, False, True], + [], + ] + + # Bytestrings + assert ak.str.match_like(bytestring_repeats, b"FOO%BA%").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_like( + bytestring_repeats, b"FOO%BA%", ignore_case=True + ).tolist() == [ + [True, False, False], + [False, False, True], + [], + ] + + +def test_match_substring(): + assert ak.str.match_substring(string_repeats, "FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring(string_repeats, "FOO", ignore_case=True).tolist() == [ + [True, True, False], + [True, False, True], + [], + ] + + # Bytestrings + assert ak.str.match_substring(bytestring_repeats, b"FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring( + bytestring_repeats, b"FOO", ignore_case=True + ).tolist() == [ + [True, True, False], + [True, False, True], + [], + ] + + +def test_match_substring_regex(): + assert ak.str.match_substring_regex(string_repeats, r"FOO\d+").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring_regex( + 
string_repeats, r"FOO\d+", ignore_case=True + ).tolist() == [ + [True, False, False], + [False, False, True], + [], + ] + + # Bytestrings + assert ak.str.match_substring_regex(bytestring_repeats, rb"FOO\d+").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring_regex( + bytestring_repeats, rb"FOO\d+", ignore_case=True + ).tolist() == [ + [True, False, False], + [False, False, True], + [], + ] + + +def test_is_in(): + assert ak.str.is_in(string_repeats, ["123foo", "foo"]).tolist() == [ + [False, True, False], + [True, False, False], + [], + ] + assert ak.str.is_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + ).tolist() == [ + [False, True, False], + [True, False, False], + [True], + ] + assert ak.str.is_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + skip_nones=True, + ).tolist() == [ + [False, True, False], + [True, False, False], + [False], + ] + + # Bytestrings + + assert ak.str.is_in(string_repeats, [b"123foo", b"foo"]).tolist() == [ + [False, True, False], + [True, False, False], + [], + ] + assert ak.str.is_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + ).tolist() == [ + [False, True, False], + [True, False, False], + [True], + ] + assert ak.str.is_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + skip_nones=True, + ).tolist() == [ + [False, True, False], + [True, False, False], + [False], + ] + + +def test_index_in(): + assert ak.str.index_in(string_repeats, ["123foo", "foo"]).tolist() == [ + [None, 1, None], + [0, None, None], + [], + ] + assert ak.str.index_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", 
"foo", None], + ).tolist() == [ + [None, 1, None], + [0, None, None], + [2], + ] + assert ak.str.index_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + skip_nones=True, + ).tolist() == [ + [None, 1, None], + [0, None, None], + [None], + ] + + # Bytestrings + + assert ak.str.index_in(string_repeats, [b"123foo", b"foo"]).tolist() == [ + [None, 1, None], + [0, None, None], + [], + ] + assert ak.str.index_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + ).tolist() == [ + [None, 1, None], + [0, None, None], + [2], + ] + assert ak.str.index_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + skip_nones=True, + ).tolist() == [ + [None, 1, None], + [0, None, None], + [None], + ]