From e497e2f462584246390ddc59b1f958bc931ce8c2 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Tue, 28 Jan 2025 16:31:14 -0500 Subject: [PATCH 1/6] feat: add a non-touching ak.zip, called 'ak.unsafe_zip' --- src/awkward/operations/__init__.py | 1 + src/awkward/operations/ak_unsafe_zip.py | 192 ++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 src/awkward/operations/ak_unsafe_zip.py diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py index 91ebc9c184..49964b4066 100644 --- a/src/awkward/operations/__init__.py +++ b/src/awkward/operations/__init__.py @@ -109,6 +109,7 @@ from awkward.operations.ak_transform import * from awkward.operations.ak_type import * from awkward.operations.ak_unflatten import * +from awkward.operations.ak_unsafe_zip import * from awkward.operations.ak_unzip import * from awkward.operations.ak_validity_error import * from awkward.operations.ak_values_astype import * diff --git a/src/awkward/operations/ak_unsafe_zip.py b/src/awkward/operations/ak_unsafe_zip.py new file mode 100644 index 0000000000..3094e85e92 --- /dev/null +++ b/src/awkward/operations/ak_unsafe_zip.py @@ -0,0 +1,192 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE + +from __future__ import annotations + +from collections.abc import Mapping +from functools import reduce + +import awkward as ak +from awkward._dispatch import high_level_function +from awkward._layout import HighLevelContext, ensure_same_backend +from awkward._namedaxis import _get_named_axis, _unify_named_axis +from awkward._nplikes.numpy_like import NumpyMetadata + +__all__ = ("unsafe_zip",) + +np = NumpyMetadata.instance() + + +@high_level_function() +def unsafe_zip( + arrays, + *, + parameters=None, + with_name=None, + highlevel=True, + behavior=None, + attrs=None, +): + """ + Args: + arrays (mapping or sequence of arrays): Each value in this mapping or + sequence can be any array-like data that #ak.to_layout recognizes. + parameters (None or dict): Parameters for the new + #ak.contents.RecordArray node that is created by this operation. + with_name (None or str): Assigns a `"__record__"` name to the new + #ak.contents.RecordArray node that is created by this operation + (overriding `parameters`, if necessary). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + attrs (None or dict): Custom attributes for the output array, if + high-level. + + Combines `arrays` into a single structure as the fields of a collection + of records or the slots of a collection of tuples. + + Caution: unlike #ak.zip this function will _not_ broadcast the arrays together. + It assumes that the given arrays have already the same layouts and lengths. + + This operation may be thought of as the opposite of projection in + #ak.Array.__getitem__, which extracts fields one at a time, or + #ak.unzip, which extracts them all in one call. + + Consider the following arrays, `one` and `two`. + + >>> one = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5], [6.6]]) + >>> two = ak.Array([["a", "b", "c"], [], ["d", "e"], ["f"]]) + + Zipping them together using a dict creates a collection of records with + the same nesting structure as `one` and `two`. + + >>> ak.unsafe_zip({"x": one, "y": two}).show() + [[{x: 1.1, y: 'a'}, {x: 2.2, y: 'b'}, {x: 3.3, y: 'c'}], + [], + [{x: 4.4, y: 'd'}], + []] + + Doing so with a list creates tuples, whose fields are not named. + + >>> ak.zip([one, two]).show() + [[(1.1, 'a'), (2.2, 'b'), (3.3, 'c')], + [], + [(4.4, 'd')], + []] + + See also #ak.zip and #ak.unzip. + """ + # Dispatch + if isinstance(arrays, Mapping): + yield arrays.values() + else: + yield arrays + + # Implementation + return _impl( + arrays, + parameters, + with_name, + highlevel, + behavior, + attrs, + ) + + +def _impl( + arrays, + parameters, + with_name, + highlevel, + behavior, + attrs, +): + with HighLevelContext(behavior=behavior, attrs=attrs) as ctx: + if isinstance(arrays, Mapping): + layouts = ensure_same_backend( + *( + ctx.unwrap( + x, + allow_record=False, + allow_unknown=False, + none_policy="pass-through", + primitive_policy="pass-through", + ) + for x in arrays.values() + ) + ) + fields = list(arrays.keys()) + + # propagate named axis from input to output, + # use strategy "unify" (see: awkward._namedaxis) + out_named_axis = reduce( + _unify_named_axis, map(_get_named_axis, arrays.values()) + ) + + else: + layouts = ensure_same_backend( + *( + ctx.unwrap( + x, + allow_record=False, + allow_unknown=False, + none_policy="pass-through", + primitive_policy="pass-through", + ) + for x in arrays + ) + ) + fields = None + + # propagate named axis from input to output, + # use strategy "unify" (see: awkward._namedaxis) + out_named_axis = reduce(_unify_named_axis, map(_get_named_axis, arrays)) + + # determine backend + backend = next((b.backend for b in layouts if hasattr(b, "backend")), "cpu") + + if with_name is not None: + if parameters is None: + parameters = {} + else: + parameters = dict(parameters) + parameters["__record__"] = with_name + + # only allow all NumpyArrays and ListOffsetArrays + # maybe this could be done recursively, but for now just check the top level. This is also how ak.zip works. + if all(isinstance(layout, ak.contents.NumpyArray) for layout in layouts): + length = layouts[0].length + out = ak.contents.RecordArray( + layouts, fields, length=length, parameters=parameters, backend=backend + ) + elif all(isinstance(layout, ak.contents.ListOffsetArray) for layout in layouts): + contents = [] + for layout in layouts: + if not isinstance(layout.content, ak.contents.NumpyArray): + raise ValueError( + "can not (unsafe) zip ListOffsetArrays with non-NumpyArray contents" + ) + contents.append(layout.content) + # just get from the first one + offsets = layouts[0].offsets + length = layouts[0].length + out = ak.contents.ListOffsetArray( + offsets=offsets, + content=ak.contents.RecordArray( + contents, fields, length=length, parameters=parameters, backend=backend + ), + ) + else: + raise ValueError( + "all array layouts must be either NumpyArrays or ListOffsetArrays" + ) + + # Unify named axes propagated through the broadcast + wrapped_out = ctx.wrap(out, highlevel=highlevel) + return ak.operations.ak_with_named_axis._impl( + wrapped_out, + named_axis=out_named_axis, + highlevel=highlevel, + behavior=ctx.behavior, + attrs=ctx.attrs, + ) From eb4d4f80250a7e5edc37825fa95f1bbcbc6f9ae3 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Tue, 28 Jan 2025 17:03:28 -0500 Subject: [PATCH 2/6] fix getting correct length --- src/awkward/operations/ak_unsafe_zip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/operations/ak_unsafe_zip.py b/src/awkward/operations/ak_unsafe_zip.py index 3094e85e92..78529300c2 100644 --- a/src/awkward/operations/ak_unsafe_zip.py +++ b/src/awkward/operations/ak_unsafe_zip.py @@ -169,7 +169,7 @@ def _impl( contents.append(layout.content) # just get from the first one offsets = layouts[0].offsets - length = layouts[0].length + length = layouts[0].content.length out = ak.contents.ListOffsetArray( offsets=offsets, content=ak.contents.RecordArray( From f4f1e0d47fa009a5b8ca5090824cdbc03d738c44 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Tue, 28 Jan 2025 17:04:13 -0500 Subject: [PATCH 3/6] add tests --- tests/test_3390_ak_unsafe_zip.py | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/test_3390_ak_unsafe_zip.py diff --git a/tests/test_3390_ak_unsafe_zip.py b/tests/test_3390_ak_unsafe_zip.py new file mode 100644 index 0000000000..c98faf854d --- /dev/null +++ b/tests/test_3390_ak_unsafe_zip.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE +# ruff: noqa: E402 + +from __future__ import annotations + +import awkward as ak + + +def test_ak_unsafe_zip_NumpyArray_dict(): + a = ak.Array([1]) + b = ak.Array([2]) + c = ak.unsafe_zip({"a": a, "b": b}) + assert ak.to_list(c) == ak.to_list(ak.zip({"a": a, "b": b})) + + +def test_ak_unsafe_zip_ListOffsetArray_dict(): + a = ak.Array([[1], []]) + b = ak.Array([[2], []]) + c = ak.unsafe_zip({"a": a, "b": b}) + assert ak.to_list(c) == ak.to_list(ak.zip({"a": a, "b": b})) + + +def test_ak_unsafe_zip_NumpyArray_list(): + a = ak.Array([1]) + b = ak.Array([2]) + c = ak.unsafe_zip([a, b]) + assert ak.to_list(c) == ak.to_list(ak.zip([a, b])) + + +def test_ak_unsafe_zip_ListOffsetArray_list(): + a = ak.Array([[1], []]) + b = ak.Array([[2], []]) + c = ak.unsafe_zip([a, b]) + assert ak.to_list(c) == ak.to_list(ak.zip([a, b])) + + +def test_typetracer_NumpyArray_non_touching(): + tracer = ak.Array([1], backend="typetracer") + + tracer, report = ak.typetracer.typetracer_with_report( + tracer.layout.form_with_key(), highlevel=True + ) + + _ = ak.unsafe_zip({"foo": tracer, "bar": tracer}) + assert len(report.shape_touched) == 1 + assert len(report.data_touched) == 0 + + +def test_typetracer_ListOffsetArray_non_touching(): + tracer = ak.Array([[1], [], [2, 3]], backend="typetracer") + + tracer, report = ak.typetracer.typetracer_with_report( + tracer.layout.form_with_key(), highlevel=True + ) + + _ = ak.unsafe_zip({"foo": tracer, "bar": tracer}) + assert len(report.shape_touched) == 1 + assert len(report.data_touched) == 0 From 406de57acc236d574ed01b6648c51a030986346d Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Tue, 28 Jan 2025 17:37:38 -0500 Subject: [PATCH 4/6] check same lengths (more safe) --- src/awkward/operations/ak_unsafe_zip.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/awkward/operations/ak_unsafe_zip.py b/src/awkward/operations/ak_unsafe_zip.py index 78529300c2..1395490e6a 100644 --- a/src/awkward/operations/ak_unsafe_zip.py +++ b/src/awkward/operations/ak_unsafe_zip.py @@ -155,7 +155,7 @@ def _impl( # only allow all NumpyArrays and ListOffsetArrays # maybe this could be done recursively, but for now just check the top level. This is also how ak.zip works. if all(isinstance(layout, ak.contents.NumpyArray) for layout in layouts): - length = layouts[0].length + length = _check_equal_lengths(layouts) out = ak.contents.RecordArray( layouts, fields, length=length, parameters=parameters, backend=backend ) @@ -169,7 +169,7 @@ def _impl( contents.append(layout.content) # just get from the first one offsets = layouts[0].offsets - length = layouts[0].content.length + length = _check_equal_lengths([layout.content for layout in layouts]) out = ak.contents.ListOffsetArray( offsets=offsets, content=ak.contents.RecordArray( @@ -190,3 +190,13 @@ def _impl( behavior=ctx.behavior, attrs=ctx.attrs, ) + + +def _check_equal_lengths( + layouts: ak.contents.Content, +) -> int | ak._nplikes.shape.UnknownLength: + length = layouts[0].length + for layout in layouts: + if layout.length != length: + raise ValueError("all arrays must have the same length") + return length From a6d0f8074e04fa7c92ce1b3c38233d72c6de7dee Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 29 Jan 2025 13:50:30 -0500 Subject: [PATCH 5/6] unsafe_zip -> zip_no_broadcast; check equal offsets at runtime with actual data --- src/awkward/operations/__init__.py | 2 +- ...k_unsafe_zip.py => ak_zip_no_broadcast.py} | 42 ++++++++++++++----- ...ip.py => test_3390_ak_zip_no_broadcast.py} | 20 ++++----- 3 files changed, 42 insertions(+), 22 deletions(-) rename src/awkward/operations/{ak_unsafe_zip.py => ak_zip_no_broadcast.py} (81%) rename tests/{test_3390_ak_unsafe_zip.py => test_3390_ak_zip_no_broadcast.py} (71%) diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py index 49964b4066..306a49bb48 100644 --- a/src/awkward/operations/__init__.py +++ b/src/awkward/operations/__init__.py @@ -109,7 +109,6 @@ from awkward.operations.ak_transform import * from awkward.operations.ak_type import * from awkward.operations.ak_unflatten import * -from awkward.operations.ak_unsafe_zip import * from awkward.operations.ak_unzip import * from awkward.operations.ak_validity_error import * from awkward.operations.ak_values_astype import * @@ -124,3 +123,4 @@ from awkward.operations.ak_without_parameters import * from awkward.operations.ak_zeros_like import * from awkward.operations.ak_zip import * +from awkward.operations.ak_zip_no_broadcast import * diff --git a/src/awkward/operations/ak_unsafe_zip.py b/src/awkward/operations/ak_zip_no_broadcast.py similarity index 81% rename from src/awkward/operations/ak_unsafe_zip.py rename to src/awkward/operations/ak_zip_no_broadcast.py index 1395490e6a..729157a23c 100644 --- a/src/awkward/operations/ak_unsafe_zip.py +++ b/src/awkward/operations/ak_zip_no_broadcast.py @@ -11,13 +11,13 @@ from awkward._namedaxis import _get_named_axis, _unify_named_axis from awkward._nplikes.numpy_like import NumpyMetadata -__all__ = ("unsafe_zip",) +__all__ = ("zip_no_broadcast",) np = NumpyMetadata.instance() @high_level_function() -def unsafe_zip( +def zip_no_broadcast( arrays, *, parameters=None, @@ -46,7 +46,7 @@ def unsafe_zip( of records or the slots of a collection of tuples. Caution: unlike #ak.zip this function will _not_ broadcast the arrays together. - It assumes that the given arrays have already the same layouts and lengths. + During typetracing, it assumes that the given arrays have already the same layouts and lengths. This operation may be thought of as the opposite of projection in #ak.Array.__getitem__, which extracts fields one at a time, or @@ -60,7 +60,7 @@ def unsafe_zip( Zipping them together using a dict creates a collection of records with the same nesting structure as `one` and `two`. - >>> ak.unsafe_zip({"x": one, "y": two}).show() + >>> ak.zip_no_broadcast({"x": one, "y": two}).show() [[{x: 1.1, y: 'a'}, {x: 2.2, y: 'b'}, {x: 3.3, y: 'c'}], [], [{x: 4.4, y: 'd'}], @@ -153,7 +153,6 @@ def _impl( parameters["__record__"] = with_name # only allow all NumpyArrays and ListOffsetArrays - # maybe this could be done recursively, but for now just check the top level. This is also how ak.zip works. if all(isinstance(layout, ak.contents.NumpyArray) for layout in layouts): length = _check_equal_lengths(layouts) out = ak.contents.RecordArray( @@ -162,14 +161,35 @@ def _impl( elif all(isinstance(layout, ak.contents.ListOffsetArray) for layout in layouts): contents = [] for layout in layouts: + # get the content of the ListOffsetArray if not isinstance(layout.content, ak.contents.NumpyArray): raise ValueError( "can not (unsafe) zip ListOffsetArrays with non-NumpyArray contents" ) contents.append(layout.content) - # just get from the first one - offsets = layouts[0].offsets - length = _check_equal_lengths([layout.content for layout in layouts]) + + if backend.name == "typetracer": + # just get from the first one + # we're in typetracer mode, so we can't check the offsets (see else branch) + offsets = layouts[0].offsets + else: + # this is at 'runtime' with actual data, that means we can check the offsets, + # but only those that have actual data, i.e. no PlaceholderArrays + # so first, let's filter out any PlaceholderArrays + comparable_offsets = filter( + lambda o: not isinstance(o, ak._nplikes.placeholder.PlaceholderArray), + (layout.offsets for layout in layouts), + ) + # check that offsets are the same + first = next(comparable_offsets) + if not all( + first.nplike.all(offsets.data == first.data) + for offsets in comparable_offsets + ): + raise ValueError("all ListOffsetArrays must have the same offsets") + offsets = first + + length = _check_equal_lengths(contents) out = ak.contents.ListOffsetArray( offsets=offsets, content=ak.contents.RecordArray( @@ -193,10 +213,10 @@ def _impl( def _check_equal_lengths( - layouts: ak.contents.Content, + contents: ak.contents.Content, ) -> int | ak._nplikes.shape.UnknownLength: - length = layouts[0].length - for layout in layouts: + length = contents[0].length + for layout in contents: if layout.length != length: raise ValueError("all arrays must have the same length") return length diff --git a/tests/test_3390_ak_unsafe_zip.py b/tests/test_3390_ak_zip_no_broadcast.py similarity index 71% rename from tests/test_3390_ak_unsafe_zip.py rename to tests/test_3390_ak_zip_no_broadcast.py index c98faf854d..fe327ce4ec 100644 --- a/tests/test_3390_ak_unsafe_zip.py +++ b/tests/test_3390_ak_zip_no_broadcast.py @@ -6,31 +6,31 @@ import awkward as ak -def test_ak_unsafe_zip_NumpyArray_dict(): +def test_ak_zip_no_broadcast_NumpyArray_dict(): a = ak.Array([1]) b = ak.Array([2]) - c = ak.unsafe_zip({"a": a, "b": b}) + c = ak.zip_no_broadcast({"a": a, "b": b}) assert ak.to_list(c) == ak.to_list(ak.zip({"a": a, "b": b})) -def test_ak_unsafe_zip_ListOffsetArray_dict(): +def test_ak_zip_no_broadcast_ListOffsetArray_dict(): a = ak.Array([[1], []]) b = ak.Array([[2], []]) - c = ak.unsafe_zip({"a": a, "b": b}) + c = ak.zip_no_broadcast({"a": a, "b": b}) assert ak.to_list(c) == ak.to_list(ak.zip({"a": a, "b": b})) -def test_ak_unsafe_zip_NumpyArray_list(): +def test_ak_zip_no_broadcast_NumpyArray_list(): a = ak.Array([1]) b = ak.Array([2]) - c = ak.unsafe_zip([a, b]) + c = ak.zip_no_broadcast([a, b]) assert ak.to_list(c) == ak.to_list(ak.zip([a, b])) -def test_ak_unsafe_zip_ListOffsetArray_list(): +def test_ak_zip_no_broadcast_ListOffsetArray_list(): a = ak.Array([[1], []]) b = ak.Array([[2], []]) - c = ak.unsafe_zip([a, b]) + c = ak.zip_no_broadcast([a, b]) assert ak.to_list(c) == ak.to_list(ak.zip([a, b])) @@ -41,7 +41,7 @@ def test_typetracer_NumpyArray_non_touching(): tracer.layout.form_with_key(), highlevel=True ) - _ = ak.unsafe_zip({"foo": tracer, "bar": tracer}) + _ = ak.zip_no_broadcast({"foo": tracer, "bar": tracer}) assert len(report.shape_touched) == 1 assert len(report.data_touched) == 0 @@ -53,6 +53,6 @@ def test_typetracer_ListOffsetArray_non_touching(): tracer.layout.form_with_key(), highlevel=True ) - _ = ak.unsafe_zip({"foo": tracer, "bar": tracer}) + _ = ak.zip_no_broadcast({"foo": tracer, "bar": tracer}) assert len(report.shape_touched) == 1 assert len(report.data_touched) == 0 From 143214b20fd42225ac392dd5d8081132f14abde5 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 29 Jan 2025 13:53:37 -0500 Subject: [PATCH 6/6] fix typo in doc string --- src/awkward/operations/ak_zip_no_broadcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/operations/ak_zip_no_broadcast.py b/src/awkward/operations/ak_zip_no_broadcast.py index 729157a23c..87e7ccb1b2 100644 --- a/src/awkward/operations/ak_zip_no_broadcast.py +++ b/src/awkward/operations/ak_zip_no_broadcast.py @@ -68,7 +68,7 @@ def zip_no_broadcast( Doing so with a list creates tuples, whose fields are not named. - >>> ak.zip([one, two]).show() + >>> ak.zip_no_broadcast([one, two]).show() [[(1.1, 'a'), (2.2, 'b'), (3.3, 'c')], [], [(4.4, 'd')],