From 4124889b8213dacac9c80649cc09744453f397ca Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 19 Sep 2023 00:19:00 +0200 Subject: [PATCH] fix: support placeholders in `from_buffers` (#2714) * fix: support unknown lengths in NumpyArray * refactor: don't handle placeholder arrays in `nplike.frombuffer` * fix: ensure that slicing enforces typetracer invariant * fix: placeholders are 0-bytes * fix: support unknown lengths in `from_buffers` * chore: improve comment * fix: always reshape buffers * refactor: appease pylint * test: add simple tests * fix: ensure unions handle placeholders in either branch --- src/awkward/_nplikes/array_module.py | 10 +- src/awkward/_nplikes/placeholder.py | 30 +- src/awkward/forms/form.py | 5 +- src/awkward/forms/numpyform.py | 6 +- src/awkward/operations/ak_from_buffers.py | 99 ++-- tests/test_2714_from_buffers_placeholders.py | 450 +++++++++++++++++++ 6 files changed, 552 insertions(+), 48 deletions(-) create mode 100644 tests/test_2714_from_buffers_placeholders.py diff --git a/src/awkward/_nplikes/array_module.py b/src/awkward/_nplikes/array_module.py index 4e46bf5b73..291fec4623 100644 --- a/src/awkward/_nplikes/array_module.py +++ b/src/awkward/_nplikes/array_module.py @@ -53,15 +53,11 @@ def ascontiguousarray(self, x: ArrayLike) -> ArrayLike: return self._module.ascontiguousarray(x) def frombuffer( - self, buffer, *, dtype: np.dtype | None = None, count: int = -1 + self, buffer, *, dtype: np.dtype | None = None, count: ShapeItem = -1 ) -> ArrayLike: if isinstance(buffer, PlaceholderArray): - if count == -1: - return self.asarray(buffer) - else: - return self.asarray(buffer[:count]) - else: - return self._module.frombuffer(buffer, dtype=dtype, count=count) + raise TypeError("placeholder arrays are not supported in `frombuffer`") + return self._module.frombuffer(buffer, dtype=dtype, count=count) def from_dlpack(self, x: Any) -> ArrayLike: return self._module.from_dlpack(x) diff --git a/src/awkward/_nplikes/placeholder.py 
b/src/awkward/_nplikes/placeholder.py index 2d5612ac7d..c401121d03 100644 --- a/src/awkward/_nplikes/placeholder.py +++ b/src/awkward/_nplikes/placeholder.py @@ -17,7 +17,7 @@ def __init__( ): self._nplike = nplike self._shape = shape - self._dtype = dtype + self._dtype = np.dtype(dtype) @property def dtype(self) -> np.dtype: @@ -37,7 +37,7 @@ def size(self) -> int: @property def nbytes(self) -> int: - return self.size * self._dtype.itemsize + return 0 @property def strides(self) -> tuple[int, ...]: @@ -67,13 +67,29 @@ def view(self, dtype: dtype) -> Self: return type(self)(self._nplike, shape, dtype) def __getitem__(self, index): + # Typetracers permit slices that don't touch data or shapes if isinstance(index, slice): - if self._shape[0] is unknown_length: - return type(self)(self._nplike, self._shape, self._dtype) + length = self._shape[0] + + # Unknown-length placeholders should not be sliced (as their shapes would be touched) + if length is unknown_length: + raise AssertionError( + "placeholder arrays that are sliced should have known shapes" + ) + # Known-length placeholders *always* need a known shape + elif ( + index.start is unknown_length + or index.stop is unknown_length + or index.step is unknown_length + ): + raise AssertionError( + "known-length placeholders should never encounter unknown lengths in slices" + ) else: - start, stop, step = index.indices(self._shape[0]) - new_shape = ((stop - start) // step,) - return type(self)(self._nplike, new_shape, self._dtype) + start, stop, step = index.indices(length) + new_length = (stop - start) // step + + return type(self)(self._nplike, (new_length,), self._dtype) else: raise TypeError( f"{type(self).__name__} supports only trivial slices, not {type(index).__name__}" diff --git a/src/awkward/forms/form.py b/src/awkward/forms/form.py index a661a2c52a..fc433d7fb5 100644 --- a/src/awkward/forms/form.py +++ b/src/awkward/forms/form.py @@ -46,7 +46,10 @@ def from_dict(input: Mapping) -> Form: if input["class"] ==
"NumpyArray": primitive = input["primitive"] - inner_shape = input.get("inner_shape", []) + inner_shape = tuple( + unknown_length if item is None else item + for item in input.get("inner_shape", []) + ) return ak.forms.NumpyForm( primitive, inner_shape, parameters=parameters, form_key=form_key ) diff --git a/src/awkward/forms/numpyform.py b/src/awkward/forms/numpyform.py index 34dfb23694..aba2a85f4e 100644 --- a/src/awkward/forms/numpyform.py +++ b/src/awkward/forms/numpyform.py @@ -7,6 +7,7 @@ import awkward as ak from awkward._errors import deprecate from awkward._nplikes.numpylike import NumpyMetadata +from awkward._nplikes.shape import unknown_length from awkward._parameters import type_parameters_equal from awkward._typing import JSONSerializable, Self, final from awkward._util import UNSET @@ -139,7 +140,10 @@ def _to_dict_part(self, verbose, toplevel): "primitive": self._primitive, } if verbose or len(self._inner_shape) > 0: - out["inner_shape"] = list(self._inner_shape) + out["inner_shape"] = [ + None if item is unknown_length else item + for item in self._inner_shape + ] return self._to_dict_extra(out, verbose) @property diff --git a/src/awkward/operations/ak_from_buffers.py b/src/awkward/operations/ak_from_buffers.py index 730b4127d3..1744a96463 100644 --- a/src/awkward/operations/ak_from_buffers.py +++ b/src/awkward/operations/ak_from_buffers.py @@ -9,7 +9,9 @@ from awkward._dispatch import high_level_function from awkward._layout import wrap_layout from awkward._nplikes.numpy import Numpy -from awkward._nplikes.numpylike import NumpyMetadata +from awkward._nplikes.numpylike import ArrayLike, NumpyLike, NumpyMetadata +from awkward._nplikes.placeholder import PlaceholderArray +from awkward._nplikes.shape import ShapeItem, unknown_length from awkward._regularize import is_integer from awkward.forms.form import index_to_dtype, regularize_buffer_key @@ -140,11 +142,27 @@ def _impl( return wrap_layout(out, behavior, highlevel) -def _from_buffer(nplike, 
buffer, dtype, count, byteorder): - if nplike.is_own_array(buffer): +def _from_buffer( + nplike: NumpyLike, buffer, dtype: np.dtype, count: ShapeItem, byteorder: str +) -> ArrayLike: + # Unknown-length information implies that we didn't load shape-buffers (offsets, etc) + # for the parent of this node. Thus, this node and its children *must* only + # contain placeholders + if count is unknown_length: + if not isinstance(buffer, PlaceholderArray): + raise AssertionError("Encountered unknown length for concrete buffer") + return PlaceholderArray(nplike, (unknown_length,), dtype) + # Known-length information implies that we should have known-length buffers here + # Therefore, placeholders without shape information are not permitted + elif isinstance(buffer, PlaceholderArray) or nplike.is_own_array(buffer): + # Require 1D buffers array = nplike.reshape(buffer.view(dtype), shape=(-1,), copy=False) - # Require 1D + # Raise if the buffer we encountered isn't definitely-sized + if array.size is unknown_length: + raise AssertionError( + "Encountered unknown length for placeholder in context where length should be known" + ) if array.size < count: raise TypeError( f"size of array ({array.size}) is less than size of form ({count})" @@ -168,9 +186,7 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) elif isinstance(form, ak.forms.NumpyForm): dtype = ak.types.numpytype.primitive_to_dtype(form.primitive) raw_array = container[getkey(form, "data")] - real_length = length - for x in form.inner_shape: - real_length *= x + real_length = length * math.prod(form.inner_shape) data = _from_buffer( backend.nplike, raw_array, @@ -179,10 +195,8 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) byteorder=byteorder, ) if form.inner_shape != (): - if len(data) == 0: - data = backend.nplike.reshape(data, (length, *form.inner_shape)) - else: - data = backend.nplike.reshape(data, (-1, *form.inner_shape)) + data = 
backend.nplike.reshape(data, (length, *form.inner_shape)) + return ak.contents.NumpyArray( data, parameters=form._parameters, backend=backend ) @@ -199,12 +213,15 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) elif isinstance(form, ak.forms.BitMaskedForm): raw_array = container[getkey(form, "mask")] - excess_length = int(math.ceil(length / 8.0)) + if length is unknown_length: + next_length = unknown_length + else: + next_length = int(math.ceil(length / 8.0)) mask = _from_buffer( backend.index_nplike, raw_array, dtype=index_to_dtype[form.mask], - count=excess_length, + count=next_length, byteorder=byteorder, ) content = _reconstitute( @@ -255,9 +272,12 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) count=length, byteorder=byteorder, ) - next_length = ( - 0 if len(index) == 0 else max(0, backend.index_nplike.max(index) + 1) - ) + if isinstance(index, PlaceholderArray): + next_length = unknown_length + else: + next_length = ( + 0 if len(index) == 0 else max(0, backend.index_nplike.max(index) + 1) + ) content = _reconstitute( form.content, next_length, container, getkey, backend, byteorder, simplify ) @@ -280,13 +300,16 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) count=length, byteorder=byteorder, ) - next_length = ( - 0 - if len(index) == 0 - else backend.index_nplike.index_as_shape_item( - backend.index_nplike.max(index) + 1 + if isinstance(index, PlaceholderArray): + next_length = unknown_length + else: + next_length = ( + 0 + if len(index) == 0 + else backend.index_nplike.index_as_shape_item( + backend.index_nplike.max(index) + 1 + ) ) - ) content = _reconstitute( form.content, next_length, container, getkey, backend, byteorder, simplify ) @@ -317,8 +340,13 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) count=length, byteorder=byteorder, ) - reduced_stops = stops[starts != stops] - next_length = 0 if len(starts) == 0 
else backend.index_nplike.max(reduced_stops) + if isinstance(stops, PlaceholderArray): + next_length = unknown_length + else: + reduced_stops = stops[starts != stops] + next_length = ( + 0 if len(starts) == 0 else backend.index_nplike.max(reduced_stops) + ) content = _reconstitute( form.content, next_length, container, getkey, backend, byteorder, simplify ) @@ -338,7 +366,11 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) count=length + 1, byteorder=byteorder, ) - next_length = 0 if len(offsets) == 1 else offsets[-1] + + if isinstance(offsets, PlaceholderArray): + next_length = unknown_length + else: + next_length = 0 if len(offsets) == 1 else offsets[-1] content = _reconstitute( form.content, next_length, container, getkey, backend, byteorder, simplify ) @@ -391,13 +423,16 @@ def _reconstitute(form, length, container, getkey, backend, byteorder, simplify) count=length, byteorder=byteorder, ) - lengths = [] - for tag in range(len(form.contents)): - selected_index = index[tags == tag] - if len(selected_index) == 0: - lengths.append(0) - else: - lengths.append(backend.index_nplike.max(selected_index) + 1) + if isinstance(index, PlaceholderArray) or isinstance(tags, PlaceholderArray): + lengths = [unknown_length] * len(form.contents) + else: + lengths = [] + for tag in range(len(form.contents)): + selected_index = index[tags == tag] + if len(selected_index) == 0: + lengths.append(0) + else: + lengths.append(backend.index_nplike.max(selected_index) + 1) contents = [ _reconstitute( content, lengths[i], container, getkey, backend, byteorder, simplify diff --git a/tests/test_2714_from_buffers_placeholders.py b/tests/test_2714_from_buffers_placeholders.py new file mode 100644 index 0000000000..1c72ce047a --- /dev/null +++ b/tests/test_2714_from_buffers_placeholders.py @@ -0,0 +1,450 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + + +import numpy as np +import pytest + +import awkward as ak 
+from awkward._nplikes.numpy import Numpy +from awkward._nplikes.placeholder import PlaceholderArray +from awkward._nplikes.shape import unknown_length + +numpy = Numpy.instance() + + +def test_numpyarray(): + layout = ak.from_buffers( + {"class": "NumpyArray", "primitive": "int64", "form_key": "node0"}, + 10, + {"node0-data": PlaceholderArray(numpy, (10,), np.int64)}, + highlevel=False, + ) + assert layout.length == 10 + + # Content too small + with pytest.raises(TypeError, match=r"is less than size of form"): + ak.from_buffers( + {"class": "NumpyArray", "primitive": "int64", "form_key": "node0"}, + 10, + {"node0-data": PlaceholderArray(numpy, (9,), np.int64)}, + highlevel=False, + ) + + # Unknown length content at top-level + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + {"class": "NumpyArray", "primitive": "int64", "form_key": "node0"}, + 10, + {"node0-data": PlaceholderArray(numpy, (unknown_length,), np.int64)}, + highlevel=False, + ) + + +def test_listoffsetarray_numpyarray(): + # Unknown data + layout = ak.from_buffers( + { + "class": "ListOffsetArray", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "offsets": "i64", + "form_key": "node0", + }, + 2, + { + "node0-offsets": np.array([0, 1, 2], dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (10,), dtype=np.int64), + }, + highlevel=False, + ) + assert layout.length == 2 + assert layout.content.length == 2 + + # Unknown offsets + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + { + "class": "ListOffsetArray", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "offsets": "i64", + "form_key": "node0", + }, + 2, + { + "node0-offsets": PlaceholderArray(numpy, (3,), dtype=np.int64), + "node1-data": np.array( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=np.int64 + ), + }, + highlevel=False, + ) + + # Unknown offsets and unknown 
data + layout = ak.from_buffers( + { + "class": "ListOffsetArray", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "offsets": "i64", + "form_key": "node0", + }, + 2, + { + "node0-offsets": PlaceholderArray(numpy, (3,), dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (10,), dtype=np.int64), + }, + highlevel=False, + ) + assert layout.length == 2 + assert layout.content.length is unknown_length + + +def test_listarray_numpyarray(): + # Unknown data + layout = ak.from_buffers( + { + "class": "ListArray", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "starts": "i64", + "stops": "i64", + "form_key": "node0", + }, + 2, + { + "node0-starts": np.array([0, 1], dtype=np.int64), + "node0-stops": np.array([1, 2], dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (10,), dtype=np.int64), + }, + highlevel=False, + ) + assert layout.length == 2 + assert layout.content.length == 2 + + # Unknown offsets + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + { + "class": "ListArray", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "starts": "i64", + "stops": "i64", + "form_key": "node0", + }, + 2, + { + "node0-starts": PlaceholderArray(numpy, (2,), dtype=np.int64), + "node0-stops": PlaceholderArray(numpy, (2,), dtype=np.int64), + "node1-data": np.array( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=np.int64 + ), + }, + highlevel=False, + ) + + # Unknown offsets and unknown data + layout = ak.from_buffers( + { + "class": "ListArray", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "starts": "i64", + "stops": "i64", + "form_key": "node0", + }, + 2, + { + "node0-starts": PlaceholderArray(numpy, (2,), dtype=np.int64), + "node0-stops": PlaceholderArray(numpy, (2,), dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (10,), dtype=np.int64), + }, + 
highlevel=False, + ) + assert layout.length == 2 + assert layout.content.length is unknown_length + + +def test_indexedoptionarray(): + # Unknown data + layout = ak.from_buffers( + { + "class": "IndexedOptionArray", + "index": "i64", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "form_key": "node0", + }, + 3, + { + "node0-index": np.array([0, 1, 2], dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (3,), np.int64), + }, + highlevel=False, + ) + assert layout.length == 3 + assert layout.content.length == 3 + + # Unknown index + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + { + "class": "IndexedOptionArray", + "index": "i64", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "form_key": "node0", + }, + 3, + { + "node0-index": PlaceholderArray(numpy, (3,), np.int64), + "node1-data": np.array([0, 1, 2, 3, 4, 5], dtype=np.int64), + }, + highlevel=False, + ) + + # Unknown index and unknown data + layout = ak.from_buffers( + { + "class": "IndexedOptionArray", + "index": "i64", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "form_key": "node0", + }, + 3, + { + "node0-index": PlaceholderArray(numpy, (3,), np.int64), + "node1-data": PlaceholderArray(numpy, (6,), np.int64), + }, + highlevel=False, + ) + assert layout.length == 3 + assert layout.content.length is unknown_length + + +def test_indexedarray(): + # Unknown data + layout = ak.from_buffers( + { + "class": "IndexedArray", + "index": "i64", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "form_key": "node0", + }, + 3, + { + "node0-index": np.array([0, 1, 2], dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (3,), np.int64), + }, + highlevel=False, + ) + assert layout.length == 3 + assert layout.content.length == 3 + + # Unknown index + with pytest.raises(AssertionError, match=r"Encountered
unknown length"): + ak.from_buffers( + { + "class": "IndexedArray", + "index": "i64", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "form_key": "node0", + }, + 3, + { + "node0-index": PlaceholderArray(numpy, (3,), np.int64), + "node1-data": np.array([0, 1, 2, 3, 4, 5], dtype=np.int64), + }, + highlevel=False, + ) + + # Unknown index and unknown data + layout = ak.from_buffers( + { + "class": "IndexedArray", + "index": "i64", + "content": { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + "form_key": "node0", + }, + 3, + { + "node0-index": PlaceholderArray(numpy, (3,), np.int64), + "node1-data": PlaceholderArray(numpy, (6,), np.int64), + }, + highlevel=False, + ) + assert layout.length == 3 + assert layout.content.length is unknown_length + + +def test_unionarray(): + # Unknown data + layout = ak.from_buffers( + { + "class": "UnionArray", + "tags": "i8", + "index": "i64", + "contents": [ + { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + { + "class": "NumpyArray", + "primitive": "datetime64[D]", + "form_key": "node2", + }, + ], + "form_key": "node0", + }, + 3, + { + "node0-tags": np.array([0, 0, 1], dtype=np.int8), + "node0-index": np.array([0, 1, 0], dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (3,), np.int64), + "node2-data": PlaceholderArray(numpy, (6,), np.dtype("datetime64[D]")), + }, + highlevel=False, + ) + assert layout.length == 3 + assert layout.contents[0].length == 2 + assert layout.contents[1].length == 1 + + # Unknown tags + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + { + "class": "UnionArray", + "tags": "i8", + "index": "i64", + "contents": [ + { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + { + "class": "NumpyArray", + "primitive": "datetime64[D]", + "form_key": "node2", + }, + ], + "form_key": "node0", + }, + 3, + { + "node0-tags": PlaceholderArray(numpy, (3,),
np.int8), + "node0-index": np.array([0, 1, 0], dtype=np.int64), + "node1-data": np.array([0, 1, 2], np.int64), + "node2-data": np.array( + [0, 1, 2, 3, 4, 5, 6], np.dtype("datetime64[D]") + ), + }, + highlevel=False, + ) + + # Unknown index + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + { + "class": "UnionArray", + "tags": "i8", + "index": "i64", + "contents": [ + { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + { + "class": "NumpyArray", + "primitive": "datetime64[D]", + "form_key": "node2", + }, + ], + "form_key": "node0", + }, + 3, + { + "node0-tags": np.array([0, 0, 1], dtype=np.int8), + "node0-index": PlaceholderArray(numpy, (3,), np.int64), + "node1-data": np.array([0, 1, 2], np.int64), + "node2-data": np.array( + [0, 1, 2, 3, 4, 5, 6], np.dtype("datetime64[D]") + ), + }, + highlevel=False, + ) + + # Unknown content length + with pytest.raises(AssertionError, match=r"Encountered unknown length"): + ak.from_buffers( + { + "class": "UnionArray", + "tags": "i8", + "index": "i64", + "contents": [ + { + "class": "NumpyArray", + "primitive": "int64", + "form_key": "node1", + }, + { + "class": "NumpyArray", + "primitive": "datetime64[D]", + "form_key": "node2", + }, + ], + "form_key": "node0", + }, + 3, + { + "node0-tags": np.array([0, 0, 1], dtype=np.int8), + "node0-index": np.array([0, 1, 0], dtype=np.int64), + "node1-data": PlaceholderArray(numpy, (unknown_length,), np.int64), + "node2-data": PlaceholderArray(numpy, (6,), np.dtype("datetime64[D]")), + }, + highlevel=False, + )