From 1c368f16f27e5f18e9c1d55ff0993e40a67dbd88 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 13 Sep 2024 14:00:01 -0400 Subject: [PATCH] feat: Add to_cudf (#3051) * Start * style: pre-commit fixes * results of chatting * style: pre-commit fixes * Add toplevel func * fix * fix in indexedoptionarray * Add tests and fixes * style: pre-commit fixes * use direct np module (for now) * style: pre-commit fixes * Don't accidentally iterate cupy with CPU * style: pre-commit fixes * Update src/awkward/_errors.py * add docstring * Add string * style: pre-commit fixes * Simpler for numerics This works also for time types, without going via cupy --------- Co-authored-by: Martin Durant Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- awkward-cpp/rapidjson | 2 +- src/awkward/contents/bitmaskedarray.py | 19 ++++++++ src/awkward/contents/bytemaskedarray.py | 13 +++++ src/awkward/contents/content.py | 4 ++ src/awkward/contents/emptyarray.py | 9 ++++ src/awkward/contents/indexedarray.py | 10 ++++ src/awkward/contents/indexedoptionarray.py | 3 ++ src/awkward/contents/listarray.py | 3 ++ src/awkward/contents/listoffsetarray.py | 34 +++++++++++++ src/awkward/contents/numpyarray.py | 15 ++++++ src/awkward/contents/recordarray.py | 17 +++++++ src/awkward/contents/unmaskedarray.py | 3 ++ src/awkward/operations/__init__.py | 1 + src/awkward/operations/ak_to_cudf.py | 21 ++++++++ tests-cuda/test_3051_to_cuda.py | 57 ++++++++++++++++++++++ 15 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 src/awkward/operations/ak_to_cudf.py create mode 100644 tests-cuda/test_3051_to_cuda.py diff --git a/awkward-cpp/rapidjson b/awkward-cpp/rapidjson index 3b2441b87f..f54b0e47a0 160000 --- a/awkward-cpp/rapidjson +++ b/awkward-cpp/rapidjson @@ -1 +1 @@ -Subproject commit 3b2441b87f99ab65f37b141a7b548ebadb607b96 +Subproject commit f54b0e47a08782a6131cc3d60f94d038fa6e0a51 diff --git a/src/awkward/contents/bitmaskedarray.py b/src/awkward/contents/bitmaskedarray.py index 0e12133d6e..9c70bfc4b5 100644 --- a/src/awkward/contents/bitmaskedarray.py +++ b/src/awkward/contents/bitmaskedarray.py @@ -11,6 +11,7 @@ from awkward._backends.backend import Backend from awkward._meta.bitmaskedmeta import BitMaskedMeta from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata from awkward._nplikes.placeholder import PlaceholderArray @@ -687,6 +688,24 @@ def _to_arrow( pyarrow, mask_node, validbytes, length, options ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cp = Cupy.instance()._module + + assert mask is None # this class has its own mask + if not self.lsb_order: + m = cp.flip( + cp.packbits(cp.flip(cp.unpackbits(cp.asarray(self._mask.data)))) + ) + else: + m = self._mask.data + + if m.nbytes % 64: + m = cp.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(m) + inner = self._content._to_cudf(cudf, mask=None, length=length) + inner.set_base_mask(m) + return inner + def _to_backend_array(self, allow_missing, backend): return self.to_ByteMaskedArray()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/bytemaskedarray.py b/src/awkward/contents/bytemaskedarray.py index 65ad948a16..87beb5f59f 100644 --- a/src/awkward/contents/bytemaskedarray.py +++ b/src/awkward/contents/bytemaskedarray.py @@ -12,6 +12,7 @@ from awkward._layout import maybe_posaxis from awkward._meta.bytemaskedmeta import ByteMaskedMeta from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata from awkward._nplikes.placeholder import PlaceholderArray @@ -1051,6 +1052,18 @@ def _to_arrow( options, ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cp = Cupy.instance()._module + + assert mask is None # this class has its own mask + m = cp.packbits(cp.asarray(self._mask), bitorder="little") + if m.nbytes % 64: + m = cp.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(m) + inner = self._content._to_cudf(cudf, mask=None, length=length) + inner.set_base_mask(m) + return inner + def _to_backend_array(self, allow_missing, backend): return self.to_IndexedOptionArray64()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/content.py b/src/awkward/contents/content.py index 1a0fe080a9..d0169ee2eb 100644 --- a/src/awkward/contents/content.py +++ b/src/awkward/contents/content.py @@ -1010,6 +1010,10 @@ def _to_arrow( ): raise NotImplementedError + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + # prototype abstract signature + raise NotImplementedError + def to_backend_array( self, allow_missing: bool = True, *, backend: Backend | str | None = None ): diff --git a/src/awkward/contents/emptyarray.py b/src/awkward/contents/emptyarray.py index 112effddf0..06447f2d8b 100644 --- a/src/awkward/contents/emptyarray.py +++ b/src/awkward/contents/emptyarray.py @@ -387,6 +387,15 @@ def _to_arrow( ) return next._to_arrow(pyarrow, mask_node, validbytes, length, options) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + dtype = np.dtype("float64") + next = ak.contents.NumpyArray( + numpy.empty(length, dtype=dtype), + parameters=self._parameters, + backend=self._backend, + ) + return next._to_cudf(cudf, None, 0) + @classmethod def _arrow_needs_option_type(cls): return True # This overrides Content._arrow_needs_option_type diff --git a/src/awkward/contents/indexedarray.py b/src/awkward/contents/indexedarray.py index 6fb4ea3c69..6421f51742 100644 --- a/src/awkward/contents/indexedarray.py +++ b/src/awkward/contents/indexedarray.py @@ -1049,6 +1049,16 @@ def _to_arrow( ) return next2._to_arrow(pyarrow, mask_node, validbytes, length, options) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + if self._content.length == 0: + # IndexedOptionArray._to_arrow replaces -1 in the index with 0. So behind + # every masked value is self._content[0], unless self._content.length == 0. + # In that case, don't call self._content[index]; it's empty anyway. + next = self._content + else: + next = self._content._carry(self._index, False) + return next._to_cudf(cudf, None, len(next)) + def _to_backend_array(self, allow_missing, backend): return self.project()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/indexedoptionarray.py b/src/awkward/contents/indexedoptionarray.py index 0e68461dc5..2162fb72c4 100644 --- a/src/awkward/contents/indexedoptionarray.py +++ b/src/awkward/contents/indexedoptionarray.py @@ -1576,6 +1576,9 @@ def _to_arrow( options, ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + return self.to_ByteMaskedArray(True)._to_cudf(cudf, mask, length) + def _to_backend_array(self, allow_missing, backend): nplike = backend.nplike index_nplike = backend.index_nplike diff --git a/src/awkward/contents/listarray.py b/src/awkward/contents/listarray.py index 722b9044dd..a05eeaea55 100644 --- a/src/awkward/contents/listarray.py +++ b/src/awkward/contents/listarray.py @@ -1498,6 +1498,9 @@ def _to_arrow( pyarrow, mask_node, validbytes, length, options ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + return self.to_ListOffsetArray64(False)._to_cudf(cudf, mask, length) + def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") if array_param in {"bytestring", "string"}: diff --git a/src/awkward/contents/listoffsetarray.py b/src/awkward/contents/listoffsetarray.py index 4aa149b69d..003467c24b 100644 --- a/src/awkward/contents/listoffsetarray.py +++ b/src/awkward/contents/listoffsetarray.py @@ -10,6 +10,7 @@ from awkward._layout import maybe_posaxis from awkward._meta.listoffsetmeta import ListOffsetMeta from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata from awkward._nplikes.placeholder import PlaceholderArray @@ -1999,6 +2000,39 @@ def _to_arrow( ), ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cupy = Cupy.instance() + index = self._offsets.raw(cupy).astype("int32") + buf = cudf.core.buffer.as_buffer(index) + ind_buf = cudf.core.column.numerical.NumericalColumn( + buf, index.dtype, None, size=len(index) + ) + cont = self._content._to_cudf(cudf, None, len(self._content)) + if mask is not None: + m = np._module.packbits(mask, bitorder="little") + if m.nbytes % 64: + m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(cupy.asarray(m)) + else: + m = None + if self.parameters.get("__array__") == "string": + from cudf.core.column.string import StringColumn + + data = cudf.core.buffer.as_buffer(cupy.asarray(self._content.data)) + # docs for StringColumn says there should be two children instead of a data= + return StringColumn( + data=data, + children=(ind_buf,), + mask=m, + ) + + return cudf.core.column.lists.ListColumn( + length, + mask=m, + children=(ind_buf, cont), + dtype=cudf.core.dtypes.ListDtype(cont.dtype), + ) + def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") if array_param == "string": diff --git a/src/awkward/contents/numpyarray.py b/src/awkward/contents/numpyarray.py index 11a73bb124..315d9383b7 100644 --- a/src/awkward/contents/numpyarray.py +++ b/src/awkward/contents/numpyarray.py @@ -14,6 +14,7 @@ from awkward._meta.numpymeta import NumpyMeta from awkward._nplikes import to_nplike from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.jax import Jax from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata @@ -1220,6 +1221,20 @@ def _to_arrow( ), ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cupy = Cupy.instance() + from cudf.core.column.column import as_column + + assert self._backend.nplike.known_data + data = as_column(self._data) + if mask is not None: + m = cupy.packbits(cupy.asarray(mask), bitorder="little") + if m.nbytes % 64: + m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(m) + data.set_base_data(m) + return data + def _to_backend_array(self, allow_missing, backend): return to_nplike(self.data, backend.nplike, from_nplike=self._backend.nplike) diff --git a/src/awkward/contents/recordarray.py b/src/awkward/contents/recordarray.py index c091d45365..4aafcfd6b2 100644 --- a/src/awkward/contents/recordarray.py +++ b/src/awkward/contents/recordarray.py @@ -1101,6 +1101,23 @@ def _to_arrow( children=values, ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + children = tuple( + c._to_cudf(cudf, mask=None, length=length) for c in self.contents + ) + dt = cudf.core.dtypes.StructDtype( + {field: c.dtype for field, c in zip(self.fields, children)} + ) + m = mask._to_cudf(cudf, None, length) if mask else None + return cudf.core.column.struct.StructColumn( + data=None, + children=children, + dtype=dt, + mask=m, + size=length, + offset=0, + ) + def _to_backend_array(self, allow_missing, backend): if self.fields is None: return backend.nplike.empty(self.length, dtype=[]) diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py index cbc726b310..0dd500ebc1 100644 --- a/src/awkward/contents/unmaskedarray.py +++ b/src/awkward/contents/unmaskedarray.py @@ -498,6 +498,9 @@ def _to_arrow( ): return self._content._to_arrow(pyarrow, self, None, length, options) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + return self._content._to_cudf(cudf, mask, length) + def _to_backend_array(self, allow_missing, backend): content = self.content._to_backend_array(allow_missing, backend) if allow_missing: diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py index 6d4a84c565..e9b1a3818b 100644 --- a/src/awkward/operations/__init__.py +++ b/src/awkward/operations/__init__.py @@ -86,6 +86,7 @@ from awkward.operations.ak_to_arrow_table import * from awkward.operations.ak_to_backend import * from awkward.operations.ak_to_buffers import * +from awkward.operations.ak_to_cudf import * from awkward.operations.ak_to_cupy import * from awkward.operations.ak_to_dataframe import * from awkward.operations.ak_to_feather import * diff --git a/src/awkward/operations/ak_to_cudf.py b/src/awkward/operations/ak_to_cudf.py new file mode 100644 index 0000000000..e45fe041a2 --- /dev/null +++ b/src/awkward/operations/ak_to_cudf.py @@ -0,0 +1,21 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE +from __future__ import annotations + +import awkward as ak +from awkward._dispatch import high_level_function + +__all__ = ("to_cudf",) + + +@high_level_function() +def to_cudf( + array: ak.Array, +): + """Create a cuDF.Series out of the given ak array + + Buffers that are not already in GPU memory will be transferred, and some + structural reformatting may happen to account for differences in architecture. + """ + import cudf + + return cudf.Series(array.layout._to_cudf(cudf, None, len(array))) diff --git a/tests-cuda/test_3051_to_cuda.py b/tests-cuda/test_3051_to_cuda.py new file mode 100644 index 0000000000..af02ed798f --- /dev/null +++ b/tests-cuda/test_3051_to_cuda.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import pytest + +import awkward as ak + +cudf = pytest.importorskip("cudf") +cupy = pytest.importorskip("cupy") + + +def test_jagged(): + arr = ak.Array([[[1, 2, 3], [], [3, 4]], []]) + out = ak.to_cudf(arr) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [[[1, 2, 3], [], [3, 4]], []] + + +def test_nested(): + arr = ak.Array( + [{"a": 0, "b": 1.0, "c": {"d": 0}}, {"a": 1, "b": 0.0, "c": {"d": 1}}] + ) + out = ak.to_cudf(arr) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [ + {"a": 0, "b": 1.0, "c": {"d": 0}}, + {"a": 1, "b": 0.0, "c": {"d": 1}}, + ] + + +def test_null(): + arr = ak.Array([12, None, 21, 12]) + # calls ByteMaskedArray._to_cudf not NumpyArray + out = ak.to_cudf(arr) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [12, None, 21, 12] + + # True is valid, LSB order + arr2 = ak.Array(arr.layout.to_BitMaskedArray(True, True)) + out = ak.to_cudf(arr2) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [12, None, 21, 12] + + # reversed LSB (should be rare, involves extra work!) + arr3 = ak.Array(arr.layout.to_BitMaskedArray(True, False)) + out = ak.to_cudf(arr3) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [12, None, 21, 12] + + +def test_strings(): + arr = ak.Array(["hey", "hi", "hum"]) + out = ak.to_cudf(arr) + assert out.to_arrow().tolist() == ["hey", "hi", "hum"] + + arr = ak.Array(["hey", "hi", None, "hum"]) + out = ak.to_cudf(arr) + assert out.to_arrow().tolist() == ["hey", "hi", None, "hum"]