feat: Add to_cudf (#3051)

* Start * style: pre-commit fixes * results of chatting * style: pre-commit fixes * Add toplevel func * fix * fix in indexedoptionarray * Add tests and fixes * style: pre-commit fixes * use direct np module (for now) * style: pre-commit fixes * Don't accidentally iterate cupy with CPU * style: pre-commit fixes * Update src/awkward/_errors.py * add docstring * Add string * style: pre-commit fixes * Simpler for numerics This works also for time types, without going via cupy --------- Co-authored-by: Martin Durant <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
scikit-hep · Sep 13, 2024 · 1c368f1 · 1c368f1
1 parent 1420121
commit 1c368f1
Show file tree

Hide file tree

Showing 15 changed files with 210 additions and 1 deletion.
diff --git a/awkward-cpp/rapidjson b/awkward-cpp/rapidjson
diff --git a/src/awkward/contents/bitmaskedarray.py b/src/awkward/contents/bitmaskedarray.py
@@ -11,6 +11,7 @@
 from awkward._backends.backend import Backend
 from awkward._meta.bitmaskedmeta import BitMaskedMeta
 from awkward._nplikes.array_like import ArrayLike
+from awkward._nplikes.cupy import Cupy
 from awkward._nplikes.numpy import Numpy
 from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
 from awkward._nplikes.placeholder import PlaceholderArray
@@ -687,6 +688,24 @@ def _to_arrow(
             pyarrow, mask_node, validbytes, length, options
         )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        cp = Cupy.instance()._module
+
+        assert mask is None  # this class has its own mask
+        if not self.lsb_order:
+            m = cp.flip(
+                cp.packbits(cp.flip(cp.unpackbits(cp.asarray(self._mask.data))))
+            )
+        else:
+            m = self._mask.data
+
+        if m.nbytes % 64:
+            m = cp.resize(m, ((m.nbytes // 64) + 1) * 64)
+        m = cudf.core.buffer.as_buffer(m)
+        inner = self._content._to_cudf(cudf, mask=None, length=length)
+        inner.set_base_mask(m)
+        return inner
+
     def _to_backend_array(self, allow_missing, backend):
         return self.to_ByteMaskedArray()._to_backend_array(allow_missing, backend)
 

diff --git a/src/awkward/contents/bytemaskedarray.py b/src/awkward/contents/bytemaskedarray.py
@@ -12,6 +12,7 @@
 from awkward._layout import maybe_posaxis
 from awkward._meta.bytemaskedmeta import ByteMaskedMeta
 from awkward._nplikes.array_like import ArrayLike
+from awkward._nplikes.cupy import Cupy
 from awkward._nplikes.numpy import Numpy
 from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
 from awkward._nplikes.placeholder import PlaceholderArray
@@ -1051,6 +1052,18 @@ def _to_arrow(
             options,
         )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        cp = Cupy.instance()._module
+
+        assert mask is None  # this class has its own mask
+        m = cp.packbits(cp.asarray(self._mask), bitorder="little")
+        if m.nbytes % 64:
+            m = cp.resize(m, ((m.nbytes // 64) + 1) * 64)
+        m = cudf.core.buffer.as_buffer(m)
+        inner = self._content._to_cudf(cudf, mask=None, length=length)
+        inner.set_base_mask(m)
+        return inner
+
     def _to_backend_array(self, allow_missing, backend):
         return self.to_IndexedOptionArray64()._to_backend_array(allow_missing, backend)
 

diff --git a/src/awkward/contents/content.py b/src/awkward/contents/content.py
@@ -1010,6 +1010,10 @@ def _to_arrow(
     ):
         raise NotImplementedError
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        # prototype abstract signature
+        raise NotImplementedError
+
     def to_backend_array(
         self, allow_missing: bool = True, *, backend: Backend | str | None = None
     ):

diff --git a/src/awkward/contents/emptyarray.py b/src/awkward/contents/emptyarray.py
@@ -387,6 +387,15 @@ def _to_arrow(
             )
             return next._to_arrow(pyarrow, mask_node, validbytes, length, options)
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        dtype = np.dtype("float64")
+        next = ak.contents.NumpyArray(
+            numpy.empty(length, dtype=dtype),
+            parameters=self._parameters,
+            backend=self._backend,
+        )
+        return next._to_cudf(cudf, None, 0)
+
     @classmethod
     def _arrow_needs_option_type(cls):
         return True  # This overrides Content._arrow_needs_option_type

diff --git a/src/awkward/contents/indexedarray.py b/src/awkward/contents/indexedarray.py
@@ -1049,6 +1049,16 @@ def _to_arrow(
             )
             return next2._to_arrow(pyarrow, mask_node, validbytes, length, options)
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        if self._content.length == 0:
+            # IndexedOptionArray._to_arrow replaces -1 in the index with 0. So behind
+            # every masked value is self._content[0], unless self._content.length == 0.
+            # In that case, don't call self._content[index]; it's empty anyway.
+            next = self._content
+        else:
+            next = self._content._carry(self._index, False)
+        return next._to_cudf(cudf, None, len(next))
+
     def _to_backend_array(self, allow_missing, backend):
         return self.project()._to_backend_array(allow_missing, backend)
 

diff --git a/src/awkward/contents/indexedoptionarray.py b/src/awkward/contents/indexedoptionarray.py
@@ -1576,6 +1576,9 @@ def _to_arrow(
             options,
         )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        return self.to_ByteMaskedArray(True)._to_cudf(cudf, mask, length)
+
     def _to_backend_array(self, allow_missing, backend):
         nplike = backend.nplike
         index_nplike = backend.index_nplike

diff --git a/src/awkward/contents/listarray.py b/src/awkward/contents/listarray.py
@@ -1498,6 +1498,9 @@ def _to_arrow(
             pyarrow, mask_node, validbytes, length, options
         )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        return self.to_ListOffsetArray64(False)._to_cudf(cudf, mask, length)
+
     def _to_backend_array(self, allow_missing, backend):
         array_param = self.parameter("__array__")
         if array_param in {"bytestring", "string"}:

diff --git a/src/awkward/contents/listoffsetarray.py b/src/awkward/contents/listoffsetarray.py
@@ -10,6 +10,7 @@
 from awkward._layout import maybe_posaxis
 from awkward._meta.listoffsetmeta import ListOffsetMeta
 from awkward._nplikes.array_like import ArrayLike
+from awkward._nplikes.cupy import Cupy
 from awkward._nplikes.numpy import Numpy
 from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
 from awkward._nplikes.placeholder import PlaceholderArray
@@ -1999,6 +2000,39 @@ def _to_arrow(
                 ),
             )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        cupy = Cupy.instance()
+        index = self._offsets.raw(cupy).astype("int32")
+        buf = cudf.core.buffer.as_buffer(index)
+        ind_buf = cudf.core.column.numerical.NumericalColumn(
+            buf, index.dtype, None, size=len(index)
+        )
+        cont = self._content._to_cudf(cudf, None, len(self._content))
+        if mask is not None:
+            m = np._module.packbits(mask, bitorder="little")
+            if m.nbytes % 64:
+                m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64)
+            m = cudf.core.buffer.as_buffer(cupy.asarray(m))
+        else:
+            m = None
+        if self.parameters.get("__array__") == "string":
+            from cudf.core.column.string import StringColumn
+
+            data = cudf.core.buffer.as_buffer(cupy.asarray(self._content.data))
+            # docs for StringColumn says there should be two children instead of a data=
+            return StringColumn(
+                data=data,
+                children=(ind_buf,),
+                mask=m,
+            )
+
+        return cudf.core.column.lists.ListColumn(
+            length,
+            mask=m,
+            children=(ind_buf, cont),
+            dtype=cudf.core.dtypes.ListDtype(cont.dtype),
+        )
+
     def _to_backend_array(self, allow_missing, backend):
         array_param = self.parameter("__array__")
         if array_param == "string":

diff --git a/src/awkward/contents/numpyarray.py b/src/awkward/contents/numpyarray.py
@@ -14,6 +14,7 @@
 from awkward._meta.numpymeta import NumpyMeta
 from awkward._nplikes import to_nplike
 from awkward._nplikes.array_like import ArrayLike
+from awkward._nplikes.cupy import Cupy
 from awkward._nplikes.jax import Jax
 from awkward._nplikes.numpy import Numpy
 from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
@@ -1220,6 +1221,20 @@ def _to_arrow(
             ),
         )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        cupy = Cupy.instance()
+        from cudf.core.column.column import as_column
+
+        assert self._backend.nplike.known_data
+        data = as_column(self._data)
+        if mask is not None:
+            m = cupy.packbits(cupy.asarray(mask), bitorder="little")
+            if m.nbytes % 64:
+                m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64)
+            m = cudf.core.buffer.as_buffer(m)
+            data.set_base_data(m)
+        return data
+
     def _to_backend_array(self, allow_missing, backend):
         return to_nplike(self.data, backend.nplike, from_nplike=self._backend.nplike)
 

diff --git a/src/awkward/contents/recordarray.py b/src/awkward/contents/recordarray.py
@@ -1101,6 +1101,23 @@ def _to_arrow(
             children=values,
         )
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        children = tuple(
+            c._to_cudf(cudf, mask=None, length=length) for c in self.contents
+        )
+        dt = cudf.core.dtypes.StructDtype(
+            {field: c.dtype for field, c in zip(self.fields, children)}
+        )
+        m = mask._to_cudf(cudf, None, length) if mask else None
+        return cudf.core.column.struct.StructColumn(
+            data=None,
+            children=children,
+            dtype=dt,
+            mask=m,
+            size=length,
+            offset=0,
+        )
+
     def _to_backend_array(self, allow_missing, backend):
         if self.fields is None:
             return backend.nplike.empty(self.length, dtype=[])

diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py
@@ -498,6 +498,9 @@ def _to_arrow(
     ):
         return self._content._to_arrow(pyarrow, self, None, length, options)
 
+    def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
+        return self._content._to_cudf(cudf, mask, length)
+
     def _to_backend_array(self, allow_missing, backend):
         content = self.content._to_backend_array(allow_missing, backend)
         if allow_missing:

diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py
@@ -86,6 +86,7 @@
 from awkward.operations.ak_to_arrow_table import *
 from awkward.operations.ak_to_backend import *
 from awkward.operations.ak_to_buffers import *
+from awkward.operations.ak_to_cudf import *
 from awkward.operations.ak_to_cupy import *
 from awkward.operations.ak_to_dataframe import *
 from awkward.operations.ak_to_feather import *

diff --git a/src/awkward/operations/ak_to_cudf.py b/src/awkward/operations/ak_to_cudf.py
@@ -0,0 +1,21 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+from __future__ import annotations
+
+import awkward as ak
+from awkward._dispatch import high_level_function
+
+__all__ = ("to_cudf",)
+
+
+@high_level_function()
+def to_cudf(
+    array: ak.Array,
+):
+    """Create a cuDF.Series out of the given ak array
+
+    Buffers that are not already in GPU memory will be transferred, and some
+    structural reformatting may happen to account for differences in architecture.
+    """
+    import cudf
+
+    return cudf.Series(array.layout._to_cudf(cudf, None, len(array)))
diff --git a/tests-cuda/test_3051_to_cuda.py b/tests-cuda/test_3051_to_cuda.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import pytest
+
+import awkward as ak
+
+cudf = pytest.importorskip("cudf")
+cupy = pytest.importorskip("cupy")
+
+
+def test_jagged():
+    arr = ak.Array([[[1, 2, 3], [], [3, 4]], []])
+    out = ak.to_cudf(arr)
+    assert isinstance(out, cudf.Series)
+    assert out.to_arrow().tolist() == [[[1, 2, 3], [], [3, 4]], []]
+
+
+def test_nested():
+    arr = ak.Array(
+        [{"a": 0, "b": 1.0, "c": {"d": 0}}, {"a": 1, "b": 0.0, "c": {"d": 1}}]
+    )
+    out = ak.to_cudf(arr)
+    assert isinstance(out, cudf.Series)
+    assert out.to_arrow().tolist() == [
+        {"a": 0, "b": 1.0, "c": {"d": 0}},
+        {"a": 1, "b": 0.0, "c": {"d": 1}},
+    ]
+
+
+def test_null():
+    arr = ak.Array([12, None, 21, 12])
+    # calls ByteMaskedArray._to_cudf not NumpyArray
+    out = ak.to_cudf(arr)
+    assert isinstance(out, cudf.Series)
+    assert out.to_arrow().tolist() == [12, None, 21, 12]
+
+    # True is valid, LSB order
+    arr2 = ak.Array(arr.layout.to_BitMaskedArray(True, True))
+    out = ak.to_cudf(arr2)
+    assert isinstance(out, cudf.Series)
+    assert out.to_arrow().tolist() == [12, None, 21, 12]
+
+    # reversed LSB (should be rare, involves extra work!)
+    arr3 = ak.Array(arr.layout.to_BitMaskedArray(True, False))
+    out = ak.to_cudf(arr3)
+    assert isinstance(out, cudf.Series)
+    assert out.to_arrow().tolist() == [12, None, 21, 12]
+
+
+def test_strings():
+    arr = ak.Array(["hey", "hi", "hum"])
+    out = ak.to_cudf(arr)
+    assert out.to_arrow().tolist() == ["hey", "hi", "hum"]
+
+    arr = ak.Array(["hey", "hi", None, "hum"])
+    out = ak.to_cudf(arr)
+    assert out.to_arrow().tolist() == ["hey", "hi", None, "hum"]