From f3cbbe6b16ce56f48c616a354814897f603d44a2 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 11 Jun 2024 14:15:40 -0700 Subject: [PATCH 1/3] Initial commit --- .../_lib/pylibcudf/libcudf/lists/contains.pxd | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 721679f35c7..8a1c3b2523f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from cudf._lib.exception_handler cimport cudf_exception_handler @@ -12,11 +13,25 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: + + cpdef enum class duplicate_find_option(int32_t): + FIND_FIRST + FIND_LAST + cdef unique_ptr[column] contains( lists_column_view lists, scalar search_key, ) except +cudf_exception_handler + cdef unique_ptr[column] contains( + lists_column_view lists, + column_view search_keys, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains_nulls( + lists_column_view lists, + ) except +cudf_exception_handler + cdef unique_ptr[column] index_of( lists_column_view lists, scalar search_key, From afb4061b7b1dc6e14e925b1e2cac10f14f1cb4d6 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 12 Jun 2024 14:09:28 -0700 Subject: [PATCH 2/3] Migrate contains --- python/cudf/cudf/_lib/lists.pyx | 22 ++----- .../_lib/pylibcudf/libcudf/lists/contains.pxd | 14 ++-- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 14 ++++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 65 +++++++++++++++++-- .../cudf/cudf/pylibcudf_tests/test_lists.py | 4 ++ 5 files changed, 90 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 5d406f5c85f..0b63d2e5573 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,7 +10,6 @@ from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( - contains, index_of as cpp_index_of, ) from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( @@ -154,23 +153,12 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.contains( + col.to_pylibcudf(mode="read"), + py_search_key, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(contains( - list_view.get()[0], - search_key_value[0], - )) - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 8a1c3b2523f..ce21e4bf34d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -15,17 +15,17 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: cpdef enum class duplicate_find_option(int32_t): - FIND_FIRST - FIND_LAST + FIND_FIRST "cudf::lists::duplicate_find_option::FIND_FIRST" + FIND_LAST "cudf::lists::duplicate_find_option::FIND_LAST" cdef unique_ptr[column] contains( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, ) except +cudf_exception_handler cdef unique_ptr[column] contains( - lists_column_view lists, - column_view search_keys, + const lists_column_view& lists, + const column_view& search_keys, ) except +cudf_exception_handler cdef unique_ptr[column] contains_nulls( @@ -35,9 +35,11 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] index_of( lists_column_view lists, scalar search_key, + # duplicate_find_option find_option, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( lists_column_view lists, column_view search_keys, + # duplicate_find_option find_option, ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2d2a5b2a9ea..53eedd4d405 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -5,11 +5,25 @@ from libcpp cimport bool from cudf._lib.pylibcudf.libcudf.types cimport size_type from .column cimport Column +from .scalar cimport Scalar from .table cimport Table +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) cpdef Column concatenate_list_elements(Column, bool dropna) + +cpdef Column contains(Column, ColumnOrScalar) + +# cpdef Column contains_nulls(Column) + +# ctypedef Column index_of(Column, ColumnOrScalar) + +# from cudf._lib.pylibcudf.libcudf.binaryop import \ +# binary_operator as BinaryOperator # no-cython-lint +# from cudf._lib.pylibcudf.libcudf.lists.contains cimport duplicate_find_option diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 069c9da31c2..011ac91be49 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -1,19 +1,31 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_shared, shared_ptr, unique_ptr from libcpp.utility cimport move from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode +from cudf._lib.pylibcudf.libcudf.lists cimport ( + contains as cpp_contains, + explode as cpp_explode, +) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.lists.contains import \ + duplicate_find_option as DuplicateFindOption # no-cython-lint + +from cudf._lib.scalar cimport DeviceScalar + from .column cimport Column from .table cimport Table @@ -71,15 +83,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column + dropna : bool + If true, null list elements will be ignored + from concatenation. Otherwise any input null values will result in + the corresponding output row being set to null. Returns ------- Column A new Column of concatenated list elements - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. """ cdef concatenate_null_policy null_policy = ( concatenate_null_policy.IGNORE if dropna @@ -94,3 +106,44 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): )) return Column.from_libcudf(move(c_result)) + +cpdef Column contains(Column input, ColumnOrScalar search_key): + """Create a column of bool values based upon the search key. + + ``search_key`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + + For details, see :cpp:func:`contains`. + + Parameters + ---------- + input : Column + The input column. + search_key : Union[Column, Scalar] + The search key. + + Returns + ------- + Column + A new Column of bools + """ + cdef unique_ptr[column] c_result + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](input.view()) + ) + if ColumnOrScalar is Column: + with nogil: + c_result = move(cpp_contains.contains( + list_view.get()[0], + search_key.view(), + )) + return Column.from_libcudf(move(c_result)) + cdef DeviceScalar key = search_key.device_value + cdef const scalar* key_value = key.get_raw_ptr() + with nogil: + c_result = move(cpp_contains.contains( + list_view.get()[0], + key_value[0], + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index b21af8ea11c..93c513af8ab 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -44,3 +44,7 @@ def test_concatenate_list_elements(test_data, dropna, expected): expect = pa.array(expected) assert_column_eq(expect, res) + + +def test_contains(): + pass From 0a98c7a4d20630b7955a4519d2e88d733e72dab8 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 12 Jun 2024 17:16:48 -0700 Subject: [PATCH 3/3] Add tests for contains (scalar and column_view) --- python/cudf/cudf/_lib/lists.pyx | 4 +-- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 6 ++-- python/cudf/cudf/_lib/pylibcudf/lists.pyx | 35 +++++++++---------- .../cudf/cudf/pylibcudf_tests/test_lists.py | 29 +++++++++++++-- 4 files changed, 48 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0b63d2e5573..1bee04d1d50 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -152,11 +152,11 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() -def contains_scalar(Column col, object py_search_key): +def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( pylibcudf.lists.contains( col.to_pylibcudf(mode="read"), - py_search_key, + py_search_key.device_value, ) ) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 53eedd4d405..c3fdfb9d0cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -3,6 +3,7 @@ from libcpp cimport bool from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.scalar cimport DeviceScalar from .column cimport Column from .scalar cimport Scalar @@ -11,6 +12,7 @@ from .table cimport Table ctypedef fused ColumnOrScalar: Column Scalar + DeviceScalar cpdef Table explode_outer(Table, size_type explode_column_idx) @@ -23,7 +25,3 @@ cpdef Column contains(Column, ColumnOrScalar) # cpdef Column contains_nulls(Column) # ctypedef Column index_of(Column, ColumnOrScalar) - -# from cudf._lib.pylibcudf.libcudf.binaryop import \ -# binary_operator as BinaryOperator # no-cython-lint -# from cudf._lib.pylibcudf.libcudf.lists.contains cimport duplicate_find_option diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 011ac91be49..2d2f4bb9b07 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -20,10 +20,6 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type - -from cudf._lib.pylibcudf.libcudf.lists.contains import \ - duplicate_find_option as DuplicateFindOption # no-cython-lint - from cudf._lib.scalar cimport DeviceScalar from .column cimport Column @@ -107,15 +103,10 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): return Column.from_libcudf(move(c_result)) + cpdef Column contains(Column input, ColumnOrScalar search_key): """Create a column of bool values based upon the search key. - ``search_key`` may be a - :py:class:`~cudf._lib.pylibcudf.column.Column` or a - :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. - - For details, see :cpp:func:`contains`. - Parameters ---------- input : Column @@ -132,18 +123,26 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](input.view()) ) + cdef const scalar* search_key_value = NULL + if ColumnOrScalar is Column: with nogil: c_result = move(cpp_contains.contains( list_view.get()[0], search_key.view(), )) - return Column.from_libcudf(move(c_result)) - cdef DeviceScalar key = search_key.device_value - cdef const scalar* key_value = key.get_raw_ptr() - with nogil: - c_result = move(cpp_contains.contains( - list_view.get()[0], - key_value[0], - )) + elif ColumnOrScalar is DeviceScalar: + search_key_value = search_key.get_raw_ptr() + with nogil: + c_result = move(cpp_contains.contains( + list_view.get()[0], + search_key_value[0], + )) + else: + search_key_value = search_key.get() + with nogil: + c_result = move(cpp_contains.contains( + list_view.get()[0], + search_key_value[0], + )) return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 93c513af8ab..a348e14c150 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -46,5 +46,30 @@ def test_concatenate_list_elements(test_data, dropna, expected): assert_column_eq(expect, res) -def test_contains(): - pass +def test_contains_scalar(): + list_column = [[1, 2], [1, 3, 4], [5, 6]] + arr = pa.array(list_column) + scalar = pa.scalar(1) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.contains(plc_column, plc_scalar) + + expect = pa.array([True, True, False]) + + assert_column_eq(expect, res) + + +def test_contains_list_column(): + list_column1 = [[1, 2], [1, 3, 4], [5, 6]] + list_column2 = [1, 3, 6] + arr1 = pa.array(list_column1) + arr2 = pa.array(list_column2) + + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = plc.lists.contains(plc_column1, plc_column2) + + expect = pa.array([True, True, True]) + + assert_column_eq(expect, res)