Skip to content

Commit

Permalink
[WIP] Migrate NVText Replacing APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 committed Oct 13, 2024
1 parent 37c9107 commit f0902f7
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 55 deletions.
4 changes: 2 additions & 2 deletions cpp/include/nvtext/replace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext {
* The default of empty string will identify tokens using whitespace.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of with replaced strings
* @return New strings column of replaced strings
*/
std::unique_ptr<cudf::column> replace_tokens(
cudf::strings_column_view const& input,
Expand Down Expand Up @@ -131,7 +131,7 @@ std::unique_ptr<cudf::column> replace_tokens(
* The default of empty string will identify tokens using whitespace.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of with replaced strings
* @return New strings column of filtered strings
*/
std::unique_ptr<cudf::column> filter_tokens(
cudf::strings_column_view const& input,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ nvtext
minhash
ngrams_tokenize
normalize
replace
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
replace
=======

.. automodule:: pylibcudf.nvtext.replace
:members:
66 changes: 15 additions & 51 deletions python/cudf/cudf/_lib/nvtext/replace.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,10 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.replace cimport (
filter_tokens as cpp_filter_tokens,
replace_tokens as cpp_replace_tokens,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from pylibcudf import nvtext


@acquire_spill_lock()
Expand All @@ -30,27 +20,14 @@ def replace_tokens(Column strings,
provided.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef column_view c_targets = targets.view()
cdef column_view c_replacements = replacements.view()

cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_replace_tokens(
c_strings,
c_targets,
c_replacements,
c_delimiter[0],
)
return Column.from_pylibcudf(
nvtext.replace.replace_tokens(
strings.to_pylibcudf(mode="read"),
targets.to_pylibcudf(mode="read"),
replacements.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
)

return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
Expand All @@ -65,24 +42,11 @@ def filter_tokens(Column strings,
character provided.
"""

cdef DeviceScalar replacement = py_replacement.device_value
cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef const string_scalar* c_repl = <const string_scalar*>replacement\
.get_raw_ptr()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_filter_tokens(
c_strings,
min_token_length,
c_repl[0],
c_delimiter[0],
)
return Column.from_pylibcudf(
nvtext.replace.filter_tokens(
strings.to_pylibcudf(mode="read"),
min_token_length,
py_replacement.device_value.c_value,
py_delimiter.device_value.c_value,
)

return Column.from_unique_ptr(move(c_result))
)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ from . cimport (
minhash,
ngrams_tokenize,
normalize,
replace,
)

__all__ = [
Expand All @@ -16,4 +17,5 @@ __all__ = [
"minhash",
"ngrams_tokenize",
"normalize",
"replace",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
minhash,
ngrams_tokenize,
normalize,
replace,
)

__all__ = [
Expand All @@ -16,4 +17,5 @@
"minhash",
"ngrams_tokenize",
"normalize",
"replace",
]
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/normalize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,6 @@ cpdef Column normalize_characters(Column input, bool do_lower_case):
cdef unique_ptr[column] c_result

with nogil:
c_result = cpp_normalize_characters(input.view(), do_lower)
c_result = cpp_normalize_characters(input.view(), do_lower_case)

return Column.from_libcudf(move(c_result))
20 changes: 20 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/replace.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


# Replace tokens in each string of ``input`` that match an entry in
# ``targets`` with the corresponding entry in ``replacements``.
# ``delimiter`` defaults to an empty string, which selects whitespace
# tokenization on the libcudf side.
cpdef Column replace_tokens(
    Column input,
    Column targets,
    Column replacements,
    Scalar delimiter=*,
)

# Remove tokens shorter than ``min_token_length`` from each string,
# substituting ``replacement`` (default empty string) in their place;
# ``delimiter`` defaults to whitespace tokenization.
cpdef Column filter_tokens(
    Column input,
    size_type min_token_length,
    Scalar replacement=*,
    Scalar delimiter=*
)
109 changes: 109 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/replace.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.nvtext.replace cimport (
filter_tokens as cpp_filter_tokens,
replace_tokens as cpp_replace_tokens,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


cpdef Column replace_tokens(
    Column input,
    Column targets,
    Column replacements,
    Scalar delimiter=None,
):
    """
    Replaces specified tokens with corresponding replacement strings.

    For details, see :cpp:func:`replace_tokens`

    Parameters
    ----------
    input : Column
        Strings column to replace
    targets : Column
        Strings to compare against tokens found in ``input``
    replacements : Column
        Replacement strings for each string in ``targets``
    delimiter : Scalar, optional
        Characters used to separate each string into tokens.
        The default of empty string will identify tokens using whitespace.

    Returns
    -------
    Column
        New strings column of replaced strings
    """
    cdef unique_ptr[column] c_result
    # Default to an empty-string scalar, which tells libcudf to tokenize
    # on whitespace.
    if delimiter is None:
        delimiter = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )
    # Release the GIL while the device-side replacement runs.
    with nogil:
        c_result = cpp_replace_tokens(
            input.view(),
            targets.view(),
            replacements.view(),
            dereference(<const string_scalar*>delimiter.get()),
        )
    return Column.from_libcudf(move(c_result))


cpdef Column filter_tokens(
    Column input,
    size_type min_token_length,
    Scalar replacement=None,
    Scalar delimiter=None
):
    """
    Removes tokens whose lengths are less than a specified number of characters.

    For details, see :cpp:func:`filter_tokens`

    Parameters
    ----------
    input : Column
        Strings column to replace
    min_token_length : size_type
        The minimum number of characters to retain a
        token in the output string
    replacement : Scalar, optional
        Optional replacement string to be used in place of removed tokens
    delimiter : Scalar, optional
        Characters used to separate each string into tokens.
        The default of empty string will identify tokens using whitespace.

    Returns
    -------
    Column
        New strings column of filtered strings
    """
    cdef unique_ptr[column] c_result
    # Default both optional scalars to empty strings: an empty delimiter
    # selects whitespace tokenization, and an empty replacement means
    # filtered tokens are replaced with nothing (i.e. removed).
    if delimiter is None:
        delimiter = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )
    if replacement is None:
        replacement = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )

    # Release the GIL while the device-side filtering runs.
    with nogil:
        c_result = cpp_filter_tokens(
            input.view(),
            min_token_length,
            dereference(<const string_scalar*>replacement.get()),
            dereference(<const string_scalar*>delimiter.get()),
        )

    return Column.from_libcudf(move(c_result))
60 changes: 60 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_col():
    """Module-scoped strings column shared by the replace/filter tests."""
    return pa.array(
        ["the quick", "brown fox", "jumps*over the", "lazy dog"]
    )


@pytest.mark.parametrize("delim", ["*", None])
def test_replace_tokens(input_col, delim):
    """Exercise replace_tokens with an explicit delimiter and the default.

    With delim="*", only "*"-delimited tokens can match a target, so just
    "jumps" (inside "jumps*over the") is replaced. With delim=None the
    default whitespace tokenization applies instead. The None case was
    previously dead code because the parametrization only supplied "*";
    this mirrors the delim parametrization used in test_filter_tokens.
    """
    targets = pa.array(["quick", "fox", "jumps", "dog"])
    replacements = pa.array(["slow", "cat", "looked", "rat"])
    result = plc.nvtext.replace.replace_tokens(
        plc.interop.from_arrow(input_col),
        plc.interop.from_arrow(targets),
        plc.interop.from_arrow(replacements),
        plc.interop.from_arrow(pa.scalar(delim)) if delim else None,
    )
    # Expected result when tokenizing on "*".
    expected = pa.array(
        ["the quick", "brown fox", "looked*over the", "lazy dog"]
    )
    if not delim:
        # Whitespace tokenization: "jumps*over" is a single token and is
        # not a target, while the whitespace-separated targets match.
        expected = pa.array(
            ["the slow", "brown cat", "jumps*over the", "lazy rat"]
        )
    assert_column_eq(result, expected)


@pytest.mark.parametrize("min_token_length", [4, 5])
@pytest.mark.parametrize("replace", ["---", None])
@pytest.mark.parametrize("delim", ["*", None])
def test_filter_tokens(input_col, min_token_length, replace, delim):
    """Check filter_tokens across delimiter/replacement/length combinations."""
    replacement_scalar = (
        plc.interop.from_arrow(pa.scalar(replace)) if replace else None
    )
    delimiter_scalar = (
        plc.interop.from_arrow(pa.scalar(delim)) if delim else None
    )
    result = plc.nvtext.replace.filter_tokens(
        plc.interop.from_arrow(input_col),
        min_token_length,
        replacement_scalar,
        delimiter_scalar,
    )
    if delim:
        # Tokenizing on "*" leaves no token shorter than min_token_length,
        # so the input passes through unchanged.
        expected_values = [
            "the quick",
            "brown fox",
            "jumps*over the",
            "lazy dog",
        ]
    else:
        # Whitespace tokenization: short tokens are removed (replace=None)
        # or substituted with "---", keyed by (replace, min_token_length).
        expected_values = {
            (None, 4): [" quick", "brown ", "jumps*over ", "lazy "],
            (None, 5): [" quick", "brown ", "jumps*over ", " "],
            ("---", 4): [
                "--- quick", "brown ---", "jumps*over ---", "lazy ---"
            ],
            ("---", 5): [
                "--- quick", "brown ---", "jumps*over ---", "--- ---"
            ],
        }[(replace, min_token_length)]
    assert_column_eq(result, pa.array(expected_values))

0 comments on commit f0902f7

Please sign in to comment.