-
Notifications
You must be signed in to change notification settings - Fork 890
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
219 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,3 +10,4 @@ nvtext | |
minhash | ||
ngrams_tokenize | ||
normalize | ||
replace |
6 changes: 6 additions & 0 deletions
6
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
======= | ||
replace | ||
======= | ||
|
||
.. automodule:: pylibcudf.nvtext.replace | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.scalar cimport Scalar | ||
|
||
|
||
# Declaration of the token-replacement entry point implemented in replace.pyx.
# `delimiter=*` is Cython .pxd syntax meaning "has a default value supplied in
# the implementation file" (there the default is None).
cpdef Column replace_tokens(
    Column input,
    Column targets,
    Column replacements,
    Scalar delimiter=*,
)
|
||
# Declaration of the token-filtering entry point implemented in replace.pyx.
# Both `=*` defaults are provided in the implementation file (None there).
cpdef Column filter_tokens(
    Column input,
    size_type min_token_length,
    Scalar replacement=*,
    Scalar delimiter=*
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from cython.operator cimport dereference | ||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.nvtext.replace cimport ( | ||
filter_tokens as cpp_filter_tokens, | ||
replace_tokens as cpp_replace_tokens, | ||
) | ||
from pylibcudf.libcudf.scalar.scalar cimport string_scalar | ||
from pylibcudf.libcudf.scalar.scalar_factories cimport ( | ||
make_string_scalar as cpp_make_string_scalar, | ||
) | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.scalar cimport Scalar | ||
|
||
|
||
cpdef Column replace_tokens(
    Column input,
    Column targets,
    Column replacements,
    Scalar delimiter=None,
):
    """
    Replaces specified tokens with corresponding replacement strings.

    For details, see :cpp:func:`replace_tokens`

    Parameters
    ----------
    input : Column
        Strings column to replace
    targets : Column
        Strings to compare against tokens found in ``input``
    replacements : Column
        Replacement strings for each string in ``targets``
    delimiter : Scalar, optional
        Characters used to separate each string into tokens.
        The default of empty string will identify tokens using whitespace.

    Returns
    -------
    Column
        New strings column of replaced strings
    """
    cdef unique_ptr[column] c_replaced

    # An empty-string delimiter tells libcudf to tokenize on whitespace.
    if delimiter is None:
        delimiter = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )

    with nogil:
        c_replaced = cpp_replace_tokens(
            input.view(),
            targets.view(),
            replacements.view(),
            dereference(<const string_scalar*>delimiter.get()),
        )
    return Column.from_libcudf(move(c_replaced))
|
||
|
||
cpdef Column filter_tokens(
    Column input,
    size_type min_token_length,
    Scalar replacement=None,
    Scalar delimiter=None
):
    """
    Removes tokens whose lengths are less than a specified number of characters.

    For details, see :cpp:func:`filter_tokens`

    Parameters
    ----------
    input : Column
        Strings column to replace
    min_token_length : size_type
        The minimum number of characters to retain a
        token in the output string
    replacement : Scalar, optional
        Optional replacement string to be used in place of removed tokens
    delimiter : Scalar, optional
        Characters used to separate each string into tokens.
        The default of empty string will identify tokens using whitespace.

    Returns
    -------
    Column
        New strings column of filtered strings
    """
    cdef unique_ptr[column] c_filtered

    # Empty-string defaults: whitespace tokenization and removal (rather than
    # substitution) of too-short tokens.
    if delimiter is None:
        delimiter = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )
    if replacement is None:
        replacement = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )

    with nogil:
        c_filtered = cpp_filter_tokens(
            input.view(),
            min_token_length,
            dereference(<const string_scalar*>replacement.get()),
            dereference(<const string_scalar*>delimiter.get()),
        )

    return Column.from_libcudf(move(c_filtered))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pylibcudf as plc | ||
import pytest | ||
from utils import assert_column_eq | ||
|
||
|
||
@pytest.fixture(scope="module")
def input_col():
    # Module-scoped strings column shared by the replace/filter tests;
    # "jumps*over the" exercises the non-whitespace '*' delimiter cases.
    return pa.array(
        ["the quick", "brown fox", "jumps*over the", "lazy dog"]
    )
|
||
|
||
# Parametrize both the explicit '*' delimiter and None (whitespace default).
# Previously only "*" was parametrized, so the `not delim` expectations below
# were dead code; this also matches test_filter_tokens' parametrization.
@pytest.mark.parametrize("delim", ["*", None])
def test_replace_tokens(input_col, delim):
    """replace_tokens swaps whole tokens that exactly match a target."""
    # Parallel columns: targets[i] is replaced by replacements[i].
    targets = pa.array(["quick", "fox", "jumps", "dog"])
    replacements = pa.array(["slow", "cat", "looked", "rat"])
    result = plc.nvtext.replace.replace_tokens(
        plc.interop.from_arrow(input_col),
        plc.interop.from_arrow(targets),
        plc.interop.from_arrow(replacements),
        plc.interop.from_arrow(pa.scalar(delim)) if delim else None,
    )
    # With '*' as delimiter, only "jumps" (in "jumps*over the") is a complete
    # token matching a target; whitespace-delimited words are left alone.
    expected = pa.array(
        ["the quick", "brown fox", "looked*over the", "lazy dog"]
    )
    if not delim:
        # Whitespace tokenization: "quick", "fox", "dog" match targets;
        # "jumps*over" is a single token and matches nothing.
        expected = pa.array(
            ["the slow", "brown cat", "jumps*over the", "lazy rat"]
        )
    assert_column_eq(result, expected)
|
||
|
||
@pytest.mark.parametrize("min_token_length", [4, 5])
@pytest.mark.parametrize("replace", ["---", None])
@pytest.mark.parametrize("delim", ["*", None])
def test_filter_tokens(input_col, min_token_length, replace, delim):
    """filter_tokens drops (or substitutes) tokens shorter than the minimum."""
    result = plc.nvtext.replace.filter_tokens(
        plc.interop.from_arrow(input_col),
        min_token_length,
        plc.interop.from_arrow(pa.scalar(replace)) if replace else None,
        plc.interop.from_arrow(pa.scalar(delim)) if delim else None,
    )
    if delim:
        # '*' delimiter: every token is at least 5 characters long, so the
        # input passes through unchanged for both minimum lengths.
        rows = ["the quick", "brown fox", "jumps*over the", "lazy dog"]
    elif not replace:
        # Whitespace tokens, short tokens removed outright ("the", "fox",
        # "lazy", "dog"); "dog" survives only when the minimum is 4.
        rows = (
            [" quick", "brown ", "jumps*over ", "lazy "]
            if min_token_length == 4
            else [" quick", "brown ", "jumps*over ", " "]
        )
    else:
        # Whitespace tokens, short tokens replaced with "---".
        rows = (
            ["--- quick", "brown ---", "jumps*over ---", "lazy ---"]
            if min_token_length == 4
            else ["--- quick", "brown ---", "jumps*over ---", "--- ---"]
        )
    assert_column_eq(result, pa.array(rows))