rapidsai · Matt711 · Oct 6, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
@@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext {
  *                  The default of empty string will identify tokens using whitespace.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New strings columns of with replaced strings
+ * @return New strings column of replaced strings
  */
 std::unique_ptr<cudf::column> replace_tokens(
   cudf::strings_column_view const& input,
@@ -131,7 +131,7 @@ std::unique_ptr<cudf::column> replace_tokens(
  *                  The default of empty string will identify tokens using whitespace.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New strings columns of with replaced strings
+ * @return New strings column of filtered strings
  */
 std::unique_ptr<cudf::column> filter_tokens(
   cudf::strings_column_view const& input,

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -8,3 +8,6 @@ nvtext
     generate_ngrams
     jaccard
     minhash
+    ngrams_tokenize
+    normalize
+    replace
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst
@@ -0,0 +1,6 @@
+===============
+ngrams_tokenize
+===============
+
+.. automodule:: pylibcudf.nvtext.ngrams_tokenize
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst
@@ -0,0 +1,6 @@
+=========
+normalize
+=========
+
+.. automodule:: pylibcudf.nvtext.normalize
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst
@@ -0,0 +1,6 @@
+=======
+replace
+=======
+
+.. automodule:: pylibcudf.nvtext.replace
+   :members:
@@ -2,48 +2,22 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
-    ngrams_tokenize as cpp_ngrams_tokenize,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def ngrams_tokenize(
-    Column strings,
+    Column input,
     int ngrams,
     object py_delimiter,
     object py_separator
 ):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-    cdef DeviceScalar separator = py_separator.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef const string_scalar* c_separator = <const string_scalar*>separator\
-        .get_raw_ptr()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_ngrams_tokenize(
-                c_strings,
-                c_ngrams,
-                c_delimiter[0],
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.ngrams_tokenize.ngrams_tokenize(
+        input.to_pylibcudf(mode="read"),
+        ngrams,
+        py_delimiter.device_value.c_value,
+        py_separator.device_value.c_value
+    )
+    return Column.from_pylibcudf(result)
@@ -3,36 +3,24 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.normalize cimport (
-    normalize_characters as cpp_normalize_characters,
-    normalize_spaces as cpp_normalize_spaces,
-)
 
 from cudf._lib.column cimport Column
 
-
-@acquire_spill_lock()
-def normalize_spaces(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_normalize_spaces(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
-def normalize_characters(Column strings, bool do_lower=True):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
+def normalize_spaces(Column input):
+    result = nvtext.normalize.normalize_spaces(
+        input.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(result)
 
-    with nogil:
-        c_result = move(cpp_normalize_characters(c_strings, do_lower))
 
-    return Column.from_unique_ptr(move(c_result))
+@acquire_spill_lock()
+def normalize_characters(Column input, bool do_lower=True):
+    result = nvtext.normalize.normalize_characters(
+        input.to_pylibcudf(mode="read"),
+        do_lower,
+    )
+    return Column.from_pylibcudf(result)
@@ -2,20 +2,10 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.replace cimport (
-    filter_tokens as cpp_filter_tokens,
-    replace_tokens as cpp_replace_tokens,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
@@ -30,27 +20,14 @@ def replace_tokens(Column strings,
     provided.
     """
 
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_targets = targets.view()
-    cdef column_view c_replacements = replacements.view()
-
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_replace_tokens(
-                c_strings,
-                c_targets,
-                c_replacements,
-                c_delimiter[0],
-            )
+    return Column.from_pylibcudf(
+        nvtext.replace.replace_tokens(
+            strings.to_pylibcudf(mode="read"),
+            targets.to_pylibcudf(mode="read"),
+            replacements.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value,
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
@@ -65,24 +42,11 @@ def filter_tokens(Column strings,
     character provided.
     """
 
-    cdef DeviceScalar replacement = py_replacement.device_value
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_repl = <const string_scalar*>replacement\
-        .get_raw_ptr()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_filter_tokens(
-                c_strings,
-                min_token_length,
-                c_repl[0],
-                c_delimiter[0],
-            )
+    return Column.from_pylibcudf(
+        nvtext.replace.filter_tokens(
+            strings.to_pylibcudf(mode="read"),
+            min_token_length,
+            py_replacement.device_value.c_value,
+            py_delimiter.device_value.c_value,
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
@@ -12,7 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
+                   ngrams_tokenize.pyx normalize.pyx replace.pyx
+)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(

@@ -1,10 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance, generate_ngrams, jaccard, minhash
+from . cimport (
+    edit_distance,
+    generate_ngrams,
+    jaccard,
+    minhash,
+    ngrams_tokenize,
+    normalize,
+    replace,
+)
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
-    "minhash"
+    "minhash",
+    "ngrams_tokenize",
+    "normalize",
+    "replace",
 ]
@@ -1,10 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams, jaccard, minhash
+from . import (
+    edit_distance,
+    generate_ngrams,
+    jaccard,
+    minhash,
+    ngrams_tokenize,
+    normalize,
+    replace,
+)
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
     "minhash",
+    "ngrams_tokenize",
+    "normalize",
+    "replace",
 ]
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column ngrams_tokenize(
+    Column input,
+    size_type ngrams,
+    Scalar delimiter,
+    Scalar separator
+)
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
+    ngrams_tokenize as cpp_ngrams_tokenize,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column ngrams_tokenize(
+    Column input,
+    size_type ngrams,
+    Scalar delimiter,
+    Scalar separator
+):
+    """
+    Returns a single column of strings by tokenizing the input strings column
+    and then producing ngrams of each string.
+
+    For details, see :cpp:func:`ngrams_tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        The ngram number to generate
+    delimiter : Scalar
+        UTF-8 characters used to separate each string into tokens.
+        An empty string will separate tokens using whitespace.
+    separator : Scalar
+        The string to use for separating ngram tokens
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_ngrams_tokenize(
+            input.view(),
+            ngrams,
+            dereference(<const string_scalar*>delimiter.get()),
+            dereference(<const string_scalar*>separator.get()),
+        )
+    return Column.from_libcudf(move(c_result))
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from pylibcudf.column cimport Column
+
+
+cpdef Column normalize_spaces(Column input)
+
+cpdef Column normalize_characters(Column input, bool do_lower_case)