Migrate NVText Tokenizing APIs to pylibcudf (#17100)
Part of #15162.

Authors:
- Matthew Murray (https://github.com/Matt711)

Approvers:
- Yunsong Wang (https://github.com/PointKernel)
- Muhammad Haseeb (https://github.com/mhaseeb123)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: #17100
Showing 11 changed files with 476 additions and 122 deletions.
@@ -12,3 +12,4 @@ nvtext
     normalize
     replace
     stemmer
+    tokenize
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst: 6 changes (6 additions, 0 deletions)
@@ -0,0 +1,6 @@
+========
+tokenize
+========
+
+.. automodule:: pylibcudf.nvtext.tokenize
+   :members:
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class TokenizeVocabulary:
+    cdef unique_ptr[tokenize_vocabulary] c_obj
+
+cpdef Column tokenize_scalar(Column input, Scalar delimiter=*)
+
+cpdef Column tokenize_column(Column input, Column delimiters)
+
+cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*)
+
+cpdef Column count_tokens_column(Column input, Column delimiters)
+
+cpdef Column character_tokenize(Column input)
+
+cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)
+
+cpdef TokenizeVocabulary load_vocabulary(Column input)
+
+cpdef Column tokenize_with_vocabulary(
+    Column input,
+    TokenizeVocabulary vocabulary,
+    Scalar delimiter,
+    size_type default_id=*
+)