rapidsai · rapids-bot · Oct 24, 2024 · Oct 6, 2024 · Oct 7, 2024 · Oct 7, 2024
@@ -51,7 +51,7 @@ enum class letter_type {
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * b1 = is_letter(st, VOWEL, 1)
  * b1 is now [false, true, true]
  * @endcode
@@ -62,7 +62,7 @@ enum class letter_type {
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string
  * b2 is now [false, true, false]
  * @endcode
@@ -99,7 +99,7 @@ std::unique_ptr<cudf::column> is_letter(
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * ix = [3, 1, 4]
  * b1 = is_letter(st, VOWEL, ix)
  * b1 is now [true, true, false]
@@ -111,7 +111,7 @@ std::unique_ptr<cudf::column> is_letter(
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * ix = [3, -2, 4] // 2nd to last character in st[1] is checked
  * b2 = is_letter(st, CONSONANT, ix)
  * b2 is now [false, false, true]

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -11,3 +11,4 @@ nvtext
     ngrams_tokenize
     normalize
     replace
+    stemmer
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
@@ -0,0 +1,6 @@
+=======
+stemmer
+=======
+
+.. automodule:: pylibcudf.nvtext.stemmer
+   :members:
@@ -1,24 +1,19 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from enum import IntEnum
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
+from cudf.core.buffer import acquire_spill_lock
+
 from pylibcudf.libcudf.nvtext.stemmer cimport (
-    is_letter as cpp_is_letter,
     letter_type,
-    porter_stemmer_measure as cpp_porter_stemmer_measure,
     underlying_type_t_letter_type,
 )
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 class LetterType(IntEnum):
     CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
@@ -27,43 +22,34 @@ class LetterType(IntEnum):
 
 @acquire_spill_lock()
 def porter_stemmer_measure(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_porter_stemmer_measure(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.stemmer.porter_stemmer_measure(
+            strings.to_pylibcudf(mode="read"),
+        )
+    )
 
 
 @acquire_spill_lock()
 def is_letter(Column strings,
               object ltype,
               size_type index):
-    cdef column_view c_strings = strings.view()
-    cdef letter_type c_ltype = <letter_type>(
-        <underlying_type_t_letter_type> ltype
+    return Column.from_pylibcudf(
+        nvtext.stemmer.is_letter(
+            strings.to_pylibcudf(mode="read"),
+            ltype==LetterType.VOWEL,
+            index,
+        )
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_is_letter(c_strings, c_ltype, index))
-
-    return Column.from_unique_ptr(move(c_result))
 
 
 @acquire_spill_lock()
 def is_letter_multi(Column strings,
                     object ltype,
                     Column indices):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_indices = indices.view()
-    cdef letter_type c_ltype = <letter_type>(
-        <underlying_type_t_letter_type> ltype
+    return Column.from_pylibcudf(
+        nvtext.stemmer.is_letter(
+            strings.to_pylibcudf(mode="read"),
+            ltype==LetterType.VOWEL,
+            indices.to_pylibcudf(mode="read"),
+        )
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices))
-
-    return Column.from_unique_ptr(move(c_result))
@@ -1,16 +1,17 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
-    ctypedef enum letter_type:
-        CONSONANT 'nvtext::letter_type::CONSONANT'
-        VOWEL 'nvtext::letter_type::VOWEL'
+    cpdef enum class letter_type:
+        CONSONANT
+        VOWEL
 
     cdef unique_ptr[column] porter_stemmer_measure(
         const column_view & strings

@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-                   ngrams_tokenize.pyx normalize.pyx replace.pyx
+                   ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
 )
 
 set(linked_libraries cudf::cudf)

@@ -8,6 +8,7 @@ from . cimport (
     ngrams_tokenize,
     normalize,
     replace,
+    stemmer,
 )
 
 __all__ = [
@@ -18,4 +19,5 @@ __all__ = [
     "ngrams_tokenize",
     "normalize",
     "replace",
+    "stemmer",
 ]
@@ -8,6 +8,7 @@
     ngrams_tokenize,
     normalize,
     replace,
+    stemmer,
 )
 
 __all__ = [
@@ -18,4 +19,5 @@
     "ngrams_tokenize",
     "normalize",
     "replace",
+    "stemmer",
 ]
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.stemmer cimport letter_type
+from pylibcudf.libcudf.types cimport size_type
+
+ctypedef fused ColumnOrSize:
+    Column
+    size_type
+
+cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices)
+
+cpdef Column porter_stemmer_measure(Column input)
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.stemmer cimport (
+    is_letter as cpp_is_letter,
+    letter_type,
+    porter_stemmer_measure as cpp_porter_stemmer_measure,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column is_letter(
+    Column input,
+    bool check_vowels,
+    ColumnOrSize indices
+):
+    """
+    Returns boolean column indicating if the character
+    or characters at the provided character index or
+    indices (respectively) are consonants or vowels
+
+    For details, see :cpp:func:`is_letter`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    check_vowels : bool
+        If true, the check is for vowels. Otherwise the check is
+        for consonants.
+    indices : Union[Column, size_type]
+        The character position(s) to check in each string
+
+    Returns
+    -------
+    Column
+        New boolean column.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_is_letter(
+            input.view(),
+            letter_type.VOWEL if check_vowels else letter_type.CONSONANT,
+            indices if ColumnOrSize is size_type else indices.view()
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column porter_stemmer_measure(Column input):
+    """
+    Returns the Porter Stemmer measurements of a strings column.
+
+    For details, see :cpp:func:`porter_stemmer_measure`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column of words to measure
+
+    Returns
+    -------
+    Column
+        New column of measure values
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_porter_stemmer_measure(input.view())
+
+    return Column.from_libcudf(move(c_result))
@@ -0,0 +1,47 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    arr = ["trouble", "toy", "syzygy"]
+    return pa.array(arr)
+
+
+@pytest.mark.parametrize("check_vowels", [True, False])
+@pytest.mark.parametrize("indices", [[3, 1, 4], 1])
+def test_is_letter(input_col, check_vowels, indices):
+    def is_letter(s, i, check):
+        vowels = "aeiouy"
+        return (s[i] in vowels) == check
+
+    result = plc.nvtext.stemmer.is_letter(
+        plc.interop.from_arrow(input_col),
+        check_vowels,
+        plc.interop.from_arrow(pa.array(indices))
+        if isinstance(indices, list)
+        else indices,
+    )
+    expected = pa.array(
+        [
+            is_letter(
+                s,
+                indices[i] if isinstance(indices, list) else indices,
+                check_vowels,
+            )
+            for i, s in enumerate(input_col.to_pylist())
+        ]
+    )
+    assert_column_eq(result, expected)
+
+
+def test_porter_stemmer_measure(input_col):
+    result = plc.nvtext.stemmer.porter_stemmer_measure(
+        plc.interop.from_arrow(input_col),
+    )
+    expected = pa.array([1, 1, 2], type=pa.int32())
+    assert_column_eq(result, expected)
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,3 +11,4 @@ nvtext @@
         ngrams_tokenize
         normalize
         replace
+        stemmer