Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate NVText Stemming APIs to pylibcudf #17085

Merged
merged 30 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7a0e19f
Migrate nvtext generate_ngrams APIs to pylibcudf
Matt711 Oct 6, 2024
82aaabb
Migrate nvtext jaccard API to pylibcudf
Matt711 Oct 7, 2024
e51dfb1
Merge branch 'branch-24.12' into pylibcudf-nvtext-jaccard
Matt711 Oct 7, 2024
1db10e6
comment out test
Matt711 Oct 7, 2024
2c0534c
Merge branch 'branch-24.12' into pylibcudf-nvtext-jaccard
Matt711 Oct 7, 2024
af71a5b
add a test
Matt711 Oct 8, 2024
b8f8d7d
merge conflict
Matt711 Oct 8, 2024
860835d
clean up
Matt711 Oct 8, 2024
5e5cd03
Migrate Min Hashing APIs to pylibcudf
Matt711 Oct 8, 2024
cfbda56
merge conflict
Matt711 Oct 8, 2024
bfa583e
clean up, add missing tests
Matt711 Oct 9, 2024
2584241
Merge branch 'branch-24.12' into pylibcudf-nvtext-minhash
Matt711 Oct 9, 2024
c67b737
[WIP] Migrate remaining nvtext NGrams APIs to pylibcudf
Matt711 Oct 11, 2024
5033f2c
merge conflict
Matt711 Oct 12, 2024
e536b89
initial commit
Matt711 Oct 12, 2024
335803c
clean up
Matt711 Oct 12, 2024
42228eb
oops deleted the conda envs?
Matt711 Oct 12, 2024
94cd3b1
remove device_span
Matt711 Oct 13, 2024
8614bd7
Merge branch 'pylibcudf-nvtext-ngrams_tokenize' of github.com:Matt711…
Matt711 Oct 13, 2024
37c9107
clean up
Matt711 Oct 13, 2024
6de3871
fix typo
Matt711 Oct 13, 2024
6e47499
[WIP] Migrate NVText Stemming APIs to pylibcudf
Matt711 Oct 15, 2024
7df464a
Merge branch 'branch-24.12' into pylibcudf-nvtext-stemmer
Matt711 Oct 15, 2024
753769b
merge conflict
Matt711 Oct 22, 2024
e482445
clean up
Matt711 Oct 22, 2024
e12d684
address review
Matt711 Oct 22, 2024
43da0ee
enum class
Matt711 Oct 22, 2024
f06be3a
Merge branch 'branch-24.12' into pylibcudf-nvtext-stemmer
Matt711 Oct 22, 2024
1f2d57e
address merge conflict
Matt711 Oct 22, 2024
205ded5
address review
Matt711 Oct 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cpp/include/nvtext/stemmer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ enum class letter_type {
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* b1 = is_letter(st, VOWEL, 1)
* b1 is now [false, true, true]
* @endcode
Expand All @@ -62,7 +62,7 @@ enum class letter_type {
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string
* b2 is now [false, true, false]
* @endcode
Expand Down Expand Up @@ -99,7 +99,7 @@ std::unique_ptr<cudf::column> is_letter(
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* ix = [3, 1, 4]
* b1 = is_letter(st, VOWEL, ix)
* b1 is now [true, true, false]
Expand All @@ -111,7 +111,7 @@ std::unique_ptr<cudf::column> is_letter(
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* ix = [3, -2, 4] // 2nd to last character in st[1] is checked
* b2 = is_letter(st, CONSONANT, ix)
* b2 is now [false, false, true]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ nvtext
ngrams_tokenize
normalize
replace
stemmer
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
stemmer
=======

.. automodule:: pylibcudf.nvtext.stemmer
:members:
56 changes: 21 additions & 35 deletions python/cudf/cudf/_lib/nvtext/stemmer.pyx
Original file line number Diff line number Diff line change
@@ -1,24 +1,19 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from enum import IntEnum

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.nvtext.stemmer cimport (
is_letter as cpp_is_letter,
letter_type,
porter_stemmer_measure as cpp_porter_stemmer_measure,
underlying_type_t_letter_type,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

from pylibcudf import nvtext


class LetterType(IntEnum):
CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
Expand All @@ -27,43 +22,34 @@ class LetterType(IntEnum):

@acquire_spill_lock()
def porter_stemmer_measure(Column strings):
cdef column_view c_strings = strings.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_porter_stemmer_measure(c_strings))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
nvtext.stemmer.porter_stemmer_measure(
strings.to_pylibcudf(mode="read"),
)
)


@acquire_spill_lock()
def is_letter(Column strings,
object ltype,
size_type index):
cdef column_view c_strings = strings.view()
cdef letter_type c_ltype = <letter_type>(
<underlying_type_t_letter_type> ltype
return Column.from_pylibcudf(
nvtext.stemmer.is_letter(
strings.to_pylibcudf(mode="read"),
ltype==LetterType.VOWEL,
index,
)
)
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_is_letter(c_strings, c_ltype, index))

return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def is_letter_multi(Column strings,
object ltype,
Column indices):
cdef column_view c_strings = strings.view()
cdef column_view c_indices = indices.view()
cdef letter_type c_ltype = <letter_type>(
<underlying_type_t_letter_type> ltype
return Column.from_pylibcudf(
nvtext.stemmer.is_letter(
strings.to_pylibcudf(mode="read"),
ltype==LetterType.VOWEL,
indices.to_pylibcudf(mode="read"),
)
)
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices))

return Column.from_unique_ptr(move(c_result))
7 changes: 4 additions & 3 deletions python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.types cimport size_type


cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
ctypedef enum letter_type:
CONSONANT 'nvtext::letter_type::CONSONANT'
VOWEL 'nvtext::letter_type::VOWEL'
cpdef enum class letter_type:
CONSONANT
VOWEL

cdef unique_ptr[column] porter_stemmer_measure(
const column_view & strings
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ from . cimport (
ngrams_tokenize,
normalize,
replace,
stemmer,
)

__all__ = [
Expand All @@ -18,4 +19,5 @@ __all__ = [
"ngrams_tokenize",
"normalize",
"replace",
"stemmer",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ngrams_tokenize,
normalize,
replace,
stemmer,
)

__all__ = [
Expand All @@ -18,4 +19,5 @@
"ngrams_tokenize",
"normalize",
"replace",
"stemmer",
]
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.stemmer cimport letter_type
from pylibcudf.libcudf.types cimport size_type

# Fused type for the ``indices`` argument of ``is_letter``: either a
# Column or a single size_type value.
ctypedef fused ColumnOrSize:
Column
size_type

# Takes the input Column, a flag selecting vowels vs. consonants, and the
# position(s) to check (see ColumnOrSize); returns a Column.
cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices)

# Takes a Column and returns a Column.
cpdef Column porter_stemmer_measure(Column input)
76 changes: 76 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.nvtext.stemmer cimport (
is_letter as cpp_is_letter,
letter_type,
porter_stemmer_measure as cpp_porter_stemmer_measure,
)
from pylibcudf.libcudf.types cimport size_type


cpdef Column is_letter(
    Column input,
    bool check_vowels,
    ColumnOrSize indices
):
    """
    Test whether the character at a given position in each string
    is a vowel or a consonant.

    ``indices`` may be a single position applied to every string or
    a column supplying one position per string.

    For details, see :cpp:func:`is_letter`

    Parameters
    ----------
    input : Column
        Input strings
    check_vowels : bool
        If true, the check is for vowels. Otherwise the check is
        for consonants.
    indices : Union[Column, size_type]
        The character position(s) to check in each string

    Returns
    -------
    Column
        New boolean column.
    """
    cdef unique_ptr[column] c_result
    cdef letter_type c_ltype

    if check_vowels:
        c_ltype = letter_type.VOWEL
    else:
        c_ltype = letter_type.CONSONANT

    # ColumnOrSize is a fused type, so each specialization keeps only
    # the branch that matches its ``indices`` type.
    if ColumnOrSize is size_type:
        with nogil:
            c_result = cpp_is_letter(input.view(), c_ltype, indices)
    else:
        with nogil:
            c_result = cpp_is_letter(input.view(), c_ltype, indices.view())

    return Column.from_libcudf(move(c_result))


cpdef Column porter_stemmer_measure(Column input):
    """
    Compute the Porter Stemmer measure for every string in a column.

    For details, see :cpp:func:`porter_stemmer_measure`

    Parameters
    ----------
    input : Column
        Strings column of words to measure

    Returns
    -------
    Column
        New column of measure values
    """
    cdef unique_ptr[column] c_measures

    with nogil:
        c_measures = cpp_porter_stemmer_measure(input.view())

    return Column.from_libcudf(move(c_measures))
47 changes: 47 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_col():
    """Sample words shared by the stemmer tests, as an Arrow array."""
    return pa.array(["trouble", "toy", "syzygy"])


@pytest.mark.parametrize("check_vowels", [True, False])
@pytest.mark.parametrize("indices", [[3, 1, 4], [3, -2, 4], 1, -1])
def test_is_letter(input_col, check_vowels, indices):
    """Check ``is_letter`` against a pure-Python reference.

    ``indices`` is either one position applied to every string or a
    list holding one position per string; negative positions count
    from the end of the string.
    """

    def is_vowel(s, i):
        # "y" is not unconditionally a vowel: it only counts as one when
        # the character before it is not a vowel. The final "y" of "toy"
        # is therefore a consonant, while the final "y" of "syzygy" is a
        # vowel -- a plain ``s[i] in "aeiouy"`` check gets "toy" wrong.
        pos = i + len(s) if i < 0 else i
        if s[pos] in "aeiou":
            return True
        return s[pos] == "y" and pos > 0 and s[pos - 1] not in "aeiou"

    per_row = isinstance(indices, list)
    result = plc.nvtext.stemmer.is_letter(
        plc.interop.from_arrow(input_col),
        check_vowels,
        plc.interop.from_arrow(pa.array(indices)) if per_row else indices,
    )
    expected = pa.array(
        [
            is_vowel(s, indices[row] if per_row else indices) == check_vowels
            for row, s in enumerate(input_col.to_pylist())
        ]
    )
    assert_column_eq(result, expected)


def test_porter_stemmer_measure(input_col):
    """Each sample word yields its expected Porter stemmer measure."""
    plc_col = plc.interop.from_arrow(input_col)
    got = plc.nvtext.stemmer.porter_stemmer_measure(plc_col)
    want = pa.array([1, 1, 2], type=pa.int32())
    assert_column_eq(got, want)
Loading