Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string.split APIs to pylibcudf #16940

Open
wants to merge 5 commits into
base: branch-24.12
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ strings
repeat
replace
slice
split
strip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
split
=====

.. automodule:: pylibcudf.strings.split
    :members:
59 changes: 10 additions & 49 deletions python/cudf/cudf/_lib/strings/split/partition.pyx
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.split.partition cimport (
partition as cpp_partition,
rpartition as cpp_rpartition,
)
from pylibcudf.libcudf.table.table cimport table

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport data_from_unique_ptr

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -25,25 +14,11 @@ def partition(Column source_strings,
Returns data by splitting the `source_strings`
column at the first occurrence of the specified `py_delimiter`.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_partition(
source_view,
scalar_str[0]
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.partition.partition(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -53,22 +28,8 @@ def rpartition(Column source_strings,
Returns data by splitting the `source_strings`
column at the last occurrence of the specified `py_delimiter`.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_rpartition(
source_view,
scalar_str[0]
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.partition.rpartition(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
217 changes: 54 additions & 163 deletions python/cudf/cudf/_lib/strings/split/split.pyx
Original file line number Diff line number Diff line change
@@ -1,33 +1,12 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program
from pylibcudf.libcudf.strings.split.split cimport (
rsplit as cpp_rsplit,
rsplit_re as cpp_rsplit_re,
rsplit_record as cpp_rsplit_record,
rsplit_record_re as cpp_rsplit_record_re,
split as cpp_split,
split_re as cpp_split_re,
split_record as cpp_split_record,
split_record_re as cpp_split_record_re,
)
from pylibcudf.libcudf.table.table cimport table
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport data_from_unique_ptr

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -39,26 +18,12 @@ def split(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the beginning.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_split(
source_view,
scalar_str[0],
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.split(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -70,25 +35,12 @@ def split_record(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the beginning.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_split_record(
source_view,
scalar_str[0],
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.split_record(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -100,26 +52,12 @@ def rsplit(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the end.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_rsplit(
source_view,
scalar_str[0],
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.rsplit(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the end.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_rsplit_record(
source_view,
scalar_str[0],
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.rsplit_record(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -160,24 +85,15 @@ def split_re(Column source_strings,
Returns data by splitting the `source_strings`
column around the delimiters identified by `pattern`.
"""
cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_split_re(
source_view,
dereference(c_prog),
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.split_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings,
column around the delimiters identified by `pattern`.
The delimiters are searched starting from the end of each string.
"""
cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_rsplit_re(
source_view,
dereference(c_prog),
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.rsplit_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -217,23 +124,15 @@ def split_record_re(Column source_strings,
Returns a Column by splitting the `source_strings`
column around the delimiters identified by `pattern`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_split_record_re(
source_view,
dereference(c_prog),
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.split_record_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings,
column around the delimiters identified by `pattern`.
The delimiters are searched starting from the end of each string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_rsplit_record_re(
source_view,
dereference(c_prog),
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.rsplit_record_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return Column.from_pylibcudf(plc_column)
Loading
Loading