From 5bcf14a4ed15a679a13b0bc7ba3d7773a00f7c98 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:12:46 -0700 Subject: [PATCH 1/4] Add string.split APIs to pylibcudf --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/split.rst | 6 ++ .../cudf/_lib/strings/split/partition.pyx | 59 +++---------- .../libcudf/strings/split/partition.pxd | 4 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 2 + .../pylibcudf/pylibcudf/strings/__init__.py | 2 + .../pylibcudf/strings/split/CMakeLists.txt | 22 +++++ .../pylibcudf/strings/split/__init__.pxd | 2 + .../pylibcudf/strings/split/__init__.py | 2 + .../pylibcudf/strings/split/partition.pxd | 10 +++ .../pylibcudf/strings/split/partition.pyx | 82 +++++++++++++++++++ .../pylibcudf/strings/split/split.pxd | 0 .../pylibcudf/strings/split/split.pyx | 0 .../tests/test_string_split_partition.py | 43 ++++++++++ 15 files changed, 185 insertions(+), 51 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst create mode 100644 python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.py create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pyx create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_partition.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..b70953cd56b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ 
b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -15,4 +15,5 @@ strings repeat replace slice + split strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar 
delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 142bc124ca2..498340d7497 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -38,3 +38,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d8afccc7336..fb137fab745 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -14,6 +14,7 @@ from . cimport ( regex_program, replace, slice, + split, strip, ) from .side_type cimport side_type @@ -33,5 +34,6 @@ __all__ = [ "replace", "slice", "strip", + "split", "side_type", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 22452812e42..d3e7cb59dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -15,6 +15,7 @@ repeat, replace, slice, + split, strip, ) from .side_type import SideType @@ -34,5 +35,6 @@ "replace", "slice", "strip", + "split", "SideType", ] diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd new file mode 100644 index 00000000000..72086e57d9f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py new file mode 100644 index 00000000000..2033e5e275b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..9e0b471f180 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter) + +cpdef Table rpartition(Column input, Scalar delimiter) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..ab1600f1733 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.split cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table partition(Column input, Scalar delimiter): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::partition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_partition.partition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rpartition(Column input, Scalar delimiter): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rpartition`. 
+ + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_partition.rpartition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..80cae8d1c6b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) From 6e7482b32633b5fa229c907f409b34256d7512cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:19:16 -0700 Subject: [PATCH 2/4] Fix python call --- python/cudf/cudf/_lib/strings/split/partition.pyx | 4 ++-- python/cudf/cudf/core/column/string.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index 5319addc41c..220a9621adb 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -18,7 +18,7 @@ def partition(Column source_strings, source_strings.to_pylibcudf(mode="read"), py_delimiter.device_value.c_value ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) + return dict(enumerate((Column.from_pylibcudf(col) for col in plc_table.columns()))) @acquire_spill_lock() @@ -32,4 +32,4 @@ def rpartition(Column source_strings, source_strings.to_pylibcudf(mode="read"), 
py_delimiter.device_value.c_value ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) + return dict(enumerate((Column.from_pylibcudf(col) for col in plc_table.columns()))) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..0a292837480 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2820,7 +2820,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2885,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) From 3a209408c7c96ecc56d61866eeac984a3d418205 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 26 Sep 2024 16:49:20 -0700 Subject: [PATCH 3/4] Add pxd for split --- .../pylibcudf/libcudf/strings/split/split.pxd | 24 ++++++------- .../pylibcudf/strings/split/split.pxd | 24 +++++++++++++ .../pylibcudf/strings/split/split.pyx | 35 +++++++++++++++++++ 3 files changed, 71 insertions(+), 12 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) 
except + cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd index e69de29bb2d..355a1874298 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index e69de29bb2d..2103b5a351e 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +# from pylibcudf.libcudf.column.column cimport column +# from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +# from pylibcudf.strings.split cimport split as cpp_split +from pylibcudf.table cimport Table + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): + pass + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): + pass + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): + pass + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit): + pass + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): + pass + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + pass + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + pass + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + pass From 9bc0ce276e6d51e2a91502036d856917ac54e801 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 27 Sep 2024 16:12:06 -0700 Subject: [PATCH 4/4] Add split implementations and tests --- .../cudf/_lib/strings/split/partition.pyx | 4 +- python/cudf/cudf/_lib/strings/split/split.pyx | 217 +++--------- python/cudf/cudf/core/column/string.py | 8 +- .../pylibcudf/strings/split/split.pyx | 315 +++++++++++++++++- .../tests/test_string_split_split.py | 130 ++++++++ 5 files changed, 493 insertions(+), 181 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_split.py diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index 220a9621adb..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ 
b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -18,7 +18,7 @@ def partition(Column source_strings, source_strings.to_pylibcudf(mode="read"), py_delimiter.device_value.c_value ) - return dict(enumerate((Column.from_pylibcudf(col) for col in plc_table.columns()))) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -32,4 +32,4 @@ def rpartition(Column source_strings, source_strings.to_pylibcudf(mode="read"), py_delimiter.device_value.c_value ) - return dict(enumerate((Column.from_pylibcudf(col) for col in plc_table.columns()))) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport 
DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0a292837480..da422db5eae 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2546,9 +2546,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2719,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index 2103b5a351e..a7d7f39fc47 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -1,35 +1,326 @@ # 
Copyright (c) 2024, NVIDIA CORPORATION.
-
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
 from pylibcudf.column cimport Column
-# from pylibcudf.libcudf.column.column cimport column
-# from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split cimport split as cpp_split
+from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.regex_program cimport RegexProgram
-# from pylibcudf.strings.split cimport split as cpp_split
 from pylibcudf.table cimport Table
 
+from cython.operator import dereference
+
 
 cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
-    pass
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.split(
+                strings_column.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
 
 cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit):
-    pass
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit(
+                strings_column.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
 
 cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit):
-    pass
+    """
+    Splits individual strings elements into a list of strings.
+
+    For details, see :cpp:func:`cudf::strings::split_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.split_record(
+                strings.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 
 cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit):
-    pass
+    """
+    Splits individual strings elements into a list of strings starting
+    from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit_record(
+                strings.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 
 cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit):
-    pass
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.split_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
 
 cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit):
-    pass
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string starting from
+    the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
 
 cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit):
-    pass
+    """
+    Splits strings elements into a list column of strings using the given
+    regex_program to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.split_record_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
 
 cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit):
-    pass
+    """
+    Splits strings elements into a list column of strings using the given
+    regex_program to delimit each string starting from the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit_record_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
new file mode 100644
index 00000000000..2aeffac8209
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+
+@pytest.fixture
+def data_col():
+    pa_array = pa.array(["a_b_c", "d-e-f", None])
+    plc_column = plc.interop.from_arrow(pa_array)
+    return pa_array, plc_column
+
+
+@pytest.fixture
+def delimiter():
+    delimiter = "_"
+    plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter))
+    return delimiter, plc_delimiter
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"
+
+
+def test_split(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.split(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.rsplit_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1, reverse=True)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.split_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.rsplit_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.split_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.rsplit_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        -1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter)
+    assert_column_eq(expected, result)