rapidsai · mroeschke · Sep 26, 2024 · Sep 26, 2024 · Sep 26, 2024 · Sep 27, 2024
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -15,4 +15,5 @@ strings
  repeat
  replace
  slice
+ split
  strip
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst
@@ -0,0 +1,6 @@
+=====
+split
+=====
+
+.. automodule:: pylibcudf.strings.split
+ :members:
@@ -1,21 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.split.partition cimport (
- partition as cpp_partition,
- rpartition as cpp_rpartition,
-)
-from pylibcudf.libcudf.table.table cimport table
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -25,25 +14,11 @@ def partition(Column source_strings,
  Returns data by splitting the `source_strings`
  column at the first occurrence of the specified `py_delimiter`.
  """
-
- cdef DeviceScalar delimiter = py_delimiter.device_value
-
- cdef unique_ptr[table] c_result
- cdef column_view source_view = source_strings.view()
- cdef const string_scalar* scalar_str = <const string_scalar*>(
- delimiter.get_raw_ptr()
- )
-
- with nogil:
- c_result = move(cpp_partition(
- source_view,
- scalar_str[0]
- ))
-
- return data_from_unique_ptr(
- move(c_result),
- column_names=range(0, c_result.get()[0].num_columns())
+ plc_table = plc.strings.split.partition.partition(
+ source_strings.to_pylibcudf(mode="read"),
+ py_delimiter.device_value.c_value
  )
+ return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -53,22 +28,8 @@ def rpartition(Column source_strings,
  Returns a Column by splitting the `source_strings`
  column at the last occurrence of the specified `py_delimiter`.
  """
-
- cdef DeviceScalar delimiter = py_delimiter.device_value
-
- cdef unique_ptr[table] c_result
- cdef column_view source_view = source_strings.view()
- cdef const string_scalar* scalar_str = <const string_scalar*>(
- delimiter.get_raw_ptr()
- )
-
- with nogil:
- c_result = move(cpp_rpartition(
- source_view,
- scalar_str[0]
- ))
-
- return data_from_unique_ptr(
- move(c_result),
- column_names=range(0, c_result.get()[0].num_columns())
+ plc_table = plc.strings.split.partition.rpartition(
+ source_strings.to_pylibcudf(mode="read"),
+ py_delimiter.device_value.c_value
  )
+ return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
@@ -1,33 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.strings.split.split cimport (
- rsplit as cpp_rsplit,
- rsplit_re as cpp_rsplit_re,
- rsplit_record as cpp_rsplit_record,
- rsplit_record_re as cpp_rsplit_record_re,
- split as cpp_split,
- split_re as cpp_split_re,
- split_record as cpp_split_record,
- split_record_re as cpp_split_record_re,
-)
-from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -39,26 +18,12 @@ def split(Column source_strings,
  column around the specified `py_delimiter`.
  The split happens from beginning.
  """
-
- cdef DeviceScalar delimiter = py_delimiter.device_value
-
- cdef unique_ptr[table] c_result
- cdef column_view source_view = source_strings.view()
- cdef const string_scalar* scalar_str = <const string_scalar*>(
- delimiter.get_raw_ptr()
- )
-
- with nogil:
- c_result = move(cpp_split(
- source_view,
- scalar_str[0],
- maxsplit
- ))
-
- return data_from_unique_ptr(
- move(c_result),
- column_names=range(0, c_result.get()[0].num_columns())
+ plc_table = plc.strings.split.split.split(
+ source_strings.to_pylibcudf(mode="read"),
+ py_delimiter.device_value.c_value,
+ maxsplit,
  )
+ return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -70,25 +35,12 @@ def split_record(Column source_strings,
  column around the specified `py_delimiter`.
  The split happens from beginning.
  """
-
- cdef DeviceScalar delimiter = py_delimiter.device_value
-
- cdef unique_ptr[column] c_result
- cdef column_view source_view = source_strings.view()
- cdef const string_scalar* scalar_str = <const string_scalar*>(
- delimiter.get_raw_ptr()
- )
-
- with nogil:
- c_result = move(cpp_split_record(
- source_view,
- scalar_str[0],
- maxsplit
- ))
-
- return Column.from_unique_ptr(
- move(c_result),
+ plc_column = plc.strings.split.split.split_record(
+ source_strings.to_pylibcudf(mode="read"),
+ py_delimiter.device_value.c_value,
+ maxsplit,
  )
+ return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -100,26 +52,12 @@ def rsplit(Column source_strings,
  column around the specified `py_delimiter`.
  The split happens from the end.
  """
-
- cdef DeviceScalar delimiter = py_delimiter.device_value
-
- cdef unique_ptr[table] c_result
- cdef column_view source_view = source_strings.view()
- cdef const string_scalar* scalar_str = <const string_scalar*>(
- delimiter.get_raw_ptr()
- )
-
- with nogil:
- c_result = move(cpp_rsplit(
- source_view,
- scalar_str[0],
- maxsplit
- ))
-
- return data_from_unique_ptr(
- move(c_result),
- column_names=range(0, c_result.get()[0].num_columns())
+ plc_table = plc.strings.split.split.rsplit(
+ source_strings.to_pylibcudf(mode="read"),
+ py_delimiter.device_value.c_value,
+ maxsplit,
  )
+ return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -131,25 +69,12 @@ def rsplit_record(Column source_strings,
  column around the specified `py_delimiter`.
  The split happens from the end.
  """
-
- cdef DeviceScalar delimiter = py_delimiter.device_value
-
- cdef unique_ptr[column] c_result
- cdef column_view source_view = source_strings.view()
- cdef const string_scalar* scalar_str = <const string_scalar*>(
- delimiter.get_raw_ptr()
- )
-
- with nogil:
- c_result = move(cpp_rsplit_record(
- source_view,
- scalar_str[0],
- maxsplit
- ))
-
- return Column.from_unique_ptr(
- move(c_result),
+ plc_column = plc.strings.split.split.rsplit_record(
+ source_strings.to_pylibcudf(mode="read"),
+ py_delimiter.device_value.c_value,
+ maxsplit,
  )
+ return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -160,24 +85,15 @@ def split_re(Column source_strings,
  Returns data by splitting the `source_strings`
  column around the delimiters identified by `pattern`.
  """
- cdef unique_ptr[table] c_result
- cdef column_view source_view = source_strings.view()
- cdef string pattern_string = <string>str(pattern).encode()
- cdef regex_flags c_flags = regex_flags.DEFAULT
- cdef unique_ptr[regex_program] c_prog
-
- with nogil:
- c_prog = move(regex_program.create(pattern_string, c_flags))
- c_result = move(cpp_split_re(
- source_view,
- dereference(c_prog),
- maxsplit
- ))
-
- return data_from_unique_ptr(
- move(c_result),
- column_names=range(0, c_result.get()[0].num_columns())
+ plc_table = plc.strings.split.split.split_re(
+ source_strings.to_pylibcudf(mode="read"),
+ plc.strings.regex_program.RegexProgram.create(
+ str(pattern),
+ plc.strings.regex_flags.RegexFlags.DEFAULT,
+ ),
+ maxsplit,
  )
+ return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -189,24 +105,15 @@ def rsplit_re(Column source_strings,
  column around the delimiters identified by `pattern`.
  The delimiters are searched starting from the end of each string.
  """
- cdef unique_ptr[table] c_result
- cdef column_view source_view = source_strings.view()
- cdef string pattern_string = <string>str(pattern).encode()
- cdef regex_flags c_flags = regex_flags.DEFAULT
- cdef unique_ptr[regex_program] c_prog
-
- with nogil:
- c_prog = move(regex_program.create(pattern_string, c_flags))
- c_result = move(cpp_rsplit_re(
- source_view,
- dereference(c_prog),
- maxsplit
- ))
-
- return data_from_unique_ptr(
- move(c_result),
- column_names=range(0, c_result.get()[0].num_columns())
+ plc_table = plc.strings.split.split.rsplit_re(
+ source_strings.to_pylibcudf(mode="read"),
+ plc.strings.regex_program.RegexProgram.create(
+ str(pattern),
+ plc.strings.regex_flags.RegexFlags.DEFAULT,
+ ),
+ maxsplit,
  )
+ return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -217,23 +124,15 @@ def split_record_re(Column source_strings,
  Returns a Column by splitting the `source_strings`
  column around the delimiters identified by `pattern`.
  """
- cdef unique_ptr[column] c_result
- cdef column_view source_view = source_strings.view()
- cdef string pattern_string = <string>str(pattern).encode()
- cdef regex_flags c_flags = regex_flags.DEFAULT
- cdef unique_ptr[regex_program] c_prog
-
- with nogil:
- c_prog = move(regex_program.create(pattern_string, c_flags))
- c_result = move(cpp_split_record_re(
- source_view,
- dereference(c_prog),
- maxsplit
- ))
-
- return Column.from_unique_ptr(
- move(c_result),
+ plc_column = plc.strings.split.split.split_record_re(
+ source_strings.to_pylibcudf(mode="read"),
+ plc.strings.regex_program.RegexProgram.create(
+ str(pattern),
+ plc.strings.regex_flags.RegexFlags.DEFAULT,
+ ),
+ maxsplit,
  )
+ return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings,
  column around the delimiters identified by `pattern`.
  The delimiters are searched starting from the end of each string.
  """
- cdef unique_ptr[column] c_result
- cdef column_view source_view = source_strings.view()
- cdef string pattern_string = <string>str(pattern).encode()
- cdef regex_flags c_flags = regex_flags.DEFAULT
- cdef unique_ptr[regex_program] c_prog
-
- with nogil:
- c_prog = move(regex_program.create(pattern_string, c_flags))
- c_result = move(cpp_rsplit_record_re(
- source_view,
- dereference(c_prog),
- maxsplit
- ))
-
- return Column.from_unique_ptr(
- move(c_result),
+ plc_column = plc.strings.split.split.rsplit_record_re(
+ source_strings.to_pylibcudf(mode="read"),
+ plc.strings.regex_program.RegexProgram.create(
+ str(pattern),
+ plc.strings.regex_flags.RegexFlags.DEFAULT,
+ ),
+ maxsplit,
  )
+ return Column.from_pylibcudf(plc_column)