From ef45bb554ecd5739a557e8e9601dd83bba40942f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 21 Nov 2024 13:59:14 -0800
Subject: [PATCH 1/2] Remove cudf._lib.text in favor of inlining pylibcudf

---
 python/cudf/cudf/_lib/CMakeLists.txt |  1 -
 python/cudf/cudf/_lib/__init__.py    |  1 -
 python/cudf/cudf/_lib/text.pyx       | 53 ----------------------------
 python/cudf/cudf/io/text.py          | 45 +++++++++++++++++------
 4 files changed, 34 insertions(+), 66 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/text.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 2958c286d20..b6efdf6daeb 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -41,7 +41,6 @@ set(cython_sources
     stream_compaction.pyx
     string_casting.pyx
     strings_udf.pyx
-    text.pyx
     timezone.pyx
     transform.pyx
     transpose.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 19dc4488560..d2eb6f994a0 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -28,7 +28,6 @@
     string_casting,
     strings,
     strings_udf,
-    text,
     timezone,
     transpose,
 )
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
deleted file mode 100644
index 7942d067c2b..00000000000
--- a/python/cudf/cudf/_lib/text.pyx
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-from io import TextIOBase
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-def read_text(object filepaths_or_buffers,
-              str delimiter,
-              object byte_range,
-              bool strip_delimiters,
-              object compression,
-              object compression_offsets):
-    """
-    Cython function to call into libcudf API, see `multibyte_split`.
-
-    See Also
-    --------
-    cudf.io.text.read_text
-    """
-    if compression is None:
-        if isinstance(filepaths_or_buffers, TextIOBase):
-            datasource = plc.io.text.make_source(filepaths_or_buffers.read())
-        else:
-            datasource = plc.io.text.make_source_from_file(filepaths_or_buffers)
-    elif compression == "bgzip":
-        if isinstance(filepaths_or_buffers, TextIOBase):
-            raise ValueError("bgzip compression requires a file path")
-        if compression_offsets is not None:
-            if len(compression_offsets) != 2:
-                raise ValueError(
-                    "compression offsets need to consist of two elements")
-            datasource = plc.io.text.make_source_from_bgzip_file(
-                filepaths_or_buffers,
-                compression_offsets[0],
-                compression_offsets[1]
-            )
-        else:
-            datasource = plc.io.text.make_source_from_bgzip_file(
-                filepaths_or_buffers,
-            )
-    else:
-        raise ValueError("Only bgzip compression is supported at the moment")
-
-    options = plc.io.text.ParseOptions(
-        byte_range=byte_range, strip_delimiters=strip_delimiters
-    )
-    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 5ce738cae0e..01caeafe31d 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-from io import BytesIO, StringIO
+from io import BytesIO, StringIO, TextIOBase
+
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import text as libtext
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
@@ -33,13 +34,35 @@ def read_text(
         filepath_or_buffer, "read_text"
     )
 
-    return cudf.Series._from_column(
-        libtext.read_text(
-            filepath_or_buffer,
-            delimiter=delimiter,
-            byte_range=byte_range,
-            strip_delimiters=strip_delimiters,
-            compression=compression,
-            compression_offsets=compression_offsets,
-        )
+    if compression is None:
+        if isinstance(filepath_or_buffer, TextIOBase):
+            datasource = plc.io.text.make_source(filepath_or_buffer.read())
+        else:
+            datasource = plc.io.text.make_source_from_file(filepath_or_buffer)
+    elif compression == "bgzip":
+        if isinstance(filepath_or_buffer, TextIOBase):
+            raise ValueError("bgzip compression requires a file path")
+        if compression_offsets is not None:
+            if len(compression_offsets) != 2:
+                raise ValueError(
+                    "compression offsets need to consist of two elements"
+                )
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+                compression_offsets[0],
+                compression_offsets[1],
+            )
+        else:
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+            )
+    else:
+        raise ValueError("Only bgzip compression is supported at the moment")
+
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
     )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    result = cudf._lib.column.Column.from_pylibcudf(plc_column)
+
+    return cudf.Series._from_column(result)

From 63cdf0f20ef58879671e3b2faed0770bd1b7e8c4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 5 Dec 2024 15:43:46 -0800
Subject: [PATCH 2/2] Capitalize compression offsets error message in python/cudf/cudf/io/text.py

Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com>
---
 python/cudf/cudf/io/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 01caeafe31d..5e266c5ff55 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -45,7 +45,7 @@ def read_text(
         if compression_offsets is not None:
             if len(compression_offsets) != 2:
                 raise ValueError(
-                    "compression offsets need to consist of two elements"
+                    "Compression offsets need to consist of two elements"
                 )
             datasource = plc.io.text.make_source_from_bgzip_file(
                 filepath_or_buffer,