From ef45bb554ecd5739a557e8e9601dd83bba40942f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:59:14 -0800 Subject: [PATCH 1/2] Remove cudf._lib.text in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/text.pyx | 53 ---------------------------- python/cudf/cudf/io/text.py | 45 +++++++++++++++++------ 4 files changed, 34 insertions(+), 66 deletions(-) delete mode 100644 python/cudf/cudf/_lib/text.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 2958c286d20..b6efdf6daeb 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -41,7 +41,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - text.pyx timezone.pyx transform.pyx transpose.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 19dc4488560..d2eb6f994a0 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -28,7 +28,6 @@ string_casting, strings, strings_udf, - text, timezone, transpose, ) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx deleted file mode 100644 index 7942d067c2b..00000000000 --- a/python/cudf/cudf/_lib/text.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from io import TextIOBase - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def read_text(object filepaths_or_buffers, - str delimiter, - object byte_range, - bool strip_delimiters, - object compression, - object compression_offsets): - """ - Cython function to call into libcudf API, see `multibyte_split`. - - See Also - -------- - cudf.io.text.read_text - """ - if compression is None: - if isinstance(filepaths_or_buffers, TextIOBase): - datasource = plc.io.text.make_source(filepaths_or_buffers.read()) - else: - datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) - elif compression == "bgzip": - if isinstance(filepaths_or_buffers, TextIOBase): - raise ValueError("bgzip compression requires a file path") - if compression_offsets is not None: - if len(compression_offsets) != 2: - raise ValueError( - "compression offsets need to consist of two elements") - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - compression_offsets[0], - compression_offsets[1] - ) - else: - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - ) - else: - raise ValueError("Only bgzip compression is supported at the moment") - - options = plc.io.text.ParseOptions( - byte_range=byte_range, strip_delimiters=strip_delimiters - ) - plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 5ce738cae0e..01caeafe31d 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,9 +1,10 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. -from io import BytesIO, StringIO +from io import BytesIO, StringIO, TextIOBase + +import pylibcudf as plc import cudf -from cudf._lib import text as libtext from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -33,13 +34,35 @@ def read_text( filepath_or_buffer, "read_text" ) - return cudf.Series._from_column( - libtext.read_text( - filepath_or_buffer, - delimiter=delimiter, - byte_range=byte_range, - strip_delimiters=strip_delimiters, - compression=compression, - compression_offsets=compression_offsets, - ) + if compression is None: + if isinstance(filepath_or_buffer, TextIOBase): + datasource = plc.io.text.make_source(filepath_or_buffer.read()) + else: + datasource = plc.io.text.make_source_from_file(filepath_or_buffer) + elif compression == "bgzip": + if isinstance(filepath_or_buffer, TextIOBase): + raise ValueError("bgzip compression requires a file path") + if compression_offsets is not None: + if len(compression_offsets) != 2: + raise ValueError( + "compression offsets need to consist of two elements" + ) + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + compression_offsets[0], + compression_offsets[1], + ) + else: + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + ) + else: + raise ValueError("Only bgzip compression is supported at the moment") + + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + result = cudf._lib.column.Column.from_pylibcudf(plc_column) + + return cudf.Series._from_column(result) From 63cdf0f20ef58879671e3b2faed0770bd1b7e8c4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:43:46 -0800 Subject: [PATCH 2/2] Update python/cudf/cudf/io/text.py Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com> --- python/cudf/cudf/io/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 01caeafe31d..5e266c5ff55 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -45,7 +45,7 @@ def read_text( if compression_offsets is not None: if len(compression_offsets) != 2: raise ValueError( - "compression offsets need to consist of two elements" + "Compression offsets need to consist of two elements" ) datasource = plc.io.text.make_source_from_bgzip_file( filepath_or_buffer,