From 62378c8e2abba3f01cdc7607a5fe2e5258f4fabe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:41:09 -0700 Subject: [PATCH] Add string.convert.convert_urls APIs to pylibcudf --- .../cudf/strings/convert/convert_urls.hpp | 4 +- .../_lib/strings/convert/convert_urls.pyx | 36 +++-------- .../libcudf/strings/convert/convert_urls.pxd | 4 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 7 ++- .../pylibcudf/strings/convert/__init__.py | 7 ++- .../strings/convert/convert_urls.pxd | 8 +++ .../strings/convert/convert_urls.pyx | 63 +++++++++++++++++++ .../tests/test_string_convert_urls.py | 36 +++++++++++ 9 files changed, 131 insertions(+), 36 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index d6e87f9d543..febc63d8779 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -28,7 +28,7 @@ namespace strings { */ /** - * @brief Decodes each string using URL encoding. + * @brief Encodes each string using URL encoding. * * Converts mostly non-ascii characters and control characters into UTF-8 hex code-points * prefixed with '%'. For example, the space character must be converted to characters '%20' where @@ -49,7 +49,7 @@ std::unique_ptr url_encode( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Encodes each string using URL encoding. + * @brief Decodes each string using URL encoding. * * Converts all character sequences starting with '%' into character code-points * interpreting the 2 following characters as hex values to create the code-point. diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx index e52116d6247..d5c2f771970 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx @@ -1,17 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_urls cimport ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) - from cudf._lib.column cimport Column @@ -28,17 +20,10 @@ def url_decode(Column source_strings): ------- URL decoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_decode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_decode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,14 +42,7 @@ def url_encode(Column source_strings): ------- URL encoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_encode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_encode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index 5c07b698454..cb319ad143b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] url_decode( - column_view input_col) except + + column_view input) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 3febc78dfd2..e8613c00136 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx convert_urls.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 5525bca46d6..6929e2cbd99 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_booleans, convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 2340ebe9a26..a520e8a7dd5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_booleans, convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd new file mode 100644 index 00000000000..da05ce93426 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column url_encode(Column Input) + +cpdef Column url_decode(Column Input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx new file mode 100644 index 00000000000..a5e080e53b7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls + + +cpdef Column url_encode(Column input): + """ + Encodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_encode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_urls.url_encode( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column url_decode(Column input): + """ + Decodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_decode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_urls.url_decode( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py new file mode 100644 index 00000000000..fee8c3fb8f6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import urllib + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_url_encode(): + data = ["/home/nfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_encode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.quote(url, safe="") if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) + + +def test_url_decode(): + data = ["%2Fhome%2fnfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_decode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.unquote(url) if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected)