Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start migrating I/O writers to pylibcudf (starting with JSON) #15952

Merged
merged 38 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
591cdd2
Start migrating I/O writers to pylibcudf (starting with JSON)
lithomas1 Jun 6, 2024
15daaaa
update docs
lithomas1 Jun 7, 2024
72204f1
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 7, 2024
c24664c
update and start writing tests
lithomas1 Jun 7, 2024
8c88c7c
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 11, 2024
2b3853f
add some tests
lithomas1 Jun 11, 2024
cd6df5e
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 11, 2024
c54316e
update
lithomas1 Jun 11, 2024
dc93356
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 12, 2024
8c4c4e4
address comments
lithomas1 Jun 12, 2024
63358e9
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 12, 2024
9150a6c
try something else
lithomas1 Jun 12, 2024
b1951d0
try fix
lithomas1 Jun 13, 2024
1228569
update following feedback
lithomas1 Jun 13, 2024
699efd3
cleanup tests
lithomas1 Jun 13, 2024
e242182
address more comments
lithomas1 Jun 13, 2024
564358f
Merge branch 'branch-24.08' into pylibcudf-io-writers
lithomas1 Jun 17, 2024
e0901dd
fix bad merge
lithomas1 Jun 17, 2024
d22953f
Merge branch 'branch-24.08' into pylibcudf-io-writers
lithomas1 Jun 18, 2024
604c16d
address more comments
lithomas1 Jun 24, 2024
e6c3ec7
address more comments
lithomas1 Jun 24, 2024
624d444
fix all nested struct cases
lithomas1 Jun 24, 2024
53b821c
Merge branch 'pylibcudf-io-writers' of github.com:lithomas1/cudf into…
lithomas1 Jun 24, 2024
186a2fb
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 24, 2024
9a6a896
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 25, 2024
0ed9af6
Fix error in testing utils
lithomas1 Jun 25, 2024
aff6178
small test fixes
lithomas1 Jun 25, 2024
f7cd9e6
cleanup utils
lithomas1 Jun 26, 2024
c5a3fbe
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 27, 2024
79c1dfd
clean source_or_sink
lithomas1 Jun 27, 2024
8fc139f
Merge branch 'pylibcudf-io-writers' of github.com:lithomas1/cudf into…
lithomas1 Jun 27, 2024
e940e30
Address code review
lithomas1 Jun 27, 2024
e57a677
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jun 29, 2024
7806ce4
simplify again
lithomas1 Jun 29, 2024
25c25d4
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
lithomas1 Jul 1, 2024
60287e1
address more comments
lithomas1 Jul 1, 2024
205c32c
Update python/cudf/cudf/_lib/pylibcudf/io/types.pyx
vyasr Jul 2, 2024
d325b64
Merge branch 'branch-24.08' into pylibcudf-io-writers
vyasr Jul 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ I/O Functions
:maxdepth: 1

avro
json
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
JSON
====

.. automodule:: cudf._lib.pylibcudf.io.json
:members:
98 changes: 29 additions & 69 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.map cimport map
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_struct_field_names,
)
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_reader_options,
json_recovery_mode_t,
json_writer_options,
read_json as libcudf_read_json,
schema_element,
write_json as libcudf_write_json,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_name_info,
compression_type,
sink_info,
table_metadata,
table_with_metadata,
)
from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport data_from_unique_ptr

import cudf._lib.pylibcudf as plc


cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
Expand Down Expand Up @@ -175,45 +164,27 @@ def write_json(
--------
cudf.to_json
"""
cdef table_view input_table_view = table_view_from_table(
table, ignore_index=True
)

cdef unique_ptr[data_sink] data_sink_c
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
cdef string na_c = na_rep.encode()
cdef bool include_nulls_c = include_nulls
cdef bool lines_c = lines
cdef int rows_per_chunk_c = rows_per_chunk
cdef string true_value_c = 'true'.encode()
cdef string false_value_c = 'false'.encode()
cdef table_metadata tbl_meta

num_index_cols_meta = 0
cdef column_name_info child_info
for i, name in enumerate(table._column_names, num_index_cols_meta):
child_info.name = name.encode()
tbl_meta.schema_info.push_back(child_info)
_set_col_children_metadata(
table[name]._column,
tbl_meta.schema_info[i]
)
cdef list colnames = []

cdef json_writer_options options = move(
json_writer_options.builder(sink_info_c, input_table_view)
.metadata(tbl_meta)
.na_rep(na_c)
.include_nulls(include_nulls_c)
.lines(lines_c)
.rows_per_chunk(rows_per_chunk_c)
.true_value(true_value_c)
.false_value(false_value_c)
.build()
)
for name in table._column_names:
colnames.append((name, _dtype_to_names_list(table[name]._column)))

try:
with nogil:
libcudf_write_json(options)
plc.io.json.write_json(
plc.io.SinkInfo([path_or_buf]),
plc.io.TableWithMetadata(
plc.Table([
c.to_pylibcudf(mode="read") for c in table._columns
]),
colnames
),
na_rep,
include_nulls,
lines,
rows_per_chunk,
true_value="true",
false_value="false"
)
except OverflowError:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
Expand Down Expand Up @@ -254,23 +225,12 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
)
return dtype_to_data_type(dtype)

cdef _set_col_children_metadata(Column col,
column_name_info& col_meta):
cdef column_name_info child_info

def _dtype_to_names_list(col):
if isinstance(col.dtype, cudf.StructDtype):
for i, (child_col, name) in enumerate(
zip(col.children, list(col.dtype.fields))
):
child_info.name = name.encode()
col_meta.children.push_back(child_info)
_set_col_children_metadata(
child_col, col_meta.children[i]
)
return [(name, _dtype_to_names_list(child))
for name, child in zip(col.dtype.fields, col.children)]
elif isinstance(col.dtype, cudf.ListDtype):
for i, child_col in enumerate(col.children):
col_meta.children.push_back(child_info)
_set_col_children_metadata(
child_col, col_meta.children[i]
)
else:
return
return [("", _dtype_to_names_list(child))
for child in col.children]
return []
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources avro.pyx datasource.pyx types.pyx)
set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand All @@ -21,5 +21,7 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json
pylibcudf_io_types
)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, datasource, types
from . cimport avro, datasource, json, types
from .types cimport SourceInfo, TableWithMetadata
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, datasource, types
from .types import SourceInfo, TableWithMetadata
from . import avro, datasource, json, types
from .types import SinkInfo, SourceInfo, TableWithMetadata
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ cpdef TableWithMetadata read_avro(
size_type num_rows = -1
):
"""
Reads an Avro dataset into a set of columns.
Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.

Parameters
----------
Expand All @@ -36,7 +36,7 @@ cpdef TableWithMetadata read_avro(
Returns
-------
TableWithMetadata
The Table and its corresponding metadata that was read in.
The Table and its corresponding metadata (column names) that were read in.
"""
cdef vector[string] c_columns
if columns is not None and len(columns) > 0:
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.string cimport string
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved

from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
from cudf._lib.pylibcudf.libcudf.types cimport size_type


cpdef void write_json(
SinkInfo sink_info,
TableWithMetadata tbl,
str na_rep = *,
bool include_nulls = *,
bool lines = *,
size_type rows_per_chunk = *,
str true_value = *,
str false_value = *
)
71 changes: 71 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.limits cimport numeric_limits
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_writer_options,
write_json as cpp_write_json,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata
from cudf._lib.pylibcudf.types cimport size_type


cpdef void write_json(
SinkInfo sink_info,
TableWithMetadata table_w_meta,
str na_rep = "",
bool include_nulls = False,
bool lines = False,
size_type rows_per_chunk = numeric_limits[size_type].max(),
str true_value = "true",
str false_value = "false"
):
"""
Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.

Parameters
----------
sink_info: SinkInfo
The SinkInfo object to write the JSON to.
table_w_meta: TableWithMetadata
The TableWithMetadata object containing the Table to write
na_rep: str, default ""
The string representation for null values.
include_nulls: bool, default False
Enables/Disables output of nulls as 'null'.
lines: bool, default False
If `True`, write output in the JSON lines format.
rows_per_chunk: size_type, defaults to length of the input table
The maximum number of rows to write at a time.
true_value: str, default "true"
The string representation for values != 0 in INT8 types.
false_value: str, default "false"
The string representation for values == 0 in INT8 types.
"""
cdef table_metadata tbl_meta = table_w_meta.metadata
cdef string na_rep_c = na_rep.encode()
cdef string true_value_c = true_value.encode()
cdef string false_value_c = false_value.encode()
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved

cdef json_writer_options options = move(
json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
.metadata(tbl_meta)
.na_rep(na_rep_c)
.include_nulls(include_nulls)
.lines(lines)
.build()
)

if rows_per_chunk != numeric_limits[size_type].max():
options.set_rows_per_chunk(rows_per_chunk)
if true_value != "true":
options.set_true_value(true_value_c)
if false_value != "false":
options.set_false_value(false_value_c)

with nogil:
cpp_write_json(options)
11 changes: 11 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_encoding,
column_in_metadata,
Expand All @@ -22,8 +26,15 @@ cdef class TableWithMetadata:
cdef public Table tbl
cdef table_metadata metadata

cdef vector[column_name_info] _make_column_info(self, list column_names)

@staticmethod
cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)

cdef class SourceInfo:
cdef source_info c_obj

cdef class SinkInfo:
# This vector just exists to keep the unique_ptrs to the sinks alive
cdef vector[unique_ptr[data_sink]] sink_storage
cdef sink_info c_obj
Loading
Loading