-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
[ENH] to_orc #43860
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ENH] to_orc #43860
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2734,7 +2734,81 @@ def to_parquet( | |
storage_options=storage_options, | ||
**kwargs, | ||
) | ||
|
||
def to_orc(
    self,
    path: FilePathOrBuffer = None,
    engine: str = "pyarrow",
    index: bool | None = None,
    **kwargs
) -> bytes | None:
    """
    Write a DataFrame to the ORC format using pyarrow.

    Parameters
    ----------
    path : str or file-like object, default None
        If a string, it will be used as the path the ORC file is
        written to. By file-like object, we refer to objects with a
        write() method, such as a file handle (e.g. via builtin open
        function) or io.BytesIO. If path is None, a bytes object is
        returned.
    engine : {'pyarrow'}, default 'pyarrow'
        ORC library to use, or the module itself; it is checked to be
        the 'pyarrow' module with version >= 4.0.0.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output.
        If ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    **kwargs
        Additional keyword arguments passed to the engine.

    Returns
    -------
    bytes if no path argument is provided else None

    See Also
    --------
    read_orc : Read a ORC file.
    DataFrame.to_parquet : Write a parquet file.
    DataFrame.to_csv : Write a csv file.
    DataFrame.to_sql : Write to a sql table.
    DataFrame.to_hdf : Write to hdf.

    Notes
    -----
    This function requires the `pyarrow
    <https://arrow.apache.org/docs/python/>`_ library.

    Examples
    --------
    >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
    >>> df.to_orc('df.orc')  # doctest: +SKIP
    >>> pd.read_orc('df.orc')  # doctest: +SKIP
       col1  col2
    0     1     3
    1     2     4

    If you want a buffer over the ORC content you can wrap the returned
    bytes in io.BytesIO:

    >>> import io
    >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
    >>> b.seek(0)  # doctest: +SKIP
    0
    >>> content = b.read()  # doctest: +SKIP
    """
    # Delegate to the io layer; imported locally to avoid a circular
    # import between pandas.core and pandas.io.
    from pandas.io.orc import to_orc

    return to_orc(self, path, engine, index=index, **kwargs)
|
||
@Substitution( | ||
header_type="bool", | ||
header="Whether to print column labels, default True", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,11 @@ | ||
""" orc compat """ | ||
from __future__ import annotations | ||
|
||
import os | ||
import pandas._testing as tm | ||
|
||
from typing import TYPE_CHECKING | ||
from tempfile import gettempdir | ||
|
||
from pandas._typing import FilePathOrBuffer | ||
from pandas.compat._optional import import_optional_dependency | ||
|
@@ -55,3 +59,68 @@ def read_orc( | |
with get_handle(path, "rb", is_text=False) as handles: | ||
orc_file = orc.ORCFile(handles.handle) | ||
return orc_file.read(columns=columns, **kwargs).to_pandas() | ||
|
||
|
||
def to_orc(
    df: DataFrame,
    path: FilePathOrBuffer = None,
    engine: str = "pyarrow",
    index: bool | None = None,
    **kwargs
) -> bytes | None:
    """
    Write a DataFrame to the ORC format using pyarrow.

    Parameters
    ----------
    df : DataFrame
        The data to write.
    path : str or file-like object, default None
        If a string, it will be used as the path the ORC file is
        written to. By file-like object, we refer to objects with a
        write() method, such as a file handle (e.g. via builtin open
        function) or io.BytesIO. If path is None, a bytes object is
        returned.
    engine : {'pyarrow'}, default 'pyarrow'
        ORC library to use, or the module itself; it is checked to be
        the 'pyarrow' module with version >= 4.0.0.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output.
        If ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    **kwargs
        Additional keyword arguments passed to the engine.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    ValueError
        If ``engine`` is neither the string 'pyarrow' nor the pyarrow
        module itself (with an ``orc`` submodule).
    """
    if index is None:
        # Only write the index when it carries a name; an unnamed
        # (e.g. default Range) index is dropped.
        index = df.index.names[0] is not None

    if isinstance(engine, str):
        engine = import_optional_dependency(engine, min_version="4.0.0")
    else:
        # The engine module was passed directly: validate it with explicit
        # checks rather than `assert`, which is stripped under `python -O`.
        if getattr(engine, "__name__", None) != "pyarrow":
            raise ValueError("Wrong engine passed, engine must be the 'pyarrow' module")
        if not hasattr(engine, "orc"):
            raise ValueError("Wrong engine passed, the 'pyarrow' module must have an orc submodule")

    table = engine.Table.from_pandas(df, preserve_index=index)

    if path is None:
        # NOTE(review): pyarrow reportedly closes the sink it writes to, so
        # instead of an in-memory buffer we write to a private temp file and
        # read the bytes back. Cleanup is guaranteed even if the write fails.
        tmp_path = os.path.join(gettempdir(), os.urandom(12).hex())
        try:
            engine.orc.write_table(table, tmp_path, **kwargs)
            with open(tmp_path, "rb") as handle:
                return handle.read()
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    engine.orc.write_table(table, path, **kwargs)
    return None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we reuse the docstring, as opposed to copy/pasting it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm there isn't an orc/arrow format. Maybe it should be "Write a DataFrame to the ORC format using PyArrow"?