[ENH] to_orc #43860

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed · wants to merge 3 commits
74 changes: 74 additions & 0 deletions pandas/core/frame.py
@@ -2734,7 +2734,81 @@ def to_parquet(
            storage_options=storage_options,
            **kwargs,
        )

    def to_orc(
        self,
        path: FilePathOrBuffer = None,
        engine: str = 'pyarrow',
        index: bool = None,
        **kwargs,
    ) -> bytes:
        """
        Write a DataFrame to the orc/arrow format.

        Parameters
        ----------
        df : DataFrame
Member: Can we reuse the docstring, as opposed to copy/pasting it?

Contributor: Hmm, there isn't an orc/arrow format. Maybe it should be "Write a DataFrame to the ORC format using PyArrow"?

        path : str or file-like object, default None
            If a string, it will be used as the root directory path
            when writing a partitioned dataset. By file-like object,
            we refer to objects with a write() method, such as a file
            handle (e.g. via the builtin open function) or io.BytesIO.
            If path is None, a bytes object is returned.
        engine : {'pyarrow'}, default 'pyarrow'
            ORC library to use, or the library itself; checked against
            the 'pyarrow' name and version >= 4.0.0.
Contributor: Is it > 4.0.0, meaning >= 5.0? That would be more informative.

        index : bool, default None
            If ``True``, include the dataframe's index(es) in the file
            output. If ``False``, they will not be written to the file.
            If ``None``, similar to ``infer``, the dataframe's index(es)
            will be saved. However, instead of being saved as values, the
            RangeIndex will be stored as a range in the metadata so it
            doesn't require much space and is faster. Other indexes will
            be included as columns in the file output.
        **kwargs
            Additional keyword arguments passed to the engine.

        Returns
        -------
        bytes if no path argument is provided else None

        See Also
        --------
        read_orc : Read an ORC file.
        DataFrame.to_parquet : Write a parquet file.
        DataFrame.to_csv : Write a csv file.
        DataFrame.to_sql : Write to a sql table.
        DataFrame.to_hdf : Write to hdf.

        Notes
        -----
        This function requires the `pyarrow
        <https://arrow.apache.org/docs/python/>`_ library.

        Examples
        --------
        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
        >>> df.to_orc('df.orc', compression='gzip')  # doctest: +SKIP
        >>> pd.read_orc('df.orc')  # doctest: +SKIP
           col1  col2
        0     1     3
        1     2     4

        If you want a buffer with the ORC content, you can write it to
        io.BytesIO:

        >>> import io
        >>> b = io.BytesIO(df.to_orc())
        >>> b.seek(0)
        0
        >>> content = b.read()
        """
        from pandas.io.orc import to_orc

        return to_orc(
            self,
            path,
            engine,
            index=index,
            **kwargs,
        )

    @Substitution(
        header_type="bool",
        header="Whether to print column labels, default True",
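The ``index`` semantics documented above map onto the ``preserve_index`` argument of ``pyarrow.Table.from_pandas``, which the implementation in pandas/io/orc.py below passes through. A minimal sketch of the difference, assuming pyarrow is installed; the variable names are illustrative, not part of this PR:

import pandas as pd
import pyarrow as pa

# A frame with the default, unnamed RangeIndex.
df = pd.DataFrame({"a": [1, 2]})

# preserve_index=None keeps a RangeIndex as table metadata only, so no
# extra column; preserve_index=True materializes the index as a column.
print(pa.Table.from_pandas(df, preserve_index=None).column_names)
# ['a']
print(pa.Table.from_pandas(df, preserve_index=True).column_names)
# ['a', '__index_level_0__']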
69 changes: 69 additions & 0 deletions pandas/io/orc.py
@@ -1,7 +1,11 @@
""" orc compat """
from __future__ import annotations

import os
import pandas._testing as tm

from typing import TYPE_CHECKING
from tempfile import gettempdir

from pandas._typing import FilePathOrBuffer
from pandas.compat._optional import import_optional_dependency
@@ -55,3 +59,68 @@ def read_orc(
    with get_handle(path, "rb", is_text=False) as handles:
        orc_file = orc.ORCFile(handles.handle)
        return orc_file.read(columns=columns, **kwargs).to_pandas()


def to_orc(
    df: DataFrame,
    path: FilePathOrBuffer = None,
    engine: str = 'pyarrow',
    index: bool = None,
    **kwargs,
) -> bytes:
"""
Write a DataFrame to the orc/arrow format.
Parameters
----------
df : DataFrame
path : str or file-like object, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function) or io.BytesIO. The engine
fastparquet does not accept file-like objects. If path is None,
a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
Parquet library to use, or library it self, checked with 'pyarrow' name
and version > 4.0.0
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
If ``None``, similar to ``infer`` the dataframe's index(es)
will be saved. However, instead of being saved as values,
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
kwargs
Additional keyword arguments passed to the engine
Returns
-------
bytes if no path argument is provided else None
"""
    if index is None:
        index = df.index.names[0] is not None

    if isinstance(engine, str):
        engine = import_optional_dependency(engine, min_version='4.0.0')
    else:
        try:
            assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
            assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module"
        except Exception as e:
Member: Can be more specific about the exception type.

Contributor: Fixed.

            raise ValueError("Wrong engine passed, %s" % e)
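An aside on the thread above: ``assert`` statements are stripped when Python runs with ``-O``, so validation that raises directly is more robust than converting a caught AssertionError. A minimal sketch of what a more specific check could look like; the helper name is hypothetical and this is not the PR's code:

def _validate_orc_engine(engine):
    # Raise ValueError directly rather than catching a broad Exception.
    if getattr(engine, "__name__", None) != "pyarrow" or not hasattr(engine, "orc"):
        raise ValueError("engine must be the 'pyarrow' module with an 'orc' submodule")
    return engine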

    if path is None:
        # no path given: write to a random temp file, then read it back as
        # bytes (pyarrow's ORC writer closes the buffers it is handed)
        with tm.ensure_clean(os.path.join(gettempdir(), os.urandom(12).hex())) as path:
Member: Why is this getting written to a file? I thought path = None would just return a byte string?

Contributor: Yes, I do close the file-like object from my side by default in Arrow. It does seem to be different from the behavior of the Parquet writer in Arrow. If this is indeed an issue I can discuss with the Arrow community whether we should change it.

Right now I use a PyArrow buffer and avoid creating a temp file.

            engine.orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                path, **kwargs
            )
            # read the temp file back before ensure_clean removes it
            with open(path, 'rb') as handle:
                return handle.read()
    else:
        engine.orc.write_table(
            engine.Table.from_pandas(df, preserve_index=index),
            path, **kwargs
        )
        return
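Following up on the temp-file thread above: with a recent pyarrow (7.0.0 or later, where ``pyarrow.orc.write_table(table, where)`` accepts an in-memory sink), the bytes case can avoid disk entirely. A minimal sketch of the buffer-based variant the contributor describes; the function name is hypothetical and this is not the code in this PR:

import pandas as pd
import pyarrow as pa
from pyarrow import orc

def dataframe_to_orc_bytes(df: pd.DataFrame, preserve_index: bool = False) -> bytes:
    # Write into an in-memory PyArrow stream instead of a temp file on disk.
    sink = pa.BufferOutputStream()
    orc.write_table(pa.Table.from_pandas(df, preserve_index=preserve_index), sink)
    # getvalue() finalizes the stream and returns the accumulated buffer;
    # to_pybytes() copies it out as a plain Python bytes object.
    return sink.getvalue().to_pybytes()

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
data = dataframe_to_orc_bytes(df)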