diff --git a/doc/source/io.rst b/doc/source/io.rst
index c2c8c1c17700f..cb22bb9198e25 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4570,6 +4570,9 @@ dtypes, including extension dtypes such as datetime with tz.
 Several caveats.
 
 * Duplicate column names and non-string columns names are not supported.
+* The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default
+  indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
+  force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
 * Index level names, if specified, must be strings.
 * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
@@ -4633,6 +4636,41 @@ Read only certain columns of a parquet file.
     os.remove('example_pa.parquet')
     os.remove('example_fp.parquet')
 
+
+Handling Indexes
+''''''''''''''''
+
+Serializing a ``DataFrame`` to parquet may include the implicit index as one or
+more columns in the output file. Thus, this code:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+   df.to_parquet('test.parquet', engine='pyarrow')
+
+creates a parquet file with *three* columns if you use ``pyarrow`` for serialization:
+``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the
+index `may or may not `_
+be written to the file.
+
+This unexpected extra column causes some databases like Amazon Redshift to reject
+the file, because that column doesn't exist in the target table.
+
+If you want to omit a dataframe's indexes when writing, pass ``index=False`` to
+:func:`~pandas.DataFrame.to_parquet`:
+
+.. ipython:: python
+
+   df.to_parquet('test.parquet', index=False)
+
+This creates a parquet file with just the two expected columns, ``a`` and ``b``.
+If your ``DataFrame`` has a custom index, you won't get it back when you load
+this file into a ``DataFrame``.
+
+Passing ``index=True`` will *always* write the index, even if that's not the
+underlying engine's default behavior.
+
+
 .. _io.sql:
 
 SQL Queries
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 9e2c20c78f489..fc594a2e8de2f 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -17,6 +17,10 @@ New features
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling
   append to existing workbooks when using the ``openpyxl`` engine
   (:issue:`3441`)
+- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing
+  the user to override the engine's default behavior to include or omit the
+  dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
+
 .. _whatsnew_0240.enhancements.extension_array_operators:
 
 ``ExtensionArray`` operator support
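To see the documented behavior end to end, the physical column names in the written file can be inspected with pyarrow. The following is a sketch, not part of the patch; it assumes pyarrow is installed and uses placeholder file names:

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

    # Default behavior: pyarrow serializes the index as an extra column.
    df.to_parquet('with_index.parquet', engine='pyarrow')
    print(pq.read_table('with_index.parquet').schema.names)
    # ['a', 'b', '__index_level_0__']

    # With the new argument, the index is omitted regardless of engine.
    df.to_parquet('without_index.parquet', engine='pyarrow', index=False)
    print(pq.read_table('without_index.parquet').schema.names)
    # ['a', 'b']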
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index bb221ced9e6bd..770eca2210138 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1874,7 +1874,7 @@ def to_feather(self, fname):
         to_feather(self, fname)
 
     def to_parquet(self, fname, engine='auto', compression='snappy',
-                   **kwargs):
+                   index=None, **kwargs):
         """
         Write a DataFrame to the binary parquet format.
 
@@ -1896,6 +1896,13 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
             'pyarrow' is unavailable.
         compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
             Name of the compression to use. Use ``None`` for no compression.
+        index : bool, default None
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file. If ``None``,
+            the behavior depends on the chosen engine.
+
+            .. versionadded:: 0.24.0
+
         **kwargs
             Additional arguments passed to the parquet library. See
             :ref:`pandas io <io.parquet>` for more details.
@@ -1924,7 +1931,7 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         """
        from pandas.io.parquet import to_parquet
         to_parquet(self, fname, engine,
-                   compression=compression, **kwargs)
+                   compression=compression, index=index, **kwargs)
 
     @Substitution(header='Write out the column names. If a list of strings '
                   'is given, it is assumed to be aliases for the '
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index a99014f07a6b3..6ab56c68a510a 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -103,19 +103,27 @@ def __init__(self):
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
-              coerce_timestamps='ms', **kwargs):
+              coerce_timestamps='ms', index=None, **kwargs):
         self.validate_dataframe(df)
-        if self._pyarrow_lt_070:
+
+        # Only validate the index if we're writing it.
+        if self._pyarrow_lt_070 and index is not False:
             self._validate_write_lt_070(df)
         path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
 
+        if index is None:
+            from_pandas_kwargs = {}
+        else:
+            from_pandas_kwargs = {'preserve_index': index}
+
         if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
+            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
+                                               **from_pandas_kwargs)
             self.api.parquet.write_table(
                 table, path, compression=compression, **kwargs)
 
         else:
-            table = self.api.Table.from_pandas(df)
+            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
             self.api.parquet.write_table(
                 table, path, compression=compression,
                 coerce_timestamps=coerce_timestamps, **kwargs)
@@ -197,7 +205,7 @@ def __init__(self):
             )
         self.api = fastparquet
 
-    def write(self, df, path, compression='snappy', **kwargs):
+    def write(self, df, path, compression='snappy', index=None, **kwargs):
         self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
@@ -214,8 +222,8 @@ def write(self, df, path, compression='snappy', **kwargs):
             path, _, _, _ = get_filepath_or_buffer(path)
 
         with catch_warnings(record=True):
-            self.api.write(path, df,
-                           compression=compression, **kwargs)
+            self.api.write(path, df, compression=compression,
+                           write_index=index, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         if is_s3_url(path):
@@ -234,7 +242,8 @@ def read(self, path, columns=None, **kwargs):
     return parquet_file.to_pandas(columns=columns, **kwargs)
 
 
-def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
+def to_parquet(df, path, engine='auto', compression='snappy', index=None,
+               **kwargs):
     """
     Write a DataFrame to the parquet format.
 
@@ -250,11 +259,19 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
         'pyarrow' is unavailable.
     compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
         Name of the compression to use. Use ``None`` for no compression.
+    index : bool, default None
+        If ``True``, include the dataframe's index(es) in the file output. If
+        ``False``, they will not be written to the file. If ``None``, the
+        engine's default behavior will be used.
+
+        .. versionadded:: 0.24.0
+
     kwargs
         Additional keyword arguments passed to the engine
     """
     impl = get_engine(engine)
-    return impl.write(df, path, compression=compression, **kwargs)
+    return impl.write(df, path, compression=compression, index=index,
+                      **kwargs)
 
 
 def read_parquet(path, engine='auto', columns=None, **kwargs):
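For reference, the two ``write`` implementations above simply forward ``index`` to each engine's native option. Written directly against the libraries, the equivalent calls look roughly like this (a sketch assuming both pyarrow and fastparquet are installed; file names are placeholders):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq
    import fastparquet

    df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])

    # pyarrow: index=False becomes preserve_index=False on Table.from_pandas.
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, 'pa_no_index.parquet')

    # fastparquet: index is passed through as write_index.
    fastparquet.write('fp_no_index.parquet', df, write_index=False)

With ``index=None``, neither keyword is passed, so each engine falls back to its own default: pyarrow preserves the index, while fastparquet writes only non-default indexes.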
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index fefbe8afb59cb..ab7f04ad86ffc 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -368,6 +368,40 @@ def test_multiindex_with_columns(self, pa_ge_070):
         check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']},
                          expected=df[['A', 'B']])
 
+    def test_write_ignoring_index(self, engine):
+        # ENH 20768
+        # Ensure index=False omits the index from the written Parquet file.
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']})
+
+        write_kwargs = {
+            'compression': None,
+            'index': False,
+        }
+
+        # Because we're dropping the index, we expect the loaded dataframe to
+        # have the default integer index.
+        expected = df.reset_index(drop=True)
+
+        check_round_trip(df, engine, write_kwargs=write_kwargs,
+                         expected=expected)
+
+        # Ignore custom index
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
+                          index=['zyx', 'wvu', 'tsr'])
+
+        check_round_trip(df, engine, write_kwargs=write_kwargs,
+                         expected=expected)
+
+        # Ignore multi-indexes as well.
+        arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+        df = pd.DataFrame({'one': list(range(8)),
+                           'two': [-i for i in range(8)]}, index=arrays)
+
+        expected = df.reset_index(drop=True)
+        check_round_trip(df, engine, write_kwargs=write_kwargs,
+                         expected=expected)
+
 
 class TestParquetPyArrow(Base):
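In user-facing terms, the round trip these tests assert can be sketched as follows ('no_index.parquet' is a placeholder path): a custom index written with ``index=False`` is dropped, and reading the file back yields a default ``RangeIndex``.

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
                      index=['zyx', 'wvu', 'tsr'])
    df.to_parquet('no_index.parquet', index=False)

    result = pd.read_parquet('no_index.parquet')
    print(result.index)  # RangeIndex(start=0, stop=3, step=1)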