From 847598b5ca935d56f9f86d18b0bca446217c72f1 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Thu, 9 Aug 2018 12:30:19 -0700 Subject: [PATCH 01/17] Add support for excluding the index from Parquet files --- pandas/core/frame.py | 7 +++++-- pandas/io/parquet.py | 19 ++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb221ced9e6bd..05612e3705552 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1874,7 +1874,7 @@ def to_feather(self, fname): to_feather(self, fname) def to_parquet(self, fname, engine='auto', compression='snappy', - **kwargs): + index=True, **kwargs): """ Write a DataFrame to the binary parquet format. @@ -1896,6 +1896,9 @@ def to_parquet(self, fname, engine='auto', compression='snappy', 'pyarrow' is unavailable. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. + index : bool, default True + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. @@ -1924,7 +1927,7 @@ def to_parquet(self, fname, engine='auto', compression='snappy', """ from pandas.io.parquet import to_parquet to_parquet(self, fname, engine, - compression=compression, **kwargs) + compression=compression, index=index, **kwargs) @Substitution(header='Write out the column names. 
If a list of strings ' 'is given, it is assumed to be aliases for the ' diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a99014f07a6b3..1b4e50545e67c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -103,14 +103,15 @@ def __init__(self): self.api = pyarrow def write(self, df, path, compression='snappy', - coerce_timestamps='ms', **kwargs): + coerce_timestamps='ms', index=True, **kwargs): self.validate_dataframe(df) if self._pyarrow_lt_070: self._validate_write_lt_070(df) path, _, _, _ = get_filepath_or_buffer(path, mode='wb') if self._pyarrow_lt_060: - table = self.api.Table.from_pandas(df, timestamps_to_ms=True) + table = self.api.Table.from_pandas(df, timestamps_to_ms=True, + preserve_index=index) self.api.parquet.write_table( table, path, compression=compression, **kwargs) @@ -197,7 +198,7 @@ def __init__(self): ) self.api = fastparquet - def write(self, df, path, compression='snappy', **kwargs): + def write(self, df, path, compression='snappy', index=True, **kwargs): self.validate_dataframe(df) # thriftpy/protocol/compact.py:339: # DeprecationWarning: tostring() is deprecated. @@ -214,8 +215,8 @@ def write(self, df, path, compression='snappy', **kwargs): path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): - self.api.write(path, df, - compression=compression, **kwargs) + self.api.write(path, df, compression=compression, + write_index=index, **kwargs) def read(self, path, columns=None, **kwargs): if is_s3_url(path): @@ -234,7 +235,8 @@ def read(self, path, columns=None, **kwargs): return parquet_file.to_pandas(columns=columns, **kwargs) -def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): +def to_parquet(df, path, engine='auto', compression='snappy', index=True, + **kwargs): """ Write a DataFrame to the parquet format. @@ -250,11 +252,14 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): 'pyarrow' is unavailable. 
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. + index : bool, default True + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. kwargs Additional keyword arguments passed to the engine """ impl = get_engine(engine) - return impl.write(df, path, compression=compression, **kwargs) + return impl.write(df, path, compression=compression, index=index, **kwargs) def read_parquet(path, engine='auto', columns=None, **kwargs): From cb01127479e82451d58afba6bb02fe7ce888141f Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Thu, 9 Aug 2018 14:33:44 -0700 Subject: [PATCH 02/17] Update whatsnew --- doc/source/whatsnew/v0.24.0.txt | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9e2c20c78f489..5aabd8b949c96 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,6 +17,8 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- ``DataFrame.to_parquet()`` now accepts ``index`` as a keyword argument, allowing the user to optionally omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) + .. _whatsnew_0240.enhancements.extension_array_operators: ``ExtensionArray`` operator support @@ -159,6 +161,45 @@ This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0240.api_breaking.interval_values` for more. +Omitting Indexes in ``to_parquet()`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You now have the option to omit a dataframe's indexes when writing to Parquet +files with ``to_parquet``, just like ``to_csv`` (:issue:`20768`). 
+ +*Previous Behavior*: + +Dumping a ``DataFrame`` to Parquet would include the implicit index as a column +in the output file. Thus, this code: + +.. ipython:: python + + import pandas + + df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) + df.to_parquet('test.parquet') + +would create a Parquet file with *three* columns: ``a``, ``b``, and ``__index_level_0__``. + +This unexpected extra column causes some databases like Amazon Redshift to reject +the file, because that column doesn't exist in the target table. + +*New Behavior*: + +If you want to omit a dataframe's indexes when writing, pass ``index=False`` to +``to_parquet()``: + +.. ipython:: python + + import pandas + + df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) + df.to_parquet('test.parquet', index=False) + +This creates a Parquet file with just the two expected columns, ``a`` and ``b``. +The preexisting behavior is still the default. + + .. _whatsnew_0240.enhancements.other: Other Enhancements From 3bec3c294b719e0fb4dd672618c2ccba13b09672 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Sat, 11 Aug 2018 00:12:21 -0700 Subject: [PATCH 03/17] Test index omission? 
--- pandas/tests/io/test_parquet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fefbe8afb59cb..c9249eef6df90 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -368,6 +368,12 @@ def test_multiindex_with_columns(self, pa_ge_070): check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']}, expected=df[['A', 'B']]) + def test_write_ignoring_index(self): + """Ensure index=False omits the index from the written Parquet file.""" + df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) + check_round_trip(df, write_kwargs={'index': False}, + check_names=['a', 'b']) + class TestParquetPyArrow(Base): From 377cda5e3cd464c2b748441863a91847ede66185 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Sat, 11 Aug 2018 18:47:14 -0700 Subject: [PATCH 04/17] PR feedback --- pandas/tests/io/test_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index c9249eef6df90..dcd06304d05df 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -369,7 +369,8 @@ def test_multiindex_with_columns(self, pa_ge_070): expected=df[['A', 'B']]) def test_write_ignoring_index(self): - """Ensure index=False omits the index from the written Parquet file.""" + # ENH 20768 + # Ensure index=False omits the index from the written Parquet file. df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) check_round_trip(df, write_kwargs={'index': False}, check_names=['a', 'b']) From ec58c1aae676bc6e03e093efc073821551c8f595 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Sat, 11 Aug 2018 19:12:33 -0700 Subject: [PATCH 05/17] Add tests for custom indexes and a multiindex. 
--- pandas/tests/io/test_parquet.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index dcd06304d05df..86f0b328182f0 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -375,6 +375,26 @@ def test_write_ignoring_index(self): check_round_trip(df, write_kwargs={'index': False}, check_names=['a', 'b']) + def test_write_ignoring_custom_index(self): + # ENH 20768 + # Ensure index=False omits the index from the written Parquet file, + # even if we're using a custom one.s + df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, + index=['zyx', 'wvu', 'tsr']) + check_round_trip(df, write_kwargs={'index': False}, + check_names=['a', 'b']) + + def test_write_ignoring_multiindex(self): + # ENH 20768 + # Ensure index=False omits multiindexes as well. + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame({'one': [i for i in range(8)], + 'two': [-i for i in range(8)]}, index=arrays) + + check_round_trip(df, write_kwargs={'index': False}, + check_names=['one', 'two']) + class TestParquetPyArrow(Base): From 46209e5308ffd1857bc79de0f0669c57f2a7ca57 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Sat, 11 Aug 2018 19:18:12 -0700 Subject: [PATCH 06/17] Forgot to put preserve_index=index in one place --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1b4e50545e67c..8bacf879603d0 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -116,7 +116,7 @@ def write(self, df, path, compression='snappy', table, path, compression=compression, **kwargs) else: - table = self.api.Table.from_pandas(df) + table = self.api.Table.from_pandas(df, preserve_index=index) self.api.parquet.write_table( table, path, compression=compression, coerce_timestamps=coerce_timestamps, **kwargs) 
From 45b864deb3fd84acf86ed0802cba0292f9f53e81 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Sun, 12 Aug 2018 19:44:17 -0700 Subject: [PATCH 07/17] Use `engine` fixture to test both implementations. --- pandas/tests/io/test_parquet.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 86f0b328182f0..683e0d852b373 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -368,23 +368,23 @@ def test_multiindex_with_columns(self, pa_ge_070): check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']}, expected=df[['A', 'B']]) - def test_write_ignoring_index(self): + def test_write_ignoring_index(self, engine): # ENH 20768 # Ensure index=False omits the index from the written Parquet file. df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) - check_round_trip(df, write_kwargs={'index': False}, + check_round_trip(df, engine, write_kwargs={'index': False}, check_names=['a', 'b']) - def test_write_ignoring_custom_index(self): + def test_write_ignoring_custom_index(self, engine): # ENH 20768 # Ensure index=False omits the index from the written Parquet file, # even if we're using a custom one.s df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, index=['zyx', 'wvu', 'tsr']) - check_round_trip(df, write_kwargs={'index': False}, + check_round_trip(df, engine, write_kwargs={'index': False}, check_names=['a', 'b']) - def test_write_ignoring_multiindex(self): + def test_write_ignoring_multiindex(self, engine): # ENH 20768 # Ensure index=False omits multiindexes as well. 
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], @@ -392,7 +392,7 @@ def test_write_ignoring_multiindex(self): df = pd.DataFrame({'one': [i for i in range(8)], 'two': [-i for i in range(8)]}, index=arrays) - check_round_trip(df, write_kwargs={'index': False}, + check_round_trip(df, engine, write_kwargs={'index': False}, check_names=['one', 'two']) From 5768b53b727c5f4690f17a18ed83b8bc921509e4 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Mon, 13 Aug 2018 20:35:24 -0700 Subject: [PATCH 08/17] Fix tests: Remove indexes in expected value. --- pandas/tests/io/test_parquet.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 683e0d852b373..b2d2ff7cb382c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -372,8 +372,13 @@ def test_write_ignoring_index(self, engine): # ENH 20768 # Ensure index=False omits the index from the written Parquet file. df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) + + # Because we're dropping the index, we expect the loaded dataframe to + # have the default integer index. 
+ expected = df.reset_index(drop=True) + check_round_trip(df, engine, write_kwargs={'index': False}, - check_names=['a', 'b']) + expected=expected) def test_write_ignoring_custom_index(self, engine): # ENH 20768 @@ -381,8 +386,10 @@ def test_write_ignoring_custom_index(self, engine): # even if we're using a custom one.s df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, index=['zyx', 'wvu', 'tsr']) + + expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs={'index': False}, - check_names=['a', 'b']) + expected=expected) def test_write_ignoring_multiindex(self, engine): # ENH 20768 @@ -392,8 +399,9 @@ def test_write_ignoring_multiindex(self, engine): df = pd.DataFrame({'one': [i for i in range(8)], 'two': [-i for i in range(8)]}, index=arrays) + expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs={'index': False}, - check_names=['one', 'two']) + expected=expected) class TestParquetPyArrow(Base): From f8bcf60013cc140d6fb05e32384f24572783c8d6 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Tue, 14 Aug 2018 12:07:39 -0700 Subject: [PATCH 09/17] Move explanation of new argument to io.rst --- doc/source/io.rst | 36 +++++++++++++++++++++++++++++ doc/source/whatsnew/v0.24.0.txt | 41 +-------------------------------- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index c2c8c1c17700f..371ca4a2fa80e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4570,6 +4570,8 @@ dtypes, including extension dtypes such as datetime with tz. Several caveats. * Duplicate column names and non-string columns names are not supported. +* The index is included in the output by default, which can cause problems with non-Pandas consumers that are + not expecting that extra column. You can, however, omit indexes. (See below) * Index level names, if specified, must be strings. * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. 
* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message @@ -4633,6 +4635,40 @@ Read only certain columns of a parquet file. os.remove('example_pa.parquet') os.remove('example_fp.parquet') + +Omitting Indexes +'''''''''''''''' + +Dumping a ``DataFrame`` to parquet includes the implicit index as one or more +columns in the output file. Thus, this code: + +.. ipython:: python + + import pandas + + df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) + df.to_parquet('test.parquet') + +creates a parquet file with *three* columns: ``a``, ``b``, and ``__index_level_0__``. + +This unexpected extra column causes some databases like Amazon Redshift to reject +the file, because that column doesn't exist in the target table. + +If you want to omit a dataframe's indexes when writing, pass ``index=False`` to +:func:`~pandas.DataFrame.to_parquet`: + +.. ipython:: python + + import pandas + + df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) + df.to_parquet('test.parquet', index=False) + +This creates a parquet file with just the two expected columns, ``a`` and ``b``. +If your ``DataFrame`` has a custom index, you won't get it back when you load +this file into a ``DataFrame``. + + .. _io.sql: SQL Queries diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5aabd8b949c96..34d03580d4ef4 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,7 +17,7 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) -- ``DataFrame.to_parquet()`` now accepts ``index`` as a keyword argument, allowing the user to optionally omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :func:`DataFrame.to_parquet` now accepts ``index`` as a keyword argument, allowing the user to optionally omit the dataframe's indexes from the resulting Parquet file. 
(:issue:`20768`) .. _whatsnew_0240.enhancements.extension_array_operators: @@ -161,45 +161,6 @@ This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0240.api_breaking.interval_values` for more. -Omitting Indexes in ``to_parquet()`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You now have the option to omit a dataframe's indexes when writing to Parquet -files with ``to_parquet``, just like ``to_csv`` (:issue:`20768`). - -*Previous Behavior*: - -Dumping a ``DataFrame`` to Parquet would include the implicit index as a column -in the output file. Thus, this code: - -.. ipython:: python - - import pandas - - df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) - df.to_parquet('test.parquet') - -would create a Parquet file with *three* columns: ``a``, ``b``, and ``__index_level_0__``. - -This unexpected extra column causes some databases like Amazon Redshift to reject -the file, because that column doesn't exist in the target table. - -*New Behavior*: - -If you want to omit a dataframe's indexes when writing, pass ``index=False`` to -``to_parquet()``: - -.. ipython:: python - - import pandas - - df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) - df.to_parquet('test.parquet', index=False) - -This creates a Parquet file with just the two expected columns, ``a`` and ``b``. -The preexisting behavior is still the default. - - .. _whatsnew_0240.enhancements.other: Other Enhancements From e629ae80fd1da5b9e1437977161e19e47cc7f634 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Tue, 14 Aug 2018 12:16:21 -0700 Subject: [PATCH 10/17] Don't validate the index if we're not writing it. 
--- pandas/io/parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 8bacf879603d0..c80e31a4e7d75 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -105,7 +105,9 @@ def __init__(self): def write(self, df, path, compression='snappy', coerce_timestamps='ms', index=True, **kwargs): self.validate_dataframe(df) - if self._pyarrow_lt_070: + + # Only validate the index if we're writing it. + if self._pyarrow_lt_070 and index: self._validate_write_lt_070(df) path, _, _, _ = get_filepath_or_buffer(path, mode='wb') From f3ddae0111c9b504fa15787193b03072c68b098a Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Tue, 14 Aug 2018 12:16:36 -0700 Subject: [PATCH 11/17] Test bugfixes and PR feedback. --- pandas/tests/io/test_parquet.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b2d2ff7cb382c..9df4c0c3f8e62 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -373,34 +373,34 @@ def test_write_ignoring_index(self, engine): # Ensure index=False omits the index from the written Parquet file. df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) + write_kwargs = { + 'compression': None, + 'index': False, + } + # Because we're dropping the index, we expect the loaded dataframe to # have the default integer index. 
expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs={'index': False}, + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) - def test_write_ignoring_custom_index(self, engine): - # ENH 20768 - # Ensure index=False omits the index from the written Parquet file, - # even if we're using a custom one.s + # Ignore custom index df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, index=['zyx', 'wvu', 'tsr']) expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs={'index': False}, + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) - def test_write_ignoring_multiindex(self, engine): - # ENH 20768 - # Ensure index=False omits multiindexes as well. + # Ignore multi-indexes as well. arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] df = pd.DataFrame({'one': [i for i in range(8)], 'two': [-i for i in range(8)]}, index=arrays) expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs={'index': False}, + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) From d26fea87d6b5b8b93d7e6fbc46ed948d297385fc Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Tue, 14 Aug 2018 14:02:48 -0700 Subject: [PATCH 12/17] Allow using engine's default behavior. --- pandas/core/frame.py | 7 ++++--- pandas/io/parquet.py | 22 ++++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 05612e3705552..997b3186c853a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1874,7 +1874,7 @@ def to_feather(self, fname): to_feather(self, fname) def to_parquet(self, fname, engine='auto', compression='snappy', - index=True, **kwargs): + index=None, **kwargs): """ Write a DataFrame to the binary parquet format. 
@@ -1896,9 +1896,10 @@ def to_parquet(self, fname, engine='auto', compression='snappy', 'pyarrow' is unavailable. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. - index : bool, default True + index : bool, default None If ``True``, include the dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. + If ``False``, they will not be written to the file. If ``None``, the + behavior depends on the chosen engine. **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index c80e31a4e7d75..07f1e212d40c8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -103,22 +103,27 @@ def __init__(self): self.api = pyarrow def write(self, df, path, compression='snappy', - coerce_timestamps='ms', index=True, **kwargs): + coerce_timestamps='ms', index=None, **kwargs): self.validate_dataframe(df) # Only validate the index if we're writing it. 
- if self._pyarrow_lt_070 and index: + if self._pyarrow_lt_070 and index is not False: self._validate_write_lt_070(df) path, _, _, _ = get_filepath_or_buffer(path, mode='wb') + if index is None: + from_pandas_kwargs = {} + else: + from_pandas_kwargs = {'preserve_index': index} + if self._pyarrow_lt_060: table = self.api.Table.from_pandas(df, timestamps_to_ms=True, - preserve_index=index) + **from_pandas_kwargs) self.api.parquet.write_table( table, path, compression=compression, **kwargs) else: - table = self.api.Table.from_pandas(df, preserve_index=index) + table = self.api.Table.from_pandas(df, **from_pandas_kwargs) self.api.parquet.write_table( table, path, compression=compression, coerce_timestamps=coerce_timestamps, **kwargs) @@ -200,7 +205,7 @@ def __init__(self): ) self.api = fastparquet - def write(self, df, path, compression='snappy', index=True, **kwargs): + def write(self, df, path, compression='snappy', index=None, **kwargs): self.validate_dataframe(df) # thriftpy/protocol/compact.py:339: # DeprecationWarning: tostring() is deprecated. @@ -237,7 +242,7 @@ def read(self, path, columns=None, **kwargs): return parquet_file.to_pandas(columns=columns, **kwargs) -def to_parquet(df, path, engine='auto', compression='snappy', index=True, +def to_parquet(df, path, engine='auto', compression='snappy', index=None, **kwargs): """ Write a DataFrame to the parquet format. @@ -254,9 +259,10 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=True, 'pyarrow' is unavailable. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. - index : bool, default True + index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If - ``False``, they will not be written to the file. + ``False``, they will not be written to the file. If ``None``, the + engine's default behavior will be used. 
kwargs Additional keyword arguments passed to the engine """ From e54e5f17812347530335e2ab0108196818762cb7 Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Wed, 15 Aug 2018 19:45:08 -0700 Subject: [PATCH 13/17] Document behavior change. --- doc/source/io.rst | 14 ++++++++++---- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 371ca4a2fa80e..5b9383175aeff 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4636,10 +4636,10 @@ Read only certain columns of a parquet file. os.remove('example_fp.parquet') -Omitting Indexes +Handling Indexes '''''''''''''''' -Dumping a ``DataFrame`` to parquet includes the implicit index as one or more +Dumping a ``DataFrame`` to parquet may include the implicit index as one or more columns in the output file. Thus, this code: .. ipython:: python @@ -4647,9 +4647,12 @@ columns in the output file. Thus, this code: import pandas df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) - df.to_parquet('test.parquet') + df.to_parquet('test.parquet', engine='pyarrow') -creates a parquet file with *three* columns: ``a``, ``b``, and ``__index_level_0__``. +creates a parquet file with *three* columns if you use ``pyarrow`` for serialization: +``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the +index `may or may not `_ +be written to the file. This unexpected extra column causes some databases like Amazon Redshift to reject the file, because that column doesn't exist in the target table. @@ -4668,6 +4671,9 @@ This creates a parquet file with just the two expected columns, ``a`` and ``b``. If your ``DataFrame`` has a custom index, you won't get it back when you load this file into a ``DataFrame``. +Passing ``index=True`` will *always* write the index, even if that's not the +underlying engine's default behavior. + .. 
_io.sql: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 34d03580d4ef4..444d032ab23f3 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,7 +17,7 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) -- :func:`DataFrame.to_parquet` now accepts ``index`` as a keyword argument, allowing the user to optionally omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing the user to override the engine's default behavior and include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 997b3186c853a..02ad02836f999 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1898,8 +1898,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy', Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, the - behavior depends on the chosen engine. + If ``False``, they will not be written to the file. If ``None``, + the behavior depends on the chosen engine. **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. From 46a4324e09dee963765237c503ebdeab61cec50a Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Thu, 16 Aug 2018 10:08:47 -0700 Subject: [PATCH 14/17] Code cleanup, PR feedback. 
--- doc/source/io.rst | 7 +------ pandas/tests/io/test_parquet.py | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 5b9383175aeff..3955209fca9d0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4644,9 +4644,7 @@ columns in the output file. Thus, this code: .. ipython:: python - import pandas - - df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) df.to_parquet('test.parquet', engine='pyarrow') creates a parquet file with *three* columns if you use ``pyarrow`` for serialization: @@ -4662,9 +4660,6 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to .. ipython:: python - import pandas - - df = pandas.DataFrame({'a': [1, 2], 'b': [3, 4]}) df.to_parquet('test.parquet', index=False) This creates a parquet file with just the two expected columns, ``a`` and ``b``. diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9df4c0c3f8e62..ab7f04ad86ffc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -389,7 +389,6 @@ def test_write_ignoring_index(self, engine): df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, index=['zyx', 'wvu', 'tsr']) - expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) From 90361b6cd51f5e7ce4ffccfcb8138c473e2ad5be Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Mon, 20 Aug 2018 15:59:07 -0700 Subject: [PATCH 15/17] PR feedback for documentation --- doc/source/io.rst | 5 +++-- pandas/core/frame.py | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 3955209fca9d0..4098906e0e173 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4570,8 +4570,9 @@ dtypes, including extension dtypes such as datetime with tz. Several caveats. * Duplicate column names and non-string columns names are not supported. 
-* The index is included in the output by default, which can cause problems with non-Pandas consumers that are - not expecting that extra column. You can, however, omit indexes. (See below) +* The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default + indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can + force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. * Index level names, if specified, must be strings. * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02ad02836f999..770eca2210138 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1900,6 +1900,9 @@ def to_parquet(self, fname, engine='auto', compression='snappy', If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, the behavior depends on the chosen engine. + + .. versionadded:: 0.24.0 + **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. 
From 759da77d1244fbfe095303ed5c2252d332917290 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Sep 2018 14:23:36 +0200 Subject: [PATCH 16/17] add versionadded --- doc/source/whatsnew/v0.24.0.txt | 4 +++- pandas/io/parquet.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 444d032ab23f3..fc594a2e8de2f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,7 +17,9 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing the user to override the engine's default behavior and include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing +the user to override the engine's default behavior to include or omit the +dataframe's indexes from the resulting Parquet file. (:issue:`20768`) .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 07f1e212d40c8..6ab56c68a510a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -263,6 +263,8 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None, If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, the engine's default behavior will be used. + + .. 
versionadded:: 0.24.0 kwargs Additional keyword arguments passed to the engine """ From 7dc53a19320162218671654d7a406f5777730ddc Mon Sep 17 00:00:00 2001 From: Diego Argueta Date: Wed, 19 Sep 2018 16:08:20 -0700 Subject: [PATCH 17/17] PR feedback about rephrasing --- doc/source/io.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4098906e0e173..cb22bb9198e25 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4640,8 +4640,8 @@ Read only certain columns of a parquet file. Handling Indexes '''''''''''''''' -Dumping a ``DataFrame`` to parquet may include the implicit index as one or more -columns in the output file. Thus, this code: +Serializing a ``DataFrame`` to parquet may include the implicit index as one or +more columns in the output file. Thus, this code: .. ipython:: python