From 56e5db27a39a78cf35b601924c7399e17057ce80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 5 Jul 2020 16:02:17 -0400 Subject: [PATCH 1/3] io/common: no encoding when opening file in binary mode --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- pandas/io/common.py | 13 +++++++++---- pandas/tests/io/test_common.py | 11 +++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f173cb2fce12..3f7e260314f70 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -121,7 +121,7 @@ MultiIndex I/O ^^^ -- +- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - Plotting @@ -167,4 +167,4 @@ Other .. _whatsnew_120.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ diff --git a/pandas/io/common.py b/pandas/io/common.py index f39b8279fbdb0..9cda4177d9f04 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -407,8 +407,9 @@ def get_handle( memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True - whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.). + Whether the type of the content passed to the file/buffer is string or + bytes. This is not the same as `"b" not in mode`. If a string content is + passed to a binary file/buffer, a wrapper is inserted. errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list @@ -489,10 +490,14 @@ def get_handle( handles.append(f) elif is_path: - if encoding: + # Check whether the filename is to be opened in binary mode. + # Binary mode does not support 'encoding' and 'newline'. 
+ is_binary_mode = "b" in mode + + if encoding and not is_binary_mode: # Encoding f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") - elif is_text: + elif is_text and not is_binary_mode: # No explicit encoding f = open(path_or_buf, mode, errors="replace", newline="") else: diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index dde38eb55ea7f..5ce2233bc0cd0 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -378,6 +378,17 @@ def test_unknown_engine(self): with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + def test_binary_mode(self): + """ + 'encoding' shouldn't be passed to 'open' in binary mode. + + GH 35058 + """ + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + df.to_csv(path, mode="w+b") + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + def test_is_fsspec_url(): assert icom.is_fsspec_url("gcs://pandas/somethingelse.com") From 11c3fc480ec802b12ebcd15e6013ff08f140edc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 6 Jul 2020 01:18:20 -0400 Subject: [PATCH 2/3] to_csv: support binary file handles and their encoding --- doc/source/user_guide/io.rst | 15 ++++++ doc/source/whatsnew/v1.2.0.rst | 17 ++++++ pandas/core/generic.py | 12 +++-- pandas/io/formats/csvs.py | 73 ++++++++++---------------- pandas/tests/io/formats/test_to_csv.py | 36 +++++++++++++ 5 files changed, 104 insertions(+), 49 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cc42f952b1733..1e40a4840f6fb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1064,6 +1064,21 @@ DD/MM/YYYY instead. 
For convenience, a ``dayfirst`` keyword is provided: pd.read_csv('tmp.csv', parse_dates=[0]) pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) +Writing CSVs to binary file objects ++++++++++++++++++++++++++++++++++++ + +``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object +opened binary mode. For this to work, it is necessary that ``mode`` +contains a "b": + +.. ipython:: python + + import io + + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + .. _io.float_precision: Specifying method for floating-point conversion diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3f7e260314f70..9c33018e28aba 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,23 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_120.binary_handle_to_csv: + +Support for binary file handles in ``to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) +and honors their requested ``encoding`` (:issue:`13068` and :issue:`23854`). +``mode`` has to contain a ``b`` for binary handles to be supported. + +For example: + +.. ipython:: python + import io + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, mode="w+b", encoding="utf-8") + .. _whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42d02f37508fc..63b3b58cfc5e7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3021,13 +3021,18 @@ def to_csv( ---------- path_or_buf : str or file handle, default None File path or object, if None is provided the result is returned as - a string. If a file object is passed it should be opened with - `newline=''`, disabling universal newlines. + a string. 
If a non-binary file object is passed, it should be opened + with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` needs to contain a `'b'`. .. versionchanged:: 0.24.0 Was previously named "path" for Series. + .. versionchanged:: 1.2.0 + + Support for binary file objects was introduced. + sep : str, default ',' String of length 1. Field delimiter for the output file. na_rep : str, default '' @@ -3056,7 +3061,8 @@ def to_csv( Python write mode, default 'w'. encoding : str, optional A string representing the encoding to use in the output file, - defaults to 'utf-8'. + defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` + is a non-binary file object. compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5bd51dc8351f6..d6b34a48639b5 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,11 +3,10 @@ """ import csv as csvlib -from io import StringIO +from io import StringIO, TextIOWrapper import os from typing import Hashable, List, Mapping, Optional, Sequence, Union import warnings -from zipfile import ZipFile import numpy as np @@ -166,31 +165,18 @@ def save(self) -> None: RuntimeWarning, stacklevel=2, ) - - # when zip compression is called. - is_zip = isinstance(self.path_or_buf, ZipFile) or ( - not hasattr(self.path_or_buf, "write") and self.compression == "zip" + self.compression = None + + # get a handle or wrap an existing handle to take care of 1) compression and + # 2) text -> byte conversion + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=dict(self.compression_args, method=self.compression), ) - if is_zip: - # zipfile doesn't support writing string to archive. 
uses string - # buffer to receive csv writing and dump into zip compression - # file handle. GH21241, GH21118 - f = StringIO() - close = False - elif hasattr(self.path_or_buf, "write"): - f = self.path_or_buf - close = False - else: - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=dict(self.compression_args, method=self.compression), - ) - close = True - try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( @@ -206,29 +192,24 @@ def save(self) -> None: self._save() finally: - if is_zip: - # GH17778 handles zip compression separately. - buf = f.getvalue() - if hasattr(self.path_or_buf, "write"): - self.path_or_buf.write(buf) - else: - compression = dict(self.compression_args, method=self.compression) - - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=compression, - ) - f.write(buf) - close = True - if close: + if self.should_close: f.close() - for _fh in handles: - _fh.close() - elif self.should_close: + + if ( + isinstance(f, TextIOWrapper) + and not f.closed + and f != self.path_or_buf + and hasattr(self.path_or_buf, "write") + ): + # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper + # closes the wrapped handle if it is not detached. 
+ f.flush() # make sure everything is written + f.detach() # makes f unusable + del f + elif f != self.path_or_buf: f.close() + for _fh in handles: + _fh.close() def _save_header(self): writer = self.writer diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 4c86e3a16b135..753b8b6eda9c5 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -607,3 +607,39 @@ def test_to_csv_errors(self, errors): ser.to_csv(path, errors=errors) # No use in reading back the data as it is not the same anymore # due to the error handling + + def test_to_csv_binary_handle(self): + """ + Binary file objects should work if 'mode' contains a 'b'. + + GH 35058 and GH 19827 + """ + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + with open(path, mode="w+b") as handle: + df.to_csv(handle, mode="w+b") + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + + def test_to_csv_encoding_binary_handle(self): + """ + Binary file objects should honor a specified encoding. 
+ + GH 23854 and GH 13068 with binary handles + """ + # example from GH 23854 + content = "a, b, 🐟".encode("utf-8-sig") + buffer = io.BytesIO(content) + df = pd.read_csv(buffer, encoding="utf-8-sig") + + buffer = io.BytesIO() + df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False) + buffer.seek(0) # tests whether file handle wasn't closed + assert buffer.getvalue().startswith(content) + + # example from GH 13068 + with tm.ensure_clean() as path: + with open(path, "w+b") as handle: + pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig") + + handle.seek(0) + assert handle.read().startswith(b'\xef\xbb\xbf""') From 5eaf2766401a50e31e17bbb2665829deb75de67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 18 Jul 2020 02:11:09 -0400 Subject: [PATCH 3/3] to_csv: support compression for binary file handles --- doc/source/user_guide/io.rst | 2 ++ doc/source/whatsnew/v1.2.0.rst | 6 ++++-- pandas/core/generic.py | 4 ++++ pandas/io/common.py | 4 ++-- pandas/io/formats/csvs.py | 13 ++++++++----- pandas/tests/io/test_compression.py | 16 ++++++++++++++++ 6 files changed, 36 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1e40a4840f6fb..ab233f653061a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1067,6 +1067,8 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: Writing CSVs to binary file objects +++++++++++++++++++++++++++++++++++ +.. versionadded:: 1.2.0 + ``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object opened binary mode. 
For this to work, it is necessary that ``mode`` contains a "b": diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9c33018e28aba..10dfd8406b8ce 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -19,16 +19,18 @@ Support for binary file handles in ``to_csv`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) -and honors their requested ``encoding`` (:issue:`13068` and :issue:`23854`). +with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`). ``mode`` has to contain a ``b`` for binary handles to be supported. For example: .. ipython:: python + import io + data = pd.DataFrame([0, 1, 2]) buffer = io.BytesIO() - data.to_csv(buffer, mode="w+b", encoding="utf-8") + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") .. _whatsnew_120.enhancements.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 63b3b58cfc5e7..53b12e1abdee8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3086,6 +3086,10 @@ def to_csv( supported for compression modes 'gzip' and 'bz2' as well as 'zip'. + .. versionchanged:: 1.2.0 + + Compression is supported for binary file objects. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. 
If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC diff --git a/pandas/io/common.py b/pandas/io/common.py index 9cda4177d9f04..34e4425c657f1 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -450,14 +450,14 @@ def get_handle( if is_path: f = gzip.open(path_or_buf, mode, **compression_args) else: - f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) + f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) # BZ Compression elif compression == "bz2": if is_path: f = bz2.BZ2File(path_or_buf, mode, **compression_args) else: - f = bz2.BZ2File(path_or_buf, **compression_args) + f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) # ZIP Compression elif compression == "zip": diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index d6b34a48639b5..b10946a20d041 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -158,10 +158,14 @@ def save(self) -> None: """ Create the writer & save. """ - # GH21227 internal compression is not used when file-like passed. - if self.compression and hasattr(self.path_or_buf, "write"): + # GH21227 internal compression is not used for non-binary handles. 
+ if ( + self.compression + and hasattr(self.path_or_buf, "write") + and "b" not in self.mode + ): warnings.warn( - "compression has no effect when passing file-like object as input.", + "compression has no effect when passing a non-binary object as input.", RuntimeWarning, stacklevel=2, ) @@ -194,8 +198,7 @@ def save(self) -> None: finally: if self.should_close: f.close() - - if ( + elif ( isinstance(f, TextIOWrapper) and not f.closed and f != self.path_or_buf diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 59c9bd0a36d3d..902a3d5d2a397 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -114,6 +114,22 @@ def test_compression_warning(compression_only): df.to_csv(f, compression=compression_only) +def test_compression_binary(compression_only): + """ + Binary file handles support compression. + + GH22555 + """ + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + with open(path, mode="wb") as file: + df.to_csv(file, mode="wb", compression=compression_only) + file.seek(0) # file shouldn't be closed + tm.assert_frame_equal( + df, pd.read_csv(path, index_col=0, compression=compression_only) + ) + + def test_with_missing_lzma(): """Tests if import pandas works when lzma is not present.""" # https://github.com/pandas-dev/pandas/issues/27575