From b78175e567c7c832d975911d93cdbba8a0449117 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 23 Jun 2023 15:43:17 +0000 Subject: [PATCH 1/6] created unit tests for desired new functionality --- pandas/tests/frame/methods/test_to_csv.py | 62 +++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 5671a569c8ac8..93aaacca5d41d 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1307,3 +1307,65 @@ def test_to_csv_categorical_and_interval(self): expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected + + def prepate_string_rep_of_comment_output( + self, delim: str, comments_attrs, data_for_comments_raw, frame_for_comments + ) -> str: + comment = "#" + + data_for_comments_raw = data_for_comments_raw.replace(",", delim) + # Create string representation of data with attrs written at start + output_data_rows = [] + for k, v in comments_attrs.items(): + # Make sure delims being used are sanitized from comment lines + k = k.replace(delim, "") + v = v.replace(delim, "") + output_data_rows.append(f"{comment}{k}:{v}\n") + output_data = "".join(output_data_rows) + output_data = output_data + data_for_comments_raw + return output_data + + def test_comment_writer_csv( + self, comments_attrs, data_for_comments_raw, frame_for_comments + ): + comment = "#" + delim = "," + output_data = self.prepate_string_rep_of_comment_output( + delim, comments_attrs, data_for_comments_raw, frame_for_comments + ) + read_output = read_csv(StringIO(output_data), comment=comment) + + # Check output data can be read correctly + tm.assert_frame_equal( + read_output, frame_for_comments + ), "Frame read from test data did not match expected results." 
+ + # Check saved output is as expected + with tm.ensure_clean() as path: + frame_for_comments.to_csv(path, comment=comment, index=False) + with open(path, encoding="utf-8") as fp: + lines = fp.read() + assert ( + lines == output_data + ), "csv output with comment lines not as expected" + + def test_comment_writer_tabs( + self, comments_attrs, data_for_comments_raw, frame_for_comments + ): + comment = "#" + delim = "\t" + output_data = self.prepate_string_rep_of_comment_output( + delim, comments_attrs, data_for_comments_raw, frame_for_comments + ) + read_output = read_csv(StringIO(output_data), comment=comment, sep="\t") + + tm.assert_frame_equal( + read_output, frame_for_comments + ), "Read tab outputs are not as expected" + with tm.ensure_clean() as path: + frame_for_comments.to_csv(path, comment=comment, index=False, sep="\t") + with open(path, encoding="utf-8") as fp: + lines = fp.read() + assert ( + lines == output_data + ), "tsv output with comment lines not as expected" From 63f6abca63b15ec2d790300aa7f8323a19ff442f Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 23 Jun 2023 15:47:28 +0000 Subject: [PATCH 2/6] Added new fixtures for test --- pandas/tests/frame/conftest.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 97cf75acbd629..054ea842b3e6f 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -1,3 +1,5 @@ +from io import StringIO + import numpy as np import pytest @@ -5,6 +7,7 @@ DataFrame, NaT, date_range, + read_csv, ) import pandas._testing as tm @@ -259,3 +262,26 @@ def frame_of_index_cols(): } ) return df + + +@pytest.fixture +def comments_attrs(): + return { + "one": "Hello", + "two": "Hello World", + "three": "Hello, World!", + "four,": "comma in keym", + } + + +@pytest.fixture +def data_for_comments_raw(): + data = "col1,col2,col3\n0,0,0\n1,1,1\n2,2,2\n" + return data + + +@pytest.fixture +def 
frame_for_comments(data_for_comments_raw, comments_attrs): + df = read_csv(StringIO(data_for_comments_raw)) + df.attrs = comments_attrs + return df From b9037b54f81df09ced41b2d0486f973da9130e65 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 23 Jun 2023 16:00:19 +0000 Subject: [PATCH 3/6] implementation of df.attrs being written as csv header lines starting with a comment char --- pandas/core/generic.py | 10 ++++++++++ pandas/io/formats/csvs.py | 11 +++++++++++ pandas/io/formats/format.py | 2 ++ 3 files changed, 23 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0be840f9a4ef1..61990afef2a90 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3636,6 +3636,7 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + comment: str | None = ..., ) -> str: ... @@ -3663,6 +3664,7 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + comment: str | None = ..., ) -> None: ... @@ -3694,6 +3696,7 @@ def to_csv( decimal: str = ".", errors: OpenFileErrors = "strict", storage_options: StorageOptions = None, + comment: str | None = ..., ) -> str | None: r""" Write object to a comma-separated values (csv) file. @@ -3799,6 +3802,12 @@ def to_csv( .. versionadded:: 1.2.0 + comment : str, default None + If set the key and values of df.attrs will be written to the + beginning of the csv file, prefixed by this value, each key/value + pair to a single line. To prevent downstream reading issues + this char will be removed from the df.attrs if present. + Complement of pd.read_csv's 'comment' param. 
Returns ------- None or str @@ -3865,6 +3874,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, + comment=comment, ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 672f7c1f71b15..f9ad6598bf06b 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -67,6 +67,7 @@ def __init__( doublequote: bool = True, escapechar: str | None = None, storage_options: StorageOptions = None, + comment: str | None = None, ) -> None: self.fmt = formatter @@ -89,6 +90,7 @@ def __init__( self.date_format = date_format self.cols = self._initialize_columns(cols) self.chunksize = self._initialize_chunksize(chunksize) + self.comment = comment @property def na_rep(self) -> str: @@ -260,6 +262,8 @@ def save(self) -> None: self._save() def _save(self) -> None: + if self.comment: + self._save_df_attrs() if self._need_to_save_header: self._save_header() self._save_body() @@ -318,3 +322,10 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: self.cols, self.writer, ) + + def _save_df_attrs(self) -> None: + for key, value in self.fmt.frame.attrs.items(): + # remove the delimiter from the attr string values + key = str(key).replace(self.writer.dialect.delimiter, "") + value = str(value).replace(self.writer.dialect.delimiter, "") + self.writer.writerow([f"{self.comment}{key}:{value}"]) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6cc00ffc8889c..489df372ed5dd 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1117,6 +1117,7 @@ def to_csv( escapechar: str | None = None, errors: str = "strict", storage_options: StorageOptions = None, + comment: str | None = None, ) -> str | None: """ Render dataframe as comma-separated file. 
@@ -1147,6 +1148,7 @@ def to_csv( escapechar=escapechar, storage_options=storage_options, formatter=self.fmt, + comment=comment, ) csv_formatter.save() From 143d9dd67ad2bba0e403606758452e6f2334cae4 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 23 Jun 2023 20:05:24 +0000 Subject: [PATCH 4/6] fixed incorrect default param value --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 61990afef2a90..41f2de3d35722 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3696,7 +3696,7 @@ def to_csv( decimal: str = ".", errors: OpenFileErrors = "strict", storage_options: StorageOptions = None, - comment: str | None = ..., + comment: str | None = None, ) -> str | None: r""" Write object to a comma-separated values (csv) file. From 48d515955bdaea12a1423293ffb79a2813774785 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 23 Jun 2023 21:49:06 +0000 Subject: [PATCH 5/6] added missing space before Returns in docstring --- pandas/core/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 41f2de3d35722..cdcb1a9a541e9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3808,6 +3808,7 @@ def to_csv( pair to a single line. To prevent downstream reading issues this char will be removed from the df.attrs if present. Complement of pd.read_csv's 'comment' param. + Returns ------- None or str From cfa33004f63683a3609fad8af65b8b6e051e7b3d Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 23 Jun 2023 21:55:38 +0000 Subject: [PATCH 6/6] ran pre-commit formatting. Forgot to reinstall in new env. 
oops --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cdcb1a9a541e9..6cb9c7db35be0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3808,7 +3808,7 @@ def to_csv( pair to a single line. To prevent downstream reading issues this char will be removed from the df.attrs if present. Complement of pd.read_csv's 'comment' param. - + Returns ------- None or str