From 1436aef9aee4548ee065a8aaa8d3a6d550ff2977 Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Wed, 23 Jun 2021 16:53:46 -0500 Subject: [PATCH 1/5] Add metadata attribute to DataFrame and Column --- protocol/dataframe_protocol.py | 14 +++++++++++++ protocol/pandas_implementation.py | 33 ++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 00cf5b12..db8bfd41 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -285,6 +285,13 @@ def null_count(self) -> Optional[int]: """ pass + @property + def metadata(self) -> Dict[str, Any]: + """ + Store the metadata specific to the column. + """ + pass + def num_chunks(self) -> int: """ Return the number of chunks the column consists of. @@ -350,6 +357,13 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: "version": 0 # Version number of the protocol } + @property + def metadata(self) -> Dict[str, Any]: + """ + Store the metadata specific to the DataFrame + """ + pass + def num_columns(self) -> int: """ Return the number of columns in the DataFrame diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e3e3e62e..bd5bb4ac 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -426,6 +426,13 @@ def null_count(self) -> int: """ return self._col.isna().sum() + @property + def metadata(self) -> Dict[str, Any]: + """ + Store specific metadata of the column. + """ + return {"num_chunks": self.num_chunks()} + def num_chunks(self) -> int: """ Return the number of chunks the column consists of. @@ -495,6 +502,11 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: # dtypes is added, this value should be propagated to columns. self._nan_as_null = nan_as_null + @property + def metadata(self): + return {"num_chunks": self.num_chunks(), + "num_columns": self.num_columns()} + def num_columns(self) -> int: return len(self._df.columns) @@ -578,9 +590,28 @@ def test_categorical_dtype(): tm.assert_frame_equal(df, df2) +def test_metadata(): + df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9])) + + # Check the metadata from the dataframe + df_metadata = df.__dataframe__().metadata + excpected = {"num_chunks": 1, "num_columns": 3} + for key in df_metadata: + assert df_metadata[key] == excpected[key] + + # Check the metadata from the column + col_metadata = df.__dataframe__().get_column(0).metadata + expected = {"num_chunks": 1} + for key in col_metadata: + assert col_metadata[key] == excpected[key] + + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + if __name__ == '__main__': test_categorical_dtype() test_float_only() test_mixed_intfloat() test_noncontiguous_columns() - + test_metadata() From 9b9a35ee8eeb7736d31eb68775e5714406cd96d1 Mon Sep 17 00:00:00 2001 From: Athan Date: Wed, 23 Jun 2021 15:01:53 -0700 Subject: [PATCH 2/5] Add missing period. --- protocol/dataframe_protocol.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index db8bfd41..4ebe554b 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -360,7 +360,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: @property def metadata(self) -> Dict[str, Any]: """ - Store the metadata specific to the DataFrame + Store the metadata specific to the DataFrame. """ pass @@ -431,4 +431,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]: before yielding it. """ pass - From d72c1ef830f0ecf211babc19fbe3ed7f855b68b7 Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Fri, 25 Jun 2021 16:33:09 -0500 Subject: [PATCH 3/5] Add review changes --- protocol/dataframe_protocol.py | 10 ++++++++-- protocol/pandas_implementation.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index db8bfd41..c117de8c 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -288,7 +288,7 @@ def null_count(self) -> Optional[int]: @property def metadata(self) -> Dict[str, Any]: """ - Store the metadata specific to the column. + The metadata for the column. See `DataFrame.metadata` for more details. """ pass @@ -360,7 +360,13 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: @property def metadata(self) -> Dict[str, Any]: """ - Store the metadata specific to the DataFrame + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. """ pass diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index bd5bb4ac..ec31924b 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -431,7 +431,7 @@ def metadata(self) -> Dict[str, Any]: """ Store specific metadata of the column. """ - return {"num_chunks": self.num_chunks()} + return {} def num_chunks(self) -> int: """ @@ -504,8 +504,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: @property def metadata(self): - return {"num_chunks": self.num_chunks(), - "num_columns": self.num_columns()} + return {"pandas.indexcol": self._df.index.name} def num_columns(self) -> int: return len(self._df.columns) @@ -591,19 +590,20 @@ def test_categorical_dtype(): def test_metadata(): - df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9])) + d = {'A': [1, 2, 3, 4],'B': [1, 2, 3, 4]} + df = pd.DataFrame(d).set_index('A') # Check the metadata from the dataframe df_metadata = df.__dataframe__().metadata - excpected = {"num_chunks": 1, "num_columns": 3} + expected = {"pandas.indexcol": 'A'} for key in df_metadata: - assert df_metadata[key] == excpected[key] + assert df_metadata[key] == expected[key] # Check the metadata from the column col_metadata = df.__dataframe__().get_column(0).metadata - expected = {"num_chunks": 1} + expected = {} for key in col_metadata: - assert col_metadata[key] == excpected[key] + assert col_metadata[key] == expected[key] df2 = from_dataframe(df) tm.assert_frame_equal(df, df2) From 8ed5d728c393c196857554e8f60b2d8685485eb6 Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Fri, 25 Jun 2021 16:40:48 -0500 Subject: [PATCH 4/5] Save the index object in the metadata --- protocol/pandas_implementation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index ec31924b..606afcfa 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -504,7 +504,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: @property def metadata(self): - return {"pandas.indexcol": self._df.index.name} + return {"pandas.index": self._df.index} def num_columns(self) -> int: return len(self._df.columns) @@ -590,12 +590,11 @@ def test_categorical_dtype(): def test_metadata(): - d = {'A': [1, 2, 3, 4],'B': [1, 2, 3, 4]} - df = pd.DataFrame(d).set_index('A') + df = pd.DataFrame({'A': [1, 2, 3, 4],'B': [1, 2, 3, 4]}) # Check the metadata from the dataframe df_metadata = df.__dataframe__().metadata - expected = {"pandas.indexcol": 'A'} + expected = {"pandas.index": df.index} for key in df_metadata: assert df_metadata[key] == expected[key] From 9b73543b8fe1c5b0b5dd9b463843f500f4bf8dd9 Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Fri, 25 Jun 2021 16:44:57 -0500 Subject: [PATCH 5/5] Fix test --- protocol/pandas_implementation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 606afcfa..4c6e0e1e 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -596,7 +596,7 @@ def test_metadata(): df_metadata = df.__dataframe__().metadata expected = {"pandas.index": df.index} for key in df_metadata: - assert df_metadata[key] == expected[key] + assert all(df_metadata[key] == expected[key]) # Check the metadata from the column col_metadata = df.__dataframe__().get_column(0).metadata