diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 720ce7af47a18..d4e1539404ead 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -70,6 +70,8 @@ Other enhancements
 - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
 - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
 - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
+- You can now override how pandas constructs DataFrames from custom objects by registering a new function on the
+  ``pandas.core.internals.construction.create_dataframe`` ``singledispatch`` function.
 -

 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b9e43b1cd9b05..f1eb32b5ce830 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -35,7 +35,6 @@
 import warnings

 import numpy as np
-import numpy.ma as ma

 from pandas._config import get_option

@@ -67,7 +66,6 @@
     maybe_convert_platform,
     maybe_downcast_to_dtype,
     maybe_infer_to_datetimelike,
-    maybe_upcast,
     maybe_upcast_putmask,
     validate_numeric_casting,
 )
@@ -77,7 +75,6 @@
     ensure_platform_int,
     infer_dtype_from_object,
     is_bool_dtype,
-    is_dataclass,
     is_datetime64_any_dtype,
     is_dict_like,
     is_dtype_equal,
@@ -88,7 +85,6 @@
     is_integer_dtype,
     is_iterator,
     is_list_like,
-    is_named_tuple,
     is_object_dtype,
     is_period_dtype,
     is_scalar,
@@ -105,7 +101,7 @@

 from pandas.core import algorithms, common as com, nanops, ops
 from pandas.core.accessor import CachedAccessor
-from pandas.core.arrays import Categorical, ExtensionArray
+from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
 from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.generic import NDFrame, _shared_docs
@@ -115,14 +111,9 @@
 from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
 from pandas.core.indexes.period import PeriodIndex
 from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
-from pandas.core.internals import BlockManager
 from pandas.core.internals.construction import (
     arrays_to_mgr,
-    dataclasses_to_dicts,
-    get_names_from_index,
-    init_dict,
-    init_ndarray,
-    masked_rec_array_to_mgr,
+    create_dataframe,
     reorder_arrays,
     sanitize_index,
     to_arrays,
@@ -427,97 +418,9 @@ def __init__(
         dtype: Optional[Dtype] = None,
         copy: bool = False,
     ):
-        if data is None:
-            data = {}
         if dtype is not None:
             dtype = self._validate_dtype(dtype)
-
-        if isinstance(data, DataFrame):
-            data = data._data
-
-        if isinstance(data, BlockManager):
-            mgr = self._init_mgr(
-                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
-            )
-        elif isinstance(data, dict):
-            mgr = init_dict(data, index, columns, dtype=dtype)
-        elif isinstance(data, ma.MaskedArray):
-            import numpy.ma.mrecords as mrecords
-
-            # masked recarray
-            if isinstance(data, mrecords.MaskedRecords):
-                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
-
-            # a masked array
-            else:
-                mask = ma.getmaskarray(data)
-                if mask.any():
-                    data, fill_value = maybe_upcast(data, copy=True)
-                    data.soften_mask()  # set hardmask False if it was True
-                    data[mask] = fill_value
-                else:
-                    data = data.copy()
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-        elif isinstance(data, (np.ndarray, Series, Index)):
-            if data.dtype.names:
-                data_columns = list(data.dtype.names)
-                data = {k: data[k] for k in data_columns}
-                if columns is None:
-                    columns = data_columns
-                mgr = init_dict(data, index, columns, dtype=dtype)
-            elif getattr(data, "name", None) is not None:
-                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
-            else:
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-        # For data is list-like, or Iterable (will consume into list)
-        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
-            if not isinstance(data, (abc.Sequence, ExtensionArray)):
-                data = list(data)
-            if len(data) > 0:
-                if is_dataclass(data[0]):
-                    data = dataclasses_to_dicts(data)
-                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
-                    if is_named_tuple(data[0]) and columns is None:
-                        columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
-                    columns = ensure_index(columns)
-
-                    # set the index
-                    if index is None:
-                        if isinstance(data[0], Series):
-                            index = get_names_from_index(data)
-                        elif isinstance(data[0], Categorical):
-                            index = ibase.default_index(len(data[0]))
-                        else:
-                            index = ibase.default_index(len(data))
-
-                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
-                else:
-                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-            else:
-                mgr = init_dict({}, index, columns, dtype=dtype)
-        else:
-            try:
-                arr = np.array(data, dtype=dtype, copy=copy)
-            except (ValueError, TypeError) as err:
-                exc = TypeError(
-                    "DataFrame constructor called with "
-                    f"incompatible data and dtype: {err}"
-                )
-                raise exc from err
-
-            if arr.ndim == 0 and index is not None and columns is not None:
-                values = cast_scalar_to_array(
-                    (len(index), len(columns)), data, dtype=dtype
-                )
-                mgr = init_ndarray(
-                    values, index, columns, dtype=values.dtype, copy=False
-                )
-            else:
-                raise ValueError("DataFrame constructor not properly called!")
-
+        mgr = create_dataframe(data, index, columns, dtype, copy, type(self))
         NDFrame.__init__(self, mgr)

     # ----------------------------------------------------------------------
@@ -8548,6 +8451,11 @@ def isin(self, values) -> "DataFrame":
 ops.add_special_arithmetic_methods(DataFrame)


+@create_dataframe.register
+def _create_dataframe_dataframe(data: DataFrame, *args, **kwargs):
+    return create_dataframe(data._data, *args, **kwargs)
+
+
 def _from_nested_dict(data):
     # TODO: this should be seriously cythonized
     new_data = collections.defaultdict(dict)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index c4416472d451c..0394cef38ba49 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -3,13 +3,18 @@
 constructors before passing them to a BlockManager.
 """
 from collections import abc
+import functools
+from typing import Any, List, Optional, Type, Union, cast

 import numpy as np
 import numpy.ma as ma
+import numpy.ma.mrecords as mrecords

 from pandas._libs import lib
+from pandas._typing import Axes, Dtype

 from pandas.core.dtypes.cast import (
+    cast_scalar_to_array,
     construct_1d_arraylike_from_scalar,
     maybe_cast_to_datetime,
     maybe_convert_platform,
@@ -18,11 +23,13 @@
 )
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
+    is_dataclass,
     is_datetime64tz_dtype,
     is_dtype_equal,
     is_extension_array_dtype,
     is_integer_dtype,
     is_list_like,
+    is_named_tuple,
     is_object_dtype,
 )
 from pandas.core.dtypes.generic import (
@@ -35,8 +42,9 @@
 )

 from pandas.core import algorithms, common as com
-from pandas.core.arrays import Categorical
+from pandas.core.arrays import Categorical, ExtensionArray
 from pandas.core.construction import extract_array, sanitize_array
+from pandas.core.generic import NDFrame
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import (
     Index,
@@ -45,9 +53,11 @@
     union_indexes,
 )
 from pandas.core.internals import (
+    BlockManager,
     create_block_manager_from_arrays,
     create_block_manager_from_blocks,
 )
+from pandas.core.series import Series

 # ---------------------------------------------------------------------
 # BlockManager Interface
@@ -115,6 +125,135 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy: bool):
     return mgr


+@functools.singledispatch
+def create_dataframe(
+    data: Any,
+    index: Optional[Axes],
+    columns: Optional[Axes],
+    dtype: Optional[Dtype],
+    copy: bool,
+    cls: Type[NDFrame],
+) -> BlockManager:
+    """
+    Create a BlockManager for the given data. Used inside the DataFrame constructor
+    to convert the different supported input types.
+    If you want to provide a custom way to convert from your object to a DataFrame,
+    you can register a dispatch on this function.
+    """
+    # Base case is to try to cast to a NumPy array
+    try:
+        arr = np.array(data, dtype=dtype, copy=copy)
+    except (ValueError, TypeError) as err:
+        exc = TypeError(
+            f"DataFrame constructor called with incompatible data and dtype: {err}"
+        )
+        raise exc from err
+
+    if arr.ndim == 0 and index is not None and columns is not None:
+        values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype)
+        return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
+    else:
+        raise ValueError("DataFrame constructor not properly called!")
+
+
+@create_dataframe.register
+def _create_dataframe_none(data: None, *args, **kwargs):
+    return create_dataframe({}, *args, **kwargs)
+
+
+@create_dataframe.register
+def _create_dataframe_blockmanager(
+    data: BlockManager, index, columns, dtype, copy, cls
+):
+    return cls._init_mgr(
+        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+    )
+
+
+@create_dataframe.register
+def _create_dataframe_dict(data: dict, index, columns, dtype, copy, cls):
+    return init_dict(data, index, columns, dtype=dtype)
+
+
+@create_dataframe.register
+def _create_dataframe_masked_array(
+    data: ma.MaskedArray, index, columns, dtype, copy, cls
+):
+    mask = ma.getmaskarray(data)
+    if mask.any():
+        data, fill_value = maybe_upcast(data, copy=True)
+        data.soften_mask()  # set hardmask False if it was True
+        data[mask] = fill_value
+    else:
+        data = data.copy()
+    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+
+@create_dataframe.register
+def _create_dataframe_masked_record(
+    data: mrecords.MaskedRecords, index, columns, dtype, copy, cls
+):
+    return masked_rec_array_to_mgr(data, index, columns, dtype, copy)
+
+
+@create_dataframe.register(np.ndarray)
+@create_dataframe.register(Series)
+@create_dataframe.register(Index)
+def _create_dataframe_array_series_index(
+    data: Union[np.ndarray, Series, Index], index, columns, dtype, copy, cls
+):
+    if data.dtype.names:
+        data_columns = list(data.dtype.names)
+        data = {k: data[k] for k in data_columns}
+        if columns is None:
+            columns = data_columns
+        return init_dict(data, index, columns, dtype=dtype)
+    elif getattr(data, "name", None) is not None:
+        return init_dict({data.name: data}, index, columns, dtype=dtype)
+    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+
+class _IterableExceptStringOrBytesMeta(type):
+    def __subclasscheck__(cls, sub: Type) -> bool:
+        return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)
+
+
+class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
+    """
+    Class whose subclass check accepts any iterable except str or bytes, used for
+    singledispatch registration.
+    """
+
+    pass
+
+
+@create_dataframe.register(_IterableExceptStringOrBytes)
+def _create_dataframe_iterable(data: abc.Iterable, index, columns, dtype, copy, cls):
+    if not isinstance(data, (abc.Sequence, ExtensionArray)):
+        data = list(data)
+    if len(data) > 0:
+        if is_dataclass(data[0]):
+            data = cast(List[dict], dataclasses_to_dicts(data))
+        if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
+            if is_named_tuple(data[0]) and columns is None:
+                columns = data[0]._fields
+            arrays, columns = to_arrays(data, columns, dtype=dtype)
+            columns = ensure_index(columns)
+
+            # set the index
+            if index is None:
+                if isinstance(data[0], Series):
+                    index = get_names_from_index(data)
+                elif isinstance(data[0], Categorical):
+                    index = ibase.default_index(len(data[0]))
+                else:
+                    index = ibase.default_index(len(data))
+
+            return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+    return init_dict({}, index, columns, dtype=dtype)
+
+
 # ---------------------------------------------------------------------
 # DataFrame Constructor Interface

diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
index 631f484cfc22a..dab49c147d44d 100644
--- a/pandas/tests/generic/test_frame.py
+++ b/pandas/tests/generic/test_frame.py
@@ -10,6 +10,7 @@
 import pandas as pd
 from pandas import DataFrame, MultiIndex, Series, date_range
 import pandas._testing as tm
+from pandas.core.internals.construction import create_dataframe

 from .test_generic import Generic

@@ -169,6 +170,7 @@ def test_set_attribute(self):
         df = DataFrame({"x": [1, 2, 3]})
         df.y = 2
+
         df["y"] = [2, 4, 6]
         df.y = 5
@@ -183,6 +185,25 @@ def test_deepcopy_empty(self):

         self._compare(empty_frame_copy, empty_frame)

+    def test_register_constructor(self):
+        # Verify that if you register a custom `create_dataframe` implementation,
+        # it will be used in the constructor
+        class MyCustomObject:
+            pass
+
+        o = MyCustomObject()
+
+        with pytest.raises(ValueError):
+            DataFrame(o)
+
+        @create_dataframe.register
+        def _create_dataframe_custom(o: MyCustomObject, *args, **kwargs):
+            return create_dataframe(None, *args, **kwargs)
+
+        result = DataFrame(o)
+        expected = DataFrame(None)
+        self._compare(result, expected)
+

 # formerly in Generic but only test DataFrame
 class TestDataFrame2:
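Example usage (not part of the patch above): a minimal sketch of how downstream code could hook into the new ``create_dataframe`` ``singledispatch`` function, assuming the registration API lands exactly as written in this diff. ``PointCloud`` and the registered helper are hypothetical names used only for illustration; the pattern mirrors the ``test_register_constructor`` test added above.

# Hypothetical example; assumes this patch is applied.
import pandas as pd
from pandas.core.internals.construction import create_dataframe


class PointCloud:
    """Toy custom container holding a list of (x, y) pairs."""

    def __init__(self, points):
        self.points = points


@create_dataframe.register
def _create_dataframe_pointcloud(data: PointCloud, *args, **kwargs):
    # Delegate to the dict implementation that the patch registers,
    # forwarding index/columns/dtype/copy/cls unchanged.
    return create_dataframe(
        {"x": [p[0] for p in data.points], "y": [p[1] for p in data.points]},
        *args,
        **kwargs,
    )


df = pd.DataFrame(PointCloud([(1, 2), (3, 4)]))  # columns "x" and "y"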