diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2023858181baa..d8b35abb94b9d 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -13,13 +13,20 @@ class Construction: param_names = ["dtype"] def setup(self, dtype): - self.data = tm.rands_array(nchars=10 ** 5, size=10) + self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() - def time_construction(self, dtype): - Series(self.data, dtype=dtype) + def time_series_construction(self, dtype): + Series(self.series_arr, dtype=dtype) - def peakmem_construction(self, dtype): - Series(self.data, dtype=dtype) + def peakmem_series_construction(self, dtype): + Series(self.series_arr, dtype=dtype) + + def time_frame_construction(self, dtype): + DataFrame(self.frame_arr, dtype=dtype) + + def peakmem_frame_construction(self, dtype): + DataFrame(self.frame_arr, dtype=dtype) class Methods: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 1286577748afa..177485c6dfe7e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -222,7 +222,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`) +- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2d4163e0dee89..d19a0dd8f29e3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -189,15 +190,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) - if dtype is not None: - if not is_dtype_equal(values.dtype, dtype): - try: - values = values.astype(dtype) - except Exception as orig: - # e.g. ValueError when trying to cast object dtype to float64 - raise ValueError( - f"failed to cast to '{dtype}' (Exception was: {orig})" - ) from orig + if dtype is not None and not is_dtype_equal(values.dtype, dtype): + try: + values = construct_1d_ndarray_preserving_na( + values.ravel(), dtype=dtype, copy=False + ).reshape(values.shape) + except Exception as orig: + # e.g. ValueError when trying to cast object dtype to float64 + raise ValueError( + f"failed to cast to '{dtype}' (Exception was: {orig})" + ) from orig # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(