diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d8b35abb94b9d..7c75ad031e7cd 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import DataFrame, Series +from pandas import Categorical, DataFrame, Series from .pandas_vb_common import tm @@ -16,6 +16,10 @@ def setup(self, dtype): self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() + # GH37371. Testing construction of string series/frames from ExtensionArrays + self.series_cat_arr = Categorical(self.series_arr) + self.frame_cat_arr = Categorical(self.frame_arr) + def time_series_construction(self, dtype): Series(self.series_arr, dtype=dtype) @@ -28,6 +32,18 @@ def time_frame_construction(self, dtype): def peakmem_frame_construction(self, dtype): DataFrame(self.frame_arr, dtype=dtype) + def time_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def peakmem_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def time_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + + def peakmem_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + class Methods: def setup(self): diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 05996efb6d332..9b320182d7968 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -334,7 +334,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) +- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 001fbae120ae8..2cb4df7e054fe 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -651,6 +651,11 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) + if hasattr(arr, "to_numpy"): + arr = arr.to_numpy() + elif not isinstance(arr, np.ndarray): + arr = np.array(arr, dtype="object") + result = np.asarray(arr, dtype="object") if copy and result is arr: