From 9edf9b3243dd6d343be03789e06126b247c14cf1 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 11 Dec 2019 00:26:29 -0600 Subject: [PATCH 1/9] Move skipna check to after type casting of values. Fixes performance regression introduced in aaaac86ee019675119cb0ae9c3fb7a2b7eef9959 --- pandas/_libs/lib.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a6b02e016823c..305d1d543f7ac 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1259,9 +1259,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: # make contiguous values = values.ravel() - if skipna: - values = values[~isnaobj(values)] - val = _try_infer_map(values) if val is not None: return val @@ -1269,6 +1266,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if values.dtype != np.object_: values = values.astype('O') + if skipna: + values = values[~isnaobj(values)] + n = len(values) if n == 0: return 'empty' From 70c8bed5aa49ae12d737b7761e99515bd0d496a4 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 11 Dec 2019 01:10:03 -0600 Subject: [PATCH 2/9] Add entry to whatsnew. --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3e72072eae303..a6f10c5aa7b37 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -645,6 +645,7 @@ Performance improvements - Performance improvement when checking if values in a :class:`Categorical` are equal, equal or larger or larger than a given scalar. The improvement is not present if checking if the :class:`Categorical` is less than or less than or equal than the scalar (:issue:`29820`) - Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) +- Performance improvement in :func:`infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) .. _whatsnew_1000.bug_fixes: From ea579f7e5069fb849b69e73f4246509b92d9a0e0 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 11 Dec 2019 17:18:43 +0000 Subject: [PATCH 3/9] Update doc/source/whatsnew/v1.0.0.rst Co-Authored-By: Joris Van den Bossche --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a6f10c5aa7b37..d1e22293487e7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -645,7 +645,7 @@ Performance improvements - Performance improvement when checking if values in a :class:`Categorical` are equal, equal or larger or larger than a given scalar. The improvement is not present if checking if the :class:`Categorical` is less than or less than or equal than the scalar (:issue:`29820`) - Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) -- Performance improvement in :func:`infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) +- Performance improvement in :func:`~pandas.api.types.infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) .. _whatsnew_1000.bug_fixes: From df558643a322f8aba667a59fe0ff0654c90f8776 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 11 Dec 2019 14:26:50 -0600 Subject: [PATCH 4/9] Add asv tests for skipna. --- asv_bench/benchmarks/dtypes.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 24cc1c6f9fa70..a3a8b1bab7499 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -7,6 +7,7 @@ extension_dtypes, numeric_dtypes, string_dtypes, + lib ) _numpy_dtypes = [ @@ -39,5 +40,26 @@ def time_pandas_dtype_invalid(self, dtype): except TypeError: pass +class InferDtypes: + params = _dtypes + param_names = ['dtype'] + data_dict = { + "np-object": np.array([1] * 1000, dtype='O'), + "py-object": [1] * 1000, + "np-null": np.array([1] * 500 + [np.nan] * 500), + "py-null": [1] * 500 + [None] * 500, + "np-int": np.array([1] * 1000, dtype=int), + "np-floating": np.array([1.0] * 1000, dtype=float), + "empty": [], + "bytes": [b'a'] * 1000, + } + params = list(data_dict.keys()) + + def time_infer_skipna(self, dtype): + lib.infer_dtype(dtype, skipna=True) + + def time_infer(self, dtype): + lib.infer_dtype(dtype, skipna=False) + from .pandas_vb_common import setup # noqa: F401 isort:skip From 94a968c16b0a83e9d5565a2dd86941d67677b0e9 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 11 Dec 2019 14:28:57 -0600 Subject: [PATCH 5/9] make pep8 happy. --- asv_bench/benchmarks/dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index a3a8b1bab7499..4bbb6b47f84b5 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -40,6 +40,7 @@ def time_pandas_dtype_invalid(self, dtype): except TypeError: pass + class InferDtypes: params = _dtypes param_names = ['dtype'] From 3ba2229ccad0926a34f883972b86840868407183 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Thu, 12 Dec 2019 09:19:47 -0600 Subject: [PATCH 6/9] make black happy. --- asv_bench/benchmarks/dtypes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 4bbb6b47f84b5..254bd43c740cd 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -7,7 +7,7 @@ extension_dtypes, numeric_dtypes, string_dtypes, - lib + lib, ) _numpy_dtypes = [ @@ -43,16 +43,16 @@ def time_pandas_dtype_invalid(self, dtype): class InferDtypes: params = _dtypes - param_names = ['dtype'] + param_names = ["dtype"] data_dict = { - "np-object": np.array([1] * 1000, dtype='O'), + "np-object": np.array([1] * 1000, dtype="O"), "py-object": [1] * 1000, "np-null": np.array([1] * 500 + [np.nan] * 500), "py-null": [1] * 500 + [None] * 500, "np-int": np.array([1] * 1000, dtype=int), "np-floating": np.array([1.0] * 1000, dtype=float), "empty": [], - "bytes": [b'a'] * 1000, + "bytes": [b"a"] * 1000, } params = list(data_dict.keys()) From f97fb1fb2bdd7afe54f20d0e96f5b1589604282a Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Thu, 12 Dec 2019 21:22:21 -0600 Subject: [PATCH 7/9] make isort happy. --- asv_bench/benchmarks/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 254bd43c740cd..2c6d06dbf8f2a 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -5,9 +5,9 @@ from .pandas_vb_common import ( datetime_dtypes, extension_dtypes, + lib, numeric_dtypes, string_dtypes, - lib, ) _numpy_dtypes = [ From a5efb8f7c51e66b9e5cfadaefaefa217d6d98b9b Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 16 Dec 2019 10:53:35 -0600 Subject: [PATCH 8/9] Make dtype inference test cases larger. --- asv_bench/benchmarks/dtypes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 2c6d06dbf8f2a..1b0c8dc487c9c 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -45,14 +45,14 @@ class InferDtypes: params = _dtypes param_names = ["dtype"] data_dict = { - "np-object": np.array([1] * 1000, dtype="O"), - "py-object": [1] * 1000, - "np-null": np.array([1] * 500 + [np.nan] * 500), - "py-null": [1] * 500 + [None] * 500, - "np-int": np.array([1] * 1000, dtype=int), - "np-floating": np.array([1.0] * 1000, dtype=float), + "np-object": np.array([1] * 100000, dtype="O"), + "py-object": [1] * 100000, + "np-null": np.array([1] * 50000 + [np.nan] * 50000), + "py-null": [1] * 50000 + [None] * 50000, + "np-int": np.array([1] * 100000, dtype=int), + "np-floating": np.array([1.0] * 100000, dtype=float), "empty": [], - "bytes": [b"a"] * 1000, + "bytes": [b"a"] * 100000, } params = list(data_dict.keys()) From 8d7ec5deb92ff045fc1935f75f98494b4216d41e Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 16 Dec 2019 18:15:54 -0600 Subject: [PATCH 9/9] Fix benchmark params. --- asv_bench/benchmarks/dtypes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 1b0c8dc487c9c..bd17b710b108d 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -42,7 +42,6 @@ def time_pandas_dtype_invalid(self, dtype): class InferDtypes: - params = _dtypes param_names = ["dtype"] data_dict = { "np-object": np.array([1] * 100000, dtype="O"), @@ -57,10 +56,10 @@ class InferDtypes: params = list(data_dict.keys()) def time_infer_skipna(self, dtype): - lib.infer_dtype(dtype, skipna=True) + lib.infer_dtype(self.data_dict[dtype], skipna=True) def time_infer(self, dtype): - lib.infer_dtype(dtype, skipna=False) + lib.infer_dtype(self.data_dict[dtype], skipna=False) from .pandas_vb_common import setup # noqa: F401 isort:skip