From 8f7452877a84236be8dcf9274064e14ae09a483b Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn"
Date: Tue, 29 Mar 2016 18:58:02 +0200
Subject: [PATCH 1/2] ARROW-89: [Python] Add benchmarks for Arrow<->Pandas conversion

---
 python/benchmarks/array.py | 40 +++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/python/benchmarks/array.py b/python/benchmarks/array.py
index 6ab73d18d1f87..13a15c27bfa01 100644
--- a/python/benchmarks/array.py
+++ b/python/benchmarks/array.py
@@ -15,22 +15,52 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pyarrow
+import numpy as np
+import pandas as pd
+import pyarrow as A
 
-class Conversions(object):
+
+class PyListConversions(object):
+    param_names = ('size',)
     params = (1, 10 ** 5, 10 ** 6, 10 ** 7)
 
+    def setup(self, n):
+        self.data = list(range(n))
+
     def time_from_pylist(self, n):
-        pyarrow.from_pylist(list(range(n)))
+        A.from_pylist(self.data)
 
     def peakmem_from_pylist(self, n):
-        pyarrow.from_pylist(list(range(n)))
+        A.from_pylist(self.data)
+
+
+class PandasConversions(object):
+    param_names = ('size', 'dtype')
+    params = ((1, 10 ** 5, 10 ** 6, 10 ** 7), ('int64', 'float64', 'str'))
+
+    def setup(self, n, dtype):
+        self.data = pd.DataFrame({'column': pd.Series(np.arange(n).astype(dtype))})
+        self.arrow_data = A.from_pandas_dataframe(self.data)
+
+    def time_from_series(self, n, dtype):
+        A.from_pandas_dataframe(self.data)
+
+    def peakmem_from_series(self, n, dtype):
+        A.from_pandas_dataframe(self.data)
+
+    def time_to_series(self, n, dtype):
+        self.arrow_data.to_pandas()
+
+    def peakmem_to_series(self, n, dtype):
+        self.arrow_data.to_pandas()
+
 
 class ScalarAccess(object):
+    param_names = ('size',)
     params = (1, 10 ** 5, 10 ** 6, 10 ** 7)
 
     def setUp(self, n):
-        self._array = pyarrow.from_pylist(list(range(n)))
+        self._array = A.from_pylist(list(range(n)))
 
     def time_as_py(self, n):
         for i in range(n):

From bd6a7cbfe9e78a99fedda7a931f615d536eb4796 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn"
Date: Wed, 30 Mar 2016 10:47:33 +0200
Subject: [PATCH 2/2] Split benchmarks and add one for a float64 column with NaNs

---
 python/benchmarks/array.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/python/benchmarks/array.py b/python/benchmarks/array.py
index 13a15c27bfa01..4268f0073f292 100644
--- a/python/benchmarks/array.py
+++ b/python/benchmarks/array.py
@@ -34,13 +34,19 @@ def peakmem_from_pylist(self, n):
         A.from_pylist(self.data)
 
 
-class PandasConversions(object):
-    param_names = ('size', 'dtype')
-    params = ((1, 10 ** 5, 10 ** 6, 10 ** 7), ('int64', 'float64', 'str'))
-
+class PandasConversionsBase(object):
     def setup(self, n, dtype):
-        self.data = pd.DataFrame({'column': pd.Series(np.arange(n).astype(dtype))})
-        self.arrow_data = A.from_pandas_dataframe(self.data)
+        if dtype == 'float64_nans':
+            arr = np.arange(n).astype('float64')
+            arr[arr % 10 == 0] = np.nan
+        else:
+            arr = np.arange(n).astype(dtype)
+        self.data = pd.DataFrame({'column': arr})
+
+
+class PandasConversionsToArrow(PandasConversionsBase):
+    param_names = ('size', 'dtype')
+    params = ((1, 10 ** 5, 10 ** 6, 10 ** 7), ('int64', 'float64', 'float64_nans', 'str'))
 
     def time_from_series(self, n, dtype):
         A.from_pandas_dataframe(self.data)
@@ -48,6 +54,15 @@ def time_from_series(self, n, dtype):
     def peakmem_from_series(self, n, dtype):
         A.from_pandas_dataframe(self.data)
 
+
+class PandasConversionsFromArrow(PandasConversionsBase):
+    param_names = ('size', 'dtype')
+    params = ((1, 10 ** 5, 10 ** 6, 10 ** 7), ('int64', 'float64', 'float64_nans', 'str'))
+
+    def setup(self, n, dtype):
+        super(PandasConversionsFromArrow, self).setup(n, dtype)
+        self.arrow_data = A.from_pandas_dataframe(self.data)
+
     def time_to_series(self, n, dtype):
         self.arrow_data.to_pandas()
 