[SPARK-27276][PYTHON][SQL] Increase minimum version of pyarrow to 0.12.1 and remove prior workarounds #24298
@@ -260,10 +260,14 @@ def __init__(self, timezone, safecheck, assign_cols_by_name):
         self._safecheck = safecheck
         self._assign_cols_by_name = assign_cols_by_name

-    def arrow_to_pandas(self, arrow_column, data_type):
-        from pyspark.sql.types import _arrow_column_to_pandas, _check_series_localize_timestamps
+    def arrow_to_pandas(self, arrow_column):
+        from pyspark.sql.types import _check_series_localize_timestamps

-        s = _arrow_column_to_pandas(arrow_column, data_type)
+        # If the given column is a date type column, creates a series of datetime.date directly
+        # instead of creating datetime64[ns] as intermediate data to avoid overflow caused by
+        # datetime64[ns] type handling.
+        s = arrow_column.to_pandas(date_as_object=True)
+
         s = _check_series_localize_timestamps(s, self._timezone)
         return s
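For reference, a minimal standalone sketch (illustrative data, assuming pyarrow >= 0.12.1 is installed) of why `to_pandas(date_as_object=True)` is used for date columns instead of going through datetime64[ns]:

```python
import datetime

import pyarrow as pa

# An Arrow date32 column containing an extreme date; the values are illustrative.
arr = pa.array([datetime.date(2019, 1, 1), None, datetime.date(9999, 12, 31)],
               type=pa.date32())

# date_as_object=True yields a Series of datetime.date objects, so the year-9999
# value never passes through datetime64[ns], whose range ends in the year 2262.
s = arr.to_pandas(date_as_object=True)
print(s)
```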
@@ -275,8 +279,6 @@ def _create_batch(self, series):
         :param series: A single pandas.Series, list of Series, or list of (series, arrow_type)
         :return: Arrow RecordBatch
         """
-        import decimal
-        from distutils.version import LooseVersion
         import pandas as pd
         import pyarrow as pa
         from pyspark.sql.types import _check_series_convert_timestamps_internal
@@ -289,24 +291,10 @@ def _create_batch(self, series):
         def create_array(s, t):
             mask = s.isnull()
             # Ensure timestamp series are in expected form for Spark internal representation
-            # TODO: maybe don't need None check anymore as of Arrow 0.9.1
             if t is not None and pa.types.is_timestamp(t):
                 s = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)
                 # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
Review comment on the TODO above: Do we still need the workaround?
Reply: I didn't want to change this since it was related to a pandas version, I can double-check though.
                 return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
-            elif t is not None and pa.types.is_string(t) and sys.version < '3':
-                # TODO: need decode before converting to Arrow in Python 2
-                # TODO: don't need as of Arrow 0.9.1
-                return pa.Array.from_pandas(s.apply(
-                    lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
-            elif t is not None and pa.types.is_decimal(t) and \
-                    LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
-                # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
-                return pa.Array.from_pandas(s.apply(
-                    lambda v: decimal.Decimal('NaN') if v is None else v), mask=mask, type=t)
-            elif LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
-                # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
-                return pa.Array.from_pandas(s, mask=mask, type=t)
-
             try:
                 array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
@@ -340,12 +328,7 @@ def create_array(s, t):
                                       for i, field in enumerate(t)]

                 struct_arrs, struct_names = zip(*arrs_names)
-
-                # TODO: from_arrays args switched for v0.9.0, remove when bump min pyarrow version
-                if LooseVersion(pa.__version__) < LooseVersion("0.9.0"):
-                    arrs.append(pa.StructArray.from_arrays(struct_names, struct_arrs))
-                else:
-                    arrs.append(pa.StructArray.from_arrays(struct_arrs, struct_names))
+                arrs.append(pa.StructArray.from_arrays(struct_arrs, struct_names))
             else:
                 arrs.append(create_array(s, t))
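For reference, a tiny sketch (the field names are made up) of the `StructArray.from_arrays` argument order that holds from pyarrow 0.9.0 onward, which is why the version check could be dropped:

```python
import pyarrow as pa

ids = pa.array([1, 2, 3])
labels = pa.array(["a", "b", "c"])

# pyarrow >= 0.9.0: child arrays come first, field names second.
struct = pa.StructArray.from_arrays([ids, labels], ["id", "label"])
print(struct.type)  # struct<id: int64, label: string>
```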
@@ -365,10 +348,8 @@ def load_stream(self, stream):
         """
         batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
         import pyarrow as pa
-        from pyspark.sql.types import from_arrow_type
         for batch in batches:
-            yield [self.arrow_to_pandas(c, from_arrow_type(c.type))
-                   for c in pa.Table.from_batches([batch]).itercolumns()]
+            yield [self.arrow_to_pandas(c) for c in pa.Table.from_batches([batch]).itercolumns()]

     def __repr__(self):
         return "ArrowStreamPandasSerializer"
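A minimal standalone sketch (toy data) of the per-column conversion pattern used in `load_stream` above, wrapping each record batch in a Table and converting column by column:

```python
import pandas as pd
import pyarrow as pa

# A toy record batch standing in for one batch read off the Arrow stream.
batch = pa.RecordBatch.from_pandas(pd.DataFrame({"x": [1, 2], "y": ["a", "b"]}))

# Wrap the batch in a Table and turn each column into a pandas Series.
table = pa.Table.from_batches([batch])
series = [col.to_pandas(date_as_object=True) for col in table.itercolumns()]
print([s.tolist() for s in series])
```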
@@ -384,17 +365,17 @@ def __init__(self, timezone, safecheck, assign_cols_by_name, df_for_struct=False
             .__init__(timezone, safecheck, assign_cols_by_name)
         self._df_for_struct = df_for_struct

-    def arrow_to_pandas(self, arrow_column, data_type):
-        from pyspark.sql.types import StructType, \
-            _arrow_column_to_pandas, _check_dataframe_localize_timestamps
+    def arrow_to_pandas(self, arrow_column):
+        import pyarrow.types as types

-        if self._df_for_struct and type(data_type) == StructType:
+        if self._df_for_struct and types.is_struct(arrow_column.type):
             import pandas as pd
-            series = [_arrow_column_to_pandas(column, field.dataType).rename(field.name)
-                      for column, field in zip(arrow_column.flatten(), data_type)]
-            s = _check_dataframe_localize_timestamps(pd.concat(series, axis=1), self._timezone)
+            series = [super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(column)
+                      .rename(field.name)
+                      for column, field in zip(arrow_column.flatten(), arrow_column.type)]
+            s = pd.concat(series, axis=1)
         else:
-            s = super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(arrow_column, data_type)
+            s = super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(arrow_column)
         return s

     def dump_stream(self, iterator, stream):
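For context, a small sketch (the struct field names are illustrative) of the flatten-and-rename pattern the new struct branch uses to build a DataFrame from a struct column:

```python
import pandas as pd
import pyarrow as pa
import pyarrow.types as types

# A toy struct column; flatten() yields one child array per struct field.
struct_col = pa.array([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])

if types.is_struct(struct_col.type):
    series = [child.to_pandas().rename(field.name)
              for child, field in zip(struct_col.flatten(), struct_col.type)]
    df = pd.concat(series, axis=1)
    print(df)
```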
Review comment: Do we still need to use `mask`?
Reply: I'm not sure, I'll check.
Reply: Yes, it's needed to correctly insert NULL values in timestamps, since there is a `fillna(0)` done on the series.
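To illustrate that point about `mask` (a standalone sketch with made-up data, not the serializer code itself): once nulls have been filled with a placeholder, only the mask tells Arrow which entries should come back as NULL.

```python
import numpy as np
import pandas as pd
import pyarrow as pa

# Pretend the middle value was originally null and has already been filled,
# as the serializer does with fillna(0) before the timezone conversion.
filled = pd.Series(pd.to_datetime(["2019-01-01", "1970-01-01", "2019-01-03"]))
mask = np.array([False, True, False])  # True marks the originally-null slot

with_mask = pa.Array.from_pandas(filled, mask=mask, type=pa.timestamp("us"))
without_mask = pa.Array.from_pandas(filled, type=pa.timestamp("us"))
print(with_mask.null_count, without_mask.null_count)  # 1 vs 0: the mask restores the NULL
```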