Upgrade Docker image and pyarrow in it #677

Merged (3 commits) on May 1, 2020
14 changes: 7 additions & 7 deletions .circleci/config.yml
@@ -2,7 +2,7 @@ version: 2

defaults: &defaults
docker:
- image: palantirtechnologies/circle-spark-base:0.1.3
- image: palantirtechnologies/circle-spark-base:0.2.2
resource_class: xlarge
environment: &defaults-environment
TERM: dumb
@@ -129,7 +129,7 @@ jobs:
<<: *defaults
# Some part of the maven setup fails if there's no R, so we need to use the R image here
docker:
- image: palantirtechnologies/circle-spark-r:0.1.3
- image: palantirtechnologies/circle-spark-r:0.2.2
steps:
# Saves us from recompiling every time...
- restore_cache:
@@ -296,7 +296,7 @@ jobs:
# depends on build-sbt, but we only need the assembly jars
<<: *defaults
docker:
- image: palantirtechnologies/circle-spark-python:0.1.3
- image: palantirtechnologies/circle-spark-python:0.2.2
parallelism: 2
steps:
- *checkout-code
@@ -321,7 +321,7 @@
# depends on build-sbt, but we only need the assembly jars
<<: *defaults
docker:
- image: palantirtechnologies/circle-spark-r:0.1.3
- image: palantirtechnologies/circle-spark-r:0.2.2
steps:
- *checkout-code
- attach_workspace:
@@ -434,7 +434,7 @@ jobs:
<<: *defaults
# Some part of the maven setup fails if there's no R, so we need to use the R image here
docker:
- image: palantirtechnologies/circle-spark-r:0.1.3
- image: palantirtechnologies/circle-spark-r:0.2.2
steps:
- *checkout-code
- restore_cache:
@@ -454,7 +454,7 @@
deploy-gradle:
<<: *defaults
docker:
- image: palantirtechnologies/circle-spark-r:0.1.3
- image: palantirtechnologies/circle-spark-r:0.2.2
steps:
- *checkout-code
- *restore-gradle-wrapper-cache
@@ -466,7 +466,7 @@
<<: *defaults
# Some part of the maven setup fails if there's no R, so we need to use the R image here
docker:
- image: palantirtechnologies/circle-spark-r:0.1.3
- image: palantirtechnologies/circle-spark-r:0.2.2
steps:
# This cache contains the whole project after version was set and mvn package was called
# Restoring first (and instead of checkout) as mvn versions:set mutates real source code...
7 changes: 4 additions & 3 deletions dev/docker-images/Makefile
@@ -17,9 +17,10 @@

.PHONY: all publish base python r

BASE_IMAGE_NAME = palantirtechnologies/circle-spark-base:0.1.3
PYTHON_IMAGE_NAME = palantirtechnologies/circle-spark-python:0.1.3
R_IMAGE_NAME = palantirtechnologies/circle-spark-r:0.1.3
VERSION=0.2.2
BASE_IMAGE_NAME = "palantirtechnologies/circle-spark-base:${VERSION}"
PYTHON_IMAGE_NAME = "palantirtechnologies/circle-spark-python:${VERSION}"
R_IMAGE_NAME = "palantirtechnologies/circle-spark-r:${VERSION}"

all: base python r

16 changes: 10 additions & 6 deletions dev/docker-images/base/Dockerfile
@@ -15,7 +15,7 @@
# limitations under the License.
#

FROM buildpack-deps:cosmic
FROM buildpack-deps:20.04

# make Apt non-interactive
RUN echo 'APT::Get::Assume-Yes "true";' > /etc/apt/apt.conf.d/90circleci \
@@ -28,12 +28,15 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN mkdir -p /usr/share/man/man1 \
&& apt-get update \
&& apt-get install -y \
git \
git python2 \
locales sudo openssh-client ca-certificates tar gzip parallel \
net-tools netcat unzip zip bzip2 gnupg curl wget \
openjdk-8-jdk rsync pandoc pandoc-citeproc flake8 tzdata \
&& rm -rf /var/lib/apt/lists/*

# Make python command default to python2
RUN sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 0

# If you update java, make sure this aligns
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

@@ -59,13 +62,13 @@ RUN JQ_URL="https://circle-downloads.s3.amazonaws.com/circleci-images/cache/linu
# The output looks like this:

#> # To install, run the following commands as root:
#> curl -fsSLO https://download.docker.com/linux/static/stable/x86_64/docker-17.05.0-ce.tgz && tar --strip-components=1 -xvzf docker-17.05.0-ce.tgz -C /usr/local/bin
#> curl -fsSLO https://download.docker.com/linux/static/stable/x86_64/docker-17.05.0.tgz && tar --strip-components=1 -xvzf docker-17.05.0.tgz -C /usr/local/bin
#>
#> # Then start docker in daemon mode:
#> /usr/local/bin/dockerd

RUN set -ex \
&& export DOCKER_VERSION=$(curl --silent --fail --retry 3 https://download.docker.com/linux/static/stable/x86_64/ | grep -o -e 'docker-[.0-9]*-ce\.tgz' | sort -r | head -n 1) \
&& export DOCKER_VERSION=$(curl --silent --fail --retry 3 https://download.docker.com/linux/static/stable/x86_64/ | grep -o -e 'docker-[.0-9]*\.tgz' | sort -r | head -n 1) \
&& DOCKER_URL="https://download.docker.com/linux/static/stable/x86_64/${DOCKER_VERSION}" \
&& echo Docker URL: $DOCKER_URL \
&& curl --silent --show-error --location --fail --retry 3 --output /tmp/docker.tgz "${DOCKER_URL}" \
@@ -109,9 +112,10 @@ WORKDIR $CIRCLE_HOME
ENV CONDA_ROOT=$CIRCLE_HOME/miniconda
ENV CONDA_BIN=$CIRCLE_HOME/miniconda/bin/conda
ENV MINICONDA2_VERSION=4.5.11
RUN curl -sO https://repo.continuum.io/miniconda/Miniconda2-${MINICONDA2_VERSION}-Linux-x86_64.sh \

RUN curl -sO https://repo.anaconda.com/miniconda/Miniconda2-${MINICONDA2_VERSION}-Linux-x86_64.sh \
&& bash Miniconda2-${MINICONDA2_VERSION}-Linux-x86_64.sh -b -p ${CONDA_ROOT} \
&& $CONDA_BIN clean --all \
&& $CONDA_BIN clean --all --yes \
&& sudo mkdir -m 777 /home/.conda \
&& rm -f Miniconda2-${MINICONDA2_VERSION}-Linux-x86_64.sh
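The DOCKER_VERSION grep above was loosened because newer static Docker builds on download.docker.com are published as docker-<version>.tgz, without the old "-ce" suffix, so the previous pattern would only ever find legacy releases. A standalone illustration of the two patterns (the listing entries are assumed examples, not output captured from the site):

```python
import re

# Patterns from the RUN step above that discovers the newest static Docker build.
OLD_PATTERN = r"docker-[.0-9]*-ce\.tgz$"
NEW_PATTERN = r"docker-[.0-9]*\.tgz$"

# Assumed examples of entries in the download.docker.com directory listing.
listing = ["docker-17.05.0-ce.tgz", "docker-18.09.9.tgz", "docker-19.03.8.tgz"]

print([f for f in listing if re.match(OLD_PATTERN, f)])
# ['docker-17.05.0-ce.tgz']  -- only the legacy "-ce" names match
print([f for f in listing if re.match(NEW_PATTERN, f)])
# ['docker-18.09.9.tgz', 'docker-19.03.8.tgz']  -- current names carry no "-ce"
```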

8 changes: 4 additions & 4 deletions dev/docker-images/python/Dockerfile
@@ -27,8 +27,8 @@ RUN curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-instal
# A version I've tested earlier that I know it breaks with is 1.14.1
RUN mkdir -p $(pyenv root)/versions \
&& ln -s $CONDA_ROOT $(pyenv root)/versions/our-miniconda \
&& $CONDA_BIN create -y -n python2 -c anaconda -c conda-forge python==2.7.15 numpy=1.14.0 pyarrow==0.8.0 pandas nomkl \
&& $CONDA_BIN create -y -n python3 -c anaconda -c conda-forge python=3.6 numpy=1.14.0 pyarrow==0.8.0 pandas nomkl \
&& $CONDA_BIN create -y -n python2 -c anaconda -c conda-forge python==2.7.15 numpy=1.14.0 pyarrow==0.12.1 pandas nomkl \
&& $CONDA_BIN create -y -n python3 -c anaconda -c conda-forge python=3.6 numpy=1.14.0 pyarrow==0.12.1 pandas nomkl \
&& $CONDA_BIN clean --all

RUN pyenv global our-miniconda/envs/python2 our-miniconda/envs/python3 \
Expand All @@ -37,5 +37,5 @@ RUN pyenv global our-miniconda/envs/python2 our-miniconda/envs/python3 \
# Expose pyenv globally
ENV PATH=$CIRCLE_HOME/.pyenv/shims:$PATH

RUN PYENV_VERSION=our-miniconda/envs/python2 $CIRCLE_HOME/.pyenv/shims/pip install unishark unittest-xml-reporting \
&& PYENV_VERSION=our-miniconda/envs/python3 $CIRCLE_HOME/.pyenv/shims/pip install unishark unittest-xml-reporting
RUN PYENV_VERSION=our-miniconda/envs/python2 $CIRCLE_HOME/.pyenv/shims/pip install unishark "unittest-xml-reporting<3"
RUN PYENV_VERSION=our-miniconda/envs/python3 $CIRCLE_HOME/.pyenv/shims/pip install unishark unittest-xml-reporting
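Both conda environments now pin pyarrow 0.12.1 (up from 0.8.0), and the Python 2 environment caps unittest-xml-reporting below 3, whose 3.x line no longer supports Python 2. As a rough, standalone sketch of the kind of version gate the Arrow tests depend on, using the same LooseVersion comparison that test_arrow.py imports. The 0.12.1 floor below is an assumption mirroring the image pin, not a value taken from PySpark:

```python
from distutils.version import LooseVersion

import pyarrow as pa

# Assumed floor chosen to mirror the image pin above; PySpark's own required
# minimum may differ, so treat this purely as an illustration.
MINIMUM_PYARROW = "0.12.1"

if LooseVersion(pa.__version__) < LooseVersion(MINIMUM_PYARROW):
    raise ImportError("pyarrow >= %s is required for Arrow-based conversions, "
                      "found %s" % (MINIMUM_PYARROW, pa.__version__))
```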
2 changes: 1 addition & 1 deletion dev/run-tests-jenkins.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion dev/test_functions.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
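The two dev scripts above drop the explicit python2 shebang in favor of plain python, which in the rebuilt base image resolves to Python 2 through the update-alternatives entry added there. A hypothetical guard (not part of this PR) that such a script could use to fail fast if that assumption ever breaks:

```python
#!/usr/bin/env python
import sys

# Hypothetical guard, not part of this PR: the dev scripts still assume
# Python 2 semantics, so fail fast if `python` resolves to something else.
if sys.version_info[0] != 2:
    sys.exit("Expected the `python` alternative to resolve to Python 2, "
             "found %d.%d" % sys.version_info[:2])
```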
44 changes: 18 additions & 26 deletions python/pyspark/sql/tests/test_arrow.py
@@ -30,6 +30,13 @@
from pyspark.testing.utils import QuietTest
from pyspark.util import _exception_message

if have_pandas:
import pandas as pd
from pandas.util.testing import assert_frame_equal

if have_pyarrow:
import pyarrow as pa


@unittest.skipIf(
not have_pandas or not have_pyarrow,
@@ -41,7 +48,6 @@ def setUpClass(cls):
from datetime import date, datetime
from decimal import Decimal
from distutils.version import LooseVersion
import pyarrow as pa
super(ArrowTests, cls).setUpClass()
cls.warnings_lock = threading.Lock()

@@ -90,7 +96,6 @@ def tearDownClass(cls):
super(ArrowTests, cls).tearDownClass()

def create_pandas_data_frame(self):
import pandas as pd
import numpy as np
data_dict = {}
for j, name in enumerate(self.schema.names):
@@ -101,8 +106,6 @@ def create_pandas_data_frame(self):
return pd.DataFrame(data=data_dict)

def test_toPandas_fallback_enabled(self):
import pandas as pd

with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}):
schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
df = self.spark.createDataFrame([({u'a': 1},)], schema=schema)
@@ -118,11 +121,10 @@ def test_toPandas_fallback_enabled(self):
self.assertTrue(len(user_warns) > 0)
self.assertTrue(
"Attempting non-optimization" in _exception_message(user_warns[-1]))
self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))
assert_frame_equal(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))

def test_toPandas_fallback_disabled(self):
from distutils.version import LooseVersion
import pyarrow as pa

schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
df = self.spark.createDataFrame([(None,)], schema=schema)
@@ -158,8 +160,8 @@ def test_toPandas_arrow_toggle(self):
df = self.spark.createDataFrame(self.data, schema=self.schema)
pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
expected = self.create_pandas_data_frame()
self.assertPandasEqual(expected, pdf)
self.assertPandasEqual(expected, pdf_arrow)
assert_frame_equal(expected, pdf)
assert_frame_equal(expected, pdf_arrow)

def test_toPandas_respect_session_timezone(self):
df = self.spark.createDataFrame(self.data, schema=self.schema)
@@ -169,13 +171,13 @@ def test_toPandas_respect_session_timezone(self):
"spark.sql.execution.pandas.respectSessionTimeZone": False,
"spark.sql.session.timeZone": timezone}):
pdf_la, pdf_arrow_la = self._toPandas_arrow_toggle(df)
self.assertPandasEqual(pdf_arrow_la, pdf_la)
assert_frame_equal(pdf_arrow_la, pdf_la)

with self.sql_conf({
"spark.sql.execution.pandas.respectSessionTimeZone": True,
"spark.sql.session.timeZone": timezone}):
pdf_ny, pdf_arrow_ny = self._toPandas_arrow_toggle(df)
self.assertPandasEqual(pdf_arrow_ny, pdf_ny)
assert_frame_equal(pdf_arrow_ny, pdf_ny)

self.assertFalse(pdf_ny.equals(pdf_la))

@@ -185,13 +187,13 @@ def test_toPandas_respect_session_timezone(self):
if isinstance(field.dataType, TimestampType):
pdf_la_corrected[field.name] = _check_series_convert_timestamps_local_tz(
pdf_la_corrected[field.name], timezone)
self.assertPandasEqual(pdf_ny, pdf_la_corrected)
assert_frame_equal(pdf_ny, pdf_la_corrected)

def test_pandas_round_trip(self):
pdf = self.create_pandas_data_frame()
df = self.spark.createDataFrame(self.data, schema=self.schema)
pdf_arrow = df.toPandas()
self.assertPandasEqual(pdf_arrow, pdf)
assert_frame_equal(pdf_arrow, pdf)

def test_filtered_frame(self):
df = self.spark.range(3).toDF("i")
@@ -265,7 +267,7 @@ def test_createDataFrame_with_schema(self):
df = self.spark.createDataFrame(pdf, schema=self.schema)
self.assertEquals(self.schema, df.schema)
pdf_arrow = df.toPandas()
self.assertPandasEqual(pdf_arrow, pdf)
assert_frame_equal(pdf_arrow, pdf)

def test_createDataFrame_with_incorrect_schema(self):
pdf = self.create_pandas_data_frame()
@@ -287,7 +289,6 @@ def test_createDataFrame_with_names(self):
self.assertEquals(df.schema.fieldNames(), new_names)

def test_createDataFrame_column_name_encoding(self):
import pandas as pd
pdf = pd.DataFrame({u'a': [1]})
columns = self.spark.createDataFrame(pdf).columns
self.assertTrue(isinstance(columns[0], str))
@@ -297,13 +298,11 @@ def test_createDataFrame_column_name_encoding(self):
self.assertEquals(columns[0], 'b')

def test_createDataFrame_with_single_data_type(self):
import pandas as pd
with QuietTest(self.sc):
with self.assertRaisesRegexp(ValueError, ".*IntegerType.*not supported.*"):
self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int")

def test_createDataFrame_does_not_modify_input(self):
import pandas as pd
# Some series get converted for Spark to consume, this makes sure input is unchanged
pdf = self.create_pandas_data_frame()
# Use a nanosecond value to make sure it is not truncated
@@ -321,7 +320,6 @@ def test_schema_conversion_roundtrip(self):
self.assertEquals(self.schema, schema_rt)

def test_createDataFrame_with_array_type(self):
import pandas as pd
pdf = pd.DataFrame({"a": [[1, 2], [3, 4]], "b": [[u"x", u"y"], [u"y", u"z"]]})
df, df_arrow = self._createDataFrame_toggle(pdf)
result = df.collect()
@@ -347,16 +345,13 @@ def test_toPandas_with_array_type(self):

def test_createDataFrame_with_int_col_names(self):
import numpy as np
import pandas as pd
pdf = pd.DataFrame(np.random.rand(4, 2))
df, df_arrow = self._createDataFrame_toggle(pdf)
pdf_col_names = [str(c) for c in pdf.columns]
self.assertEqual(pdf_col_names, df.columns)
self.assertEqual(pdf_col_names, df_arrow.columns)

def test_createDataFrame_fallback_enabled(self):
import pandas as pd

with QuietTest(self.sc):
with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}):
with warnings.catch_warnings(record=True) as warns:
@@ -374,8 +369,6 @@ def test_createDataFrame_fallback_enabled(self):

def test_createDataFrame_fallback_disabled(self):
from distutils.version import LooseVersion
import pandas as pd
import pyarrow as pa

with QuietTest(self.sc):
with self.assertRaisesRegexp(TypeError, 'Unsupported type'):
@@ -391,7 +384,6 @@ def test_createDataFrame_fallback_disabled(self):

# Regression test for SPARK-23314
def test_timestamp_dst(self):
import pandas as pd
# Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am
dt = [datetime.datetime(2015, 11, 1, 0, 30),
datetime.datetime(2015, 11, 1, 1, 30),
@@ -401,8 +393,8 @@ def test_timestamp_dst(self):
df_from_python = self.spark.createDataFrame(dt, 'timestamp').toDF('time')
df_from_pandas = self.spark.createDataFrame(pdf)

self.assertPandasEqual(pdf, df_from_python.toPandas())
self.assertPandasEqual(pdf, df_from_pandas.toPandas())
assert_frame_equal(pdf, df_from_python.toPandas())
assert_frame_equal(pdf, df_from_pandas.toPandas())

def test_toPandas_batch_order(self):

@@ -418,7 +410,7 @@ def run_test(num_records, num_parts, max_records, use_delay=False):
df = df.rdd.mapPartitionsWithIndex(delay_first_part).toDF()
with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": max_records}):
pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
self.assertPandasEqual(pdf, pdf_arrow)
assert_frame_equal(pdf, pdf_arrow)

cases = [
(1024, 512, 2), # Use large num partitions for more likely collecting out of order
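Throughout test_arrow.py the custom self.assertPandasEqual helper is replaced by pandas' own assert_frame_equal, imported once at module level under the have_pandas guard. A standalone sketch of how that helper behaves; the frames below are made-up examples, not data from the test suite:

```python
import pandas as pd
from pandas.util.testing import assert_frame_equal  # same import the tests now use

expected = pd.DataFrame({"id": [1, 2], "value": [0.1, 0.2]})
actual = expected.copy()

assert_frame_equal(expected, actual)  # silent when the frames match

try:
    assert_frame_equal(expected, actual.assign(value=[0.1, 0.3]))
except AssertionError as exc:
    # Names the differing column and values, unlike a bare equality check.
    print(exc)
```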
7 changes: 4 additions & 3 deletions python/pyspark/sql/tests/test_dataframe.py
@@ -581,14 +581,15 @@ def test_create_dataframe_required_pandas_not_found(self):

# Regression test for SPARK-23360
@unittest.skipIf(not have_pandas, pandas_requirement_message)
def test_create_dateframe_from_pandas_with_dst(self):
def test_create_dataframe_from_pandas_with_dst(self):
import pandas as pd
from pandas.util.testing import assert_frame_equal
from datetime import datetime

pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]})

df = self.spark.createDataFrame(pdf)
self.assertPandasEqual(pdf, df.toPandas())
assert_frame_equal(pdf, df.toPandas())

orig_env_tz = os.environ.get('TZ', None)
try:
@@ -597,7 +598,7 @@ def test_create_dateframe_from_pandas_with_dst(self):
time.tzset()
with self.sql_conf({'spark.sql.session.timeZone': tz}):
df = self.spark.createDataFrame(pdf)
self.assertPandasEqual(pdf, df.toPandas())
assert_frame_equal(pdf, df.toPandas())
finally:
del os.environ['TZ']
if orig_env_tz is not None:
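The test_dataframe.py changes make the same switch to assert_frame_equal and fix the test_create_dateframe_from_pandas_with_dst typo in the test name. The test flips the process time zone with os.environ['TZ'] and time.tzset() to exercise daylight-saving handling; the standalone sketch below (Unix-only, zones picked purely for illustration) shows why that matters: the same naive wall-clock time maps to different epoch instants once the process time zone changes.

```python
import os
import time
from datetime import datetime

def epoch_of(naive_dt):
    # time.mktime interprets a naive struct_time in the *process* time zone,
    # which is exactly what os.environ['TZ'] plus time.tzset() switches.
    return time.mktime(naive_dt.timetuple())

naive = datetime(2015, 10, 31, 22, 30)  # same wall-clock value as the test data

os.environ["TZ"] = "America/Los_Angeles"
time.tzset()
la_epoch = epoch_of(naive)

os.environ["TZ"] = "America/New_York"
time.tzset()
ny_epoch = epoch_of(naive)

print(ny_epoch - la_epoch)  # -10800.0: the New York instant is three hours earlier
```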