Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 6 additions & 47 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,12 @@
else:
import unittest

_have_pandas = False
_have_old_pandas = False
try:
import pandas
try:
from pyspark.sql.utils import require_minimum_pandas_version
require_minimum_pandas_version()
_have_pandas = True
except:
_have_old_pandas = True
except:
# No Pandas, but that's okay, we'll skip those tests
pass
# Make sure pandas and pyarrow are installed and satisfy our minimum version requirements during tests.
from pyspark.sql.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
import pandas
require_minimum_pandas_version()
import pyarrow
require_minimum_pyarrow_version()

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row
Expand All @@ -75,15 +68,6 @@
from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException


_have_arrow = False
try:
import pyarrow
_have_arrow = True
except:
# No Arrow, but that's okay, we'll skip those tests
pass


class UTCOffsetTimezone(datetime.tzinfo):
"""
Specifies timezone in UTC offset
Expand Down Expand Up @@ -2807,7 +2791,6 @@ def _to_pandas(self):
df = self.spark.createDataFrame(data, schema)
return df.toPandas()

@unittest.skipIf(not _have_pandas, "Pandas not installed")
def test_to_pandas(self):
import numpy as np
pdf = self._to_pandas()
Expand All @@ -2819,13 +2802,6 @@ def test_to_pandas(self):
self.assertEquals(types[4], 'datetime64[ns]')
self.assertEquals(types[5], 'datetime64[ns]')

@unittest.skipIf(not _have_old_pandas, "Old Pandas not installed")
def test_to_pandas_old(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no idea how to test this in Jenkins, since Jenkins should always have pandas and pyarrow installed at the required versions. I think we can only test it manually.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jenkins does indeed have pandas and pyarrow installed, btw.

with QuietTest(self.sc):
with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'):
self._to_pandas()

@unittest.skipIf(not _have_pandas, "Pandas not installed")
def test_to_pandas_avoid_astype(self):
import numpy as np
schema = StructType().add("a", IntegerType()).add("b", StringType())\
Expand All @@ -2843,7 +2819,6 @@ def test_create_dataframe_from_array_of_long(self):
df = self.spark.createDataFrame(data)
self.assertEqual(df.first(), Row(longarray=[-9223372036854775808, 0, 9223372036854775807]))

@unittest.skipIf(not _have_pandas, "Pandas not installed")
def test_create_dataframe_from_pandas_with_timestamp(self):
import pandas as pd
from datetime import datetime
Expand All @@ -2858,16 +2833,6 @@ def test_create_dataframe_from_pandas_with_timestamp(self):
self.assertTrue(isinstance(df.schema['ts'].dataType, TimestampType))
self.assertTrue(isinstance(df.schema['d'].dataType, DateType))

@unittest.skipIf(not _have_old_pandas, "Old Pandas not installed")
def test_create_dataframe_from_old_pandas(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto: as with test_to_pandas_old, I think this can only be tested manually.

import pandas as pd
from datetime import datetime
pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)],
"d": [pd.Timestamp.now().date()]})
with QuietTest(self.sc):
with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'):
self.spark.createDataFrame(pdf)


class HiveSparkSubmitTests(SparkSubmitTests):

Expand Down Expand Up @@ -3132,7 +3097,6 @@ def test_datetime_functions(self):
parse_result = df.select(functions.to_date(functions.col("dateCol"))).first()
self.assertEquals(date(2017, 1, 22), parse_result['to_date(`dateCol`)'])

@unittest.skipIf(sys.version_info < (3, 3), "Unittest < 3.3 doesn't support mocking")
def test_unbounded_frames(self):
from unittest.mock import patch
from pyspark.sql import functions as F
Expand Down Expand Up @@ -3357,7 +3321,6 @@ def __init__(self, **kwargs):
_make_type_verifier(data_type, nullable=False)(obj)


@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
class ArrowTests(ReusedSQLTestCase):

@classmethod
Expand Down Expand Up @@ -3613,7 +3576,6 @@ def test_createDataFrame_with_int_col_names(self):
self.assertEqual(pdf_col_names, df_arrow.columns)


@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
class PandasUDFTests(ReusedSQLTestCase):
def test_pandas_udf_basic(self):
from pyspark.rdd import PythonEvalType
Expand Down Expand Up @@ -3737,7 +3699,6 @@ def foo(k, v):
return k


@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
class ScalarPandasUDF(ReusedSQLTestCase):

@classmethod
Expand Down Expand Up @@ -4216,7 +4177,6 @@ def test_register_vectorized_udf_basic(self):
self.assertEquals(expected.collect(), res2.collect())


@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
class GroupbyApplyPandasUDFTests(ReusedSQLTestCase):

@property
Expand Down Expand Up @@ -4385,7 +4345,6 @@ def test_unsupported_types(self):
df.groupby('id').apply(f).collect()


@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
class GroupbyAggPandasUDFTests(ReusedSQLTestCase):

@property
Expand Down