diff --git a/dev/tox.ini b/dev/tox.ini
index 8a8e03ec9be3e..721266ec2bfa4 100644
--- a/dev/tox.ini
+++ b/dev/tox.ini
@@ -18,9 +18,6 @@ ignore =
     E203,  # Skip as black formatter adds a whitespace around ':'.
     E402,  # Module top level import is disabled for optional import check, etc.
-    # 1. Type hints with def are treated as redefinition (e.g., functions.log).
-    # 2. Some are used for testing.
-    F811,
     # There are too many instances to fix. Ignored for now.
     W503,
     W504,
diff --git a/python/pyspark/logger/tests/test_logger.py b/python/pyspark/logger/tests/test_logger.py
index 96d707eb34a2b..69b28d2308a72 100644
--- a/python/pyspark/logger/tests/test_logger.py
+++ b/python/pyspark/logger/tests/test_logger.py
@@ -215,7 +215,6 @@ class LoggerTests(LoggerTestsMixin, ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.logger.tests.test_logger import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 9072e88ca29fb..06759bc252696 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -50,7 +50,6 @@
     from py4j.java_gateway import JavaGateway, JavaObject
     from pyspark.ml._typing import PipelineStage
     from pyspark.ml.base import Params
-    from pyspark.ml.wrapper import JavaWrapper
     from pyspark.core.context import SparkContext
     from pyspark.sql import DataFrame
     from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index bf8fd04dc2837..84b59eecbe9d5 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -34,7 +34,6 @@
 )
 from pyspark.mllib.util import Saveable, Loader, inherit_doc
 from pyspark.mllib.linalg import Vector
-from pyspark.mllib.regression import LabeledPoint
 
 if TYPE_CHECKING:
     from pyspark.mllib._typing import VectorLike
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 915a55595cb53..0d988dcb065a5 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -36,7 +36,6 @@
 
 if TYPE_CHECKING:
     from pyspark.mllib._typing import VectorLike
-    from py4j.java_collections import JavaMap
 
 __all__ = [
     "Normalizer",
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 87f05bc0979b8..b4bd414927066 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -32,7 +32,7 @@
 
 import numpy as np
 
-from pyspark import RDD, since
+from pyspark import since
 from pyspark.streaming.dstream import DStream
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index b24bced3ced6e..9e945d8a1b947 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -18,7 +18,7 @@
 import sys
 import random
 
-from pyspark import RDD, since
+from pyspark import since
 from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
 from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 5572d9ca85556..caa2c9338a959 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -20,7 +20,7 @@
 
 import numpy as np
 
-from pyspark import SparkContext, since
+from pyspark import since
 from pyspark.mllib.common import callMLlibFunc, inherit_doc
 from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
 from pyspark.sql import DataFrame
@@ -28,7 +28,6 @@
     from pyspark.core.context import SparkContext
     from pyspark.mllib.linalg import Vector
     from pyspark.core.rdd import RDD
-    from pyspark.sql.dataframe import DataFrame
 
 T = TypeVar("T")
 L = TypeVar("L", bound="Loader")
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py b/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
index 1bd274b45a742..5421de9f33715 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
@@ -124,7 +124,6 @@ class DiffFramesCorrWithTests(
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.diff_frames_ops.test_corrwith import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py b/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
index 7a94e1858f09e..14292b6c32802 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
@@ -91,7 +91,6 @@ class DiffFramesDotFrameTests(DiffFramesDotFrameMixin, PandasOnSparkTestCase, SQ
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.diff_frames_ops.test_dot_frame import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index acd80378333e8..7d44bd426da7d 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -402,7 +402,6 @@ class CategoricalIndexTests(CategoricalIndexTestsMixin, PandasOnSparkTestCase, T
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.indexes.test_category import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py
index 400bb0e2a365e..037d784840aef 100644
--- a/python/pyspark/pandas/tests/indexes/test_timedelta.py
+++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py
@@ -115,7 +115,6 @@ class TimedeltaIndexTests(
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.indexes.test_timedelta import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index b0e4c69a35eac..f03167d2ad729 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -226,7 +226,6 @@ class SeriesStringOpsAdvTests(
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.series.test_string_ops_adv import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index 141c6873d7f59..145ac2dcb9f3f 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -683,7 +683,6 @@ class NamespaceTests(NamespaceTestsMixin, PandasOnSparkTestCase, SQLTestUtils):
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.test_namespace import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/test_numpy_compat.py b/python/pyspark/pandas/tests/test_numpy_compat.py
index f754ee08a7835..d961a433e181d 100644
--- a/python/pyspark/pandas/tests/test_numpy_compat.py
+++ b/python/pyspark/pandas/tests/test_numpy_compat.py
@@ -198,7 +198,6 @@ class NumPyCompatTests(
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.pandas.tests.test_numpy_compat import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/pandas/tests/test_utils.py b/python/pyspark/pandas/tests/test_utils.py
index 6286df8e54690..89efb85dd61a7 100644
--- a/python/pyspark/pandas/tests/test_utils.py
+++ b/python/pyspark/pandas/tests/test_utils.py
@@ -181,7 +181,7 @@ def test_dataframe_error_assert_pandas_almost_equal(self):
             },
         )
 
-    def test_series_error_assert_pandas_equal(self):
+    def test_series_error_assert_pandas_almost_equal_2(self):
         series1 = pd.Series([1, 2, 3])
         series2 = pd.Series([4, 5, 6])
 
diff --git a/python/pyspark/pipelines/tests/test_block_connect_access.py b/python/pyspark/pipelines/tests/test_block_connect_access.py
index 1ad1881b7127a..60688f30bfb9d 100644
--- a/python/pyspark/pipelines/tests/test_block_connect_access.py
+++ b/python/pyspark/pipelines/tests/test_block_connect_access.py
@@ -17,7 +17,6 @@
 import unittest
 
 from pyspark.errors import PySparkException
-from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.connectutils import (
     ReusedConnectTestCase,
     should_test_connect,
diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 65dea5cf1a579..8de8663693f7e 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -19,7 +19,6 @@
 from pyspark.errors.exceptions.base import (
     SessionNotSameException,
     PySparkIndexError,
-    PySparkAttributeError,
 )
 from pyspark.resource import ResourceProfile
 from pyspark.sql.connect.logging import logger
diff --git a/python/pyspark/sql/connect/group.py b/python/pyspark/sql/connect/group.py
index d540e721f149e..afaf286a7e9c8 100644
--- a/python/pyspark/sql/connect/group.py
+++ b/python/pyspark/sql/connect/group.py
@@ -55,7 +55,6 @@
         PandasGroupedMapFunctionWithState,
     )
     from pyspark.sql.connect.dataframe import DataFrame
-    from pyspark.sql.types import StructType
 
 
 class GroupedData:
diff --git a/python/pyspark/sql/connect/udf.py b/python/pyspark/sql/connect/udf.py
index 3bc2f87e4bd3f..64506731badbe 100644
--- a/python/pyspark/sql/connect/udf.py
+++ b/python/pyspark/sql/connect/udf.py
@@ -50,7 +50,6 @@
         UserDefinedFunctionLike,
     )
     from pyspark.sql.connect.session import SparkSession
-    from pyspark.sql.types import StringType
 
 
 def _create_py_udf(
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 17eda26f76e18..04b800be23723 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -3061,56 +3061,6 @@ def floor(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Co
     return _invoke_function_over_columns("floor", col, scale)  # type: ignore[arg-type]
 
 
-@_try_remote_functions
-def log(col: "ColumnOrName") -> Column:
-    """
-    Computes the natural logarithm of the given value.
-
-    .. versionadded:: 1.4.0
-
-    .. versionchanged:: 3.4.0
-        Supports Spark Connect.
-
-    Parameters
-    ----------
-    col : :class:`~pyspark.sql.Column` or column name
-        column to calculate natural logarithm for.
-
-    Returns
-    -------
-    :class:`~pyspark.sql.Column`
-        natural logarithm of the given value.
-
-    Examples
-    --------
-    Example 1: Compute the natural logarithm of E
-
-    >>> from pyspark.sql import functions as sf
-    >>> spark.range(1).select(sf.log(sf.e())).show()
-    +-------+
-    |ln(E())|
-    +-------+
-    |    1.0|
-    +-------+
-
-    Example 2: Compute the natural logarithm of invalid values
-
-    >>> from pyspark.sql import functions as sf
-    >>> spark.sql(
-    ...     "SELECT * FROM VALUES (-1), (0), (FLOAT('NAN')), (NULL) AS TAB(value)"
-    ... ).select("*", sf.log("value")).show()
-    +-----+---------+
-    |value|ln(value)|
-    +-----+---------+
-    | -1.0|     NULL|
-    |  0.0|     NULL|
-    |  NaN|      NaN|
-    | NULL|     NULL|
-    +-----+---------+
-    """
-    return _invoke_function_over_columns("log", col)
-
-
 @_try_remote_functions
 def log10(col: "ColumnOrName") -> Column:
     """
@@ -8342,7 +8292,7 @@ def when(condition: Column, value: Any) -> Column:
     return _invoke_function("when", condition._jc, v)
 
 
-@overload  # type: ignore[no-redef]
+@overload
 def log(arg1: "ColumnOrName") -> Column:
     ...
 
diff --git a/python/pyspark/sql/tests/arrow/test_arrow.py b/python/pyspark/sql/tests/arrow/test_arrow.py
index 79d8bf77d9d5b..19e579cb67781 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow.py
@@ -61,7 +61,6 @@
 from pyspark.errors import ArithmeticException, PySparkTypeError, UnsupportedOperationException
 from pyspark.loose_version import LooseVersion
 from pyspark.util import is_remote_only
-from pyspark.loose_version import LooseVersion
 
 if have_pandas:
     import pandas as pd
@@ -1009,11 +1008,6 @@ def check_createDataFrame_pandas_with_struct_type(self, arrow_enabled):
                     expected[r][e] == result[r][e], f"{expected[r][e]} == {result[r][e]}"
                 )
 
-    def test_createDataFrame_pandas_with_struct_type(self):
-        for arrow_enabled in [True, False]:
-            with self.subTest(arrow_enabled=arrow_enabled):
-                self.check_createDataFrame_pandas_with_struct_type(arrow_enabled)
-
     def test_createDataFrame_arrow_with_struct_type_nulls(self):
         t = pa.table(
             {
diff --git a/python/pyspark/sql/tests/arrow/test_arrow_udf.py b/python/pyspark/sql/tests/arrow/test_arrow_udf.py
index b56ed75acad9e..51190f6d2a3de 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow_udf.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow_udf.py
@@ -164,13 +164,13 @@ def test_arrow_udf_wrong_arg(self):
         with self.assertRaises(ParseException):
 
             @arrow_udf("blah")
-            def foo(x):
+            def _(x):
                 return x
 
         with self.assertRaises(PySparkTypeError) as pe:
 
             @arrow_udf(returnType="double", functionType=PandasUDFType.SCALAR)
-            def foo(df):
+            def _(df):
                 return df
 
         self.check_error(
@@ -185,7 +185,7 @@ def foo(df):
         with self.assertRaises(PySparkTypeError) as pe:
 
             @arrow_udf(functionType=ArrowUDFType.SCALAR)
-            def foo(x):
+            def _(x):
                 return x
 
         self.check_error(
@@ -197,7 +197,7 @@ def foo(x):
         with self.assertRaises(PySparkTypeError) as pe:
 
             @arrow_udf("double", 100)
-            def foo(x):
+            def _(x):
                 return x
 
         self.check_error(
@@ -209,7 +209,7 @@ def foo(x):
         with self.assertRaises(PySparkTypeError) as pe:
 
             @arrow_udf(returnType=PandasUDFType.GROUPED_MAP)
-            def foo(df):
+            def _(df):
                 return df
 
         self.check_error(
@@ -224,13 +224,13 @@ def foo(df):
         with self.assertRaisesRegex(ValueError, "0-arg arrow_udfs.*not.*supported"):
 
             @arrow_udf(LongType(), ArrowUDFType.SCALAR)
-            def zero_with_type():
+            def _():
                 return 1
 
         with self.assertRaisesRegex(ValueError, "0-arg arrow_udfs.*not.*supported"):
 
             @arrow_udf(LongType(), ArrowUDFType.SCALAR_ITER)
-            def zero_with_type():
+            def _():
                 yield 1
                 yield 2
 
diff --git a/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py b/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
index 3bd00d2cc921a..a0d805a3ab274 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
@@ -27,7 +27,7 @@
 
 from pyspark.util import PythonEvalType
 from pyspark.sql.functions import arrow_udf, ArrowUDFType
-from pyspark.sql import Row, functions as F
+from pyspark.sql import functions as F
 from pyspark.sql.types import (
     IntegerType,
     ByteType,
diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py b/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
index f05f982d2d14e..cc750f2b3439e 100644
--- a/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
+++ b/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
@@ -313,7 +313,6 @@ def test_server_listener_uninterruptible(self):
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.sql.tests.connect.streaming.test_parity_listener import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py b/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
index 59107363571ee..b83a2d5f198b1 100644
--- a/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
+++ b/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
@@ -26,7 +26,6 @@ class DataFrameQueryContextParityTests(DataFrameQueryContextTestsMixin, ReusedCo
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.sql.tests.connect.test_parity_dataframe_query_context import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_geographytype.py b/python/pyspark/sql/tests/connect/test_parity_geographytype.py
index 501bbed20ff19..a85edbc69c2ab 100644
--- a/python/pyspark/sql/tests/connect/test_parity_geographytype.py
+++ b/python/pyspark/sql/tests/connect/test_parity_geographytype.py
@@ -26,7 +26,6 @@ class GeographyTypeParityTest(GeographyTypeTestMixin, ReusedConnectTestCase):
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.sql.tests.connect.test_parity_geographytype import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_geometrytype.py b/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
index b95321b3c61be..640ead4c32443 100644
--- a/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
+++ b/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
@@ -26,7 +26,6 @@ class GeometryTypeParityTest(GeometryTypeTestMixin, ReusedConnectTestCase):
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.sql.tests.connect.test_parity_geometrytype import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_observation.py b/python/pyspark/sql/tests/connect/test_parity_observation.py
index e16053d5a082a..ebe0ed78e5c42 100644
--- a/python/pyspark/sql/tests/connect/test_parity_observation.py
+++ b/python/pyspark/sql/tests/connect/test_parity_observation.py
@@ -29,7 +29,6 @@ class DataFrameObservationParityTests(
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.sql.tests.connect.test_parity_observation import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_readwriter.py b/python/pyspark/sql/tests/connect/test_parity_readwriter.py
index f83f3edbfa787..a95cea3e7345f 100644
--- a/python/pyspark/sql/tests/connect/test_parity_readwriter.py
+++ b/python/pyspark/sql/tests/connect/test_parity_readwriter.py
@@ -37,7 +37,6 @@ def test_partitioning_functions(self):
 
 
 if __name__ == "__main__":
-    import unittest
     from pyspark.sql.tests.connect.test_parity_readwriter import *  # noqa: F401
 
     try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_udf.py b/python/pyspark/sql/tests/connect/test_parity_udf.py
index 5507f8e9f289b..4b027f6fca375 100644
--- a/python/pyspark/sql/tests/connect/test_parity_udf.py
+++ b/python/pyspark/sql/tests/connect/test_parity_udf.py
@@ -56,10 +56,6 @@ def test_udf_defers_judf_initialization(self):
     def test_nondeterministic_udf3(self):
         super().test_nondeterministic_udf3()
 
-    @unittest.skip("Spark Connect doesn't support RDD but the test depends on it.")
-    def test_worker_original_stdin_closed(self):
-        super().test_worker_original_stdin_closed()
-
     @unittest.skip("Spark Connect does not support SQLContext but the test depends on it.")
     def test_udf_on_sql_context(self):
         super().test_udf_on_sql_context()
diff --git a/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py b/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
index 7f9eb8979c08e..c5172ff6af3f4 100644
--- a/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
+++ b/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
@@ -26,7 +26,6 @@
     ReusedSQLTestCase,
 )
 
-from pyspark.testing.sqlutils import ReusedSQLTestCase
 from pyspark.sql.tests.pandas.streaming.test_pandas_transform_with_state_state_variable import (
     TransformWithStateStateVariableTestsMixin,
 )
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf.py b/python/pyspark/sql/tests/pandas/test_pandas_udf.py
index 017698f318d59..6b397bc636263 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_udf.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_udf.py
@@ -170,13 +170,13 @@ def test_udf_wrong_arg(self):
         with self.assertRaises(ParseException):
 
             @pandas_udf("blah")
-            def foo(x):
+            def _(x):
                 return x
 
         with self.assertRaises(PySparkTypeError) as pe:
 
             @pandas_udf(returnType="double", functionType=PandasUDFType.GROUPED_MAP)
-            def foo(df):
+            def _(df):
                 return df
 
         self.check_error(
@@ -193,14 +193,14 @@ def foo(df):
         with self.assertRaisesRegex(ValueError, "Invalid function"):
 
             @pandas_udf(returnType="k int, v double", functionType=PandasUDFType.GROUPED_MAP)
-            def foo(k, v, w):
+            def _(k, v, w):
                 return k
 
     def check_udf_wrong_arg(self):
         with self.assertRaises(PySparkTypeError) as pe:
 
             @pandas_udf(functionType=PandasUDFType.SCALAR)
-            def foo(x):
+            def _(x):
                 return x
 
         self.check_error(
@@ -212,7 +212,7 @@ def foo(x):
         with self.assertRaises(PySparkTypeError) as pe:
 
             @pandas_udf("double", 100)
-            def foo(x):
+            def _(x):
                 return x
 
         self.check_error(
@@ -227,20 +227,20 @@ def foo(x):
         with self.assertRaisesRegex(ValueError, "0-arg pandas_udfs.*not.*supported"):
 
             @pandas_udf(LongType(), PandasUDFType.SCALAR)
-            def zero_with_type():
+            def _():
                 return 1
 
         with self.assertRaisesRegex(ValueError, "0-arg pandas_udfs.*not.*supported"):
 
             @pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
-            def zero_with_type():
+            def _():
                 yield 1
                 yield 2
 
         with self.assertRaises(PySparkTypeError) as pe:
 
             @pandas_udf(returnType=PandasUDFType.GROUPED_MAP)
-            def foo(df):
+            def _(df):
                 return df
 
         self.check_error(
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
index b936a9240e529..88f2168f131b1 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
@@ -53,7 +53,6 @@
     StringType,
     ArrayType,
     StructField,
-    Row,
     TimestampType,
     MapType,
     DateType,
diff --git a/python/pyspark/sql/tests/test_python_datasource.py b/python/pyspark/sql/tests/test_python_datasource.py
index e5171876656c7..9a4782e7322f0 100644
--- a/python/pyspark/sql/tests/test_python_datasource.py
+++ b/python/pyspark/sql/tests/test_python_datasource.py
@@ -20,7 +20,6 @@
 import unittest
 import logging
 import json
-import os
 from dataclasses import dataclass
 from datetime import datetime
 from decimal import Decimal
@@ -852,9 +851,9 @@ def schema(self):
                 return "x string"
 
             def reader(self, schema):
-                return TestReader()
+                return TestReader2()
 
-        class TestReader(DataSourceReader):
+        class TestReader2(DataSourceReader):
             def read(self, partition):
                 ctypes.string_at(0)
                 yield "x",
@@ -894,9 +893,9 @@ def name(cls):
                 return "test"
 
             def writer(self, schema, overwrite):
-                return TestWriter()
+                return TestWriter2()
 
-        class TestWriter(DataSourceWriter):
+        class TestWriter2(DataSourceWriter):
             def write(self, iterator):
                 return WriterCommitMessage()
 
diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py
index 5ded5aa67b4eb..389df5b5a6cf4 100644
--- a/python/pyspark/sql/tests/test_udtf.py
+++ b/python/pyspark/sql/tests/test_udtf.py
@@ -40,7 +40,6 @@
     array,
     col,
     create_map,
-    array,
     lit,
     named_struct,
     udf,
diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py
index 08d360a19e81d..5f85c1886b4b2 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -923,7 +923,7 @@ def test_assert_equal_exact_pandas_on_spark_df(self):
         assertDataFrameEqual(df1, df2, checkRowOrder=True)
 
     @unittest.skipIf(not have_pandas or not have_pyarrow, "no pandas or pyarrow dependency")
-    def test_assert_equal_exact_pandas_on_spark_df(self):
+    def test_assert_equal_exact_pandas_on_spark_df_no_order(self):
         import pyspark.pandas as ps
 
         df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
@@ -1543,53 +1543,6 @@ def test_list_rows_unequal(self):
             messageParameters={"error_msg": error_msg},
         )
 
-    def test_list_row_unequal_schema(self):
-        df1 = self.spark.createDataFrame(
-            data=[
-                (1, 3000),
-                (2, 1000),
-                (3, 10),
-            ],
-            schema=["id", "amount"],
-        )
-
-        list_of_rows = [Row(id=1, amount=300), Row(id=2, amount=100), Row(id=3, amount=10)]
-
-        rows_str1 = ""
-        rows_str2 = ""
-
-        # count different rows
-        for r1, r2 in list(zip_longest(df1, list_of_rows)):
-            rows_str1 += str(r1) + "\n"
-            rows_str2 += str(r2) + "\n"
-
-        generated_diff = _context_diff(
-            actual=rows_str1.splitlines(), expected=rows_str2.splitlines(), n=3
-        )
-
-        error_msg = "Results do not match: "
-        percent_diff = (2 / 3) * 100
-        error_msg += "( %.5f %% )" % percent_diff
-        error_msg += "\n" + "\n".join(generated_diff)
-
-        with self.assertRaises(PySparkAssertionError) as pe:
-            assertDataFrameEqual(df1, list_of_rows)
-
-        self.check_error(
-            exception=pe.exception,
-            errorClass="DIFFERENT_ROWS",
-            messageParameters={"error_msg": error_msg},
-        )
-
-        with self.assertRaises(PySparkAssertionError) as pe:
-            assertDataFrameEqual(df1, list_of_rows, checkRowOrder=True)
-
-        self.check_error(
-            exception=pe.exception,
-            errorClass="DIFFERENT_ROWS",
-            messageParameters={"error_msg": error_msg},
-        )
-
     def test_list_row_unequal_schema(self):
         from pyspark.sql import Row
 
diff --git a/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py b/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
index 0ad17e919754c..5e307110559b8 100644
--- a/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
+++ b/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
@@ -21,7 +21,6 @@
 import platform
 import unittest
 from decimal import Decimal
-import numpy as np
 import pandas as pd
 
 from pyspark.sql import Row
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 94e3b2728d08f..08497f1a1ea6b 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -19,7 +19,6 @@
 Worker that receives input from Piped RDD.
 """
 import datetime
-import itertools
 import os
 import sys
 import dataclasses
@@ -54,7 +53,6 @@
 from pyspark.sql.pandas.serializers import (
     ArrowStreamPandasUDFSerializer,
     ArrowStreamPandasUDTFSerializer,
-    GroupPandasUDFSerializer,
     GroupArrowUDFSerializer,
     CogroupArrowUDFSerializer,
     CogroupPandasUDFSerializer,