diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 00b733f5437b..9ae180c32ab8 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -343,18 +343,13 @@ jobs:
           key: docs-maven-${{ hashFiles('**/pom.xml') }}
           restore-keys: |
             docs-maven-
-    - name: Install Python 3.6
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.6
-        architecture: x64
     - name: Install Python linter dependencies
       run: |
         # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
         # See also https://github.com/sphinx-doc/sphinx/issues/7551.
         # Jinja2 3.0.0+ causes error when building with Sphinx.
         # See also https://issues.apache.org/jira/browse/SPARK-35375.
-        python3.6 -m pip install flake8 pydata_sphinx_theme 'mypy==0.910' numpydoc 'jinja2<3.0.0' 'black==21.5b2'
+        python3.9 -m pip install flake8 pydata_sphinx_theme 'mypy==0.910' numpydoc 'jinja2<3.0.0' 'black==21.5b2'
     - name: Install R linter dependencies and SparkR
       run: |
         apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
@@ -373,8 +368,8 @@ jobs:
         # See also https://github.com/sphinx-doc/sphinx/issues/7551.
         # Jinja2 3.0.0+ causes error when building with Sphinx.
         # See also https://issues.apache.org/jira/browse/SPARK-35375.
-        python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
-        python3.6 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8'
+        python3.9 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
+        python3.9 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8'
         apt-get update -y
         apt-get install -y ruby ruby-dev
         Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
diff --git a/python/pyspark/ml/linalg/__init__.pyi b/python/pyspark/ml/linalg/__init__.pyi
index b4fba8823b67..a302825a1d8e 100644
--- a/python/pyspark/ml/linalg/__init__.pyi
+++ b/python/pyspark/ml/linalg/__init__.pyi
@@ -164,7 +164,7 @@ class Matrix:
     def __init__(
         self, numRows: int, numCols: int, isTransposed: bool = ...
     ) -> None: ...
-    def toArray(self) -> NoReturn: ...
+    def toArray(self) -> ndarray: ...

 class DenseMatrix(Matrix):
     values: Any
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index 213a69996b0a..22469e869acc 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -20,4 +20,4 @@ from typing import List, Tuple, Union
 from pyspark.mllib.linalg import Vector
 from numpy import ndarray  # noqa: F401

-VectorLike = Union[Vector, List[float], Tuple[float, ...]]
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
diff --git a/python/pyspark/mllib/classification.pyi b/python/pyspark/mllib/classification.pyi
index 967b0a9f289d..89229e2d2430 100644
--- a/python/pyspark/mllib/classification.pyi
+++ b/python/pyspark/mllib/classification.pyi
@@ -90,7 +90,7 @@ class LogisticRegressionWithLBFGS:

 class SVMModel(LinearClassificationModel):
     def __init__(self, weights: Vector, intercept: float) -> None: ...
-    @overload
+    @overload  # type: ignore
     def predict(self, x: VectorLike) -> float64: ...
     @overload
     def predict(self, x: RDD[VectorLike]) -> RDD[float64]: ...
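Note on the `Matrix.toArray` stub fix above: `typing.NoReturn` marks a callable that never returns normally, which is too strong for a base-class method that concrete subclasses override with a real return value. It both makes the `ndarray`-returning overrides look incompatible and lets mypy treat code after any `toArray()` call as unreachable. A minimal sketch of the effect (not Spark code, assuming mypy with `--warn-unreachable`):

```python
from typing import NoReturn

import numpy as np
from numpy import ndarray


class Matrix:
    # Too strong: NoReturn means "never returns", not "abstract".
    def toArray(self) -> NoReturn:
        raise NotImplementedError


class DenseMatrix(Matrix):
    # mypy: return type "ndarray" incompatible with "NoReturn" in supertype "Matrix"
    def toArray(self) -> ndarray:
        return np.zeros((2, 2))


def total(m: Matrix) -> float:
    arr = m.toArray()
    return float(arr.sum())  # reported as unreachable under --warn-unreachable
```

Annotating the base method with `ndarray`, as the stub change does, removes both problems without affecting runtime behavior.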
diff --git a/python/pyspark/mllib/clustering.pyi b/python/pyspark/mllib/clustering.pyi
index b4f349612f0f..7b3673aa4560 100644
--- a/python/pyspark/mllib/clustering.pyi
+++ b/python/pyspark/mllib/clustering.pyi
@@ -35,7 +35,7 @@ from pyspark.streaming.dstream import DStream
 T = TypeVar("T")

 class BisectingKMeansModel(JavaModelWrapper):
-    centers: List[ndarray]
+    centers: List[VectorLike]
     def __init__(self, java_model: JavaObject) -> None: ...
     @property
     def clusterCenters(self) -> List[ndarray]: ...
@@ -62,7 +62,7 @@ class BisectingKMeans:
     ) -> BisectingKMeansModel: ...

 class KMeansModel(Saveable, Loader[KMeansModel]):
-    centers: List[ndarray]
+    centers: List[VectorLike]
     def __init__(self, centers: List[VectorLike]) -> None: ...
     @property
     def clusterCenters(self) -> List[ndarray]: ...
@@ -149,7 +149,7 @@ class StreamingKMeansModel(KMeansModel):
     ) -> None: ...
     @property
     def clusterWeights(self) -> List[float64]: ...
-    centers: ndarray
+    centers: List[VectorLike]
     def update(
         self, data: RDD[VectorLike], decayFactor: float, timeUnit: str
     ) -> StreamingKMeansModel: ...
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index 230b45bc0b0f..6b772e26321d 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast

 import pandas as pd
 from pandas.api.types import CategoricalDtype
@@ -79,7 +79,7 @@ def categories(self) -> pd.Index:
         >>> s.cat.categories
         Index(['a', 'b', 'c'], dtype='object')
         """
-        return self._data.dtype.categories
+        return cast(CategoricalDtype, self._data.dtype).categories

     @categories.setter
     def categories(self, categories: pd.Index) -> None:
@@ -106,7 +106,7 @@ def ordered(self) -> bool:
         >>> s.cat.ordered
         False
         """
-        return self._data.dtype.ordered
+        return cast(CategoricalDtype, self._data.dtype).ordered

     @property
     def codes(self) -> "ps.Series":
diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py
index fb5666d850bf..b0f32cbe2f38 100644
--- a/python/pyspark/pandas/data_type_ops/categorical_ops.py
+++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py
@@ -16,7 +16,7 @@
 #

 from itertools import chain
-from typing import Any, Union
+from typing import Any, Union, cast

 import pandas as pd
 from pandas.api.types import CategoricalDtype
@@ -41,8 +41,12 @@ def pretty_name(self) -> str:

     def restore(self, col: pd.Series) -> pd.Series:
         """Restore column when to_pandas."""
-        return pd.Categorical.from_codes(
-            col, categories=self.dtype.categories, ordered=self.dtype.ordered
+        return pd.Series(
+            pd.Categorical.from_codes(
+                col,
+                categories=cast(CategoricalDtype, self.dtype).categories,
+                ordered=cast(CategoricalDtype, self.dtype).ordered,
+            )
         )

     def prepare(self, col: pd.Series) -> pd.Series:
@@ -52,10 +56,10 @@ def prepare(self, col: pd.Series) -> pd.Series:
     def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
         dtype, _ = pandas_on_spark_type(dtype)

-        if isinstance(dtype, CategoricalDtype) and dtype.categories is None:
+        if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
             return index_ops.copy()

-        categories = index_ops.dtype.categories
+        categories = cast(CategoricalDtype, index_ops.dtype).categories
         if len(categories) == 0:
             scol = SF.lit(None)
         else:
@@ -84,7 +88,7 @@ def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:


 def _non_equality_comparison_input_check(left: IndexOpsLike, right: Any) -> None:
-    if not left.dtype.ordered:
+    if not cast(CategoricalDtype, left.dtype).ordered:
         raise TypeError("Unordered Categoricals can only compare equality or not.")
     if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, CategoricalDtype):
         if hash(left.dtype) != hash(right.dtype):
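The recurring `cast(CategoricalDtype, ...)` edits above address a static-typing problem only: the `dtype` attribute is declared with a broader union, so mypy does not allow `.categories` or `.ordered` access on it even when the value is known to be categorical. `typing.cast` narrows the declared type and is a no-op at runtime. A rough, self-contained illustration (the `Dtype` alias below is a stand-in for this sketch, not the pyspark one):

```python
from typing import Union, cast

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

# Stand-in for pyspark.pandas._typing.Dtype (assumption for this sketch).
Dtype = Union[np.dtype, CategoricalDtype]


def categories_of(dtype: Dtype) -> pd.Index:
    # `dtype.categories` alone fails type checking: np.dtype has no such attribute.
    # cast() only changes the static type; it does nothing at runtime.
    return cast(CategoricalDtype, dtype).categories


print(categories_of(CategoricalDtype(["a", "b", "c"])))
# Index(['a', 'b', 'c'], dtype='object')
```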
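Two related cleanups in `frame.py` and `indexing.py` above: `np.array` is a factory function rather than a class, so it is not meaningful in a type annotation (`np.ndarray` is the actual array type), and `np.int` was only a deprecated alias of the builtin `int` (deprecated in NumPy 1.20), so dropping it from the `isinstance` tuples changes nothing at runtime. A small sketch of the distinction, using hypothetical helper names:

```python
import numpy as np


def first_element(data: np.ndarray) -> float:
    # np.ndarray is a real type; annotating the parameter with np.array
    # would be rejected by mypy because np.array is a function.
    return float(data[0])


def is_positional_key(key: object) -> bool:
    # Builtin int plus the concrete NumPy integer scalar types; np.int added
    # nothing here, since it was only another name for the builtin int.
    return isinstance(key, (int, np.int64, np.int32))


print(first_element(np.array([1.5, 2.5])))                     # 1.5
print(is_positional_key(np.int32(3)), is_positional_key("x"))  # True False
```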
diff --git a/python/pyspark/pandas/ml.py b/python/pyspark/pandas/ml.py
index f0554ddefc7d..a4f1d7a36f53 100644
--- a/python/pyspark/pandas/ml.py
+++ b/python/pyspark/pandas/ml.py
@@ -78,7 +78,7 @@ def to_numeric_df(psdf: "ps.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Lab
     """
     # TODO, it should be more robust.
     accepted_types = {
-        np.dtype(dt)
+        np.dtype(dt)  # type: ignore
         for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]
     }
     numeric_column_labels = [
diff --git a/python/pyspark/pandas/mlflow.py b/python/pyspark/pandas/mlflow.py
index 6178bac1f9c1..719db40b4b01 100644
--- a/python/pyspark/pandas/mlflow.py
+++ b/python/pyspark/pandas/mlflow.py
@@ -25,7 +25,7 @@
 import numpy as np
 from typing import Any

-from pyspark.pandas._typing import Label  # noqa: F401 (SPARK-34943)
+from pyspark.pandas._typing import Label, Dtype  # noqa: F401 (SPARK-34943)
 from pyspark.pandas.utils import lazy_property, default_session
 from pyspark.pandas.frame import DataFrame
 from pyspark.pandas.series import Series, first_series
@@ -42,7 +42,7 @@ class PythonModelWrapper(object):
     """

-    def __init__(self, model_uri: str, return_type_hint: str):
+    def __init__(self, model_uri: str, return_type_hint: Union[str, type, Dtype]):
         self._model_uri = model_uri
         self._return_type_hint = return_type_hint

@@ -109,7 +109,9 @@ def predict(self, data: Union[DataFrame, pd.DataFrame]) -> Union[Series, pd.Seri
             raise ValueError("unknown data type: {}".format(type(data).__name__))


-def load_model(model_uri: str, predict_type: str = "infer") -> PythonModelWrapper:
+def load_model(
+    model_uri: str, predict_type: Union[str, type, Dtype] = "infer"
+) -> PythonModelWrapper:
     """
     Loads an MLflow model into an wrapper that can be used both for pandas
     and pandas-on-Spark DataFrame.
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 70894861b16f..a46926d102cf 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -2020,11 +2020,11 @@ def get_dummies(
             if drop_first:
                 values = values[1:]

-            def column_name(value: str) -> str:
+            def column_name(v: Any) -> Name:
                 if prefix is None or cast(List[str], prefix)[i] == "":
-                    return value
+                    return v
                 else:
-                    return "{}{}{}".format(cast(List[str], prefix)[i], prefix_sep, value)
+                    return "{}{}{}".format(cast(List[str], prefix)[i], prefix_sep, v)

             for value in values:
                 remaining_columns.append(
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index f3a119882e46..bb1bc5c8fbfa 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -1027,8 +1027,9 @@ def map(self, arg: Union[Dict, Callable]) -> "Series":
                 current = current.when(self.spark.column == SF.lit(to_replace), value)

             if hasattr(arg, "__missing__"):
-                tmp_val = arg[np._NoValue]
-                del arg[np._NoValue]  # Remove in case it's set in defaultdict.
+                tmp_val = arg[np._NoValue]  # type: ignore
+                # Remove in case it's set in defaultdict.
+                del arg[np._NoValue]  # type: ignore
                 current = current.otherwise(SF.lit(tmp_val))
             else:
                 current = current.otherwise(SF.lit(None).cast(self.spark.data_type))
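On the `mlflow.py` change above: `return_type_hint` / `predict_type` accept more than strings in practice (a Python or NumPy type, or a pandas dtype object), so the annotation is widened to `Union[str, type, Dtype]` rather than forcing every caller through `str`. A hedged sketch of the idea using only pandas' public helper; `DtypeLike` and `normalize` are illustrative names, not Spark APIs:

```python
from typing import Union

import numpy as np
import pandas as pd

# Simplified stand-in for Union[str, type, Dtype] in the patch.
DtypeLike = Union[str, type, np.dtype]


def normalize(hint: DtypeLike) -> np.dtype:
    # pandas.api.types.pandas_dtype accepts strings, types and dtype objects alike.
    return pd.api.types.pandas_dtype(hint)


print(normalize("float64"), normalize(float), normalize(np.dtype("int64")))
# float64 float64 int64
```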
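The `test_typedef.py` hints above move off the aliases that NumPy 1.20 deprecated (`np.float`, `np.int`, `np.bool`, `np.object`, `np.str`), which were only re-exports of the builtins. `np.float_` and `np.float64` remain valid spellings of the 64-bit float in NumPy 1.x, which is why either works as a replacement in the DataFrame hints. A quick check of the equivalences being relied on:

```python
import numpy as np

# np.float_ is the same class as np.float64 (NumPy 1.x), and the builtin float
# maps to the float64 dtype, so float / np.float_ / np.float64 all land on
# DoubleType in the as_spark_type mapping shown further down in this patch.
assert np.float_ is np.float64
assert np.dtype(float) == np.float64
assert np.dtype(int) == np.int_  # builtin int maps to the default integer dtype
print("alias equivalences hold")
```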
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 00415dc5dabe..1321313142a5 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -160,7 +160,7 @@ def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) ->
     elif tpe in (bytes, np.character, np.bytes_, np.string_):
         return types.BinaryType()
     # BooleanType
-    elif tpe in (bool, np.bool, "bool", "?"):
+    elif tpe in (bool, np.bool_, "bool", "?"):
         return types.BooleanType()
     # DateType
     elif tpe in (datetime.date,):
@@ -171,13 +171,13 @@ def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) ->
     elif tpe in (decimal.Decimal,):
         # TODO: considering about the precision & scale for decimal type.
         return types.DecimalType(38, 18)
-    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
+    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
         return types.DoubleType()
     elif tpe in (np.float32, "float32", "f"):
         return types.FloatType()
     elif tpe in (np.int32, "int32", "i"):
         return types.IntegerType()
-    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
+    elif tpe in (int, np.int64, "int", "int64", "long"):
         return types.LongType()
     elif tpe in (np.int16, "int16", "short"):
         return types.ShortType()
diff --git a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi
index 9148e7a2dca8..e219c1cd60b8 100644
--- a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi
+++ b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi
@@ -22,7 +22,7 @@
 # - Add Protocol as a base class
 # - Replace imports with Any

-import numpy.ma as np  # type: ignore[import]
+import numpy as np  # type: ignore[import]
 from typing import Any, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union
 from typing_extensions import Protocol
 from .series import SeriesLike
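Finally, on the `frame.pyi` protocol stub: `numpy.ma` is the masked-array submodule, so binding it to the name `np` made `np.<name>` references in that stub resolve against the masked-array namespace rather than the top-level API; plain `import numpy as np` is what the annotations actually mean. A tiny demonstration of the difference:

```python
import numpy
import numpy.ma

print(numpy.ndarray)           # <class 'numpy.ndarray'> -- the type the stub refers to
print(numpy.ma.MaskedArray)    # the masked-array class lives in the submodule
print(numpy.ma is numpy)       # False: numpy.ma is not the top-level API
```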