From 1821a3f68bf1adae4c861a234d1372449dcaf5a0 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 27 Aug 2020 17:30:09 +0200 Subject: [PATCH 1/6] Add missing methods --- third_party/3/pyspark/accumulators.pyi | 5 +++-- third_party/3/pyspark/broadcast.pyi | 10 ++++++---- third_party/3/pyspark/serializers.pyi | 9 ++++++++- third_party/3/pyspark/util.pyi | 18 +++++++++++++----- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/third_party/3/pyspark/accumulators.pyi b/third_party/3/pyspark/accumulators.pyi index 8fe35359..f60de257 100644 --- a/third_party/3/pyspark/accumulators.pyi +++ b/third_party/3/pyspark/accumulators.pyi @@ -19,7 +19,7 @@ # Stubs for pyspark.accumulators (Python 3.7) # -from typing import Callable, Generic, Tuple, Type, TypeVar +from typing import Callable, Generic, Tuple, Type, TypeVar, Dict import socketserver.BaseRequestHandler # type: ignore @@ -29,7 +29,8 @@ T = TypeVar("T") U = TypeVar("U", bound=SupportsIAdd) import socketserver as SocketServer -from typing import Any + +_accumulatorRegistry: Dict = {} class Accumulator(Generic[T]): aid: int diff --git a/third_party/3/pyspark/broadcast.pyi b/third_party/3/pyspark/broadcast.pyi index f000e0bf..f3600e92 100644 --- a/third_party/3/pyspark/broadcast.pyi +++ b/third_party/3/pyspark/broadcast.pyi @@ -16,14 +16,13 @@ # specific language governing permissions and limitations # under the License. -# Stubs for pyspark.broadcast (Python 3.5) -# - import threading -from typing import Any, Generic, Optional, TypeVar +from typing import Any, Generic, Optional, TypeVar, Dict T = TypeVar("T") +_broadcastRegistry: Dict + class Broadcast(Generic[T]): def __init__( self, @@ -47,3 +46,6 @@ class BroadcastPickleRegistry(threading.local): def __iter__(self) -> None: ... def add(self, bcast: Any) -> None: ... def clear(self) -> None: ... + +class InheritableThread: + def __init__(self) -> None: ... diff --git a/third_party/3/pyspark/serializers.pyi b/third_party/3/pyspark/serializers.pyi index c8f76066..5b858b1d 100644 --- a/third_party/3/pyspark/serializers.pyi +++ b/third_party/3/pyspark/serializers.pyi @@ -16,12 +16,14 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Optional +from typing import Any, Dict basestring = str unicode = str xrange = range +__cls: Dict + class SpecialLengths: END_OF_DATA_SECTION: int = ... PYTHON_EXCEPTION_THROWN: int = ... @@ -117,5 +119,10 @@ class ChunkedStream: @property def closed(self): ... +def write_with_length(obj: Any, stream: Any): ... +def pack_long(value): ... def read_int(stream): ... +def read_long(stream): ... +def read_bool(stream): ... def write_int(value, stream): ... +def write_long(write_long, stream): ... diff --git a/third_party/3/pyspark/util.pyi b/third_party/3/pyspark/util.pyi index 7ff6b66f..34a35932 100644 --- a/third_party/3/pyspark/util.pyi +++ b/third_party/3/pyspark/util.pyi @@ -15,11 +15,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# Stubs for pyspark.util (Python 3.7) # -# NOTE: This dynamically typed stub was automatically generated by stubgen. -from typing import Any +from typing import List, Callable, Any, Tuple + +__all__: List[str] + +def fail_on_stopiteration(f: Callable) -> Callable: ... +def _parse_memory(s: str) -> int: ... +def _print_missing_jar(lib_name: str, pkg_name: str, jar_name: str, spark_version: str) -> None: ... + +class InheritableThread: + def __init__(self, target: Any, *args: Any, **kwargs: Any): ... -def fail_on_stopiteration(f: Any): ... +class VersionUtils: + @staticmethod + def majorMinorVersion(sparkVersion: str) -> Tuple[int, int]: ... From 1bcfdde12224069e81e498414b6bde76249055aa Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 27 Aug 2020 22:28:22 +0200 Subject: [PATCH 2/6] Make Black happy --- third_party/3/pyspark/__init__.pyi | 9 ++++--- third_party/3/pyspark/ml/classification.pyi | 29 +++++++++++++-------- third_party/3/pyspark/ml/feature.pyi | 4 ++- third_party/3/pyspark/resource/__init__.pyi | 12 +++++++-- third_party/3/pyspark/resource/profile.pyi | 14 ++++++++-- third_party/3/pyspark/resource/requests.pyi | 24 ++++++++++++++--- third_party/3/pyspark/util.pyi | 4 ++- 7 files changed, 72 insertions(+), 24 deletions(-) diff --git a/third_party/3/pyspark/__init__.pyi b/third_party/3/pyspark/__init__.pyi index 14172975..a04b862a 100644 --- a/third_party/3/pyspark/__init__.pyi +++ b/third_party/3/pyspark/__init__.pyi @@ -51,9 +51,12 @@ T = TypeVar("T") F = TypeVar("F", bound=Callable) def since(version: str) -> Callable[[T], T]: ... - -def copy_func(f: F, name: Optional[str] = ..., sinceversion: Optional[str] = ..., doc: Optional[str] = ...) -> F: ... - +def copy_func( + f: F, + name: Optional[str] = ..., + sinceversion: Optional[str] = ..., + doc: Optional[str] = ..., +) -> F: ... def keyword_only(func: F) -> F: ... # Names in __all__ with no definition: diff --git a/third_party/3/pyspark/ml/classification.pyi b/third_party/3/pyspark/ml/classification.pyi index da6670a7..71fb80c5 100644 --- a/third_party/3/pyspark/ml/classification.pyi +++ b/third_party/3/pyspark/ml/classification.pyi @@ -49,7 +49,6 @@ from pyspark.ml.wrapper import ( ) from pyspark.sql.dataframe import DataFrame - class _ClassifierParams(HasRawPredictionCol, _PredictorParams): ... class Classifier(Predictor, _ClassifierParams): @@ -65,14 +64,18 @@ class ClassificationModel(PredictionModel, _ClassifierParams, metaclass=abc.ABCM @abstractmethod def predictRaw(self, value: Vector) -> Vector: ... -class _ProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, _ClassifierParams): ... +class _ProbabilisticClassifierParams( + HasProbabilityCol, HasThresholds, _ClassifierParams +): ... class ProbabilisticClassifier(Classifier, _ProbabilisticClassifierParams): __metaclass__: Type[abc.ABCMeta] def setProbabilityCol(self: P, value: str) -> P: ... def setThresholds(self: P, value: List[float]) -> P: ... -class ProbabilisticClassificationModel(ClassificationModel, _ProbabilisticClassifierParams, metaclass=abc.ABCMeta): +class ProbabilisticClassificationModel( + ClassificationModel, _ProbabilisticClassifierParams, metaclass=abc.ABCMeta +): __metaclass__: Type[abc.ABCMeta] def setProbabilityCol(self: M, value: str) -> M: ... def setThresholds(self: M, value: List[float]) -> M: ... @@ -95,7 +98,9 @@ class _JavaProbabilisticClassifierParams( class _JavaProbabilisticClassifier(ProbabilisticClassifier, _JavaClassifier[JM]): __metaclass__: Type[abc.ABCMeta] -class _JavaProbabilisticClassificationModel(ProbabilisticClassificationModel, _JavaClassificationModel[T]): +class _JavaProbabilisticClassificationModel( + ProbabilisticClassificationModel, _JavaClassificationModel[T] +): def predictProbability(self, value: Any): ... class _ClassificationSummary(JavaWrapper): @@ -348,12 +353,15 @@ class LogisticRegressionSummary(_ClassificationSummary): @property def featuresCol(self) -> str: ... -class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary): ... - -class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, LogisticRegressionSummary): ... - -class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, LogisticRegressionTrainingSummary): ... - +class LogisticRegressionTrainingSummary( + LogisticRegressionSummary, _TrainingSummary +): ... +class BinaryLogisticRegressionSummary( + _BinaryClassificationSummary, LogisticRegressionSummary +): ... +class BinaryLogisticRegressionTrainingSummary( + BinaryLogisticRegressionSummary, LogisticRegressionTrainingSummary +): ... class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams): ... class DecisionTreeClassifier( @@ -861,7 +869,6 @@ class FMClassificationModel( def linear(self) -> Vector: ... @property def factors(self) -> Matrix: ... - def summary(self) -> FMClassificationTrainingSummary: ... def evaluate(self, dataset: DataFrame) -> FMClassificationSummary: ... diff --git a/third_party/3/pyspark/ml/feature.pyi b/third_party/3/pyspark/ml/feature.pyi index a287d7f1..c22b0d15 100644 --- a/third_party/3/pyspark/ml/feature.pyi +++ b/third_party/3/pyspark/ml/feature.pyi @@ -1463,7 +1463,9 @@ class ANOVASelectorModel( ): ... class ChiSqSelector( - _Selector[ChiSqSelectorModel], JavaMLReadable[ChiSqSelector], JavaMLWritable, + _Selector[ChiSqSelectorModel], + JavaMLReadable[ChiSqSelector], + JavaMLWritable, ): def __init__( self, diff --git a/third_party/3/pyspark/resource/__init__.pyi b/third_party/3/pyspark/resource/__init__.pyi index e18a8901..3a2c3faa 100644 --- a/third_party/3/pyspark/resource/__init__.pyi +++ b/third_party/3/pyspark/resource/__init__.pyi @@ -17,5 +17,13 @@ # under the License. from pyspark.resource.information import ResourceInformation as ResourceInformation -from pyspark.resource.profile import ResourceProfile as ResourceProfile, ResourceProfileBuilder as ResourceProfileBuilder -from pyspark.resource.requests import ExecutorResourceRequest as ExecutorResourceRequest, ExecutorResourceRequests as ExecutorResourceRequests, TaskResourceRequest as TaskResourceRequest, TaskResourceRequests as TaskResourceRequests +from pyspark.resource.profile import ( + ResourceProfile as ResourceProfile, + ResourceProfileBuilder as ResourceProfileBuilder, +) +from pyspark.resource.requests import ( + ExecutorResourceRequest as ExecutorResourceRequest, + ExecutorResourceRequests as ExecutorResourceRequests, + TaskResourceRequest as TaskResourceRequest, + TaskResourceRequests as TaskResourceRequests, +) diff --git a/third_party/3/pyspark/resource/profile.pyi b/third_party/3/pyspark/resource/profile.pyi index 11aec5c8..9e3a1565 100644 --- a/third_party/3/pyspark/resource/profile.pyi +++ b/third_party/3/pyspark/resource/profile.pyi @@ -16,11 +16,21 @@ # specific language governing permissions and limitations # under the License. -from pyspark.resource.requests import ExecutorResourceRequest as ExecutorResourceRequest, ExecutorResourceRequests as ExecutorResourceRequests, TaskResourceRequest as TaskResourceRequest, TaskResourceRequests as TaskResourceRequests +from pyspark.resource.requests import ( + ExecutorResourceRequest as ExecutorResourceRequest, + ExecutorResourceRequests as ExecutorResourceRequests, + TaskResourceRequest as TaskResourceRequest, + TaskResourceRequests as TaskResourceRequests, +) from typing import Any, Optional class ResourceProfile: - def __init__(self, _java_resource_profile: Optional[Any] = ..., _exec_req: Any = ..., _task_req: Any = ...) -> None: ... + def __init__( + self, + _java_resource_profile: Optional[Any] = ..., + _exec_req: Any = ..., + _task_req: Any = ..., + ) -> None: ... @property def id(self): ... @property diff --git a/third_party/3/pyspark/resource/requests.pyi b/third_party/3/pyspark/resource/requests.pyi index 36b2c601..45022e27 100644 --- a/third_party/3/pyspark/resource/requests.pyi +++ b/third_party/3/pyspark/resource/requests.pyi @@ -19,7 +19,13 @@ from typing import Any, Optional class ExecutorResourceRequest: - def __init__(self, resourceName: Any, amount: Any, discoveryScript: str = ..., vendor: str = ...) -> None: ... + def __init__( + self, + resourceName: Any, + amount: Any, + discoveryScript: str = ..., + vendor: str = ..., + ) -> None: ... @property def resourceName(self): ... @property @@ -30,12 +36,20 @@ class ExecutorResourceRequest: def vendor(self): ... class ExecutorResourceRequests: - def __init__(self, _jvm: Optional[Any] = ..., _requests: Optional[Any] = ...) -> None: ... + def __init__( + self, _jvm: Optional[Any] = ..., _requests: Optional[Any] = ... + ) -> None: ... def memory(self, amount: Any): ... def memoryOverhead(self, amount: Any): ... def pysparkMemory(self, amount: Any): ... def cores(self, amount: Any): ... - def resource(self, resourceName: Any, amount: Any, discoveryScript: str = ..., vendor: str = ...): ... + def resource( + self, + resourceName: Any, + amount: Any, + discoveryScript: str = ..., + vendor: str = ..., + ): ... @property def requests(self): ... @@ -47,7 +61,9 @@ class TaskResourceRequest: def amount(self): ... class TaskResourceRequests: - def __init__(self, _jvm: Optional[Any] = ..., _requests: Optional[Any] = ...) -> None: ... + def __init__( + self, _jvm: Optional[Any] = ..., _requests: Optional[Any] = ... + ) -> None: ... def cpus(self, amount: Any): ... def resource(self, resourceName: Any, amount: Any): ... @property diff --git a/third_party/3/pyspark/util.pyi b/third_party/3/pyspark/util.pyi index 34a35932..ffafd9af 100644 --- a/third_party/3/pyspark/util.pyi +++ b/third_party/3/pyspark/util.pyi @@ -23,7 +23,9 @@ __all__: List[str] def fail_on_stopiteration(f: Callable) -> Callable: ... def _parse_memory(s: str) -> int: ... -def _print_missing_jar(lib_name: str, pkg_name: str, jar_name: str, spark_version: str) -> None: ... +def _print_missing_jar( + lib_name: str, pkg_name: str, jar_name: str, spark_version: str +) -> None: ... class InheritableThread: def __init__(self, target: Any, *args: Any, **kwargs: Any): ... From aa508b06deda8ab470196c387521984c6c843f08 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 1 Sep 2020 21:30:16 +0200 Subject: [PATCH 3/6] Thanks for the feedback! --- third_party/3/pyspark/accumulators.pyi | 2 +- third_party/3/pyspark/broadcast.pyi | 5 +---- third_party/3/pyspark/serializers.pyi | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/third_party/3/pyspark/accumulators.pyi b/third_party/3/pyspark/accumulators.pyi index f60de257..0082d60f 100644 --- a/third_party/3/pyspark/accumulators.pyi +++ b/third_party/3/pyspark/accumulators.pyi @@ -30,7 +30,7 @@ U = TypeVar("U", bound=SupportsIAdd) import socketserver as SocketServer -_accumulatorRegistry: Dict = {} +_accumulatorRegistry: Dict[int, Accumulator] class Accumulator(Generic[T]): aid: int diff --git a/third_party/3/pyspark/broadcast.pyi b/third_party/3/pyspark/broadcast.pyi index f3600e92..3acc2a76 100644 --- a/third_party/3/pyspark/broadcast.pyi +++ b/third_party/3/pyspark/broadcast.pyi @@ -21,7 +21,7 @@ from typing import Any, Generic, Optional, TypeVar, Dict T = TypeVar("T") -_broadcastRegistry: Dict +_broadcastRegistry: Dict[int, Broadcast] class Broadcast(Generic[T]): def __init__( @@ -46,6 +46,3 @@ class BroadcastPickleRegistry(threading.local): def __iter__(self) -> None: ... def add(self, bcast: Any) -> None: ... def clear(self) -> None: ... - -class InheritableThread: - def __init__(self) -> None: ... diff --git a/third_party/3/pyspark/serializers.pyi b/third_party/3/pyspark/serializers.pyi index 5b858b1d..f082bd29 100644 --- a/third_party/3/pyspark/serializers.pyi +++ b/third_party/3/pyspark/serializers.pyi @@ -125,4 +125,4 @@ def read_int(stream): ... def read_long(stream): ... def read_bool(stream): ... def write_int(value, stream): ... -def write_long(write_long, stream): ... +def write_long(value, stream): ... From de1da71731a244f64279a0f611e53c0abbf92850 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 1 Sep 2020 21:47:59 +0200 Subject: [PATCH 4/6] Install both numpy and pandas --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index c81ffbf5..21a8fcb2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ install: - pip install . script: - "./tests/mypy_test.py -p 3.6 3.7" +- pip3 install pandas numpy --upgrade - MYPY_TEST_PREFIX=$PWD MYPYPATH=$PWD/third_party/3 pytest - MYPYPATH=$PWD/third_party/3 mypy spark-source/examples/src/main/python/ml spark-source/examples/src/main/python/sql spark-source/examples/src/main/python/sql/streaming - black --check $PWD/third_party/3/pyspark From c4237dcf90ededf9641fac9fda79cb1fe273d961 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 3 Sep 2020 21:52:40 +0200 Subject: [PATCH 5/6] Remove Python2 relics --- third_party/3/pyspark/serializers.pyi | 4 ---- 1 file changed, 4 deletions(-) diff --git a/third_party/3/pyspark/serializers.pyi b/third_party/3/pyspark/serializers.pyi index f082bd29..3e063a3c 100644 --- a/third_party/3/pyspark/serializers.pyi +++ b/third_party/3/pyspark/serializers.pyi @@ -18,10 +18,6 @@ from typing import Any, Dict -basestring = str -unicode = str -xrange = range - __cls: Dict class SpecialLengths: From 5e394c92e8fd9376aeb1972d9adef0e74410e14a Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 7 Sep 2020 21:29:15 +0200 Subject: [PATCH 6/6] Add missing annotation --- third_party/3/pyspark/util.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/3/pyspark/util.pyi b/third_party/3/pyspark/util.pyi index 83327f0e..f243f266 100644 --- a/third_party/3/pyspark/util.pyi +++ b/third_party/3/pyspark/util.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Callable, Tuple +from typing import Any, Callable, Tuple, List from pyspark._typing import F import threading