From d233b13fab92093af712a5fb12253558a05927a1 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 16 Jan 2019 20:21:17 -0600 Subject: [PATCH 1/3] Misc code cleanup from lgtm.com analysis --- .../org/apache/spark/ui/static/executorspage.js | 3 +-- .../org/apache/spark/ui/static/historypage.js | 16 ++++++++-------- .../org/apache/spark/ui/static/spark-dag-viz.js | 8 ++++---- .../org/apache/spark/ui/static/stagepage.js | 5 ++--- .../org/apache/spark/ui/static/table.js | 2 +- .../org/apache/spark/ui/static/utils.js | 2 +- .../org/apache/spark/ui/static/webui.js | 2 +- dev/pip-sanity-check.py | 1 - dev/run-tests.py | 5 ++--- .../JavaRandomForestClassificationExample.java | 2 +- .../python/mllib/bisecting_k_means_example.py | 2 +- .../python/mllib/isotonic_regression_example.py | 2 +- .../python/mllib/multi_class_metrics_example.py | 6 +++--- .../main/python/mllib/ranking_metrics_example.py | 2 +- .../main/python/mllib/standard_scaler_example.py | 2 +- examples/src/main/python/sql/hive.py | 2 +- .../launcher/SparkSubmitCommandBuilder.java | 1 + python/pyspark/broadcast.py | 1 - python/pyspark/context.py | 1 - python/pyspark/java_gateway.py | 4 +--- python/pyspark/ml/classification.py | 2 +- python/pyspark/ml/fpm.py | 2 +- python/pyspark/ml/regression.py | 1 - python/pyspark/mllib/classification.py | 4 +--- python/pyspark/mllib/clustering.py | 1 - python/pyspark/mllib/evaluation.py | 3 +-- python/pyspark/mllib/feature.py | 2 -- python/pyspark/mllib/fpm.py | 4 +--- python/pyspark/mllib/regression.py | 3 +-- python/pyspark/mllib/tree.py | 2 +- python/pyspark/mllib/util.py | 3 +-- python/pyspark/profiler.py | 4 ++-- python/pyspark/rdd.py | 6 ++---- python/pyspark/sql/context.py | 6 ++---- python/pyspark/sql/types.py | 6 ++---- python/pyspark/streaming/context.py | 5 +---- python/pyspark/streaming/kinesis.py | 4 +--- python/pyspark/taskcontext.py | 1 - python/pyspark/worker.py | 1 - python/run-tests.py | 1 - python/setup.py | 2 +- .../sql/execution/ui/static/spark-sql-viz.js | 2 +- .../spark/streaming/ui/static/streaming-page.js | 8 ++++---- 43 files changed, 55 insertions(+), 87 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index a48c02ae279ba..98d67c91f24ba 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -114,7 +114,6 @@ $(document).ready(function () { var endPoint = createRESTEndPointForExecutorsPage(appId); $.getJSON(endPoint, function (response, status, jqXHR) { - var summary = []; var allExecCnt = 0; var allRDDBlocks = 0; var allMemoryUsed = 0; @@ -505,7 +504,7 @@ $(document).ready(function () { {data: 'allTotalTasks'}, { data: function (row, type) { - return type === 'display' ? (formatDuration(row.allTotalDuration, type) + ' (' + formatDuration(row.allTotalGCTime, type) + ')') : row.allTotalDuration + return type === 'display' ? 
(formatDuration(row.allTotalDuration) + ' (' + formatDuration(row.allTotalGCTime) + ')') : row.allTotalDuration }, "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { if (oData.allTotalDuration > 0) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index abc2ec0fa6531..35e4de9ac6643 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -103,12 +103,12 @@ $(document).ready(function() { pageLength: 20 }); - historySummary = $("#history-summary"); - searchString = historySummary["context"]["location"]["search"]; - requestedIncomplete = getParameterByName("showIncomplete", searchString); + var historySummary = $("#history-summary"); + var searchString = historySummary["context"]["location"]["search"]; + var requestedIncomplete = getParameterByName("showIncomplete", searchString); requestedIncomplete = (requestedIncomplete == "true" ? true : false); - appParams = { + var appParams = { limit: appLimit, status: (requestedIncomplete ? "running" : "completed") }; @@ -116,7 +116,7 @@ $(document).ready(function() { $.getJSON(uiRoot + "/api/v1/applications", appParams, function(response,status,jqXHR) { var array = []; var hasMultipleAttempts = false; - for (i in response) { + for (var i in response) { var app = response[i]; if (app["attempts"][0]["completed"] == requestedIncomplete) { continue; // if we want to show for Incomplete, we skip the completed apps; otherwise skip incomplete ones. @@ -127,7 +127,7 @@ $(document).ready(function() { hasMultipleAttempts = true; } var num = app["attempts"].length; - for (j in app["attempts"]) { + for (var j in app["attempts"]) { var attempt = app["attempts"][j]; attempt["startTime"] = formatTimeMillis(attempt["startTimeEpoch"]); attempt["endTime"] = formatTimeMillis(attempt["endTimeEpoch"]); @@ -149,7 +149,7 @@ $(document).ready(function() { "applications": array, "hasMultipleAttempts": hasMultipleAttempts, "showCompletedColumns": !requestedIncomplete, - } + }; $.get(uiRoot + "/static/historypage-template.html", function(template) { var sibling = historySummary.prev(); @@ -157,7 +157,7 @@ $(document).ready(function() { var apps = $(Mustache.render($(template).filter("#history-summary-template").html(),data)); var attemptIdColumnName = 'attemptId'; var startedColumnName = 'started'; - var defaultSortColumn = completedColumnName = 'completed'; + var completedColumnName = 'completed'; var durationColumnName = 'duration'; var conf = { "columns": [ diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 75b959fdeb59a..cf508ac573f39 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -220,7 +220,7 @@ function renderDagVizForJob(svgContainer) { } else { // Link each graph to the corresponding stage page (TODO: handle stage attempts) // Use the link from the stage table so it also works for the history server - var attemptId = 0 + var attemptId = 0; var stageLink = d3.select("#stage-" + stageId + "-" + attemptId) .select("a.name-link") .attr("href"); @@ -236,7 +236,7 @@ function renderDagVizForJob(svgContainer) { // existing ones, taking into account the position and width of the last stage's // container. We do not need to do this for the first stage of this job. 
if (i > 0) { - var existingStages = svgContainer.selectAll("g.cluster.stage") + var existingStages = svgContainer.selectAll("g.cluster.stage"); if (!existingStages.empty()) { var lastStage = d3.select(existingStages[0].pop()); var lastStageWidth = toFloat(lastStage.select("rect").attr("width")); @@ -369,8 +369,8 @@ function resizeSvg(svg) { * here this function is to enable line break. */ function interpretLineBreak(svg) { - var allTSpan = svg.selectAll("tspan").each(function() { - node = d3.select(this); + svg.selectAll("tspan").each(function() { + var node = d3.select(this); var original = node[0][0].innerHTML; if (original.indexOf("\\n") != -1) { var arr = original.split("\\n"); diff --git a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js index 5b792ffc584d1..4fe2cd95c673c 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js @@ -263,7 +263,6 @@ function reselectCheckboxesBasedOnTaskTableState() { function getStageAttemptId() { var words = document.baseURI.split('?'); - var attemptIdStr = words[1].split('&')[1]; var digitsRegex = /[0-9]+/; // We are using regex here to extract the stage attempt id as there might be certain url's with format // like /proxy/application_1539986433979_27115/stages/stage/?id=0&attempt=0#tasksTitle @@ -433,7 +432,7 @@ $(document).ready(function () { "oLanguage": { "sEmptyTable": "No data to show yet" } - } + }; var executorSummaryTableSelector = $("#summary-executor-table").DataTable(executorSummaryConf); $('#parent-container [data-toggle="tooltip"]').tooltip(); @@ -612,7 +611,7 @@ $(document).ready(function () { "searching": false, "order": [[0, "asc"]], "bAutoWidth": false - } + }; $("#accumulator-table").DataTable(accumulatorConf); // building tasks table that uses server side functionality diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 0315ebf5c48a9..fd258d5ee70d9 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -89,7 +89,7 @@ function onSearchStringChange() { if($(this).attr('id') && $(this).attr('id').match(/thread_[0-9]+_tr/) ) { var children = $(this).children() var found = false - for (i = 0; i < children.length; i++) { + for (var i = 0; i < children.length; i++) { if (children.eq(i).text().toLowerCase().indexOf(searchString) >= 0) { found = true } diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js index 22985e31a7808..6fc34a9e1f7ea 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/utils.js +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -170,7 +170,7 @@ function createRESTEndPointForExecutorsPage(appId) { if (ind > 0) { var appId = words[ind + 1]; var newBaseURI = words.slice(0, ind + 2).join('/'); - return newBaseURI + "/api/v1/applications/" + appId + "/allexecutors" + return newBaseURI + "/api/v1/applications/" + appId + "/allexecutors"; } ind = words.indexOf("history"); if (ind > 0) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.js b/core/src/main/resources/org/apache/spark/ui/static/webui.js index b1254e08fa504..cc3d753e9078a 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.js +++ 
b/core/src/main/resources/org/apache/spark/ui/static/webui.js @@ -33,7 +33,7 @@ function collapseTable(thisName, table){ var status = window.localStorage.getItem(thisName) == "true"; status = !status; - thisClass = '.' + thisName + var thisClass = '.' + thisName // Expand the list of additional metrics. var tableDiv = $(thisClass).parent().find('.' + table); diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index c491005f49719..4171f28684d59 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -18,7 +18,6 @@ from __future__ import print_function from pyspark.sql import SparkSession -from pyspark.ml.param import Params from pyspark.mllib.linalg import * import sys diff --git a/dev/run-tests.py b/dev/run-tests.py index 27f7527052e29..cc2193f0c0a80 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -29,7 +29,7 @@ from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which -from sparktestsupport.toposort import toposort_flatten, toposort +from sparktestsupport.toposort import toposort_flatten import sparktestsupport.modules as modules @@ -443,7 +443,6 @@ def run_python_packaging_tests(): def run_build_tests(): set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS") run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")]) - pass def run_sparkr_tests(): @@ -495,7 +494,7 @@ def main(): " install one and retry.") sys.exit(2) - java_version = determine_java_version(java_exe) + #java_version = determine_java_version(java_exe) # install SparkR if which("R"): diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java index 0707db8d3e839..7ee3c0a3e1925 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java @@ -50,7 +50,7 @@ public static void main(String[] args) { // Empty categoricalFeaturesInfo indicates all features are continuous. int numClasses = 2; Map categoricalFeaturesInfo = new HashMap<>(); - Integer numTrees = 3; // Use more in practice. + int numTrees = 3; // Use more in practice. String featureSubsetStrategy = "auto"; // Let the algorithm choose. 
String impurity = "gini"; int maxDepth = 5; diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index 31f3e72d7ff1f..36e36fc6897f3 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -23,7 +23,7 @@ from pyspark import SparkContext # $example on$ -from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel +from pyspark.mllib.clustering import BisectingKMeans # $example off$ if __name__ == "__main__": diff --git a/examples/src/main/python/mllib/isotonic_regression_example.py b/examples/src/main/python/mllib/isotonic_regression_example.py index 33d618ab48ea9..f5322d79c45ba 100644 --- a/examples/src/main/python/mllib/isotonic_regression_example.py +++ b/examples/src/main/python/mllib/isotonic_regression_example.py @@ -23,7 +23,7 @@ from pyspark import SparkContext # $example on$ import math -from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel +from pyspark.mllib.regression import IsotonicRegression, IsotonicRegressionModel from pyspark.mllib.util import MLUtils # $example off$ diff --git a/examples/src/main/python/mllib/multi_class_metrics_example.py b/examples/src/main/python/mllib/multi_class_metrics_example.py index 7dc5fb4f9127f..03a564e75be90 100644 --- a/examples/src/main/python/mllib/multi_class_metrics_example.py +++ b/examples/src/main/python/mllib/multi_class_metrics_example.py @@ -45,9 +45,9 @@ metrics = MulticlassMetrics(predictionAndLabels) # Overall statistics - precision = metrics.precision() - recall = metrics.recall() - f1Score = metrics.fMeasure() + precision = metrics.precision(1.0) + recall = metrics.recall(1.0) + f1Score = metrics.fMeasure(1.0) print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) diff --git a/examples/src/main/python/mllib/ranking_metrics_example.py b/examples/src/main/python/mllib/ranking_metrics_example.py index 21333deded35d..0913bb34cf9d7 100644 --- a/examples/src/main/python/mllib/ranking_metrics_example.py +++ b/examples/src/main/python/mllib/ranking_metrics_example.py @@ -17,7 +17,7 @@ # $example on$ from pyspark.mllib.recommendation import ALS, Rating -from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics +from pyspark.mllib.evaluation import RegressionMetrics # $example off$ from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py index 442094e1bf366..11ed34427dfe2 100644 --- a/examples/src/main/python/mllib/standard_scaler_example.py +++ b/examples/src/main/python/mllib/standard_scaler_example.py @@ -19,7 +19,7 @@ from pyspark import SparkContext # $example on$ -from pyspark.mllib.feature import StandardScaler, StandardScalerModel +from pyspark.mllib.feature import StandardScaler from pyspark.mllib.linalg import Vectors from pyspark.mllib.util import MLUtils # $example off$ diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py index 33fc2dfbeefa2..e96a8af71adc3 100644 --- a/examples/src/main/python/sql/hive.py +++ b/examples/src/main/python/sql/hive.py @@ -23,7 +23,7 @@ from __future__ import print_function # $example on:spark_hive$ -from os.path import expanduser, join, abspath +from os.path import join, abspath from pyspark.sql import SparkSession from pyspark.sql import Row diff --git 
a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java index cc65f78b45c30..e3ee843f62449 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java @@ -455,6 +455,7 @@ protected boolean handle(String opt, String value) { conf.put(SparkLauncher.DRIVER_EXTRA_CLASSPATH, value); break; case CONF: + checkArgument(value != null, "Missing argument to %s", CONF); String[] setConf = value.split("=", 2); checkArgument(setConf.length == 2, "Invalid argument to %s: %s", CONF, value); conf.put(setConf[0], setConf[1]); diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index 29358b5740e51..69447a6fb5613 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -17,7 +17,6 @@ import gc import os -import socket import sys from tempfile import NamedTemporaryFile import threading diff --git a/python/pyspark/context.py b/python/pyspark/context.py index e7923009dab8c..94c6f4adab77a 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -439,7 +439,6 @@ def stop(self): ' been killed or may also be in a zombie state.', RuntimeWarning ) - pass finally: self._jsc = None if getattr(self, "_accumulatorServer", None): diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index c8c5f801f89bb..d8315c63a8fce 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -18,7 +18,6 @@ import atexit import os import sys -import select import signal import shlex import shutil @@ -174,8 +173,7 @@ def local_connect_and_auth(port, auth_secret): errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) sock.close() sock = None - else: - raise Exception("could not open socket: %s" % errors) + raise Exception("could not open socket: %s" % errors) def ensure_callback_server_started(gw): diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 6ddfce95a3d4d..89b927814c09e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -23,7 +23,7 @@ from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \ - GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel, TreeEnsembleParams + GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams from pyspark.ml.wrapper import JavaWrapper diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index 734763ebd3fa6..ed71fb0c57591 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -18,7 +18,7 @@ from pyspark import keyword_only, since from pyspark.sql import DataFrame from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, _jvm +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams from pyspark.ml.param.shared import * __all__ = ["FPGrowth", "FPGrowthModel", "PrefixSpan"] diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 78cb4a6703554..9e1f8f88ca1c0 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -16,7 +16,6 @@ # import sys -import warnings from pyspark import since, keyword_only from 
pyspark.ml.param.shared import * diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index e00ed95ef0701..76a27f9bd9698 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -20,12 +20,10 @@ import warnings import numpy -from numpy import array from pyspark import RDD, since -from pyspark.streaming import DStream from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py -from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector +from pyspark.mllib.linalg import SparseVector, _convert_to_vector from pyspark.mllib.regression import ( LabeledPoint, LinearModel, _regression_train_wrapper, StreamingLinearAlgorithm) diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 4f4355ddb60ee..58da434fc38a6 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -33,7 +33,6 @@ from pyspark.rdd import RDD, ignore_unicode_prefix from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector -from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat.distribution import MultivariateGaussian from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable from pyspark.streaming import DStream diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index b171e46871fdf..6ca6df672f304 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -16,12 +16,11 @@ # import sys -import warnings from pyspark import since from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc from pyspark.sql import SQLContext -from pyspark.sql.types import StructField, StructType, DoubleType, IntegerType, ArrayType +from pyspark.sql.types import StructField, StructType, DoubleType __all__ = ['BinaryClassificationMetrics', 'RegressionMetrics', 'MulticlassMetrics', 'RankingMetrics'] diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 6d7d4d61db043..b1bcdb9078e3e 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -22,8 +22,6 @@ import sys import warnings -import random -import binascii if sys.version >= '3': basestring = str unicode = str diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index 6accb9b4926e8..373a141456b2f 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -17,11 +17,9 @@ import sys -import numpy -from numpy import array from collections import namedtuple -from pyspark import SparkContext, since +from pyspark import since from pyspark.rdd import ignore_unicode_prefix from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc from pyspark.mllib.util import JavaSaveable, JavaLoader, inherit_doc diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 6be45f51862c9..adf526bda0b12 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -19,12 +19,11 @@ import warnings import numpy as np -from numpy import array from pyspark import RDD, since from pyspark.streaming.dstream import DStream from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc -from pyspark.mllib.linalg import SparseVector, Vectors, _convert_to_vector +from pyspark.mllib.linalg import SparseVector, _convert_to_vector from pyspark.mllib.util 
import Saveable, Loader __all__ = ['LabeledPoint', 'LinearModel', diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index b05734ce489d9..2d8df461acf9f 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -20,7 +20,7 @@ import sys import random -from pyspark import SparkContext, RDD, since +from pyspark import RDD, since from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 51f20db2927e2..0190bf3cc0e34 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -17,7 +17,6 @@ import sys import numpy as np -import warnings if sys.version > '3': xrange = range @@ -420,7 +419,7 @@ def load(cls, sc, path): was saved. :return: model instance """ - raise NotImplemented + raise NotImplementedError @inherit_doc diff --git a/python/pyspark/profiler.py b/python/pyspark/profiler.py index 3c7656ab5758c..b9423b7604873 100644 --- a/python/pyspark/profiler.py +++ b/python/pyspark/profiler.py @@ -104,11 +104,11 @@ def __init__(self, ctx): def profile(self, func): """ Do profiling on the function `func`""" - raise NotImplemented + raise NotImplementedError def stats(self): """ Return the collected profiling stats (pstats.Stats)""" - raise NotImplemented + raise NotImplementedError def show(self, id): """ Print the profile stats to stdout, id is the RDD id """ diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index b6e17cab44e9c..73969307a5e8d 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -25,7 +25,6 @@ import heapq import bisect import random -import socket from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile from threading import Thread @@ -42,8 +41,7 @@ from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ BatchedSerializer, CloudPickleSerializer, PairDeserializer, \ - PickleSerializer, pack_long, AutoBatchedSerializer, write_with_length, \ - UTF8Deserializer + PickleSerializer, pack_long, AutoBatchedSerializer from pyspark.join import python_join, python_left_outer_join, \ python_right_outer_join, python_full_outer_join, python_cogroup from pyspark.statcounter import StatCounter @@ -53,7 +51,7 @@ from pyspark.shuffle import Aggregator, ExternalMerger, \ get_used_memory, ExternalSorter, ExternalGroupBy from pyspark.traceback_utils import SCCallSiteSync -from pyspark.util import fail_on_stopiteration, _exception_message +from pyspark.util import fail_on_stopiteration __all__ = ["RDD"] diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 1938965a7e175..48a49c583f9c7 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -537,10 +537,8 @@ def _test(): globs['df'] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', - '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' - '"field6":[{"field7": "row2"}]}', - '{"field1" : null, "field2": "row3", ' - '"field3":{"field4":33, "field5": []}}' + '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},"field6":[{"field7": "row2"}]}', + '{"field1" : null, "field2": "row3", "field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 
1d24c40e5858e..22ee5d39db38b 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1018,14 +1018,12 @@ def _infer_type(obj): for key, value in obj.items(): if key is not None and value is not None: return MapType(_infer_type(key), _infer_type(value), True) - else: - return MapType(NullType(), NullType(), True) + return MapType(NullType(), NullType(), True) elif isinstance(obj, list): for v in obj: if v is not None: return ArrayType(_infer_type(obj[0]), True) - else: - return ArrayType(NullType(), True) + return ArrayType(NullType(), True) elif isinstance(obj, array): if obj.typecode in _array_type_mappings: return ArrayType(_array_type_mappings[obj.typecode](), False) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index e1c194b446504..2d84373fb28ac 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -17,10 +17,7 @@ from __future__ import print_function -import os -import sys - -from py4j.java_gateway import java_import, JavaObject +from py4j.java_gateway import java_import from pyspark import RDD, SparkConf from pyspark.serializers import NoOpSerializer, UTF8Deserializer, CloudPickleSerializer diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py index b839859c45252..b3348828fdf6f 100644 --- a/python/pyspark/streaming/kinesis.py +++ b/python/pyspark/streaming/kinesis.py @@ -15,9 +15,7 @@ # limitations under the License. # -from py4j.protocol import Py4JJavaError - -from pyspark.serializers import PairDeserializer, NoOpSerializer +from pyspark.serializers import NoOpSerializer from pyspark.storagelevel import StorageLevel from pyspark.streaming import DStream diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index de4b6af236667..dff5e183bdc78 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -16,7 +16,6 @@ # from __future__ import print_function -import socket from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import write_int, UTF8Deserializer diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index bf007b0c62d8d..1e7424ab3a1b3 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -28,7 +28,6 @@ import resource except ImportError: has_resource_module = False -import socket import traceback from pyspark.accumulators import _accumulatorRegistry diff --git a/python/run-tests.py b/python/run-tests.py index e45268c13769a..7456170ba2d56 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -33,7 +33,6 @@ import Queue else: import queue as Queue -from distutils.version import LooseVersion from multiprocessing import Manager diff --git a/python/setup.py b/python/setup.py index 7da67a4109ed1..22f0940db93e1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -20,7 +20,7 @@ import glob import os import sys -from setuptools import setup, find_packages +from setuptools import setup from shutil import copyfile, copytree, rmtree if sys.version_info < (2, 7): diff --git a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js index 5161fcde669e7..46d3fbc8c3cb4 100644 --- a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js +++ b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js @@ -42,7 +42,7 @@ function renderPlanViz() { setupTooltipForSparkPlanNode(i); 
} - resizeSvg(svg) + resizeSvg(svg); } /* -------------------- * diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js index d004f34ab186c..5b75bc3011b6d 100644 --- a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js +++ b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js @@ -129,7 +129,7 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) { svg.append("g") .attr("class", "x axis") .attr("transform", "translate(0," + height + ")") - .call(xAxis) + .call(xAxis); svg.append("g") .attr("class", "y axis") @@ -198,7 +198,7 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) { lastClickedBatch = null; } lastClickedBatch = d.x; - highlightBatchRow(lastClickedBatch) + highlightBatchRow(lastClickedBatch); lastTimeout = window.setTimeout(function () { lastTimeout = null; if (lastClickedBatch != null) { @@ -261,9 +261,9 @@ function drawHistogram(id, values, minY, maxY, unitY, batchInterval) { svg.append("g") .attr("class", "y axis") - .call(yAxis) + .call(yAxis); - var bar = svg.selectAll(".bar") + svg.selectAll(".bar") .data(data) .enter() .append("g") From 503978db5d9218ccb52d6922ec045a8210875a47 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 16 Jan 2019 20:46:12 -0600 Subject: [PATCH 2/3] Fix style / remove more dead code --- dev/run-tests.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index cc2193f0c0a80..e1ed2744d78b3 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -153,30 +153,6 @@ def determine_java_executable(): return java_exe if java_exe else which("java") -JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch']) - - -def determine_java_version(java_exe): - """Given a valid java executable will return its version in named tuple format - with accessors '.major', '.minor', '.patch', '.update'""" - - raw_output = subprocess.check_output([java_exe, "-version"], - stderr=subprocess.STDOUT, - universal_newlines=True) - - raw_output_lines = raw_output.split('\n') - - # find raw version string, eg 'java version "1.8.0_25"' - raw_version_str = next(x for x in raw_output_lines if " version " in x) - - match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str) - - major = int(match.group(1)) - minor = int(match.group(2)) - patch = int(match.group(3)) - - return JavaVersion(major, minor, patch) - # ------------------------------------------------------------------------------------------------- # Functions for running the other build and test scripts # ------------------------------------------------------------------------------------------------- @@ -494,8 +470,6 @@ def main(): " install one and retry.") sys.exit(2) - #java_version = determine_java_version(java_exe) - # install SparkR if which("R"): run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) From 2c7f66ddf658a4a57d35eac8526cc582d59fdb57 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 17 Jan 2019 15:25:06 +0800 Subject: [PATCH 3/3] Use `numpy.array` instead of `array` in doctests --- python/pyspark/mllib/classification.py | 12 ++++++------ python/pyspark/mllib/regression.py | 21 +++++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 76a27f9bd9698..d2037be2c64f8 100644 --- 
a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -124,9 +124,9 @@ class LogisticRegressionModel(LinearClassificationModel): ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) ... ] >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data), iterations=10) - >>> lrm.predict(array([0.0, 1.0])) + >>> lrm.predict(numpy.array([0.0, 1.0])) 1 - >>> lrm.predict(array([1.0, 0.0])) + >>> lrm.predict(numpy.array([1.0, 0.0])) 0 >>> lrm.predict(SparseVector(2, {1: 1.0})) 1 @@ -136,7 +136,7 @@ class LogisticRegressionModel(LinearClassificationModel): >>> path = tempfile.mkdtemp() >>> lrm.save(sc, path) >>> sameModel = LogisticRegressionModel.load(sc, path) - >>> sameModel.predict(array([0.0, 1.0])) + >>> sameModel.predict(numpy.array([0.0, 1.0])) 1 >>> sameModel.predict(SparseVector(2, {0: 1.0})) 0 @@ -422,7 +422,7 @@ class SVMModel(LinearClassificationModel): >>> svm.predict(sc.parallelize([[1.0]])).collect() [1] >>> svm.clearThreshold() - >>> svm.predict(array([1.0])) + >>> svm.predict(numpy.array([1.0])) 1.44... >>> sparse_data = [ @@ -575,9 +575,9 @@ class NaiveBayesModel(Saveable, Loader): ... LabeledPoint(1.0, [1.0, 0.0]), ... ] >>> model = NaiveBayes.train(sc.parallelize(data)) - >>> model.predict(array([0.0, 1.0])) + >>> model.predict(numpy.array([0.0, 1.0])) 0.0 - >>> model.predict(array([1.0, 0.0])) + >>> model.predict(numpy.array([1.0, 0.0])) 1.0 >>> model.predict(sc.parallelize([[1.0, 0.0]])).collect() [1.0] diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index adf526bda0b12..56ee0083abca4 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -167,15 +167,15 @@ class LinearRegressionModel(LinearRegressionModelBase): ... LabeledPoint(2.0, SparseVector(1, {0: 3.0})) ... ] >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) - >>> abs(lrm.predict(array([0.0])) - 0) < 0.5 + ... initialWeights=np.array([1.0])) + >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0, - ... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2", + ... miniBatchFraction=1.0, initialWeights=np.array([1.0]), regParam=0.1, regType="l2", ... intercept=True, validateData=True) - >>> abs(lrm.predict(array([0.0])) - 0) < 0.5 + >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True @@ -304,7 +304,8 @@ class LassoModel(LinearRegressionModelBase): ... LabeledPoint(3.0, [2.0]), ... LabeledPoint(2.0, [3.0]) ... ] - >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, initialWeights=array([1.0])) + >>> lrm = LassoWithSGD.train( + ... sc.parallelize(data), iterations=10, initialWeights=np.array([1.0])) >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5 @@ -335,13 +336,13 @@ class LassoModel(LinearRegressionModelBase): ... LabeledPoint(2.0, SparseVector(1, {0: 3.0})) ... ] >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) + ... initialWeights=np.array([1.0])) >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, step=1.0, - ... 
regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True, + ... regParam=0.01, miniBatchFraction=1.0, initialWeights=np.array([1.0]), intercept=True, ... validateData=True) >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True @@ -448,7 +449,7 @@ class RidgeRegressionModel(LinearRegressionModelBase): ... LabeledPoint(2.0, [3.0]) ... ] >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) + ... initialWeights=np.array([1.0])) >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5 @@ -479,13 +480,13 @@ class RidgeRegressionModel(LinearRegressionModelBase): ... LabeledPoint(2.0, SparseVector(1, {0: 3.0})) ... ] >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) + ... initialWeights=np.array([1.0])) >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0, - ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True, + ... regParam=0.01, miniBatchFraction=1.0, initialWeights=np.array([1.0]), intercept=True, ... validateData=True) >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 True
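
A note tying PATCH 3/3 back to PATCH 1/3: removing `from numpy import array` from python/pyspark/mllib/classification.py and regression.py left doctests calling a bare array(...), which would now fail with NameError, so the follow-up qualifies those calls to match each module's surviving import. A minimal illustration, assuming regression.py's existing `import numpy as np`:

    import numpy as np

    initialWeights = np.array([1.0])  # was: array([1.0]), which relied on the removed import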
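
A note on the `raise NotImplemented` fixes in python/pyspark/mllib/util.py and python/pyspark/profiler.py (PATCH 1/3): NotImplemented is the sentinel value that binary operators return, not an exception class, so raising it fails with a TypeError at the raise site instead of signaling an unimplemented method. A minimal sketch of the before and after, using a hypothetical class:

    class Loader(object):
        def load(self, sc, path):
            # Before: raise NotImplemented
            #   -> TypeError: exceptions must derive from BaseException
            # After: the intended "subclass must override this" signal
            raise NotImplementedError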
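
A note on the for/else removal in python/pyspark/java_gateway.py (PATCH 1/3): the loop body returns as soon as a connection succeeds, so the trailing raise is reachable only when every candidate address fails; the else: clause on the for loop was therefore redundant. A simplified sketch of the resulting control flow (this is an illustration of the pattern, not the function as committed):

    import socket

    def connect_first(port, host="127.0.0.1"):
        errors = []
        for res in socket.getaddrinfo(host, port, socket.AF_UNSPEC, socket.SOCK_STREAM):
            family, socktype, proto, _, sa = res
            sock = socket.socket(family, socktype, proto)
            try:
                sock.connect(sa)
                return sock  # success short-circuits the loop
            except socket.error as e:
                errors.append("tried to connect to %s, but got: %s" % (sa, e))
                sock.close()
        # reached only when every candidate address failed, so no for/else is needed
        raise Exception("could not open socket: %s" % errors)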
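
A note on the MulticlassMetrics change in examples/src/main/python/mllib/multi_class_metrics_example.py (PATCH 1/3): the zero-argument precision()/recall()/fMeasure() forms were deprecated (overall accuracy is exposed as the accuracy property), so the example now asks for per-label statistics. A runnable sketch with made-up prediction/label pairs standing in for real model output:

    from pyspark import SparkContext
    from pyspark.mllib.evaluation import MulticlassMetrics

    sc = SparkContext(appName="MulticlassMetricsSketch")
    # (prediction, label) pairs standing in for real model output
    predictionAndLabels = sc.parallelize(
        [(0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)])
    metrics = MulticlassMetrics(predictionAndLabels)
    print("Precision(1.0) = %s" % metrics.precision(1.0))
    print("Recall(1.0) = %s" % metrics.recall(1.0))
    print("F1(1.0) = %s" % metrics.fMeasure(1.0))
    print("Accuracy = %s" % metrics.accuracy)  # successor to the zero-arg precision()
    sc.stop()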