diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md index 4d9b6416c01cb..2f410cf8bfd94 100644 --- a/R/CRAN_RELEASE.md +++ b/R/CRAN_RELEASE.md @@ -25,7 +25,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. -Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`). +Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it is preferable to run `R CMD check` on the source package built manually before uploading a release. Also note that for the CRAN checks of PDF vignettes to succeed, the `qpdf` tool must be installed (to install it, e.g. `yum -q -y install qpdf`). To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible. diff --git a/R/install-dev.bat b/R/install-dev.bat index c570d93049a14..ae5aa589a19d1 100644 --- a/R/install-dev.bat +++ b/R/install-dev.bat @@ -26,7 +26,7 @@ MKDIR %SPARK_HOME%\R\lib rem When you pass the package path directly as an argument to R CMD INSTALL, rem it takes the path as 'C:\projects\spark\R\..\R\pkg"' as an example at -rem R 4.0. To work around this, directly go to the directoy and install it. +rem R 4.0. To work around this, directly go to the directory and install it. rem See also SPARK-32074 pushd %SPARK_HOME%\R\pkg\ R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" . diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 2ce53782d9af0..31a651ea1279b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2772,7 +2772,7 @@ setMethod("merge", #' Creates a list of columns by replacing the intersected ones with aliases #' #' Creates a list of columns by replacing the intersected ones with aliases. -#' The name of the alias column is formed by concatanating the original column name and a suffix. +#' The name of the alias column is formed by concatenating the original column name and a suffix. #' #' @param x a SparkDataFrame #' @param intersectedColNames a list of intersected column names of the SparkDataFrame @@ -3231,7 +3231,7 @@ setMethod("describe", #' \item stddev #' \item min #' \item max -#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%") +#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75\%") #' } #' If no statistics are given, this function computes count, mean, stddev, min, #' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max. @@ -3743,7 +3743,7 @@ setMethod("histogram", #' #' @param x a SparkDataFrame. #' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}. -#' @param tableName yhe name of the table in the external database.
+#' @param tableName the name of the table in the external database. #' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore' #' save mode (it is 'error' by default) #' @param ... additional JDBC database connection properties. diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7a1d157bb8a36..408a3ff25b2b2 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -970,7 +970,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", MAXINT))))) # If the first sample didn't turn out large enough, keep trying to # take samples; this shouldn't happen often because we use a big - # multiplier for thei initial size + # multiplier for the initial size while (length(samples) < total) samples <- collectRDD(sampleRDD(x, withReplacement, fraction, as.integer(ceiling(stats::runif(1, @@ -1512,7 +1512,7 @@ setMethod("glom", #' #' @param x An RDD. #' @param y An RDD. -#' @return a new RDD created by performing the simple union (witout removing +#' @return a new RDD created by performing the simple union (without removing #' duplicates) of two input RDDs. #' @examples #'\dontrun{ diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index c0ac68332ec41..5ed0481f33d8f 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -203,7 +203,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { }) } - # SPAKR-SQL does not support '.' in column name, so replace it with '_' + # SPARK-SQL does not support '.' in column name, so replace it with '_' # TODO(davies): remove this once SPARK-2775 is fixed names <- lapply(names, function(n) { nn <- gsub(".", "_", n, fixed = TRUE) diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 037809cd0923e..be47d0117ed7f 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -54,7 +54,7 @@ setMethod("show", "WindowSpec", #' Defines the partitioning columns in a WindowSpec. #' #' @param x a WindowSpec. -#' @param col a column to partition on (desribed by the name or Column). +#' @param col a column to partition on (described by the name or Column). #' @param ... additional column(s) to partition on. #' @return A WindowSpec. #' @rdname partitionBy @@ -231,7 +231,7 @@ setMethod("rangeBetween", #' @rdname over #' @name over #' @aliases over,Column,WindowSpec-method -#' @family colum_func +#' @family column_func #' @examples #' \dontrun{ #' df <- createDataFrame(mtcars) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 835178990b485..9fa117ccb6281 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -135,7 +135,7 @@ createMethods() #' @rdname alias #' @name alias #' @aliases alias,Column-method -#' @family colum_func +#' @family column_func #' @examples #' \dontrun{ #' df <- createDataFrame(iris) @@ -161,7 +161,7 @@ setMethod("alias", #' #' @rdname substr #' @name substr -#' @family colum_func +#' @family column_func #' @aliases substr,Column-method #' #' @param x a Column. 
@@ -187,7 +187,7 @@ setMethod("substr", signature(x = "Column"), #' #' @rdname startsWith #' @name startsWith -#' @family colum_func +#' @family column_func #' @aliases startsWith,Column-method #' #' @param x vector of character string whose "starts" are considered @@ -206,7 +206,7 @@ setMethod("startsWith", signature(x = "Column"), #' #' @rdname endsWith #' @name endsWith -#' @family colum_func +#' @family column_func #' @aliases endsWith,Column-method #' #' @param x vector of character string whose "ends" are considered @@ -224,7 +224,7 @@ setMethod("endsWith", signature(x = "Column"), #' #' @rdname between #' @name between -#' @family colum_func +#' @family column_func #' @aliases between,Column-method #' #' @param x a Column @@ -251,7 +251,7 @@ setMethod("between", signature(x = "Column"), # nolint end #' @rdname cast #' @name cast -#' @family colum_func +#' @family column_func #' @aliases cast,Column-method #' #' @examples @@ -300,7 +300,7 @@ setMethod("%in%", #' Can be a single value or a Column. #' @rdname otherwise #' @name otherwise -#' @family colum_func +#' @family column_func #' @aliases otherwise,Column-method #' @note otherwise since 1.5.0 setMethod("otherwise", @@ -440,7 +440,7 @@ setMethod("withField", #' ) #' #' # However, if you are going to add/replace multiple nested fields, -#' # it is preffered to extract out the nested struct before +#' # it is preferred to extract out the nested struct before #' # adding/replacing multiple fields e.g. #' head( #' withColumn( diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index e3c9d9f8793d6..cca6c2c817de9 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -86,7 +86,7 @@ makeSplits <- function(numSerializedSlices, length) { # For instance, for numSerializedSlices of 22, length of 50 # [1] 0 0 2 2 4 4 6 6 6 9 9 11 11 13 13 15 15 15 18 18 20 20 22 22 22 # [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47 - # Notice the slice group with 3 slices (ie. 6, 15, 22) are roughly evenly spaced. + # Notice the slice group with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced. # We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD if (numSerializedSlices > 0) { unlist(lapply(0: (numSerializedSlices - 1), function(x) { @@ -116,7 +116,7 @@ makeSplits <- function(numSerializedSlices, length) { #' This change affects both createDataFrame and spark.lapply. #' In the specific one case that it is used to convert R native object into SparkDataFrame, it has #' always been kept at the default of 1. In the case the object is large, we are explicitly setting -#' the parallism to numSlices (which is still 1). +#' the parallelism to numSlices (which is still 1). #' #' Specifically, we are changing to split positions to match the calculation in positions() of #' ParallelCollectionRDD in Spark. diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 5d22340fb62a0..89a8fbecd36b0 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -250,7 +250,7 @@ readDeserializeWithKeysInArrow <- function(inputCon) { keys <- readMultipleObjects(inputCon) - # Read keys to map with each groupped batch later. + # Read keys to map with each grouped batch later. list(keys = keys, data = data) } diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 039d28a3a37b6..d5a5861d79b15 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -144,7 +144,7 @@ NULL #' @param y Column to compute on. #' @param pos In \itemize{ #' \item \code{locate}: a start position of search. 
-#' \item \code{overlay}: a start postiton for replacement. +#' \item \code{overlay}: a start position for replacement. #' } #' @param len In \itemize{ #' \item \code{lpad} the maximum length of each output result. @@ -2879,7 +2879,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), }) #' @details -#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is +#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is #' a long value, it will return a long value else it will return an integer value. #' #' @rdname column_math_functions diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ea2c0b4c0f42f..5bc5ae07c5f03 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -289,7 +289,7 @@ sparkCachePath <- function() { } # Length of the Spark cache specific relative path segments for each platform -# eg. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix +# e.g. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix # Must match sparkCachePath() exactly. sparkCacheRelPathLength <- function() { if (is_windows()) { diff --git a/R/pkg/R/mllib_fpm.R b/R/pkg/R/mllib_fpm.R index 30bc51b932041..65a43514930f0 100644 --- a/R/pkg/R/mllib_fpm.R +++ b/R/pkg/R/mllib_fpm.R @@ -125,7 +125,7 @@ setMethod("spark.freqItemsets", signature(object = "FPGrowthModel"), #' The \code{SparkDataFrame} contains five columns: #' \code{antecedent} (an array of the same type as the input column), #' \code{consequent} (an array of the same type as the input column), -#' \code{condfidence} (confidence for the rule) +#' \code{confidence} (confidence for the rule) #' \code{lift} (lift for the rule) #' and \code{support} (support for the rule) #' @rdname spark.fpGrowth diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index f6aa48f5fa04a..b5a014b0a3cfd 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -53,7 +53,7 @@ setClass("DecisionTreeRegressionModel", representation(jobj = "jobj")) #' @note DecisionTreeClassificationModel since 2.3.0 setClass("DecisionTreeClassificationModel", representation(jobj = "jobj")) -# Create the summary of a tree ensemble model (eg. Random Forest, GBT) +# Create the summary of a tree ensemble model (e.g. Random Forest, GBT) summary.treeEnsemble <- function(model) { jobj <- model@jobj formula <- callJMethod(jobj, "formula") @@ -73,7 +73,7 @@ summary.treeEnsemble <- function(model) { jobj = jobj) } -# Prints the summary of tree ensemble models (eg. Random Forest, GBT) +# Prints the summary of tree ensemble models (e.g. Random Forest, GBT) print.summary.treeEnsemble <- function(x) { jobj <- x$jobj cat("Formula: ", x$formula) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R index f38f1ac3a6b4c..d943d8d0ab4c0 100644 --- a/R/pkg/R/mllib_utils.R +++ b/R/pkg/R/mllib_utils.R @@ -18,7 +18,7 @@ # mllib_utils.R: Utilities for MLlib integration # Integration with R's standard functions. -# Most of MLlib's argorithms are provided in two flavours: +# Most of MLlib's algorithms are provided in two flavours: # - a specialization of the default R methods (glm). These methods try to respect # the inputs and the outputs of R's method to the largest extent, but some small differences # may exist. 
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index b29381bb900fb..41676be03e951 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -239,7 +239,7 @@ setMethod("partitionByRDD", javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner) # Call .values() on the result to get back the final result, the - # shuffled acutal content key-val pairs. + # shuffled actual content key-val pairs. r <- callJMethod(javaPairRDD, "values") RDD(r, serializedMode = "byte") @@ -411,7 +411,7 @@ setMethod("reduceByKeyLocally", #' \itemize{ #' \item createCombiner, which turns a V into a C (e.g., creates a one-element list) #' \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list) - -#' \item mergeCombiners, to combine two C's into a single one (e.g., concatentates +#' \item mergeCombiners, to combine two C's into a single one (e.g., concatenates #' two lists). #' } #' diff --git a/R/pkg/R/streaming.R b/R/pkg/R/streaming.R index 5eccbdc9d3818..2bcfb363f9d24 100644 --- a/R/pkg/R/streaming.R +++ b/R/pkg/R/streaming.R @@ -93,7 +93,7 @@ setMethod("explain", #' lastProgress #' -#' Prints the most recent progess update of this streaming query in JSON format. +#' Prints the most recent progress update of this streaming query in JSON format. #' #' @param x a StreamingQuery. #' @rdname lastProgress diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 5d48a9eee2799..dfa83c35665ce 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -68,7 +68,7 @@ rToSQLTypes <- as.environment(list( "character" = "string", "logical" = "boolean")) -# Helper function of coverting decimal type. When backend returns column type in the +# Helper function of converting decimal type. When backend returns column type in the # format of decimal(,) (e.g., decimal(10, 0)), this function coverts the column type # as double type. This function converts backend returned types that are not the key # of PRIMITIVE_TYPES, but should be treated as PRIMITIVE_TYPES. diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index d6f9f927d5cdc..264cbfc9ba929 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -930,7 +930,7 @@ getOne <- function(x, envir, inherits = TRUE, ifnotfound = NULL) { } # Returns a vector of parent directories, traversing up count times, starting with a full path -# eg. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return +# e.g. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return # this "/Users/user/Library/Caches/spark/spark2.2" # and "/Users/user/Library/Caches/spark" traverseParentDirs <- function(x, count) { diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index fb9db63b07cd0..4589bb9c6ad1b 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -32,7 +32,7 @@ inputCon <- socketConnection( SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) -# Waits indefinitely for a socket connecion by default. +# Waits indefinitely for a socket connection by default. selectTimeout <- NULL while (TRUE) { @@ -72,7 +72,7 @@ while (TRUE) { } }) } else if (is.null(children)) { - # If it is NULL, there are no children. Waits indefinitely for a socket connecion. + # If it is NULL, there are no children. Waits indefinitely for a socket connection. 
selectTimeout <- NULL } diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 1ef05ea621e83..dd271f91d0084 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -85,7 +85,7 @@ outputResult <- function(serializer, output, outputCon) { } # Constants -specialLengths <- list(END_OF_STERAM = 0L, TIMING_DATA = -1L) +specialLengths <- list(END_OF_STREAM = 0L, TIMING_DATA = -1L) # Timing R process boot bootTime <- currentTimeSecs() @@ -180,7 +180,7 @@ if (isEmpty != 0) { } else if (deserializer == "arrow" && mode == 1) { data <- SparkR:::readDeserializeInArrow(inputCon) # See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html - # rbind.fill might be an anternative to make it faster if plyr is installed. + # rbind.fill might be an alternative to make it faster if plyr is installed. # Also, note that, 'dapply' applies a function to each partition. data <- do.call("rbind", data) } @@ -212,7 +212,7 @@ if (isEmpty != 0) { if (serializer == "arrow") { # See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html - # rbind.fill might be an anternative to make it faster if plyr is installed. + # rbind.fill might be an alternative to make it faster if plyr is installed. combined <- do.call("rbind", outputs) SparkR:::writeSerializeInArrow(outputCon, combined) } @@ -285,7 +285,7 @@ SparkR:::writeDouble(outputCon, computeInputElapsDiff) # compute SparkR:::writeDouble(outputCon, outputComputeElapsDiff) # output # End of output -SparkR:::writeInt(outputCon, specialLengths$END_OF_STERAM) +SparkR:::writeInt(outputCon, specialLengths$END_OF_STREAM) close(outputCon) close(inputCon) diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index e01f6ee005218..a52289e43ca5e 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -125,7 +125,7 @@ test_that("SerDe of list of lists", { sparkR.session.stop() -# Note that this test should be at the end of tests since the configruations used here are not +# Note that this test should be at the end of tests since the configurations used here are not # specific to sessions, and the Spark context is restarted. 
test_that("createDataFrame large objects", { for (encryptionEnabled in list("true", "false")) { diff --git a/R/pkg/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R index 8b3b4f73de170..3bf6ae556c079 100644 --- a/R/pkg/tests/fulltests/test_jvm_api.R +++ b/R/pkg/tests/fulltests/test_jvm_api.R @@ -20,11 +20,11 @@ context("JVM API") sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) test_that("Create and call methods on object", { - jarr <- sparkR.newJObject("java.util.ArrayList") + jarray <- sparkR.newJObject("java.util.ArrayList") # Add an element to the array - sparkR.callJMethod(jarr, "add", 1L) + sparkR.callJMethod(jarray, "add", 1L) # Check if get returns the same element - expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L) + expect_equal(sparkR.callJMethod(jarray, "get", 0L), 1L) }) test_that("Call static methods", { diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 45de1ef1bd3d1..0b4ecca097c59 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2092,7 +2092,7 @@ test_that("higher order functions", { createDataFrame(data.frame(id = 1)), expr("CAST(array(1.0, 2.0, -3.0, -4.0) AS array) xs"), expr("CAST(array(0.0, 3.0, 48.0) AS array) ys"), - expr("array('FAILED', 'SUCCEDED') as vs"), + expr("array('FAILED', 'SUCCEEDED') as vs"), expr("map('foo', 1, 'bar', 2) as mx"), expr("map('foo', 42, 'bar', -1, 'baz', 0) as my") ) @@ -3666,7 +3666,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { } # Computes the arithmetic mean of the second column by grouping - # on the first and third columns. Output the groupping value and the average. + # on the first and third columns. Output the grouping value and the average. schema <- structType(structField("a", "integer"), structField("c", "string"), structField("avg", "double")) df3 <- gapply( @@ -3964,7 +3964,7 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { paste("Error in listFunctions : analysis error - Database", "'zxwtyswklpf_db' does not exist")) - # recoverPartitions does not work with tempory view + # recoverPartitions does not work with temporary view expect_error(recoverPartitions("cars"), "no such table - Table or view 'cars' not found in database 'default'") expect_error(refreshTable("cars"), NA) diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R index c3fb9046fcda4..6c83a137cfb7b 100644 --- a/R/pkg/tests/fulltests/test_utils.R +++ b/R/pkg/tests/fulltests/test_utils.R @@ -116,7 +116,7 @@ test_that("cleanClosure on R functions", { actual <- get("y", envir = env, inherits = FALSE) expect_equal(actual, y) - # Test for combination for nested and sequenctial functions in a closure + # Test for combination for nested and sequential functions in a closure f1 <- function(x) x + 1 f2 <- function(x) f1(x) + 2 userFunc <- function(x) { f1(x); f2(x) } diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 3713e6c784855..a0608748696a3 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -146,7 +146,7 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. 
-After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (e.g. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java index d7423537ddfcf..4d7f76f673865 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java @@ -133,7 +133,7 @@ class LevelDBTypeInfo { // First create the parent indices, then the child indices. ti.indices().forEach(idx -> { - // In LevelDB, there is no parent index for the NUTURAL INDEX. + // In LevelDB, there is no parent index for the NATURAL INDEX. if (idx.parent().isEmpty() || idx.value().equals(KVIndex.NATURAL_INDEX_NAME)) { indices.put(idx.value(), new Index(idx, ti.getAccessor(idx.value()), null)); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index 6dcc703e92669..eb2882074d7c7 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -303,7 +303,7 @@ public void close() { @Override public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) - .append("remoteAdress", channel.remoteAddress()) + .append("remoteAddress", channel.remoteAddress()) .append("clientId", clientId) .append("isActive", isActive()) .toString(); diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java index 64fdb32a67ada..c2b2edc7f07d5 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java @@ -287,7 +287,7 @@ private byte[] doCipherOp(int mode, byte[] in, boolean isFinal) } } } catch (InternalError ie) { - // SPARK-25535. The commons-cryto library will throw InternalError if something goes wrong, + // SPARK-25535. The commons-crypto library will throw InternalError if something goes wrong, // and leave bad state behind in the Java wrappers, so it's not safe to use them afterwards. 
if (mode == Cipher.ENCRYPT_MODE) { this.encryptor = null; diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java index 0790f0079c2bd..1c2061699a128 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java @@ -150,8 +150,8 @@ public void testEncryptedMessage() throws Exception { ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(buf); - while (emsg.transfered() < emsg.count()) { - emsg.transferTo(channel, emsg.transfered()); + while (emsg.transferred() < emsg.count()) { + emsg.transferTo(channel, emsg.transferred()); } assertEquals(data.length, channel.length()); } finally { @@ -196,9 +196,9 @@ public Long answer(InvocationOnMock invocationOnMock) throws Throwable { TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(region); ByteArrayWritableChannel channel = new ByteArrayWritableChannel(testDataLength); // "transferTo" should act correctly when the underlying FileRegion transfers 0 bytes. - assertEquals(0L, emsg.transferTo(channel, emsg.transfered())); - assertEquals(testDataLength, emsg.transferTo(channel, emsg.transfered())); - assertEquals(emsg.transfered(), emsg.count()); + assertEquals(0L, emsg.transferTo(channel, emsg.transferred())); + assertEquals(testDataLength, emsg.transferTo(channel, emsg.transferred())); + assertEquals(emsg.transferred(), emsg.count()); assertEquals(4, channel.length()); } finally { client.close(); diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java index 3bff34e210e3c..af1c2878672c0 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java @@ -129,8 +129,8 @@ private void testFileRegionBody(int totalWrites, int writesPerCall) throws Excep private ByteBuf doWrite(MessageWithHeader msg, int minExpectedWrites) throws Exception { int writes = 0; ByteArrayWritableChannel channel = new ByteArrayWritableChannel((int) msg.count()); - while (msg.transfered() < msg.count()) { - msg.transferTo(channel, msg.transfered()); + while (msg.transferred() < msg.count()) { + msg.transferTo(channel, msg.transferred()); writes++; } assertTrue("Not enough writes!", minExpectedWrites <= writes); diff --git a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java index ecaeec98da182..32c9acd327213 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java @@ -191,28 +191,28 @@ public void testEncryptedMessage() throws Exception { SaslEncryption.EncryptedMessage emsg = new SaslEncryption.EncryptedMessage(backend, msg, 1024); - long count = emsg.transferTo(channel, emsg.transfered()); + long count = emsg.transferTo(channel, emsg.transferred()); assertTrue(count < data.length); assertTrue(count > 0); // Here, the output buffer is full so 
nothing should be transferred. - assertEquals(0, emsg.transferTo(channel, emsg.transfered())); + assertEquals(0, emsg.transferTo(channel, emsg.transferred())); // Now there's room in the buffer, but not enough to transfer all the remaining data, // so the dummy count should be returned. channel.reset(); - assertEquals(1, emsg.transferTo(channel, emsg.transfered())); + assertEquals(1, emsg.transferTo(channel, emsg.transferred())); // Eventually, the whole message should be transferred. for (int i = 0; i < data.length / 32 - 2; i++) { channel.reset(); - assertEquals(1, emsg.transferTo(channel, emsg.transfered())); + assertEquals(1, emsg.transferTo(channel, emsg.transferred())); } channel.reset(); - count = emsg.transferTo(channel, emsg.transfered()); + count = emsg.transferTo(channel, emsg.transferred()); assertTrue("Unexpected count: " + count, count > 1 && count < data.length); - assertEquals(data.length, emsg.transfered()); + assertEquals(data.length, emsg.transferred()); } finally { msg.release(); } @@ -237,9 +237,9 @@ public void testEncryptedMessageChunking() throws Exception { new SaslEncryption.EncryptedMessage(backend, msg.convertToNetty(), data.length / 8); ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); - while (emsg.transfered() < emsg.count()) { + while (emsg.transferred() < emsg.count()) { channel.reset(); - emsg.transferTo(channel, emsg.transfered()); + emsg.transferTo(channel, emsg.transferred()); } verify(backend, times(8)).wrap(any(byte[].class), anyInt(), anyInt()); diff --git a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java index 45e1836da641f..634b40ed450ee 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java @@ -72,7 +72,7 @@ public void testMissingChunk() { Assert.assertNotNull(getChunk(manager, streamId, 2)); manager.connectionTerminated(dummyChannel); - // loaded buffers are not released yet as in production a MangedBuffer returned by getChunk() + // loaded buffers are not released yet as in production a ManagedBuffer returned by getChunk() // would only be released by Netty after it is written to the network Mockito.verify(buffer1, Mockito.never()).release(); Mockito.verify(buffer2, Mockito.never()).release(); diff --git a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java index 4b67aa80351d2..163c52b023822 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java @@ -98,7 +98,7 @@ public void testConsolidationPerf() throws Exception { writtenBytes += pieceBytes; } logger.info("Writing 300MiB frame buf with consolidation of threshold " + threshold - + " took " + totalTime + " milis"); + + " took " + totalTime + " millis"); } finally { for (ByteBuf buf : retained) { release(buf); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java index 670612fd6f66a..97ecaa627b66c 100644 
--- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java @@ -32,7 +32,7 @@ * A DownloadFile that does not take any encryption settings into account for reading and * writing data. * - * This does *not* mean the data in the file is un-encrypted -- it could be that the data is + * This does *not* mean the data in the file is unencrypted -- it could be that the data is * already encrypted when its written, and subsequent layer is responsible for decrypting. */ public class SimpleDownloadFile implements DownloadFile { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index b8dda22240042..c6aa5f0b58285 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -635,13 +635,13 @@ public UTF8String trimLeft() { public UTF8String trimLeft(UTF8String trimString) { if (trimString == null) return null; // the searching byte position in the source string - int srchIdx = 0; + int searchIdx = 0; // the first beginning byte position of a non-matching character int trimIdx = 0; - while (srchIdx < numBytes) { + while (searchIdx < numBytes) { UTF8String searchChar = copyUTF8String( - srchIdx, srchIdx + numBytesForFirstByte(this.getByte(srchIdx)) - 1); + searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1); int searchCharBytes = searchChar.numBytes; // try to find the matching for the searchChar in the trimString set if (trimString.find(searchChar, 0) >= 0) { @@ -650,9 +650,9 @@ public UTF8String trimLeft(UTF8String trimString) { // no matching, exit the search break; } - srchIdx += searchCharBytes; + searchIdx += searchCharBytes; } - if (srchIdx == 0) { + if (searchIdx == 0) { // Nothing trimmed return this; } diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index 69a082053aa65..ab488e18ba3f4 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -192,7 +192,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp } } - val nullalbeSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString)) + val nullableSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString)) test("concat") { def concat(origin: Seq[String]): String = @@ -201,7 +201,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp forAll { (inputs: Seq[String]) => assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(inputs.mkString)) } - forAll (nullalbeSeq) { (inputs: Seq[String]) => + forAll (nullableSeq) { (inputs: Seq[String]) => assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(concat(inputs))) } } @@ -216,7 +216,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) === toUTF8(inputs.mkString(sep))) } - forAll(randomString, nullalbeSeq) {(sep: String, inputs: Seq[String]) => + forAll(randomString, nullableSeq) {(sep: String, inputs: Seq[String]) => 
assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) === toUTF8(concatWs(sep, inputs))) } diff --git a/dev/appveyor-guide.md b/dev/appveyor-guide.md index a8c0c1ef23ac3..c68b5de9e61d0 100644 --- a/dev/appveyor-guide.md +++ b/dev/appveyor-guide.md @@ -33,22 +33,22 @@ Currently, SparkR on Windows is being tested with [AppVeyor](https://ci.appveyor 2016-09-04 11 07 58 -- Click "Github". +- Click "GitHub". 2016-09-04 11 08 10 -#### After signing up, go to profile to link Github and AppVeyor. +#### After signing up, go to profile to link GitHub and AppVeyor. - Click your account and then click "Profile". 2016-09-04 11 09 43 -- Enable the link with GitHub via clicking "Link Github account". +- Enable the link with GitHub via clicking "Link GitHub account". 2016-09-04 11 09 52 -- Click "Authorize application" in Github site. +- Click "Authorize application" in GitHub site. 2016-09-04 11 10 05 @@ -63,11 +63,11 @@ Currently, SparkR on Windows is being tested with [AppVeyor](https://ci.appveyor 2016-08-30 12 16 35 -- Since we will use Github here, click the "GITHUB" button and then click "Authorize Github" so that AppVeyor can access the Github logs (e.g. commits). +- Since we will use GitHub here, click the "GITHUB" button and then click "Authorize GitHub" so that AppVeyor can access the GitHub logs (e.g. commits). 2016-09-04 11 10 22 -- Click "Authorize application" from Github (the above step will pop up this page). +- Click "Authorize application" from GitHub (the above step will pop up this page). 2016-09-04 11 10 27 diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations index ff41cccde0140..64bd9ada1bf61 100644 --- a/dev/create-release/known_translations +++ b/dev/create-release/known_translations @@ -1,5 +1,5 @@ # This is a mapping of names to be translated through translate-contributors.py -# The format expected on each line should be: - +# The format expected on each line should be: - 012huang - Weiyi Huang 07ARB - Ankit Raj Boudh 10110346 - Xian Liu diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 240f4c8dfd371..d2953a86afafd 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -452,7 +452,7 @@ if [[ "$1" == "publish-release" ]]; then if ! is_dry_run; then nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id - echo "Uplading files to $nexus_upload" + echo "Uploading files to $nexus_upload" for file in $(find . -type f) do # strip leading ./ diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index cc7ad931198a2..a0e9695d58361 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -110,7 +110,7 @@ def __str__(self): # Under the hood, this runs a `git log` on that tag and parses the fields # from the command output to construct a list of Commit objects. Note that # because certain fields reside in the commit description and cannot be parsed -# through the Github API itself, we need to do some intelligent regex parsing +# through the GitHub API itself, we need to do some intelligent regex parsing # to extract those fields. # # This is written using Git 1.8.5. 
@@ -140,7 +140,7 @@ def get_commits(tag): sys.exit("Unexpected format in commit: %s" % commit_digest) [_hash, author, title] = commit_digest.split(field_end_marker) # The PR number and github username is in the commit message - # itself and cannot be accessed through any Github API + # itself and cannot be accessed through any GitHub API pr_number = None match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) if match: @@ -252,7 +252,7 @@ def nice_join(str_list): return ", ".join(str_list[:-1]) + ", and " + str_list[-1] -# Return the full name of the specified user on Github +# Return the full name of the specified user on GitHub # If the user doesn't exist, return None def get_github_name(author, github_client): if github_client: diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 8340266527fc6..be5611ce65a7d 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -17,7 +17,7 @@ # This script translates invalid authors in the contributors list generated # by generate-contributors.py. When the script encounters an author name that -# is considered invalid, it searches Github and JIRA in an attempt to search +# is considered invalid, it searches GitHub and JIRA in an attempt to search # for replacements. This tool runs in two modes: # # (1) Interactive mode: For each invalid author name, this script presents @@ -68,7 +68,7 @@ if INTERACTIVE_MODE: print("Running in interactive mode. To disable this, provide the --non-interactive flag.") -# Setup Github and JIRA clients +# Setup GitHub and JIRA clients jira_options = {"server": JIRA_API_BASE} jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) github_client = Github(GITHUB_API_TOKEN) @@ -89,11 +89,11 @@ # Generate candidates for the given author. This should only be called if the given author # name does not represent a full name as this operation is somewhat expensive. Under the -# hood, it makes several calls to the Github and JIRA API servers to find the candidates. +# hood, it makes several calls to the GitHub and JIRA API servers to find the candidates. # # This returns a list of (candidate name, source) 2-tuples. E.g. 
# [ -# (NOT_FOUND, "No full name found for Github user andrewor14"), +# (NOT_FOUND, "No full name found for GitHub user andrewor14"), # ("Andrew Or", "Full name of JIRA user andrewor14"), # ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), # ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), @@ -104,12 +104,12 @@ def generate_candidates(author, issues): candidates = [] - # First check for full name of Github user + # First check for full name of GitHub user github_name = get_github_name(author, github_client) if github_name: - candidates.append((github_name, "Full name of Github user %s" % author)) + candidates.append((github_name, "Full name of GitHub user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) + candidates.append((NOT_FOUND, "No full name found for GitHub user %s" % author)) # Then do the same for JIRA user jira_name = get_jira_name(author, jira_client) if jira_name: @@ -151,7 +151,7 @@ def generate_candidates(author, issues): candidates[i] = (candidate, source) return candidates -# Translate each invalid author by searching for possible candidates from Github and JIRA +# Translate each invalid author by searching for possible candidates from GitHub and JIRA # In interactive mode, this script presents the user with a list of choices and have the user # select from this list. Additionally, the user may also choose to enter a custom name. # In non-interactive mode, this script picks the first valid author name from the candidates @@ -180,12 +180,12 @@ def generate_candidates(author, issues): issues = temp_author.split("/")[1:] candidates = generate_candidates(author, issues) # Print out potential replacement candidates along with the sources, e.g. - # [X] No full name found for Github user andrewor14 + # [X] No full name found for GitHub user andrewor14 # [X] No assignee found for SPARK-1763 # [0] Andrew Or - Full name of JIRA user andrewor14 # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 - # [3] andrewor14 - Raw Github username + # [3] andrewor14 - Raw GitHub username # [4] Custom candidate_names = [] bad_prompts = [] # Prompts that can't actually be selected; print these first. @@ -207,7 +207,7 @@ def generate_candidates(author, issues): print(p) # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: - print(" [%d] %s - Raw Github username" % (raw_index, author)) + print(" [%d] %s - Raw GitHub username" % (raw_index, author)) print(" [%d] Custom" % custom_index) response = raw_input(" Your choice: ") last_index = custom_index diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index 9bcebaa22ab86..27451bba905dd 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -# Utility for updating JIRA's with information about Github pull requests +# Utility for updating JIRA's with information about GitHub pull requests import json import os @@ -142,9 +142,9 @@ def reset_pr_labels(pr_num, jira_components): jira_prs = get_jira_prs() previous_max = get_max_pr() -print("Retrieved %s JIRA PR's from Github" % len(jira_prs)) +print("Retrieved %s JIRA PR's from GitHub" % len(jira_prs)) jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] -print("%s PR's remain after excluding visted ones" % len(jira_prs)) +print("%s PR's remain after excluding visited ones" % len(jira_prs)) num_updates = 0 considered = [] @@ -157,7 +157,7 @@ def reset_pr_labels(pr_num, jira_components): considered = considered + [pr_num] url = pr['html_url'] - title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) + title = "[GitHub] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) try: page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue + "/remotelink")) existing_links = map(lambda l: l['object']['url'], page) @@ -174,7 +174,7 @@ def reset_pr_labels(pr_num, jira_components): destination = {"title": title, "url": url, "icon": icon} # For all possible fields see: # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links - # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} + # application = {"name": "GitHub pull requests", "type": "org.apache.spark.jira.github"} jira_client.add_remote_link(issue, destination) comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 610fb1fd27027..4309a74773e89 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -38,7 +38,7 @@ def print_err(msg): def post_message_to_github(msg, ghprb_pull_id): - print("Attempting to post to Github...") + print("Attempting to post to GitHub...") api_url = os.getenv("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") url = api_url + "/issues/" + ghprb_pull_id + "/comments" @@ -57,12 +57,12 @@ def post_message_to_github(msg, ghprb_pull_id): if response.getcode() == 201: print(" > Post successful.") except HTTPError as http_e: - print_err("Failed to post message to Github.") + print_err("Failed to post message to GitHub.") print_err(" > http_code: %s" % http_e.code) print_err(" > api_response: %s" % http_e.read()) print_err(" > data: %s" % posted_message) except URLError as url_e: - print_err("Failed to post message to Github.") + print_err("Failed to post message to GitHub.") print_err(" > urllib_status: %s" % url_e.reason[1]) print_err(" > data: %s" % posted_message) @@ -89,7 +89,7 @@ def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): """ Executes a set of pull request checks to ease development and report issues with various components such as style, linting, dependencies, compatibilities, etc. - @return a list of messages to post back to Github + @return a list of messages to post back to GitHub """ # Ensure we save off the current HEAD to revert to current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() @@ -109,7 +109,7 @@ def run_tests(tests_timeout): """ Runs the `dev/run-tests` script and responds with the correct error message under the various failure scenarios. 
- @return a tuple containing the test result code and the result note to post to Github + @return a tuple containing the test result code and the result note to post to GitHub """ test_result_code = subprocess.Popen(['timeout', @@ -198,16 +198,16 @@ def main(): # To write a PR test: # * the file must reside within the dev/tests directory # * be an executable bash script - # * accept three arguments on the command line, the first being the Github PR long commit - # hash, the second the Github SHA1 hash, and the final the current PR hash + # * accept three arguments on the command line, the first being the GitHub PR long commit + # hash, the second the GitHub SHA1 hash, and the final the current PR hash # * and, lastly, return string output to be included in the pr message output that will - # be posted to Github + # be posted to GitHub pr_tests = [ "pr_merge_ability", "pr_public_classes" ] - # `bind_message_base` returns a function to generate messages for Github posting + # `bind_message_base` returns a function to generate messages for GitHub posting github_message = functools.partial(pr_message, build_display_name, build_url, diff --git a/dev/run-tests.py b/dev/run-tests.py index 5bdbc0ffb850c..a1001ec5fd280 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -636,7 +636,7 @@ def main(): # /home/jenkins/anaconda2/envs/py36/bin os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") else: - # else we're running locally or Github Actions. + # else we're running locally or GitHub Actions. build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2") hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") @@ -654,12 +654,12 @@ def main(): included_tags = [] excluded_tags = [] if should_only_test_modules: - # If we're running the tests in Github Actions, attempt to detect and test + # If we're running the tests in GitHub Actions, attempt to detect and test # only the affected modules. if test_env == "github_actions": if os.environ["GITHUB_INPUT_BRANCH"] != "": # Dispatched request - # Note that it assumes Github Actions has already merged + # Note that it assumes GitHub Actions has already merged # the given `GITHUB_INPUT_BRANCH` branch. changed_files = identify_changed_files_from_git_commits( "HEAD", target_branch=os.environ["GITHUB_SHA"]) diff --git a/dev/tests/pr_merge_ability.sh b/dev/tests/pr_merge_ability.sh index 25fdbccac4dd8..a32667730f76c 100755 --- a/dev/tests/pr_merge_ability.sh +++ b/dev/tests/pr_merge_ability.sh @@ -22,7 +22,7 @@ # another branch and returning results to be published. More details can be # found at dev/run-tests-jenkins. # -# Arg1: The Github Pull Request Actual Commit +# Arg1: The GitHub Pull Request Actual Commit # known as `ghprbActualCommit` in `run-tests-jenkins` # Arg2: The SHA1 hash # known as `sha1` in `run-tests-jenkins` diff --git a/dev/tests/pr_public_classes.sh b/dev/tests/pr_public_classes.sh index 479d1851fe0b8..ad1ad5e736594 100755 --- a/dev/tests/pr_public_classes.sh +++ b/dev/tests/pr_public_classes.sh @@ -22,7 +22,7 @@ # another branch and returning results to be published. More details can be # found at dev/run-tests-jenkins. 
# -# Arg1: The Github Pull Request Actual Commit +# Arg1: The GitHub Pull Request Actual Commit # known as `ghprbActualCommit` in `run-tests-jenkins` ghprbActualCommit="$1" diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 98769d951b6ac..5a66bfca27a27 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -1729,7 +1729,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy") ) ++ Seq( - // [SPARK-21680][ML][MLLIB]optimzie Vector coompress + // [SPARK-21680][ML][MLLIB]optimize Vector compress ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize") ) ++ Seq( diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 05413b7091ad9..a5951e0452943 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -198,7 +198,7 @@ object SparkBuild extends PomBuild { ) // Silencer: Scala compiler plugin for warning suppression - // Aim: enable fatal warnings, but supress ones related to using of deprecated APIs + // Aim: enable fatal warnings, but suppress ones related to using of deprecated APIs // depends on scala version: // <2.13 - silencer 1.6.0 and compiler settings to enable fatal warnings // 2.13.0,2.13.1 - silencer 1.7.1 and compiler settings to enable fatal warnings @@ -222,7 +222,7 @@ object SparkBuild extends PomBuild { "-Xfatal-warnings", "-deprecation", "-Ywarn-unused-import", - "-P:silencer:globalFilters=.*deprecated.*" //regex to catch deprecation warnings and supress them + "-P:silencer:globalFilters=.*deprecated.*" //regex to catch deprecation warnings and suppress them ) } else { Seq( @@ -327,7 +327,7 @@ object SparkBuild extends PomBuild { // to be enabled in specific ones that have previous artifacts MimaKeys.mimaFailOnNoPrevious := false, - // To prevent intermittent compliation failures, see also SPARK-33297 + // To prevent intermittent compilation failures, see also SPARK-33297 // Apparently we can remove this when we use JDK 11. Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat ) diff --git a/python/docs/source/_static/css/pyspark.css b/python/docs/source/_static/css/pyspark.css index 2fd8720e2fa0d..1e493c4c868e6 100644 --- a/python/docs/source/_static/css/pyspark.css +++ b/python/docs/source/_static/css/pyspark.css @@ -51,7 +51,7 @@ h3 { max-width: 80%; } -/* Left pannel size */ +/* Left panel size */ @media (min-width: 768px) { .col-md-3 { flex: 0 0 20%; diff --git a/python/docs/source/_templates/autosummary/class.rst b/python/docs/source/_templates/autosummary/class.rst index d794f797ee2ad..b5f62677ee0ed 100644 --- a/python/docs/source/_templates/autosummary/class.rst +++ b/python/docs/source/_templates/autosummary/class.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -.. Workaround to avoud documenting __init__. +.. Workaround to avoid documenting __init__. 
{% extends "!autosummary/class.rst" %} diff --git a/python/docs/source/development/debugging.rst b/python/docs/source/development/debugging.rst index bc141a6f44a6f..829919858f67a 100644 --- a/python/docs/source/development/debugging.rst +++ b/python/docs/source/development/debugging.rst @@ -54,7 +54,7 @@ Enter the name of this new configuration, for example, ``MyRemoteDebugger`` and .. image:: ../../../../docs/img/pyspark-remote-debug1.png :alt: PyCharm remote debugger setting -| After that, you should install the corresponding version of the ``pydevd-pycahrm`` package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install. +| After that, you should install the corresponding version of the ``pydevd-pycharm`` package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install. .. code-block:: text diff --git a/python/docs/source/development/testing.rst b/python/docs/source/development/testing.rst index 08fd730a19f4b..3eab8d04511d6 100644 --- a/python/docs/source/development/testing.rst +++ b/python/docs/source/development/testing.rst @@ -53,5 +53,5 @@ Running tests using GitHub Actions ---------------------------------- You can run the full PySpark tests by using GitHub Actions in your own forked GitHub -repositry with a few clicks. Please refer to +repository with a few clicks. Please refer to `Running tests in your forked repository using GitHub Actions `_ for more details. diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 9c9ff7fa7844b..a90f5fe159553 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -42,7 +42,7 @@ PySpark installation using `PyPI `_ is as fol pip install pyspark -If you want to install extra dependencies for a specific componenet, you can install it as below: +If you want to install extra dependencies for a specific component, you can install it as below: .. code-block:: bash @@ -105,7 +105,7 @@ Now activate the newly created environment with the following command: conda activate pyspark_env You can install pyspark by `Using PyPI <#using-pypi>`_ to install PySpark in the newly created -environment, for example as below. It will install PySpark under the new virtual environemnt +environment, for example as below. It will install PySpark under the new virtual environment ``pyspark_env`` created above. .. code-block:: bash @@ -126,7 +126,7 @@ Manually Downloading -------------------- PySpark is included in the distributions available at the `Apache Spark website `_. -You can download a distribution you want from the site. After that, uncompress the tar file into the directoy where you want +You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want to install Spark, for example, as below: .. 
code-block:: bash diff --git a/python/docs/source/getting_started/quickstart.ipynb b/python/docs/source/getting_started/quickstart.ipynb index ab3645591955f..550b532fefc14 100644 --- a/python/docs/source/getting_started/quickstart.ipynb +++ b/python/docs/source/getting_started/quickstart.ipynb @@ -11,7 +11,7 @@ "\n", "There is also other useful information in Apache Spark documentation site, see the latest version of [Spark SQL and DataFrames](https://spark.apache.org/docs/latest/sql-programming-guide.html), [RDD Programming Guide](https://spark.apache.org/docs/latest/rdd-programming-guide.html), [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html), [Spark Streaming Programming Guide](https://spark.apache.org/docs/latest/streaming-programming-guide.html) and [Machine Learning Library (MLlib) Guide](https://spark.apache.org/docs/latest/ml-guide.html).\n", "\n", - "PySaprk applications start with initializing `SparkSession` which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users." + "PySpark applications start with initializing `SparkSession` which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users." ] }, { @@ -392,7 +392,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`DataFrame.collect()` collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too larget to fit in the driver side because it collects all the data from executors to the driver side." + "`DataFrame.collect()` collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver side because it collects all the data from executors to the driver side." ] }, { diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index 4286f616374c5..6a631052a642d 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -30,7 +30,7 @@ of Spark's features such as Spark SQL, DataFrame, Streaming, MLlib (Machine Learning) and Spark Core. .. 
image:: ../../../docs/img/pyspark-components.png - :alt: PySpark Compoenents + :alt: PySpark Components **Spark SQL and DataFrame** diff --git a/python/pyspark/__init__.pyi b/python/pyspark/__init__.pyi index 98bd40684c01b..ef07c32b1db7b 100644 --- a/python/pyspark/__init__.pyi +++ b/python/pyspark/__init__.pyi @@ -53,7 +53,7 @@ from pyspark.taskcontext import ( # noqa: F401 ) from pyspark.util import InheritableThread as InheritableThread # noqa: F401 -# Compatiblity imports +# Compatibility imports from pyspark.sql import ( # noqa: F401 SQLContext as SQLContext, HiveContext as HiveContext, diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index 8e683e7a6988b..58c274bd79720 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -88,7 +88,7 @@ def g(): DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL # Track the provenance of reconstructed dynamic classes to make it possible to -# recontruct instances from the matching singleton class definition when +# reconstruct instances from the matching singleton class definition when # appropriate and preserve the usual "isinstance" semantics of Python objects. _DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary() _DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() @@ -236,7 +236,7 @@ def _extract_code_globals(co): out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresonding to the one + # syntax generates a constant code object corresponding to the one # of the nested function's As the nested function may itself need # global variables, we need to introspect its code, extract its # globals, (look for code object in it's co_consts attribute..) and @@ -457,7 +457,7 @@ def _is_parametrized_type_hint(obj): is_typing = getattr(obj, '__origin__', None) is not None # typing_extensions.Literal - is_litteral = getattr(obj, '__values__', None) is not None + is_literal = getattr(obj, '__values__', None) is not None # typing_extensions.Final is_final = getattr(obj, '__type__', None) is not None @@ -469,7 +469,7 @@ def _is_parametrized_type_hint(obj): getattr(obj, '__result__', None) is not None and getattr(obj, '__args__', None) is not None ) - return any((is_typing, is_litteral, is_final, is_union, is_tuple, + return any((is_typing, is_literal, is_final, is_union, is_tuple, is_callable)) def _create_parametrized_type_hint(origin, args): @@ -699,7 +699,7 @@ def _make_skel_func(code, cell_count, base_globals=None): """ # This function is deprecated and should be removed in cloudpickle 1.7 warnings.warn( - "A pickle file created using an old (<=1.4.1) version of cloudpicke " + "A pickle file created using an old (<=1.4.1) version of cloudpickle " "is currently being loaded. This is not supported by cloudpickle and " "will break in cloudpickle 1.7", category=UserWarning ) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index e8e46b88fdc91..3c48ff7b0a885 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -6,7 +6,7 @@ is only available for Python versions 3.8+, a lot of backward-compatibility code is also removed. -Note that the C Pickler sublassing API is CPython-specific. Therefore, some +Note that the C Pickler subclassing API is CPython-specific. 
Therefore, some guards present in cloudpickle.py that were written to handle PyPy specificities are not present in cloudpickle_fast.py """ @@ -179,7 +179,7 @@ def _class_getstate(obj): clsdict.pop('__weakref__', None) if issubclass(type(obj), abc.ABCMeta): - # If obj is an instance of an ABCMeta subclass, dont pickle the + # If obj is an instance of an ABCMeta subclass, don't pickle the # cache/negative caches populated during isinstance/issubclass # checks, but pickle the list of registered subclasses of obj. clsdict.pop('_abc_cache', None) @@ -407,7 +407,7 @@ def _class_reduce(obj): def _function_setstate(obj, state): - """Update the state of a dynaamic function. + """Update the state of a dynamic function. As __closure__ and __globals__ are readonly attributes of a function, we cannot rely on the native setstate routine of pickle.load_build, that calls @@ -556,7 +556,7 @@ def dump(self, obj): # `dispatch` attribute. Earlier versions of the protocol 5 CloudPickler # used `CloudPickler.dispatch` as a class-level attribute storing all # reducers implemented by cloudpickle, but the attribute name was not a - # great choice given the meaning of `Cloudpickler.dispatch` when + # great choice given the meaning of `CloudPickler.dispatch` when # `CloudPickler` extends the pure-python pickler. dispatch = dispatch_table @@ -630,7 +630,7 @@ def reducer_override(self, obj): return self._function_reduce(obj) else: # fallback to save_global, including the Pickler's - # distpatch_table + # dispatch_table return NotImplemented else:
diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 9c9e3f4b3c881..8dc6251323571 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -258,7 +258,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, sys.path.insert(1, filepath) except Exception: warnings.warn( - "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to " + "Failed to add file [%s] specified in 'spark.submit.pyFiles' to " "Python path:\n %s" % (path, "\n ".join(sys.path)), RuntimeWarning) @@ -601,7 +601,7 @@ def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer): tempFile.close() return reader_func(tempFile.name) finally: - # we eagerily reads the file so we can delete right after. + # we eagerly read the file so we can delete it right after. os.unlink(tempFile.name) def pickleFile(self, name, minPartitions=None):
diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index eafa5d90f9ff8..172b451190aa9 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -208,7 +208,7 @@ def local_connect_and_auth(port, auth_secret): return (sockfile, sock) except socket.error as e: emsg = str(e) - errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) + errors.append("tried to connect to %s, but an error occurred: %s" % (sa, emsg)) sock.close() sock = None raise Exception("could not open socket: %s" % errors)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 82b9a6db1eb92..8138f34d7a19e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -5798,7 +5798,7 @@ def setHandleInvalid(self, value): class _VarianceThresholdSelectorParams(HasFeaturesCol, HasOutputCol): """ Params for :py:class:`VarianceThresholdSelector` and - :py:class:`VarianceThresholdSelectorrModel`. + :py:class:`VarianceThresholdSelectorModel`. ..
versionadded:: 3.1.0 """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 5ce484d964a5a..d37654a7388f5 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1491,7 +1491,7 @@ def setParams(self, *, featuresCol="features", labelCol="label", predictionCol=" maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impuriy="variance", featureSubsetStrategy="all", validationTol=0.01, + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, weightCol=None): """ diff --git a/python/pyspark/ml/regression.pyi b/python/pyspark/ml/regression.pyi index 5cb0e7a5092f7..a3f4644de2a68 100644 --- a/python/pyspark/ml/regression.pyi +++ b/python/pyspark/ml/regression.pyi @@ -477,7 +477,7 @@ class GBTRegressor( maxIter: int = ..., stepSize: float = ..., seed: Optional[int] = ..., - impuriy: str = ..., + impurity: str = ..., featureSubsetStrategy: str = ..., validationTol: float = ..., validationIndicatorCol: Optional[str] = ..., diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index f8b61b7c57919..50475210607c8 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -116,7 +116,7 @@ def test_output_columns(self): output = model.transform(df) self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"]) - def test_parallelism_doesnt_change_output(self): + def test_parallelism_does_not_change_output(self): df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), (1.0, Vectors.sparse(2, [], [])), (2.0, Vectors.dense(0.5, 0.5))], diff --git a/python/pyspark/ml/tests/test_image.py b/python/pyspark/ml/tests/test_image.py index ceecdae971c99..1001598779d48 100644 --- a/python/pyspark/ml/tests/test_image.py +++ b/python/pyspark/ml/tests/test_image.py @@ -33,7 +33,7 @@ def test_read_images(self): self.assertEqual(df.count(), 4) first_row = df.take(1)[0][0] # compare `schema.simpleString()` instead of directly compare schema, - # because the df loaded from datasouce may change schema column nullability. + # because the df loaded from datasource may change schema column nullability. self.assertEqual(df.schema.simpleString(), ImageSchema.imageSchema.simpleString()) self.assertEqual(df.schema["image"].dataType.simpleString(), ImageSchema.columnSchema.simpleString()) diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index b99a4150c396d..6dffcadfc5dd5 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -843,7 +843,7 @@ def setInitialCenters(self, centers, weights): @since('1.5.0') def setRandomCenters(self, dim, weight, seed): """ - Set the initial centres to be random samples from + Set the initial centers to be random samples from a gaussian population with constant weights. """ rng = random.RandomState(seed) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index f3be827fb6e4f..38808ed5f48f7 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -439,7 +439,7 @@ def meanAveragePrecision(self): """ Returns the mean average precision (MAP) of all the queries. 
If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. + a log warning is generated. """ return self.call("meanAveragePrecision") @@ -448,7 +448,7 @@ def meanAveragePrecisionAt(self, k): """ Returns the mean average precision (MAP) at first k ranking of all the queries. If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. + a log warning is generated. """ return self.call("meanAveragePrecisionAt", int(k)) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 77bca86ac1b27..d5a23a2217bd0 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -739,7 +739,7 @@ def _validate(self, dstream): "dstream should be a DStream object, got %s" % type(dstream)) if not self._model: raise ValueError( - "Model must be intialized using setInitialWeights") + "Model must be initialized using setInitialWeights") @since("1.5.0") def predictOn(self, dstream): diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 43454ba5187dd..22da0471b8400 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -159,7 +159,7 @@ def chiSqTest(observed, expected=None): """ If `observed` is Vector, conduct Pearson's chi-squared goodness of fit test of the observed data against the expected distribution, - or againt the uniform distribution (by default), with each category + or against the uniform distribution (by default), with each category having an expected frequency of `1 / len(observed)`. If `observed` is matrix, conduct Pearson's independence test on the diff --git a/python/pyspark/mllib/tests/test_streaming_algorithms.py b/python/pyspark/mllib/tests/test_streaming_algorithms.py index b94fb2778d88d..f6c6779e83f13 100644 --- a/python/pyspark/mllib/tests/test_streaming_algorithms.py +++ b/python/pyspark/mllib/tests/test_streaming_algorithms.py @@ -189,7 +189,7 @@ def generateLogisticInput(offset, scale, nPoints, seed): Generate 1 / (1 + exp(-x * scale + offset)) where, - x is randomnly distributed and the threshold + x is randomly distributed and the threshold and labels for each sample in x is obtained from a random uniform distribution. """ diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 1964070040cdf..34faaacff5eb3 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1253,7 +1253,7 @@ def histogram(self, buckets): and 50 we would have a histogram of 1,0,1. If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), - this can be switched from an O(log n) inseration to O(1) per + this can be switched from an O(log n) insertion to O(1) per element (where n is the number of buckets). Buckets must be sorted, not contain any duplicates, and have @@ -2292,7 +2292,7 @@ def groupWith(self, other, *others): """ return python_cogroup((self, other) + others, numPartitions=None) - # TODO: add variant with custom parittioner + # TODO: add variant with custom partitioner def cogroup(self, other, numPartitions=None): """ For each key k in `self` or `other`, return a resulting RDD that diff --git a/python/pyspark/resource/requests.py b/python/pyspark/resource/requests.py index 74d26d04312c4..4deb22b5948f0 100644 --- a/python/pyspark/resource/requests.py +++ b/python/pyspark/resource/requests.py @@ -189,7 +189,7 @@ def requests(self): class TaskResourceRequest(object): """ - A task resource request. 
This is used in conjuntion with the + A task resource request. This is used in conjunction with the :class:`pyspark.resource.ResourceProfile` to programmatically specify the resources needed for an RDD that will be applied at the stage level. The amount is specified as a Double to allow for saying you want more than 1 task per resource. Valid values @@ -226,7 +226,7 @@ def amount(self): class TaskResourceRequests(object): """ - A set of task resource requests. This is used in conjuntion with the + A set of task resource requests. This is used in conjunction with the :class:`pyspark.resource.ResourceProfileBuilder` to programmatically specify the resources needed for an RDD that will be applied at the stage level. diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index 89be6295f9888..4ba846227188c 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -418,7 +418,7 @@ def _cleanup(self): class ExternalSorter(object): """ - ExtenalSorter will divide the elements into chunks, sort them in + ExternalSorter will divide the elements into chunks, sort them in memory and dump them into disks, finally merge them back. The spilling will only happen when the used memory goes above diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 345e81bd2d73e..760805400aca9 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -425,7 +425,7 @@ def dropFields(self, *fieldNames): +--------------+ However, if you are going to add/replace multiple nested fields, - it is preffered to extract out the nested struct before + it is preferred to extract out the nested struct before adding/replacing multiple fields e.g. >>> df.select(col("a").withField( diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 9fae27a2d9c6c..fe7d26d1bcfd2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1497,7 +1497,7 @@ def summary(self, *statistics): - stddev - min - max - - arbitrary approximate percentiles specified as a percentage (eg, 75%) + - arbitrary approximate percentiles specified as a percentage (e.g., 75%) If no statistics are given, this function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max. diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4af5d1f484ee4..f2e9a48c39b59 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1261,7 +1261,7 @@ def spark_partition_id(): Notes ----- - This is indeterministic because it depends on data partitioning and task scheduling. + This is non deterministic because it depends on data partitioning and task scheduling. 
Examples -------- @@ -4071,7 +4071,7 @@ def _get_lambda_parameters(f): # We should exclude functions that use # variable args and keyword argnames # as well as keyword only args - supported_parmeter_types = { + supported_parameter_types = { inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.POSITIONAL_ONLY, } @@ -4086,7 +4086,7 @@ def _get_lambda_parameters(f): ) # and all arguments can be used as positional - if not all(p.kind in supported_parmeter_types for p in parameters): + if not all(p.kind in supported_parameter_types for p in parameters): raise ValueError( "f should use only POSITIONAL or POSITIONAL OR KEYWORD arguments" ) @@ -4601,7 +4601,7 @@ def years(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4625,7 +4625,7 @@ def months(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4649,7 +4649,7 @@ def days(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4673,7 +4673,7 @@ def hours(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. diff --git a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi index de679ee2cd017..9148e7a2dca8e 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# This Protocol resuses core Pandas annotation. +# This Protocol reuses core Pandas annotation. # Overall pipeline looks as follows # - Stubgen pandas.core.frame # - Add Protocol as a base class diff --git a/python/pyspark/sql/pandas/_typing/protocols/series.pyi b/python/pyspark/sql/pandas/_typing/protocols/series.pyi index 14babb067da0d..f2de2e8b129fd 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/series.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/series.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# This Protocol resuses core Pandas annotation. +# This Protocol reuses core Pandas annotation. # Overall pipeline looks as follows # - Stubgen pandas.core.series # - Add Protocol as a base class diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 750aa4b0e6c56..4cd0b196d3366 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -99,7 +99,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): ... s3['col2'] = s1 + s2.str.len() ... return s3 ... - >>> # Create a Spark DataFrame that has three columns including a sturct column. + >>> # Create a Spark DataFrame that has three columns including a struct column. ... df = spark.createDataFrame( ... [[1, "a string", ("a nested string",)]], ... 
"long_col long, string_col string, struct_col struct") @@ -114,7 +114,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): | |-- col1: string (nullable = true) | |-- col2: long (nullable = true) - In the following sections, it describes the cominations of the supported type hints. For + In the following sections, it describes the combinations of the supported type hints. For simplicity, `pandas.DataFrame` variant is omitted. * Series to Series diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index ee68b95fc478d..a639a8d51f55c 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -484,7 +484,7 @@ def dummy_pandas_udf(df): col('temp0.key') == col('temp1.key')) self.assertEquals(res.count(), 5) - def test_mixed_scalar_udfs_followed_by_grouby_apply(self): + def test_mixed_scalar_udfs_followed_by_groupby_apply(self): df = self.spark.range(0, 10).toDF('v1') df = df.withColumn('v2', udf(lambda x: x + 1, 'int')(df['v1'])) \ .withColumn('v3', pandas_udf(lambda x: x + 2, 'int')(df['v1'])) diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py index a7dcbfd32ac1c..9a1c0edcce4ed 100644 --- a/python/pyspark/sql/tests/test_udf.py +++ b/python/pyspark/sql/tests/test_udf.py @@ -459,7 +459,7 @@ def test_udf_with_string_return_type(self): self.assertTupleEqual(expected, actual) - def test_udf_shouldnt_accept_noncallable_object(self): + def test_udf_should_not_accept_noncallable_object(self): non_callable = None self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) @@ -683,7 +683,7 @@ def tearDown(self): if SparkContext._active_spark_context is not None: SparkContext._active_spark_context.stop() - def test_udf_init_shouldnt_initialize_context(self): + def test_udf_init_should_not_initialize_context(self): UserDefinedFunction(lambda x: x, StringType()) self.assertIsNone( diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 18f8ba29f95a2..f5db783d2b5bc 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -151,10 +151,10 @@ def toJArray(gateway, jtype, arr): arr : python type list """ - jarr = gateway.new_array(jtype, len(arr)) + jarray = gateway.new_array(jtype, len(arr)) for i in range(0, len(arr)): - jarr[i] = arr[i] - return jarr + jarray[i] = arr[i] + return jarray def require_test_compiled(): diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index c4dc0d3af3332..2e6d7ede88551 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -281,7 +281,7 @@ def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_ def textFileStream(self, directory): """ Create an input stream that monitors a Hadoop-compatible file system - for new files and reads them as text files. Files must be wrriten to the + for new files and reads them as text files. Files must be written to the monitored directory by "moving" them from another location within the same file system. File names starting with . are ignored. The text files must be encoded as UTF-8. 
diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py index d86f6c3c1571c..8397ef1c4b62d 100644 --- a/python/pyspark/tests/test_context.py +++ b/python/pyspark/tests/test_context.py @@ -175,8 +175,8 @@ def test_parallelize_eager_cleanup(self): with SparkContext() as sc: temp_files = os.listdir(sc._temp_dir) rdd = sc.parallelize([0, 1, 2]) - post_parallalize_temp_files = os.listdir(sc._temp_dir) - self.assertEqual(temp_files, post_parallalize_temp_files) + post_parallelize_temp_files = os.listdir(sc._temp_dir) + self.assertEqual(temp_files, post_parallelize_temp_files) def test_set_conf(self): # This is for an internal use case. When there is an existing SparkContext, diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 1b09d327a5dfe..8ca4bb37e5fa4 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -59,7 +59,7 @@ def report_times(outfile, boot, init, finish): def add_path(path): - # worker can be used, so donot add path multiple times + # worker can be used, so do not add path multiple times if path not in sys.path: # overwrite system packages sys.path.insert(1, path) diff --git a/python/test_support/userlibrary.py b/python/test_support/userlibrary.py index 73fd26e71f10d..90cd30723ddfe 100755 --- a/python/test_support/userlibrary.py +++ b/python/test_support/userlibrary.py @@ -16,7 +16,7 @@ # """ -Used to test shipping of code depenencies with SparkContext.addPyFile(). +Used to test shipping of code dependencies with SparkContext.addPyFile(). """ diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index e3af1ccc24f1c..41194f3a2676f 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -420,7 +420,7 @@ private[spark] object Config extends Logging { val KUBERNETES_FILE_UPLOAD_PATH = ConfigBuilder("spark.kubernetes.file.upload.path") .doc("Hadoop compatible file system path where files from the local file system " + - "will be uploded to in cluster mode.") + "will be uploaded to in cluster mode.") .version("3.0.0") .stringConf .createOptional diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala index 3f2cb485bbb31..22764d9d2eb0e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala @@ -52,7 +52,7 @@ import org.apache.spark.util.ThreadUtils * time-windowed chunks. Each subscriber can choose to receive their snapshot chunks at different * time intervals. *
- * The subcriber notification callback is guaranteed to be called from a single thread at a time. + * The subscriber notification callback is guaranteed to be called from a single thread at a time. */ private[spark] class ExecutorPodsSnapshotsStoreImpl(subscribersExecutor: ScheduledExecutorService) extends ExecutorPodsSnapshotsStore with Logging { @@ -142,7 +142,7 @@ private[spark] class ExecutorPodsSnapshotsStoreImpl(subscribersExecutor: Schedul } if (notificationCount.decrementAndGet() > 0) { - // There was another concurrent request for this subcriber. Schedule a task to + // There was another concurrent request for this subscriber. Schedule a task to // immediately process snapshots again, so that the subscriber can pick up any // changes that may have happened between the time it started looking at snapshots // above, and the time the concurrent request arrived. diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala index 349cbd04f6027..156740d7c8aee 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala @@ -49,14 +49,14 @@ class KubernetesVolumeUtilsSuite extends SparkFunSuite { val sparkConf = new SparkConf(false) sparkConf.set("test.persistentVolumeClaim.volumeName.mount.path", "/path") sparkConf.set("test.persistentVolumeClaim.volumeName.mount.readOnly", "true") - sparkConf.set("test.persistentVolumeClaim.volumeName.options.claimName", "claimeName") + sparkConf.set("test.persistentVolumeClaim.volumeName.options.claimName", "claimName") val volumeSpec = KubernetesVolumeUtils.parseVolumesWithPrefix(sparkConf, "test.").head assert(volumeSpec.volumeName === "volumeName") assert(volumeSpec.mountPath === "/path") assert(volumeSpec.mountReadOnly) assert(volumeSpec.volumeConf.asInstanceOf[KubernetesPVCVolumeConf] === - KubernetesPVCVolumeConf("claimeName")) + KubernetesPVCVolumeConf("claimName")) } test("Parses emptyDir volumes correctly") { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala index 95ee37e3daa41..38f8fac1858f1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala @@ -42,7 +42,7 @@ class MountVolumesFeatureStepSuite extends SparkFunSuite { assert(configuredPod.container.getVolumeMounts.get(0).getReadOnly === false) } - test("Mounts pesistentVolumeClaims") { + test("Mounts persistentVolumeClaims") { val volumeConf = KubernetesVolumeSpec( "testVolume", "/tmp", diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala index bd42f6f05655f..5927af176062d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala @@ -230,7 +230,7 @@ package object config { 
ConfigBuilder("spark.mesos.appJar.local.resolution.mode") .doc("Provides support for the `local:///` scheme to reference the app jar resource in " + "cluster mode. If user uses a local resource (`local:///path/to/jar`) and the config " + - "option is not used it defaults to `host` eg. the mesos fetcher tries to get the " + + "option is not used it defaults to `host` e.g. the mesos fetcher tries to get the " + "resource from the host's file system. If the value is unknown it prints a warning msg " + "in the dispatcher logs and defaults to `host`. If the value is `container` then spark " + "submit in the container will use the jar in the container's path: `/path/to/jar`.") diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index b5a360167679e..1091de9967ece 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -356,7 +356,7 @@ trait MesosSchedulerUtils extends Logging { * https://github.com/apache/mesos/blob/master/src/common/values.cpp * https://github.com/apache/mesos/blob/master/src/common/attributes.cpp * - * @param constraintsVal constains string consisting of ';' separated key-value pairs (separated + * @param constraintsVal contains string consisting of ';' separated key-value pairs (separated * by ':') * @return Map of constraints to match resources offers. */ diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 57af76b46fe64..ac50c1c77a24e 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -781,7 +781,7 @@ private[yarn] class YarnAllocator( val (exitCausedByApp, containerExitReason) = exitStatus match { case ContainerExitStatus.SUCCESS => (false, s"Executor for container $containerId exited because of a YARN event (e.g., " + - "pre-emption) and not because of an error in the running job.") + "preemption) and not because of an error in the running job.") case ContainerExitStatus.PREEMPTED => // Preemption is not the fault of the running tasks, since YARN preempts containers // merely to do resource sharing, and tasks that fail due to preempted executors could diff --git a/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java b/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java index df0ebcc9871ac..89e012ecd42e1 100644 --- a/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java +++ b/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java @@ -112,7 +112,7 @@ public static int waitForPort(int port, int retries) * The ports are all closed afterwards, * so other network services started may grab those same ports. 
* - * @param numPorts number of required port nubmers + * @param numPorts number of required port numbers * @return array of available port numbers * @throws IOException */ diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index c2bdd971a0fe9..188a48509212d 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -250,7 +250,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (Some(shuffleInfo2)) s2.stop() - // another stop & restart should be fine though (eg., we recover from previous corruption) + // another stop & restart should be fine though (e.g., we recover from previous corruption) s3 = new YarnShuffleService s3.setRecoveryPath(new Path(recoveryLocalDir.toURI)) s3.init(yarnConfig) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 570663c6f6ad3..7a8e3f1d2ccf4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -163,7 +163,7 @@ private[python] object PythonTransformFunctionSerializer { private[streaming] object PythonDStream { /** - * can not access PythonTransformFunctionSerializer.register() via Py4j + * cannot access PythonTransformFunctionSerializer.register() via Py4j * Py4JError: PythonTransformFunctionSerializerregister does not exist in the JVM */ def registerSerializer(ser: PythonTransformFunctionSerializer): Unit = { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index e037f26088347..ca4f3670d5ad7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -960,7 +960,7 @@ object DStream { /** Get the creation site of a DStream from the stack trace of when the DStream is created. */ private[streaming] def getCreationSite(): CallSite = { /** Filtering function that excludes non-user classes for a streaming application */ - def streamingExclustionFunction(className: String): Boolean = { + def streamingExclusionFunction(className: String): Boolean = { def doesMatch(r: Regex): Boolean = r.findFirstIn(className).isDefined val isSparkClass = doesMatch(SPARK_CLASS_REGEX) val isSparkExampleClass = doesMatch(SPARK_EXAMPLES_CLASS_REGEX) @@ -972,6 +972,6 @@ object DStream { // non-Spark and non-Scala class, as the rest would streaming application classes. 
(isSparkClass || isScalaClass) && !isSparkExampleClass && !isSparkStreamingTestClass } - org.apache.spark.util.Utils.getCallSite(streamingExclustionFunction) + org.apache.spark.util.Utils.getCallSite(streamingExclusionFunction) } }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala index 006bcad5d68c2..ef040681adf37 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala @@ -39,7 +39,7 @@ private[streaming] object HdfsUtils { throw new IllegalStateException("File exists and there is no append support!") } } else { - // we dont' want to use hdfs erasure coding, as that lacks support for append and hflush + // we don't want to use hdfs erasure coding, as that lacks support for append and hflush SparkHadoopUtil.createFile(dfs, dfsPath, false) } }
diff --git a/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java index c7cde5674f547..8a57b0c58b228 100644 --- a/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java @@ -1595,7 +1595,7 @@ public void testContextGetOrCreate() throws InterruptedException { /* TEST DISABLED: Pending a discussion about checkpoint() semantics with TD @SuppressWarnings("unchecked") @Test - public void testCheckpointofIndividualStream() throws InterruptedException { + public void testCheckpointOfIndividualStream() throws InterruptedException { List<List<String>> inputData = Arrays.asList( Arrays.asList("this", "is"), Arrays.asList("a", "test"),
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala index b2b8d2f41fc80..3ffaa62bd75ac 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala @@ -541,12 +541,12 @@ class MapWithStateSuite extends SparkFunSuite with LocalStreamingContext // Setup the stream computation val ssc = new StreamingContext(sc, Seconds(1)) val inputStream = new TestInputStream(ssc, input, numPartitions = 2) - val trackeStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec) + val trackedStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec) val collectedOutputs = new ConcurrentLinkedQueue[Seq[T]] - val outputStream = new TestOutputStream(trackeStateStream, collectedOutputs) + val outputStream = new TestOutputStream(trackedStateStream, collectedOutputs) val collectedStateSnapshots = new ConcurrentLinkedQueue[Seq[(K, S)]] val stateSnapshotStream = new TestOutputStream( - trackeStateStream.stateSnapshots(), collectedStateSnapshots) + trackedStateStream.stateSnapshots(), collectedStateSnapshots) outputStream.register() stateSnapshotStream.register()
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala index 58ce3a93251a9..f06b1feb8c0cd 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala @@ -320,7 +320,7 @@ class MapWithStateRDDSuite extends
SparkFunSuite with RDDCheckpointTester with B makeStateRDDWithLongLineageDataRDD, reliableCheckpoint = true, rddCollectFunc _) /** Generate MapWithStateRDD with parent state RDD having a long lineage */ - def makeStateRDDWithLongLineageParenttateRDD( + def makeStateRDDWithLongLineageParentStateRDD( longLineageRDD: RDD[Int]): MapWithStateRDD[Int, Int, Int, Int] = { // Create a MapWithStateRDD that has a long lineage using the data RDD with a long lineage @@ -337,9 +337,9 @@ class MapWithStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with B } testRDD( - makeStateRDDWithLongLineageParenttateRDD, reliableCheckpoint = true, rddCollectFunc _) + makeStateRDDWithLongLineageParentStateRDD, reliableCheckpoint = true, rddCollectFunc _) testRDDPartitions( - makeStateRDDWithLongLineageParenttateRDD, reliableCheckpoint = true, rddCollectFunc _) + makeStateRDDWithLongLineageParentStateRDD, reliableCheckpoint = true, rddCollectFunc _) } test("checkpointing empty state RDD") {