diff --git a/.gitignore b/.gitignore index b4dd1d05a5f6..225aa61eaa17 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ .idea/ .idea_modules/ .project +.pydevproject .scala_dependencies .settings /lib/ @@ -78,3 +79,6 @@ spark-warehouse/ .RData .RHistory .Rhistory +*.Rproj +*.Rproj.* + diff --git a/R/check-cran.sh b/R/check-cran.sh new file mode 100755 index 000000000000..b3a6860961c1 --- /dev/null +++ b/R/check-cran.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -o pipefail +set -e + +FWDIR="$(cd `dirname $0`; pwd)" +pushd $FWDIR > /dev/null + +if [ ! -z "$R_HOME" ] + then + R_SCRIPT_PATH="$R_HOME/bin" + else + # if system wide R_HOME is not found, then exit + if [ ! `command -v R` ]; then + echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed." + exit 1 + fi + R_SCRIPT_PATH="$(dirname $(which R))" +fi +echo "USING R_HOME = $R_HOME" + +# Build the latest docs +$FWDIR/create-docs.sh + +# Build a zip file containing the source package +"$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg + +# Run check as-cran. +# TODO(shivaram): Remove the skip tests once we figure out the install mechanism + +VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` + +"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz + +popd > /dev/null diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore new file mode 100644 index 000000000000..544d203a6dce --- /dev/null +++ b/R/pkg/.Rbuildignore @@ -0,0 +1,5 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.lintr$ +^src-native$ +^html$ diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 963a1bb5806a..ac73d6c79891 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,12 +1,10 @@ Package: SparkR Type: Package -Title: R frontend for Spark +Title: R Frontend for Apache Spark Version: 2.0.0 -Date: 2013-09-09 +Date: 2016-07-07 Author: The Apache Software Foundation Maintainer: Shivaram Venkataraman -Imports: - methods Depends: R (>= 3.0), methods, @@ -14,7 +12,7 @@ Suggests: testthat, e1071, survival -Description: R frontend for Spark +Description: The SparkR package provides an R frontend for Apache Spark. License: Apache License (== 2.0) Collate: 'schema.R' diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index bc3aceba2256..1d74c6d95578 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -341,5 +341,14 @@ export("partitionBy", "rowsBetween", "rangeBetween") -export("window.partitionBy", - "window.orderBy") +export("windowPartitionBy", + "windowOrderBy") + +S3method(print, jobj) +S3method(print, structField) +S3method(print, structType) +S3method(print, summary.GeneralizedLinearRegressionModel) +S3method(structField, character) +S3method(structField, jobj) +S3method(structType, jobj) +S3method(structType, structField) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 47f9203aceca..a4733313ed16 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -35,7 +35,7 @@ setOldClass("structType") #' @slot env An R environment that stores bookkeeping states of the SparkDataFrame #' @slot sdf A Java object reference to the backing Scala DataFrame #' @seealso \link{createDataFrame}, \link{read.json}, \link{table} -#' @seealso \url{https://spark.apache.org/docs/latest/sparkr.html#sparkdataframe} +#' @seealso \url{https://spark.apache.org/docs/latest/sparkr.html#sparkr-dataframes} #' @export #' @examples #'\dontrun{ @@ -74,6 +74,7 @@ dataFrame <- function(sdf, isCached = FALSE) { #' @family SparkDataFrame functions #' @rdname printSchema #' @name printSchema +#' @aliases printSchema,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -99,6 +100,7 @@ setMethod("printSchema", #' @family SparkDataFrame functions #' @rdname schema #' @name schema +#' @aliases schema,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -121,6 +123,7 @@ setMethod("schema", #' @param x A SparkDataFrame #' @param extended Logical. If extended is FALSE, explain() only prints the physical plan. #' @family SparkDataFrame functions +#' @aliases explain,SparkDataFrame-method #' @rdname explain #' @name explain #' @export @@ -154,6 +157,7 @@ setMethod("explain", #' @family SparkDataFrame functions #' @rdname isLocal #' @name isLocal +#' @aliases isLocal,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -179,6 +183,7 @@ setMethod("isLocal", #' truncated. However, if set greater than zero, truncates strings longer than `truncate` #' characters and all cells will be aligned right. #' @family SparkDataFrame functions +#' @aliases showDF,SparkDataFrame-method #' @rdname showDF #' @name showDF #' @export @@ -210,6 +215,7 @@ setMethod("showDF", #' #' @family SparkDataFrame functions #' @rdname show +#' @aliases show,SparkDataFrame-method #' @name show #' @export #' @examples @@ -238,6 +244,7 @@ setMethod("show", "SparkDataFrame", #' @family SparkDataFrame functions #' @rdname dtypes #' @name dtypes +#' @aliases dtypes,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -264,7 +271,7 @@ setMethod("dtypes", #' @family SparkDataFrame functions #' @rdname columns #' @name columns - +#' @aliases columns,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -285,6 +292,7 @@ setMethod("columns", #' @rdname columns #' @name names +#' @aliases names,SparkDataFrame-method #' @note names since 1.5.0 setMethod("names", signature(x = "SparkDataFrame"), @@ -293,6 +301,7 @@ setMethod("names", }) #' @rdname columns +#' @aliases names<-,SparkDataFrame-method #' @name names<- #' @note names<- since 1.5.0 setMethod("names<-", @@ -305,6 +314,7 @@ setMethod("names<-", }) #' @rdname columns +#' @aliases colnames,SparkDataFrame-method #' @name colnames #' @note colnames since 1.6.0 setMethod("colnames", @@ -314,6 +324,7 @@ setMethod("colnames", }) #' @rdname columns +#' @aliases colnames<-,SparkDataFrame-method #' @name colnames<- #' @note colnames<- since 1.6.0 setMethod("colnames<-", @@ -350,6 +361,7 @@ setMethod("colnames<-", #' @param x A SparkDataFrame #' @return value A character vector with the column types of the given SparkDataFrame #' @rdname coltypes +#' @aliases coltypes,SparkDataFrame-method #' @name coltypes #' @family SparkDataFrame functions #' @export @@ -405,6 +417,7 @@ setMethod("coltypes", #' to keep that column as-is. #' @rdname coltypes #' @name coltypes<- +#' @aliases coltypes<-,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -453,6 +466,7 @@ setMethod("coltypes<-", #' @family SparkDataFrame functions #' @rdname createOrReplaceTempView #' @name createOrReplaceTempView +#' @aliases createOrReplaceTempView,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -479,6 +493,7 @@ setMethod("createOrReplaceTempView", #' @seealso \link{createOrReplaceTempView} #' @rdname registerTempTable-deprecated #' @name registerTempTable +#' @aliases registerTempTable,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -508,6 +523,7 @@ setMethod("registerTempTable", #' @family SparkDataFrame functions #' @rdname insertInto #' @name insertInto +#' @aliases insertInto,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -534,6 +550,7 @@ setMethod("insertInto", #' @param x A SparkDataFrame #' #' @family SparkDataFrame functions +#' @aliases cache,SparkDataFrame-method #' @rdname cache #' @name cache #' @export @@ -564,6 +581,7 @@ setMethod("cache", #' @family SparkDataFrame functions #' @rdname persist #' @name persist +#' @aliases persist,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -591,6 +609,7 @@ setMethod("persist", #' #' @family SparkDataFrame functions #' @rdname unpersist-methods +#' @aliases unpersist,SparkDataFrame-method #' @name unpersist #' @export #' @examples @@ -627,6 +646,7 @@ setMethod("unpersist", #' @family SparkDataFrame functions #' @rdname repartition #' @name repartition +#' @aliases repartition,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -670,6 +690,7 @@ setMethod("repartition", #' #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects +#' @aliases toJSON,SparkDataFrame-method #' @noRd #' @examples #'\dontrun{ @@ -697,6 +718,7 @@ setMethod("toJSON", #' @family SparkDataFrame functions #' @rdname write.json #' @name write.json +#' @aliases write.json,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -722,6 +744,7 @@ setMethod("write.json", #' @param path The directory where the file is saved #' #' @family SparkDataFrame functions +#' @aliases write.orc,SparkDataFrame,character-method #' @rdname write.orc #' @name write.orc #' @export @@ -751,6 +774,7 @@ setMethod("write.orc", #' @family SparkDataFrame functions #' @rdname write.parquet #' @name write.parquet +#' @aliases write.parquet,SparkDataFrame,character-method #' @export #' @examples #'\dontrun{ @@ -770,6 +794,7 @@ setMethod("write.parquet", #' @rdname write.parquet #' @name saveAsParquetFile +#' @aliases saveAsParquetFile,SparkDataFrame,character-method #' @export #' @note saveAsParquetFile since 1.4.0 setMethod("saveAsParquetFile", @@ -789,6 +814,7 @@ setMethod("saveAsParquetFile", #' @param path The directory where the file is saved #' #' @family SparkDataFrame functions +#' @aliases write.text,SparkDataFrame,character-method #' @rdname write.text #' @name write.text #' @export @@ -814,6 +840,7 @@ setMethod("write.text", #' @param x A SparkDataFrame #' #' @family SparkDataFrame functions +#' @aliases distinct,SparkDataFrame-method #' @rdname distinct #' @name distinct #' @export @@ -834,6 +861,7 @@ setMethod("distinct", #' @rdname distinct #' @name unique +#' @aliases unique,SparkDataFrame-method #' @note unique since 1.5.0 setMethod("unique", signature(x = "SparkDataFrame"), @@ -851,6 +879,7 @@ setMethod("unique", #' @param seed Randomness seed value #' #' @family SparkDataFrame functions +#' @aliases sample,SparkDataFrame,logical,numeric-method #' @rdname sample #' @name sample #' @export @@ -879,6 +908,7 @@ setMethod("sample", }) #' @rdname sample +#' @aliases sample_frac,SparkDataFrame,logical,numeric-method #' @name sample_frac #' @note sample_frac since 1.4.0 setMethod("sample_frac", @@ -895,6 +925,7 @@ setMethod("sample_frac", #' @family SparkDataFrame functions #' @rdname nrow #' @name count +#' @aliases count,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -912,6 +943,7 @@ setMethod("count", #' @name nrow #' @rdname nrow +#' @aliases nrow,SparkDataFrame-method #' @note nrow since 1.5.0 setMethod("nrow", signature(x = "SparkDataFrame"), @@ -926,6 +958,7 @@ setMethod("nrow", #' @family SparkDataFrame functions #' @rdname ncol #' @name ncol +#' @aliases ncol,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -948,6 +981,7 @@ setMethod("ncol", #' #' @family SparkDataFrame functions #' @rdname dim +#' @aliases dim,SparkDataFrame-method #' @name dim #' @export #' @examples @@ -972,6 +1006,7 @@ setMethod("dim", #' #' @family SparkDataFrame functions #' @rdname collect +#' @aliases collect,SparkDataFrame-method #' @name collect #' @export #' @examples @@ -1045,6 +1080,7 @@ setMethod("collect", #' @family SparkDataFrame functions #' @rdname limit #' @name limit +#' @aliases limit,SparkDataFrame,numeric-method #' @export #' @examples #' \dontrun{ @@ -1066,6 +1102,7 @@ setMethod("limit", #' @family SparkDataFrame functions #' @rdname take #' @name take +#' @aliases take,SparkDataFrame,numeric-method #' @export #' @examples #'\dontrun{ @@ -1093,6 +1130,7 @@ setMethod("take", #' @return A data.frame #' #' @family SparkDataFrame functions +#' @aliases head,SparkDataFrame-method #' @rdname head #' @name head #' @export @@ -1116,6 +1154,7 @@ setMethod("head", #' @param x A SparkDataFrame #' #' @family SparkDataFrame functions +#' @aliases first,SparkDataFrame-method #' @rdname first #' @name first #' @export @@ -1166,6 +1205,7 @@ setMethod("toRDD", #' @param x a SparkDataFrame #' @return a GroupedData #' @family SparkDataFrame functions +#' @aliases groupBy,SparkDataFrame-method #' @rdname groupBy #' @name groupBy #' @export @@ -1193,6 +1233,7 @@ setMethod("groupBy", #' @rdname groupBy #' @name group_by +#' @aliases group_by,SparkDataFrame-method #' @note group_by since 1.4.0 setMethod("group_by", signature(x = "SparkDataFrame"), @@ -1206,7 +1247,8 @@ setMethod("group_by", #' #' @param x a SparkDataFrame #' @family SparkDataFrame functions -#' @rdname agg +#' @aliases agg,SparkDataFrame-method +#' @rdname summarize #' @name agg #' @export #' @note agg since 1.4.0 @@ -1216,8 +1258,9 @@ setMethod("agg", agg(groupBy(x), ...) }) -#' @rdname agg +#' @rdname summarize #' @name summarize +#' @aliases summarize,SparkDataFrame-method #' @note summarize since 1.4.0 setMethod("summarize", signature(x = "SparkDataFrame"), @@ -1256,6 +1299,7 @@ dapplyInternal <- function(x, func, schema) { #' It must match the output of func. #' @family SparkDataFrame functions #' @rdname dapply +#' @aliases dapply,SparkDataFrame,function,structType-method #' @name dapply #' @seealso \link{dapplyCollect} #' @export @@ -1294,7 +1338,7 @@ setMethod("dapply", #' dapplyCollect #' #' Apply a function to each partition of a SparkDataFrame and collect the result back -#’ to R as a data.frame. +#' to R as a data.frame. #' #' @param x A SparkDataFrame #' @param func A function to be applied to each partition of the SparkDataFrame. @@ -1303,6 +1347,7 @@ setMethod("dapply", #' The output of func should be a R data.frame. #' @family SparkDataFrame functions #' @rdname dapplyCollect +#' @aliases dapplyCollect,SparkDataFrame,function-method #' @name dapplyCollect #' @seealso \link{dapply} #' @export @@ -1358,6 +1403,7 @@ setMethod("dapplyCollect", #' output column with preferred output column name and corresponding data type. #' @return a SparkDataFrame #' @family SparkDataFrame functions +#' @aliases gapply,SparkDataFrame-method #' @rdname gapply #' @name gapply #' @seealso \link{gapplyCollect} @@ -1446,6 +1492,7 @@ setMethod("gapply", #' The output of `func` is a local R data.frame. #' @return a data.frame #' @family SparkDataFrame functions +#' @aliases gapplyCollect,SparkDataFrame-method #' @rdname gapplyCollect #' @name gapplyCollect #' @seealso \link{gapply} @@ -1592,6 +1639,7 @@ getColumn <- function(x, c) { #' @rdname select #' @name $ +#' @aliases $,SparkDataFrame-method #' @note $ since 1.4.0 setMethod("$", signature(x = "SparkDataFrame"), function(x, name) { @@ -1600,6 +1648,7 @@ setMethod("$", signature(x = "SparkDataFrame"), #' @rdname select #' @name $<- +#' @aliases $<-,SparkDataFrame-method #' @note $<- since 1.4.0 setMethod("$<-", signature(x = "SparkDataFrame"), function(x, name, value) { @@ -1618,6 +1667,7 @@ setClassUnion("numericOrcharacter", c("numeric", "character")) #' @rdname subset #' @name [[ +#' @aliases [[,SparkDataFrame,numericOrcharacter-method #' @note [[ since 1.4.0 setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"), function(x, i) { @@ -1630,6 +1680,7 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"), #' @rdname subset #' @name [ +#' @aliases [,SparkDataFrame-method #' @note [ since 1.4.0 setMethod("[", signature(x = "SparkDataFrame"), function(x, i, j, ..., drop = F) { @@ -1677,6 +1728,7 @@ setMethod("[", signature(x = "SparkDataFrame"), #' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns #' @export #' @family SparkDataFrame functions +#' @aliases subset,SparkDataFrame-method #' @rdname subset #' @name subset #' @family subsetting functions @@ -1714,6 +1766,7 @@ setMethod("subset", signature(x = "SparkDataFrame"), #' @export #' @family SparkDataFrame functions #' @rdname select +#' @aliases select,SparkDataFrame,character-method #' @name select #' @family subsetting functions #' @examples @@ -1743,6 +1796,7 @@ setMethod("select", signature(x = "SparkDataFrame", col = "character"), #' @rdname select #' @export +#' @aliases select,SparkDataFrame,Column-method #' @note select(SparkDataFrame, Column) since 1.4.0 setMethod("select", signature(x = "SparkDataFrame", col = "Column"), function(x, col, ...) { @@ -1755,6 +1809,7 @@ setMethod("select", signature(x = "SparkDataFrame", col = "Column"), #' @rdname select #' @export +#' @aliases select,SparkDataFrame,list-method #' @note select(SparkDataFrame, list) since 1.4.0 setMethod("select", signature(x = "SparkDataFrame", col = "list"), @@ -1779,6 +1834,7 @@ setMethod("select", #' @param ... Additional expressions #' @return A SparkDataFrame #' @family SparkDataFrame functions +#' @aliases selectExpr,SparkDataFrame,character-method #' @rdname selectExpr #' @name selectExpr #' @export @@ -1808,6 +1864,7 @@ setMethod("selectExpr", #' @param col A Column expression. #' @return A SparkDataFrame with the new column added or the existing column replaced. #' @family SparkDataFrame functions +#' @aliases withColumn,SparkDataFrame,character,Column-method #' @rdname withColumn #' @name withColumn #' @seealso \link{rename} \link{mutate} @@ -1837,6 +1894,7 @@ setMethod("withColumn", #' @param col a named argument of the form name = col #' @return A new SparkDataFrame with the new columns added or replaced. #' @family SparkDataFrame functions +#' @aliases mutate,SparkDataFrame-method #' @rdname mutate #' @name mutate #' @seealso \link{rename} \link{withColumn} @@ -1912,6 +1970,7 @@ setMethod("mutate", #' @export #' @rdname mutate +#' @aliases transform,SparkDataFrame-method #' @name transform #' @note transform since 1.5.0 setMethod("transform", @@ -1931,6 +1990,7 @@ setMethod("transform", #' @family SparkDataFrame functions #' @rdname rename #' @name withColumnRenamed +#' @aliases withColumnRenamed,SparkDataFrame,character,character-method #' @seealso \link{mutate} #' @export #' @examples @@ -1957,6 +2017,7 @@ setMethod("withColumnRenamed", #' @param ... A named pair of the form new_column_name = existing_column #' @rdname rename #' @name rename +#' @aliases rename,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -1999,6 +2060,7 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' a character vector is specified for col #' @return A SparkDataFrame where all elements are sorted. #' @family SparkDataFrame functions +#' @aliases arrange,SparkDataFrame,Column-method #' @rdname arrange #' @name arrange #' @export @@ -2026,6 +2088,7 @@ setMethod("arrange", #' @rdname arrange #' @name arrange +#' @aliases arrange,SparkDataFrame,character-method #' @export #' @note arrange(SparkDataFrame, character) since 1.4.0 setMethod("arrange", @@ -2059,6 +2122,7 @@ setMethod("arrange", #' @rdname arrange #' @name orderBy +#' @aliases orderBy,SparkDataFrame,characterOrColumn-method #' @export #' @note orderBy(SparkDataFrame, characterOrColumn) since 1.4.0 setMethod("orderBy", @@ -2076,6 +2140,7 @@ setMethod("orderBy", #' or a string containing a SQL statement #' @return A SparkDataFrame containing only the rows that meet the condition. #' @family SparkDataFrame functions +#' @aliases filter,SparkDataFrame,characterOrColumn-method #' @rdname filter #' @name filter #' @family subsetting functions @@ -2101,6 +2166,7 @@ setMethod("filter", #' @rdname filter #' @name where +#' @aliases where,SparkDataFrame,characterOrColumn-method #' @note where since 1.4.0 setMethod("where", signature(x = "SparkDataFrame", condition = "characterOrColumn"), @@ -2118,6 +2184,7 @@ setMethod("where", #' If the first argument contains a character vector, the followings are ignored. #' @return A SparkDataFrame with duplicate rows removed. #' @family SparkDataFrame functions +#' @aliases dropDuplicates,SparkDataFrame-method #' @rdname dropDuplicates #' @name dropDuplicates #' @export @@ -2164,6 +2231,7 @@ setMethod("dropDuplicates", #' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner". #' @return A SparkDataFrame containing the result of the join operation. #' @family SparkDataFrame functions +#' @aliases join,SparkDataFrame,SparkDataFrame-method #' @rdname join #' @name join #' @seealso \link{merge} @@ -2223,6 +2291,7 @@ setMethod("join", #' outer join will be returned. If all.x and all.y are set to TRUE, a full #' outer join will be returned. #' @family SparkDataFrame functions +#' @aliases merge,SparkDataFrame,SparkDataFrame-method #' @rdname merge #' @seealso \link{join} #' @export @@ -2360,6 +2429,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' @family SparkDataFrame functions #' @rdname union #' @name union +#' @aliases union,SparkDataFrame,SparkDataFrame-method #' @seealso \link{rbind} #' @export #' @examples @@ -2381,6 +2451,7 @@ setMethod("union", #' unionAll is deprecated - use union instead #' @rdname union #' @name unionAll +#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method #' @export #' @note unionAll since 1.4.0 setMethod("unionAll", @@ -2399,6 +2470,7 @@ setMethod("unionAll", #' @param ... Additional SparkDataFrame #' @return A SparkDataFrame containing the result of the union. #' @family SparkDataFrame functions +#' @aliases rbind,SparkDataFrame-method #' @rdname rbind #' @name rbind #' @seealso \link{union} @@ -2428,6 +2500,7 @@ setMethod("rbind", #' @param y A SparkDataFrame #' @return A SparkDataFrame containing the result of the intersect. #' @family SparkDataFrame functions +#' @aliases intersect,SparkDataFrame,SparkDataFrame-method #' @rdname intersect #' @name intersect #' @export @@ -2455,6 +2528,7 @@ setMethod("intersect", #' @param y A SparkDataFrame #' @return A SparkDataFrame containing the result of the except operation. #' @family SparkDataFrame functions +#' @aliases except,SparkDataFrame,SparkDataFrame-method #' @rdname except #' @name except #' @export @@ -2498,6 +2572,7 @@ setMethod("except", #' @param mode One of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default) #' #' @family SparkDataFrame functions +#' @aliases write.df,SparkDataFrame,character-method #' @rdname write.df #' @name write.df #' @export @@ -2529,6 +2604,7 @@ setMethod("write.df", #' @rdname write.df #' @name saveDF +#' @aliases saveDF,SparkDataFrame,character-method #' @export #' @note saveDF since 1.4.0 setMethod("saveDF", @@ -2558,6 +2634,7 @@ setMethod("saveDF", #' @param mode One of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default) #' #' @family SparkDataFrame functions +#' @aliases saveAsTable,SparkDataFrame,character-method #' @rdname saveAsTable #' @name saveAsTable #' @export @@ -2595,6 +2672,7 @@ setMethod("saveAsTable", #' @param ... Additional expressions #' @return A SparkDataFrame #' @family SparkDataFrame functions +#' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method #' @rdname summary #' @name describe #' @export @@ -2618,6 +2696,7 @@ setMethod("describe", #' @rdname summary #' @name describe +#' @aliases describe,SparkDataFrame-method #' @note describe(SparkDataFrame) since 1.4.0 setMethod("describe", signature(x = "SparkDataFrame"), @@ -2628,6 +2707,7 @@ setMethod("describe", #' @rdname summary #' @name summary +#' @aliases summary,SparkDataFrame-method #' @note summary(SparkDataFrame) since 1.5.0 setMethod("summary", signature(object = "SparkDataFrame"), @@ -2653,6 +2733,7 @@ setMethod("summary", #' #' @family SparkDataFrame functions #' @rdname nafunctions +#' @aliases dropna,SparkDataFrame-method #' @name dropna #' @export #' @examples @@ -2682,6 +2763,7 @@ setMethod("dropna", #' @rdname nafunctions #' @name na.omit +#' @aliases na.omit,SparkDataFrame-method #' @export #' @note na.omit since 1.5.0 setMethod("na.omit", @@ -2707,6 +2789,7 @@ setMethod("na.omit", #' #' @rdname nafunctions #' @name fillna +#' @aliases fillna,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -2770,6 +2853,7 @@ setMethod("fillna", #' @param x a SparkDataFrame #' @return a data.frame #' @family SparkDataFrame functions +#' @aliases as.data.frame,SparkDataFrame-method #' @rdname as.data.frame #' @examples \dontrun{ #' @@ -2791,6 +2875,7 @@ setMethod("as.data.frame", #' #' @family SparkDataFrame functions #' @rdname attach +#' @aliases attach,SparkDataFrame-method #' @param what (SparkDataFrame) The SparkDataFrame to attach #' @param pos (integer) Specify position in search() where to attach. #' @param name (character) Name to use for the attached SparkDataFrame. Names @@ -2821,6 +2906,7 @@ setMethod("attach", #' #' @rdname with #' @family SparkDataFrame functions +#' @aliases with,SparkDataFrame-method #' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. #' @param ... arguments to be passed to future methods. @@ -2844,6 +2930,7 @@ setMethod("with", #' #' @name str #' @rdname str +#' @aliases str,SparkDataFrame-method #' @family SparkDataFrame functions #' @param object a SparkDataFrame #' @examples \dontrun{ @@ -2925,6 +3012,7 @@ setMethod("str", #' @family SparkDataFrame functions #' @rdname drop #' @name drop +#' @aliases drop,SparkDataFrame-method #' @export #' @examples #'\dontrun{ @@ -2950,6 +3038,10 @@ setMethod("drop", }) # Expose base::drop +#' @name drop +#' @rdname drop +#' @aliases drop,ANY-method +#' @export setMethod("drop", signature(x = "ANY"), function(x) { @@ -2966,6 +3058,7 @@ setMethod("drop", #' @param colname the name of the column to build the histogram from. #' @return a data.frame with the histogram statistics, i.e., counts and centroids. #' @rdname histogram +#' @aliases histogram,SparkDataFrame,characterOrColumn-method #' @family SparkDataFrame functions #' @export #' @examples @@ -3025,7 +3118,7 @@ setMethod("histogram", # columns AND all of them have names 100 characters long (which is very unlikely), # AND they run 1 billion histograms, the probability of collision will roughly be # 1 in 4.4 x 10 ^ 96 - colname <- paste(base:::sample(c(letters, LETTERS), + colname <- paste(base::sample(c(letters, LETTERS), size = min(max(nchar(colnames(df))) + 1, 100), replace = TRUE), collapse = "") @@ -3100,6 +3193,7 @@ setMethod("histogram", #' @family SparkDataFrame functions #' @rdname write.jdbc #' @name write.jdbc +#' @aliases write.jdbc,SparkDataFrame,character,character-method #' @export #' @examples #'\dontrun{ @@ -3127,6 +3221,7 @@ setMethod("write.jdbc", #' @param seed A seed to use for random split #' #' @family SparkDataFrame functions +#' @aliases randomSplit,SparkDataFrame,numeric-method #' @rdname randomSplit #' @name randomSplit #' @export diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index bc0daa25c9f6..a14bcd91b3ea 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -48,7 +48,9 @@ getInternalType <- function(x) { #' @return whatever the target returns #' @noRd dispatchFunc <- function(newFuncSig, x, ...) { - funcName <- as.character(sys.call(sys.parent())[[1]]) + # When called with SparkR::createDataFrame, sys.call()[[1]] returns c(::, SparkR, createDataFrame) + callsite <- as.character(sys.call(sys.parent())[[1]]) + funcName <- callsite[[length(callsite)]] f <- get(paste0(funcName, ".default")) # Strip sqlContext from list of parameters and then pass the rest along. contextNames <- c("org.apache.spark.sql.SQLContext", @@ -267,6 +269,9 @@ as.DataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0) { createDataFrame(data, schema, samplingRatio) } +#' @rdname createDataFrame +#' @aliases as.DataFrame +#' @export as.DataFrame <- function(x, ...) { dispatchFunc("as.DataFrame(data, schema = NULL, samplingRatio = 1.0)", x, ...) } diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 9f3b1e4be560..474638009624 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -22,10 +22,10 @@ NULL #' S4 class that represents a WindowSpec #' -#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' WindowSpec can be created by using windowPartitionBy() or windowOrderBy() #' #' @rdname WindowSpec -#' @seealso \link{window.partitionBy}, \link{window.orderBy} +#' @seealso \link{windowPartitionBy}, \link{windowOrderBy} #' #' @param sws A Java object reference to the backing Scala WindowSpec #' @export @@ -58,6 +58,7 @@ setMethod("show", "WindowSpec", #' @return a WindowSpec #' @rdname partitionBy #' @name partitionBy +#' @aliases partitionBy,WindowSpec-method #' @family windowspec_method #' @export #' @examples @@ -89,6 +90,7 @@ setMethod("partitionBy", #' @return a WindowSpec #' @rdname arrange #' @name orderBy +#' @aliases orderBy,WindowSpec,character-method #' @family windowspec_method #' @export #' @examples @@ -105,6 +107,7 @@ setMethod("orderBy", #' @rdname arrange #' @name orderBy +#' @aliases orderBy,WindowSpec,Column-method #' @export #' @note orderBy(WindowSpec, Column) since 2.0.0 setMethod("orderBy", @@ -131,6 +134,7 @@ setMethod("orderBy", #' The frame is unbounded if this is the maximum long value. #' @return a WindowSpec #' @rdname rowsBetween +#' @aliases rowsBetween,WindowSpec,numeric,numeric-method #' @name rowsBetween #' @family windowspec_method #' @export @@ -162,6 +166,7 @@ setMethod("rowsBetween", #' The frame is unbounded if this is the maximum long value. #' @return a WindowSpec #' @rdname rangeBetween +#' @aliases rangeBetween,WindowSpec,numeric,numeric-method #' @name rangeBetween #' @family windowspec_method #' @export @@ -187,6 +192,7 @@ setMethod("rangeBetween", #' #' @rdname over #' @name over +#' @aliases over,Column,WindowSpec-method #' @family colum_func #' @export #' @note over since 2.0.0 diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 1a65912d3aed..0edb9d2ae5c4 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -44,6 +44,9 @@ setMethod("initialize", "Column", function(.Object, jc) { .Object }) +#' @rdname column +#' @name column +#' @aliases column,jobj-method setMethod("column", signature(x = "jobj"), function(x) { @@ -52,6 +55,7 @@ setMethod("column", #' @rdname show #' @name show +#' @aliases show,Column-method #' @export #' @note show(Column) since 1.4.0 setMethod("show", "Column", @@ -131,8 +135,12 @@ createMethods() #' #' Set a new name for a column #' +#' @param object Column to rename +#' @param data new name to use +#' #' @rdname alias #' @name alias +#' @aliases alias,Column-method #' @family colum_func #' @export #' @note alias since 1.4.0 @@ -153,6 +161,7 @@ setMethod("alias", #' @rdname substr #' @name substr #' @family colum_func +#' @aliases substr,Column-method #' #' @param start starting position #' @param stop ending position @@ -171,8 +180,9 @@ setMethod("substr", signature(x = "Column"), #' @rdname startsWith #' @name startsWith #' @family colum_func +#' @aliases startsWith,Column-method #' -#' @param x vector of character string whose “starts” are considered +#' @param x vector of character string whose "starts" are considered #' @param prefix character vector (often of length one) #' @note startsWith since 1.4.0 setMethod("startsWith", signature(x = "Column"), @@ -189,8 +199,9 @@ setMethod("startsWith", signature(x = "Column"), #' @rdname endsWith #' @name endsWith #' @family colum_func +#' @aliases endsWith,Column-method #' -#' @param x vector of character string whose “ends” are considered +#' @param x vector of character string whose "ends" are considered #' @param suffix character vector (often of length one) #' @note endsWith since 1.4.0 setMethod("endsWith", signature(x = "Column"), @@ -206,6 +217,7 @@ setMethod("endsWith", signature(x = "Column"), #' @rdname between #' @name between #' @family colum_func +#' @aliases between,Column-method #' #' @param bounds lower and upper bounds #' @note between since 1.5.0 @@ -224,10 +236,10 @@ setMethod("between", signature(x = "Column"), #' @rdname cast #' @name cast #' @family colum_func +#' @aliases cast,Column-method #' #' @examples \dontrun{ #' cast(df$age, "string") -#' cast(df$name, list(type="array", elementType="byte", containsNull = TRUE)) #' } #' @note cast since 1.4.0 setMethod("cast", @@ -235,12 +247,8 @@ setMethod("cast", function(x, dataType) { if (is.character(dataType)) { column(callJMethod(x@jc, "cast", dataType)) - } else if (is.list(dataType)) { - json <- tojson(dataType) - jdataType <- callJStatic("org.apache.spark.sql.types.DataType", "fromJson", json) - column(callJMethod(x@jc, "cast", jdataType)) } else { - stop("dataType should be character or list") + stop("dataType should be character") } }) @@ -248,7 +256,7 @@ setMethod("cast", #' #' @rdname match #' @name %in% -#' @aliases %in% +#' @aliases %in%,Column-method #' @return a matched values as a result of comparing with given values. #' @export #' @examples @@ -272,6 +280,7 @@ setMethod("%in%", #' @rdname otherwise #' @name otherwise #' @family colum_func +#' @aliases otherwise,Column-method #' @export #' @note otherwise since 1.5.0 setMethod("otherwise", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 52d46f9d7612..573c915a5c67 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -27,6 +27,7 @@ NULL #' @rdname lit #' @name lit #' @export +#' @aliases lit,ANY-method #' @examples #' \dontrun{ #' lit(df$name) @@ -46,11 +47,14 @@ setMethod("lit", signature("ANY"), #' #' Computes the absolute value. #' +#' @param x Column to compute on. +#' #' @rdname abs #' @name abs #' @family normal_funcs #' @export #' @examples \dontrun{abs(df$c)} +#' @aliases abs,Column-method #' @note abs since 1.5.0 setMethod("abs", signature(x = "Column"), @@ -64,11 +68,14 @@ setMethod("abs", #' Computes the cosine inverse of the given value; the returned angle is in the range #' 0.0 through pi. #' +#' @param x Column to compute on. +#' #' @rdname acos #' @name acos #' @family math_funcs #' @export #' @examples \dontrun{acos(df$c)} +#' @aliases acos,Column-method #' @note acos since 1.5.0 setMethod("acos", signature(x = "Column"), @@ -82,10 +89,13 @@ setMethod("acos", #' Returns the approximate number of distinct items in a group. This is a column #' aggregate function. #' +#' @param x Column to compute on. +#' #' @rdname approxCountDistinct #' @name approxCountDistinct #' @return the approximate number of distinct items in a group. #' @export +#' @aliases approxCountDistinct,Column-method #' @examples \dontrun{approxCountDistinct(df$c)} #' @note approxCountDistinct(Column) since 1.4.0 setMethod("approxCountDistinct", @@ -100,10 +110,13 @@ setMethod("approxCountDistinct", #' Computes the numeric value of the first character of the string column, and returns the #' result as a int column. #' +#' @param x Column to compute on. +#' #' @rdname ascii #' @name ascii #' @family string_funcs #' @export +#' @aliases ascii,Column-method #' @examples \dontrun{\dontrun{ascii(df$c)}} #' @note ascii since 1.5.0 setMethod("ascii", @@ -118,10 +131,13 @@ setMethod("ascii", #' Computes the sine inverse of the given value; the returned angle is in the range #' -pi/2 through pi/2. #' +#' @param x Column to compute on. +#' #' @rdname asin #' @name asin #' @family math_funcs #' @export +#' @aliases asin,Column-method #' @examples \dontrun{asin(df$c)} #' @note asin since 1.5.0 setMethod("asin", @@ -135,10 +151,13 @@ setMethod("asin", #' #' Computes the tangent inverse of the given value. #' +#' @param x Column to compute on. +#' #' @rdname atan #' @name atan #' @family math_funcs #' @export +#' @aliases atan,Column-method #' @examples \dontrun{atan(df$c)} #' @note atan since 1.5.0 setMethod("atan", @@ -152,10 +171,13 @@ setMethod("atan", #' #' Aggregate function: returns the average of the values in a group. #' +#' @param x Column to compute on. +#' #' @rdname avg #' @name avg #' @family agg_funcs #' @export +#' @aliases avg,Column-method #' @examples \dontrun{avg(df$c)} #' @note avg since 1.4.0 setMethod("avg", @@ -170,10 +192,13 @@ setMethod("avg", #' Computes the BASE64 encoding of a binary column and returns it as a string column. #' This is the reverse of unbase64. #' +#' @param x Column to compute on. +#' #' @rdname base64 #' @name base64 #' @family string_funcs #' @export +#' @aliases base64,Column-method #' @examples \dontrun{base64(df$c)} #' @note base64 since 1.5.0 setMethod("base64", @@ -188,10 +213,13 @@ setMethod("base64", #' An expression that returns the string representation of the binary value of the given long #' column. For example, bin("12") returns "1100". #' +#' @param x Column to compute on. +#' #' @rdname bin #' @name bin #' @family math_funcs #' @export +#' @aliases bin,Column-method #' @examples \dontrun{bin(df$c)} #' @note bin since 1.5.0 setMethod("bin", @@ -205,10 +233,13 @@ setMethod("bin", #' #' Computes bitwise NOT. #' +#' @param x Column to compute on. +#' #' @rdname bitwiseNOT #' @name bitwiseNOT #' @family normal_funcs #' @export +#' @aliases bitwiseNOT,Column-method #' @examples \dontrun{bitwiseNOT(df$c)} #' @note bitwiseNOT since 1.5.0 setMethod("bitwiseNOT", @@ -222,10 +253,13 @@ setMethod("bitwiseNOT", #' #' Computes the cube-root of the given value. #' +#' @param x Column to compute on. +#' #' @rdname cbrt #' @name cbrt #' @family math_funcs #' @export +#' @aliases cbrt,Column-method #' @examples \dontrun{cbrt(df$c)} #' @note cbrt since 1.4.0 setMethod("cbrt", @@ -239,10 +273,13 @@ setMethod("cbrt", #' #' Computes the ceiling of the given value. #' +#' @param x Column to compute on. +#' #' @rdname ceil #' @name ceil #' @family math_funcs #' @export +#' @aliases ceil,Column-method #' @examples \dontrun{ceil(df$c)} #' @note ceil since 1.5.0 setMethod("ceil", @@ -263,11 +300,14 @@ col <- function(x) { #' Returns a Column based on the given column name #' #' Returns a Column based on the given column name. +# +#' @param x Character column name. #' #' @rdname column #' @name column #' @family normal_funcs #' @export +#' @aliases column,character-method #' @examples \dontrun{column(df)} #' @note column since 1.6.0 setMethod("column", @@ -279,10 +319,13 @@ setMethod("column", #' #' Computes the Pearson Correlation Coefficient for two Columns. #' +#' @param x Column to compute on. +#' #' @rdname corr #' @name corr #' @family math_funcs #' @export +#' @aliases corr,Column-method #' @examples \dontrun{corr(df$c, df$d)} #' @note corr since 1.6.0 setMethod("corr", signature(x = "Column"), @@ -296,10 +339,13 @@ setMethod("corr", signature(x = "Column"), #' #' Compute the sample covariance between two expressions. #' +#' @param x Column to compute on. +#' #' @rdname cov #' @name cov #' @family math_funcs #' @export +#' @aliases cov,characterOrColumn-method #' @examples #' \dontrun{ #' cov(df$c, df$d) @@ -315,7 +361,11 @@ setMethod("cov", signature(x = "characterOrColumn"), }) #' @rdname cov +#' +#' @param col1 First column to compute cov_samp. +#' @param col2 Second column to compute cov_samp. #' @name covar_samp +#' @aliases covar_samp,characterOrColumn,characterOrColumn-method #' @note covar_samp since 2.0.0 setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"), function(col1, col2) { @@ -332,10 +382,14 @@ setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterO #' #' Compute the population covariance between two expressions. #' +#' @param col1 First column to compute cov_pop. +#' @param col2 Second column to compute cov_pop. +#' #' @rdname covar_pop #' @name covar_pop #' @family math_funcs #' @export +#' @aliases covar_pop,characterOrColumn,characterOrColumn-method #' @examples #' \dontrun{ #' covar_pop(df$c, df$d) @@ -357,9 +411,12 @@ setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOr #' #' Computes the cosine of the given value. #' +#' @param x Column to compute on. +#' #' @rdname cos #' @name cos #' @family math_funcs +#' @aliases cos,Column-method #' @export #' @examples \dontrun{cos(df$c)} #' @note cos since 1.5.0 @@ -374,9 +431,12 @@ setMethod("cos", #' #' Computes the hyperbolic cosine of the given value. #' +#' @param x Column to compute on. +#' #' @rdname cosh #' @name cosh #' @family math_funcs +#' @aliases cosh,Column-method #' @export #' @examples \dontrun{cosh(df$c)} #' @note cosh since 1.5.0 @@ -391,9 +451,12 @@ setMethod("cosh", #' #' Returns the number of items in a group. This is a column aggregate function. #' -#' @rdname count +#' @param x Column to compute on. +#' +#' @rdname nrow #' @name count #' @family agg_funcs +#' @aliases count,Column-method #' @export #' @examples \dontrun{count(df$c)} #' @note count since 1.4.0 @@ -409,9 +472,12 @@ setMethod("count", #' Calculates the cyclic redundancy check value (CRC32) of a binary column and #' returns the value as a bigint. #' +#' @param x Column to compute on. +#' #' @rdname crc32 #' @name crc32 #' @family misc_funcs +#' @aliases crc32,Column-method #' @export #' @examples \dontrun{crc32(df$c)} #' @note crc32 since 1.5.0 @@ -426,9 +492,12 @@ setMethod("crc32", #' #' Calculates the hash code of given columns, and returns the result as a int column. #' +#' @param x Column to compute on. +#' #' @rdname hash #' @name hash #' @family misc_funcs +#' @aliases hash,Column-method #' @export #' @examples \dontrun{hash(df$c)} #' @note hash since 2.0.0 @@ -447,9 +516,12 @@ setMethod("hash", #' #' Extracts the day of the month as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname dayofmonth #' @name dayofmonth #' @family datetime_funcs +#' @aliases dayofmonth,Column-method #' @export #' @examples \dontrun{dayofmonth(df$c)} #' @note dayofmonth since 1.5.0 @@ -464,9 +536,12 @@ setMethod("dayofmonth", #' #' Extracts the day of the year as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname dayofyear #' @name dayofyear #' @family datetime_funcs +#' @aliases dayofyear,Column-method #' @export #' @examples \dontrun{dayofyear(df$c)} #' @note dayofyear since 1.5.0 @@ -482,9 +557,13 @@ setMethod("dayofyear", #' Computes the first argument into a string from a binary using the provided character set #' (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). #' +#' @param x Column to compute on. +#' @param charset Character set to use +#' #' @rdname decode #' @name decode #' @family string_funcs +#' @aliases decode,Column,character-method #' @export #' @examples \dontrun{decode(df$c, "UTF-8")} #' @note decode since 1.6.0 @@ -500,9 +579,13 @@ setMethod("decode", #' Computes the first argument into a binary from a string using the provided character set #' (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). #' +#' @param x Column to compute on. +#' @param charset Character set to use +#' #' @rdname encode #' @name encode #' @family string_funcs +#' @aliases encode,Column,character-method #' @export #' @examples \dontrun{encode(df$c, "UTF-8")} #' @note encode since 1.6.0 @@ -517,9 +600,12 @@ setMethod("encode", #' #' Computes the exponential of the given value. #' +#' @param x Column to compute on. +#' #' @rdname exp #' @name exp #' @family math_funcs +#' @aliases exp,Column-method #' @export #' @examples \dontrun{exp(df$c)} #' @note exp since 1.5.0 @@ -534,8 +620,11 @@ setMethod("exp", #' #' Computes the exponential of the given value minus one. #' +#' @param x Column to compute on. +#' #' @rdname expm1 #' @name expm1 +#' @aliases expm1,Column-method #' @family math_funcs #' @export #' @examples \dontrun{expm1(df$c)} @@ -551,8 +640,11 @@ setMethod("expm1", #' #' Computes the factorial of the given value. #' +#' @param x Column to compute on. +#' #' @rdname factorial #' @name factorial +#' @aliases factorial,Column-method #' @family math_funcs #' @export #' @examples \dontrun{factorial(df$c)} @@ -571,8 +663,11 @@ setMethod("factorial", #' The function by default returns the first values it sees. It will return the first non-missing #' value it sees when na.rm is set to true. If all values are missing, then NA is returned. #' +#' @param x Column to compute on. +#' #' @rdname first #' @name first +#' @aliases first,characterOrColumn-method #' @family agg_funcs #' @export #' @examples @@ -597,8 +692,11 @@ setMethod("first", #' #' Computes the floor of the given value. #' +#' @param x Column to compute on. +#' #' @rdname floor #' @name floor +#' @aliases floor,Column-method #' @family math_funcs #' @export #' @examples \dontrun{floor(df$c)} @@ -614,9 +712,12 @@ setMethod("floor", #' #' Computes hex value of the given column. #' +#' @param x Column to compute on. +#' #' @rdname hex #' @name hex #' @family math_funcs +#' @aliases hex,Column-method #' @export #' @examples \dontrun{hex(df$c)} #' @note hex since 1.5.0 @@ -631,8 +732,11 @@ setMethod("hex", #' #' Extracts the hours as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname hour #' @name hour +#' @aliases hour,Column-method #' @family datetime_funcs #' @export #' @examples \dontrun{hour(df$c)} @@ -651,9 +755,12 @@ setMethod("hour", #' #' For example, "hello world" will become "Hello World". #' +#' @param x Column to compute on. +#' #' @rdname initcap #' @name initcap #' @family string_funcs +#' @aliases initcap,Column-method #' @export #' @examples \dontrun{initcap(df$c)} #' @note initcap since 1.5.0 @@ -668,9 +775,12 @@ setMethod("initcap", #' #' Return true if the column is NaN, alias for \link{isnan} #' +#' @param x Column to compute on. +#' #' @rdname is.nan #' @name is.nan #' @family normal_funcs +#' @aliases is.nan,Column-method #' @export #' @examples #' \dontrun{ @@ -686,6 +796,7 @@ setMethod("is.nan", #' @rdname is.nan #' @name isnan +#' @aliases isnan,Column-method #' @note isnan since 2.0.0 setMethod("isnan", signature(x = "Column"), @@ -698,8 +809,11 @@ setMethod("isnan", #' #' Aggregate function: returns the kurtosis of the values in a group. #' +#' @param x Column to compute on. +#' #' @rdname kurtosis #' @name kurtosis +#' @aliases kurtosis,Column-method #' @family agg_funcs #' @export #' @examples \dontrun{kurtosis(df$c)} @@ -718,8 +832,11 @@ setMethod("kurtosis", #' The function by default returns the last values it sees. It will return the last non-missing #' value it sees when na.rm is set to true. If all values are missing, then NA is returned. #' +#' @param x Column to compute on. +#' #' @rdname last #' @name last +#' @aliases last,characterOrColumn-method #' @family agg_funcs #' @export #' @examples @@ -746,8 +863,11 @@ setMethod("last", #' For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the #' month in July 2015. #' +#' @param x Column to compute on. +#' #' @rdname last_day #' @name last_day +#' @aliases last_day,Column-method #' @family datetime_funcs #' @export #' @examples \dontrun{last_day(df$c)} @@ -763,8 +883,11 @@ setMethod("last_day", #' #' Computes the length of a given string or binary column. #' +#' @param x Column to compute on. +#' #' @rdname length #' @name length +#' @aliases length,Column-method #' @family string_funcs #' @export #' @examples \dontrun{length(df$c)} @@ -780,8 +903,11 @@ setMethod("length", #' #' Computes the natural logarithm of the given value. #' +#' @param x Column to compute on. +#' #' @rdname log #' @name log +#' @aliases log,Column-method #' @family math_funcs #' @export #' @examples \dontrun{log(df$c)} @@ -797,9 +923,12 @@ setMethod("log", #' #' Computes the logarithm of the given value in base 10. #' +#' @param x Column to compute on. +#' #' @rdname log10 #' @name log10 #' @family math_funcs +#' @aliases log10,Column-method #' @export #' @examples \dontrun{log10(df$c)} #' @note log10 since 1.5.0 @@ -814,9 +943,12 @@ setMethod("log10", #' #' Computes the natural logarithm of the given value plus one. #' +#' @param x Column to compute on. +#' #' @rdname log1p #' @name log1p #' @family math_funcs +#' @aliases log1p,Column-method #' @export #' @examples \dontrun{log1p(df$c)} #' @note log1p since 1.5.0 @@ -831,9 +963,12 @@ setMethod("log1p", #' #' Computes the logarithm of the given column in base 2. #' +#' @param x Column to compute on. +#' #' @rdname log2 #' @name log2 #' @family math_funcs +#' @aliases log2,Column-method #' @export #' @examples \dontrun{log2(df$c)} #' @note log2 since 1.5.0 @@ -848,9 +983,12 @@ setMethod("log2", #' #' Converts a string column to lower case. #' +#' @param x Column to compute on. +#' #' @rdname lower #' @name lower #' @family string_funcs +#' @aliases lower,Column-method #' @export #' @examples \dontrun{lower(df$c)} #' @note lower since 1.4.0 @@ -865,9 +1003,12 @@ setMethod("lower", #' #' Trim the spaces from left end for the specified string value. #' +#' @param x Column to compute on. +#' #' @rdname ltrim #' @name ltrim #' @family string_funcs +#' @aliases ltrim,Column-method #' @export #' @examples \dontrun{ltrim(df$c)} #' @note ltrim since 1.5.0 @@ -882,9 +1023,12 @@ setMethod("ltrim", #' #' Aggregate function: returns the maximum value of the expression in a group. #' +#' @param x Column to compute on. +#' #' @rdname max #' @name max #' @family agg_funcs +#' @aliases max,Column-method #' @export #' @examples \dontrun{max(df$c)} #' @note max since 1.5.0 @@ -900,9 +1044,12 @@ setMethod("max", #' Calculates the MD5 digest of a binary column and returns the value #' as a 32 character hex string. #' +#' @param x Column to compute on. +#' #' @rdname md5 #' @name md5 #' @family misc_funcs +#' @aliases md5,Column-method #' @export #' @examples \dontrun{md5(df$c)} #' @note md5 since 1.5.0 @@ -918,9 +1065,12 @@ setMethod("md5", #' Aggregate function: returns the average of the values in a group. #' Alias for avg. #' +#' @param x Column to compute on. +#' #' @rdname mean #' @name mean #' @family agg_funcs +#' @aliases mean,Column-method #' @export #' @examples \dontrun{mean(df$c)} #' @note mean since 1.5.0 @@ -935,8 +1085,11 @@ setMethod("mean", #' #' Aggregate function: returns the minimum value of the expression in a group. #' +#' @param x Column to compute on. +#' #' @rdname min #' @name min +#' @aliases min,Column-method #' @family agg_funcs #' @export #' @examples \dontrun{min(df$c)} @@ -952,8 +1105,11 @@ setMethod("min", #' #' Extracts the minutes as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname minute #' @name minute +#' @aliases minute,Column-method #' @family datetime_funcs #' @export #' @examples \dontrun{minute(df$c)} @@ -981,6 +1137,7 @@ setMethod("minute", #' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL. #' #' @rdname monotonically_increasing_id +#' @aliases monotonically_increasing_id,missing-method #' @name monotonically_increasing_id #' @family misc_funcs #' @export @@ -996,8 +1153,11 @@ setMethod("monotonically_increasing_id", #' #' Extracts the month as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname month #' @name month +#' @aliases month,Column-method #' @family datetime_funcs #' @export #' @examples \dontrun{month(df$c)} @@ -1013,9 +1173,12 @@ setMethod("month", #' #' Unary minus, i.e. negate the expression. #' +#' @param x Column to compute on. +#' #' @rdname negate #' @name negate #' @family normal_funcs +#' @aliases negate,Column-method #' @export #' @examples \dontrun{negate(df$c)} #' @note negate since 1.5.0 @@ -1030,9 +1193,12 @@ setMethod("negate", #' #' Extracts the quarter as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname quarter #' @name quarter #' @family datetime_funcs +#' @aliases quarter,Column-method #' @export #' @examples \dontrun{quarter(df$c)} #' @note quarter since 1.5.0 @@ -1047,9 +1213,12 @@ setMethod("quarter", #' #' Reverses the string column and returns it as a new string column. #' +#' @param x Column to compute on. +#' #' @rdname reverse #' @name reverse #' @family string_funcs +#' @aliases reverse,Column-method #' @export #' @examples \dontrun{reverse(df$c)} #' @note reverse since 1.5.0 @@ -1065,9 +1234,12 @@ setMethod("reverse", #' Returns the double value that is closest in value to the argument and #' is equal to a mathematical integer. #' +#' @param x Column to compute on. +#' #' @rdname rint #' @name rint #' @family math_funcs +#' @aliases rint,Column-method #' @export #' @examples \dontrun{rint(df$c)} #' @note rint since 1.5.0 @@ -1082,9 +1254,12 @@ setMethod("rint", #' #' Returns the value of the column `e` rounded to 0 decimal places using HALF_UP rounding mode. #' +#' @param x Column to compute on. +#' #' @rdname round #' @name round #' @family math_funcs +#' @aliases round,Column-method #' @export #' @examples \dontrun{round(df$c)} #' @note round since 1.5.0 @@ -1102,9 +1277,12 @@ setMethod("round", #' Also known as Gaussian rounding or bankers' rounding that rounds to the nearest even number. #' bround(2.5, 0) = 2, bround(3.5, 0) = 4. #' +#' @param x Column to compute on. +#' #' @rdname bround #' @name bround #' @family math_funcs +#' @aliases bround,Column-method #' @export #' @examples \dontrun{bround(df$c, 0)} #' @note bround since 2.0.0 @@ -1120,9 +1298,12 @@ setMethod("bround", #' #' Trim the spaces from right end for the specified string value. #' +#' @param x Column to compute on. +#' #' @rdname rtrim #' @name rtrim #' @family string_funcs +#' @aliases rtrim,Column-method #' @export #' @examples \dontrun{rtrim(df$c)} #' @note rtrim since 1.5.0 @@ -1137,9 +1318,12 @@ setMethod("rtrim", #' #' Aggregate function: alias for \link{stddev_samp} #' +#' @param x Column to compute on. +#' #' @rdname sd #' @name sd #' @family agg_funcs +#' @aliases sd,Column-method #' @seealso \link{stddev_pop}, \link{stddev_samp} #' @export #' @examples @@ -1160,9 +1344,12 @@ setMethod("sd", #' #' Extracts the seconds as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname second #' @name second #' @family datetime_funcs +#' @aliases second,Column-method #' @export #' @examples \dontrun{second(df$c)} #' @note second since 1.5.0 @@ -1178,9 +1365,12 @@ setMethod("second", #' Calculates the SHA-1 digest of a binary column and returns the value #' as a 40 character hex string. #' +#' @param x Column to compute on. +#' #' @rdname sha1 #' @name sha1 #' @family misc_funcs +#' @aliases sha1,Column-method #' @export #' @examples \dontrun{sha1(df$c)} #' @note sha1 since 1.5.0 @@ -1195,8 +1385,11 @@ setMethod("sha1", #' #' Computes the signum of the given value. #' +#' @param x Column to compute on. +#' #' @rdname sign #' @name signum +#' @aliases signum,Column-method #' @family math_funcs #' @export #' @examples \dontrun{signum(df$c)} @@ -1212,9 +1405,12 @@ setMethod("signum", #' #' Computes the sine of the given value. #' +#' @param x Column to compute on. +#' #' @rdname sin #' @name sin #' @family math_funcs +#' @aliases sin,Column-method #' @export #' @examples \dontrun{sin(df$c)} #' @note sin since 1.5.0 @@ -1229,9 +1425,12 @@ setMethod("sin", #' #' Computes the hyperbolic sine of the given value. #' +#' @param x Column to compute on. +#' #' @rdname sinh #' @name sinh #' @family math_funcs +#' @aliases sinh,Column-method #' @export #' @examples \dontrun{sinh(df$c)} #' @note sinh since 1.5.0 @@ -1246,9 +1445,12 @@ setMethod("sinh", #' #' Aggregate function: returns the skewness of the values in a group. #' +#' @param x Column to compute on. +#' #' @rdname skewness #' @name skewness #' @family agg_funcs +#' @aliases skewness,Column-method #' @export #' @examples \dontrun{skewness(df$c)} #' @note skewness since 1.6.0 @@ -1263,9 +1465,12 @@ setMethod("skewness", #' #' Return the soundex code for the specified expression. #' +#' @param x Column to compute on. +#' #' @rdname soundex #' @name soundex #' @family string_funcs +#' @aliases soundex,Column-method #' @export #' @examples \dontrun{soundex(df$c)} #' @note soundex since 1.5.0 @@ -1286,6 +1491,7 @@ setMethod("soundex", #' #' @rdname spark_partition_id #' @name spark_partition_id +#' @aliases spark_partition_id,missing-method #' @export #' @examples #' \dontrun{select(df, spark_partition_id())} @@ -1298,6 +1504,7 @@ setMethod("spark_partition_id", }) #' @rdname sd +#' @aliases stddev,Column-method #' @name stddev #' @note stddev since 1.6.0 setMethod("stddev", @@ -1311,9 +1518,12 @@ setMethod("stddev", #' #' Aggregate function: returns the population standard deviation of the expression in a group. #' +#' @param x Column to compute on. +#' #' @rdname stddev_pop #' @name stddev_pop #' @family agg_funcs +#' @aliases stddev_pop,Column-method #' @seealso \link{sd}, \link{stddev_samp} #' @export #' @examples \dontrun{stddev_pop(df$c)} @@ -1329,9 +1539,12 @@ setMethod("stddev_pop", #' #' Aggregate function: returns the unbiased sample standard deviation of the expression in a group. #' +#' @param x Column to compute on. +#' #' @rdname stddev_samp #' @name stddev_samp #' @family agg_funcs +#' @aliases stddev_samp,Column-method #' @seealso \link{stddev_pop}, \link{sd} #' @export #' @examples \dontrun{stddev_samp(df$c)} @@ -1347,9 +1560,12 @@ setMethod("stddev_samp", #' #' Creates a new struct column that composes multiple input columns. #' +#' @param x Column to compute on. +#' #' @rdname struct #' @name struct #' @family normal_funcs +#' @aliases struct,characterOrColumn-method #' @export #' @examples #' \dontrun{ @@ -1373,9 +1589,12 @@ setMethod("struct", #' #' Computes the square root of the specified float value. #' +#' @param x Column to compute on. +#' #' @rdname sqrt #' @name sqrt #' @family math_funcs +#' @aliases sqrt,Column-method #' @export #' @examples \dontrun{sqrt(df$c)} #' @note sqrt since 1.5.0 @@ -1390,9 +1609,12 @@ setMethod("sqrt", #' #' Aggregate function: returns the sum of all values in the expression. #' +#' @param x Column to compute on. +#' #' @rdname sum #' @name sum #' @family agg_funcs +#' @aliases sum,Column-method #' @export #' @examples \dontrun{sum(df$c)} #' @note sum since 1.5.0 @@ -1407,9 +1629,12 @@ setMethod("sum", #' #' Aggregate function: returns the sum of distinct values in the expression. #' +#' @param x Column to compute on. +#' #' @rdname sumDistinct #' @name sumDistinct #' @family agg_funcs +#' @aliases sumDistinct,Column-method #' @export #' @examples \dontrun{sumDistinct(df$c)} #' @note sumDistinct since 1.4.0 @@ -1424,9 +1649,12 @@ setMethod("sumDistinct", #' #' Computes the tangent of the given value. #' +#' @param x Column to compute on. +#' #' @rdname tan #' @name tan #' @family math_funcs +#' @aliases tan,Column-method #' @export #' @examples \dontrun{tan(df$c)} #' @note tan since 1.5.0 @@ -1441,9 +1669,12 @@ setMethod("tan", #' #' Computes the hyperbolic tangent of the given value. #' +#' @param x Column to compute on. +#' #' @rdname tanh #' @name tanh #' @family math_funcs +#' @aliases tanh,Column-method #' @export #' @examples \dontrun{tanh(df$c)} #' @note tanh since 1.5.0 @@ -1458,9 +1689,12 @@ setMethod("tanh", #' #' Converts an angle measured in radians to an approximately equivalent angle measured in degrees. #' +#' @param x Column to compute on. +#' #' @rdname toDegrees #' @name toDegrees #' @family math_funcs +#' @aliases toDegrees,Column-method #' @export #' @examples \dontrun{toDegrees(df$c)} #' @note toDegrees since 1.4.0 @@ -1475,9 +1709,12 @@ setMethod("toDegrees", #' #' Converts an angle measured in degrees to an approximately equivalent angle measured in radians. #' +#' @param x Column to compute on. +#' #' @rdname toRadians #' @name toRadians #' @family math_funcs +#' @aliases toRadians,Column-method #' @export #' @examples \dontrun{toRadians(df$c)} #' @note toRadians since 1.4.0 @@ -1492,9 +1729,12 @@ setMethod("toRadians", #' #' Converts the column into DateType. #' +#' @param x Column to compute on. +#' #' @rdname to_date #' @name to_date #' @family datetime_funcs +#' @aliases to_date,Column-method #' @export #' @examples \dontrun{to_date(df$c)} #' @note to_date since 1.5.0 @@ -1509,9 +1749,12 @@ setMethod("to_date", #' #' Trim the spaces from both ends for the specified string column. #' +#' @param x Column to compute on. +#' #' @rdname trim #' @name trim #' @family string_funcs +#' @aliases trim,Column-method #' @export #' @examples \dontrun{trim(df$c)} #' @note trim since 1.5.0 @@ -1527,9 +1770,12 @@ setMethod("trim", #' Decodes a BASE64 encoded string column and returns it as a binary column. #' This is the reverse of base64. #' +#' @param x Column to compute on. +#' #' @rdname unbase64 #' @name unbase64 #' @family string_funcs +#' @aliases unbase64,Column-method #' @export #' @examples \dontrun{unbase64(df$c)} #' @note unbase64 since 1.5.0 @@ -1545,9 +1791,12 @@ setMethod("unbase64", #' Inverse of hex. Interprets each pair of characters as a hexadecimal number #' and converts to the byte representation of number. #' +#' @param x Column to compute on. +#' #' @rdname unhex #' @name unhex #' @family math_funcs +#' @aliases unhex,Column-method #' @export #' @examples \dontrun{unhex(df$c)} #' @note unhex since 1.5.0 @@ -1562,9 +1811,12 @@ setMethod("unhex", #' #' Converts a string column to upper case. #' +#' @param x Column to compute on. +#' #' @rdname upper #' @name upper #' @family string_funcs +#' @aliases upper,Column-method #' @export #' @examples \dontrun{upper(df$c)} #' @note upper since 1.4.0 @@ -1579,9 +1831,12 @@ setMethod("upper", #' #' Aggregate function: alias for \link{var_samp}. #' +#' @param x Column to compute on. +#' #' @rdname var #' @name var #' @family agg_funcs +#' @aliases var,Column-method #' @seealso \link{var_pop}, \link{var_samp} #' @export #' @examples @@ -1599,6 +1854,7 @@ setMethod("var", }) #' @rdname var +#' @aliases variance,Column-method #' @name variance #' @note variance since 1.6.0 setMethod("variance", @@ -1612,9 +1868,12 @@ setMethod("variance", #' #' Aggregate function: returns the population variance of the values in a group. #' +#' @param x Column to compute on. +#' #' @rdname var_pop #' @name var_pop #' @family agg_funcs +#' @aliases var_pop,Column-method #' @seealso \link{var}, \link{var_samp} #' @export #' @examples \dontrun{var_pop(df$c)} @@ -1630,8 +1889,11 @@ setMethod("var_pop", #' #' Aggregate function: returns the unbiased variance of the values in a group. #' +#' @param x Column to compute on. +#' #' @rdname var_samp #' @name var_samp +#' @aliases var_samp,Column-method #' @family agg_funcs #' @seealso \link{var_pop}, \link{var} #' @export @@ -1648,8 +1910,11 @@ setMethod("var_samp", #' #' Extracts the week number as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname weekofyear #' @name weekofyear +#' @aliases weekofyear,Column-method #' @family datetime_funcs #' @export #' @examples \dontrun{weekofyear(df$c)} @@ -1665,9 +1930,12 @@ setMethod("weekofyear", #' #' Extracts the year as an integer from a given date/timestamp/string. #' +#' @param x Column to compute on. +#' #' @rdname year #' @name year #' @family datetime_funcs +#' @aliases year,Column-method #' @export #' @examples \dontrun{year(df$c)} #' @note year since 1.5.0 @@ -1682,10 +1950,14 @@ setMethod("year", #' #' Returns the angle theta from the conversion of rectangular coordinates (x, y) to #' polar coordinates (r, theta). +# +#' @param x Column to compute on. +#' @param y Column to compute on. #' #' @rdname atan2 #' @name atan2 #' @family math_funcs +#' @aliases atan2,Column-method #' @export #' @examples \dontrun{atan2(df$c, x)} #' @note atan2 since 1.5.0 @@ -1702,8 +1974,12 @@ setMethod("atan2", signature(y = "Column"), #' #' Returns the number of days from `start` to `end`. #' +#' @param x start Column to use. +#' @param y end Column to use. +#' #' @rdname datediff #' @name datediff +#' @aliases datediff,Column-method #' @family datetime_funcs #' @export #' @examples \dontrun{datediff(df$c, x)} @@ -1720,10 +1996,14 @@ setMethod("datediff", signature(y = "Column"), #' hypot #' #' Computes "sqrt(a^2 + b^2)" without intermediate overflow or underflow. +# +#' @param x Column to compute on. +#' @param y Column to compute on. #' #' @rdname hypot #' @name hypot #' @family math_funcs +#' @aliases hypot,Column-method #' @export #' @examples \dontrun{hypot(df$c, x)} #' @note hypot since 1.4.0 @@ -1740,9 +2020,13 @@ setMethod("hypot", signature(y = "Column"), #' #' Computes the Levenshtein distance of the two given string columns. #' +#' @param x Column to compute on. +#' @param y Column to compute on. +#' #' @rdname levenshtein #' @name levenshtein #' @family string_funcs +#' @aliases levenshtein,Column-method #' @export #' @examples \dontrun{levenshtein(df$c, x)} #' @note levenshtein since 1.5.0 @@ -1759,9 +2043,13 @@ setMethod("levenshtein", signature(y = "Column"), #' #' Returns number of months between dates `date1` and `date2`. #' +#' @param x start Column to use. +#' @param y end Column to use. +#' #' @rdname months_between #' @name months_between #' @family datetime_funcs +#' @aliases months_between,Column-method #' @export #' @examples \dontrun{months_between(df$c, x)} #' @note months_between since 1.5.0 @@ -1779,9 +2067,13 @@ setMethod("months_between", signature(y = "Column"), #' Returns col1 if it is not NaN, or col2 if col1 is NaN. #' Both inputs should be floating point columns (DoubleType or FloatType). #' +#' @param x first Column. +#' @param y second Column. +#' #' @rdname nanvl #' @name nanvl #' @family normal_funcs +#' @aliases nanvl,Column-method #' @export #' @examples \dontrun{nanvl(df$c, x)} #' @note nanvl since 1.5.0 @@ -1798,10 +2090,14 @@ setMethod("nanvl", signature(y = "Column"), #' #' Returns the positive value of dividend mod divisor. #' +#' @param x divisor Column. +#' @param y dividend Column. +#' #' @rdname pmod #' @name pmod #' @docType methods #' @family math_funcs +#' @aliases pmod,Column-method #' @export #' @examples \dontrun{pmod(df$c, x)} #' @note pmod since 1.5.0 @@ -1817,6 +2113,10 @@ setMethod("pmod", signature(y = "Column"), #' @rdname approxCountDistinct #' @name approxCountDistinct +#' +#' @param rsd maximum estimation error allowed (default = 0.05) +#' +#' @aliases approxCountDistinct,Column-method #' @export #' @examples \dontrun{approxCountDistinct(df$c, 0.02)} #' @note approxCountDistinct(Column, numeric) since 1.4.0 @@ -1829,9 +2129,13 @@ setMethod("approxCountDistinct", #' Count Distinct #' +#' @param x Column to compute on +#' @param ... other columns +#' #' @family agg_funcs #' @rdname countDistinct #' @name countDistinct +#' @aliases countDistinct,Column-method #' @return the number of distinct items in a group. #' @export #' @examples \dontrun{countDistinct(df$c)} @@ -1852,10 +2156,14 @@ setMethod("countDistinct", #' concat #' #' Concatenates multiple input string columns together into a single string column. +#' +#' @param x Column to compute on +#' @param ... other columns #' #' @family string_funcs #' @rdname concat #' @name concat +#' @aliases concat,Column-method #' @export #' @examples \dontrun{concat(df$strings, df$strings2)} #' @note concat since 1.5.0 @@ -1875,9 +2183,13 @@ setMethod("concat", #' Returns the greatest value of the list of column names, skipping null values. #' This function takes at least 2 parameters. It will return null if all parameters are null. #' +#' @param x Column to compute on +#' @param ... other columns +#' #' @family normal_funcs #' @rdname greatest #' @name greatest +#' @aliases greatest,Column-method #' @export #' @examples \dontrun{greatest(df$c, df$d)} #' @note greatest since 1.5.0 @@ -1898,8 +2210,12 @@ setMethod("greatest", #' Returns the least value of the list of column names, skipping null values. #' This function takes at least 2 parameters. It will return null if all parameters are null. #' +#' @param x Column to compute on +#' @param ... other columns +#' #' @family normal_funcs #' @rdname least +#' @aliases least,Column-method #' @name least #' @export #' @examples \dontrun{least(df$c, df$d)} @@ -1917,7 +2233,9 @@ setMethod("least", }) #' @rdname ceil +#' #' @name ceiling +#' @aliases ceiling,Column-method #' @export #' @examples \dontrun{ceiling(df$c)} #' @note ceiling since 1.5.0 @@ -1928,7 +2246,10 @@ setMethod("ceiling", }) #' @rdname sign +#' @param x Column to compute on +#' #' @name sign +#' @aliases sign,Column-method #' @export #' @examples \dontrun{sign(df$c)} #' @note sign since 1.5.0 @@ -1941,8 +2262,12 @@ setMethod("sign", signature(x = "Column"), #' #' Aggregate function: returns the number of distinct items in a group. #' +#' @param x Column to compute on +#' @param ... other columns +#' #' @rdname countDistinct #' @name n_distinct +#' @aliases n_distinct,Column-method #' @export #' @examples \dontrun{n_distinct(df$c)} #' @note n_distinct since 1.4.0 @@ -1951,8 +2276,11 @@ setMethod("n_distinct", signature(x = "Column"), countDistinct(x, ...) }) -#' @rdname count +#' @rdname nrow +#' @param x Column to compute on +#' #' @name n +#' @aliases n,Column-method #' @export #' @examples \dontrun{n(df$c)} #' @note n since 1.4.0 @@ -1972,9 +2300,13 @@ setMethod("n", signature(x = "Column"), #' NOTE: Use when ever possible specialized functions like \code{year}. These benefit from a #' specialized implementation. #' +#' @param y Column to compute on +#' @param x date format specification +#' #' @family datetime_funcs #' @rdname date_format #' @name date_format +#' @aliases date_format,Column,character-method #' @export #' @examples \dontrun{date_format(df$t, 'MM/dd/yyy')} #' @note date_format since 1.5.0 @@ -1988,9 +2320,13 @@ setMethod("date_format", signature(y = "Column", x = "character"), #' #' Assumes given timestamp is UTC and converts to given timezone. #' +#' @param y Column to compute on +#' @param x time zone to use +#' #' @family datetime_funcs #' @rdname from_utc_timestamp #' @name from_utc_timestamp +#' @aliases from_utc_timestamp,Column,character-method #' @export #' @examples \dontrun{from_utc_timestamp(df$t, 'PST')} #' @note from_utc_timestamp since 1.5.0 @@ -2011,6 +2347,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' @param y column to check #' @param x substring to check #' @family string_funcs +#' @aliases instr,Column,character-method #' @rdname instr #' @name instr #' @export @@ -2033,9 +2370,13 @@ setMethod("instr", signature(y = "Column", x = "character"), #' Day of the week parameter is case insensitive, and accepts first three or two characters: #' "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". #' +#' @param y Column to compute on +#' @param x Day of the week string +#' #' @family datetime_funcs #' @rdname next_day #' @name next_day +#' @aliases next_day,Column,character-method #' @export #' @examples #'\dontrun{ @@ -2053,9 +2394,13 @@ setMethod("next_day", signature(y = "Column", x = "character"), #' #' Assumes given timestamp is in given timezone and converts to UTC. #' +#' @param y Column to compute on +#' @param x timezone to use +#' #' @family datetime_funcs #' @rdname to_utc_timestamp #' @name to_utc_timestamp +#' @aliases to_utc_timestamp,Column,character-method #' @export #' @examples \dontrun{to_utc_timestamp(df$t, 'PST')} #' @note to_utc_timestamp since 1.5.0 @@ -2069,9 +2414,13 @@ setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), #' #' Returns the date that is numMonths after startDate. #' +#' @param y Column to compute on +#' @param x Number of months to add +#' #' @name add_months #' @family datetime_funcs #' @rdname add_months +#' @aliases add_months,Column,numeric-method #' @export #' @examples \dontrun{add_months(df$d, 1)} #' @note add_months since 1.5.0 @@ -2085,9 +2434,13 @@ setMethod("add_months", signature(y = "Column", x = "numeric"), #' #' Returns the date that is `days` days after `start` #' +#' @param y Column to compute on +#' @param x Number of days to add +#' #' @family datetime_funcs #' @rdname date_add #' @name date_add +#' @aliases date_add,Column,numeric-method #' @export #' @examples \dontrun{date_add(df$d, 1)} #' @note date_add since 1.5.0 @@ -2101,9 +2454,13 @@ setMethod("date_add", signature(y = "Column", x = "numeric"), #' #' Returns the date that is `days` days before `start` #' +#' @param y Column to compute on +#' @param x Number of days to substract +#' #' @family datetime_funcs #' @rdname date_sub #' @name date_sub +#' @aliases date_sub,Column,numeric-method #' @export #' @examples \dontrun{date_sub(df$d, 1)} #' @note date_sub since 1.5.0 @@ -2126,6 +2483,7 @@ setMethod("date_sub", signature(y = "Column", x = "numeric"), #' @family string_funcs #' @rdname format_number #' @name format_number +#' @aliases format_number,Column,numeric-method #' @export #' @examples \dontrun{format_number(df$n, 4)} #' @note format_number since 1.5.0 @@ -2147,6 +2505,7 @@ setMethod("format_number", signature(y = "Column", x = "numeric"), #' @family misc_funcs #' @rdname sha2 #' @name sha2 +#' @aliases sha2,Column,numeric-method #' @export #' @examples \dontrun{sha2(df$c, 256)} #' @note sha2 since 1.5.0 @@ -2161,9 +2520,13 @@ setMethod("sha2", signature(y = "Column", x = "numeric"), #' Shift the given value numBits left. If the given value is a long value, this function #' will return a long value else it will return an integer value. #' +#' @param y column to compute on. +#' @param x number of bits to shift. +#' #' @family math_funcs #' @rdname shiftLeft #' @name shiftLeft +#' @aliases shiftLeft,Column,numeric-method #' @export #' @examples \dontrun{shiftLeft(df$c, 1)} #' @note shiftLeft since 1.5.0 @@ -2180,9 +2543,13 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"), #' Shift the given value numBits right. If the given value is a long value, it will return #' a long value else it will return an integer value. #' +#' @param y column to compute on. +#' @param x number of bits to shift. +#' #' @family math_funcs #' @rdname shiftRight #' @name shiftRight +#' @aliases shiftRight,Column,numeric-method #' @export #' @examples \dontrun{shiftRight(df$c, 1)} #' @note shiftRight since 1.5.0 @@ -2199,9 +2566,13 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), #' Unsigned shift the given value numBits right. If the given value is a long value, #' it will return a long value else it will return an integer value. #' +#' @param y column to compute on. +#' @param x number of bits to shift. +#' #' @family math_funcs #' @rdname shiftRightUnsigned #' @name shiftRightUnsigned +#' @aliases shiftRightUnsigned,Column,numeric-method #' @export #' @examples \dontrun{shiftRightUnsigned(df$c, 1)} #' @note shiftRightUnsigned since 1.5.0 @@ -2218,9 +2589,14 @@ setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), #' Concatenates multiple input string columns together into a single string column, #' using the given separator. #' +#' @param x column to concatenate. +#' @param sep separator to use. +#' @param ... other columns to concatenate. +#' #' @family string_funcs #' @rdname concat_ws #' @name concat_ws +#' @aliases concat_ws,character,Column-method #' @export #' @examples \dontrun{concat_ws('-', df$s, df$d)} #' @note concat_ws since 1.5.0 @@ -2235,8 +2611,13 @@ setMethod("concat_ws", signature(sep = "character", x = "Column"), #' #' Convert a number in a string column from one base to another. #' +#' @param x column to convert. +#' @param fromBase base to convert from. +#' @param toBase base to convert to. +#' #' @family math_funcs #' @rdname conv +#' @aliases conv,Column,numeric,numeric-method #' @name conv #' @export #' @examples \dontrun{conv(df$n, 2, 16)} @@ -2258,6 +2639,7 @@ setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeri #' #' @family normal_funcs #' @rdname expr +#' @aliases expr,character-method #' @name expr #' @export #' @examples \dontrun{expr('length(name)')} @@ -2275,6 +2657,7 @@ setMethod("expr", signature(x = "character"), #' @family string_funcs #' @rdname format_string #' @name format_string +#' @aliases format_string,character,Column-method #' @export #' @examples \dontrun{format_string('%d %s', df$a, df$b)} #' @note format_string since 1.5.0 @@ -2296,6 +2679,7 @@ setMethod("format_string", signature(format = "character", x = "Column"), #' @family datetime_funcs #' @rdname from_unixtime #' @name from_unixtime +#' @aliases from_unixtime,Column-method #' @export #' @examples #'\dontrun{ @@ -2334,6 +2718,7 @@ setMethod("from_unixtime", signature(x = "Column"), #' @family datetime_funcs #' @rdname window #' @name window +#' @aliases window,Column-method #' @export #' @examples #'\dontrun{ @@ -2383,6 +2768,7 @@ setMethod("window", signature(x = "Column"), #' #' @family string_funcs #' @rdname locate +#' @aliases locate,character,Column-method #' @name locate #' @export #' @examples \dontrun{locate('b', df$c, 1)} @@ -2401,6 +2787,7 @@ setMethod("locate", signature(substr = "character", str = "Column"), #' #' @family string_funcs #' @rdname lpad +#' @aliases lpad,Column,numeric,character-method #' @name lpad #' @export #' @examples \dontrun{lpad(df$c, 6, '#')} @@ -2420,6 +2807,7 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), #' @family normal_funcs #' @rdname rand #' @name rand +#' @aliases rand,missing-method #' @export #' @examples \dontrun{rand()} #' @note rand since 1.5.0 @@ -2431,6 +2819,7 @@ setMethod("rand", signature(seed = "missing"), #' @rdname rand #' @name rand +#' @aliases rand,numeric-method #' @export #' @note rand(numeric) since 1.5.0 setMethod("rand", signature(seed = "numeric"), @@ -2446,6 +2835,7 @@ setMethod("rand", signature(seed = "numeric"), #' @family normal_funcs #' @rdname randn #' @name randn +#' @aliases randn,missing-method #' @export #' @examples \dontrun{randn()} #' @note randn since 1.5.0 @@ -2457,6 +2847,7 @@ setMethod("randn", signature(seed = "missing"), #' @rdname randn #' @name randn +#' @aliases randn,numeric-method #' @export #' @note randn(numeric) since 1.5.0 setMethod("randn", signature(seed = "numeric"), @@ -2472,6 +2863,7 @@ setMethod("randn", signature(seed = "numeric"), #' @family string_funcs #' @rdname regexp_extract #' @name regexp_extract +#' @aliases regexp_extract,Column,character,numeric-method #' @export #' @examples \dontrun{regexp_extract(df$c, '(\d+)-(\d+)', 1)} #' @note regexp_extract since 1.5.0 @@ -2491,6 +2883,7 @@ setMethod("regexp_extract", #' @family string_funcs #' @rdname regexp_replace #' @name regexp_replace +#' @aliases regexp_replace,Column,character,character-method #' @export #' @examples \dontrun{regexp_replace(df$c, '(\\d+)', '--')} #' @note regexp_replace since 1.5.0 @@ -2510,6 +2903,7 @@ setMethod("regexp_replace", #' @family string_funcs #' @rdname rpad #' @name rpad +#' @aliases rpad,Column,numeric,character-method #' @export #' @examples \dontrun{rpad(df$c, 6, '#')} #' @note rpad since 1.5.0 @@ -2530,6 +2924,7 @@ setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), #' #' @family string_funcs #' @rdname substring_index +#' @aliases substring_index,Column,character,numeric-method #' @name substring_index #' @export #' @examples @@ -2557,6 +2952,7 @@ setMethod("substring_index", #' @family string_funcs #' @rdname translate #' @name translate +#' @aliases translate,Column,character,character-method #' @export #' @examples \dontrun{translate(df$c, 'rnlt', '123')} #' @note translate since 1.5.0 @@ -2575,6 +2971,7 @@ setMethod("translate", #' @family datetime_funcs #' @rdname unix_timestamp #' @name unix_timestamp +#' @aliases unix_timestamp,missing,missing-method #' @export #' @examples #'\dontrun{ @@ -2591,6 +2988,7 @@ setMethod("unix_timestamp", signature(x = "missing", format = "missing"), #' @rdname unix_timestamp #' @name unix_timestamp +#' @aliases unix_timestamp,Column,missing-method #' @export #' @note unix_timestamp(Column) since 1.5.0 setMethod("unix_timestamp", signature(x = "Column", format = "missing"), @@ -2601,6 +2999,7 @@ setMethod("unix_timestamp", signature(x = "Column", format = "missing"), #' @rdname unix_timestamp #' @name unix_timestamp +#' @aliases unix_timestamp,Column,character-method #' @export #' @note unix_timestamp(Column, character) since 1.5.0 setMethod("unix_timestamp", signature(x = "Column", format = "character"), @@ -2616,6 +3015,7 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), #' @family normal_funcs #' @rdname when #' @name when +#' @aliases when,Column-method #' @seealso \link{ifelse} #' @export #' @examples \dontrun{when(df$age == 2, df$age + 1)} @@ -2636,6 +3036,7 @@ setMethod("when", signature(condition = "Column", value = "ANY"), #' @family normal_funcs #' @rdname ifelse #' @name ifelse +#' @aliases ifelse,Column-method #' @seealso \link{when} #' @export #' @examples \dontrun{ @@ -2671,6 +3072,7 @@ setMethod("ifelse", #' @rdname cume_dist #' @name cume_dist #' @family window_funcs +#' @aliases cume_dist,missing-method #' @export #' @examples \dontrun{cume_dist()} #' @note cume_dist since 1.6.0 @@ -2694,6 +3096,7 @@ setMethod("cume_dist", #' @rdname dense_rank #' @name dense_rank #' @family window_funcs +#' @aliases dense_rank,missing-method #' @export #' @examples \dontrun{dense_rank()} #' @note dense_rank since 1.6.0 @@ -2714,6 +3117,7 @@ setMethod("dense_rank", #' #' @rdname lag #' @name lag +#' @aliases lag,characterOrColumn-method #' @family window_funcs #' @export #' @examples \dontrun{lag(df$c)} @@ -2739,10 +3143,15 @@ setMethod("lag", #' an `offset` of one will return the next row at any given point in the window partition. #' #' This is equivalent to the LEAD function in SQL. +#' +#' @param x Column to compute on +#' @param offset Number of rows to offset +#' @param defaultValue (Optional) default value to use #' #' @rdname lead #' @name lead #' @family window_funcs +#' @aliases lead,characterOrColumn,numeric-method #' @export #' @examples \dontrun{lead(df$c)} #' @note lead since 1.6.0 @@ -2763,13 +3172,16 @@ setMethod("lead", #' ntile #' #' Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window -#' partition. Fow example, if `n` is 4, the first quarter of the rows will get value 1, the second +#' partition. For example, if `n` is 4, the first quarter of the rows will get value 1, the second #' quarter will get 2, the third quarter will get 3, and the last quarter will get 4. #' #' This is equivalent to the NTILE function in SQL. #' +#' @param x Number of ntile groups +#' #' @rdname ntile #' @name ntile +#' @aliases ntile,numeric-method #' @family window_funcs #' @export #' @examples \dontrun{ntile(1)} @@ -2794,6 +3206,7 @@ setMethod("ntile", #' @rdname percent_rank #' @name percent_rank #' @family window_funcs +#' @aliases percent_rank,missing-method #' @export #' @examples \dontrun{percent_rank()} #' @note percent_rank since 1.6.0 @@ -2818,6 +3231,7 @@ setMethod("percent_rank", #' @rdname rank #' @name rank #' @family window_funcs +#' @aliases rank,missing-method #' @export #' @examples \dontrun{rank()} #' @note rank since 1.6.0 @@ -2829,6 +3243,10 @@ setMethod("rank", }) # Expose rank() in the R base package +#' @name rank +#' @rdname rank +#' @aliases rank,ANY-method +#' @export setMethod("rank", signature(x = "ANY"), function(x, ...) { @@ -2843,6 +3261,7 @@ setMethod("rank", #' #' @rdname row_number #' @name row_number +#' @aliases row_number,missing-method #' @family window_funcs #' @export #' @examples \dontrun{row_number()} @@ -2863,6 +3282,7 @@ setMethod("row_number", #' @param x A Column #' @param value A value to be checked if contained in the column #' @rdname array_contains +#' @aliases array_contains,Column-method #' @name array_contains #' @family collection_funcs #' @export @@ -2879,9 +3299,12 @@ setMethod("array_contains", #' #' Creates a new row for each element in the given array or map column. #' +#' @param x Column to compute on +#' #' @rdname explode #' @name explode #' @family collection_funcs +#' @aliases explode,Column-method #' @export #' @examples \dontrun{explode(df$c)} #' @note explode since 1.5.0 @@ -2895,9 +3318,12 @@ setMethod("explode", #' size #' #' Returns length of array or map. +#' +#' @param x Column to compute on #' #' @rdname size #' @name size +#' @aliases size,Column-method #' @family collection_funcs #' @export #' @examples \dontrun{size(df$c)} @@ -2920,6 +3346,7 @@ setMethod("size", #' FALSE, sorting is in descending order. #' @rdname sort_array #' @name sort_array +#' @aliases sort_array,Column-method #' @family collection_funcs #' @export #' @examples @@ -2939,9 +3366,12 @@ setMethod("sort_array", #' #' Creates a new row for each element with position in the given array or map column. #' +#' @param x Column to compute on +#' #' @rdname posexplode #' @name posexplode #' @family collection_funcs +#' @aliases posexplode,Column-method #' @export #' @examples \dontrun{posexplode(df$c)} #' @note posexplode since 2.1.0 diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index df057bda99f8..e7444ac2467d 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -51,7 +51,7 @@ setGeneric("collectPartition", standardGeneric("collectPartition") }) -# @rdname count +# @rdname nrow # @export setGeneric("count", function(x) { standardGeneric("count") }) @@ -395,7 +395,7 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) #################### SparkDataFrame Methods ######################## -#' @rdname agg +#' @rdname summarize #' @export setGeneric("agg", function (x, ...) { standardGeneric("agg") }) @@ -654,7 +654,7 @@ setGeneric("showDF", function(x, ...) { standardGeneric("showDF") }) # @export setGeneric("subset", function(x, ...) { standardGeneric("subset") }) -#' @rdname agg +#' @rdname summarize #' @export setGeneric("summarize", function(x, ...) { standardGeneric("summarize") }) @@ -779,13 +779,13 @@ setGeneric("rowsBetween", function(x, start, end) { standardGeneric("rowsBetween #' @export setGeneric("rangeBetween", function(x, start, end) { standardGeneric("rangeBetween") }) -#' @rdname window.partitionBy +#' @rdname windowPartitionBy #' @export -setGeneric("window.partitionBy", function(col, ...) { standardGeneric("window.partitionBy") }) +setGeneric("windowPartitionBy", function(col, ...) { standardGeneric("windowPartitionBy") }) -#' @rdname window.orderBy +#' @rdname windowOrderBy #' @export -setGeneric("window.orderBy", function(col, ...) { standardGeneric("window.orderBy") }) +setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy") }) ###################### Expression Function Methods ########################## @@ -1022,7 +1022,7 @@ setGeneric("month", function(x) { standardGeneric("month") }) #' @export setGeneric("months_between", function(y, x) { standardGeneric("months_between") }) -#' @rdname count +#' @rdname nrow #' @export setGeneric("n", function(x) { standardGeneric("n") }) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 5ed7e8abb43d..85348ae76baa 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -47,6 +47,7 @@ groupedData <- function(sgd) { #' @rdname show +#' @aliases show,GroupedData-method #' @note show(GroupedData) since 1.4.0 setMethod("show", "GroupedData", function(object) { @@ -61,6 +62,7 @@ setMethod("show", "GroupedData", #' @param x a GroupedData #' @return a SparkDataFrame #' @rdname count +#' @aliases count,GroupedData-method #' @export #' @examples #' \dontrun{ @@ -84,6 +86,7 @@ setMethod("count", #' @param x a GroupedData #' @return a SparkDataFrame #' @rdname summarize +#' @aliases agg,GroupedData-method #' @name agg #' @family agg_funcs #' @export @@ -121,6 +124,7 @@ setMethod("agg", #' @rdname summarize #' @name summarize +#' @aliases summarize,GroupedData-method #' @note summarize since 1.4.0 setMethod("summarize", signature(x = "GroupedData"), @@ -146,6 +150,7 @@ methods <- c("avg", "max", "mean", "min", "sum") #' @param values A value or a list/vector of distinct values for the output columns. #' @return GroupedData object #' @rdname pivot +#' @aliases pivot,GroupedData,character-method #' @name pivot #' @export #' @examples @@ -198,6 +203,7 @@ createMethods() #' #' @param x A GroupedData #' @rdname gapply +#' @aliases gapply,GroupedData-method #' @name gapply #' @export #' @note gapply(GroupedData) since 2.0.0 @@ -212,6 +218,7 @@ setMethod("gapply", #' #' @param x A GroupedData #' @rdname gapplyCollect +#' @aliases gapplyCollect,GroupedData-method #' @name gapplyCollect #' @export #' @note gapplyCollect(GroupedData) since 2.0.0 @@ -243,4 +250,4 @@ gapplyInternal <- function(x, func, schema) { broadcastArr, if (class(schema) == "structType") { schema$jobj } else { NULL }) dataFrame(sdf) -} \ No newline at end of file +} diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 94e1f654f519..50c601fcd9e1 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -91,6 +91,7 @@ NULL #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. #' @param maxIter Integer giving the maximal number of IRLS iterations. +#' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm #' @name spark.glm @@ -306,6 +307,7 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' @param initMode The initialization algorithm choosen to fit the model #' @return \code{spark.kmeans} returns a fitted k-means model #' @rdname spark.kmeans +#' @aliases spark.kmeans,SparkDataFrame,formula-method #' @name spark.kmeans #' @export #' @examples @@ -418,6 +420,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' @param smoothing Smoothing parameter #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model #' @rdname spark.naiveBayes +#' @aliases spark.naiveBayes,SparkDataFrame,formula-method #' @name spark.naiveBayes #' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/} #' @export @@ -512,7 +515,6 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat #' which means throw exception if the output path exists. #' #' @rdname spark.kmeans -#' @name write.ml #' @export #' @note write.ml(KMeansModel, character) since 2.0.0 setMethod("write.ml", signature(object = "KMeansModel", path = "character"), diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index a91e9980df93..b429f5de13b8 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -26,6 +26,7 @@ #' @param x a structField object (created with the field() function) #' @param ... additional structField objects #' @return a structType object +#' @rdname structType #' @export #' @examples #'\dontrun{ @@ -40,13 +41,19 @@ structType <- function(x, ...) { UseMethod("structType", x) } -structType.jobj <- function(x) { +#' @rdname structType +#' @method structType jobj +#' @export +structType.jobj <- function(x, ...) { obj <- structure(list(), class = "structType") obj$jobj <- x obj$fields <- function() { lapply(callJMethod(obj$jobj, "fields"), structField) } obj } +#' @rdname structType +#' @method structType structField +#' @export structType.structField <- function(x, ...) { fields <- list(x, ...) if (!all(sapply(fields, inherits, "structField"))) { @@ -104,7 +111,10 @@ structField <- function(x, ...) { UseMethod("structField", x) } -structField.jobj <- function(x) { +#' @rdname structField +#' @method structField jobj +#' @export +structField.jobj <- function(x, ...) { obj <- structure(list(), class = "structField") obj$jobj <- x obj$name <- function() { callJMethod(x, "name") } @@ -179,7 +189,7 @@ checkType <- function(type) { #' @param nullable A logical vector indicating whether or not the field is nullable #' @rdname structField #' @export -structField.character <- function(x, type, nullable = TRUE) { +structField.character <- function(x, type, nullable = TRUE, ...) { if (class(x) != "character") { stop("Field name must be a string.") } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 62659b0c0ce5..524f7c4a26b6 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -28,14 +28,6 @@ connExists <- function(env) { }) } -#' @rdname sparkR.session.stop -#' @name sparkR.stop -#' @export -#' @note sparkR.stop since 1.4.0 -sparkR.stop <- function() { - sparkR.session.stop() -} - #' Stop the Spark Session and Spark Context #' #' Stop the Spark Session and Spark Context. @@ -90,6 +82,14 @@ sparkR.session.stop <- function() { clearJobjs() } +#' @rdname sparkR.session.stop +#' @name sparkR.stop +#' @export +#' @note sparkR.stop since 1.4.0 +sparkR.stop <- function() { + sparkR.session.stop() +} + #' (Deprecated) Initialize a new Spark Context #' #' This function initializes a new SparkContext. @@ -155,6 +155,10 @@ sparkR.sparkContext <- function( existingPort <- Sys.getenv("EXISTING_SPARKR_BACKEND_PORT", "") if (existingPort != "") { + if (length(packages) != 0) { + warning(paste("sparkPackages has no effect when using spark-submit or sparkR shell", + " please use the --packages commandline instead", sep = ",")) + } backendPort <- existingPort } else { path <- tempfile(pattern = "backend_port") diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index c92352e1b063..2b4ce195cbdd 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -34,6 +34,7 @@ setOldClass("jobj") #' #' @rdname crosstab #' @name crosstab +#' @aliases crosstab,SparkDataFrame,character,character-method #' @family stat functions #' @export #' @examples @@ -59,6 +60,7 @@ setMethod("crosstab", #' #' @rdname cov #' @name cov +#' @aliases cov,SparkDataFrame-method #' @family stat functions #' @export #' @examples @@ -88,6 +90,7 @@ setMethod("cov", #' #' @rdname corr #' @name corr +#' @aliases corr,SparkDataFrame-method #' @family stat functions #' @export #' @examples @@ -120,6 +123,7 @@ setMethod("corr", #' #' @rdname freqItems #' @name freqItems +#' @aliases freqItems,SparkDataFrame,character-method #' @family stat functions #' @export #' @examples @@ -158,6 +162,7 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' #' @rdname approxQuantile #' @name approxQuantile +#' @aliases approxQuantile,SparkDataFrame,character,numeric,numeric-method #' @family stat functions #' @export #' @examples @@ -188,6 +193,7 @@ setMethod("approxQuantile", #' @return A new SparkDataFrame that represents the stratified sample #' #' @rdname sampleBy +#' @aliases sampleBy,SparkDataFrame,character,list,numeric-method #' @name sampleBy #' @family stat functions #' @export diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index e75bfbf037fb..240b9f669bdd 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -126,20 +126,16 @@ hashCode <- function(key) { as.integer(bitwXor(intBits[2], intBits[1])) } else if (class(key) == "character") { # TODO: SPARK-7839 means we might not have the native library available - if (is.loaded("stringHashCode")) { - .Call("stringHashCode", key) + n <- nchar(key) + if (n == 0) { + 0L } else { - n <- nchar(key) - if (n == 0) { - 0L - } else { - asciiVals <- sapply(charToRaw(key), function(x) { strtoi(x, 16L) }) - hashC <- 0 - for (k in 1:length(asciiVals)) { - hashC <- mult31AndAdd(hashC, asciiVals[k]) - } - as.integer(hashC) + asciiVals <- sapply(charToRaw(key), function(x) { strtoi(x, 16L) }) + hashC <- 0 + for (k in 1:length(asciiVals)) { + hashC <- mult31AndAdd(hashC, asciiVals[k]) } + as.integer(hashC) } } else { warning(paste("Could not hash object, returning 0", sep = "")) diff --git a/R/pkg/R/window.R b/R/pkg/R/window.R index e4bc933b9aab..215d0e7b5cfb 100644 --- a/R/pkg/R/window.R +++ b/R/pkg/R/window.R @@ -17,23 +17,29 @@ # window.R - Utility functions for defining window in DataFrames -#' window.partitionBy +#' windowPartitionBy #' #' Creates a WindowSpec with the partitioning defined. #' -#' @rdname window.partitionBy -#' @name window.partitionBy +#' @param col A column name or Column by which rows are partitioned to +#' windows. +#' @param ... Optional column names or Columns in addition to col, by +#' which rows are partitioned to windows. +#' +#' @rdname windowPartitionBy +#' @name windowPartitionBy +#' @aliases windowPartitionBy,character-method #' @export #' @examples #' \dontrun{ -#' ws <- window.partitionBy("key1", "key2") +#' ws <- windowPartitionBy("key1", "key2") #' df1 <- select(df, over(lead("value", 1), ws)) #' -#' ws <- window.partitionBy(df$key1, df$key2) +#' ws <- windowPartitionBy(df$key1, df$key2) #' df1 <- select(df, over(lead("value", 1), ws)) #' } -#' @note window.partitionBy(character) since 2.0.0 -setMethod("window.partitionBy", +#' @note windowPartitionBy(character) since 2.0.0 +setMethod("windowPartitionBy", signature(col = "character"), function(col, ...) { windowSpec( @@ -43,11 +49,12 @@ setMethod("window.partitionBy", list(...))) }) -#' @rdname window.partitionBy -#' @name window.partitionBy +#' @rdname windowPartitionBy +#' @name windowPartitionBy +#' @aliases windowPartitionBy,Column-method #' @export -#' @note window.partitionBy(Column) since 2.0.0 -setMethod("window.partitionBy", +#' @note windowPartitionBy(Column) since 2.0.0 +setMethod("windowPartitionBy", signature(col = "Column"), function(col, ...) { jcols <- lapply(list(col, ...), function(c) { @@ -59,23 +66,29 @@ setMethod("window.partitionBy", jcols)) }) -#' window.orderBy +#' windowOrderBy #' #' Creates a WindowSpec with the ordering defined. #' -#' @rdname window.orderBy -#' @name window.orderBy +#' @param col A column name or Column by which rows are ordered within +#' windows. +#' @param ... Optional column names or Columns in addition to col, by +#' which rows are ordered within windows. +#' +#' @rdname windowOrderBy +#' @name windowOrderBy +#' @aliases windowOrderBy,character-method #' @export #' @examples #' \dontrun{ -#' ws <- window.orderBy("key1", "key2") +#' ws <- windowOrderBy("key1", "key2") #' df1 <- select(df, over(lead("value", 1), ws)) #' -#' ws <- window.orderBy(df$key1, df$key2) +#' ws <- windowOrderBy(df$key1, df$key2) #' df1 <- select(df, over(lead("value", 1), ws)) #' } -#' @note window.orderBy(character) since 2.0.0 -setMethod("window.orderBy", +#' @note windowOrderBy(character) since 2.0.0 +setMethod("windowOrderBy", signature(col = "character"), function(col, ...) { windowSpec( @@ -85,11 +98,12 @@ setMethod("window.orderBy", list(...))) }) -#' @rdname window.orderBy -#' @name window.orderBy +#' @rdname windowOrderBy +#' @name windowOrderBy +#' @aliases windowOrderBy,Column-method #' @export -#' @note window.orderBy(Column) since 2.0.0 -setMethod("window.orderBy", +#' @note windowOrderBy(Column) since 2.0.0 +setMethod("windowOrderBy", signature(col = "Column"), function(col, ...) { jcols <- lapply(list(col, ...), function(c) { diff --git a/R/pkg/inst/test_support/sparktestjar_2.10-1.0.jar b/R/pkg/inst/test_support/sparktestjar_2.10-1.0.jar deleted file mode 100644 index 1d5c2af631aa..000000000000 Binary files a/R/pkg/inst/test_support/sparktestjar_2.10-1.0.jar and /dev/null differ diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R index 84e4845f180b..c9615c8d4faf 100644 --- a/R/pkg/inst/tests/testthat/jarTest.R +++ b/R/pkg/inst/tests/testthat/jarTest.R @@ -16,17 +16,17 @@ # library(SparkR) -sparkSession <- sparkR.session() +sc <- sparkR.session() -helloTest <- SparkR:::callJStatic("sparkR.test.hello", +helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass", "helloWorld", "Dave") +stopifnot(identical(helloTest, "Hello Dave")) -basicFunction <- SparkR:::callJStatic("sparkR.test.basicFunction", +basicFunction <- SparkR:::callJStatic("sparkrtest.DummyClass", "addStuff", 2L, 2L) +stopifnot(basicFunction == 4L) sparkR.session.stop() -output <- c(helloTest, basicFunction) -writeLines(output) diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R index 940c91f376cd..4bc935c79eb0 100644 --- a/R/pkg/inst/tests/testthat/packageInAJarTest.R +++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R @@ -17,7 +17,7 @@ library(SparkR) library(sparkPackageTest) -sparkSession <- sparkR.session() +sparkR.session() run1 <- myfunc(5L) diff --git a/R/pkg/inst/tests/testthat/test_Serde.R b/R/pkg/inst/tests/testthat/test_Serde.R index 96fb6dda2645..b5f6f1b54fa8 100644 --- a/R/pkg/inst/tests/testthat/test_Serde.R +++ b/R/pkg/inst/tests/testthat/test_Serde.R @@ -17,7 +17,7 @@ context("SerDe functionality") -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) test_that("SerDe of primitive types", { x <- callJStatic("SparkRHandler", "echo", 1L) @@ -75,3 +75,5 @@ test_that("SerDe of list of lists", { y <- callJStatic("SparkRHandler", "echo", x) expect_equal(x, y) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R index b69f017de81d..56ac8eb72801 100644 --- a/R/pkg/inst/tests/testthat/test_binaryFile.R +++ b/R/pkg/inst/tests/testthat/test_binaryFile.R @@ -18,7 +18,7 @@ context("functions on binary files") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) mockFile <- c("Spark is pretty.", "Spark is awesome.") @@ -88,3 +88,5 @@ test_that("saveAsObjectFile()/objectFile() works with multiple paths", { unlink(fileName1, recursive = TRUE) unlink(fileName2, recursive = TRUE) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_binary_function.R b/R/pkg/inst/tests/testthat/test_binary_function.R index 6f51d2068727..ae7abe20ccbe 100644 --- a/R/pkg/inst/tests/testthat/test_binary_function.R +++ b/R/pkg/inst/tests/testthat/test_binary_function.R @@ -18,7 +18,7 @@ context("binary functions") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) # Data @@ -100,3 +100,5 @@ test_that("zipPartitions() on RDDs", { unlink(fileName) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_broadcast.R b/R/pkg/inst/tests/testthat/test_broadcast.R index cf1d43277105..c7fefb5cf9cb 100644 --- a/R/pkg/inst/tests/testthat/test_broadcast.R +++ b/R/pkg/inst/tests/testthat/test_broadcast.R @@ -18,7 +18,7 @@ context("broadcast variables") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) # Partitioned data @@ -47,3 +47,5 @@ test_that("without using broadcast variable", { expected <- list(sum(randomMat) * 1, sum(randomMat) * 2) expect_equal(actual, expected) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index 2a1bd61b1111..8bd134a58d68 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -63,18 +63,14 @@ test_that("repeatedly starting and stopping SparkR", { } }) -# Does not work consistently even with Hive off -# nolint start -# test_that("repeatedly starting and stopping SparkR", { -# for (i in 1:4) { -# sparkR.session(enableHiveSupport = FALSE) -# df <- createDataFrame(data.frame(dummy=1:i)) -# expect_equal(count(df), i) -# sparkR.session.stop() -# Sys.sleep(5) # Need more time to shutdown Hive metastore -# } -# }) -# nolint end +test_that("repeatedly starting and stopping SparkSession", { + for (i in 1:4) { + sparkR.session(enableHiveSupport = FALSE) + df <- createDataFrame(data.frame(dummy = 1:i)) + expect_equal(count(df), i) + sparkR.session.stop() + } +}) test_that("rdd GC across sparkR.stop", { sc <- sparkR.sparkContext() # sc should get id 0 @@ -96,6 +92,7 @@ test_that("rdd GC across sparkR.stop", { count(rdd3) count(rdd4) + sparkR.session.stop() }) test_that("job group functions can be called", { @@ -164,7 +161,7 @@ test_that("sparkJars sparkPackages as comma-separated strings", { }) test_that("spark.lapply should perform simple transforms", { - sc <- sparkR.sparkContext() + sparkR.sparkContext() doubled <- spark.lapply(1:10, function(x) { 2 * x }) expect_equal(doubled, as.list(2 * 1:10)) sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_includeJAR.R b/R/pkg/inst/tests/testthat/test_includeJAR.R deleted file mode 100644 index 512dd39cb29f..000000000000 --- a/R/pkg/inst/tests/testthat/test_includeJAR.R +++ /dev/null @@ -1,36 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -context("include an external JAR in SparkContext") - -runScript <- function() { - sparkHome <- Sys.getenv("SPARK_HOME") - sparkTestJarPath <- "R/lib/SparkR/test_support/sparktestjar_2.10-1.0.jar" - jarPath <- paste("--jars", shQuote(file.path(sparkHome, sparkTestJarPath))) - scriptPath <- file.path(sparkHome, "R/lib/SparkR/tests/testthat/jarTest.R") - submitPath <- file.path(sparkHome, paste("bin/", determineSparkSubmitBin(), sep = "")) - combinedArgs <- paste(jarPath, scriptPath, sep = " ") - res <- launchScript(submitPath, combinedArgs, capture = TRUE) - tail(res, 2) -} - -test_that("sparkJars tag in SparkContext", { - testOutput <- runScript() - helloTest <- testOutput[1] - expect_equal(helloTest, "Hello, Dave") - basicFunction <- testOutput[2] - expect_equal(basicFunction, "4") -}) diff --git a/R/pkg/inst/tests/testthat/test_includePackage.R b/R/pkg/inst/tests/testthat/test_includePackage.R index d6a3766539c0..ca2b90057278 100644 --- a/R/pkg/inst/tests/testthat/test_includePackage.R +++ b/R/pkg/inst/tests/testthat/test_includePackage.R @@ -18,7 +18,7 @@ context("include R packages") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) # Partitioned data @@ -56,3 +56,5 @@ test_that("use include package", { actual <- collect(data) } }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 753da8176097..ab390a86d1cc 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -20,7 +20,7 @@ library(testthat) context("MLlib functions") # Tests for MLlib functions in SparkR -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) test_that("formula of spark.glm", { training <- suppressWarnings(createDataFrame(iris)) @@ -453,3 +453,5 @@ test_that("spark.survreg", { expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4) } }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_parallelize_collect.R b/R/pkg/inst/tests/testthat/test_parallelize_collect.R index f79a8a70aafb..959d7ab9e644 100644 --- a/R/pkg/inst/tests/testthat/test_parallelize_collect.R +++ b/R/pkg/inst/tests/testthat/test_parallelize_collect.R @@ -33,7 +33,7 @@ numPairs <- list(list(1, 1), list(1, 2), list(2, 2), list(2, 3)) strPairs <- list(list(strList, strList), list(strList, strList)) # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) jsc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) # Tests @@ -108,3 +108,5 @@ test_that("parallelize() and collect() work for lists of pairs (pairwise data)", expect_equal(collect(strPairsRDDD1), strPairs) expect_equal(collect(strPairsRDDD2), strPairs) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R index 429311d2924f..508a3a7dfd5f 100644 --- a/R/pkg/inst/tests/testthat/test_rdd.R +++ b/R/pkg/inst/tests/testthat/test_rdd.R @@ -18,7 +18,7 @@ context("basic RDD functions") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) # Data @@ -800,3 +800,5 @@ test_that("Test correct concurrency of RRDD.compute()", { count <- callJMethod(zrdd, "count") expect_equal(count, 1000) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_shuffle.R b/R/pkg/inst/tests/testthat/test_shuffle.R index 7d4f34201644..2586056773f1 100644 --- a/R/pkg/inst/tests/testthat/test_shuffle.R +++ b/R/pkg/inst/tests/testthat/test_shuffle.R @@ -18,7 +18,7 @@ context("partitionBy, groupByKey, reduceByKey etc.") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) # Data @@ -220,3 +220,5 @@ test_that("test partitionBy with string keys", { expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first)) expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second)) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index fdd6020db9d0..3f3cb766b38f 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -237,7 +237,7 @@ test_that("read csv as DataFrame", { "Empty,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.string = "Empty") + df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty") expect_equal(count(df2), 4) withoutna2 <- na.omit(df2, how = "any", cols = "year") expect_equal(count(withoutna2), 3) @@ -2376,7 +2376,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { test_that("Window functions on a DataFrame", { df <- createDataFrame(list(list(1L, "1"), list(2L, "2"), list(1L, "1"), list(2L, "2")), schema = c("key", "value")) - ws <- orderBy(window.partitionBy("key"), "value") + ws <- orderBy(windowPartitionBy("key"), "value") result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws))) names(result) <- c("key", "value") expected <- data.frame(key = c(1L, NA, 2L, NA), @@ -2384,17 +2384,17 @@ test_that("Window functions on a DataFrame", { stringsAsFactors = FALSE) expect_equal(result, expected) - ws <- orderBy(window.partitionBy(df$key), df$value) + ws <- orderBy(windowPartitionBy(df$key), df$value) result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws))) names(result) <- c("key", "value") expect_equal(result, expected) - ws <- partitionBy(window.orderBy("value"), "key") + ws <- partitionBy(windowOrderBy("value"), "key") result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws))) names(result) <- c("key", "value") expect_equal(result, expected) - ws <- partitionBy(window.orderBy(df$value), df$key) + ws <- partitionBy(windowOrderBy(df$value), df$key) result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws))) names(result) <- c("key", "value") expect_equal(result, expected) @@ -2405,7 +2405,8 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { a <- 1:3 b <- c("a", "b", "c") ldf <- data.frame(a, b) - df <- suppressWarnings(createDataFrame(sqlContext, ldf)) + # Call function with namespace :: operator - SPARK-16538 + df <- suppressWarnings(SparkR::createDataFrame(sqlContext, ldf)) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) expect_equal(count(df), 3) @@ -2423,6 +2424,13 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { before <- suppressWarnings(createDataFrame(sqlContext, iris)) after <- suppressWarnings(createDataFrame(iris)) expect_equal(collect(before), collect(after)) + + # more tests for SPARK-16538 + createOrReplaceTempView(df, "table") + SparkR::tables() + SparkR::sql("SELECT 1") + suppressWarnings(SparkR::sql(sqlContext, "SELECT * FROM table")) + suppressWarnings(SparkR::dropTempTable(sqlContext, "table")) }) test_that("randomSplit", { @@ -2481,3 +2489,5 @@ unlink(parquetPath) unlink(orcPath) unlink(jsonPath) unlink(jsonPathNa) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_take.R b/R/pkg/inst/tests/testthat/test_take.R index daf5e41abe13..07f00c9915d1 100644 --- a/R/pkg/inst/tests/testthat/test_take.R +++ b/R/pkg/inst/tests/testthat/test_take.R @@ -30,7 +30,7 @@ strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ", "raising me. But they're both dead now. I didn't kill them. Honest.") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) test_that("take() gives back the original elements in correct count and order", { @@ -65,3 +65,5 @@ test_that("take() gives back the original elements in correct count and order", expect_equal(length(take(numListRDD, 0)), 0) expect_equal(length(take(numVectorRDD, 0)), 0) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_textFile.R b/R/pkg/inst/tests/testthat/test_textFile.R index 7b2cc74753fe..b7dcbe472ac2 100644 --- a/R/pkg/inst/tests/testthat/test_textFile.R +++ b/R/pkg/inst/tests/testthat/test_textFile.R @@ -18,7 +18,7 @@ context("the textFile() function") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) mockFile <- c("Spark is pretty.", "Spark is awesome.") @@ -160,3 +160,5 @@ test_that("Pipelined operations on RDDs created using textFile", { unlink(fileName) }) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R index 21a119a06b93..58ff3debfa70 100644 --- a/R/pkg/inst/tests/testthat/test_utils.R +++ b/R/pkg/inst/tests/testthat/test_utils.R @@ -18,7 +18,7 @@ context("functions in utils.R") # JavaSparkContext handle -sparkSession <- sparkR.session() +sparkSession <- sparkR.session(enableHiveSupport = FALSE) sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) test_that("convertJListToRList() gives back (deserializes) the original JLists @@ -182,3 +182,5 @@ test_that("overrideEnvs", { expect_equal(config[["param_only"]], "blah") expect_equal(config[["config_only"]], "ok") }) + +sparkR.session.stop() diff --git a/assembly/pom.xml b/assembly/pom.xml index 1b25b7c6afb0..971a62f87a21 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-assembly_2.11 Spark Project Assembly http://spark.apache.org/ diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index b1a37e847216..81f0c6e2265a 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-network-common_2.11 jar Spark Project Networking diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 51c06b9e5a07..d211bd5bd194 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-network-shuffle_2.11 jar Spark Project Shuffle Streaming Service diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java index 1cc0fb65d726..1270cef621b7 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java @@ -113,7 +113,8 @@ protected void handleMessage( } } else if (msgObj instanceof RegisterExecutor) { - final Timer.Context responseDelayContext = metrics.registerExecutorRequestLatencyMillis.time(); + final Timer.Context responseDelayContext = + metrics.registerExecutorRequestLatencyMillis.time(); try { RegisterExecutor msg = (RegisterExecutor) msgObj; checkAuth(client, msg.appId); diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index d43eb71aa5f6..606ad1573961 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-network-yarn_2.11 jar Spark Project YARN Shuffle Service diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index 9807383ec3bc..22e47ac1b792 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -68,7 +68,12 @@ public class YarnShuffleService extends AuxiliaryService { private static final String SPARK_AUTHENTICATE_KEY = "spark.authenticate"; private static final boolean DEFAULT_SPARK_AUTHENTICATE = false; - private static final String RECOVERY_FILE_NAME = "registeredExecutor.ldb"; + private static final String RECOVERY_FILE_NAME = "registeredExecutors.ldb"; + + // Whether failure during service initialization should stop the NM. + @VisibleForTesting + static final String STOP_ON_FAILURE_KEY = "spark.yarn.shuffle.stopOnFailure"; + private static final boolean DEFAULT_STOP_ON_FAILURE = false; // An entity that manages the shuffle secret per application // This is used only if authentication is enabled @@ -119,44 +124,50 @@ private boolean isAuthenticationEnabled() { * Start the shuffle server with the given configuration. */ @Override - protected void serviceInit(Configuration conf) { + protected void serviceInit(Configuration conf) throws Exception { _conf = conf; - // In case this NM was killed while there were running spark applications, we need to restore - // lost state for the existing executors. We look for an existing file in the NM's local dirs. - // If we don't find one, then we choose a file to use to save the state next time. Even if - // an application was stopped while the NM was down, we expect yarn to call stopApplication() - // when it comes back - registeredExecutorFile = - new File(getRecoveryPath().toUri().getPath(), RECOVERY_FILE_NAME); - - TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); - // If authentication is enabled, set up the shuffle server to use a - // special RPC handler that filters out unauthenticated fetch requests - boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE); + boolean stopOnFailure = conf.getBoolean(STOP_ON_FAILURE_KEY, DEFAULT_STOP_ON_FAILURE); + try { + // In case this NM was killed while there were running spark applications, we need to restore + // lost state for the existing executors. We look for an existing file in the NM's local dirs. + // If we don't find one, then we choose a file to use to save the state next time. Even if + // an application was stopped while the NM was down, we expect yarn to call stopApplication() + // when it comes back + registeredExecutorFile = + new File(getRecoveryPath().toUri().getPath(), RECOVERY_FILE_NAME); + + TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); blockHandler = new ExternalShuffleBlockHandler(transportConf, registeredExecutorFile); - } catch (Exception e) { - logger.error("Failed to initialize external shuffle service", e); - } - List bootstraps = Lists.newArrayList(); - if (authEnabled) { - secretManager = new ShuffleSecretManager(); - bootstraps.add(new SaslServerBootstrap(transportConf, secretManager)); - } + // If authentication is enabled, set up the shuffle server to use a + // special RPC handler that filters out unauthenticated fetch requests + boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE); + List bootstraps = Lists.newArrayList(); + if (authEnabled) { + secretManager = new ShuffleSecretManager(); + bootstraps.add(new SaslServerBootstrap(transportConf, secretManager)); + } - int port = conf.getInt( - SPARK_SHUFFLE_SERVICE_PORT_KEY, DEFAULT_SPARK_SHUFFLE_SERVICE_PORT); - TransportContext transportContext = new TransportContext(transportConf, blockHandler); - shuffleServer = transportContext.createServer(port, bootstraps); - // the port should normally be fixed, but for tests its useful to find an open port - port = shuffleServer.getPort(); - boundPort = port; - String authEnabledString = authEnabled ? "enabled" : "not enabled"; - logger.info("Started YARN shuffle service for Spark on port {}. " + - "Authentication is {}. Registered executor file is {}", port, authEnabledString, - registeredExecutorFile); + int port = conf.getInt( + SPARK_SHUFFLE_SERVICE_PORT_KEY, DEFAULT_SPARK_SHUFFLE_SERVICE_PORT); + TransportContext transportContext = new TransportContext(transportConf, blockHandler); + shuffleServer = transportContext.createServer(port, bootstraps); + // the port should normally be fixed, but for tests its useful to find an open port + port = shuffleServer.getPort(); + boundPort = port; + String authEnabledString = authEnabled ? "enabled" : "not enabled"; + logger.info("Started YARN shuffle service for Spark on port {}. " + + "Authentication is {}. Registered executor file is {}", port, authEnabledString, + registeredExecutorFile); + } catch (Exception e) { + if (stopOnFailure) { + throw e; + } else { + noteFailure(e); + } + } } @Override diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 6f2fdd538f48..626f023a5b99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-sketch_2.11 jar Spark Project Sketch diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 842a1c80e0b8..1c60d510e570 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-tags_2.11 jar Spark Project Tags diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 08af461f1964..45af98d94ef9 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-unsafe_2.11 jar Spark Project Unsafe diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index 3cd4264680bf..355748238540 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -24,7 +24,6 @@ import java.util.Map; import org.apache.spark.unsafe.Platform; -import org.apache.spark.unsafe.memory.MemoryAllocator; /** * A simple {@link MemoryAllocator} that can allocate up to 16GB using a JVM long primitive array. diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java index 8bd2b06db8b8..7b588681d979 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java @@ -23,12 +23,12 @@ public interface MemoryAllocator { * Whether to fill newly allocated and deallocated memory with 0xa5 and 0x5a bytes respectively. * This helps catch misuse of uninitialized or freed memory, but imposes some overhead. */ - public static final boolean MEMORY_DEBUG_FILL_ENABLED = Boolean.parseBoolean( + boolean MEMORY_DEBUG_FILL_ENABLED = Boolean.parseBoolean( System.getProperty("spark.memory.debugFill", "false")); // Same as jemalloc's debug fill values. - public static final byte MEMORY_DEBUG_FILL_CLEAN_VALUE = (byte)0xa5; - public static final byte MEMORY_DEBUG_FILL_FREED_VALUE = (byte)0x5a; + byte MEMORY_DEBUG_FILL_CLEAN_VALUE = (byte)0xa5; + byte MEMORY_DEBUG_FILL_FREED_VALUE = (byte)0x5a; /** * Allocates a contiguous block of memory. Note that the allocated memory is not guaranteed diff --git a/core/pom.xml b/core/pom.xml index 1b880d883d15..04b94a258c71 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-core_2.11 core diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index 0e9defe5b4a5..83dc61c5e5e5 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -88,6 +88,7 @@ final class BypassMergeSortShuffleWriter extends ShuffleWriter { /** Array of file writers, one for each partition */ private DiskBlockObjectWriter[] partitionWriters; + private FileSegment[] partitionWriterSegments; @Nullable private MapStatus mapStatus; private long[] partitionLengths; @@ -131,6 +132,7 @@ public void write(Iterator> records) throws IOException { final SerializerInstance serInstance = serializer.newInstance(); final long openStartTime = System.nanoTime(); partitionWriters = new DiskBlockObjectWriter[numPartitions]; + partitionWriterSegments = new FileSegment[numPartitions]; for (int i = 0; i < numPartitions; i++) { final Tuple2 tempShuffleBlockIdPlusFile = blockManager.diskBlockManager().createTempShuffleBlock(); @@ -150,8 +152,10 @@ public void write(Iterator> records) throws IOException { partitionWriters[partitioner.getPartition(key)].write(key, record._2()); } - for (DiskBlockObjectWriter writer : partitionWriters) { - writer.commitAndClose(); + for (int i = 0; i < numPartitions; i++) { + final DiskBlockObjectWriter writer = partitionWriters[i]; + partitionWriterSegments[i] = writer.commitAndGet(); + writer.close(); } File output = shuffleBlockResolver.getDataFile(shuffleId, mapId); @@ -184,7 +188,7 @@ private long[] writePartitionedFile(File outputFile) throws IOException { boolean threwException = true; try { for (int i = 0; i < numPartitions; i++) { - final File file = partitionWriters[i].fileSegment().file(); + final File file = partitionWriterSegments[i].file(); if (file.exists()) { final FileInputStream in = new FileInputStream(file); boolean copyThrewException = true; diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index cf38a04ed7cf..cfec724fe9e8 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -37,6 +37,7 @@ import org.apache.spark.serializer.SerializerInstance; import org.apache.spark.storage.BlockManager; import org.apache.spark.storage.DiskBlockObjectWriter; +import org.apache.spark.storage.FileSegment; import org.apache.spark.storage.TempShuffleBlockId; import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.LongArray; @@ -150,10 +151,6 @@ private void writeSortedFile(boolean isLastFile) throws IOException { final ShuffleInMemorySorter.ShuffleSorterIterator sortedRecords = inMemSorter.getSortedIterator(); - // Currently, we need to open a new DiskBlockObjectWriter for each partition; we can avoid this - // after SPARK-5581 is fixed. - DiskBlockObjectWriter writer; - // Small writes to DiskBlockObjectWriter will be fairly inefficient. Since there doesn't seem to // be an API to directly transfer bytes from managed memory to the disk writer, we buffer // data through a byte array. This array does not need to be large enough to hold a single @@ -175,7 +172,8 @@ private void writeSortedFile(boolean isLastFile) throws IOException { // around this, we pass a dummy no-op serializer. final SerializerInstance ser = DummySerializerInstance.INSTANCE; - writer = blockManager.getDiskWriter(blockId, file, ser, fileBufferSizeBytes, writeMetricsToUse); + final DiskBlockObjectWriter writer = + blockManager.getDiskWriter(blockId, file, ser, fileBufferSizeBytes, writeMetricsToUse); int currentPartition = -1; while (sortedRecords.hasNext()) { @@ -185,12 +183,10 @@ private void writeSortedFile(boolean isLastFile) throws IOException { if (partition != currentPartition) { // Switch to the new partition if (currentPartition != -1) { - writer.commitAndClose(); - spillInfo.partitionLengths[currentPartition] = writer.fileSegment().length(); + final FileSegment fileSegment = writer.commitAndGet(); + spillInfo.partitionLengths[currentPartition] = fileSegment.length(); } currentPartition = partition; - writer = - blockManager.getDiskWriter(blockId, file, ser, fileBufferSizeBytes, writeMetricsToUse); } final long recordPointer = sortedRecords.packedRecordPointer.getRecordPointer(); @@ -209,15 +205,14 @@ private void writeSortedFile(boolean isLastFile) throws IOException { writer.recordWritten(); } - if (writer != null) { - writer.commitAndClose(); - // If `writeSortedFile()` was called from `closeAndGetSpills()` and no records were inserted, - // then the file might be empty. Note that it might be better to avoid calling - // writeSortedFile() in that case. - if (currentPartition != -1) { - spillInfo.partitionLengths[currentPartition] = writer.fileSegment().length(); - spills.add(spillInfo); - } + final FileSegment committedSegment = writer.commitAndGet(); + writer.close(); + // If `writeSortedFile()` was called from `closeAndGetSpills()` and no records were inserted, + // then the file might be empty. Note that it might be better to avoid calling + // writeSortedFile() in that case. + if (currentPartition != -1) { + spillInfo.partitionLengths[currentPartition] = committedSegment.length(); + spills.add(spillInfo); } if (!isLastFile) { // i.e. this is a spill file diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java index 9ba760e8422f..164b9d70b79d 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java @@ -136,7 +136,8 @@ public void write( } public void close() throws IOException { - writer.commitAndClose(); + writer.commitAndGet(); + writer.close(); writer = null; writeBuffer = null; } diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html new file mode 100644 index 000000000000..64ea719141f4 --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html @@ -0,0 +1,105 @@ + + + diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js new file mode 100644 index 000000000000..b2b2363d3ac6 --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -0,0 +1,470 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +function formatStatus(status, type) { + if (type !== 'display') return status; + if (status) { + return "Active" + } else { + return "Dead" + } +} + +jQuery.extend(jQuery.fn.dataTableExt.oSort, { + "title-numeric-pre": function (a) { + var x = a.match(/title="*(-?[0-9\.]+)/)[1]; + return parseFloat(x); + }, + + "title-numeric-asc": function (a, b) { + return ((a < b) ? -1 : ((a > b) ? 1 : 0)); + }, + + "title-numeric-desc": function (a, b) { + return ((a < b) ? 1 : ((a > b) ? -1 : 0)); + } +}); + +$(document).ajaxStop($.unblockUI); +$(document).ajaxStart(function () { + $.blockUI({message: '

Loading Executors Page...

'}); +}); + +function createTemplateURI(appId) { + var words = document.baseURI.split('/'); + var ind = words.indexOf("proxy"); + if (ind > 0) { + var baseURI = words.slice(0, ind + 1).join('/') + '/' + appId + '/static/executorspage-template.html'; + return baseURI; + } + ind = words.indexOf("history"); + if(ind > 0) { + var baseURI = words.slice(0, ind).join('/') + '/static/executorspage-template.html'; + return baseURI; + } + return location.origin + "/static/executorspage-template.html"; +} + +function getStandAloneppId(cb) { + var words = document.baseURI.split('/'); + var ind = words.indexOf("proxy"); + if (ind > 0) { + var appId = words[ind + 1]; + cb(appId); + return; + } + ind = words.indexOf("history"); + if (ind > 0) { + var appId = words[ind + 1]; + cb(appId); + return; + } + //Looks like Web UI is running in standalone mode + //Let's get application-id using REST End Point + $.getJSON(location.origin + "/api/v1/applications", function(response, status, jqXHR) { + if (response && response.length > 0) { + var appId = response[0].id + cb(appId); + return; + } + }); +} + +function createRESTEndPoint(appId) { + var words = document.baseURI.split('/'); + var ind = words.indexOf("proxy"); + if (ind > 0) { + var appId = words[ind + 1]; + var newBaseURI = words.slice(0, ind + 2).join('/'); + return newBaseURI + "/api/v1/applications/" + appId + "/allexecutors" + } + ind = words.indexOf("history"); + if (ind > 0) { + var appId = words[ind + 1]; + var attemptId = words[ind + 2]; + var newBaseURI = words.slice(0, ind).join('/'); + if (isNaN(attemptId)) { + return newBaseURI + "/api/v1/applications/" + appId + "/allexecutors"; + } else { + return newBaseURI + "/api/v1/applications/" + appId + "/" + attemptId + "/allexecutors"; + } + } + return location.origin + "/api/v1/applications/" + appId + "/allexecutors"; +} + +function formatLogsCells(execLogs, type) { + if (type !== 'display') return Object.keys(execLogs); + if (!execLogs) return; + var result = ''; + $.each(execLogs, function (logName, logUrl) { + result += '' + }); + return result; +} + +// Determine Color Opacity from 0.5-1 +// activeTasks range from 0 to maxTasks +function activeTasksAlpha(activeTasks, maxTasks) { + return maxTasks > 0 ? ((activeTasks / maxTasks) * 0.5 + 0.5) : 1; +} + +function activeTasksStyle(activeTasks, maxTasks) { + return activeTasks > 0 ? ("hsla(240, 100%, 50%, " + activeTasksAlpha(activeTasks, maxTasks) + ")") : ""; +} + +// failedTasks range max at 10% failure, alpha max = 1 +function failedTasksAlpha(failedTasks, totalTasks) { + return totalTasks > 0 ? + (Math.min(10 * failedTasks / totalTasks, 1) * 0.5 + 0.5) : 1; +} + +function failedTasksStyle(failedTasks, totalTasks) { + return failedTasks > 0 ? + ("hsla(0, 100%, 50%, " + failedTasksAlpha(failedTasks, totalTasks) + ")") : ""; +} + +// totalDuration range from 0 to 50% GC time, alpha max = 1 +function totalDurationAlpha(totalGCTime, totalDuration) { + return totalDuration > 0 ? + (Math.min(totalGCTime / totalDuration + 0.5, 1)) : 1; +} + +function totalDurationStyle(totalGCTime, totalDuration) { + // Red if GC time over GCTimePercent of total time + // When GCTimePercent is edited change ToolTips.TASK_TIME to match + var GCTimePercent = 0.1; + return (totalGCTime > GCTimePercent * totalDuration) ? + ("hsla(0, 100%, 50%, " + totalDurationAlpha(totalGCTime, totalDuration) + ")") : ""; +} + +function totalDurationColor(totalGCTime, totalDuration) { + // Red if GC time over GCTimePercent of total time + // When GCTimePercent is edited change ToolTips.TASK_TIME to match + var GCTimePercent = 0.1; + return (totalGCTime > GCTimePercent * totalDuration) ? "white" : "black"; +} + +$(document).ready(function () { + $.extend($.fn.dataTable.defaults, { + stateSave: true, + lengthMenu: [[20, 40, 60, 100, -1], [20, 40, 60, 100, "All"]], + pageLength: 20 + }); + + executorsSummary = $("#active-executors"); + + getStandAloneppId(function (appId) { + + var endPoint = createRESTEndPoint(appId); + $.getJSON(endPoint, function (response, status, jqXHR) { + var summary = []; + var allExecCnt = 0; + var allRDDBlocks = 0; + var allMemoryUsed = 0; + var allMaxMemory = 0; + var allDiskUsed = 0; + var allTotalCores = 0; + var allMaxTasks = 0; + var allActiveTasks = 0; + var allFailedTasks = 0; + var allCompletedTasks = 0; + var allTotalTasks = 0; + var allTotalDuration = 0; + var allTotalGCTime = 0; + var allTotalInputBytes = 0; + var allTotalShuffleRead = 0; + var allTotalShuffleWrite = 0; + + var activeExecCnt = 0; + var activeRDDBlocks = 0; + var activeMemoryUsed = 0; + var activeMaxMemory = 0; + var activeDiskUsed = 0; + var activeTotalCores = 0; + var activeMaxTasks = 0; + var activeActiveTasks = 0; + var activeFailedTasks = 0; + var activeCompletedTasks = 0; + var activeTotalTasks = 0; + var activeTotalDuration = 0; + var activeTotalGCTime = 0; + var activeTotalInputBytes = 0; + var activeTotalShuffleRead = 0; + var activeTotalShuffleWrite = 0; + + var deadExecCnt = 0; + var deadRDDBlocks = 0; + var deadMemoryUsed = 0; + var deadMaxMemory = 0; + var deadDiskUsed = 0; + var deadTotalCores = 0; + var deadMaxTasks = 0; + var deadActiveTasks = 0; + var deadFailedTasks = 0; + var deadCompletedTasks = 0; + var deadTotalTasks = 0; + var deadTotalDuration = 0; + var deadTotalGCTime = 0; + var deadTotalInputBytes = 0; + var deadTotalShuffleRead = 0; + var deadTotalShuffleWrite = 0; + + response.forEach(function (exec) { + allExecCnt += 1; + allRDDBlocks += exec.rddBlocks; + allMemoryUsed += exec.memoryUsed; + allMaxMemory += exec.maxMemory; + allDiskUsed += exec.diskUsed; + allTotalCores += exec.totalCores; + allMaxTasks += exec.maxTasks; + allActiveTasks += exec.activeTasks; + allFailedTasks += exec.failedTasks; + allCompletedTasks += exec.completedTasks; + allTotalTasks += exec.totalTasks; + allTotalDuration += exec.totalDuration; + allTotalGCTime += exec.totalGCTime; + allTotalInputBytes += exec.totalInputBytes; + allTotalShuffleRead += exec.totalShuffleRead; + allTotalShuffleWrite += exec.totalShuffleWrite; + if (exec.isActive) { + activeExecCnt += 1; + activeRDDBlocks += exec.rddBlocks; + activeMemoryUsed += exec.memoryUsed; + activeMaxMemory += exec.maxMemory; + activeDiskUsed += exec.diskUsed; + activeTotalCores += exec.totalCores; + activeMaxTasks += exec.maxTasks; + activeActiveTasks += exec.activeTasks; + activeFailedTasks += exec.failedTasks; + activeCompletedTasks += exec.completedTasks; + activeTotalTasks += exec.totalTasks; + activeTotalDuration += exec.totalDuration; + activeTotalGCTime += exec.totalGCTime; + activeTotalInputBytes += exec.totalInputBytes; + activeTotalShuffleRead += exec.totalShuffleRead; + activeTotalShuffleWrite += exec.totalShuffleWrite; + } else { + deadExecCnt += 1; + deadRDDBlocks += exec.rddBlocks; + deadMemoryUsed += exec.memoryUsed; + deadMaxMemory += exec.maxMemory; + deadDiskUsed += exec.diskUsed; + deadTotalCores += exec.totalCores; + deadMaxTasks += exec.maxTasks; + deadActiveTasks += exec.activeTasks; + deadFailedTasks += exec.failedTasks; + deadCompletedTasks += exec.completedTasks; + deadTotalTasks += exec.totalTasks; + deadTotalDuration += exec.totalDuration; + deadTotalGCTime += exec.totalGCTime; + deadTotalInputBytes += exec.totalInputBytes; + deadTotalShuffleRead += exec.totalShuffleRead; + deadTotalShuffleWrite += exec.totalShuffleWrite; + } + }); + + var totalSummary = { + "execCnt": ( "Total(" + allExecCnt + ")"), + "allRDDBlocks": allRDDBlocks, + "allMemoryUsed": allMemoryUsed, + "allMaxMemory": allMaxMemory, + "allDiskUsed": allDiskUsed, + "allTotalCores": allTotalCores, + "allMaxTasks": allMaxTasks, + "allActiveTasks": allActiveTasks, + "allFailedTasks": allFailedTasks, + "allCompletedTasks": allCompletedTasks, + "allTotalTasks": allTotalTasks, + "allTotalDuration": allTotalDuration, + "allTotalGCTime": allTotalGCTime, + "allTotalInputBytes": allTotalInputBytes, + "allTotalShuffleRead": allTotalShuffleRead, + "allTotalShuffleWrite": allTotalShuffleWrite + }; + var activeSummary = { + "execCnt": ( "Active(" + activeExecCnt + ")"), + "allRDDBlocks": activeRDDBlocks, + "allMemoryUsed": activeMemoryUsed, + "allMaxMemory": activeMaxMemory, + "allDiskUsed": activeDiskUsed, + "allTotalCores": activeTotalCores, + "allMaxTasks": activeMaxTasks, + "allActiveTasks": activeActiveTasks, + "allFailedTasks": activeFailedTasks, + "allCompletedTasks": activeCompletedTasks, + "allTotalTasks": activeTotalTasks, + "allTotalDuration": activeTotalDuration, + "allTotalGCTime": activeTotalGCTime, + "allTotalInputBytes": activeTotalInputBytes, + "allTotalShuffleRead": activeTotalShuffleRead, + "allTotalShuffleWrite": activeTotalShuffleWrite + }; + var deadSummary = { + "execCnt": ( "Dead(" + deadExecCnt + ")" ), + "allRDDBlocks": deadRDDBlocks, + "allMemoryUsed": deadMemoryUsed, + "allMaxMemory": deadMaxMemory, + "allDiskUsed": deadDiskUsed, + "allTotalCores": deadTotalCores, + "allMaxTasks": deadMaxTasks, + "allActiveTasks": deadActiveTasks, + "allFailedTasks": deadFailedTasks, + "allCompletedTasks": deadCompletedTasks, + "allTotalTasks": deadTotalTasks, + "allTotalDuration": deadTotalDuration, + "allTotalGCTime": deadTotalGCTime, + "allTotalInputBytes": deadTotalInputBytes, + "allTotalShuffleRead": deadTotalShuffleRead, + "allTotalShuffleWrite": deadTotalShuffleWrite + }; + + var data = {executors: response, "execSummary": [activeSummary, deadSummary, totalSummary]}; + $.get(createTemplateURI(appId), function (template) { + + executorsSummary.append(Mustache.render($(template).filter("#executors-summary-template").html(), data)); + var selector = "#active-executors-table"; + var conf = { + "data": response, + "columns": [ + { + data: function (row, type) { + return type !== 'display' ? (isNaN(row.id) ? 0 : row.id ) : row.id; + } + }, + {data: 'hostPort'}, + {data: 'isActive', render: formatStatus}, + {data: 'rddBlocks'}, + { + data: function (row, type) { + return type === 'display' ? (formatBytes(row.memoryUsed, type) + ' / ' + formatBytes(row.maxMemory, type)) : row.memoryUsed; + } + }, + {data: 'diskUsed', render: formatBytes}, + {data: 'totalCores'}, + { + data: 'activeTasks', + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + if (sData > 0) { + $(nTd).css('color', 'white'); + $(nTd).css('background', activeTasksStyle(oData.activeTasks, oData.maxTasks)); + } + } + }, + { + data: 'failedTasks', + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + if (sData > 0) { + $(nTd).css('color', 'white'); + $(nTd).css('background', failedTasksStyle(oData.failedTasks, oData.totalTasks)); + } + } + }, + {data: 'completedTasks'}, + {data: 'totalTasks'}, + { + data: function (row, type) { + return type === 'display' ? (formatDuration(row.totalDuration) + ' (' + formatDuration(row.totalGCTime) + ')') : row.totalDuration + }, + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + if (oData.totalDuration > 0) { + $(nTd).css('color', totalDurationColor(oData.totalGCTime, oData.totalDuration)); + $(nTd).css('background', totalDurationStyle(oData.totalGCTime, oData.totalDuration)); + } + } + }, + {data: 'totalInputBytes', render: formatBytes}, + {data: 'totalShuffleRead', render: formatBytes}, + {data: 'totalShuffleWrite', render: formatBytes}, + {data: 'executorLogs', render: formatLogsCells}, + { + data: 'id', render: function (data, type) { + return type === 'display' ? ("Thread Dump" ) : data; + } + } + ], + "order": [[0, "asc"]] + }; + + $(selector).DataTable(conf); + $('#active-executors [data-toggle="tooltip"]').tooltip(); + + var sumSelector = "#summary-execs-table"; + var sumConf = { + "data": [activeSummary, deadSummary, totalSummary], + "columns": [ + { + data: 'execCnt', + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + $(nTd).css('font-weight', 'bold'); + } + }, + {data: 'allRDDBlocks'}, + { + data: function (row, type) { + return type === 'display' ? (formatBytes(row.allMemoryUsed, type) + ' / ' + formatBytes(row.allMaxMemory, type)) : row.allMemoryUsed; + } + }, + {data: 'allDiskUsed', render: formatBytes}, + {data: 'allTotalCores'}, + { + data: 'allActiveTasks', + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + if (sData > 0) { + $(nTd).css('color', 'white'); + $(nTd).css('background', activeTasksStyle(oData.allActiveTasks, oData.allMaxTasks)); + } + } + }, + { + data: 'allFailedTasks', + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + if (sData > 0) { + $(nTd).css('color', 'white'); + $(nTd).css('background', failedTasksStyle(oData.allFailedTasks, oData.allTotalTasks)); + } + } + }, + {data: 'allCompletedTasks'}, + {data: 'allTotalTasks'}, + { + data: function (row, type) { + return type === 'display' ? (formatDuration(row.allTotalDuration, type) + ' (' + formatDuration(row.allTotalGCTime, type) + ')') : row.allTotalDuration + }, + "fnCreatedCell": function (nTd, sData, oData, iRow, iCol) { + if (oData.allTotalDuration > 0) { + $(nTd).css('color', totalDurationColor(oData.allTotalGCTime, oData.allTotalDuration)); + $(nTd).css('background', totalDurationStyle(oData.allTotalGCTime, oData.allTotalDuration)); + } + } + }, + {data: 'allTotalInputBytes', render: formatBytes}, + {data: 'allTotalShuffleRead', render: formatBytes}, + {data: 'allTotalShuffleWrite', render: formatBytes} + ], + "paging": false, + "searching": false, + "info": false + + }; + + $(sumSelector).DataTable(sumConf); + $('#execSummary [data-toggle="tooltip"]').tooltip(); + + }); + }); + }); +}); diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index d2161662d567..5b9afb59ef8e 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -15,28 +15,6 @@ * limitations under the License. */ -// this function works exactly the same as UIUtils.formatDuration -function formatDuration(milliseconds) { - if (milliseconds < 100) { - return milliseconds + " ms"; - } - var seconds = milliseconds * 1.0 / 1000; - if (seconds < 1) { - return seconds.toFixed(1) + " s"; - } - if (seconds < 60) { - return seconds.toFixed(0) + " s"; - } - var minutes = seconds / 60; - if (minutes < 10) { - return minutes.toFixed(1) + " min"; - } else if (minutes < 60) { - return minutes.toFixed(0) + " min"; - } - var hours = minutes / 60; - return hours.toFixed(1) + " h"; -} - function makeIdNumeric(id) { var strs = id.split("_"); if (strs.length < 3) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js new file mode 100644 index 000000000000..edc0ee2ce181 --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// this function works exactly the same as UIUtils.formatDuration +function formatDuration(milliseconds) { + if (milliseconds < 100) { + return milliseconds + " ms"; + } + var seconds = milliseconds * 1.0 / 1000; + if (seconds < 1) { + return seconds.toFixed(1) + " s"; + } + if (seconds < 60) { + return seconds.toFixed(0) + " s"; + } + var minutes = seconds / 60; + if (minutes < 10) { + return minutes.toFixed(1) + " min"; + } else if (minutes < 60) { + return minutes.toFixed(0) + " min"; + } + var hours = minutes / 60; + return hours.toFixed(1) + " h"; +} + +function formatBytes(bytes, type) { + if (type !== 'display') return bytes; + if (bytes == 0) return '0.0 B'; + var k = 1000; + var dm = 1; + var sizes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']; + var i = Math.floor(Math.log(bytes) / Math.log(k)); + return parseFloat((bytes / Math.pow(k, i)).toFixed(dm)) + ' ' + sizes[i]; +} diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 6bd950205fad..486d535da0bc 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -46,6 +46,8 @@ private[spark] class MapOutputTrackerMasterEndpoint( override val rpcEnv: RpcEnv, tracker: MapOutputTrackerMaster, conf: SparkConf) extends RpcEndpoint with Logging { + logDebug("init") // force eager creation of logger + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GetMapOutputStatuses(shuffleId: Int) => val hostPort = context.senderAddress.hostPort diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 33ed0d5493e0..f6af9ccc41b3 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -248,7 +248,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { * - This will throw an exception is the config is not optional and the value is not set. */ private[spark] def get[T](entry: ConfigEntry[T]): T = { - entry.readFrom(this) + entry.readFrom(settings, getenv) } /** diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 57d1f09f6b15..6d7f05d21710 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -556,6 +556,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Make sure the context is stopped if the user forgets about it. This avoids leaving // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM // is killed, though. + logDebug("Adding shutdown hook") // force eager creation of logger _shutdownHookRef = ShutdownHookManager.addShutdownHook( ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () => logInfo("Invoking stop() from shutdown hook") diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index 16157414fd12..77825e75e513 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -37,6 +37,15 @@ private[spark] object RUtils { ) } + /** + * Check if SparkR is installed before running tests that use SparkR. + */ + def isSparkRInstalled: Boolean = { + localSparkRPackagePath.filter { pkgDir => + new File(Seq(pkgDir, "SparkR").mkString(File.separator)).exists + }.isDefined + } + /** * Get the list of paths for R packages in various deployment modes, of which the first * path is for the SparkR package itself. The second path is for R packages built as diff --git a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala index 37a19a495bee..13eadbe44f61 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala @@ -128,6 +128,7 @@ object ExternalShuffleService extends Logging { server = newShuffleService(sparkConf, securityManager) server.start() + logDebug("Adding shutdown hook") // force eager creation of logger ShutdownHookManager.addShutdownHook { () => logInfo("Shutting down shuffle service.") server.stop() diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 2fad1120cdc8..74f78021ed6c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -43,8 +43,9 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { if (allAppsSize > 0) { ++ - ++ - + ++ + ++ + } else if (requestedIncomplete) {

No incomplete applications found!

} else { diff --git a/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala b/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala index a057977eb0dd..73b6ca384438 100644 --- a/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala +++ b/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala @@ -104,6 +104,7 @@ private[mesos] object MesosClusterDispatcher extends Logging { } val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf) dispatcher.start() + logDebug("Adding shutdown hook") // force eager creation of logger ShutdownHookManager.addShutdownHook { () => logInfo("Shutdown hook is shutting down dispatcher") dispatcher.stop() diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index ccc6c36e9c79..e30839c49c04 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -23,6 +23,7 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable import scala.util.{Failure, Success} +import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.TaskState.TaskState @@ -64,8 +65,7 @@ private[spark] class CoarseGrainedExecutorBackend( case Success(msg) => // Always receive `true`. Just ignore it case Failure(e) => - logError(s"Cannot register with driver: $driverUrl", e) - exitExecutor(1) + exitExecutor(1, s"Cannot register with driver: $driverUrl", e) }(ThreadUtils.sameThread) } @@ -78,16 +78,19 @@ private[spark] class CoarseGrainedExecutorBackend( override def receive: PartialFunction[Any, Unit] = { case RegisteredExecutor => logInfo("Successfully registered with driver") - executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false) + try { + executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false) + } catch { + case NonFatal(e) => + exitExecutor(1, "Unable to create executor due to " + e.getMessage, e) + } case RegisterExecutorFailed(message) => - logError("Slave registration failed: " + message) - exitExecutor(1) + exitExecutor(1, "Slave registration failed: " + message) case LaunchTask(data) => if (executor == null) { - logError("Received LaunchTask command but executor was null") - exitExecutor(1) + exitExecutor(1, "Received LaunchTask command but executor was null") } else { val taskDesc = ser.deserialize[TaskDescription](data.value) logInfo("Got assigned task " + taskDesc.taskId) @@ -97,8 +100,7 @@ private[spark] class CoarseGrainedExecutorBackend( case KillTask(taskId, _, interruptThread) => if (executor == null) { - logError("Received KillTask command but executor was null") - exitExecutor(1) + exitExecutor(1, "Received KillTask command but executor was null") } else { executor.killTask(taskId, interruptThread) } @@ -127,8 +129,7 @@ private[spark] class CoarseGrainedExecutorBackend( if (stopping.get()) { logInfo(s"Driver from $remoteAddress disconnected during shutdown") } else if (driver.exists(_.address == remoteAddress)) { - logError(s"Driver $remoteAddress disassociated! Shutting down.") - exitExecutor(1) + exitExecutor(1, s"Driver $remoteAddress disassociated! Shutting down.") } else { logWarning(s"An unknown ($remoteAddress) driver disconnected.") } @@ -147,7 +148,14 @@ private[spark] class CoarseGrainedExecutorBackend( * executor exits differently. For e.g. when an executor goes down, * back-end may not want to take the parent process down. */ - protected def exitExecutor(code: Int): Unit = System.exit(code) + protected def exitExecutor(code: Int, reason: String, throwable: Throwable = null) = { + if (throwable != null) { + logError(reason, throwable) + } else { + logError(reason) + } + System.exit(code) + } } private[spark] object CoarseGrainedExecutorBackend extends Logging { diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index 5d50e3851a9f..0f5c8a9e02ab 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -116,11 +116,17 @@ private[spark] class TypedConfigBuilder[T]( /** Creates a [[ConfigEntry]] that has a default value. */ def createWithDefault(default: T): ConfigEntry[T] = { - val transformedDefault = converter(stringConverter(default)) - val entry = new ConfigEntryWithDefault[T](parent.key, transformedDefault, converter, - stringConverter, parent._doc, parent._public) - parent._onCreate.foreach(_(entry)) - entry + // Treat "String" as a special case, so that both createWithDefault and createWithDefaultString + // behave the same w.r.t. variable expansion of default values. + if (default.isInstanceOf[String]) { + createWithDefaultString(default.asInstanceOf[String]) + } else { + val transformedDefault = converter(stringConverter(default)) + val entry = new ConfigEntryWithDefault[T](parent.key, transformedDefault, converter, + stringConverter, parent._doc, parent._public) + parent._onCreate.foreach(_(entry)) + entry + } } /** @@ -128,8 +134,7 @@ private[spark] class TypedConfigBuilder[T]( * [[String]] and must be a valid value for the entry. */ def createWithDefaultString(default: String): ConfigEntry[T] = { - val typedDefault = converter(default) - val entry = new ConfigEntryWithDefault[T](parent.key, typedDefault, converter, stringConverter, + val entry = new ConfigEntryWithDefaultString[T](parent.key, default, converter, stringConverter, parent._doc, parent._public) parent._onCreate.foreach(_(entry)) entry diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala index f7296b487c0e..e2e23b3c3c32 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala @@ -17,11 +17,35 @@ package org.apache.spark.internal.config +import java.util.{Map => JMap} + +import scala.util.matching.Regex + import org.apache.spark.SparkConf /** * An entry contains all meta information for a configuration. * + * Config options created using this feature support variable expansion. If the config value + * contains variable references of the form "${prefix:variableName}", the reference will be replaced + * with the value of the variable depending on the prefix. The prefix can be one of: + * + * - no prefix: if the config key starts with "spark", looks for the value in the Spark config + * - system: looks for the value in the system properties + * - env: looks for the value in the environment + * + * So referencing "${spark.master}" will look for the value of "spark.master" in the Spark + * configuration, while referencing "${env:MASTER}" will read the value from the "MASTER" + * environment variable. + * + * For known Spark configuration keys (i.e. those created using `ConfigBuilder`), references + * will also consider the default value when it exists. + * + * If the reference cannot be resolved, the original string will be retained. + * + * Variable expansion is also applied to the default values of config entries that have a default + * value declared as a string. + * * @param key the key for the configuration * @param defaultValue the default value for the configuration * @param valueConverter how to convert a string to the value. It should throw an exception if the @@ -42,17 +66,27 @@ private[spark] abstract class ConfigEntry[T] ( val doc: String, val isPublic: Boolean) { + import ConfigEntry._ + + registerEntry(this) + def defaultValueString: String - def readFrom(conf: SparkConf): T + def readFrom(conf: JMap[String, String], getenv: String => String): T - // This is used by SQLConf, since it doesn't use SparkConf to store settings and thus cannot - // use readFrom(). def defaultValue: Option[T] = None override def toString: String = { s"ConfigEntry(key=$key, defaultValue=$defaultValueString, doc=$doc, public=$isPublic)" } + + protected def readAndExpand( + conf: JMap[String, String], + getenv: String => String, + usedRefs: Set[String] = Set()): Option[String] = { + Option(conf.get(key)).map(expand(_, conf, getenv, usedRefs)) + } + } private class ConfigEntryWithDefault[T] ( @@ -68,12 +102,36 @@ private class ConfigEntryWithDefault[T] ( override def defaultValueString: String = stringConverter(_defaultValue) - override def readFrom(conf: SparkConf): T = { - conf.getOption(key).map(valueConverter).getOrElse(_defaultValue) + def readFrom(conf: JMap[String, String], getenv: String => String): T = { + readAndExpand(conf, getenv).map(valueConverter).getOrElse(_defaultValue) } } +private class ConfigEntryWithDefaultString[T] ( + key: String, + _defaultValue: String, + valueConverter: String => T, + stringConverter: T => String, + doc: String, + isPublic: Boolean) + extends ConfigEntry(key, valueConverter, stringConverter, doc, isPublic) { + + override def defaultValue: Option[T] = Some(valueConverter(_defaultValue)) + + override def defaultValueString: String = _defaultValue + + def readFrom(conf: JMap[String, String], getenv: String => String): T = { + Option(conf.get(key)) + .orElse(Some(_defaultValue)) + .map(ConfigEntry.expand(_, conf, getenv, Set())) + .map(valueConverter) + .get + } + +} + + /** * A config entry that does not have a default value. */ @@ -88,7 +146,9 @@ private[spark] class OptionalConfigEntry[T]( override def defaultValueString: String = "" - override def readFrom(conf: SparkConf): Option[T] = conf.getOption(key).map(rawValueConverter) + override def readFrom(conf: JMap[String, String], getenv: String => String): Option[T] = { + readAndExpand(conf, getenv).map(rawValueConverter) + } } @@ -99,13 +159,67 @@ private class FallbackConfigEntry[T] ( key: String, doc: String, isPublic: Boolean, - private val fallback: ConfigEntry[T]) + private[config] val fallback: ConfigEntry[T]) extends ConfigEntry[T](key, fallback.valueConverter, fallback.stringConverter, doc, isPublic) { override def defaultValueString: String = s"" - override def readFrom(conf: SparkConf): T = { - conf.getOption(key).map(valueConverter).getOrElse(fallback.readFrom(conf)) + override def readFrom(conf: JMap[String, String], getenv: String => String): T = { + Option(conf.get(key)).map(valueConverter).getOrElse(fallback.readFrom(conf, getenv)) + } + +} + +private object ConfigEntry { + + private val knownConfigs = new java.util.concurrent.ConcurrentHashMap[String, ConfigEntry[_]]() + + private val REF_RE = "\\$\\{(?:(\\w+?):)?(\\S+?)\\}".r + + def registerEntry(entry: ConfigEntry[_]): Unit = { + val existing = knownConfigs.putIfAbsent(entry.key, entry) + require(existing == null, s"Config entry ${entry.key} already registered!") + } + + def findEntry(key: String): ConfigEntry[_] = knownConfigs.get(key) + + /** + * Expand the `value` according to the rules explained in ConfigEntry. + */ + def expand( + value: String, + conf: JMap[String, String], + getenv: String => String, + usedRefs: Set[String]): String = { + REF_RE.replaceAllIn(value, { m => + val prefix = m.group(1) + val name = m.group(2) + val replacement = prefix match { + case null => + require(!usedRefs.contains(name), s"Circular reference in $value: $name") + if (name.startsWith("spark.")) { + Option(findEntry(name)) + .flatMap(_.readAndExpand(conf, getenv, usedRefs = usedRefs + name)) + .orElse(Option(conf.get(name))) + .orElse(defaultValueString(name)) + } else { + None + } + case "system" => sys.props.get(name) + case "env" => Option(getenv(name)) + case _ => None + } + Regex.quoteReplacement(replacement.getOrElse(m.matched)) + }) + } + + private def defaultValueString(key: String): Option[String] = { + findEntry(key) match { + case e: ConfigEntryWithDefault[_] => Some(e.defaultValueString) + case e: ConfigEntryWithDefaultString[_] => Some(e.defaultValueString) + case e: FallbackConfigEntry[_] => defaultValueString(e.fallback.key) + case _ => None + } } } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index b7a5b222087e..a4905dd51b94 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -699,18 +699,28 @@ abstract class RDD[T: ClassTag]( * Return an RDD created by piping elements to a forked external process. */ def pipe(command: String): RDD[String] = withScope { - pipe(command) + // Similar to Runtime.exec(), if we are given a single string, split it into words + // using a standard StringTokenizer (i.e. by spaces) + pipe(PipedRDD.tokenize(command)) } /** * Return an RDD created by piping elements to a forked external process. */ def pipe(command: String, env: Map[String, String]): RDD[String] = withScope { - pipe(command, env) + // Similar to Runtime.exec(), if we are given a single string, split it into words + // using a standard StringTokenizer (i.e. by spaces) + pipe(PipedRDD.tokenize(command), env) } /** - * Return an RDD created by piping elements to a forked external process. + * Return an RDD created by piping elements to a forked external process. The resulting RDD + * is computed by executing the given process once per partition. All elements + * of each input partition are written to a process's stdin as lines of input separated + * by a newline. The resulting partition consists of the process's stdout output, with + * each line of stdout resulting in one element of the output partition. A process is invoked + * even for empty partitions. + * * The print behavior can be customized by providing two functions. * * @param command command to run in forked process. diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index 2dd453cd6397..7bed6851d0cd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -184,6 +184,8 @@ private[spark] object OutputCommitCoordinator { override val rpcEnv: RpcEnv, outputCommitCoordinator: OutputCommitCoordinator) extends RpcEndpoint with Logging { + logDebug("init") // force eager creation of logger + override def receive: PartialFunction[Any, Unit] = { case StopCoordinator => logInfo("OutputCommitCoordinator stopped!") diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 73bd4c58e16f..39b0f4d0e2c0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -353,38 +353,60 @@ private[spark] class MesosClusterScheduler( } } - private def buildDriverCommand(desc: MesosDriverDescription): CommandInfo = { - val appJar = CommandInfo.URI.newBuilder() - .setValue(desc.jarUrl.stripPrefix("file:").stripPrefix("local:")).build() - val builder = CommandInfo.newBuilder().addUris(appJar) - val entries = conf.getOption("spark.executor.extraLibraryPath") - .map(path => Seq(path) ++ desc.command.libraryPathEntries) - .getOrElse(desc.command.libraryPathEntries) - - val prefixEnv = if (!entries.isEmpty) { - Utils.libraryPathEnvPrefix(entries) - } else { - "" + private def getDriverExecutorURI(desc: MesosDriverDescription) = { + desc.schedulerProperties.get("spark.executor.uri") + .orElse(desc.command.environment.get("SPARK_EXECUTOR_URI")) + } + + private def getDriverEnvironment(desc: MesosDriverDescription): Environment = { + val env = { + val executorOpts = desc.schedulerProperties.map { case (k, v) => s"-D$k=$v" }.mkString(" ") + val executorEnv = Map("SPARK_EXECUTOR_OPTS" -> executorOpts) + + val prefix = "spark.mesos.driverEnv." + val driverEnv = desc.schedulerProperties.filterKeys(_.startsWith(prefix)) + .map { case (k, v) => (k.substring(prefix.length), v) } + + driverEnv ++ executorEnv ++ desc.command.environment } + val envBuilder = Environment.newBuilder() - desc.command.environment.foreach { case (k, v) => - envBuilder.addVariables(Variable.newBuilder().setName(k).setValue(v).build()) + env.foreach { case (k, v) => + envBuilder.addVariables(Variable.newBuilder().setName(k).setValue(v)) } - // Pass all spark properties to executor. - val executorOpts = desc.schedulerProperties.map { case (k, v) => s"-D$k=$v" }.mkString(" ") - envBuilder.addVariables( - Variable.newBuilder().setName("SPARK_EXECUTOR_OPTS").setValue(executorOpts)) + envBuilder.build() + } + + private def getDriverUris(desc: MesosDriverDescription): List[CommandInfo.URI] = { + val confUris = List(conf.getOption("spark.mesos.uris"), + desc.schedulerProperties.get("spark.mesos.uris"), + desc.schedulerProperties.get("spark.submit.pyFiles")).flatMap( + _.map(_.split(",").map(_.trim)) + ).flatten + + val jarUrl = desc.jarUrl.stripPrefix("file:").stripPrefix("local:") + + ((jarUrl :: confUris) ++ getDriverExecutorURI(desc).toList).map(uri => + CommandInfo.URI.newBuilder().setValue(uri.trim()).build()) + } + + private def getDriverCommandValue(desc: MesosDriverDescription): String = { val dockerDefined = desc.schedulerProperties.contains("spark.mesos.executor.docker.image") - val executorUri = desc.schedulerProperties.get("spark.executor.uri") - .orElse(desc.command.environment.get("SPARK_EXECUTOR_URI")) + val executorUri = getDriverExecutorURI(desc) // Gets the path to run spark-submit, and the path to the Mesos sandbox. val (executable, sandboxPath) = if (dockerDefined) { // Application jar is automatically downloaded in the mounted sandbox by Mesos, // and the path to the mounted volume is stored in $MESOS_SANDBOX env variable. ("./bin/spark-submit", "$MESOS_SANDBOX") } else if (executorUri.isDefined) { - builder.addUris(CommandInfo.URI.newBuilder().setValue(executorUri.get).build()) val folderBasename = executorUri.get.split('/').last.split('.').head + + val entries = conf.getOption("spark.executor.extraLibraryPath") + .map(path => Seq(path) ++ desc.command.libraryPathEntries) + .getOrElse(desc.command.libraryPathEntries) + + val prefixEnv = if (!entries.isEmpty) Utils.libraryPathEnvPrefix(entries) else "" + val cmdExecutable = s"cd $folderBasename*; $prefixEnv bin/spark-submit" // Sandbox path points to the parent folder as we chdir into the folderBasename. (cmdExecutable, "..") @@ -399,20 +421,18 @@ private[spark] class MesosClusterScheduler( // Sandbox points to the current directory by default with Mesos. (cmdExecutable, ".") } - val primaryResource = new File(sandboxPath, desc.jarUrl.split("/").last).toString() val cmdOptions = generateCmdOption(desc, sandboxPath).mkString(" ") + val primaryResource = new File(sandboxPath, desc.jarUrl.split("/").last).toString() val appArguments = desc.command.arguments.mkString(" ") - builder.setValue(s"$executable $cmdOptions $primaryResource $appArguments") - builder.setEnvironment(envBuilder.build()) - conf.getOption("spark.mesos.uris").map { uris => - setupUris(uris, builder) - } - desc.schedulerProperties.get("spark.mesos.uris").map { uris => - setupUris(uris, builder) - } - desc.schedulerProperties.get("spark.submit.pyFiles").map { pyFiles => - setupUris(pyFiles, builder) - } + + s"$executable $cmdOptions $primaryResource $appArguments" + } + + private def buildDriverCommand(desc: MesosDriverDescription): CommandInfo = { + val builder = CommandInfo.newBuilder() + builder.setValue(getDriverCommandValue(desc)) + builder.setEnvironment(getDriverEnvironment(desc)) + builder.addAllUris(getDriverUris(desc).asJava) builder.build() } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllExecutorListResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllExecutorListResource.scala new file mode 100644 index 000000000000..01f2a18122e6 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllExecutorListResource.scala @@ -0,0 +1,41 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.{GET, Produces} +import javax.ws.rs.core.MediaType + +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.exec.ExecutorsPage + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class AllExecutorListResource(ui: SparkUI) { + + @GET + def executorList(): Seq[ExecutorSummary] = { + val listener = ui.executorsListener + listener.synchronized { + // The follow codes should be protected by `listener` to make sure no executors will be + // removed before we query their status. See SPARK-12784. + (0 until listener.activeStorageStatusList.size).map { statusId => + ExecutorsPage.getExecInfo(listener, statusId, isActive = true) + } ++ (0 until listener.deadStorageStatusList.size).map { statusId => + ExecutorsPage.getExecInfo(listener, statusId, isActive = false) + } + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala index b21d36d4a8d8..d0d9ef1165e8 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala @@ -91,7 +91,7 @@ private[v1] object AllJobsResource { numTasks = job.numTasks, numActiveTasks = job.numActiveTasks, numCompletedTasks = job.numCompletedTasks, - numSkippedTasks = job.numCompletedTasks, + numSkippedTasks = job.numSkippedTasks, numFailedTasks = job.numFailedTasks, numActiveStages = job.numActiveStages, numCompletedStages = job.completedStageIndices.size, diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala index 681f295006e3..de927117e1f6 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala @@ -91,6 +91,13 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { } } + @Path("applications/{appId}/allexecutors") + def getAllExecutors(@PathParam("appId") appId: String): AllExecutorListResource = { + uiRoot.withSparkUI(appId, None) { ui => + new AllExecutorListResource(ui) + } + } + @Path("applications/{appId}/{attemptId}/executors") def getExecutors( @PathParam("appId") appId: String, @@ -100,6 +107,15 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { } } + @Path("applications/{appId}/{attemptId}/allexecutors") + def getAllExecutors( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): AllExecutorListResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new AllExecutorListResource(ui) + } + } + @Path("applications/{appId}/stages") def getStages(@PathParam("appId") appId: String): AllStagesResource = { diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 0666be2dcb01..3d43e3c367aa 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -141,6 +141,7 @@ private[spark] class DiskBlockManager(conf: SparkConf, deleteFilesOnStop: Boolea } private def addShutdownHook(): AnyRef = { + logDebug("Adding shutdown hook") // force eager creation of logger ShutdownHookManager.addShutdownHook(ShutdownHookManager.TEMP_DIR_SHUTDOWN_PRIORITY + 1) { () => logInfo("Shutdown hook called") DiskBlockManager.this.doStop() diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index 5b493f470b50..e5b1bf2f4b43 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -27,8 +27,10 @@ import org.apache.spark.util.Utils /** * A class for writing JVM objects directly to a file on disk. This class allows data to be appended - * to an existing block and can guarantee atomicity in the case of faults as it allows the caller to - * revert partial writes. + * to an existing block. For efficiency, it retains the underlying file channel across + * multiple commits. This channel is kept open until close() is called. In case of faults, + * callers should instead close with revertPartialWritesAndClose() to atomically revert the + * uncommitted partial writes. * * This class does not support concurrent writes. Also, once the writer has been opened it cannot be * reopened again. @@ -46,34 +48,49 @@ private[spark] class DiskBlockObjectWriter( extends OutputStream with Logging { + /** + * Guards against close calls, e.g. from a wrapping stream. + * Call manualClose to close the stream that was extended by this trait. + * Commit uses this trait to close object streams without paying the + * cost of closing and opening the underlying file. + */ + private trait ManualCloseOutputStream extends OutputStream { + abstract override def close(): Unit = { + flush() + } + + def manualClose(): Unit = { + super.close() + } + } + /** The file channel, used for repositioning / truncating the file. */ private var channel: FileChannel = null + private var mcs: ManualCloseOutputStream = null private var bs: OutputStream = null private var fos: FileOutputStream = null private var ts: TimeTrackingOutputStream = null private var objOut: SerializationStream = null private var initialized = false + private var streamOpen = false private var hasBeenClosed = false - private var commitAndCloseHasBeenCalled = false /** * Cursors used to represent positions in the file. * - * xxxxxxxx|--------|--- | - * ^ ^ ^ - * | | finalPosition - * | reportedPosition - * initialPosition + * xxxxxxxxxx|----------|-----| + * ^ ^ ^ + * | | channel.position() + * | reportedPosition + * committedPosition * - * initialPosition: Offset in the file where we start writing. Immutable. * reportedPosition: Position at the time of the last update to the write metrics. - * finalPosition: Offset where we stopped writing. Set on closeAndCommit() then never changed. + * committedPosition: Offset after last committed write. * -----: Current writes to the underlying file. - * xxxxx: Existing contents of the file. + * xxxxx: Committed contents of the file. */ - private val initialPosition = file.length() - private var finalPosition: Long = -1 - private var reportedPosition = initialPosition + private var committedPosition = file.length() + private var reportedPosition = committedPosition /** * Keep track of number of records written and also use this to periodically @@ -81,67 +98,98 @@ private[spark] class DiskBlockObjectWriter( */ private var numRecordsWritten = 0 + private def initialize(): Unit = { + fos = new FileOutputStream(file, true) + channel = fos.getChannel() + ts = new TimeTrackingOutputStream(writeMetrics, fos) + class ManualCloseBufferedOutputStream + extends BufferedOutputStream(ts, bufferSize) with ManualCloseOutputStream + mcs = new ManualCloseBufferedOutputStream + } + def open(): DiskBlockObjectWriter = { if (hasBeenClosed) { throw new IllegalStateException("Writer already closed. Cannot be reopened.") } - fos = new FileOutputStream(file, true) - ts = new TimeTrackingOutputStream(writeMetrics, fos) - channel = fos.getChannel() - bs = compressStream(new BufferedOutputStream(ts, bufferSize)) + if (!initialized) { + initialize() + initialized = true + } + bs = compressStream(mcs) objOut = serializerInstance.serializeStream(bs) - initialized = true + streamOpen = true this } - override def close() { + /** + * Close and cleanup all resources. + * Should call after committing or reverting partial writes. + */ + private def closeResources(): Unit = { if (initialized) { - Utils.tryWithSafeFinally { - if (syncWrites) { - // Force outstanding writes to disk and track how long it takes - objOut.flush() - val start = System.nanoTime() - fos.getFD.sync() - writeMetrics.incWriteTime(System.nanoTime() - start) - } - } { - objOut.close() - } - + mcs.manualClose() channel = null + mcs = null bs = null fos = null ts = null objOut = null initialized = false + streamOpen = false hasBeenClosed = true } } - def isOpen: Boolean = objOut != null + /** + * Commits any remaining partial writes and closes resources. + */ + override def close() { + if (initialized) { + Utils.tryWithSafeFinally { + commitAndGet() + } { + closeResources() + } + } + } /** * Flush the partial writes and commit them as a single atomic block. + * A commit may write additional bytes to frame the atomic block. + * + * @return file segment with previous offset and length committed on this call. */ - def commitAndClose(): Unit = { - if (initialized) { + def commitAndGet(): FileSegment = { + if (streamOpen) { // NOTE: Because Kryo doesn't flush the underlying stream we explicitly flush both the // serializer stream and the lower level stream. objOut.flush() bs.flush() - close() - finalPosition = file.length() - // In certain compression codecs, more bytes are written after close() is called - writeMetrics.incBytesWritten(finalPosition - reportedPosition) + objOut.close() + streamOpen = false + + if (syncWrites) { + // Force outstanding writes to disk and track how long it takes + val start = System.nanoTime() + fos.getFD.sync() + writeMetrics.incWriteTime(System.nanoTime() - start) + } + + val pos = channel.position() + val fileSegment = new FileSegment(file, committedPosition, pos - committedPosition) + committedPosition = pos + // In certain compression codecs, more bytes are written after streams are closed + writeMetrics.incBytesWritten(committedPosition - reportedPosition) + reportedPosition = committedPosition + fileSegment } else { - finalPosition = file.length() + new FileSegment(file, committedPosition, 0) } - commitAndCloseHasBeenCalled = true } /** - * Reverts writes that haven't been flushed yet. Callers should invoke this function + * Reverts writes that haven't been committed yet. Callers should invoke this function * when there are runtime exceptions. This method will not throw, though it may be * unsuccessful in truncating written data. * @@ -152,16 +200,15 @@ private[spark] class DiskBlockObjectWriter( // truncating the file to its initial position. try { if (initialized) { - writeMetrics.decBytesWritten(reportedPosition - initialPosition) + writeMetrics.decBytesWritten(reportedPosition - committedPosition) writeMetrics.decRecordsWritten(numRecordsWritten) - objOut.flush() - bs.flush() - close() + streamOpen = false + closeResources() } val truncateStream = new FileOutputStream(file, true) try { - truncateStream.getChannel.truncate(initialPosition) + truncateStream.getChannel.truncate(committedPosition) file } finally { truncateStream.close() @@ -177,7 +224,7 @@ private[spark] class DiskBlockObjectWriter( * Writes a key-value pair. */ def write(key: Any, value: Any) { - if (!initialized) { + if (!streamOpen) { open() } @@ -189,7 +236,7 @@ private[spark] class DiskBlockObjectWriter( override def write(b: Int): Unit = throw new UnsupportedOperationException() override def write(kvBytes: Array[Byte], offs: Int, len: Int): Unit = { - if (!initialized) { + if (!streamOpen) { open() } @@ -208,18 +255,6 @@ private[spark] class DiskBlockObjectWriter( } } - /** - * Returns the file segment of committed data that this Writer has written. - * This is only valid after commitAndClose() has been called. - */ - def fileSegment(): FileSegment = { - if (!commitAndCloseHasBeenCalled) { - throw new IllegalStateException( - "fileSegment() is only valid after commitAndClose() has been called") - } - new FileSegment(file, initialPosition, finalPosition - initialPosition) - } - /** * Report the number of bytes written in this writer's shuffle write metrics. * Note that this is only valid before the underlying streams are closed. diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 740f5e5f7fe3..2b6c538485c5 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -201,7 +201,8 @@ private[spark] object UIUtils extends Logging { activeTab: SparkUITab, refreshInterval: Option[Int] = None, helpText: Option[String] = None, - showVisualization: Boolean = false): Seq[Node] = { + showVisualization: Boolean = false, + useDataTables: Boolean = false): Seq[Node] = { val appName = activeTab.appName val shortAppName = if (appName.length < 36) appName else appName.take(32) + "..." @@ -216,6 +217,7 @@ private[spark] object UIUtils extends Logging { {commonHeaderNodes} {if (showVisualization) vizHeaderNodes else Seq.empty} + {if (useDataTables) dataTablesHeaderNodes else Seq.empty} {appName} - {title} diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 67deb7b14bcb..287390b87bd7 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -20,7 +20,6 @@ package org.apache.spark.ui.exec import java.net.URLEncoder import javax.servlet.http.HttpServletRequest -import scala.util.Try import scala.xml.Node import org.apache.spark.status.api.v1.ExecutorSummary @@ -54,285 +53,17 @@ private[ui] class ExecutorsPage( // When GCTimePercent is edited change ToolTips.TASK_TIME to match private val GCTimePercent = 0.1 - // a safe String to Int for sorting ids (converts non-numeric Strings to -1) - private def idStrToInt(str: String) : Int = Try(str.toInt).getOrElse(-1) - def render(request: HttpServletRequest): Seq[Node] = { - val (activeExecutorInfo, deadExecutorInfo) = listener.synchronized { - // The follow codes should be protected by `listener` to make sure no executors will be - // removed before we query their status. See SPARK-12784. - val _activeExecutorInfo = { - for (statusId <- 0 until listener.activeStorageStatusList.size) - yield ExecutorsPage.getExecInfo(listener, statusId, isActive = true) - } - val _deadExecutorInfo = { - for (statusId <- 0 until listener.deadStorageStatusList.size) - yield ExecutorsPage.getExecInfo(listener, statusId, isActive = false) - } - (_activeExecutorInfo, _deadExecutorInfo) - } - - val execInfo = activeExecutorInfo ++ deadExecutorInfo - implicit val idOrder = Ordering[Int].on((s: String) => idStrToInt(s)).reverse - val execInfoSorted = execInfo.sortBy(_.id) - val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty - - val execTable = { - - - - - - - - - - - - - - - - - - {if (logsExist) else Seq.empty} - {if (threadDumpEnabled) else Seq.empty} - - - {execInfoSorted.map(execRow(_, logsExist))} - -
Executor IDAddressStatusRDD BlocksStorage MemoryDisk UsedCoresActive TasksFailed TasksComplete TasksTotal TasksTask Time (GC Time)InputShuffle Read - - - Shuffle Write - - LogsThread Dump
- } - val content = -
-
-

Summary

- {execSummary(activeExecutorInfo, deadExecutorInfo)} -
-
-
-
-

Executors

- {execTable} -
-
; - - UIUtils.headerSparkPage("Executors", content, parent) - } - - /** Render an HTML row representing an executor */ - private def execRow(info: ExecutorSummary, logsExist: Boolean): Seq[Node] = { - val maximumMemory = info.maxMemory - val memoryUsed = info.memoryUsed - val diskUsed = info.diskUsed - val executorStatus = - if (info.isActive) { - "Active" - } else { - "Dead" - } - - - {info.id} - {info.hostPort} - - {executorStatus} - - {info.rddBlocks} - - {Utils.bytesToString(memoryUsed)} / - {Utils.bytesToString(maximumMemory)} - - - {Utils.bytesToString(diskUsed)} - - {info.totalCores} - {taskData(info.maxTasks, info.activeTasks, info.failedTasks, info.completedTasks, - info.totalTasks, info.totalDuration, info.totalGCTime)} - - {Utils.bytesToString(info.totalInputBytes)} - - - {Utils.bytesToString(info.totalShuffleRead)} - - - {Utils.bytesToString(info.totalShuffleWrite)} - - { - if (logsExist) { - - { - info.executorLogs.map { case (logName, logUrl) => - - } - } - +
+ { +
++ + ++ + } - } - { - if (threadDumpEnabled) { - if (info.isActive) { - val encodedId = URLEncoder.encode(info.id, "UTF-8") - - Thread Dump - - } else { - - } - } else { - Seq.empty - } - } - - } - - private def execSummaryRow(execInfo: Seq[ExecutorSummary], rowName: String): Seq[Node] = { - val maximumMemory = execInfo.map(_.maxMemory).sum - val memoryUsed = execInfo.map(_.memoryUsed).sum - val diskUsed = execInfo.map(_.diskUsed).sum - val totalCores = execInfo.map(_.totalCores).sum - val totalInputBytes = execInfo.map(_.totalInputBytes).sum - val totalShuffleRead = execInfo.map(_.totalShuffleRead).sum - val totalShuffleWrite = execInfo.map(_.totalShuffleWrite).sum - - - {rowName}({execInfo.size}) - {execInfo.map(_.rddBlocks).sum} - - {Utils.bytesToString(memoryUsed)} / - {Utils.bytesToString(maximumMemory)} - - - {Utils.bytesToString(diskUsed)} - - {totalCores} - {taskData(execInfo.map(_.maxTasks).sum, - execInfo.map(_.activeTasks).sum, - execInfo.map(_.failedTasks).sum, - execInfo.map(_.completedTasks).sum, - execInfo.map(_.totalTasks).sum, - execInfo.map(_.totalDuration).sum, - execInfo.map(_.totalGCTime).sum)} - - {Utils.bytesToString(totalInputBytes)} - - - {Utils.bytesToString(totalShuffleRead)} - - - {Utils.bytesToString(totalShuffleWrite)} - - - } - - private def execSummary(activeExecInfo: Seq[ExecutorSummary], deadExecInfo: Seq[ExecutorSummary]): - Seq[Node] = { - val totalExecInfo = activeExecInfo ++ deadExecInfo - val activeRow = execSummaryRow(activeExecInfo, "Active"); - val deadRow = execSummaryRow(deadExecInfo, "Dead"); - val totalRow = execSummaryRow(totalExecInfo, "Total"); - - - - - - - - - - - - - - - - - - - {activeRow} - {deadRow} - {totalRow} - -
RDD BlocksStorage MemoryDisk UsedCoresActive TasksFailed TasksComplete TasksTotal TasksTask Time (GC Time)InputShuffle Read - - Shuffle Write - -
- } - - private def taskData( - maxTasks: Int, - activeTasks: Int, - failedTasks: Int, - completedTasks: Int, - totalTasks: Int, - totalDuration: Long, - totalGCTime: Long): Seq[Node] = { - // Determine Color Opacity from 0.5-1 - // activeTasks range from 0 to maxTasks - val activeTasksAlpha = - if (maxTasks > 0) { - (activeTasks.toDouble / maxTasks) * 0.5 + 0.5 - } else { - 1 - } - // failedTasks range max at 10% failure, alpha max = 1 - val failedTasksAlpha = - if (totalTasks > 0) { - math.min(10 * failedTasks.toDouble / totalTasks, 1) * 0.5 + 0.5 - } else { - 1 - } - // totalDuration range from 0 to 50% GC time, alpha max = 1 - val totalDurationAlpha = - if (totalDuration > 0) { - math.min(totalGCTime.toDouble / totalDuration + 0.5, 1) - } else { - 1 - } - - val tableData = - 0) { - "background:hsla(240, 100%, 50%, " + activeTasksAlpha + ");color:white" - } else { - "" - } - }>{activeTasks} - 0) { - "background:hsla(0, 100%, 50%, " + failedTasksAlpha + ");color:white" - } else { - "" - } - }>{failedTasks} - {completedTasks} - {totalTasks} - GCTimePercent * totalDuration) { - "background:hsla(0, 100%, 50%, " + totalDurationAlpha + ");color:white" - } else { - "" - } - }> - {Utils.msDurationToString(totalDuration)} - ({Utils.msDurationToString(totalGCTime)}) - ; +
; - tableData + UIUtils.headerSparkPage("Executors", content, parent, useDataTables = true) } } diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index 93ac67e5db0d..4001fac3c3d5 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -54,6 +54,7 @@ private[spark] object ShutdownHookManager extends Logging { private val shutdownDeletePaths = new scala.collection.mutable.HashSet[String]() // Add a shutdown hook to delete the temp dirs when the JVM exits + logDebug("Adding shutdown hook") // force eager creation of logger addShutdownHook(TEMP_DIR_SHUTDOWN_PRIORITY) { () => logInfo("Shutdown hook called") // we need to materialize the paths to delete because deleteRecursively removes items from diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 298e6243aace..6ab9e99d89e4 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2342,10 +2342,27 @@ private[spark] object Utils extends Logging { * Return the initial number of executors for dynamic allocation. */ def getDynamicAllocationInitialExecutors(conf: SparkConf): Int = { - Seq( + if (conf.get(DYN_ALLOCATION_INITIAL_EXECUTORS) < conf.get(DYN_ALLOCATION_MIN_EXECUTORS)) { + logWarning(s"${DYN_ALLOCATION_INITIAL_EXECUTORS.key} less than " + + s"${DYN_ALLOCATION_MIN_EXECUTORS.key} is invalid, ignoring its setting, " + + "please update your configs.") + } + + if (conf.get(EXECUTOR_INSTANCES).getOrElse(0) < conf.get(DYN_ALLOCATION_MIN_EXECUTORS)) { + logWarning(s"${EXECUTOR_INSTANCES.key} less than " + + s"${DYN_ALLOCATION_MIN_EXECUTORS.key} is invalid, ignoring its setting, " + + "please update your configs.") + } + + val initialExecutors = Seq( conf.get(DYN_ALLOCATION_MIN_EXECUTORS), conf.get(DYN_ALLOCATION_INITIAL_EXECUTORS), conf.get(EXECUTOR_INSTANCES).getOrElse(0)).max + + logInfo(s"Using initial executors = $initialExecutors, max of " + + s"${DYN_ALLOCATION_INITIAL_EXECUTORS.key}, ${DYN_ALLOCATION_MIN_EXECUTORS.key} and " + + s"${EXECUTOR_INSTANCES.key}") + initialExecutors } def tryWithResource[R <: Closeable, T](createResource: => R)(f: R => T): T = { @@ -2392,9 +2409,9 @@ private[spark] object Utils extends Logging { * "spark.yarn.dist.jars" properties, while in other modes it returns the jar files pointed by * only the "spark.jars" property. */ - def getUserJars(conf: SparkConf): Seq[String] = { + def getUserJars(conf: SparkConf, isShell: Boolean = false): Seq[String] = { val sparkJars = conf.getOption("spark.jars") - if (conf.get("spark.master") == "yarn") { + if (conf.get("spark.master") == "yarn" && isShell) { val yarnJars = conf.getOption("spark.yarn.dist.jars") unionFileLists(sparkJars, yarnJars).toSeq } else { diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 6ddc72afde27..8c8860bb37a4 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -105,8 +105,8 @@ class ExternalAppendOnlyMap[K, V, C]( private val fileBufferSize = sparkConf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024 - // Write metrics for current spill - private var curWriteMetrics: ShuffleWriteMetrics = _ + // Write metrics + private val writeMetrics: ShuffleWriteMetrics = new ShuffleWriteMetrics() // Peak size of the in-memory map observed so far, in bytes private var _peakMemoryUsedBytes: Long = 0L @@ -206,8 +206,7 @@ class ExternalAppendOnlyMap[K, V, C]( private[this] def spillMemoryIteratorToDisk(inMemoryIterator: Iterator[(K, C)]) : DiskMapIterator = { val (blockId, file) = diskBlockManager.createTempLocalBlock() - curWriteMetrics = new ShuffleWriteMetrics() - var writer = blockManager.getDiskWriter(blockId, file, ser, fileBufferSize, curWriteMetrics) + val writer = blockManager.getDiskWriter(blockId, file, ser, fileBufferSize, writeMetrics) var objectsWritten = 0 // List of batch sizes (bytes) in the order they are written to disk @@ -215,11 +214,9 @@ class ExternalAppendOnlyMap[K, V, C]( // Flush the disk writer's contents to disk, and update relevant variables def flush(): Unit = { - val w = writer - writer = null - w.commitAndClose() - _diskBytesSpilled += curWriteMetrics.bytesWritten - batchSizes.append(curWriteMetrics.bytesWritten) + val segment = writer.commitAndGet() + batchSizes.append(segment.length) + _diskBytesSpilled += segment.length objectsWritten = 0 } @@ -232,25 +229,20 @@ class ExternalAppendOnlyMap[K, V, C]( if (objectsWritten == serializerBatchSize) { flush() - curWriteMetrics = new ShuffleWriteMetrics() - writer = blockManager.getDiskWriter(blockId, file, ser, fileBufferSize, curWriteMetrics) } } if (objectsWritten > 0) { flush() - } else if (writer != null) { - val w = writer - writer = null - w.revertPartialWritesAndClose() + writer.close() + } else { + writer.revertPartialWritesAndClose() } success = true } finally { if (!success) { // This code path only happens if an exception was thrown above before we set success; // close our stuff and let the exception be thrown further - if (writer != null) { - writer.revertPartialWritesAndClose() - } + writer.revertPartialWritesAndClose() if (file.exists()) { if (!file.delete()) { logWarning(s"Error deleting ${file}") diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 4067acee738e..708a0070e225 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -272,14 +272,9 @@ private[spark] class ExternalSorter[K, V, C]( // These variables are reset after each flush var objectsWritten: Long = 0 - var spillMetrics: ShuffleWriteMetrics = null - var writer: DiskBlockObjectWriter = null - def openWriter(): Unit = { - assert (writer == null && spillMetrics == null) - spillMetrics = new ShuffleWriteMetrics - writer = blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize, spillMetrics) - } - openWriter() + val spillMetrics: ShuffleWriteMetrics = new ShuffleWriteMetrics + val writer: DiskBlockObjectWriter = + blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize, spillMetrics) // List of batch sizes (bytes) in the order they are written to disk val batchSizes = new ArrayBuffer[Long] @@ -288,14 +283,11 @@ private[spark] class ExternalSorter[K, V, C]( val elementsPerPartition = new Array[Long](numPartitions) // Flush the disk writer's contents to disk, and update relevant variables. - // The writer is closed at the end of this process, and cannot be reused. + // The writer is committed at the end of this process. def flush(): Unit = { - val w = writer - writer = null - w.commitAndClose() - _diskBytesSpilled += spillMetrics.bytesWritten - batchSizes.append(spillMetrics.bytesWritten) - spillMetrics = null + val segment = writer.commitAndGet() + batchSizes.append(segment.length) + _diskBytesSpilled += segment.length objectsWritten = 0 } @@ -311,24 +303,21 @@ private[spark] class ExternalSorter[K, V, C]( if (objectsWritten == serializerBatchSize) { flush() - openWriter() } } if (objectsWritten > 0) { flush() - } else if (writer != null) { - val w = writer - writer = null - w.revertPartialWritesAndClose() + } else { + writer.revertPartialWritesAndClose() } success = true } finally { - if (!success) { + if (success) { + writer.close() + } else { // This code path only happens if an exception was thrown above before we set success; // close our stuff and let the exception be thrown further - if (writer != null) { - writer.revertPartialWritesAndClose() - } + writer.revertPartialWritesAndClose() if (file.exists()) { if (!file.delete()) { logWarning(s"Error deleting ${file}") @@ -693,42 +682,37 @@ private[spark] class ExternalSorter[K, V, C]( blockId: BlockId, outputFile: File): Array[Long] = { - val writeMetrics = context.taskMetrics().shuffleWriteMetrics - // Track location of each range in the output file val lengths = new Array[Long](numPartitions) + val writer = blockManager.getDiskWriter(blockId, outputFile, serInstance, fileBufferSize, + context.taskMetrics().shuffleWriteMetrics) if (spills.isEmpty) { // Case where we only have in-memory data val collection = if (aggregator.isDefined) map else buffer val it = collection.destructiveSortedWritablePartitionedIterator(comparator) while (it.hasNext) { - val writer = blockManager.getDiskWriter( - blockId, outputFile, serInstance, fileBufferSize, writeMetrics) val partitionId = it.nextPartition() while (it.hasNext && it.nextPartition() == partitionId) { it.writeNext(writer) } - writer.commitAndClose() - val segment = writer.fileSegment() + val segment = writer.commitAndGet() lengths(partitionId) = segment.length } } else { // We must perform merge-sort; get an iterator by partition and write everything directly. for ((id, elements) <- this.partitionedIterator) { if (elements.hasNext) { - val writer = blockManager.getDiskWriter( - blockId, outputFile, serInstance, fileBufferSize, writeMetrics) for (elem <- elements) { writer.write(elem._1, elem._2) } - writer.commitAndClose() - val segment = writer.fileSegment() + val segment = writer.commitAndGet() lengths(id) = segment.length } } } + writer.close() context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled) context.taskMetrics().incDiskBytesSpilled(diskBytesSpilled) context.taskMetrics().incPeakExecutionMemory(peakMemoryUsedBytes) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 8ca54b24d82e..e393db06a01f 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -21,6 +21,7 @@ import java.util.HashMap; import java.util.Map; +import org.junit.Before; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,10 +41,15 @@ public class SparkLauncherSuite { private static final Logger LOG = LoggerFactory.getLogger(SparkLauncherSuite.class); private static final NamedThreadFactory TF = new NamedThreadFactory("SparkLauncherSuite-%d"); + private SparkLauncher launcher; + + @Before + public void configureLauncher() { + launcher = new SparkLauncher().setSparkHome(System.getProperty("spark.test.home")); + } + @Test public void testSparkArgumentHandling() throws Exception { - SparkLauncher launcher = new SparkLauncher() - .setSparkHome(System.getProperty("spark.test.home")); SparkSubmitOptionParser opts = new SparkSubmitOptionParser(); launcher.addSparkArg(opts.HELP); @@ -85,14 +91,67 @@ public void testSparkArgumentHandling() throws Exception { assertEquals("bar", launcher.builder.conf.get("spark.foo")); } + @Test(expected=IllegalStateException.class) + public void testRedirectTwiceFails() throws Exception { + launcher.setAppResource("fake-resource.jar") + .setMainClass("my.fake.class.Fake") + .redirectError() + .redirectError(ProcessBuilder.Redirect.PIPE) + .launch(); + } + + @Test(expected=IllegalStateException.class) + public void testRedirectToLogWithOthersFails() throws Exception { + launcher.setAppResource("fake-resource.jar") + .setMainClass("my.fake.class.Fake") + .redirectToLog("fakeLog") + .redirectError(ProcessBuilder.Redirect.PIPE) + .launch(); + } + + @Test + public void testRedirectErrorToOutput() throws Exception { + launcher.redirectError(); + assertTrue(launcher.redirectErrorStream); + } + + @Test + public void testRedirectsSimple() throws Exception { + launcher.redirectError(ProcessBuilder.Redirect.PIPE); + assertNotNull(launcher.errorStream); + assertEquals(launcher.errorStream.type(), ProcessBuilder.Redirect.Type.PIPE); + + launcher.redirectOutput(ProcessBuilder.Redirect.PIPE); + assertNotNull(launcher.outputStream); + assertEquals(launcher.outputStream.type(), ProcessBuilder.Redirect.Type.PIPE); + } + + @Test + public void testRedirectLastWins() throws Exception { + launcher.redirectError(ProcessBuilder.Redirect.PIPE) + .redirectError(ProcessBuilder.Redirect.INHERIT); + assertEquals(launcher.errorStream.type(), ProcessBuilder.Redirect.Type.INHERIT); + + launcher.redirectOutput(ProcessBuilder.Redirect.PIPE) + .redirectOutput(ProcessBuilder.Redirect.INHERIT); + assertEquals(launcher.outputStream.type(), ProcessBuilder.Redirect.Type.INHERIT); + } + + @Test + public void testRedirectToLog() throws Exception { + launcher.redirectToLog("fakeLogger"); + assertTrue(launcher.redirectToLog); + assertTrue(launcher.builder.getEffectiveConfig() + .containsKey(SparkLauncher.CHILD_PROCESS_LOGGER_NAME)); + } + @Test public void testChildProcLauncher() throws Exception { SparkSubmitOptionParser opts = new SparkSubmitOptionParser(); Map env = new HashMap<>(); env.put("SPARK_PRINT_LAUNCH_COMMAND", "1"); - SparkLauncher launcher = new SparkLauncher(env) - .setSparkHome(System.getProperty("spark.test.home")) + launcher .setMaster("local") .setAppResource(SparkLauncher.NO_RESOURCE) .addSparkArg(opts.CONF, diff --git a/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json index bb6bf434be90..c108fa61a431 100644 --- a/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json @@ -6,7 +6,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, diff --git a/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json index bb6bf434be90..c108fa61a431 100644 --- a/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json @@ -6,7 +6,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, diff --git a/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json index 1583e5ddef56..3d7407004d26 100644 --- a/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json @@ -6,7 +6,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, @@ -20,7 +20,7 @@ "numTasks" : 16, "numActiveTasks" : 0, "numCompletedTasks" : 15, - "numSkippedTasks" : 15, + "numSkippedTasks" : 0, "numFailedTasks" : 1, "numActiveStages" : 0, "numCompletedStages" : 1, @@ -34,7 +34,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, diff --git a/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json index f1f0ec885587..10c7e1c0b36f 100644 --- a/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json @@ -6,7 +6,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, diff --git a/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json index 1583e5ddef56..3d7407004d26 100644 --- a/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json @@ -6,7 +6,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, @@ -20,7 +20,7 @@ "numTasks" : 16, "numActiveTasks" : 0, "numCompletedTasks" : 15, - "numSkippedTasks" : 15, + "numSkippedTasks" : 0, "numFailedTasks" : 1, "numActiveStages" : 0, "numCompletedStages" : 1, @@ -34,7 +34,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, diff --git a/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json index c232c9832375..6a9bafd6b219 100644 --- a/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json @@ -6,7 +6,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, @@ -20,7 +20,7 @@ "numTasks" : 8, "numActiveTasks" : 0, "numCompletedTasks" : 8, - "numSkippedTasks" : 8, + "numSkippedTasks" : 0, "numFailedTasks" : 0, "numActiveStages" : 0, "numCompletedStages" : 1, diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 0b020592b06d..b2bc8861083b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -32,6 +32,7 @@ import org.apache.spark.api.r.RUtils import org.apache.spark.deploy.SparkSubmit._ import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate import org.apache.spark.internal.Logging +import org.apache.spark.TestUtils.JavaSourceFromString import org.apache.spark.util.{ResetSystemProperties, Utils} // Note: this suite mixes in ResetSystemProperties because SparkSubmit.main() sets a bunch @@ -417,6 +418,8 @@ class SparkSubmitSuite // See https://gist.github.com/shivaram/3a2fecce60768a603dac for a error log ignore("correctly builds R packages included in a jar with --packages") { assume(RUtils.isRInstalled, "R isn't installed on this machine.") + // Check if the SparkR package is installed + assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val main = MavenCoordinate("my.great.lib", "mylib", "0.1") val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val rScriptDir = @@ -435,6 +438,41 @@ class SparkSubmitSuite } } + test("include an external JAR in SparkR") { + assume(RUtils.isRInstalled, "R isn't installed on this machine.") + val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) + // Check if the SparkR package is installed + assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") + val rScriptDir = + Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", "jarTest.R").mkString(File.separator) + assert(new File(rScriptDir).exists) + + // compile a small jar containing a class that will be called from R code. + val tempDir = Utils.createTempDir() + val srcDir = new File(tempDir, "sparkrtest") + srcDir.mkdirs() + val excSource = new JavaSourceFromString(new File(srcDir, "DummyClass").getAbsolutePath, + """package sparkrtest; + | + |public class DummyClass implements java.io.Serializable { + | public static String helloWorld(String arg) { return "Hello " + arg; } + | public static int addStuff(int arg1, int arg2) { return arg1 + arg2; } + |} + """.stripMargin) + val excFile = TestUtils.createCompiledClass("DummyClass", srcDir, excSource, Seq.empty) + val jarFile = new File(tempDir, "sparkRTestJar-%s.jar".format(System.currentTimeMillis())) + val jarURL = TestUtils.createJar(Seq(excFile), jarFile, directoryPrefix = Some("sparkrtest")) + + val args = Seq( + "--name", "testApp", + "--master", "local", + "--jars", jarURL.toString, + "--verbose", + "--conf", "spark.ui.enabled=false", + rScriptDir) + runSparkSubmit(args) + } + test("resolves command line argument paths correctly") { val jars = "/jar1,/jar2" // --jars val files = "hdfs:/file1,file2" // --files diff --git a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala index 337fd7e85e81..ebdb69f31e36 100644 --- a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala @@ -19,14 +19,21 @@ package org.apache.spark.internal.config import java.util.concurrent.TimeUnit +import scala.collection.JavaConverters._ +import scala.collection.mutable.HashMap + import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.network.util.ByteUnit class ConfigEntrySuite extends SparkFunSuite { + private val PREFIX = "spark.ConfigEntrySuite" + + private def testKey(name: String): String = s"$PREFIX.$name" + test("conf entry: int") { val conf = new SparkConf() - val iConf = ConfigBuilder("spark.int").intConf.createWithDefault(1) + val iConf = ConfigBuilder(testKey("int")).intConf.createWithDefault(1) assert(conf.get(iConf) === 1) conf.set(iConf, 2) assert(conf.get(iConf) === 2) @@ -34,21 +41,21 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: long") { val conf = new SparkConf() - val lConf = ConfigBuilder("spark.long").longConf.createWithDefault(0L) + val lConf = ConfigBuilder(testKey("long")).longConf.createWithDefault(0L) conf.set(lConf, 1234L) assert(conf.get(lConf) === 1234L) } test("conf entry: double") { val conf = new SparkConf() - val dConf = ConfigBuilder("spark.double").doubleConf.createWithDefault(0.0) + val dConf = ConfigBuilder(testKey("double")).doubleConf.createWithDefault(0.0) conf.set(dConf, 20.0) assert(conf.get(dConf) === 20.0) } test("conf entry: boolean") { val conf = new SparkConf() - val bConf = ConfigBuilder("spark.boolean").booleanConf.createWithDefault(false) + val bConf = ConfigBuilder(testKey("boolean")).booleanConf.createWithDefault(false) assert(!conf.get(bConf)) conf.set(bConf, true) assert(conf.get(bConf)) @@ -56,7 +63,7 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: optional") { val conf = new SparkConf() - val optionalConf = ConfigBuilder("spark.optional").intConf.createOptional + val optionalConf = ConfigBuilder(testKey("optional")).intConf.createOptional assert(conf.get(optionalConf) === None) conf.set(optionalConf, 1) assert(conf.get(optionalConf) === Some(1)) @@ -64,8 +71,8 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: fallback") { val conf = new SparkConf() - val parentConf = ConfigBuilder("spark.int").intConf.createWithDefault(1) - val confWithFallback = ConfigBuilder("spark.fallback").fallbackConf(parentConf) + val parentConf = ConfigBuilder(testKey("parent")).intConf.createWithDefault(1) + val confWithFallback = ConfigBuilder(testKey("fallback")).fallbackConf(parentConf) assert(conf.get(confWithFallback) === 1) conf.set(confWithFallback, 2) assert(conf.get(parentConf) === 1) @@ -74,7 +81,8 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: time") { val conf = new SparkConf() - val time = ConfigBuilder("spark.time").timeConf(TimeUnit.SECONDS).createWithDefaultString("1h") + val time = ConfigBuilder(testKey("time")).timeConf(TimeUnit.SECONDS) + .createWithDefaultString("1h") assert(conf.get(time) === 3600L) conf.set(time.key, "1m") assert(conf.get(time) === 60L) @@ -82,7 +90,8 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: bytes") { val conf = new SparkConf() - val bytes = ConfigBuilder("spark.bytes").bytesConf(ByteUnit.KiB).createWithDefaultString("1m") + val bytes = ConfigBuilder(testKey("bytes")).bytesConf(ByteUnit.KiB) + .createWithDefaultString("1m") assert(conf.get(bytes) === 1024L) conf.set(bytes.key, "1k") assert(conf.get(bytes) === 1L) @@ -90,7 +99,7 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: string seq") { val conf = new SparkConf() - val seq = ConfigBuilder("spark.seq").stringConf.toSequence.createWithDefault(Seq()) + val seq = ConfigBuilder(testKey("seq")).stringConf.toSequence.createWithDefault(Seq()) conf.set(seq.key, "1,,2, 3 , , 4") assert(conf.get(seq) === Seq("1", "2", "3", "4")) conf.set(seq, Seq("1", "2")) @@ -99,7 +108,7 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: int seq") { val conf = new SparkConf() - val seq = ConfigBuilder("spark.seq").intConf.toSequence.createWithDefault(Seq()) + val seq = ConfigBuilder(testKey("intSeq")).intConf.toSequence.createWithDefault(Seq()) conf.set(seq.key, "1,,2, 3 , , 4") assert(conf.get(seq) === Seq(1, 2, 3, 4)) conf.set(seq, Seq(1, 2)) @@ -108,7 +117,7 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: transformation") { val conf = new SparkConf() - val transformationConf = ConfigBuilder("spark.transformation") + val transformationConf = ConfigBuilder(testKey("transformation")) .stringConf .transform(_.toLowerCase()) .createWithDefault("FOO") @@ -120,7 +129,7 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: valid values check") { val conf = new SparkConf() - val enum = ConfigBuilder("spark.enum") + val enum = ConfigBuilder(testKey("enum")) .stringConf .checkValues(Set("a", "b", "c")) .createWithDefault("a") @@ -138,7 +147,7 @@ class ConfigEntrySuite extends SparkFunSuite { test("conf entry: conversion error") { val conf = new SparkConf() - val conversionTest = ConfigBuilder("spark.conversionTest").doubleConf.createOptional + val conversionTest = ConfigBuilder(testKey("conversionTest")).doubleConf.createOptional conf.set(conversionTest.key, "abc") val conversionError = intercept[IllegalArgumentException] { conf.get(conversionTest) @@ -148,8 +157,81 @@ class ConfigEntrySuite extends SparkFunSuite { test("default value handling is null-safe") { val conf = new SparkConf() - val stringConf = ConfigBuilder("spark.string").stringConf.createWithDefault(null) + val stringConf = ConfigBuilder(testKey("string")).stringConf.createWithDefault(null) assert(conf.get(stringConf) === null) } + test("variable expansion") { + val env = Map("ENV1" -> "env1") + val conf = HashMap("spark.value1" -> "value1", "spark.value2" -> "value2") + + def getenv(key: String): String = env.getOrElse(key, null) + + def expand(value: String): String = ConfigEntry.expand(value, conf.asJava, getenv, Set()) + + assert(expand("${spark.value1}") === "value1") + assert(expand("spark.value1 is: ${spark.value1}") === "spark.value1 is: value1") + assert(expand("${spark.value1} ${spark.value2}") === "value1 value2") + assert(expand("${spark.value3}") === "${spark.value3}") + + // Make sure anything that is not in the "spark." namespace is ignored. + conf("notspark.key") = "value" + assert(expand("${notspark.key}") === "${notspark.key}") + + assert(expand("${env:ENV1}") === "env1") + assert(expand("${system:user.name}") === sys.props("user.name")) + + val stringConf = ConfigBuilder(testKey("stringForExpansion")) + .stringConf + .createWithDefault("string1") + val optionalConf = ConfigBuilder(testKey("optionForExpansion")) + .stringConf + .createOptional + val intConf = ConfigBuilder(testKey("intForExpansion")) + .intConf + .createWithDefault(42) + val fallbackConf = ConfigBuilder(testKey("fallbackForExpansion")) + .fallbackConf(intConf) + + assert(expand("${" + stringConf.key + "}") === "string1") + assert(expand("${" + optionalConf.key + "}") === "${" + optionalConf.key + "}") + assert(expand("${" + intConf.key + "}") === "42") + assert(expand("${" + fallbackConf.key + "}") === "42") + + conf(optionalConf.key) = "string2" + assert(expand("${" + optionalConf.key + "}") === "string2") + + conf(fallbackConf.key) = "84" + assert(expand("${" + fallbackConf.key + "}") === "84") + + assert(expand("${spark.value1") === "${spark.value1") + + // Unknown prefixes. + assert(expand("${unknown:value}") === "${unknown:value}") + + // Chained references. + val conf1 = ConfigBuilder(testKey("conf1")) + .stringConf + .createWithDefault("value1") + val conf2 = ConfigBuilder(testKey("conf2")) + .stringConf + .createWithDefault("value2") + + conf(conf2.key) = "${" + conf1.key + "}" + assert(expand("${" + conf2.key + "}") === conf1.defaultValueString) + + // Circular references. + conf(conf1.key) = "${" + conf2.key + "}" + val e = intercept[IllegalArgumentException] { + expand("${" + conf2.key + "}") + } + assert(e.getMessage().contains("Circular")) + + // Default string values with variable references. + val parameterizedStringConf = ConfigBuilder(testKey("stringWithParams")) + .stringConf + .createWithDefault("${spark.value1}") + assert(parameterizedStringConf.readFrom(conf.asJava, getenv) === conf("spark.value1")) + } + } diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala index 27cfdc7aced5..f8d523fa2c6a 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala @@ -51,6 +51,22 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { } } + test("basic pipe with tokenization") { + if (testCommandAvailable("wc")) { + val nums = sc.makeRDD(Array(1, 2, 3, 4), 2) + + // verify that both RDD.pipe(command: String) and RDD.pipe(command: String, env) work good + for (piped <- Seq(nums.pipe("wc -l"), nums.pipe("wc -l", Map[String, String]()))) { + val c = piped.collect() + assert(c.size === 2) + assert(c(0).trim === "2") + assert(c(1).trim === "2") + } + } else { + assert(true) + } + } + test("failure in iterating over pipe input") { if (testCommandAvailable("cat")) { val nums = @@ -122,6 +138,14 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { } } + test("pipe with empty partition") { + val data = sc.parallelize(Seq("foo", "bing"), 8) + val piped = data.pipe("wc -c") + assert(piped.count == 8) + val charCounts = piped.map(_.trim.toInt).collect().toSet + assert(Set(0, 4, 5) == charCounts) + } + test("pipe with env variable") { if (testCommandAvailable("printenv")) { val nums = sc.makeRDD(Array(1, 2, 3, 4), 2) diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala index a32423dc4fde..026075902791 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala @@ -31,17 +31,24 @@ import org.scalatest.mock.MockitoSugar import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.deploy.Command import org.apache.spark.deploy.mesos.MesosDriverDescription - +import org.apache.spark.scheduler.cluster.mesos.Utils class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar { private val command = new Command("mainClass", Seq("arg"), Map(), Seq(), Seq(), Seq()) + private var driver: SchedulerDriver = _ private var scheduler: MesosClusterScheduler = _ - override def beforeEach(): Unit = { + private def setScheduler(sparkConfVars: Map[String, String] = null): Unit = { val conf = new SparkConf() conf.setMaster("mesos://localhost:5050") conf.setAppName("spark mesos") + + if (sparkConfVars != null) { + conf.setAll(sparkConfVars) + } + + driver = mock[SchedulerDriver] scheduler = new MesosClusterScheduler( new BlackHoleMesosClusterPersistenceEngineFactory, conf) { override def start(): Unit = { ready = true } @@ -50,9 +57,11 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi } test("can queue drivers") { + setScheduler() + val response = scheduler.submitDriver( - new MesosDriverDescription("d1", "jar", 1000, 1, true, - command, Map[String, String](), "s1", new Date())) + new MesosDriverDescription("d1", "jar", 1000, 1, true, + command, Map[String, String](), "s1", new Date())) assert(response.success) val response2 = scheduler.submitDriver(new MesosDriverDescription( @@ -65,6 +74,8 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi } test("can kill queued drivers") { + setScheduler() + val response = scheduler.submitDriver( new MesosDriverDescription("d1", "jar", 1000, 1, true, command, Map[String, String](), "s1", new Date())) @@ -76,6 +87,8 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi } test("can handle multiple roles") { + setScheduler() + val driver = mock[SchedulerDriver] val response = scheduler.submitDriver( new MesosDriverDescription("d1", "jar", 1200, 1.5, true, @@ -138,6 +151,8 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi } test("escapes commandline args for the shell") { + setScheduler() + val conf = new SparkConf() conf.setMaster("mesos://localhost:5050") conf.setAppName("spark mesos") @@ -172,4 +187,28 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi assert(escape(s"onlywrap${char}this") === wrapped(s"onlywrap${char}this")) }) } + + test("supports spark.mesos.driverEnv.*") { + setScheduler() + + val mem = 1000 + val cpu = 1 + + val response = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", mem, cpu, true, + command, + Map("spark.mesos.executor.home" -> "test", + "spark.app.name" -> "test", + "spark.mesos.driverEnv.TEST_ENV" -> "TEST_VAL"), + "s1", + new Date())) + assert(response.success) + + val offer = Utils.createOffer("o1", "s1", mem, cpu) + scheduler.resourceOffers(driver, List(offer).asJava) + val tasks = Utils.verifyTaskLaunched(driver, "o1") + val env = tasks.head.getCommand.getEnvironment.getVariablesList.asScala.map(v => + (v.getName, v.getValue)).toMap + assert(env.getOrElse("TEST_ENV", null) == "TEST_VAL") + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 7f21d4c623af..c2779d7b3565 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -24,8 +24,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.mesos.{Protos, Scheduler, SchedulerDriver} import org.apache.mesos.Protos._ -import org.apache.mesos.Protos.Value.Scalar -import org.mockito.{ArgumentCaptor, Matchers} +import org.mockito.Matchers import org.mockito.Matchers._ import org.mockito.Mockito._ import org.scalatest.mock.MockitoSugar @@ -35,6 +34,7 @@ import org.apache.spark.{LocalSparkContext, SecurityManager, SparkConf, SparkCon import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler.TaskSchedulerImpl +import org.apache.spark.scheduler.cluster.mesos.Utils._ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext @@ -59,7 +59,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite // launches a task on a valid offer offerResources(offers) - verifyTaskLaunched("o1") + verifyTaskLaunched(driver, "o1") // kills executors backend.doRequestTotalExecutors(0) @@ -74,7 +74,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite // Launches a new task when requested executors is positive backend.doRequestTotalExecutors(2) offerResources(offers, 2) - verifyTaskLaunched("o2") + verifyTaskLaunched(driver, "o2") } test("mesos supports killing and relaunching tasks with executors") { @@ -86,7 +86,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val offer1 = (minMem, minCpu) val offer2 = (minMem, 1) offerResources(List(offer1, offer2)) - verifyTaskLaunched("o1") + verifyTaskLaunched(driver, "o1") // accounts for a killed task val status = createTaskStatus("0", "s1", TaskState.TASK_KILLED) @@ -95,7 +95,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite // Launches a new task on a valid offer from the same slave offerResources(List(offer2)) - verifyTaskLaunched("o2") + verifyTaskLaunched(driver, "o2") } test("mesos supports spark.executor.cores") { @@ -106,10 +106,10 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val offers = List((executorMemory * 2, executorCores + 1)) offerResources(offers) - val taskInfos = verifyTaskLaunched("o1") - assert(taskInfos.size() == 1) + val taskInfos = verifyTaskLaunched(driver, "o1") + assert(taskInfos.length == 1) - val cpus = backend.getResource(taskInfos.iterator().next().getResourcesList, "cpus") + val cpus = backend.getResource(taskInfos(0).getResourcesList, "cpus") assert(cpus == executorCores) } @@ -120,10 +120,10 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val offerCores = 10 offerResources(List((executorMemory * 2, offerCores))) - val taskInfos = verifyTaskLaunched("o1") - assert(taskInfos.size() == 1) + val taskInfos = verifyTaskLaunched(driver, "o1") + assert(taskInfos.length == 1) - val cpus = backend.getResource(taskInfos.iterator().next().getResourcesList, "cpus") + val cpus = backend.getResource(taskInfos(0).getResourcesList, "cpus") assert(cpus == offerCores) } @@ -134,10 +134,10 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val executorMemory = backend.executorMemory(sc) offerResources(List((executorMemory, maxCores + 1))) - val taskInfos = verifyTaskLaunched("o1") - assert(taskInfos.size() == 1) + val taskInfos = verifyTaskLaunched(driver, "o1") + assert(taskInfos.length == 1) - val cpus = backend.getResource(taskInfos.iterator().next().getResourcesList, "cpus") + val cpus = backend.getResource(taskInfos(0).getResourcesList, "cpus") assert(cpus == maxCores) } @@ -156,7 +156,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite (executorMemory, maxCores + 1), (executorMemory, maxCores + 1))) - verifyTaskLaunched("o1") + verifyTaskLaunched(driver, "o1") verifyDeclinedOffer(driver, createOfferId("o2"), true) } @@ -171,8 +171,8 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite (executorMemory * 2, executorCores * 2), (executorMemory * 2, executorCores * 2))) - verifyTaskLaunched("o1") - verifyTaskLaunched("o2") + verifyTaskLaunched(driver, "o1") + verifyTaskLaunched(driver, "o2") } test("mesos creates multiple executors on a single slave") { @@ -184,8 +184,8 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite offerResources(List((executorMemory * 2, executorCores * 2))) // verify two executors were started on a single offer - val taskInfos = verifyTaskLaunched("o1") - assert(taskInfos.size() == 2) + val taskInfos = verifyTaskLaunched(driver, "o1") + assert(taskInfos.length == 2) } test("mesos doesn't register twice with the same shuffle service") { @@ -194,11 +194,11 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val offer1 = createOffer("o1", "s1", mem, cpu) backend.resourceOffers(driver, List(offer1).asJava) - verifyTaskLaunched("o1") + verifyTaskLaunched(driver, "o1") val offer2 = createOffer("o2", "s1", mem, cpu) backend.resourceOffers(driver, List(offer2).asJava) - verifyTaskLaunched("o2") + verifyTaskLaunched(driver, "o2") val status1 = createTaskStatus("0", "s1", TaskState.TASK_RUNNING) backend.statusUpdate(driver, status1) @@ -216,7 +216,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val offer1 = createOffer("o1", "s1", mem, cpu) backend.resourceOffers(driver, List(offer1).asJava) - verifyTaskLaunched("o1") + verifyTaskLaunched(driver, "o1") backend.doKillExecutors(List("0")) verify(driver, times(1)).killTask(createTaskId("0")) @@ -269,14 +269,6 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite backend.resourceOffers(driver, mesosOffers.asJava) } - private def verifyTaskLaunched(offerId: String): java.util.Collection[TaskInfo] = { - val captor = ArgumentCaptor.forClass(classOf[java.util.Collection[TaskInfo]]) - verify(driver, times(1)).launchTasks( - Matchers.eq(Collections.singleton(createOfferId(offerId))), - captor.capture()) - captor.getValue - } - private def createTaskStatus(taskId: String, slaveId: String, state: TaskState): TaskStatus = { TaskStatus.newBuilder() .setTaskId(TaskID.newBuilder().setValue(taskId).build()) @@ -285,41 +277,6 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite .build } - - private def createOfferId(offerId: String): OfferID = { - OfferID.newBuilder().setValue(offerId).build() - } - - private def createSlaveId(slaveId: String): SlaveID = { - SlaveID.newBuilder().setValue(slaveId).build() - } - - private def createExecutorId(executorId: String): ExecutorID = { - ExecutorID.newBuilder().setValue(executorId).build() - } - - private def createTaskId(taskId: String): TaskID = { - TaskID.newBuilder().setValue(taskId).build() - } - - private def createOffer(offerId: String, slaveId: String, mem: Int, cpu: Int): Offer = { - val builder = Offer.newBuilder() - builder.addResourcesBuilder() - .setName("mem") - .setType(Value.Type.SCALAR) - .setScalar(Scalar.newBuilder().setValue(mem)) - builder.addResourcesBuilder() - .setName("cpus") - .setType(Value.Type.SCALAR) - .setScalar(Scalar.newBuilder().setValue(cpu)) - builder.setId(createOfferId(offerId)) - .setFrameworkId(FrameworkID.newBuilder() - .setValue("f1")) - .setSlaveId(SlaveID.newBuilder().setValue(slaveId)) - .setHostname(s"host${slaveId}") - .build() - } - private def createSchedulerBackend( taskScheduler: TaskSchedulerImpl, driver: SchedulerDriver, @@ -364,9 +321,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite .set("spark.mesos.driver.webui.url", "http://webui") if (sparkConfVars != null) { - for (attr <- sparkConfVars) { - sparkConf.set(attr._1, attr._2) - } + sparkConf.setAll(sparkConfVars) } sc = new SparkContext(sparkConf) diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala new file mode 100644 index 000000000000..ff26d14ef55c --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler.cluster.mesos + +import java.util.Collections + +import org.apache.mesos.Protos._ +import org.apache.mesos.Protos.Value.Scalar +import org.apache.mesos.SchedulerDriver +import org.mockito.{ArgumentCaptor, Matchers} +import org.mockito.Mockito._ +import scala.collection.JavaConverters._ + +object Utils { + def createOffer(offerId: String, slaveId: String, mem: Int, cpu: Int): Offer = { + val builder = Offer.newBuilder() + builder.addResourcesBuilder() + .setName("mem") + .setType(Value.Type.SCALAR) + .setScalar(Scalar.newBuilder().setValue(mem)) + builder.addResourcesBuilder() + .setName("cpus") + .setType(Value.Type.SCALAR) + .setScalar(Scalar.newBuilder().setValue(cpu)) + builder.setId(createOfferId(offerId)) + .setFrameworkId(FrameworkID.newBuilder() + .setValue("f1")) + .setSlaveId(SlaveID.newBuilder().setValue(slaveId)) + .setHostname(s"host${slaveId}") + .build() + } + + def verifyTaskLaunched(driver: SchedulerDriver, offerId: String): List[TaskInfo] = { + val captor = ArgumentCaptor.forClass(classOf[java.util.Collection[TaskInfo]]) + verify(driver, times(1)).launchTasks( + Matchers.eq(Collections.singleton(createOfferId(offerId))), + captor.capture()) + captor.getValue.asScala.toList + } + + def createOfferId(offerId: String): OfferID = { + OfferID.newBuilder().setValue(offerId).build() + } + + def createSlaveId(slaveId: String): SlaveID = { + SlaveID.newBuilder().setValue(slaveId).build() + } + + def createExecutorId(executorId: String): ExecutorID = { + ExecutorID.newBuilder().setValue(executorId).build() + } + + def createTaskId(taskId: String): TaskID = { + TaskID.newBuilder().setValue(taskId).build() + } +} diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala index ec4ef4b2fcbf..059c2c244452 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala @@ -60,7 +60,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { } assert(writeMetrics.bytesWritten > 0) assert(writeMetrics.recordsWritten === 16385) - writer.commitAndClose() + writer.commitAndGet() + writer.close() assert(file.length() == writeMetrics.bytesWritten) } @@ -100,6 +101,40 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { } } + test("calling revertPartialWritesAndClose() on a partial write should truncate up to commit") { + val file = new File(tempDir, "somefile") + val writeMetrics = new ShuffleWriteMetrics() + val writer = new DiskBlockObjectWriter( + file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + + writer.write(Long.box(20), Long.box(30)) + val firstSegment = writer.commitAndGet() + assert(firstSegment.length === file.length()) + assert(writeMetrics.shuffleBytesWritten === file.length()) + + writer.write(Long.box(40), Long.box(50)) + + writer.revertPartialWritesAndClose() + assert(firstSegment.length === file.length()) + assert(writeMetrics.shuffleBytesWritten === file.length()) + } + + test("calling revertPartialWritesAndClose() after commit() should have no effect") { + val file = new File(tempDir, "somefile") + val writeMetrics = new ShuffleWriteMetrics() + val writer = new DiskBlockObjectWriter( + file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + + writer.write(Long.box(20), Long.box(30)) + val firstSegment = writer.commitAndGet() + assert(firstSegment.length === file.length()) + assert(writeMetrics.shuffleBytesWritten === file.length()) + + writer.revertPartialWritesAndClose() + assert(firstSegment.length === file.length()) + assert(writeMetrics.shuffleBytesWritten === file.length()) + } + test("calling revertPartialWritesAndClose() on a closed block writer should have no effect") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() @@ -108,7 +143,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { for (i <- 1 to 1000) { writer.write(i, i) } - writer.commitAndClose() + writer.commitAndGet() + writer.close() val bytesWritten = writeMetrics.bytesWritten assert(writeMetrics.recordsWritten === 1000) writer.revertPartialWritesAndClose() @@ -116,7 +152,7 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { assert(writeMetrics.bytesWritten === bytesWritten) } - test("commitAndClose() should be idempotent") { + test("commit() and close() should be idempotent") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() val writer = new DiskBlockObjectWriter( @@ -124,11 +160,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { for (i <- 1 to 1000) { writer.write(i, i) } - writer.commitAndClose() + writer.commitAndGet() + writer.close() val bytesWritten = writeMetrics.bytesWritten val writeTime = writeMetrics.writeTime assert(writeMetrics.recordsWritten === 1000) - writer.commitAndClose() + writer.commitAndGet() + writer.close() assert(writeMetrics.recordsWritten === 1000) assert(writeMetrics.bytesWritten === bytesWritten) assert(writeMetrics.writeTime === writeTime) @@ -152,26 +190,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { assert(writeMetrics.writeTime === writeTime) } - test("fileSegment() can only be called after commitAndClose() has been called") { + test("commit() and close() without ever opening or writing") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() val writer = new DiskBlockObjectWriter( file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) - for (i <- 1 to 1000) { - writer.write(i, i) - } - intercept[IllegalStateException] { - writer.fileSegment() - } + val segment = writer.commitAndGet() writer.close() - } - - test("commitAndClose() without ever opening or writing") { - val file = new File(tempDir, "somefile") - val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) - writer.commitAndClose() - assert(writer.fileSegment().length === 0) + assert(segment.length === 0) } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index f5d0fb00b732..30952a945834 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -782,6 +782,9 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { conf.set("spark.dynamicAllocation.initialExecutors", "3")) === 4) assert(Utils.getDynamicAllocationInitialExecutors( // should use initialExecutors conf.set("spark.dynamicAllocation.initialExecutors", "5")) === 5) + assert(Utils.getDynamicAllocationInitialExecutors( // should use minExecutors + conf.set("spark.dynamicAllocation.initialExecutors", "2") + .set("spark.executor.instances", "1")) === 3) } diff --git a/dev/.rat-excludes b/dev/.rat-excludes index fb582dec56d5..0c866717a3f4 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -99,3 +99,4 @@ spark-deps-.* .*tsv org.apache.spark.scheduler.ExternalClusterManager .*\.sql +.Rbuildignore diff --git a/dev/audit-release/.gitignore b/dev/audit-release/.gitignore deleted file mode 100644 index 7e057a92b3c4..000000000000 --- a/dev/audit-release/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -project/ -spark_audit* diff --git a/dev/audit-release/README.md b/dev/audit-release/README.md deleted file mode 100644 index 37b2a0afb7ae..000000000000 --- a/dev/audit-release/README.md +++ /dev/null @@ -1,12 +0,0 @@ -Test Application Builds -======================= - -This directory includes test applications which are built when auditing releases. You can run them locally by setting appropriate environment variables. - -``` -$ cd sbt_app_core -$ SCALA_VERSION=2.11.7 \ - SPARK_VERSION=1.0.0-SNAPSHOT \ - SPARK_RELEASE_REPOSITORY=file:///home/patrick/.ivy2/local \ - sbt run -``` diff --git a/dev/audit-release/audit_release.py b/dev/audit-release/audit_release.py deleted file mode 100755 index b28e7a427b8f..000000000000 --- a/dev/audit-release/audit_release.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Audits binary and maven artifacts for a Spark release. -# Requires GPG and Maven. -# usage: -# python audit_release.py - -import os -import re -import shutil -import subprocess -import sys -import time -import urllib2 - -# Note: The following variables must be set before use! -RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/" -RELEASE_KEY = "XXXXXXXX" # Your 8-digit hex -RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033" -RELEASE_VERSION = "1.1.1" -SCALA_VERSION = "2.11.7" -SCALA_BINARY_VERSION = "2.11" - -# Do not set these -LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") -LOG_FILE = open(LOG_FILE_NAME, 'w') -WORK_DIR = "/tmp/audit_%s" % int(time.time()) -MAVEN_CMD = "mvn" -GPG_CMD = "gpg" -SBT_CMD = "sbt -Dsbt.log.noformat=true" - -# Track failures to print them at the end -failures = [] - -# Log a message. Use sparingly because this flushes every write. -def log(msg): - LOG_FILE.write(msg + "\n") - LOG_FILE.flush() - -def log_and_print(msg): - print msg - log(msg) - -# Prompt the user to delete the scratch directory used -def clean_work_files(): - response = raw_input("OK to delete scratch directory '%s'? (y/N) " % WORK_DIR) - if response == "y": - shutil.rmtree(WORK_DIR) - -# Run the given command and log its output to the log file -def run_cmd(cmd, exit_on_failure=True): - log("Running command: %s" % cmd) - ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE) - if ret != 0 and exit_on_failure: - log_and_print("Command failed: %s" % cmd) - clean_work_files() - sys.exit(-1) - return ret - -def run_cmd_with_output(cmd): - log_and_print("Running command: %s" % cmd) - return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) - -# Test if the given condition is successful -# If so, print the pass message; otherwise print the failure message -def test(cond, msg): - return passed(msg) if cond else failed(msg) - -def passed(msg): - log_and_print("[PASSED] %s" % msg) - -def failed(msg): - failures.append(msg) - log_and_print("[**FAILED**] %s" % msg) - -def get_url(url): - return urllib2.urlopen(url).read() - -# If the path exists, prompt the user to delete it -# If the resource is not deleted, abort -def ensure_path_not_present(path): - full_path = os.path.expanduser(path) - if os.path.exists(full_path): - print "Found %s locally." % full_path - response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ") - if response == "y": - shutil.rmtree(full_path) - else: - print "Abort." - sys.exit(-1) - -log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION) -log_and_print("Log output can be found in %s" % LOG_FILE_NAME) - -original_dir = os.getcwd() - -# For each of these modules, we'll test an 'empty' application in sbt and -# maven that links against them. This will catch issues with messed up -# dependencies within those projects. -modules = [ - "spark-core", "spark-mllib", "spark-streaming", "spark-repl", - "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka-0-8", - "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" -] -modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) - -# Check for directories that might interfere with tests -local_ivy_spark = "~/.ivy2/local/org.apache.spark" -cache_ivy_spark = "~/.ivy2/cache/org.apache.spark" -local_maven_kafka = "~/.m2/repository/org/apache/kafka" -local_maven_kafka = "~/.m2/repository/org/apache/spark" -map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka]) - -# SBT build tests -log_and_print("==== Building SBT modules ====") -os.chdir("blank_sbt_build") -os.environ["SPARK_VERSION"] = RELEASE_VERSION -os.environ["SCALA_VERSION"] = SCALA_VERSION -os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY -os.environ["SPARK_AUDIT_MASTER"] = "local" -for module in modules: - log("==== Building module %s in SBT ====" % module) - os.environ["SPARK_MODULE"] = module - ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False) - test(ret == 0, "SBT build against '%s' module" % module) -os.chdir(original_dir) - -# SBT application tests -log_and_print("==== Building SBT applications ====") -for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]: - log("==== Building application %s in SBT ====" % app) - os.chdir(app) - ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False) - test(ret == 0, "SBT application (%s)" % app) - os.chdir(original_dir) - -# Maven build tests -os.chdir("blank_maven_build") -log_and_print("==== Building Maven modules ====") -for module in modules: - log("==== Building module %s in maven ====" % module) - cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' - '-Dspark.module="%s" clean compile' % - (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module)) - ret = run_cmd(cmd, exit_on_failure=False) - test(ret == 0, "maven build against '%s' module" % module) -os.chdir(original_dir) - -# Maven application tests -log_and_print("==== Building Maven applications ====") -os.chdir("maven_app_core") -mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' - '-Dscala.binary.version="%s" clean compile ' - 'exec:java -Dexec.mainClass="SimpleApp"' % - (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION)) -ret = run_cmd(mvn_exec_cmd, exit_on_failure=False) -test(ret == 0, "maven application (core)") -os.chdir(original_dir) - -# Binary artifact tests -if os.path.exists(WORK_DIR): - print "Working directory '%s' already exists" % WORK_DIR - sys.exit(-1) -os.mkdir(WORK_DIR) -os.chdir(WORK_DIR) - -index_page = get_url(RELEASE_URL) -artifact_regex = r = re.compile("") -artifacts = r.findall(index_page) - -# Verify artifact integrity -for artifact in artifacts: - log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact) - - artifact_url = "%s/%s" % (RELEASE_URL, artifact) - key_file = "%s.asc" % artifact - run_cmd("wget %s" % artifact_url) - run_cmd("wget %s/%s" % (RELEASE_URL, key_file)) - run_cmd("wget %s%s" % (artifact_url, ".sha")) - - # Verify signature - run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY)) - run_cmd("%s %s" % (GPG_CMD, key_file)) - passed("Artifact signature verified.") - - # Verify md5 - my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip() - release_md5 = get_url("%s.md5" % artifact_url).strip() - test(my_md5 == release_md5, "Artifact MD5 verified.") - - # Verify sha - my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip() - release_sha = get_url("%s.sha" % artifact_url).strip() - test(my_sha == release_sha, "Artifact SHA verified.") - - # Verify Apache required files - dir_name = artifact.replace(".tgz", "") - run_cmd("tar xvzf %s" % artifact) - base_files = os.listdir(dir_name) - test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file") - test("NOTICE" in base_files, "Tarball contains NOTICE file") - test("LICENSE" in base_files, "Tarball contains LICENSE file") - - os.chdir(WORK_DIR) - -# Report result -log_and_print("\n") -if len(failures) == 0: - log_and_print("*** ALL TESTS PASSED ***") -else: - log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX") - for f in failures: - log_and_print(" %s" % f) -os.chdir(original_dir) - -# Clean up -clean_work_files() - -log_and_print("|-------- Spark release audit complete --------|") diff --git a/dev/audit-release/blank_maven_build/pom.xml b/dev/audit-release/blank_maven_build/pom.xml deleted file mode 100644 index 02dd9046c9a4..000000000000 --- a/dev/audit-release/blank_maven_build/pom.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - - spark.audit - spark-audit - 4.0.0 - Spark Release Auditor - jar - 1.0 - - - Spray.cc repository - http://repo.spray.cc - - - Spark Staging Repo - ${spark.release.repository} - - - - - org.apache.spark - ${spark.module} - ${spark.version} - - - diff --git a/dev/audit-release/blank_sbt_build/build.sbt b/dev/audit-release/blank_sbt_build/build.sbt deleted file mode 100644 index 62815542e5bd..000000000000 --- a/dev/audit-release/blank_sbt_build/build.sbt +++ /dev/null @@ -1,30 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Spark Release Auditor" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Eclipse Paho Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", - "Maven Repository" at "http://repo1.maven.org/maven2/", - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/maven_app_core/input.txt b/dev/audit-release/maven_app_core/input.txt deleted file mode 100644 index 837b6f85ae97..000000000000 --- a/dev/audit-release/maven_app_core/input.txt +++ /dev/null @@ -1,8 +0,0 @@ -a -b -c -d -a -b -c -d diff --git a/dev/audit-release/maven_app_core/pom.xml b/dev/audit-release/maven_app_core/pom.xml deleted file mode 100644 index b51639682557..000000000000 --- a/dev/audit-release/maven_app_core/pom.xml +++ /dev/null @@ -1,52 +0,0 @@ - - - - - spark.audit - spark-audit - 4.0.0 - Simple Project - jar - 1.0 - - - Spray.cc repository - http://repo.spray.cc - - - Spark Staging Repo - ${spark.release.repository} - - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - - - - - - - maven-compiler-plugin - 3.1 - - - - diff --git a/dev/audit-release/maven_app_core/src/main/java/SimpleApp.java b/dev/audit-release/maven_app_core/src/main/java/SimpleApp.java deleted file mode 100644 index 5217689e7c09..000000000000 --- a/dev/audit-release/maven_app_core/src/main/java/SimpleApp.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; - -public class SimpleApp { - public static void main(String[] args) { - String logFile = "input.txt"; - JavaSparkContext sc = new JavaSparkContext("local", "Simple App"); - JavaRDD logData = sc.textFile(logFile).cache(); - - long numAs = logData.filter(new Function() { - public Boolean call(String s) { return s.contains("a"); } - }).count(); - - long numBs = logData.filter(new Function() { - public Boolean call(String s) { return s.contains("b"); } - }).count(); - - if (numAs != 2 || numBs != 2) { - System.out.println("Failed to parse log files with Spark"); - System.exit(-1); - } - System.out.println("Test succeeded"); - sc.stop(); - } -} diff --git a/dev/audit-release/sbt_app_core/build.sbt b/dev/audit-release/sbt_app_core/build.sbt deleted file mode 100644 index 291b1d6440ba..000000000000 --- a/dev/audit-release/sbt_app_core/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_core/input.txt b/dev/audit-release/sbt_app_core/input.txt deleted file mode 100644 index 837b6f85ae97..000000000000 --- a/dev/audit-release/sbt_app_core/input.txt +++ /dev/null @@ -1,8 +0,0 @@ -a -b -c -d -a -b -c -d diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala deleted file mode 100644 index 61d91c70e970..000000000000 --- a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Spark App").setMaster(master) - case None => new SparkConf().setAppName("Simple Spark App") - } - val logFile = "input.txt" - val sc = new SparkContext(conf) - val logData = sc.textFile(logFile, 2).cache() - val numAs = logData.filter(line => line.contains("a")).count() - val numBs = logData.filter(line => line.contains("b")).count() - if (numAs != 2 || numBs != 2) { - println("Failed to parse log files with Spark") - System.exit(-1) - } - - // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue - val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess - val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess - if (!foundConsole) { - println("Console sink not loaded via spark-core") - System.exit(-1) - } - if (foundGanglia) { - println("Ganglia sink was loaded via spark-core") - System.exit(-1) - } - - // Remove kinesis from default build due to ASL license issue - val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess - if (foundKinesis) { - println("Kinesis was loaded via spark-core") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/audit-release/sbt_app_ganglia/build.sbt b/dev/audit-release/sbt_app_ganglia/build.sbt deleted file mode 100644 index 6d9474acf5bb..000000000000 --- a/dev/audit-release/sbt_app_ganglia/build.sbt +++ /dev/null @@ -1,30 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Ganglia Test" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-ganglia-lgpl" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_graphx/build.sbt b/dev/audit-release/sbt_app_graphx/build.sbt deleted file mode 100644 index dd11245e67d4..000000000000 --- a/dev/audit-release/sbt_app_graphx/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-graphx" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala deleted file mode 100644 index 2f0b6ef9a567..000000000000 --- a/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.SparkContext._ -import org.apache.spark.graphx._ -import org.apache.spark.rdd.RDD - -object GraphXApp { - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple GraphX App").setMaster(master) - case None => new SparkConf().setAppName("Simple Graphx App") - } - val sc = new SparkContext(conf) - SparkContext.jarOfClass(this.getClass).foreach(sc.addJar) - - val users: RDD[(VertexId, (String, String))] = - sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), - (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), - (4L, ("peter", "student")))) - val relationships: RDD[Edge[String]] = - sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), - Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"), - Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague"))) - val defaultUser = ("John Doe", "Missing") - val graph = Graph(users, relationships, defaultUser) - // Notice that there is a user 0 (for which we have no information) connected to users - // 4 (peter) and 5 (franklin). - val triplets = graph.triplets.map(e => (e.srcAttr._1, e.dstAttr._1)).collect - if (!triplets.exists(_ == ("peter", "John Doe"))) { - println("Failed to run GraphX") - System.exit(-1) - } - println("Test succeeded") - } -} -// scalastyle:on println diff --git a/dev/audit-release/sbt_app_hive/build.sbt b/dev/audit-release/sbt_app_hive/build.sbt deleted file mode 100644 index c8824f2b15e5..000000000000 --- a/dev/audit-release/sbt_app_hive/build.sbt +++ /dev/null @@ -1,29 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-hive" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Maven Repository" at "http://repo1.maven.org/maven2/", - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_hive/data.txt b/dev/audit-release/sbt_app_hive/data.txt deleted file mode 100644 index 0229e67f51e0..000000000000 --- a/dev/audit-release/sbt_app_hive/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -0val_0 -1val_1 -2val_2 -3val_3 -4val_4 -5val_5 -6val_6 -7val_7 -9val_9 diff --git a/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala deleted file mode 100644 index 8cbfb9cd41b3..000000000000 --- a/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.{SparkConf, SparkContext, SparkSession} -import org.apache.spark.rdd.RDD - -case class Person(name: String, age: Int) - -object SparkSqlExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) - case None => new SparkConf().setAppName("Simple Sql App") - } - val sc = new SparkContext(conf) - val sparkSession = SparkSession.builder - .enableHiveSupport() - .getOrCreate() - - import sparkSession._ - sql("DROP TABLE IF EXISTS src") - sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") - sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") - val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() - results.foreach(println) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - test(results.size == 5, "Unexpected number of selected elements: " + results) - println("Test succeeded") - sc.stop() - } -} -// scalastyle:on println diff --git a/dev/audit-release/sbt_app_kinesis/build.sbt b/dev/audit-release/sbt_app_kinesis/build.sbt deleted file mode 100644 index 981bc7957b5e..000000000000 --- a/dev/audit-release/sbt_app_kinesis/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Kinesis Test" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala deleted file mode 100644 index adc25b57d6aa..000000000000 --- a/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess - if (!foundKinesis) { - println("Kinesis not loaded via kinesis-asl") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/audit-release/sbt_app_sql/build.sbt b/dev/audit-release/sbt_app_sql/build.sbt deleted file mode 100644 index 9116180f71a4..000000000000 --- a/dev/audit-release/sbt_app_sql/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-sql" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala deleted file mode 100644 index 10026314ef7a..000000000000 --- a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext - -case class Person(name: String, age: Int) - -object SparkSqlExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) - case None => new SparkConf().setAppName("Simple Sql App") - } - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - - import sqlContext.implicits._ - import sqlContext._ - - val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF() - people.createOrReplaceTempView("people") - val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect() - teenagerNames.foreach(println) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - test(teenagerNames.size == 7, "Unexpected number of selected elements: " + teenagerNames) - println("Test succeeded") - sc.stop() - } -} -// scalastyle:on println diff --git a/dev/audit-release/sbt_app_streaming/build.sbt b/dev/audit-release/sbt_app_streaming/build.sbt deleted file mode 100644 index cb369d516dd1..000000000000 --- a/dev/audit-release/sbt_app_streaming/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala deleted file mode 100644 index d6a074687f4a..000000000000 --- a/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming._ - -object SparkStreamingExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Streaming App").setMaster(master) - case None => new SparkConf().setAppName("Simple Streaming App") - } - val ssc = new StreamingContext(conf, Seconds(1)) - val seen = ListBuffer[RDD[Int]]() - - val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) - val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) - val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) - - val queue = Queue(rdd1, rdd2, rdd3) - val stream = ssc.queueStream(queue) - - stream.foreachRDD(rdd => seen += rdd) - ssc.start() - Thread.sleep(5000) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) - test(rddCounts.length == 3, "Did not collect three RDD's from stream") - test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") - - println("Test succeeded") - - ssc.stop() - } -} -// scalastyle:on println diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index feb3474cf1a7..5d536b7c245e 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -12,8 +12,8 @@ avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.11.2.jar -breeze_2.11-0.11.2.jar +breeze-macros_2.11-0.12.jar +breeze_2.11-0.12.jar calcite-avatica-1.2.0-incubating.jar calcite-core-1.2.0-incubating.jar calcite-linq4j-1.2.0-incubating.jar @@ -147,6 +147,7 @@ scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.2.jar scalap-2.11.8.jar +shapeless_2.11-2.0.0.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar snappy-0.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 3e960358f74b..d16f42a97d37 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -15,8 +15,8 @@ avro-mapred-1.7.7-hadoop2.jar base64-2.3.8.jar bcprov-jdk15on-1.51.jar bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.11.2.jar -breeze_2.11-0.11.2.jar +breeze-macros_2.11-0.12.jar +breeze_2.11-0.12.jar calcite-avatica-1.2.0-incubating.jar calcite-core-1.2.0-incubating.jar calcite-linq4j-1.2.0-incubating.jar @@ -154,6 +154,7 @@ scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.2.jar scalap-2.11.8.jar +shapeless_2.11-2.0.0.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar snappy-0.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 3fc14a6fbf4e..2e261cb9a543 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -15,8 +15,8 @@ avro-mapred-1.7.7-hadoop2.jar base64-2.3.8.jar bcprov-jdk15on-1.51.jar bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.11.2.jar -breeze_2.11-0.11.2.jar +breeze-macros_2.11-0.12.jar +breeze_2.11-0.12.jar calcite-avatica-1.2.0-incubating.jar calcite-core-1.2.0-incubating.jar calcite-linq4j-1.2.0-incubating.jar @@ -154,6 +154,7 @@ scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.2.jar scalap-2.11.8.jar +shapeless_2.11-2.0.0.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar snappy-0.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 909fbb885222..67f38f4c220d 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -19,8 +19,8 @@ avro-mapred-1.7.7-hadoop2.jar base64-2.3.8.jar bcprov-jdk15on-1.51.jar bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.11.2.jar -breeze_2.11-0.11.2.jar +breeze-macros_2.11-0.12.jar +breeze_2.11-0.12.jar calcite-avatica-1.2.0-incubating.jar calcite-core-1.2.0-incubating.jar calcite-linq4j-1.2.0-incubating.jar @@ -162,6 +162,7 @@ scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.2.jar scalap-2.11.8.jar +shapeless_2.11-2.0.0.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar snappy-0.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index b986a313a0c7..07583963d913 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -19,8 +19,8 @@ avro-mapred-1.7.7-hadoop2.jar base64-2.3.8.jar bcprov-jdk15on-1.51.jar bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.11.2.jar -breeze_2.11-0.11.2.jar +breeze-macros_2.11-0.12.jar +breeze_2.11-0.12.jar calcite-avatica-1.2.0-incubating.jar calcite-core-1.2.0-incubating.jar calcite-linq4j-1.2.0-incubating.jar @@ -163,6 +163,7 @@ scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.2.jar scalap-2.11.8.jar +shapeless_2.11-2.0.0.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar snappy-0.2.jar diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml index 3fd3ee2823f7..0c6b9b20a6e4 100644 --- a/docs/_data/menu-ml.yaml +++ b/docs/_data/menu-ml.yaml @@ -1,5 +1,5 @@ -- text: "Overview: estimators, transformers and pipelines" - url: ml-guide.html +- text: Pipelines + url: ml-pipeline.html - text: Extracting, transforming and selecting features url: ml-features.html - text: Classification and Regression @@ -8,5 +8,7 @@ url: ml-clustering.html - text: Collaborative filtering url: ml-collaborative-filtering.html +- text: Model selection and tuning + url: ml-tuning.html - text: Advanced topics url: ml-advanced.html diff --git a/docs/_includes/nav-left-wrapper-ml.html b/docs/_includes/nav-left-wrapper-ml.html index e2d7eda027c6..00ac6cc0dbc7 100644 --- a/docs/_includes/nav-left-wrapper-ml.html +++ b/docs/_includes/nav-left-wrapper-ml.html @@ -1,8 +1,8 @@
-

spark.ml package

+

MLlib: Main Guide

{% include nav-left.html nav=include.nav-ml %} -

spark.mllib package

+

MLlib: RDD-based API Guide

{% include nav-left.html nav=include.nav-mllib %}
\ No newline at end of file diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 2d0c3fd71293..d3bf082aa751 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -74,7 +74,7 @@
  • Spark Streaming
  • DataFrames, Datasets and SQL
  • Structured Streaming
  • -
  • MLlib (Machine Learning)
  • +
  • MLlib (Machine Learning)
  • GraphX (Graph Processing)
  • SparkR (R on Spark)
  • diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 306888801df2..6ea1d438f529 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -45,7 +45,15 @@ def render(context) @file = File.join(@code_dir, snippet_file) @lang = snippet_file.split('.').last - code = File.open(@file).read.encode("UTF-8") + begin + code = File.open(@file).read.encode("UTF-8") + rescue => e + # We need to explicitly exit on execptions here because Jekyll will silently swallow + # them, leading to silent build failures (see https://github.com/jekyll/jekyll/issues/5104) + puts(e) + puts(e.backtrace) + exit 1 + end code = select_lines(code) rendered_code = Pygments.highlight(code, :lexer => @lang) diff --git a/docs/configuration.md b/docs/configuration.md index 1e95b862441f..86a9bd97d334 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1188,7 +1188,9 @@ Apart from these, the following properties are also available, and may be useful spark.task.maxFailures 4 - Number of individual task failures before giving up on the job. + Number of failures of any particular task before giving up on the job. + The total number of failures spread across different tasks will not cause the job + to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1. diff --git a/docs/index.md b/docs/index.md index 7157afc411bc..0cb8803783a0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,7 +8,7 @@ description: Apache Spark SPARK_VERSION_SHORT documentation homepage Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. -It also supports a rich set of higher-level tools including [Spark SQL](sql-programming-guide.html) for SQL and structured data processing, [MLlib](mllib-guide.html) for machine learning, [GraphX](graphx-programming-guide.html) for graph processing, and [Spark Streaming](streaming-programming-guide.html). +It also supports a rich set of higher-level tools including [Spark SQL](sql-programming-guide.html) for SQL and structured data processing, [MLlib](ml-guide.html) for machine learning, [GraphX](graphx-programming-guide.html) for graph processing, and [Spark Streaming](streaming-programming-guide.html). # Downloading @@ -87,7 +87,7 @@ options for deployment: * Modules built on Spark: * [Spark Streaming](streaming-programming-guide.html): processing real-time data streams * [Spark SQL, Datasets, and DataFrames](sql-programming-guide.html): support for structured data and relational queries - * [MLlib](mllib-guide.html): built-in machine learning library + * [MLlib](ml-guide.html): built-in machine learning library * [GraphX](graphx-programming-guide.html): Spark's new API for graph processing **API Docs:** diff --git a/docs/job-scheduling.md b/docs/job-scheduling.md index 40b6cd99cc27..807944f20a78 100644 --- a/docs/job-scheduling.md +++ b/docs/job-scheduling.md @@ -83,18 +83,7 @@ In Mesos coarse-grained mode, run `$SPARK_HOME/sbin/start-mesos-shuffle-service. slave nodes with `spark.shuffle.service.enabled` set to `true`. For instance, you may do so through Marathon. -In YARN mode, start the shuffle service on each `NodeManager` as follows: - -1. Build Spark with the [YARN profile](building-spark.html). Skip this step if you are using a -pre-packaged distribution. -2. Locate the `spark--yarn-shuffle.jar`. This should be under -`$SPARK_HOME/common/network-yarn/target/scala-` if you are building Spark yourself, and under -`lib` if you are using a distribution. -2. Add this jar to the classpath of all `NodeManager`s in your cluster. -3. In the `yarn-site.xml` on each node, add `spark_shuffle` to `yarn.nodemanager.aux-services`, -then set `yarn.nodemanager.aux-services.spark_shuffle.class` to -`org.apache.spark.network.yarn.YarnShuffleService`. -4. Restart all `NodeManager`s in your cluster. +In YARN mode, follow the instructions [here](running-on-yarn.html#configuring-the-external-shuffle-service). All other relevant configurations are optional and under the `spark.dynamicAllocation.*` and `spark.shuffle.service.*` namespaces. For more detail, see the diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md index 1c5f844b08e4..f5804fdeee5a 100644 --- a/docs/ml-advanced.md +++ b/docs/ml-advanced.md @@ -1,7 +1,7 @@ --- layout: global -title: Advanced topics - spark.ml -displayTitle: Advanced topics - spark.ml +title: Advanced topics +displayTitle: Advanced topics --- * Table of contents diff --git a/docs/ml-ann.md b/docs/ml-ann.md index c2d9bd200f62..7c460c4af6f4 100644 --- a/docs/ml-ann.md +++ b/docs/ml-ann.md @@ -1,7 +1,7 @@ --- layout: global -title: Multilayer perceptron classifier - spark.ml -displayTitle: Multilayer perceptron classifier - spark.ml +title: Multilayer perceptron classifier +displayTitle: Multilayer perceptron classifier --- > This section has been moved into the diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index 3d6106b532ff..7c2437eacde3 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -1,7 +1,7 @@ --- layout: global -title: Classification and regression - spark.ml -displayTitle: Classification and regression - spark.ml +title: Classification and regression +displayTitle: Classification and regression --- @@ -22,37 +22,14 @@ displayTitle: Classification and regression - spark.ml \newcommand{\zero}{\mathbf{0}} \]` +This page covers algorithms for Classification and Regression. It also includes sections +discussing specific classes of algorithms, such as linear methods, trees, and ensembles. + **Table of Contents** * This will become a table of contents (this text will be scraped). {:toc} -In `spark.ml`, we implement popular linear methods such as logistic -regression and linear least squares with $L_1$ or $L_2$ regularization. -Refer to [the linear methods in mllib](mllib-linear-methods.html) for -details about implementation and tuning. We also include a DataFrame API for [Elastic -net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid -of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization -and variable selection via the elastic -net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). -Mathematically, it is defined as a convex combination of the $L_1$ and -the $L_2$ regularization terms: -`\[ -\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right) , \alpha \in [0, 1], \lambda \geq 0 -\]` -By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$ -regularization as special cases. For example, if a [linear -regression](https://en.wikipedia.org/wiki/Linear_regression) model is -trained with the elastic net parameter $\alpha$ set to $1$, it is -equivalent to a -[Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. -On the other hand, if $\alpha$ is set to $0$, the trained model reduces -to a [ridge -regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. -We implement Pipelines API for both linear regression and logistic -regression with elastic net regularization. - - # Classification ## Logistic regression @@ -760,7 +737,34 @@ Refer to the [`IsotonicRegression` Python docs](api/python/pyspark.ml.html#pyspa +# Linear methods + +We implement popular linear methods such as logistic +regression and linear least squares with $L_1$ or $L_2$ regularization. +Refer to [the linear methods guide for the RDD-based API](mllib-linear-methods.html) for +details about implementation and tuning; this information is still relevant. +We also include a DataFrame API for [Elastic +net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid +of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization +and variable selection via the elastic +net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). +Mathematically, it is defined as a convex combination of the $L_1$ and +the $L_2$ regularization terms: +`\[ +\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right) , \alpha \in [0, 1], \lambda \geq 0 +\]` +By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$ +regularization as special cases. For example, if a [linear +regression](https://en.wikipedia.org/wiki/Linear_regression) model is +trained with the elastic net parameter $\alpha$ set to $1$, it is +equivalent to a +[Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. +On the other hand, if $\alpha$ is set to $0$, the trained model reduces +to a [ridge +regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. +We implement Pipelines API for both linear regression and logistic +regression with elastic net regularization. # Decision trees diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index 8656eb4001f4..8a0a61cb595e 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -1,10 +1,12 @@ --- layout: global -title: Clustering - spark.ml -displayTitle: Clustering - spark.ml +title: Clustering +displayTitle: Clustering --- -In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). +This page describes clustering algorithms in MLlib. +The [guide for clustering in the RDD-based API](mllib-clustering.html) also has relevant information +about these algorithms. **Table of Contents** diff --git a/docs/ml-collaborative-filtering.md b/docs/ml-collaborative-filtering.md index 8bd75f3bcf7a..1d02d6933cb4 100644 --- a/docs/ml-collaborative-filtering.md +++ b/docs/ml-collaborative-filtering.md @@ -1,7 +1,7 @@ --- layout: global -title: Collaborative Filtering - spark.ml -displayTitle: Collaborative Filtering - spark.ml +title: Collaborative Filtering +displayTitle: Collaborative Filtering --- * Table of contents diff --git a/docs/ml-decision-tree.md b/docs/ml-decision-tree.md index a721d55bc675..5e1eeb95e472 100644 --- a/docs/ml-decision-tree.md +++ b/docs/ml-decision-tree.md @@ -1,7 +1,7 @@ --- layout: global -title: Decision trees - spark.ml -displayTitle: Decision trees - spark.ml +title: Decision trees +displayTitle: Decision trees --- > This section has been moved into the diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 303773e8038f..97f1bdc803d0 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -1,7 +1,7 @@ --- layout: global -title: Tree ensemble methods - spark.ml -displayTitle: Tree ensemble methods - spark.ml +title: Tree ensemble methods +displayTitle: Tree ensemble methods --- > This section has been moved into the diff --git a/docs/ml-features.md b/docs/ml-features.md index 88fd291b4be5..602011484548 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1,7 +1,7 @@ --- layout: global -title: Extracting, transforming and selecting features - spark.ml -displayTitle: Extracting, transforming and selecting features - spark.ml +title: Extracting, transforming and selecting features +displayTitle: Extracting, transforming and selecting features --- This section covers algorithms for working with features, roughly divided into these groups: @@ -216,7 +216,7 @@ for more details on the API. [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more advanced tokenization based on regular expression (regex) matching. - By default, the parameter "pattern" (regex, default: \\s+) is used as delimiters to split the input text. + By default, the parameter "pattern" (regex, default: `"\\s+"`) is used as delimiters to split the input text. Alternatively, users can set parameter "gaps" to false indicating the regex "pattern" denotes "tokens" rather than splitting gaps, and find all matching occurrences as the tokenization result. @@ -815,7 +815,7 @@ The rescaled value for a feature E is calculated as, `\begin{equation} Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min \end{equation}` -For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)` +For the case `$E_{max} == E_{min}$`, `$Rescaled(e_i) = 0.5 * (max + min)$` Note that since zero values will probably be transformed to non-zero values, output of the transformer will be `DenseVector` even for sparse input. diff --git a/docs/ml-guide.md b/docs/ml-guide.md index dae86d84804d..5abec63b7ab4 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -1,323 +1,214 @@ --- layout: global -title: "Overview: estimators, transformers and pipelines - spark.ml" -displayTitle: "Overview: estimators, transformers and pipelines - spark.ml" +title: "MLlib: Main Guide" +displayTitle: "Machine Learning Library (MLlib) Guide" --- +MLlib is Spark's machine learning (ML) library. +Its goal is to make practical machine learning scalable and easy. +At a high level, it provides tools such as: -`\[ -\newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} -\newcommand{\x}{\mathbf{x}} -\newcommand{\y}{\mathbf{y}} -\newcommand{\wv}{\mathbf{w}} -\newcommand{\av}{\mathbf{\alpha}} -\newcommand{\bv}{\mathbf{b}} -\newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} -\newcommand{\zero}{\mathbf{0}} -\]` +* ML Algorithms: common learning algorithms such as classification, regression, clustering, and collaborative filtering +* Featurization: feature extraction, transformation, dimensionality reduction, and selection +* Pipelines: tools for constructing, evaluating, and tuning ML Pipelines +* Persistence: saving and load algorithms, models, and Pipelines +* Utilities: linear algebra, statistics, data handling, etc. +# Announcement: DataFrame-based API is primary API -The `spark.ml` package aims to provide a uniform set of high-level APIs built on top of -[DataFrames](sql-programming-guide.html#dataframes) that help users create and tune practical -machine learning pipelines. -See the [algorithm guides](#algorithm-guides) section below for guides on sub-packages of -`spark.ml`, including feature transformers unique to the Pipelines API, ensembles, and more. +**The MLlib RDD-based API is now in maintenance mode.** -**Table of contents** +As of Spark 2.0, the [RDD](programming-guide.html#resilient-distributed-datasets-rdds)-based APIs in the `spark.mllib` package have entered maintenance mode. +The primary Machine Learning API for Spark is now the [DataFrame](sql-programming-guide.html)-based API in the `spark.ml` package. -* This will become a table of contents (this text will be scraped). -{:toc} +*What are the implications?* +* MLlib will still support the RDD-based API in `spark.mllib` with bug fixes. +* MLlib will not add new features to the RDD-based API. +* In the Spark 2.x releases, MLlib will add features to the DataFrames-based API to reach feature parity with the RDD-based API. +* After reaching feature parity (roughly estimated for Spark 2.2), the RDD-based API will be deprecated. +* The RDD-based API is expected to be removed in Spark 3.0. -# Main concepts in Pipelines +*Why is MLlib switching to the DataFrame-based API?* -Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple -algorithms into a single pipeline, or workflow. -This section covers the key concepts introduced by the Spark ML API, where the pipeline concept is -mostly inspired by the [scikit-learn](http://scikit-learn.org/) project. +* DataFrames provide a more user-friendly API than RDDs. The many benefits of DataFrames include Spark Datasources, SQL/DataFrame queries, Tungsten and Catalyst optimizations, and uniform APIs across languages. +* The DataFrame-based API for MLlib provides a uniform API across ML algorithms and across multiple languages. +* DataFrames facilitate practical ML Pipelines, particularly feature transformations. See the [Pipelines guide](ml-pipeline.md) for details. -* **[`DataFrame`](ml-guide.html#dataframe)**: Spark ML uses `DataFrame` from Spark SQL as an ML - dataset, which can hold a variety of data types. - E.g., a `DataFrame` could have different columns storing text, feature vectors, true labels, and predictions. +# Dependencies -* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. -E.g., an ML model is a `Transformer` which transforms a `DataFrame` with features into a `DataFrame` with predictions. +MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), which depends on +[netlib-java](https://github.com/fommil/netlib-java) for optimised numerical processing. +If native libraries[^1] are not available at runtime, you will see a warning message and a pure JVM +implementation will be used instead. -* **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. -E.g., a learning algorithm is an `Estimator` which trains on a `DataFrame` and produces a model. +Due to licensing issues with runtime proprietary binaries, we do not include `netlib-java`'s native +proxies by default. +To configure `netlib-java` / Breeze to use system optimised binaries, include +`com.github.fommil.netlib:all:1.1.2` (or build Spark with `-Pnetlib-lgpl`) as a dependency of your +project and read the [netlib-java](https://github.com/fommil/netlib-java) documentation for your +platform's additional installation instructions. -* **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. +To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 or newer. -* **[`Parameter`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. +[^1]: To learn more about the benefits and background of system optimised natives, you may wish to + watch Sam Halliday's ScalaX talk on [High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/). -## DataFrame +# Migration guide -Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data. -Spark ML adopts the `DataFrame` from Spark SQL in order to support a variety of data types. +MLlib is under active development. +The APIs marked `Experimental`/`DeveloperApi` may change in future releases, +and the migration guide below will explain all changes between releases. -`DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. -In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](mllib-data-types.html#local-vector) types. +## From 1.6 to 2.0 -A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. +### Breaking changes -Columns in a `DataFrame` are named. The code examples below use names such as "text," "features," and "label." +There were several breaking changes in Spark 2.0, which are outlined below. -## Pipeline components +**Linear algebra classes for DataFrame-based APIs** -### Transformers +Spark's linear algebra dependencies were moved to a new project, `mllib-local` +(see [SPARK-13944](https://issues.apache.org/jira/browse/SPARK-13944)). +As part of this change, the linear algebra classes were copied to a new package, `spark.ml.linalg`. +The DataFrame-based APIs in `spark.ml` now depend on the `spark.ml.linalg` classes, +leading to a few breaking changes, predominantly in various model classes +(see [SPARK-14810](https://issues.apache.org/jira/browse/SPARK-14810) for a full list). -A `Transformer` is an abstraction that includes feature transformers and learned models. -Technically, a `Transformer` implements a method `transform()`, which converts one `DataFrame` into -another, generally by appending one or more columns. -For example: +**Note:** the RDD-based APIs in `spark.mllib` continue to depend on the previous package `spark.mllib.linalg`. -* A feature transformer might take a `DataFrame`, read a column (e.g., text), map it into a new - column (e.g., feature vectors), and output a new `DataFrame` with the mapped column appended. -* A learning model might take a `DataFrame`, read the column containing feature vectors, predict the - label for each feature vector, and output a new `DataFrame` with predicted labels appended as a - column. +_Converting vectors and matrices_ -### Estimators +While most pipeline components support backward compatibility for loading, +some existing `DataFrames` and pipelines in Spark versions prior to 2.0, that contain vector or matrix +columns, may need to be migrated to the new `spark.ml` vector and matrix types. +Utilities for converting `DataFrame` columns from `spark.mllib.linalg` to `spark.ml.linalg` types +(and vice versa) can be found in `spark.mllib.util.MLUtils`. -An `Estimator` abstracts the concept of a learning algorithm or any algorithm that fits or trains on -data. -Technically, an `Estimator` implements a method `fit()`, which accepts a `DataFrame` and produces a -`Model`, which is a `Transformer`. -For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling -`fit()` trains a `LogisticRegressionModel`, which is a `Model` and hence a `Transformer`. - -### Properties of pipeline components - -`Transformer.transform()`s and `Estimator.fit()`s are both stateless. In the future, stateful algorithms may be supported via alternative concepts. - -Each instance of a `Transformer` or `Estimator` has a unique ID, which is useful in specifying parameters (discussed below). - -## Pipeline - -In machine learning, it is common to run a sequence of algorithms to process and learn from data. -E.g., a simple text document processing workflow might include several stages: - -* Split each document's text into words. -* Convert each document's words into a numerical feature vector. -* Learn a prediction model using the feature vectors and labels. - -Spark ML represents such a workflow as a `Pipeline`, which consists of a sequence of -`PipelineStage`s (`Transformer`s and `Estimator`s) to be run in a specific order. -We will use this simple workflow as a running example in this section. - -### How it works - -A `Pipeline` is specified as a sequence of stages, and each stage is either a `Transformer` or an `Estimator`. -These stages are run in order, and the input `DataFrame` is transformed as it passes through each stage. -For `Transformer` stages, the `transform()` method is called on the `DataFrame`. -For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the `DataFrame`. - -We illustrate this for the simple text document workflow. The figure below is for the *training time* usage of a `Pipeline`. - -

    - Spark ML Pipeline Example -

    - -Above, the top row represents a `Pipeline` with three stages. -The first two (`Tokenizer` and `HashingTF`) are `Transformer`s (blue), and the third (`LogisticRegression`) is an `Estimator` (red). -The bottom row represents data flowing through the pipeline, where cylinders indicate `DataFrame`s. -The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw text documents and labels. -The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`. -The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`. -Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`. -If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` -method on the `DataFrame` before passing the `DataFrame` to the next stage. - -A `Pipeline` is an `Estimator`. -Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel`, which is a -`Transformer`. -This `PipelineModel` is used at *test time*; the figure below illustrates this usage. - -

    - Spark ML PipelineModel Example -

    - -In the figure above, the `PipelineModel` has the same number of stages as the original `Pipeline`, but all `Estimator`s in the original `Pipeline` have become `Transformer`s. -When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed -through the fitted pipeline in order. -Each stage's `transform()` method updates the dataset and passes it to the next stage. - -`Pipeline`s and `PipelineModel`s help to ensure that training and test data go through identical feature processing steps. - -### Details - -*DAG `Pipeline`s*: A `Pipeline`'s stages are specified as an ordered array. The examples given here are all for linear `Pipeline`s, i.e., `Pipeline`s in which each stage uses data produced by the previous stage. It is possible to create non-linear `Pipeline`s as long as the data flow graph forms a Directed Acyclic Graph (DAG). This graph is currently specified implicitly based on the input and output column names of each stage (generally specified as parameters). If the `Pipeline` forms a DAG, then the stages must be specified in topological order. - -*Runtime checking*: Since `Pipeline`s can operate on `DataFrame`s with varied types, they cannot use -compile-time type checking. -`Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. -This type checking is done using the `DataFrame` *schema*, a description of the data types of columns in the `DataFrame`. - -*Unique Pipeline stages*: A `Pipeline`'s stages should be unique instances. E.g., the same instance -`myHashingTF` should not be inserted into the `Pipeline` twice since `Pipeline` stages must have -unique IDs. However, different instances `myHashingTF1` and `myHashingTF2` (both of type `HashingTF`) -can be put into the same `Pipeline` since different instances will be created with different IDs. - -## Parameters - -Spark ML `Estimator`s and `Transformer`s use a uniform API for specifying parameters. - -A `Param` is a named parameter with self-contained documentation. -A `ParamMap` is a set of (parameter, value) pairs. - -There are two main ways to pass parameters to an algorithm: - -1. Set parameters for an instance. E.g., if `lr` is an instance of `LogisticRegression`, one could - call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations. - This API resembles the API used in `spark.mllib` package. -2. Pass a `ParamMap` to `fit()` or `transform()`. Any parameters in the `ParamMap` will override parameters previously specified via setter methods. - -Parameters belong to specific instances of `Estimator`s and `Transformer`s. -For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`. -This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`. - -## Saving and Loading Pipelines - -Often times it is worth it to save a model or a pipeline to disk for later use. In Spark 1.6, a model import/export functionality was added to the Pipeline API. Most basic transformers are supported as well as some of the more basic ML models. Please refer to the algorithm's API documentation to see if saving and loading is supported. - -# Code examples - -This section gives code examples illustrating the functionality discussed above. -For more info, please refer to the API documentation -([Scala](api/scala/index.html#org.apache.spark.ml.package), -[Java](api/java/org/apache/spark/ml/package-summary.html), -and [Python](api/python/pyspark.ml.html)). -Some Spark ML algorithms are wrappers for `spark.mllib` algorithms, and the -[MLlib programming guide](mllib-guide.html) has details on specific algorithms. - -## Example: Estimator, Transformer, and Param - -This example covers the concepts of `Estimator`, `Transformer`, and `Param`. +There are also utility methods available for converting single instances of +vectors and matrices. Use the `asML` method on a `mllib.linalg.Vector` / `mllib.linalg.Matrix` +for converting to `ml.linalg` types, and +`mllib.linalg.Vectors.fromML` / `mllib.linalg.Matrices.fromML` +for converting to `mllib.linalg` types.
    +
    -
    -{% include_example scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala %} -
    +{% highlight scala %} +import org.apache.spark.mllib.util.MLUtils -
    -{% include_example java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java %} -
    +// convert DataFrame columns +val convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF) +val convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF) +// convert a single vector or matrix +val mlVec: org.apache.spark.ml.linalg.Vector = mllibVec.asML +val mlMat: org.apache.spark.ml.linalg.Matrix = mllibMat.asML +{% endhighlight %} -
    -{% include_example python/ml/estimator_transformer_param_example.py %} -
    - -
    - -## Example: Pipeline - -This example follows the simple text document `Pipeline` illustrated in the figures above. - -
    - -
    -{% include_example scala/org/apache/spark/examples/ml/PipelineExample.scala %} -
    - -
    -{% include_example java/org/apache/spark/examples/ml/JavaPipelineExample.java %} -
    - -
    -{% include_example python/ml/pipeline_example.py %} -
    - -
    - -## Example: model selection via cross-validation - -An important task in ML is *model selection*, or using data to find the best model or parameters for a given task. This is also called *tuning*. -`Pipeline`s facilitate model selection by making it easy to tune an entire `Pipeline` at once, rather than tuning each element in the `Pipeline` separately. - -Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class, which takes an `Estimator`, a set of `ParamMap`s, and an [`Evaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.Evaluator). -`CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets; e.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. -`CrossValidator` iterates through the set of `ParamMap`s. For each `ParamMap`, it trains the given `Estimator` and evaluates it using the given `Evaluator`. - -The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator) -for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator) -for binary data, or a [`MulticlassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator) -for multiclass problems. The default metric used to choose the best `ParamMap` can be overridden by the `setMetricName` -method in each of these evaluators. - -The `ParamMap` which produces the best evaluation metric (averaged over the `$k$` folds) is selected as the best model. -`CrossValidator` finally fits the `Estimator` using the best `ParamMap` and the entire dataset. - -The following example demonstrates using `CrossValidator` to select from a grid of parameters. -To help construct the parameter grid, we use the [`ParamGridBuilder`](api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) utility. - -Note that cross-validation over a grid of parameters is expensive. -E.g., in the example below, the parameter grid has 3 values for `hashingTF.numFeatures` and 2 values for `lr.regParam`, and `CrossValidator` uses 2 folds. This multiplies out to `$(3 \times 2) \times 2 = 12$` different models being trained. -In realistic settings, it can be common to try many more parameters and use more folds (`$k=3$` and `$k=10$` are common). -In other words, using `CrossValidator` can be very expensive. -However, it is also a well-established method for choosing parameters which is more statistically sound than heuristic hand-tuning. - -
    - -
    -{% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala %} -
    - -
    -{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java %} -
    - -
    - -{% include_example python/ml/cross_validator.py %} -
    - -
    - -## Example: model selection via train validation split -In addition to `CrossValidator` Spark also offers `TrainValidationSplit` for hyper-parameter tuning. -`TrainValidationSplit` only evaluates each combination of parameters once, as opposed to k times in - the case of `CrossValidator`. It is therefore less expensive, - but will not produce as reliable results when the training dataset is not sufficiently large. - -`TrainValidationSplit` takes an `Estimator`, a set of `ParamMap`s provided in the `estimatorParamMaps` parameter, -and an `Evaluator`. -It begins by splitting the dataset into two parts using the `trainRatio` parameter -which are used as separate training and test datasets. For example with `$trainRatio=0.75$` (default), -`TrainValidationSplit` will generate a training and test dataset pair where 75% of the data is used for training and 25% for validation. -Similar to `CrossValidator`, `TrainValidationSplit` also iterates through the set of `ParamMap`s. -For each combination of parameters, it trains the given `Estimator` and evaluates it using the given `Evaluator`. -The `ParamMap` which produces the best evaluation metric is selected as the best option. -`TrainValidationSplit` finally fits the `Estimator` using the best `ParamMap` and the entire dataset. - -
    - -
    -{% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala %} +Refer to the [`MLUtils` Scala docs](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) for further detail.
    -{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java %} -
    -
    -{% include_example python/ml/train_validation_split.py %} -
    +{% highlight java %} +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.Dataset; + +// convert DataFrame columns +Dataset convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF); +Dataset convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF); +// convert a single vector or matrix +org.apache.spark.ml.linalg.Vector mlVec = mllibVec.asML(); +org.apache.spark.ml.linalg.Matrix mlMat = mllibMat.asML(); +{% endhighlight %} + +Refer to the [`MLUtils` Java docs](api/java/org/apache/spark/mllib/util/MLUtils.html) for further detail. +
    + +
    + +{% highlight python %} +from pyspark.mllib.util import MLUtils + +# convert DataFrame columns +convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF) +convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF) +# convert a single vector or matrix +mlVec = mllibVec.asML() +mlMat = mllibMat.asML() +{% endhighlight %} + +Refer to the [`MLUtils` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils) for further detail. +
    +
    + +**Deprecated methods removed** + +Several deprecated methods were removed in the `spark.mllib` and `spark.ml` packages: + +* `setScoreCol` in `ml.evaluation.BinaryClassificationEvaluator` +* `weights` in `LinearRegression` and `LogisticRegression` in `spark.ml` +* `setMaxNumIterations` in `mllib.optimization.LBFGS` (marked as `DeveloperApi`) +* `treeReduce` and `treeAggregate` in `mllib.rdd.RDDFunctions` (these functions are available on `RDD`s directly, and were marked as `DeveloperApi`) +* `defaultStategy` in `mllib.tree.configuration.Strategy` +* `build` in `mllib.tree.Node` +* libsvm loaders for multiclass and load/save labeledData methods in `mllib.util.MLUtils` + +A full list of breaking changes can be found at [SPARK-14810](https://issues.apache.org/jira/browse/SPARK-14810). + +### Deprecations and changes of behavior + +**Deprecations** + +Deprecations in the `spark.mllib` and `spark.ml` packages include: + +* [SPARK-14984](https://issues.apache.org/jira/browse/SPARK-14984): + In `spark.ml.regression.LinearRegressionSummary`, the `model` field has been deprecated. +* [SPARK-13784](https://issues.apache.org/jira/browse/SPARK-13784): + In `spark.ml.regression.RandomForestRegressionModel` and `spark.ml.classification.RandomForestClassificationModel`, + the `numTrees` parameter has been deprecated in favor of `getNumTrees` method. +* [SPARK-13761](https://issues.apache.org/jira/browse/SPARK-13761): + In `spark.ml.param.Params`, the `validateParams` method has been deprecated. + We move all functionality in overridden methods to the corresponding `transformSchema`. +* [SPARK-14829](https://issues.apache.org/jira/browse/SPARK-14829): + In `spark.mllib` package, `LinearRegressionWithSGD`, `LassoWithSGD`, `RidgeRegressionWithSGD` and `LogisticRegressionWithSGD` have been deprecated. + We encourage users to use `spark.ml.regression.LinearRegresson` and `spark.ml.classification.LogisticRegresson`. +* [SPARK-14900](https://issues.apache.org/jira/browse/SPARK-14900): + In `spark.mllib.evaluation.MulticlassMetrics`, the parameters `precision`, `recall` and `fMeasure` have been deprecated in favor of `accuracy`. +* [SPARK-15644](https://issues.apache.org/jira/browse/SPARK-15644): + In `spark.ml.util.MLReader` and `spark.ml.util.MLWriter`, the `context` method has been deprecated in favor of `session`. +* In `spark.ml.feature.ChiSqSelectorModel`, the `setLabelCol` method has been deprecated since it was not used by `ChiSqSelectorModel`. + +**Changes of behavior** + +Changes of behavior in the `spark.mllib` and `spark.ml` packages include: + +* [SPARK-7780](https://issues.apache.org/jira/browse/SPARK-7780): + `spark.mllib.classification.LogisticRegressionWithLBFGS` directly calls `spark.ml.classification.LogisticRegresson` for binary classification now. + This will introduce the following behavior changes for `spark.mllib.classification.LogisticRegressionWithLBFGS`: + * The intercept will not be regularized when training binary classification model with L1/L2 Updater. + * If users set without regularization, training with or without feature scaling will return the same solution by the same convergence rate. +* [SPARK-13429](https://issues.apache.org/jira/browse/SPARK-13429): + In order to provide better and consistent result with `spark.ml.classification.LogisticRegresson`, + the default value of `spark.mllib.classification.LogisticRegressionWithLBFGS`: `convergenceTol` has been changed from 1E-4 to 1E-6. +* [SPARK-12363](https://issues.apache.org/jira/browse/SPARK-12363): + Fix a bug of `PowerIterationClustering` which will likely change its result. +* [SPARK-13048](https://issues.apache.org/jira/browse/SPARK-13048): + `LDA` using the `EM` optimizer will keep the last checkpoint by default, if checkpointing is being used. +* [SPARK-12153](https://issues.apache.org/jira/browse/SPARK-12153): + `Word2Vec` now respects sentence boundaries. Previously, it did not handle them correctly. +* [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574): + `HashingTF` uses `MurmurHash3` as default hash algorithm in both `spark.ml` and `spark.mllib`. +* [SPARK-14768](https://issues.apache.org/jira/browse/SPARK-14768): + The `expectedType` argument for PySpark `Param` was removed. +* [SPARK-14931](https://issues.apache.org/jira/browse/SPARK-14931): + Some default `Param` values, which were mismatched between pipelines in Scala and Python, have been changed. +* [SPARK-13600](https://issues.apache.org/jira/browse/SPARK-13600): + `QuantileDiscretizer` now uses `spark.sql.DataFrameStatFunctions.approxQuantile` to find splits (previously used custom sampling logic). + The output buckets will differ for same input data and params. + +## Previous Spark versions + +Earlier migration guides are archived [on this page](ml-migration-guides.html). - +--- diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index a8754835cab9..eb39173505ae 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -1,7 +1,7 @@ --- layout: global -title: Linear methods - spark.ml -displayTitle: Linear methods - spark.ml +title: Linear methods +displayTitle: Linear methods --- > This section has been moved into the diff --git a/docs/ml-migration-guides.md b/docs/ml-migration-guides.md new file mode 100644 index 000000000000..82bf9d7760fb --- /dev/null +++ b/docs/ml-migration-guides.md @@ -0,0 +1,159 @@ +--- +layout: global +title: Old Migration Guides - MLlib +displayTitle: Old Migration Guides - MLlib +description: MLlib migration guides from before Spark SPARK_VERSION_SHORT +--- + +The migration guide for the current Spark version is kept on the [MLlib Guide main page](ml-guide.html#migration-guide). + +## From 1.5 to 1.6 + +There are no breaking API changes in the `spark.mllib` or `spark.ml` packages, but there are +deprecations and changes of behavior. + +Deprecations: + +* [SPARK-11358](https://issues.apache.org/jira/browse/SPARK-11358): + In `spark.mllib.clustering.KMeans`, the `runs` parameter has been deprecated. +* [SPARK-10592](https://issues.apache.org/jira/browse/SPARK-10592): + In `spark.ml.classification.LogisticRegressionModel` and + `spark.ml.regression.LinearRegressionModel`, the `weights` field has been deprecated in favor of + the new name `coefficients`. This helps disambiguate from instance (row) "weights" given to + algorithms. + +Changes of behavior: + +* [SPARK-7770](https://issues.apache.org/jira/browse/SPARK-7770): + `spark.mllib.tree.GradientBoostedTrees`: `validationTol` has changed semantics in 1.6. + Previously, it was a threshold for absolute change in error. Now, it resembles the behavior of + `GradientDescent`'s `convergenceTol`: For large errors, it uses relative error (relative to the + previous error); for small errors (`< 0.01`), it uses absolute error. +* [SPARK-11069](https://issues.apache.org/jira/browse/SPARK-11069): + `spark.ml.feature.RegexTokenizer`: Previously, it did not convert strings to lowercase before + tokenizing. Now, it converts to lowercase by default, with an option not to. This matches the + behavior of the simpler `Tokenizer` transformer. + +## From 1.4 to 1.5 + +In the `spark.mllib` package, there are no breaking API changes but several behavior changes: + +* [SPARK-9005](https://issues.apache.org/jira/browse/SPARK-9005): + `RegressionMetrics.explainedVariance` returns the average regression sum of squares. +* [SPARK-8600](https://issues.apache.org/jira/browse/SPARK-8600): `NaiveBayesModel.labels` become + sorted. +* [SPARK-3382](https://issues.apache.org/jira/browse/SPARK-3382): `GradientDescent` has a default + convergence tolerance `1e-3`, and hence iterations might end earlier than 1.4. + +In the `spark.ml` package, there exists one breaking API change and one behavior change: + +* [SPARK-9268](https://issues.apache.org/jira/browse/SPARK-9268): Java's varargs support is removed + from `Params.setDefault` due to a + [Scala compiler bug](https://issues.scala-lang.org/browse/SI-9013). +* [SPARK-10097](https://issues.apache.org/jira/browse/SPARK-10097): `Evaluator.isLargerBetter` is + added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4. + +## From 1.3 to 1.4 + +In the `spark.mllib` package, there were several breaking changes, but all in `DeveloperApi` or `Experimental` APIs: + +* Gradient-Boosted Trees + * *(Breaking change)* The signature of the [`Loss.gradient`](api/scala/index.html#org.apache.spark.mllib.tree.loss.Loss) method was changed. This is only an issues for users who wrote their own losses for GBTs. + * *(Breaking change)* The `apply` and `copy` methods for the case class [`BoostingStrategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.BoostingStrategy) have been changed because of a modification to the case class fields. This could be an issue for users who use `BoostingStrategy` to set GBT parameters. +* *(Breaking change)* The return value of [`LDA.run`](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) has changed. It now returns an abstract class `LDAModel` instead of the concrete class `DistributedLDAModel`. The object of type `LDAModel` can still be cast to the appropriate concrete type, which depends on the optimization algorithm. + +In the `spark.ml` package, several major API changes occurred, including: + +* `Param` and other APIs for specifying parameters +* `uid` unique IDs for Pipeline components +* Reorganization of certain classes + +Since the `spark.ml` API was an alpha component in Spark 1.3, we do not list all changes here. +However, since 1.4 `spark.ml` is no longer an alpha component, we will provide details on any API +changes for future releases. + +## From 1.2 to 1.3 + +In the `spark.mllib` package, there were several breaking changes. The first change (in `ALS`) is the only one in a component not marked as Alpha or Experimental. + +* *(Breaking change)* In [`ALS`](api/scala/index.html#org.apache.spark.mllib.recommendation.ALS), the extraneous method `solveLeastSquares` has been removed. The `DeveloperApi` method `analyzeBlocks` was also removed. +* *(Breaking change)* [`StandardScalerModel`](api/scala/index.html#org.apache.spark.mllib.feature.StandardScalerModel) remains an Alpha component. In it, the `variance` method has been replaced with the `std` method. To compute the column variance values returned by the original `variance` method, simply square the standard deviation values returned by `std`. +* *(Breaking change)* [`StreamingLinearRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD) remains an Experimental component. In it, there were two changes: + * The constructor taking arguments was removed in favor of a builder pattern using the default constructor plus parameter setter methods. + * Variable `model` is no longer public. +* *(Breaking change)* [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) remains an Experimental component. In it and its associated classes, there were several changes: + * In `DecisionTree`, the deprecated class method `train` has been removed. (The object/static `train` methods remain.) + * In `Strategy`, the `checkpointDir` parameter has been removed. Checkpointing is still supported, but the checkpoint directory must be set before calling tree and tree ensemble training. +* `PythonMLlibAPI` (the interface between Scala/Java and Python for MLlib) was a public API but is now private, declared `private[python]`. This was never meant for external use. +* In linear regression (including Lasso and ridge regression), the squared loss is now divided by 2. + So in order to produce the same result as in 1.2, the regularization parameter needs to be divided by 2 and the step size needs to be multiplied by 2. + +In the `spark.ml` package, the main API changes are from Spark SQL. We list the most important changes here: + +* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. All algorithms in `spark.ml` which used to use SchemaRDD now use DataFrame. +* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. +* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. + +Other changes were in `LogisticRegression`: + +* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). +* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. + +## From 1.1 to 1.2 + +The only API changes in MLlib v1.2 are in +[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), +which continues to be an experimental API in MLlib 1.2: + +1. *(Breaking change)* The Scala API for classification takes a named argument specifying the number +of classes. In MLlib v1.1, this argument was called `numClasses` in Python and +`numClassesForClassification` in Scala. In MLlib v1.2, the names are both set to `numClasses`. +This `numClasses` parameter is specified either via +[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) +or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) +static `trainClassifier` and `trainRegressor` methods. + +2. *(Breaking change)* The API for +[`Node`](api/scala/index.html#org.apache.spark.mllib.tree.model.Node) has changed. +This should generally not affect user code, unless the user manually constructs decision trees +(instead of using the `trainClassifier` or `trainRegressor` methods). +The tree `Node` now includes more information, including the probability of the predicted label +(for classification). + +3. Printing methods' output has changed. The `toString` (Scala/Java) and `__repr__` (Python) methods used to print the full model; they now print a summary. For the full model, use `toDebugString`. + +Examples in the Spark distribution and examples in the +[Decision Trees Guide](mllib-decision-tree.html#examples) have been updated accordingly. + +## From 1.0 to 1.1 + +The only API changes in MLlib v1.1 are in +[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), +which continues to be an experimental API in MLlib 1.1: + +1. *(Breaking change)* The meaning of tree depth has been changed by 1 in order to match +the implementations of trees in +[scikit-learn](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree) +and in [rpart](http://cran.r-project.org/web/packages/rpart/index.html). +In MLlib v1.0, a depth-1 tree had 1 leaf node, and a depth-2 tree had 1 root node and 2 leaf nodes. +In MLlib v1.1, a depth-0 tree has 1 leaf node, and a depth-1 tree has 1 root node and 2 leaf nodes. +This depth is specified by the `maxDepth` parameter in +[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) +or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) +static `trainClassifier` and `trainRegressor` methods. + +2. *(Non-breaking change)* We recommend using the newly added `trainClassifier` and `trainRegressor` +methods to build a [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), +rather than using the old parameter class `Strategy`. These new training methods explicitly +separate classification and regression, and they replace specialized parameter types with +simple `String` types. + +Examples of the new, recommended `trainClassifier` and `trainRegressor` are given in the +[Decision Trees Guide](mllib-decision-tree.html#examples). + +## From 0.9 to 1.0 + +In MLlib v1.0, we support both dense and sparse input in a unified way, which introduces a few +breaking changes. If your data is sparse, please store it in a sparse format instead of dense to +take advantage of sparsity in both storage and computation. Details are described below. + diff --git a/docs/ml-pipeline.md b/docs/ml-pipeline.md new file mode 100644 index 000000000000..adb057ba7e25 --- /dev/null +++ b/docs/ml-pipeline.md @@ -0,0 +1,245 @@ +--- +layout: global +title: ML Pipelines +displayTitle: ML Pipelines +--- + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + +In this section, we introduce the concept of ***ML Pipelines***. +ML Pipelines provide a uniform set of high-level APIs built on top of +[DataFrames](sql-programming-guide.html) that help users create and tune practical +machine learning pipelines. + +**Table of Contents** + +* This will become a table of contents (this text will be scraped). +{:toc} + +# Main concepts in Pipelines + +MLlib standardizes APIs for machine learning algorithms to make it easier to combine multiple +algorithms into a single pipeline, or workflow. +This section covers the key concepts introduced by the Pipelines API, where the pipeline concept is +mostly inspired by the [scikit-learn](http://scikit-learn.org/) project. + +* **[`DataFrame`](ml-guide.html#dataframe)**: This ML API uses `DataFrame` from Spark SQL as an ML + dataset, which can hold a variety of data types. + E.g., a `DataFrame` could have different columns storing text, feature vectors, true labels, and predictions. + +* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. +E.g., an ML model is a `Transformer` which transforms a `DataFrame` with features into a `DataFrame` with predictions. + +* **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. +E.g., a learning algorithm is an `Estimator` which trains on a `DataFrame` and produces a model. + +* **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. + +* **[`Parameter`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. + +## DataFrame + +Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data. +This API adopts the `DataFrame` from Spark SQL in order to support a variety of data types. + +`DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. +In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](mllib-data-types.html#local-vector) types. + +A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. + +Columns in a `DataFrame` are named. The code examples below use names such as "text," "features," and "label." + +## Pipeline components + +### Transformers + +A `Transformer` is an abstraction that includes feature transformers and learned models. +Technically, a `Transformer` implements a method `transform()`, which converts one `DataFrame` into +another, generally by appending one or more columns. +For example: + +* A feature transformer might take a `DataFrame`, read a column (e.g., text), map it into a new + column (e.g., feature vectors), and output a new `DataFrame` with the mapped column appended. +* A learning model might take a `DataFrame`, read the column containing feature vectors, predict the + label for each feature vector, and output a new `DataFrame` with predicted labels appended as a + column. + +### Estimators + +An `Estimator` abstracts the concept of a learning algorithm or any algorithm that fits or trains on +data. +Technically, an `Estimator` implements a method `fit()`, which accepts a `DataFrame` and produces a +`Model`, which is a `Transformer`. +For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling +`fit()` trains a `LogisticRegressionModel`, which is a `Model` and hence a `Transformer`. + +### Properties of pipeline components + +`Transformer.transform()`s and `Estimator.fit()`s are both stateless. In the future, stateful algorithms may be supported via alternative concepts. + +Each instance of a `Transformer` or `Estimator` has a unique ID, which is useful in specifying parameters (discussed below). + +## Pipeline + +In machine learning, it is common to run a sequence of algorithms to process and learn from data. +E.g., a simple text document processing workflow might include several stages: + +* Split each document's text into words. +* Convert each document's words into a numerical feature vector. +* Learn a prediction model using the feature vectors and labels. + +MLlib represents such a workflow as a `Pipeline`, which consists of a sequence of +`PipelineStage`s (`Transformer`s and `Estimator`s) to be run in a specific order. +We will use this simple workflow as a running example in this section. + +### How it works + +A `Pipeline` is specified as a sequence of stages, and each stage is either a `Transformer` or an `Estimator`. +These stages are run in order, and the input `DataFrame` is transformed as it passes through each stage. +For `Transformer` stages, the `transform()` method is called on the `DataFrame`. +For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the `DataFrame`. + +We illustrate this for the simple text document workflow. The figure below is for the *training time* usage of a `Pipeline`. + +

    + ML Pipeline Example +

    + +Above, the top row represents a `Pipeline` with three stages. +The first two (`Tokenizer` and `HashingTF`) are `Transformer`s (blue), and the third (`LogisticRegression`) is an `Estimator` (red). +The bottom row represents data flowing through the pipeline, where cylinders indicate `DataFrame`s. +The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw text documents and labels. +The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`. +The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`. +Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`. +If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` +method on the `DataFrame` before passing the `DataFrame` to the next stage. + +A `Pipeline` is an `Estimator`. +Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel`, which is a +`Transformer`. +This `PipelineModel` is used at *test time*; the figure below illustrates this usage. + +

    + ML PipelineModel Example +

    + +In the figure above, the `PipelineModel` has the same number of stages as the original `Pipeline`, but all `Estimator`s in the original `Pipeline` have become `Transformer`s. +When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed +through the fitted pipeline in order. +Each stage's `transform()` method updates the dataset and passes it to the next stage. + +`Pipeline`s and `PipelineModel`s help to ensure that training and test data go through identical feature processing steps. + +### Details + +*DAG `Pipeline`s*: A `Pipeline`'s stages are specified as an ordered array. The examples given here are all for linear `Pipeline`s, i.e., `Pipeline`s in which each stage uses data produced by the previous stage. It is possible to create non-linear `Pipeline`s as long as the data flow graph forms a Directed Acyclic Graph (DAG). This graph is currently specified implicitly based on the input and output column names of each stage (generally specified as parameters). If the `Pipeline` forms a DAG, then the stages must be specified in topological order. + +*Runtime checking*: Since `Pipeline`s can operate on `DataFrame`s with varied types, they cannot use +compile-time type checking. +`Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. +This type checking is done using the `DataFrame` *schema*, a description of the data types of columns in the `DataFrame`. + +*Unique Pipeline stages*: A `Pipeline`'s stages should be unique instances. E.g., the same instance +`myHashingTF` should not be inserted into the `Pipeline` twice since `Pipeline` stages must have +unique IDs. However, different instances `myHashingTF1` and `myHashingTF2` (both of type `HashingTF`) +can be put into the same `Pipeline` since different instances will be created with different IDs. + +## Parameters + +MLlib `Estimator`s and `Transformer`s use a uniform API for specifying parameters. + +A `Param` is a named parameter with self-contained documentation. +A `ParamMap` is a set of (parameter, value) pairs. + +There are two main ways to pass parameters to an algorithm: + +1. Set parameters for an instance. E.g., if `lr` is an instance of `LogisticRegression`, one could + call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations. + This API resembles the API used in `spark.mllib` package. +2. Pass a `ParamMap` to `fit()` or `transform()`. Any parameters in the `ParamMap` will override parameters previously specified via setter methods. + +Parameters belong to specific instances of `Estimator`s and `Transformer`s. +For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`. +This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`. + +## Saving and Loading Pipelines + +Often times it is worth it to save a model or a pipeline to disk for later use. In Spark 1.6, a model import/export functionality was added to the Pipeline API. Most basic transformers are supported as well as some of the more basic ML models. Please refer to the algorithm's API documentation to see if saving and loading is supported. + +# Code examples + +This section gives code examples illustrating the functionality discussed above. +For more info, please refer to the API documentation +([Scala](api/scala/index.html#org.apache.spark.ml.package), +[Java](api/java/org/apache/spark/ml/package-summary.html), +and [Python](api/python/pyspark.ml.html)). + +## Example: Estimator, Transformer, and Param + +This example covers the concepts of `Estimator`, `Transformer`, and `Param`. + +
    + +
    +{% include_example scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala %} +
    + +
    +{% include_example java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java %} +
    + +
    +{% include_example python/ml/estimator_transformer_param_example.py %} +
    + +
    + +## Example: Pipeline + +This example follows the simple text document `Pipeline` illustrated in the figures above. + +
    + +
    +{% include_example scala/org/apache/spark/examples/ml/PipelineExample.scala %} +
    + +
    +{% include_example java/org/apache/spark/examples/ml/JavaPipelineExample.java %} +
    + +
    +{% include_example python/ml/pipeline_example.py %} +
    + +
    + +## Model selection (hyperparameter tuning) + +A big benefit of using ML Pipelines is hyperparameter optimization. See the [ML Tuning Guide](ml-tuning.html) for more information on automatic model selection. diff --git a/docs/ml-survival-regression.md b/docs/ml-survival-regression.md index 856ceb2f4e7f..efa3c21c7ca1 100644 --- a/docs/ml-survival-regression.md +++ b/docs/ml-survival-regression.md @@ -1,7 +1,7 @@ --- layout: global -title: Survival Regression - spark.ml -displayTitle: Survival Regression - spark.ml +title: Survival Regression +displayTitle: Survival Regression --- > This section has been moved into the diff --git a/docs/ml-tuning.md b/docs/ml-tuning.md new file mode 100644 index 000000000000..2ca90c7092fd --- /dev/null +++ b/docs/ml-tuning.md @@ -0,0 +1,121 @@ +--- +layout: global +title: "ML Tuning" +displayTitle: "ML Tuning: model selection and hyperparameter tuning" +--- + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + +This section describes how to use MLlib's tooling for tuning ML algorithms and Pipelines. +Built-in Cross-Validation and other tooling allow users to optimize hyperparameters in algorithms and Pipelines. + +**Table of contents** + +* This will become a table of contents (this text will be scraped). +{:toc} + +# Model selection (a.k.a. hyperparameter tuning) + +An important task in ML is *model selection*, or using data to find the best model or parameters for a given task. This is also called *tuning*. +Tuning may be done for individual `Estimator`s such as `LogisticRegression`, or for entire `Pipeline`s which include multiple algorithms, featurization, and other steps. Users can tune an entire `Pipeline` at once, rather than tuning each element in the `Pipeline` separately. + +MLlib supports model selection using tools such as [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) and [`TrainValidationSplit`](api/scala/index.html#org.apache.spark.ml.tuning.TrainValidationSplit). +These tools require the following items: + +* [`Estimator`](api/scala/index.html#org.apache.spark.ml.Estimator): algorithm or `Pipeline` to tune +* Set of `ParamMap`s: parameters to choose from, sometimes called a "parameter grid" to search over +* [`Evaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.Evaluator): metric to measure how well a fitted `Model` does on held-out test data + +At a high level, these model selection tools work as follows: + +* They split the input data into separate training and test datasets. +* For each (training, test) pair, they iterate through the set of `ParamMap`s: + * For each `ParamMap`, they fit the `Estimator` using those parameters, get the fitted `Model`, and evaluate the `Model`'s performance using the `Evaluator`. +* They select the `Model` produced by the best-performing set of parameters. + +The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator) +for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator) +for binary data, or a [`MulticlassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator) +for multiclass problems. The default metric used to choose the best `ParamMap` can be overridden by the `setMetricName` +method in each of these evaluators. + +To help construct the parameter grid, users can use the [`ParamGridBuilder`](api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) utility. + +# Cross-Validation + +`CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets. E.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. To evaluate a particular `ParamMap`, `CrossValidator` computes the average evaluation metric for the 3 `Model`s produced by fitting the `Estimator` on the 3 different (training, test) dataset pairs. + +After identifying the best `ParamMap`, `CrossValidator` finally re-fits the `Estimator` using the best `ParamMap` and the entire dataset. + +## Example: model selection via cross-validation + +The following example demonstrates using `CrossValidator` to select from a grid of parameters. + +Note that cross-validation over a grid of parameters is expensive. +E.g., in the example below, the parameter grid has 3 values for `hashingTF.numFeatures` and 2 values for `lr.regParam`, and `CrossValidator` uses 2 folds. This multiplies out to `$(3 \times 2) \times 2 = 12$` different models being trained. +In realistic settings, it can be common to try many more parameters and use more folds (`$k=3$` and `$k=10$` are common). +In other words, using `CrossValidator` can be very expensive. +However, it is also a well-established method for choosing parameters which is more statistically sound than heuristic hand-tuning. + +
    + +
    +{% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala %} +
    + +
    +{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java %} +
    + +
    + +{% include_example python/ml/cross_validator.py %} +
    + +
    + +# Train-Validation Split + +In addition to `CrossValidator` Spark also offers `TrainValidationSplit` for hyper-parameter tuning. +`TrainValidationSplit` only evaluates each combination of parameters once, as opposed to k times in + the case of `CrossValidator`. It is therefore less expensive, + but will not produce as reliable results when the training dataset is not sufficiently large. + +Unlike `CrossValidator`, `TrainValidationSplit` creates a single (training, test) dataset pair. +It splits the dataset into these two parts using the `trainRatio` parameter. For example with `$trainRatio=0.75$`, +`TrainValidationSplit` will generate a training and test dataset pair where 75% of the data is used for training and 25% for validation. + +Like `CrossValidator`, `TrainValidationSplit` finally fits the `Estimator` using the best `ParamMap` and the entire dataset. + +## Example: model selection via train validation split + +
    + +
    +{% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala %} +
    + +
    +{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java %} +
    + +
    +{% include_example python/ml/train_validation_split.py %} +
    + +
    diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md index aaf8bd465c9a..a7b90de09369 100644 --- a/docs/mllib-classification-regression.md +++ b/docs/mllib-classification-regression.md @@ -1,7 +1,7 @@ --- layout: global -title: Classification and Regression - spark.mllib -displayTitle: Classification and Regression - spark.mllib +title: Classification and Regression - RDD-based API +displayTitle: Classification and Regression - RDD-based API --- The `spark.mllib` package supports various methods for diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 073927c30bc6..d5f6ae379a85 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -1,7 +1,7 @@ --- layout: global -title: Clustering - spark.mllib -displayTitle: Clustering - spark.mllib +title: Clustering - RDD-based API +displayTitle: Clustering - RDD-based API --- [Clustering](https://en.wikipedia.org/wiki/Cluster_analysis) is an unsupervised learning problem whereby we aim to group subsets diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index 5c33292aaf08..0f891a09a6e6 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -1,7 +1,7 @@ --- layout: global -title: Collaborative Filtering - spark.mllib -displayTitle: Collaborative Filtering - spark.mllib +title: Collaborative Filtering - RDD-based API +displayTitle: Collaborative Filtering - RDD-based API --- * Table of contents diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index ef56aebbc360..7dd3c97a83e4 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -1,7 +1,7 @@ --- layout: global -title: Data Types - MLlib -displayTitle: Data Types - MLlib +title: Data Types - RDD-based API +displayTitle: Data Types - RDD-based API --- * Table of contents diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 11f5de1fc95e..0e753b8dd04a 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -1,7 +1,7 @@ --- layout: global -title: Decision Trees - spark.mllib -displayTitle: Decision Trees - spark.mllib +title: Decision Trees - RDD-based API +displayTitle: Decision Trees - RDD-based API --- * Table of contents diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index cceddce9f79a..539cbc1b3163 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -1,7 +1,7 @@ --- layout: global -title: Dimensionality Reduction - spark.mllib -displayTitle: Dimensionality Reduction - spark.mllib +title: Dimensionality Reduction - RDD-based API +displayTitle: Dimensionality Reduction - RDD-based API --- * Table of contents diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index 5543262a8967..e1984b6c8d5a 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -1,7 +1,7 @@ --- layout: global -title: Ensembles - spark.mllib -displayTitle: Ensembles - spark.mllib +title: Ensembles - RDD-based API +displayTitle: Ensembles - RDD-based API --- * Table of contents diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index c49bc4ff124b..ac82f43cfb79 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -1,7 +1,7 @@ --- layout: global -title: Evaluation Metrics - spark.mllib -displayTitle: Evaluation Metrics - spark.mllib +title: Evaluation Metrics - RDD-based API +displayTitle: Evaluation Metrics - RDD-based API --- * Table of contents diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 67c033e9e400..867be7f2932e 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -1,7 +1,7 @@ --- layout: global -title: Feature Extraction and Transformation - spark.mllib -displayTitle: Feature Extraction and Transformation - spark.mllib +title: Feature Extraction and Transformation - RDD-based API +displayTitle: Feature Extraction and Transformation - RDD-based API --- * Table of contents diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index a7b55dc5e566..93e3f0b2d226 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -1,7 +1,7 @@ --- layout: global -title: Frequent Pattern Mining - spark.mllib -displayTitle: Frequent Pattern Mining - spark.mllib +title: Frequent Pattern Mining - RDD-based API +displayTitle: Frequent Pattern Mining - RDD-based API --- Mining frequent items, itemsets, subsequences, or other substructures is usually among the diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 17fd3e1edf4b..30112c72c9c3 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -1,32 +1,12 @@ --- layout: global -title: MLlib -displayTitle: Machine Learning Library (MLlib) Guide -description: MLlib machine learning library overview for Spark SPARK_VERSION_SHORT +title: "MLlib: RDD-based API" +displayTitle: "MLlib: RDD-based API" --- -MLlib is Spark's machine learning (ML) library. -Its goal is to make practical machine learning scalable and easy. -It consists of common learning algorithms and utilities, including classification, regression, -clustering, collaborative filtering, dimensionality reduction, as well as lower-level optimization -primitives and higher-level pipeline APIs. - -It divides into two packages: - -* [`spark.mllib`](mllib-guide.html#data-types-algorithms-and-utilities) contains the original API - built on top of [RDDs](programming-guide.html#resilient-distributed-datasets-rdds). -* [`spark.ml`](ml-guide.html) provides higher-level API - built on top of [DataFrames](sql-programming-guide.html#dataframes) for constructing ML pipelines. - -Using `spark.ml` is recommended because with DataFrames the API is more versatile and flexible. -But we will keep supporting `spark.mllib` along with the development of `spark.ml`. -Users should be comfortable using `spark.mllib` features and expect more features coming. -Developers should contribute new algorithms to `spark.ml` if they fit the ML pipeline concept well, -e.g., feature extractors and transformers. - -We list major functionality from both below, with links to detailed guides. - -# spark.mllib: data types, algorithms, and utilities +This page documents sections of the MLlib guide for the RDD-based API (the `spark.mllib` package). +Please see the [MLlib Main Guide](ml-guide.html) for the DataFrame-based API (the `spark.ml` package), +which is now the primary API for MLlib. * [Data types](mllib-data-types.html) * [Basic statistics](mllib-statistics.html) @@ -65,192 +45,3 @@ We list major functionality from both below, with links to detailed guides. * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd) * [limited-memory BFGS (L-BFGS)](mllib-optimization.html#limited-memory-bfgs-l-bfgs) -# spark.ml: high-level APIs for ML pipelines - -* [Overview: estimators, transformers and pipelines](ml-guide.html) -* [Extracting, transforming and selecting features](ml-features.html) -* [Classification and regression](ml-classification-regression.html) -* [Clustering](ml-clustering.html) -* [Collaborative filtering](ml-collaborative-filtering.html) -* [Advanced topics](ml-advanced.html) - -Some techniques are not available yet in spark.ml, most notably dimensionality reduction -Users can seamlessly combine the implementation of these techniques found in `spark.mllib` with the rest of the algorithms found in `spark.ml`. - -# Dependencies - -MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), which depends on -[netlib-java](https://github.com/fommil/netlib-java) for optimised numerical processing. -If natives libraries[^1] are not available at runtime, you will see a warning message and a pure JVM -implementation will be used instead. - -Due to licensing issues with runtime proprietary binaries, we do not include `netlib-java`'s native -proxies by default. -To configure `netlib-java` / Breeze to use system optimised binaries, include -`com.github.fommil.netlib:all:1.1.2` (or build Spark with `-Pnetlib-lgpl`) as a dependency of your -project and read the [netlib-java](https://github.com/fommil/netlib-java) documentation for your -platform's additional installation instructions. - -To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 or newer. - -[^1]: To learn more about the benefits and background of system optimised natives, you may wish to - watch Sam Halliday's ScalaX talk on [High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/). - -# Migration guide - -MLlib is under active development. -The APIs marked `Experimental`/`DeveloperApi` may change in future releases, -and the migration guide below will explain all changes between releases. - -## From 1.6 to 2.0 - -### Breaking changes - -There were several breaking changes in Spark 2.0, which are outlined below. - -**Linear algebra classes for DataFrame-based APIs** - -Spark's linear algebra dependencies were moved to a new project, `mllib-local` -(see [SPARK-13944](https://issues.apache.org/jira/browse/SPARK-13944)). -As part of this change, the linear algebra classes were copied to a new package, `spark.ml.linalg`. -The DataFrame-based APIs in `spark.ml` now depend on the `spark.ml.linalg` classes, -leading to a few breaking changes, predominantly in various model classes -(see [SPARK-14810](https://issues.apache.org/jira/browse/SPARK-14810) for a full list). - -**Note:** the RDD-based APIs in `spark.mllib` continue to depend on the previous package `spark.mllib.linalg`. - -_Converting vectors and matrices_ - -While most pipeline components support backward compatibility for loading, -some existing `DataFrames` and pipelines in Spark versions prior to 2.0, that contain vector or matrix -columns, may need to be migrated to the new `spark.ml` vector and matrix types. -Utilities for converting `DataFrame` columns from `spark.mllib.linalg` to `spark.ml.linalg` types -(and vice versa) can be found in `spark.mllib.util.MLUtils`. - -There are also utility methods available for converting single instances of -vectors and matrices. Use the `asML` method on a `mllib.linalg.Vector` / `mllib.linalg.Matrix` -for converting to `ml.linalg` types, and -`mllib.linalg.Vectors.fromML` / `mllib.linalg.Matrices.fromML` -for converting to `mllib.linalg` types. - -
    -
    - -{% highlight scala %} -import org.apache.spark.mllib.util.MLUtils - -// convert DataFrame columns -val convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF) -val convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF) -// convert a single vector or matrix -val mlVec: org.apache.spark.ml.linalg.Vector = mllibVec.asML -val mlMat: org.apache.spark.ml.linalg.Matrix = mllibMat.asML -{% endhighlight %} - -Refer to the [`MLUtils` Scala docs](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) for further detail. -
    - -
    - -{% highlight java %} -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.sql.Dataset; - -// convert DataFrame columns -Dataset convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF); -Dataset convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF); -// convert a single vector or matrix -org.apache.spark.ml.linalg.Vector mlVec = mllibVec.asML(); -org.apache.spark.ml.linalg.Matrix mlMat = mllibMat.asML(); -{% endhighlight %} - -Refer to the [`MLUtils` Java docs](api/java/org/apache/spark/mllib/util/MLUtils.html) for further detail. -
    - -
    - -{% highlight python %} -from pyspark.mllib.util import MLUtils - -# convert DataFrame columns -convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF) -convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF) -# convert a single vector or matrix -mlVec = mllibVec.asML() -mlMat = mllibMat.asML() -{% endhighlight %} - -Refer to the [`MLUtils` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils) for further detail. -
    -
    - -**Deprecated methods removed** - -Several deprecated methods were removed in the `spark.mllib` and `spark.ml` packages: - -* `setScoreCol` in `ml.evaluation.BinaryClassificationEvaluator` -* `weights` in `LinearRegression` and `LogisticRegression` in `spark.ml` -* `setMaxNumIterations` in `mllib.optimization.LBFGS` (marked as `DeveloperApi`) -* `treeReduce` and `treeAggregate` in `mllib.rdd.RDDFunctions` (these functions are available on `RDD`s directly, and were marked as `DeveloperApi`) -* `defaultStategy` in `mllib.tree.configuration.Strategy` -* `build` in `mllib.tree.Node` -* libsvm loaders for multiclass and load/save labeledData methods in `mllib.util.MLUtils` - -A full list of breaking changes can be found at [SPARK-14810](https://issues.apache.org/jira/browse/SPARK-14810). - -### Deprecations and changes of behavior - -**Deprecations** - -Deprecations in the `spark.mllib` and `spark.ml` packages include: - -* [SPARK-14984](https://issues.apache.org/jira/browse/SPARK-14984): - In `spark.ml.regression.LinearRegressionSummary`, the `model` field has been deprecated. -* [SPARK-13784](https://issues.apache.org/jira/browse/SPARK-13784): - In `spark.ml.regression.RandomForestRegressionModel` and `spark.ml.classification.RandomForestClassificationModel`, - the `numTrees` parameter has been deprecated in favor of `getNumTrees` method. -* [SPARK-13761](https://issues.apache.org/jira/browse/SPARK-13761): - In `spark.ml.param.Params`, the `validateParams` method has been deprecated. - We move all functionality in overridden methods to the corresponding `transformSchema`. -* [SPARK-14829](https://issues.apache.org/jira/browse/SPARK-14829): - In `spark.mllib` package, `LinearRegressionWithSGD`, `LassoWithSGD`, `RidgeRegressionWithSGD` and `LogisticRegressionWithSGD` have been deprecated. - We encourage users to use `spark.ml.regression.LinearRegresson` and `spark.ml.classification.LogisticRegresson`. -* [SPARK-14900](https://issues.apache.org/jira/browse/SPARK-14900): - In `spark.mllib.evaluation.MulticlassMetrics`, the parameters `precision`, `recall` and `fMeasure` have been deprecated in favor of `accuracy`. -* [SPARK-15644](https://issues.apache.org/jira/browse/SPARK-15644): - In `spark.ml.util.MLReader` and `spark.ml.util.MLWriter`, the `context` method has been deprecated in favor of `session`. -* In `spark.ml.feature.ChiSqSelectorModel`, the `setLabelCol` method has been deprecated since it was not used by `ChiSqSelectorModel`. - -**Changes of behavior** - -Changes of behavior in the `spark.mllib` and `spark.ml` packages include: - -* [SPARK-7780](https://issues.apache.org/jira/browse/SPARK-7780): - `spark.mllib.classification.LogisticRegressionWithLBFGS` directly calls `spark.ml.classification.LogisticRegresson` for binary classification now. - This will introduce the following behavior changes for `spark.mllib.classification.LogisticRegressionWithLBFGS`: - * The intercept will not be regularized when training binary classification model with L1/L2 Updater. - * If users set without regularization, training with or without feature scaling will return the same solution by the same convergence rate. -* [SPARK-13429](https://issues.apache.org/jira/browse/SPARK-13429): - In order to provide better and consistent result with `spark.ml.classification.LogisticRegresson`, - the default value of `spark.mllib.classification.LogisticRegressionWithLBFGS`: `convergenceTol` has been changed from 1E-4 to 1E-6. -* [SPARK-12363](https://issues.apache.org/jira/browse/SPARK-12363): - Fix a bug of `PowerIterationClustering` which will likely change its result. -* [SPARK-13048](https://issues.apache.org/jira/browse/SPARK-13048): - `LDA` using the `EM` optimizer will keep the last checkpoint by default, if checkpointing is being used. -* [SPARK-12153](https://issues.apache.org/jira/browse/SPARK-12153): - `Word2Vec` now respects sentence boundaries. Previously, it did not handle them correctly. -* [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574): - `HashingTF` uses `MurmurHash3` as default hash algorithm in both `spark.ml` and `spark.mllib`. -* [SPARK-14768](https://issues.apache.org/jira/browse/SPARK-14768): - The `expectedType` argument for PySpark `Param` was removed. -* [SPARK-14931](https://issues.apache.org/jira/browse/SPARK-14931): - Some default `Param` values, which were mismatched between pipelines in Scala and Python, have been changed. -* [SPARK-13600](https://issues.apache.org/jira/browse/SPARK-13600): - `QuantileDiscretizer` now uses `spark.sql.DataFrameStatFunctions.approxQuantile` to find splits (previously used custom sampling logic). - The output buckets will differ for same input data and params. - -## Previous Spark versions - -Earlier migration guides are archived [on this page](mllib-migration-guides.html). - ---- diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md index 8ede4407d584..d90905a86ade 100644 --- a/docs/mllib-isotonic-regression.md +++ b/docs/mllib-isotonic-regression.md @@ -1,7 +1,7 @@ --- layout: global -title: Isotonic regression - spark.mllib -displayTitle: Regression - spark.mllib +title: Isotonic regression - RDD-based API +displayTitle: Regression - RDD-based API --- ## Isotonic regression diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 17d781ac23f8..6fcd3ae85700 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -1,7 +1,7 @@ --- layout: global -title: Linear Methods - spark.mllib -displayTitle: Linear Methods - spark.mllib +title: Linear Methods - RDD-based API +displayTitle: Linear Methods - RDD-based API --- * Table of contents diff --git a/docs/mllib-migration-guides.md b/docs/mllib-migration-guides.md index 970c6697f433..ea6f93fcf67f 100644 --- a/docs/mllib-migration-guides.md +++ b/docs/mllib-migration-guides.md @@ -1,159 +1,9 @@ --- layout: global -title: Old Migration Guides - spark.mllib -displayTitle: Old Migration Guides - spark.mllib -description: MLlib migration guides from before Spark SPARK_VERSION_SHORT +title: Old Migration Guides - MLlib +displayTitle: Old Migration Guides - MLlib --- -The migration guide for the current Spark version is kept on the [MLlib Programming Guide main page](mllib-guide.html#migration-guide). - -## From 1.5 to 1.6 - -There are no breaking API changes in the `spark.mllib` or `spark.ml` packages, but there are -deprecations and changes of behavior. - -Deprecations: - -* [SPARK-11358](https://issues.apache.org/jira/browse/SPARK-11358): - In `spark.mllib.clustering.KMeans`, the `runs` parameter has been deprecated. -* [SPARK-10592](https://issues.apache.org/jira/browse/SPARK-10592): - In `spark.ml.classification.LogisticRegressionModel` and - `spark.ml.regression.LinearRegressionModel`, the `weights` field has been deprecated in favor of - the new name `coefficients`. This helps disambiguate from instance (row) "weights" given to - algorithms. - -Changes of behavior: - -* [SPARK-7770](https://issues.apache.org/jira/browse/SPARK-7770): - `spark.mllib.tree.GradientBoostedTrees`: `validationTol` has changed semantics in 1.6. - Previously, it was a threshold for absolute change in error. Now, it resembles the behavior of - `GradientDescent`'s `convergenceTol`: For large errors, it uses relative error (relative to the - previous error); for small errors (`< 0.01`), it uses absolute error. -* [SPARK-11069](https://issues.apache.org/jira/browse/SPARK-11069): - `spark.ml.feature.RegexTokenizer`: Previously, it did not convert strings to lowercase before - tokenizing. Now, it converts to lowercase by default, with an option not to. This matches the - behavior of the simpler `Tokenizer` transformer. - -## From 1.4 to 1.5 - -In the `spark.mllib` package, there are no breaking API changes but several behavior changes: - -* [SPARK-9005](https://issues.apache.org/jira/browse/SPARK-9005): - `RegressionMetrics.explainedVariance` returns the average regression sum of squares. -* [SPARK-8600](https://issues.apache.org/jira/browse/SPARK-8600): `NaiveBayesModel.labels` become - sorted. -* [SPARK-3382](https://issues.apache.org/jira/browse/SPARK-3382): `GradientDescent` has a default - convergence tolerance `1e-3`, and hence iterations might end earlier than 1.4. - -In the `spark.ml` package, there exists one breaking API change and one behavior change: - -* [SPARK-9268](https://issues.apache.org/jira/browse/SPARK-9268): Java's varargs support is removed - from `Params.setDefault` due to a - [Scala compiler bug](https://issues.scala-lang.org/browse/SI-9013). -* [SPARK-10097](https://issues.apache.org/jira/browse/SPARK-10097): `Evaluator.isLargerBetter` is - added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4. - -## From 1.3 to 1.4 - -In the `spark.mllib` package, there were several breaking changes, but all in `DeveloperApi` or `Experimental` APIs: - -* Gradient-Boosted Trees - * *(Breaking change)* The signature of the [`Loss.gradient`](api/scala/index.html#org.apache.spark.mllib.tree.loss.Loss) method was changed. This is only an issues for users who wrote their own losses for GBTs. - * *(Breaking change)* The `apply` and `copy` methods for the case class [`BoostingStrategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.BoostingStrategy) have been changed because of a modification to the case class fields. This could be an issue for users who use `BoostingStrategy` to set GBT parameters. -* *(Breaking change)* The return value of [`LDA.run`](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) has changed. It now returns an abstract class `LDAModel` instead of the concrete class `DistributedLDAModel`. The object of type `LDAModel` can still be cast to the appropriate concrete type, which depends on the optimization algorithm. - -In the `spark.ml` package, several major API changes occurred, including: - -* `Param` and other APIs for specifying parameters -* `uid` unique IDs for Pipeline components -* Reorganization of certain classes - -Since the `spark.ml` API was an alpha component in Spark 1.3, we do not list all changes here. -However, since 1.4 `spark.ml` is no longer an alpha component, we will provide details on any API -changes for future releases. - -## From 1.2 to 1.3 - -In the `spark.mllib` package, there were several breaking changes. The first change (in `ALS`) is the only one in a component not marked as Alpha or Experimental. - -* *(Breaking change)* In [`ALS`](api/scala/index.html#org.apache.spark.mllib.recommendation.ALS), the extraneous method `solveLeastSquares` has been removed. The `DeveloperApi` method `analyzeBlocks` was also removed. -* *(Breaking change)* [`StandardScalerModel`](api/scala/index.html#org.apache.spark.mllib.feature.StandardScalerModel) remains an Alpha component. In it, the `variance` method has been replaced with the `std` method. To compute the column variance values returned by the original `variance` method, simply square the standard deviation values returned by `std`. -* *(Breaking change)* [`StreamingLinearRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD) remains an Experimental component. In it, there were two changes: - * The constructor taking arguments was removed in favor of a builder pattern using the default constructor plus parameter setter methods. - * Variable `model` is no longer public. -* *(Breaking change)* [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) remains an Experimental component. In it and its associated classes, there were several changes: - * In `DecisionTree`, the deprecated class method `train` has been removed. (The object/static `train` methods remain.) - * In `Strategy`, the `checkpointDir` parameter has been removed. Checkpointing is still supported, but the checkpoint directory must be set before calling tree and tree ensemble training. -* `PythonMLlibAPI` (the interface between Scala/Java and Python for MLlib) was a public API but is now private, declared `private[python]`. This was never meant for external use. -* In linear regression (including Lasso and ridge regression), the squared loss is now divided by 2. - So in order to produce the same result as in 1.2, the regularization parameter needs to be divided by 2 and the step size needs to be multiplied by 2. - -In the `spark.ml` package, the main API changes are from Spark SQL. We list the most important changes here: - -* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. All algorithms in Spark ML which used to use SchemaRDD now use DataFrame. -* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. -* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. - -Other changes were in `LogisticRegression`: - -* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). -* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. - -## From 1.1 to 1.2 - -The only API changes in MLlib v1.2 are in -[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), -which continues to be an experimental API in MLlib 1.2: - -1. *(Breaking change)* The Scala API for classification takes a named argument specifying the number -of classes. In MLlib v1.1, this argument was called `numClasses` in Python and -`numClassesForClassification` in Scala. In MLlib v1.2, the names are both set to `numClasses`. -This `numClasses` parameter is specified either via -[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) -or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) -static `trainClassifier` and `trainRegressor` methods. - -2. *(Breaking change)* The API for -[`Node`](api/scala/index.html#org.apache.spark.mllib.tree.model.Node) has changed. -This should generally not affect user code, unless the user manually constructs decision trees -(instead of using the `trainClassifier` or `trainRegressor` methods). -The tree `Node` now includes more information, including the probability of the predicted label -(for classification). - -3. Printing methods' output has changed. The `toString` (Scala/Java) and `__repr__` (Python) methods used to print the full model; they now print a summary. For the full model, use `toDebugString`. - -Examples in the Spark distribution and examples in the -[Decision Trees Guide](mllib-decision-tree.html#examples) have been updated accordingly. - -## From 1.0 to 1.1 - -The only API changes in MLlib v1.1 are in -[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), -which continues to be an experimental API in MLlib 1.1: - -1. *(Breaking change)* The meaning of tree depth has been changed by 1 in order to match -the implementations of trees in -[scikit-learn](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree) -and in [rpart](http://cran.r-project.org/web/packages/rpart/index.html). -In MLlib v1.0, a depth-1 tree had 1 leaf node, and a depth-2 tree had 1 root node and 2 leaf nodes. -In MLlib v1.1, a depth-0 tree has 1 leaf node, and a depth-1 tree has 1 root node and 2 leaf nodes. -This depth is specified by the `maxDepth` parameter in -[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) -or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) -static `trainClassifier` and `trainRegressor` methods. - -2. *(Non-breaking change)* We recommend using the newly added `trainClassifier` and `trainRegressor` -methods to build a [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), -rather than using the old parameter class `Strategy`. These new training methods explicitly -separate classification and regression, and they replace specialized parameter types with -simple `String` types. - -Examples of the new, recommended `trainClassifier` and `trainRegressor` are given in the -[Decision Trees Guide](mllib-decision-tree.html#examples). - -## From 0.9 to 1.0 - -In MLlib v1.0, we support both dense and sparse input in a unified way, which introduces a few -breaking changes. If your data is sparse, please store it in a sparse format instead of dense to -take advantage of sparsity in both storage and computation. Details are described below. +The migration guide for the current Spark version is kept on the [MLlib Guide main page](ml-guide.html#migration-guide). +Past migration guides are now stored at [ml-migration-guides.html](ml-migration-guides.html). diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index d0d594af6a4a..7471d18a0ddd 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -1,7 +1,7 @@ --- layout: global -title: Naive Bayes - spark.mllib -displayTitle: Naive Bayes - spark.mllib +title: Naive Bayes - RDD-based API +displayTitle: Naive Bayes - RDD-based API --- [Naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier) is a simple diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md index f90b66f8e2c4..eefd7dcf1108 100644 --- a/docs/mllib-optimization.md +++ b/docs/mllib-optimization.md @@ -1,7 +1,7 @@ --- layout: global -title: Optimization - spark.mllib -displayTitle: Optimization - spark.mllib +title: Optimization - RDD-based API +displayTitle: Optimization - RDD-based API --- * Table of contents diff --git a/docs/mllib-pmml-model-export.md b/docs/mllib-pmml-model-export.md index 7f2347dc0b76..d3530908706d 100644 --- a/docs/mllib-pmml-model-export.md +++ b/docs/mllib-pmml-model-export.md @@ -1,7 +1,7 @@ --- layout: global -title: PMML model export - spark.mllib -displayTitle: PMML model export - spark.mllib +title: PMML model export - RDD-based API +displayTitle: PMML model export - RDD-based API --- * Table of contents diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 329855e565b2..12797bd8688e 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -1,7 +1,7 @@ --- layout: global -title: Basic Statistics - spark.mllib -displayTitle: Basic Statistics - spark.mllib +title: Basic Statistics - RDD-based API +displayTitle: Basic Statistics - RDD-based API --- * Table of contents diff --git a/docs/monitoring.md b/docs/monitoring.md index fa6c899a40b6..c8694762ffd7 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -225,9 +225,10 @@ for the history server, they would typically be accessible at `http:// EndpointMeaning @@ -288,7 +289,11 @@ where `[base-app-id]` is the YARN application ID. /applications/[app-id]/executors - A list of all executors for the given application. + A list of all active executors for the given application. + + + /applications/[app-id]/allexecutors + A list of all(active and dead) executors for the given application. /applications/[app-id]/storage/rdd diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 3872aecff25d..888c12f18635 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -1101,6 +1101,9 @@ for details. +The Spark RDD API also exposes asynchronous versions of some actions, like `foreachAsync` for `foreach`, which immediately return a `FutureAction` to the caller instead of blocking on completion of the action. This can be used to manage or wait for the asynchronous execution of the action. + + ### Shuffle operations Certain operations within Spark trigger an event known as the shuffle. The shuffle is Spark's @@ -1568,7 +1571,7 @@ have changed from returning (key, list of values) pairs to (key, iterable of val Migration guides are also available for [Spark Streaming](streaming-programming-guide.html#migration-guide-from-091-or-below-to-1x), -[MLlib](mllib-guide.html#migration-guide) and [GraphX](graphx-programming-guide.html#migrating-from-spark-091). +[MLlib](ml-guide.html#migration-guide) and [GraphX](graphx-programming-guide.html#migrating-from-spark-091). # Where to Go from Here diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 5219e99fee73..10dc9ce89004 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -428,6 +428,16 @@ See the [configuration page](configuration.html) for information on Spark config If unset it will point to Spark's internal web UI. + + spark.mesos.driverEnv.[EnvironmentVariableName] + (none) + + This only affects drivers submitted in cluster mode. Add the + environment variable specified by EnvironmentVariableName to the + driver process. The user can specify multiple of these to set + multiple environment variables. + + spark.mesos.dispatcher.webui.url (none) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 4e92042da687..befd3eaee9d8 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -539,6 +539,37 @@ launch time. This is done by listing them in the `spark.yarn.access.namenodes` p spark.yarn.access.namenodes hdfs://ireland.example.org:8020/,hdfs://frankfurt.example.org:8020/ ``` +## Configuring the External Shuffle Service + +To start the Spark Shuffle Service on each `NodeManager` in your YARN cluster, follow these +instructions: + +1. Build Spark with the [YARN profile](building-spark.html). Skip this step if you are using a +pre-packaged distribution. +1. Locate the `spark--yarn-shuffle.jar`. This should be under +`$SPARK_HOME/common/network-yarn/target/scala-` if you are building Spark yourself, and under +`lib` if you are using a distribution. +1. Add this jar to the classpath of all `NodeManager`s in your cluster. +1. In the `yarn-site.xml` on each node, add `spark_shuffle` to `yarn.nodemanager.aux-services`, +then set `yarn.nodemanager.aux-services.spark_shuffle.class` to +`org.apache.spark.network.yarn.YarnShuffleService`. +1. Restart all `NodeManager`s in your cluster. + +The following extra configuration options are available when the shuffle service is running on YARN: + + + + + + + + +
    Property NameDefaultMeaning
    spark.yarn.shuffle.stopOnFailurefalse + Whether to stop the NodeManager when there's a failure in the Spark Shuffle Service's + initialization. This prevents application failures caused by running containers on + NodeManagers where the Spark Shuffle Service is not running. +
    + ## Launching your application with Apache Oozie Apache Oozie can launch Spark applications as part of a workflow. diff --git a/docs/sparkr.md b/docs/sparkr.md index b4acb230403a..4bbc362c5208 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -54,7 +54,7 @@ if (nchar(Sys.getenv("SPARK_HOME")) < 1) { Sys.setenv(SPARK_HOME = "/home/spark") } library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))) -sc <- sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory="2g")) +sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g")) {% endhighlight %} @@ -111,19 +111,17 @@ head(df) SparkR supports operating on a variety of data sources through the `SparkDataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources. The general method for creating SparkDataFrames from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active SparkSession will be used automatically. SparkR supports reading JSON, CSV and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by -specifying `--packages` with `spark-submit` or `sparkR` commands, or if creating context through `init` -you can specify the packages with the `packages` argument. +specifying `--packages` with `spark-submit` or `sparkR` commands, or if initializing SparkSession with `sparkPackages` parameter when in an interactive R shell or from RStudio.
    {% highlight r %} -sc <- sparkR.session(sparkPackages="com.databricks:spark-avro_2.11:3.0.0") +sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") {% endhighlight %}
    We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail.
    - {% highlight r %} people <- read.df("./examples/src/main/resources/people.json", "json") head(people) @@ -138,6 +136,18 @@ printSchema(people) # |-- age: long (nullable = true) # |-- name: string (nullable = true) +# Similarly, multiple files can be read with read.json +people <- read.json(c("./examples/src/main/resources/people.json", "./examples/src/main/resources/people2.json")) + +{% endhighlight %} +
    + +The data sources API natively supports CSV formatted input files. For more information please refer to SparkR [read.df](api/R/read.df.html) API documentation. + +
    +{% highlight r %} +df <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "NA") + {% endhighlight %}
    @@ -146,7 +156,7 @@ to a Parquet file using `write.df`.
    {% highlight r %} -write.df(people, path="people.parquet", source="parquet", mode="overwrite") +write.df(people, path = "people.parquet", source = "parquet", mode = "overwrite") {% endhighlight %}
    @@ -262,8 +272,8 @@ In SparkR, we support several kinds of User-Defined Functions: ##### dapply Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` -and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function -should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match to [data types](#data-type-mapping-between-r-and-spark) of returned value. +
    {% highlight r %} @@ -271,7 +281,7 @@ should be a `data.frame`. Schema specifies the row format of the resulting a `Sp # Note that we can apply UDF to DataFrame. schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), structField("waiting_secs", "double")) -df1 <- dapply(df, function(x) {x <- cbind(x, x$waiting * 60)}, schema) +df1 <- dapply(df, function(x) { x <- cbind(x, x$waiting * 60) }, schema) head(collect(df1)) ## eruptions waiting waiting_secs ##1 3.600 79 4740 @@ -285,8 +295,8 @@ head(collect(df1)) ##### dapplyCollect Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function -should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the -output of UDF run on all the partitions can fit in driver memory. +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +
    {% highlight r %} @@ -306,6 +316,135 @@ head(ldf, 3) {% endhighlight %}
    +#### Run a given function on a large dataset grouping by input column(s) and using `gapply` or `gapplyCollect` + +##### gapply +Apply a function to each group of a `SparkDataFrame`. The function is to be applied to each group of the `SparkDataFrame` and should have only two parameters: grouping key and R `data.frame` corresponding to +that key. The groups are chosen from `SparkDataFrame`s column(s). +The output of function should be a `data.frame`. Schema specifies the row format of the resulting +`SparkDataFrame`. It must represent R function's output schema on the basis of Spark [data types](#data-type-mapping-between-r-and-spark). The column names of the returned `data.frame` are set by user. + +
    +{% highlight r %} + +# Determine six waiting times with the largest eruption time in minutes. +schema <- structType(structField("waiting", "double"), structField("max_eruption", "double")) +result <- gapply( + df, + "waiting", + function(key, x) { + y <- data.frame(key, max(x$eruptions)) + }, + schema) +head(collect(arrange(result, "max_eruption", decreasing = TRUE))) + +## waiting max_eruption +##1 64 5.100 +##2 69 5.067 +##3 71 5.033 +##4 87 5.000 +##5 63 4.933 +##6 89 4.900 +{% endhighlight %} +
    + +##### gapplyCollect +Like `gapply`, applies a function to each partition of a `SparkDataFrame` and collect the result back to R data.frame. The output of the function should be a `data.frame`. But, the schema is not required to be passed. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. + +
    +{% highlight r %} + +# Determine six waiting times with the largest eruption time in minutes. +result <- gapplyCollect( + df, + "waiting", + function(key, x) { + y <- data.frame(key, max(x$eruptions)) + colnames(y) <- c("waiting", "max_eruption") + y + }) +head(result[order(result$max_eruption, decreasing = TRUE), ]) + +## waiting max_eruption +##1 64 5.100 +##2 69 5.067 +##3 71 5.033 +##4 87 5.000 +##5 63 4.933 +##6 89 4.900 + +{% endhighlight %} +
    + +#### Data type mapping between R and Spark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    RSpark
    bytebyte
    integerinteger
    floatfloat
    doubledouble
    numericdouble
    characterstring
    stringstring
    binarybinary
    rawbinary
    logicalboolean
    POSIXcttimestamp
    POSIXlttimestamp
    Datedate
    arrayarray
    listarray
    envmap
    + #### Run local R functions distributed using `spark.lapply` ##### spark.lapply @@ -313,9 +452,9 @@ Similar to `lapply` in native R, `spark.lapply` runs a function over a list of e Applies a function in a manner that is similar to `doParallel` or `lapply` to elements of a list. The results of all the computations should fit in a single machine. If that is not the case they can do something like `df <- createDataFrame(list)` and then use `dapply` +
    {% highlight r %} - # Perform distributed training of multiple models with spark.lapply. Here, we pass # a read-only list of arguments which specifies family the generalized linear model should be. families <- c("gaussian", "poisson") @@ -436,4 +575,3 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma - The method `registerTempTable` has been deprecated to be replaced by `createOrReplaceTempView`. - The method `dropTempTable` has been deprecated to be replaced by `dropTempView`. - The `sc` SparkContext parameter is no longer required for these functions: `setJobGroup`, `clearJobGroup`, `cancelJobGroup` - diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index e838a13af72d..ad123d7ceaf2 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -65,21 +65,21 @@ Throughout this document, we will often refer to Scala/Java Datasets of `Row`s a The entry point into all functionality in Spark is the [`SparkSession`](api/scala/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder()`: -{% include_example init_session scala/org/apache/spark/examples/sql/RDDRelation.scala %} +{% include_example init_session scala/org/apache/spark/examples/sql/SparkSQLExample.scala %}
    The entry point into all functionality in Spark is the [`SparkSession`](api/java/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder()`: -{% include_example init_session java/org/apache/spark/examples/sql/JavaSparkSQL.java %} +{% include_example init_session java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %}
    The entry point into all functionality in Spark is the [`SparkSession`](api/python/pyspark.sql.html#pyspark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.builder`: -{% include_example init_session python/sql.py %} +{% include_example init_session python/sql/basic.py %}
    @@ -105,14 +105,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight scala %} -val spark: SparkSession // An existing SparkSession. -val df = spark.read.json("examples/src/main/resources/people.json") - -// Displays the content of the DataFrame to stdout -df.show() -{% endhighlight %} - +{% include_example create_df scala/org/apache/spark/examples/sql/SparkSQLExample.scala %}
    @@ -121,14 +114,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight java %} -SparkSession spark = ...; // An existing SparkSession. -Dataset df = spark.read().json("examples/src/main/resources/people.json"); - -// Displays the content of the DataFrame to stdout -df.show(); -{% endhighlight %} - +{% include_example create_df java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %}
    @@ -137,14 +123,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight python %} -# spark is an existing SparkSession -df = spark.read.json("examples/src/main/resources/people.json") - -# Displays the content of the DataFrame to stdout -df.show() -{% endhighlight %} - +{% include_example create_df python/sql/basic.py %}
    @@ -169,110 +148,20 @@ Here we include some basic examples of structured data processing using Datasets
    -{% highlight scala %} -val spark: SparkSession // An existing SparkSession - -// Create the DataFrame -val df = spark.read.json("examples/src/main/resources/people.json") - -// Show the content of the DataFrame -df.show() -// age name -// null Michael -// 30 Andy -// 19 Justin - -// Print the schema in a tree format -df.printSchema() -// root -// |-- age: long (nullable = true) -// |-- name: string (nullable = true) - -// Select only the "name" column -df.select("name").show() -// name -// Michael -// Andy -// Justin - -// Select everybody, but increment the age by 1 -df.select(df("name"), df("age") + 1).show() -// name (age + 1) -// Michael null -// Andy 31 -// Justin 20 - -// Select people older than 21 -df.filter(df("age") > 21).show() -// age name -// 30 Andy - -// Count people by age -df.groupBy("age").count().show() -// age count -// null 1 -// 19 1 -// 30 1 -{% endhighlight %} +{% include_example untyped_ops scala/org/apache/spark/examples/sql/SparkSQLExample.scala %} For a complete list of the types of operations that can be performed on a Dataset refer to the [API Documentation](api/scala/index.html#org.apache.spark.sql.Dataset). In addition to simple column references and expressions, Datasets also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/scala/index.html#org.apache.spark.sql.functions$). - -
    -{% highlight java %} -SparkSession spark = ...; // An existing SparkSession - -// Create the DataFrame -Dataset df = spark.read().json("examples/src/main/resources/people.json"); - -// Show the content of the DataFrame -df.show(); -// age name -// null Michael -// 30 Andy -// 19 Justin - -// Print the schema in a tree format -df.printSchema(); -// root -// |-- age: long (nullable = true) -// |-- name: string (nullable = true) - -// Select only the "name" column -df.select("name").show(); -// name -// Michael -// Andy -// Justin - -// Select everybody, but increment the age by 1 -df.select(df.col("name"), df.col("age").plus(1)).show(); -// name (age + 1) -// Michael null -// Andy 31 -// Justin 20 - -// Select people older than 21 -df.filter(df.col("age").gt(21)).show(); -// age name -// 30 Andy - -// Count people by age -df.groupBy("age").count().show(); -// age count -// null 1 -// 19 1 -// 30 1 -{% endhighlight %} + +{% include_example untyped_ops java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %} For a complete list of the types of operations that can be performed on a Dataset refer to the [API Documentation](api/java/org/apache/spark/sql/Dataset.html). In addition to simple column references and expressions, Datasets also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/java/org/apache/spark/sql/functions.html). -
    @@ -282,53 +171,7 @@ interactive data exploration, users are highly encouraged to use the latter form, which is future proof and won't break with column names that are also attributes on the DataFrame class. -{% highlight python %} -# spark is an existing SparkSession - -# Create the DataFrame -df = spark.read.json("examples/src/main/resources/people.json") - -# Show the content of the DataFrame -df.show() -## age name -## null Michael -## 30 Andy -## 19 Justin - -# Print the schema in a tree format -df.printSchema() -## root -## |-- age: long (nullable = true) -## |-- name: string (nullable = true) - -# Select only the "name" column -df.select("name").show() -## name -## Michael -## Andy -## Justin - -# Select everybody, but increment the age by 1 -df.select(df['name'], df['age'] + 1).show() -## name (age + 1) -## Michael null -## Andy 31 -## Justin 20 - -# Select people older than 21 -df.filter(df['age'] > 21).show() -## age name -## 30 Andy - -# Count people by age -df.groupBy("age").count().show() -## age count -## null 1 -## 19 1 -## 30 1 - -{% endhighlight %} - +{% include_example untyped_ops python/sql/basic.py %} For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/python/pyspark.sql.html#pyspark.sql.DataFrame). In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/python/pyspark.sql.html#module-pyspark.sql.functions). @@ -353,28 +196,19 @@ In addition to simple column references and expressions, DataFrames also have a
    The `sql` function on a `SparkSession` enables applications to run SQL queries programmatically and returns the result as a `DataFrame`. -{% highlight scala %} -val spark = ... // An existing SparkSession -val df = spark.sql("SELECT * FROM table") -{% endhighlight %} +{% include_example run_sql scala/org/apache/spark/examples/sql/SparkSQLExample.scala %}
    The `sql` function on a `SparkSession` enables applications to run SQL queries programmatically and returns the result as a `Dataset`. -{% highlight java %} -SparkSession spark = ... // An existing SparkSession -Dataset df = spark.sql("SELECT * FROM table") -{% endhighlight %} +{% include_example run_sql java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %}
    The `sql` function on a `SparkSession` enables applications to run SQL queries programmatically and returns the result as a `DataFrame`. -{% highlight python %} -# spark is an existing SparkSession -df = spark.sql("SELECT * FROM table") -{% endhighlight %} +{% include_example run_sql python/sql/basic.py %}
    @@ -397,53 +231,11 @@ the bytes back into an object.
    - -{% highlight scala %} -// Encoders for most common types are automatically provided by importing spark.implicits._ -val ds = Seq(1, 2, 3).toDS() -ds.map(_ + 1).collect() // Returns: Array(2, 3, 4) - -// Encoders are also created for case classes. -case class Person(name: String, age: Long) -val ds = Seq(Person("Andy", 32)).toDS() - -// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name. -val path = "examples/src/main/resources/people.json" -val people = spark.read.json(path).as[Person] - -{% endhighlight %} - +{% include_example create_ds scala/org/apache/spark/examples/sql/SparkSQLExample.scala %}
    - -{% highlight java %} -SparkSession spark = ... // An existing SparkSession - -// Encoders for most common types are provided in class Encoders. -Dataset ds = spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT()); -ds.map(new MapFunction() { - @Override - public Integer call(Integer value) throws Exception { - return value + 1; - } -}, Encoders.INT()); // Returns: [2, 3, 4] - -Person person = new Person(); -person.setName("Andy"); -person.setAge(32); - -// Encoders are also created for Java beans. -Dataset ds = spark.createDataset( - Collections.singletonList(person), - Encoders.bean(Person.class) -); - -// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name. -String path = "examples/src/main/resources/people.json"; -Dataset people = spark.read().json(path).as(Encoders.bean(Person.class)); -{% endhighlight %} - +{% include_example create_ds java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %}
    @@ -470,38 +262,7 @@ reflection and become the names of the columns. Case classes can also be nested types such as `Seq`s or `Array`s. This RDD can be implicitly converted to a DataFrame and then be registered as a table. Tables can be used in subsequent SQL statements. -{% highlight scala %} -val spark: SparkSession // An existing SparkSession -// this is used to implicitly convert an RDD to a DataFrame. -import spark.implicits._ - -// Define the schema using a case class. -// Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit, -// you can use custom classes that implement the Product interface. -case class Person(name: String, age: Int) - -// Create an RDD of Person objects and register it as a temporary view. -val people = sc - .textFile("examples/src/main/resources/people.txt") - .map(_.split(",")) - .map(p => Person(p(0), p(1).trim.toInt)) - .toDF() -people.createOrReplaceTempView("people") - -// SQL statements can be run by using the sql methods provided by spark. -val teenagers = spark.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19") - -// The columns of a row in the result can be accessed by field index: -teenagers.map(t => "Name: " + t(0)).collect().foreach(println) - -// or by field name: -teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println) - -// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T] -teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println) -// Map("name" -> "Justin", "age" -> 19) -{% endhighlight %} - +{% include_example schema_inferring scala/org/apache/spark/examples/sql/SparkSQLExample.scala %}
    @@ -513,68 +274,7 @@ does not support JavaBeans that contain `Map` field(s). Nested JavaBeans and `Li fields are supported though. You can create a JavaBean by creating a class that implements Serializable and has getters and setters for all of its fields. -{% highlight java %} - -public static class Person implements Serializable { - private String name; - private int age; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public int getAge() { - return age; - } - - public void setAge(int age) { - this.age = age; - } -} - -{% endhighlight %} - - -A schema can be applied to an existing RDD by calling `createDataFrame` and providing the Class object -for the JavaBean. - -{% highlight java %} -SparkSession spark = ...; // An existing SparkSession - -// Load a text file and convert each line to a JavaBean. -JavaRDD people = spark.sparkContext.textFile("examples/src/main/resources/people.txt").map( - new Function() { - public Person call(String line) throws Exception { - String[] parts = line.split(","); - - Person person = new Person(); - person.setName(parts[0]); - person.setAge(Integer.parseInt(parts[1].trim())); - - return person; - } - }); - -// Apply a schema to an RDD of JavaBeans and register it as a table. -Dataset schemaPeople = spark.createDataFrame(people, Person.class); -schemaPeople.createOrReplaceTempView("people"); - -// SQL can be run over RDDs that have been registered as tables. -Dataset teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - -// The columns of a row in the result can be accessed by ordinal. -List teenagerNames = teenagers.map(new MapFunction() { - public String call(Row row) { - return "Name: " + row.getString(0); - } -}).collectAsList(); - -{% endhighlight %} - +{% include_example schema_inferring java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %}
    @@ -583,29 +283,7 @@ Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the dataty key/value pairs as kwargs to the Row class. The keys of this list define the column names of the table, and the types are inferred by sampling the whole datase, similar to the inference that is performed on JSON files. -{% highlight python %} -# spark is an existing SparkSession. -from pyspark.sql import Row -sc = spark.sparkContext - -# Load a text file and convert each line to a Row. -lines = sc.textFile("examples/src/main/resources/people.txt") -parts = lines.map(lambda l: l.split(",")) -people = parts.map(lambda p: Row(name=p[0], age=int(p[1]))) - -# Infer the schema, and register the DataFrame as a table. -schemaPeople = spark.createDataFrame(people) -schemaPeople.createOrReplaceTempView("people") - -# SQL can be run over DataFrames that have been registered as a table. -teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - -# The results of SQL queries are RDDs and support all the normal RDD operations. -teenNames = teenagers.map(lambda p: "Name: " + p.name) -for teenName in teenNames.collect(): - print(teenName) -{% endhighlight %} - +{% include_example schema_inferring python/sql/basic.py %}
    @@ -628,43 +306,8 @@ a `DataFrame` can be created programmatically with three steps. by `SparkSession`. For example: -{% highlight scala %} -val spark: SparkSession // An existing SparkSession - -// Create an RDD -val people = sc.textFile("examples/src/main/resources/people.txt") - -// The schema is encoded in a string -val schemaString = "name age" - -// Import Row. -import org.apache.spark.sql.Row; - -// Import Spark SQL data types -import org.apache.spark.sql.types.{StructType, StructField, StringType}; - -// Generate the schema based on the string of schema -val schema = StructType(schemaString.split(" ").map { fieldName => - StructField(fieldName, StringType, true) -}) - -// Convert records of the RDD (people) to Rows. -val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim)) - -// Apply the schema to the RDD. -val peopleDataFrame = spark.createDataFrame(rowRDD, schema) - -// Creates a temporary view using the DataFrame. -peopleDataFrame.createOrReplaceTempView("people") - -// SQL statements can be run by using the sql methods provided by spark. -val results = spark.sql("SELECT name FROM people") - -// The columns of a row in the result can be accessed by field index or by field name. -results.map(t => "Name: " + t(0)).collect().foreach(println) -{% endhighlight %} - +{% include_example programmatic_schema scala/org/apache/spark/examples/sql/SparkSQLExample.scala %}
    @@ -681,62 +324,8 @@ a `Dataset` can be created programmatically with three steps. by `SparkSession`. For example: -{% highlight java %} -import org.apache.spark.api.java.function.Function; -// Import factory methods provided by DataTypes. -import org.apache.spark.sql.types.DataTypes; -// Import StructType and StructField -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.StructField; -// Import Row. -import org.apache.spark.sql.Row; -// Import RowFactory. -import org.apache.spark.sql.RowFactory; - -SparkSession spark = ...; // An existing SparkSession. -JavaSparkContext sc = spark.sparkContext - -// Load a text file and convert each line to a JavaBean. -JavaRDD people = sc.textFile("examples/src/main/resources/people.txt"); - -// The schema is encoded in a string -String schemaString = "name age"; - -// Generate the schema based on the string of schema -List fields = new ArrayList<>(); -for (String fieldName: schemaString.split(" ")) { - fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true)); -} -StructType schema = DataTypes.createStructType(fields); - -// Convert records of the RDD (people) to Rows. -JavaRDD rowRDD = people.map( - new Function() { - public Row call(String record) throws Exception { - String[] fields = record.split(","); - return RowFactory.create(fields[0], fields[1].trim()); - } - }); - -// Apply the schema to the RDD. -Dataset peopleDataFrame = spark.createDataFrame(rowRDD, schema); - -// Creates a temporary view using the DataFrame. -peopleDataFrame.createOrReplaceTempView("people"); - -// SQL can be run over a temporary view created using DataFrames. -Dataset results = spark.sql("SELECT name FROM people"); - -// The results of SQL queries are DataFrames and support all the normal RDD operations. -// The columns of a row in the result can be accessed by ordinal. -List names = results.javaRDD().map(new Function() { - public String call(Row row) { - return "Name: " + row.getString(0); - } -}).collect(); - -{% endhighlight %} +{% include_example programmatic_schema java/org/apache/spark/examples/sql/JavaSparkSQLExample.java %}
    @@ -752,39 +341,8 @@ tuples or lists in the RDD created in the step 1. 3. Apply the schema to the RDD via `createDataFrame` method provided by `SparkSession`. For example: -{% highlight python %} -# Import SparkSession and data types -from pyspark.sql.types import * - -# spark is an existing SparkSession. -sc = spark.sparkContext - -# Load a text file and convert each line to a tuple. -lines = sc.textFile("examples/src/main/resources/people.txt") -parts = lines.map(lambda l: l.split(",")) -people = parts.map(lambda p: (p[0], p[1].strip())) - -# The schema is encoded in a string. -schemaString = "name age" - -fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()] -schema = StructType(fields) - -# Apply the schema to the RDD. -schemaPeople = spark.createDataFrame(people, schema) - -# Creates a temporary view using the DataFrame -schemaPeople.createOrReplaceTempView("people") - -# SQL can be run over DataFrames that have been registered as a table. -results = spark.sql("SELECT name FROM people") - -# The results of SQL queries are RDDs and support all the normal RDD operations. -names = results.map(lambda p: "Name: " + p.name) -for name in names.collect(): - print(name) -{% endhighlight %} +{% include_example programmatic_schema python/sql/basic.py %}
    @@ -805,34 +363,16 @@ In the simplest form, the default data source (`parquet` unless otherwise config
    - -{% highlight scala %} -val df = spark.read.load("examples/src/main/resources/users.parquet") -df.select("name", "favorite_color").write.save("namesAndFavColors.parquet") -{% endhighlight %} - +{% include_example generic_load_save_functions scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
    - -{% highlight java %} - -Dataset df = spark.read().load("examples/src/main/resources/users.parquet"); -df.select("name", "favorite_color").write().save("namesAndFavColors.parquet"); - -{% endhighlight %} - +{% include_example generic_load_save_functions java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
    -{% highlight python %} - -df = spark.read.load("examples/src/main/resources/users.parquet") -df.select("name", "favorite_color").write.save("namesAndFavColors.parquet") - -{% endhighlight %} - +{% include_example generic_load_save_functions python/sql/datasource.py %}
    @@ -852,34 +392,16 @@ using this syntax.
    - -{% highlight scala %} -val df = spark.read.format("json").load("examples/src/main/resources/people.json") -df.select("name", "age").write.format("parquet").save("namesAndAges.parquet") -{% endhighlight %} - +{% include_example manual_load_options scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
    - -{% highlight java %} - -Dataset df = spark.read().format("json").load("examples/src/main/resources/people.json"); -df.select("name", "age").write().format("parquet").save("namesAndAges.parquet"); - -{% endhighlight %} - +{% include_example manual_load_options java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
    -{% highlight python %} - -df = spark.read.load("examples/src/main/resources/people.json", format="json") -df.select("name", "age").write.save("namesAndAges.parquet", format="parquet") - -{% endhighlight %} - +{% include_example manual_load_options python/sql/datasource.py %}
    @@ -895,26 +417,16 @@ file directly with SQL.
    - -{% highlight scala %} -val df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") -{% endhighlight %} - +{% include_example direct_sql scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
    - -{% highlight java %} -Dataset df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); -{% endhighlight %} +{% include_example direct_sql java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
    -{% highlight python %} -df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") -{% endhighlight %} - +{% include_example direct_sql python/sql/datasource.py %}
    @@ -997,77 +509,16 @@ Using the data from the above example:
    - -{% highlight scala %} -// spark from the previous example is used in this example. -// This is used to implicitly convert an RDD to a DataFrame. -import spark.implicits._ - -val people: RDD[Person] = ... // An RDD of case class objects, from the previous example. - -// The RDD is implicitly converted to a DataFrame by implicits, allowing it to be stored using Parquet. -people.write.parquet("people.parquet") - -// Read in the parquet file created above. Parquet files are self-describing so the schema is preserved. -// The result of loading a Parquet file is also a DataFrame. -val parquetFile = spark.read.parquet("people.parquet") - -// Parquet files can also be used to create a temporary view and then used in SQL statements. -parquetFile.createOrReplaceTempView("parquetFile") -val teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") -teenagers.map(t => "Name: " + t(0)).collect().foreach(println) -{% endhighlight %} - +{% include_example basic_parquet_example scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
    - -{% highlight java %} -// spark from the previous example is used in this example. - -Dataset schemaPeople = ... // The DataFrame from the previous example. - -// DataFrames can be saved as Parquet files, maintaining the schema information. -schemaPeople.write().parquet("people.parquet"); - -// Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. -// The result of loading a parquet file is also a DataFrame. -Dataset parquetFile = spark.read().parquet("people.parquet"); - -// Parquet files can also be used to create a temporary view and then used in SQL statements. -parquetFile.createOrReplaceTempView("parquetFile"); -Dataset teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); -List teenagerNames = teenagers.javaRDD().map(new Function() { - public String call(Row row) { - return "Name: " + row.getString(0); - } -}).collect(); -{% endhighlight %} - +{% include_example basic_parquet_example java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
    -{% highlight python %} -# spark from the previous example is used in this example. - -schemaPeople # The DataFrame from the previous example. - -# DataFrames can be saved as Parquet files, maintaining the schema information. -schemaPeople.write.parquet("people.parquet") - -# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. -# The result of loading a parquet file is also a DataFrame. -parquetFile = spark.read.parquet("people.parquet") - -# Parquet files can also be used to create a temporary view and then used in SQL statements. -parquetFile.createOrReplaceTempView("parquetFile"); -teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") -teenNames = teenagers.map(lambda p: "Name: " + p.name) -for teenName in teenNames.collect(): - print(teenName) -{% endhighlight %} - +{% include_example basic_parquet_example python/sql/datasource.py %}
    @@ -1172,65 +623,16 @@ turned it off by default starting from 1.5.0. You may enable it by
    +{% include_example schema_merging scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} +
    -{% highlight scala %} -// spark from the previous example is used in this example. -// This is used to implicitly convert an RDD to a DataFrame. -import spark.implicits._ - -// Create a simple DataFrame, stored into a partition directory -val df1 = sc.makeRDD(1 to 5).map(i => (i, i * 2)).toDF("single", "double") -df1.write.parquet("data/test_table/key=1") - -// Create another DataFrame in a new partition directory, -// adding a new column and dropping an existing column -val df2 = sc.makeRDD(6 to 10).map(i => (i, i * 3)).toDF("single", "triple") -df2.write.parquet("data/test_table/key=2") - -// Read the partitioned table -val df3 = spark.read.option("mergeSchema", "true").parquet("data/test_table") -df3.printSchema() - -// The final schema consists of all 3 columns in the Parquet files together -// with the partitioning column appeared in the partition directory paths. -// root -// |-- single: int (nullable = true) -// |-- double: int (nullable = true) -// |-- triple: int (nullable = true) -// |-- key : int (nullable = true) -{% endhighlight %} - +
    +{% include_example schema_merging java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
    -{% highlight python %} -# spark from the previous example is used in this example. - -# Create a simple DataFrame, stored into a partition directory -df1 = spark.createDataFrame(sc.parallelize(range(1, 6))\ - .map(lambda i: Row(single=i, double=i * 2))) -df1.write.parquet("data/test_table/key=1") - -# Create another DataFrame in a new partition directory, -# adding a new column and dropping an existing column -df2 = spark.createDataFrame(sc.parallelize(range(6, 11)) - .map(lambda i: Row(single=i, triple=i * 3))) -df2.write.parquet("data/test_table/key=2") - -# Read the partitioned table -df3 = spark.read.option("mergeSchema", "true").parquet("data/test_table") -df3.printSchema() - -# The final schema consists of all 3 columns in the Parquet files together -# with the partitioning column appeared in the partition directory paths. -# root -# |-- single: int (nullable = true) -# |-- double: int (nullable = true) -# |-- triple: int (nullable = true) -# |-- key : int (nullable = true) -{% endhighlight %} - +{% include_example schema_merging python/sql/datasource.py %}
    @@ -1280,8 +682,8 @@ metadata.
    {% highlight scala %} -// spark is an existing HiveContext -spark.refreshTable("my_table") +// spark is an existing SparkSession +spark.catalog.refreshTable("my_table") {% endhighlight %}
    @@ -1289,8 +691,8 @@ spark.refreshTable("my_table")
    {% highlight java %} -// spark is an existing HiveContext -spark.refreshTable("my_table") +// spark is an existing SparkSession +spark.catalog().refreshTable("my_table"); {% endhighlight %}
    @@ -1298,8 +700,8 @@ spark.refreshTable("my_table")
    {% highlight python %} -# spark is an existing HiveContext -spark.refreshTable("my_table") +# spark is an existing SparkSession +spark.catalog.refreshTable("my_table") {% endhighlight %}
    @@ -1402,33 +804,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. -{% highlight scala %} -val spark: SparkSession // An existing SparkSession - -// A JSON dataset is pointed to by path. -// The path can be either a single text file or a directory storing text files. -val path = "examples/src/main/resources/people.json" -val people = spark.read.json(path) - -// The inferred schema can be visualized using the printSchema() method. -people.printSchema() -// root -// |-- age: long (nullable = true) -// |-- name: string (nullable = true) - -// Creates a temporary view using the DataFrame -people.createOrReplaceTempView("people") - -// SQL statements can be run by using the sql methods provided by spark. -val teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - -// Alternatively, a DataFrame can be created for a JSON dataset represented by -// an RDD[String] storing one JSON object per string. -val anotherPeopleRDD = sc.parallelize( - """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil) -val anotherPeople = spark.read.json(anotherPeopleRDD) -{% endhighlight %} - +{% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
    @@ -1440,33 +816,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. -{% highlight java %} -// sc is an existing JavaSparkContext. -SparkSession spark = new org.apache.spark.sql.SparkSession(sc); - -// A JSON dataset is pointed to by path. -// The path can be either a single text file or a directory storing text files. -Dataset people = spark.read().json("examples/src/main/resources/people.json"); - -// The inferred schema can be visualized using the printSchema() method. -people.printSchema(); -// root -// |-- age: long (nullable = true) -// |-- name: string (nullable = true) - -// Creates a temporary view using the DataFrame -people.createOrReplaceTempView("people"); - -// SQL statements can be run by using the sql methods provided by spark. -Dataset teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); - -// Alternatively, a DataFrame can be created for a JSON dataset represented by -// an RDD[String] storing one JSON object per string. -List jsonData = Arrays.asList( - "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); -JavaRDD anotherPeopleRDD = sc.parallelize(jsonData); -Dataset anotherPeople = spark.read().json(anotherPeopleRDD); -{% endhighlight %} +{% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
    @@ -1477,31 +827,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. -{% highlight python %} -# spark is an existing SparkSession. - -# A JSON dataset is pointed to by path. -# The path can be either a single text file or a directory storing text files. -people = spark.read.json("examples/src/main/resources/people.json") - -# The inferred schema can be visualized using the printSchema() method. -people.printSchema() -# root -# |-- age: long (nullable = true) -# |-- name: string (nullable = true) - -# Creates a temporary view using the DataFrame. -people.createOrReplaceTempView("people") - -# SQL statements can be run by using the sql methods provided by `spark`. -teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - -# Alternatively, a DataFrame can be created for a JSON dataset represented by -# an RDD[String] storing one JSON object per string. -anotherPeopleRDD = sc.parallelize([ - '{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']) -anotherPeople = spark.jsonRDD(anotherPeopleRDD) -{% endhighlight %} +{% include_example json_dataset python/sql/datasource.py %}
    @@ -1561,18 +887,7 @@ the `hive.metastore.warehouse.dir` property in `hive-site.xml` is deprecated sin Instead, use `spark.sql.warehouse.dir` to specify the default location of database in warehouse. You may need to grant write privilege to the user who starts the spark application. -{% highlight scala %} -// warehouse_location points to the default location for managed databases and tables -val conf = new SparkConf().setAppName("HiveFromSpark").set("spark.sql.warehouse.dir", warehouse_location) -val spark = SparkSession.builder.config(conf).enableHiveSupport().getOrCreate() - -spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") -spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") - -// Queries are expressed in HiveQL -spark.sql("FROM src SELECT key, value").collect().foreach(println) -{% endhighlight %} - +{% include_example spark_hive scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala %}
    @@ -1587,17 +902,7 @@ the `hive.metastore.warehouse.dir` property in `hive-site.xml` is deprecated sin Instead, use `spark.sql.warehouse.dir` to specify the default location of database in warehouse. You may need to grant write privilege to the user who starts the spark application. -{% highlight java %} -SparkSession spark = SparkSession.builder().appName("JavaSparkSQL").getOrCreate(); - -spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)"); -spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"); - -// Queries are expressed in HiveQL. -List results = spark.sql("FROM src SELECT key, value").collectAsList(); - -{% endhighlight %} - +{% include_example spark_hive java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java %}
    @@ -1612,17 +917,7 @@ the `hive.metastore.warehouse.dir` property in `hive-site.xml` is deprecated sin Instead, use `spark.sql.warehouse.dir` to specify the default location of database in warehouse. You may need to grant write privilege to the user who starts the spark application. -{% highlight python %} -# spark is an existing SparkSession - -spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") -spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") - -# Queries can be expressed in HiveQL. -results = spark.sql("FROM src SELECT key, value").collect() - -{% endhighlight %} - +{% include_example spark_hive python/sql/hive.py %}
    @@ -2381,9 +1676,8 @@ Spark SQL and DataFrames support the following data types: All data types of Spark SQL are located in the package `org.apache.spark.sql.types`. You can access them by doing -{% highlight scala %} -import org.apache.spark.sql.types._ -{% endhighlight %} + +{% include_example data_types scala/org/apache/spark/examples/sql/SparkSQLExample.scala %} diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 2ee3b80185c2..de82a064d1d3 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -15,7 +15,7 @@ like Kafka, Flume, Kinesis, or TCP sockets, and can be processed using complex algorithms expressed with high-level functions like `map`, `reduce`, `join` and `window`. Finally, processed data can be pushed out to filesystems, databases, and live dashboards. In fact, you can apply Spark's -[machine learning](mllib-guide.html) and +[machine learning](ml-guide.html) and [graph processing](graphx-programming-guide.html) algorithms on data streams.

    @@ -1673,7 +1673,7 @@ See the [DataFrames and SQL](sql-programming-guide.html) guide to learn more abo *** ## MLlib Operations -You can also easily use machine learning algorithms provided by [MLlib](mllib-guide.html). First of all, there are streaming machine learning algorithms (e.g. [Streaming Linear Regression](mllib-linear-methods.html#streaming-linear-regression), [Streaming KMeans](mllib-clustering.html#streaming-k-means), etc.) which can simultaneously learn from the streaming data as well as apply the model on the streaming data. Beyond these, for a much larger class of machine learning algorithms, you can learn a learning model offline (i.e. using historical data) and then apply the model online on streaming data. See the [MLlib](mllib-guide.html) guide for more details. +You can also easily use machine learning algorithms provided by [MLlib](ml-guide.html). First of all, there are streaming machine learning algorithms (e.g. [Streaming Linear Regression](mllib-linear-methods.html#streaming-linear-regression), [Streaming KMeans](mllib-clustering.html#streaming-k-means), etc.) which can simultaneously learn from the streaming data as well as apply the model on the streaming data. Beyond these, for a much larger class of machine learning algorithms, you can learn a learning model offline (i.e. using historical data) and then apply the model online on streaming data. See the [MLlib](ml-guide.html) guide for more details. *** diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 79493968db27..8c14c3d220a2 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -22,14 +22,49 @@ Let’s say you want to maintain a running word count of text data received from

    +{% highlight scala %} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.SparkSession + +val spark = SparkSession + .builder + .appName("StructuredNetworkWordCount") + .getOrCreate() + +import spark.implicits._ +{% endhighlight %}
    +{% highlight java %} +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.streaming.StreamingQuery; + +import java.util.Arrays; +import java.util.Iterator; + +SparkSession spark = SparkSession + .builder() + .appName("JavaStructuredNetworkWordCount") + .getOrCreate(); +{% endhighlight %}
    +{% highlight python %} +from pyspark.sql import SparkSession +from pyspark.sql.functions import explode +from pyspark.sql.functions import split + +spark = SparkSession\ + .builder()\ + .appName("StructuredNetworkWordCount")\ + .getOrCreate() +{% endhighlight %} +
    @@ -38,18 +73,6 @@ Next, let’s create a streaming DataFrame that represents text data received fr
    -{% highlight scala %} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.SparkSession - -val spark = SparkSession - .builder - .appName("StructuredNetworkWordCount") - .getOrCreate() -{% endhighlight %} - -Next, let’s create a streaming DataFrame that represents text data received from a server listening on localhost:9999, and transform the DataFrame to calculate word counts. - {% highlight scala %} // Create DataFrame representing the stream of input lines from connection to localhost:9999 val lines = spark.readStream @@ -65,29 +88,11 @@ val words = lines.as[String].flatMap(_.split(" ")) val wordCounts = words.groupBy("value").count() {% endhighlight %} -This `lines` DataFrame represents an unbounded table containing the streaming text data. This table contains one column of strings named “value”, and each line in the streaming text data becomes a row in the table. Note, that this is not currently receiving any data as we are just setting up the transformation, and have not yet started it. Next, we have converted the DataFrame to a Dataset of String using `.as(Encoders.STRING())`, so that we can apply the `flatMap` operation to split each line into multiple words. The resultant `words` Dataset contains all the words. Finally, we have defined the `wordCounts` DataFrame by grouping by the unique values in the Dataset and counting them. Note that this is a streaming DataFrame which represents the running word counts of the stream. +This `lines` DataFrame represents an unbounded table containing the streaming text data. This table contains one column of strings named “value”, and each line in the streaming text data becomes a row in the table. Note, that this is not currently receiving any data as we are just setting up the transformation, and have not yet started it. Next, we have converted the DataFrame to a Dataset of String using `.as[String]`, so that we can apply the `flatMap` operation to split each line into multiple words. The resultant `words` Dataset contains all the words. Finally, we have defined the `wordCounts` DataFrame by grouping by the unique values in the Dataset and counting them. Note that this is a streaming DataFrame which represents the running word counts of the stream.
    -{% highlight java %} -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.sql.*; -import org.apache.spark.sql.streaming.StreamingQuery; - -import java.util.Arrays; -import java.util.Iterator; - -SparkSession spark = SparkSession - .builder() - .appName("JavaStructuredNetworkWordCount") - .getOrCreate(); - -import spark.implicits._ -{% endhighlight %} - -Next, let’s create a streaming DataFrame that represents text data received from a server listening on localhost:9999, and transform the DataFrame to calculate word counts. - {% highlight java %} // Create DataFrame representing the stream of input lines from connection to localhost:9999 Dataset lines = spark @@ -117,19 +122,6 @@ This `lines` DataFrame represents an unbounded table containing the streaming te
    -{% highlight python %} -from pyspark.sql import SparkSession -from pyspark.sql.functions import explode -from pyspark.sql.functions import split - -spark = SparkSession\ - .builder()\ - .appName("StructuredNetworkWordCount")\ - .getOrCreate() -{% endhighlight %} - -Next, let’s create a streaming DataFrame that represents text data received from a server listening on localhost:9999, and transform the DataFrame to calculate word counts. - {% highlight python %} # Create DataFrame representing the stream of input lines from connection to localhost:9999 lines = spark\ @@ -223,7 +215,7 @@ $ ./bin/run-example org.apache.spark.examples.sql.streaming.JavaStructuredNetwor {% endhighlight %}
    - {% highlight bash %} +{% highlight bash %} $ ./bin/spark-submit examples/src/main/python/sql/streaming/structured_network_wordcount.py localhost 9999 {% endhighlight %}
    @@ -389,7 +381,7 @@ The “Output” is defined as what gets written out to the external storage. Th Note that each mode is applicable on certain types of queries. This is discussed in detail [later](#output-modes). To illustrate the use of this model, let’s understand the model in context of -the Quick Example above. The first `lines` DataFrame is the input table, and +the [Quick Example](#quick-example) above. The first `lines` DataFrame is the input table, and the final `wordCounts` DataFrame is the result table. Note that the query on streaming `lines` DataFrame to generate `wordCounts` is *exactly the same* as it would be a static DataFrame. However, when this query is started, Spark @@ -410,15 +402,14 @@ see how this model handles event-time based processing and late arriving data. ## Handling Event-time and Late Data Event-time is the time embedded in the data itself. For many applications, you may want to operate on this event-time. For example, if you want to get the number of events generated by IoT devices every minute, then you probably want to use the time when the data was generated (that is, event-time in the data), rather than the time Spark receives them. This event-time is very naturally expressed in this model -- each event from the devices is a row in the table, and event-time is a column value in the row. This allows window-based aggregations (e.g. number of event every minute) to be just a special type of grouping and aggregation on the even-time column -- each time window is a group and each row can belong to multiple windows/groups. Therefore, such event-time-window-based aggregation queries can be defined consistently on both a static dataset (e.g. from collected device events logs) as well as on a data stream, making the life of the user much easier. -Furthermore this model naturally handles data that has arrived later than expected based on its event-time. Since Spark is updating the Result Table, it has full control over updating/cleaning up the aggregates when there is late data. While not yet implemented in Spark 2.0, event-time watermarking will be used to manage this data. These are explained later in more details in the [Window Operations](#window-operations-on-event-time) section. +Furthermore, this model naturally handles data that has arrived later than expected based on its event-time. Since Spark is updating the Result Table, it has full control over updating/cleaning up the aggregates when there is late data. While not yet implemented in Spark 2.0, event-time watermarking will be used to manage this data. These are explained later in more details in the [Window Operations](#window-operations-on-event-time) section. ## Fault Tolerance Semantics Delivering end-to-end exactly-once semantics was one of key goals behind the design of Structured Streaming. To achieve that, we have designed the Structured Streaming sources, the sinks and the execution engine to reliably track the exact progress of the processing so that it can handle any kind of failure by restarting and/or reprocessing. Every streaming source is assumed to have offsets (similar to Kafka offsets, or Kinesis sequence numbers) to track the read position in the stream. The engine uses checkpointing and write ahead logs to record the offset range of the data being processed in each trigger. The streaming sinks are designed to be idempotent for handling reprocessing. Together, using replayable sources and idempotant sinks, Structured Streaming can ensure **end-to-end exactly-once semantics** under any failure. # API using Datasets and DataFrames -Since Spark 2.0, DataFrames and Datasets can represent static, bounded data, as well as streaming, unbounded data. Similar to static Datasets/DataFrames, you can use the common entry point `SparkSession` ( -[Scala](api/scala/index.html#org.apache.spark.sql.SparkSession)/ +Since Spark 2.0, DataFrames and Datasets can represent static, bounded data, as well as streaming, unbounded data. Similar to static Datasets/DataFrames, you can use the common entry point `SparkSession` ([Scala](api/scala/index.html#org.apache.spark.sql.SparkSession)/ [Java](api/java/org/apache/spark/sql/SparkSession.html)/ [Python](api/python/pyspark.sql.html#pyspark.sql.SparkSession) docs) to create streaming DataFrames/Datasets from streaming sources, and apply the same operations on them as static DataFrames/Datasets. If you are not familiar with Datasets/DataFrames, you are strongly advised to familiarize yourself with them using the [DataFrame/Dataset Programming Guide](sql-programming-guide.html). @@ -427,9 +418,9 @@ Since Spark 2.0, DataFrames and Datasets can represent static, bounded data, as Streaming DataFrames can be created through the `DataStreamReader` interface ([Scala](api/scala/index.html#org.apache.spark.sql.streaming.DataStreamReader)/ [Java](api/java/org/apache/spark/sql/streaming/DataStreamReader.html)/ -[Python](api/python/pyspark.sql.html#pyspark.sql.streaming.DataStreamReader) docs) returned by `SparkSession.readStream()`. Similar to the read interface for creating static DataFrame, you can specify the details of the source - data format, schema, options, etc. In Spark 2.0, there are a few built-in sources. +[Python](api/python/pyspark.sql.html#pyspark.sql.streaming.DataStreamReader) docs) returned by `SparkSession.readStream()`. Similar to the read interface for creating static DataFrame, you can specify the details of the source – data format, schema, options, etc. In Spark 2.0, there are a few built-in sources. - - **File sources** - Reads files written in a directory as a stream of data. Supported file formats are text, csv, json, parquet. See the docs of the DataStreamReader interface for a more up-to-date list, and supported options for each file format. Note that the files must be atomically placed in the given directory, which in most file systems, can be achieved by file move operations. + - **File source** - Reads files written in a directory as a stream of data. Supported file formats are text, csv, json, parquet. See the docs of the DataStreamReader interface for a more up-to-date list, and supported options for each file format. Note that the files must be atomically placed in the given directory, which in most file systems, can be achieved by file move operations. - **Socket source (for testing)** - Reads UTF8 text data from a socket connection. The listening server socket is at the driver. Note that this should be used only for testing as this does not provide end-to-end fault-tolerance guarantees. @@ -439,7 +430,7 @@ Here are some examples.
    {% highlight scala %} -val spark: SparkSession = … +val spark: SparkSession = ... // Read text from socket val socketDF = spark @@ -458,7 +449,7 @@ val userSchema = new StructType().add("name", "string").add("age", "integer") val csvDF = spark .readStream .option("sep", ";") - .schema(userSchema) // Specify schema of the parquet files + .schema(userSchema) // Specify schema of the csv files .csv("/path/to/directory") // Equivalent to format("csv").load("/path/to/directory") {% endhighlight %} @@ -485,7 +476,7 @@ StructType userSchema = new StructType().add("name", "string").add("age", "integ Dataset[Row] csvDF = spark .readStream() .option("sep", ";") - .schema(userSchema) // Specify schema of the parquet files + .schema(userSchema) // Specify schema of the csv files .csv("/path/to/directory"); // Equivalent to format("csv").load("/path/to/directory") {% endhighlight %} @@ -493,7 +484,7 @@ Dataset[Row] csvDF = spark
    {% highlight python %} -spark = SparkSession. …. +spark = SparkSession. ... # Read text from socket socketDF = spark \ @@ -519,10 +510,10 @@ csvDF = spark \
    -These examples generate streaming DataFrames that are untyped, meaning that the schema of the DataFrame is not checked at compile time, only checked at runtime when the query is submitted. Some operations like `map`, `flatMap`, etc. need the type to be known at compile time. To do those, you can convert these untyped streaming DataFrames to typed streaming Datasets using the same methods as static DataFrame. See the SQL Programming Guide for more details. Additionally, more details on the supported streaming sources are discussed later in the document. +These examples generate streaming DataFrames that are untyped, meaning that the schema of the DataFrame is not checked at compile time, only checked at runtime when the query is submitted. Some operations like `map`, `flatMap`, etc. need the type to be known at compile time. To do those, you can convert these untyped streaming DataFrames to typed streaming Datasets using the same methods as static DataFrame. See the [SQL Programming Guide](sql-programming-guide.html) for more details. Additionally, more details on the supported streaming sources are discussed later in the document. ## Operations on streaming DataFrames/Datasets -You can apply all kinds of operations on streaming DataFrames/Datasets - ranging from untyped, SQL-like operations (e.g. select, where, groupBy), to typed RDD-like operations (e.g. map, filter, flatMap). See the [SQL programming guide](sql-programming-guide.html) for more details. Let’s take a look at a few example operations that you can use. +You can apply all kinds of operations on streaming DataFrames/Datasets – ranging from untyped, SQL-like operations (e.g. `select`, `where`, `groupBy`), to typed RDD-like operations (e.g. `map`, `filter`, `flatMap`). See the [SQL programming guide](sql-programming-guide.html) for more details. Let’s take a look at a few example operations that you can use. ### Basic Operations - Selection, Projection, Aggregation Most of the common operations on DataFrame/Dataset are supported for streaming. The few operations that are not supported are [discussed later](#unsupported-operations) in this section. @@ -608,7 +599,7 @@ ds.groupByKey(new MapFunction() { // using typed API df = ... # streaming DataFrame with IOT device data with schema { device: string, type: string, signal: double, time: DateType } -# Select the devices which have signal more than 11 +# Select the devices which have signal more than 10 df.select("device").where("signal > 10") # Running count of the number of updates for each device type @@ -620,58 +611,55 @@ df.groupBy("type").count() ### Window Operations on Event Time Aggregations over a sliding event-time window are straightforward with Structured Streaming. The key idea to understand about window-based aggregations are very similar to grouped aggregations. In a grouped aggregation, aggregate values (e.g. counts) are maintained for each unique value in the user-specified grouping column. In case of window-based aggregations, aggregate values are maintained for each window the event-time of a row falls into. Let's understand this with an illustration. -Imagine our quick example is modified and the stream now contains lines along with the time when the line was generated. Instead of running word counts, we want to count words within 10 minute windows, updating every 5 minutes. That is, word counts in words received between 10 minute windows 12:00 - 12:10, 12:05 - 12:15, 12:10 - 12:20, etc. Note that 12:00 - 12:10 means data that arrived after 12:00 but before 12:10. Now, consider a word that was received at 12:07. This word should increment the counts corresponding to two windows 12:00 - 12:10 and 12:05 - 12:15. So the counts will be indexed by both, the grouping key (i.e. the word) and the window (can be calculated from the event-time). +Imagine our [quick example](#quick-example) is modified and the stream now contains lines along with the time when the line was generated. Instead of running word counts, we want to count words within 10 minute windows, updating every 5 minutes. That is, word counts in words received between 10 minute windows 12:00 - 12:10, 12:05 - 12:15, 12:10 - 12:20, etc. Note that 12:00 - 12:10 means data that arrived after 12:00 but before 12:10. Now, consider a word that was received at 12:07. This word should increment the counts corresponding to two windows 12:00 - 12:10 and 12:05 - 12:15. So the counts will be indexed by both, the grouping key (i.e. the word) and the window (can be calculated from the event-time). The result tables would look something like the following. ![Window Operations](img/structured-streaming-window.png) -Since this windowing is similar to grouping, in code, you can use `groupBy()` and `window()` operations to express windowed aggregations. +Since this windowing is similar to grouping, in code, you can use `groupBy()` and `window()` operations to express windowed aggregations. You can see the full code for the below examples in +[Scala]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache/spark/examples/sql/streaming/StructuredNetworkWordCountWindowed.scala)/ +[Java]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCountWindowed.java)/ +[Python]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py).
    {% highlight scala %} -// Number of events in every 1 minute time windows -df.groupBy(window(df.col("time"), "1 minute")) - .count() +import spark.implicits._ +val words = ... // streaming DataFrame of schema { timestamp: Timestamp, word: String } -// Average number of events for each device type in every 1 minute time windows -df.groupBy( - df.col("type"), - window(df.col("time"), "1 minute")) - .avg("signal") +// Group the data by window and word and compute the count of each group +val windowedCounts = words.groupBy( + window($"timestamp", "10 minutes", "5 minutes"), + $"word" +).count() {% endhighlight %}
    {% highlight java %} -import static org.apache.spark.sql.functions.window; - -// Number of events in every 1 minute time windows -df.groupBy(window(df.col("time"), "1 minute")) - .count(); - -// Average number of events for each device type in every 1 minute time windows -df.groupBy( - df.col("type"), - window(df.col("time"), "1 minute")) - .avg("signal"); +Dataset words = ... // streaming DataFrame of schema { timestamp: Timestamp, word: String } +// Group the data by window and word and compute the count of each group +Dataset windowedCounts = words.groupBy( + functions.window(words.col("timestamp"), "10 minutes", "5 minutes"), + words.col("word") +).count(); {% endhighlight %}
    {% highlight python %} -from pyspark.sql.functions import window - -# Number of events in every 1 minute time windows -df.groupBy(window("time", "1 minute")).count() +words = ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } -# Average number of events for each device type in every 1 minute time windows -df.groupBy("type", window("time", "1 minute")).avg("signal") +# Group the data by window and word and compute the count of each group +windowedCounts = words.groupBy( + window(words.timestamp, '10 minutes', '5 minutes'), + words.word +).count() {% endhighlight %}
    @@ -680,7 +668,7 @@ df.groupBy("type", window("time", "1 minute")).avg("signal") Now consider what happens if one of the events arrives late to the application. For example, a word that was generated at 12:04 but it was received at 12:11. -Since this windowing is based on the time in the data, the time 12:04 should be considered for windowing. This occurs naturally in our window-based grouping - the late data is automatically placed in the proper windows and the correct aggregates updated as illustrated below. +Since this windowing is based on the time in the data, the time 12:04 should be considered for windowing. This occurs naturally in our window-based grouping – the late data is automatically placed in the proper windows and the correct aggregates are updated as illustrated below. ![Handling Late Data](img/structured-streaming-late-data.png) @@ -714,8 +702,8 @@ streamingDf.join(staticDf, "type", "right_join"); // right outer join with a st
    {% highlight python %} -staticDf = spark.read. … -streamingDf = spark.readStream. … +staticDf = spark.read. ... +streamingDf = spark.readStream. ... streamingDf.join(staticDf, "type") # inner equi-join with a static DF streamingDf.join(staticDf, "type", "right_join") # right outer join with a static DF {% endhighlight %} @@ -744,7 +732,7 @@ However, note that all of the operations applicable on static DataFrames/Dataset - Any kind of joins between two streaming Datasets are not yet supported. -In addition, there are some Dataset methods that will not work on streaming Datasets. They are actions that will immediately run queries and return results, which does not makes sense on a streaming Dataset. Rather those functionalities can be done by explicitly starting a streaming query (see the next section regarding that). +In addition, there are some Dataset methods that will not work on streaming Datasets. They are actions that will immediately run queries and return results, which does not make sense on a streaming Dataset. Rather, those functionalities can be done by explicitly starting a streaming query (see the next section regarding that). - `count()` - Cannot return a single count from a streaming Dataset. Instead, use `ds.groupBy.count()` which returns a streaming Dataset containing a running count. @@ -756,10 +744,9 @@ If you try any of these operations, you will see an AnalysisException like "oper ## Starting Streaming Queries Once you have defined the final result DataFrame/Dataset, all that is left is for you start the streaming computation. To do that, you have to use the -`DataStreamWriter` ( -[Scala](api/scala/index.html#org.apache.spark.sql.streaming.DataStreamWriter)/ +`DataStreamWriter` ([Scala](api/scala/index.html#org.apache.spark.sql.streaming.DataStreamWriter)/ [Java](api/java/org/apache/spark/sql/streaming/DataStreamWriter.html)/ -[Python](api/python/pyspark.sql.html#pyspark.sql.streaming.DataStreamWriter) docs) returned through `Dataset.writeSteram()`. You will have to specify one or more of the following in this interface. +[Python](api/python/pyspark.sql.html#pyspark.sql.streaming.DataStreamWriter) docs) returned through `Dataset.writeStream()`. You will have to specify one or more of the following in this interface. - *Details of the output sink:* Data format, location, etc. @@ -769,12 +756,12 @@ Once you have defined the final result DataFrame/Dataset, all that is left is fo - *Trigger interval:* Optionally, specify the trigger interval. If it is not specified, the system will check for availability of new data as soon as the previous processing has completed. If a trigger time is missed because the previous processing has not completed, then the system will attempt to trigger at the next trigger point, not immediately after the processing has completed. -- *Checkpoint location:* For some output sinks where the end-to-end fault-tolerance can be guaranteed, specify the location where the system will write all the checkpoint information. This should be a directory in a HDFS-compatible fault-tolerant file system. The semantics of checkpointing is discussed in more detail in the next section. +- *Checkpoint location:* For some output sinks where the end-to-end fault-tolerance can be guaranteed, specify the location where the system will write all the checkpoint information. This should be a directory in an HDFS-compatible fault-tolerant file system. The semantics of checkpointing is discussed in more detail in the next section. #### Output Modes There are two types of output mode currently implemented. -- **Append mode (default)** - This is the default mode, where only the new rows added to the result table since the last trigger will be outputted to the sink. This is only applicable to queries that *do not have any aggregations* (e.g. queries with only select, where, map, flatMap, filter, join, etc.). +- **Append mode (default)** - This is the default mode, where only the new rows added to the result table since the last trigger will be outputted to the sink. This is only applicable to queries that *do not have any aggregations* (e.g. queries with only `select`, `where`, `map`, `flatMap`, `filter`, `join`, etc.). - **Complete mode** - The whole result table will be outputted to the sink.This is only applicable to queries that *have aggregations*. @@ -829,7 +816,7 @@ Here is a table of all the sinks, and the corresponding settings.
    -Finally, you have to call `start()` to actually to start the execution of the query. This returns a StreamingQuery object which is a handle to the continuously running execution. You can use this object to manage the query, which we will discuss in the next subsection. For now, let’s understand all this with a few examples. +Finally, you have to call `start()` to actually start the execution of the query. This returns a StreamingQuery object which is a handle to the continuously running execution. You can use this object to manage the query, which we will discuss in the next subsection. For now, let’s understand all this with a few examples.
    @@ -861,7 +848,7 @@ aggDF .format("console") .start() -// Have all the aggregates in an in memory table +// Have all the aggregates in an in-memory table aggDF .writeStream .queryName("aggregates") // this query name will be the table name @@ -877,7 +864,7 @@ spark.sql("select * from aggregates").show() // interactively query in-memory {% highlight java %} // ========== DF with no aggregations ========== -Dataset noAggDF = deviceDataDf.select("device").where("signal > 10") +Dataset noAggDF = deviceDataDf.select("device").where("signal > 10"); // Print new data to console noAggDF @@ -901,7 +888,7 @@ aggDF .format("console") .start(); -// Have all the aggregates in an in memory table +// Have all the aggregates in an in-memory table aggDF .writeStream() .queryName("aggregates") // this query name will be the table name @@ -957,7 +944,7 @@ spark.sql("select * from aggregates").show() # interactively query in-memory t #### Using Foreach The `foreach` operation allows arbitrary operations to be computed on the output data. As of Spark 2.0, this is available only for Scala and Java. To use this, you will have to implement the interface `ForeachWriter` ([Scala](api/scala/index.html#org.apache.spark.sql.ForeachWriter)/ -[Java](api/java/org/apache/spark/sql/ForeachWriter.html) docs), which has methods that gets called whenever there is a sequence of rows generated as output after a trigger. Note the following important points. +[Java](api/java/org/apache/spark/sql/ForeachWriter.html) docs), which has methods that get called whenever there is a sequence of rows generated as output after a trigger. Note the following important points. - The writer must be serializable, as it will be serialized and sent to the executors for execution. @@ -992,7 +979,7 @@ query.awaitTermination() // block until query is terminated, with stop() or wi query.exception() // the exception if the query has been terminated with error -query.souceStatus() // progress information about data has been read from the input sources +query.sourceStatus() // progress information about data has been read from the input sources query.sinkStatus() // progress information about data written to the output sink {% endhighlight %} @@ -1016,7 +1003,7 @@ query.awaitTermination(); // block until query is terminated, with stop() or w query.exception(); // the exception if the query has been terminated with error -query.souceStatus(); // progress information about data has been read from the input sources +query.sourceStatus(); // progress information about data has been read from the input sources query.sinkStatus(); // progress information about data written to the output sink @@ -1040,7 +1027,7 @@ query.awaitTermination() # block until query is terminated, with stop() or wit query.exception() # the exception if the query has been terminated with error -query.souceStatus() # progress information about data has been read from the input sources +query.sourceStatus() # progress information about data has been read from the input sources query.sinkStatus() # progress information about data written to the output sink @@ -1049,8 +1036,7 @@ query.sinkStatus() # progress information about data written to the output sin
    -You can start any number of queries in a single SparkSession. They will all be running concurrently sharing the cluster resources. You can use `sparkSession.streams()` to get the `StreamingQueryManager` ( -[Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryManager)/ +You can start any number of queries in a single SparkSession. They will all be running concurrently sharing the cluster resources. You can use `sparkSession.streams()` to get the `StreamingQueryManager` ([Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryManager)/ [Java](api/java/org/apache/spark/sql/streaming/StreamingQueryManager.html)/ [Python](api/python/pyspark.sql.html#pyspark.sql.streaming.StreamingQueryManager) docs) that can be used to manage the currently active queries. @@ -1058,7 +1044,7 @@ You can start any number of queries in a single SparkSession. They will all be r
    {% highlight scala %} -val spark: SparkSession = … +val spark: SparkSession = ... spark.streams.active // get the list of currently active streaming queries @@ -1073,11 +1059,11 @@ spark.streams.awaitAnyTermination() // block until any one of them terminates {% highlight java %} SparkSession spark = ... -spark.streams().active() // get the list of currently active streaming queries +spark.streams().active(); // get the list of currently active streaming queries -spark.streams().get(id) // get a query object by its unique id +spark.streams().get(id); // get a query object by its unique id -spark.streams().awaitAnyTermination() // block until any one of them terminates +spark.streams().awaitAnyTermination(); // block until any one of them terminates {% endhighlight %}
    @@ -1096,12 +1082,11 @@ spark.streams().awaitAnyTermination() # block until any one of them terminates
    -Finally, for asynchronous monitoring of streaming queries, you can create and attach a `StreamingQueryListener` ( -[Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryListener)/ +Finally, for asynchronous monitoring of streaming queries, you can create and attach a `StreamingQueryListener` ([Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryListener)/ [Java](api/java/org/apache/spark/sql/streaming/StreamingQueryListener.html) docs), which will give you regular callback-based updates when queries are started and terminated. ## Recovering from Failures with Checkpointing -In case of a failure or intentional shutdown, you can recover the previous progress and state of a previous query, and continue where it left off. This is done using checkpointing and write ahead logs. You can configure a query with a checkpoint location, and the query will save all the progress information (i.e. range of offsets processed in each trigger), and the running aggregates (e.g. word counts in the quick example) will be saved the checkpoint location. As of Spark 2.0, this checkpoint location has to be a path in a HDFS compatible file system, and can be set as an option in the DataStreamWriter when [starting a query](#starting-streaming-queries). +In case of a failure or intentional shutdown, you can recover the previous progress and state of a previous query, and continue where it left off. This is done using checkpointing and write ahead logs. You can configure a query with a checkpoint location, and the query will save all the progress information (i.e. range of offsets processed in each trigger) and the running aggregates (e.g. word counts in the [quick example](#quick-example)) to the checkpoint location. As of Spark 2.0, this checkpoint location has to be a path in an HDFS compatible file system, and can be set as an option in the DataStreamWriter when [starting a query](#starting-streaming-queries).
    diff --git a/examples/pom.xml b/examples/pom.xml index 6b9b94454ba0..90bbd3fbb940 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-examples_2.11 jar Spark Project Examples diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java index 9a43189c9146..4ccd8f6ce265 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java @@ -60,7 +60,7 @@ public static void main(String[] args) { .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) - .setRegParam(0.01); + .setRegParam(0.001); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); @@ -71,7 +71,7 @@ public static void main(String[] args) { Dataset test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), - new JavaDocument(6L, "mapreduce spark"), + new JavaDocument(6L, "spark hadoop spark"), new JavaDocument(7L, "apache hadoop") ), JavaDocument.class); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java deleted file mode 100644 index ca80d0d8bba5..000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; - -/** - * A simple example demonstrating ways to specify parameters for Estimators and Transformers. - * Run with - * {{{ - * bin/run-example ml.JavaSimpleParamsExample - * }}} - */ -public class JavaSimpleParamsExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaSimpleParamsExample") - .getOrCreate(); - - // Prepare training data. - // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans - // into DataFrames, where it uses the bean metadata to infer the schema. - List localTraining = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), - new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), - new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - Dataset training = - spark.createDataFrame(localTraining, LabeledPoint.class); - - // Create a LogisticRegression instance. This instance is an Estimator. - LogisticRegression lr = new LogisticRegression(); - // Print out the parameters, documentation, and any default values. - System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n"); - - // We may set parameters using setter methods. - lr.setMaxIter(10) - .setRegParam(0.01); - - // Learn a LogisticRegression model. This uses the parameters stored in lr. - LogisticRegressionModel model1 = lr.fit(training); - // Since model1 is a Model (i.e., a Transformer produced by an Estimator), - // we can view the parameters it used during fit(). - // This prints the parameter (name: value) pairs, where names are unique IDs for this - // LogisticRegression instance. - System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); - - // We may alternatively specify parameters using a ParamMap. - ParamMap paramMap = new ParamMap(); - paramMap.put(lr.maxIter().w(20)); // Specify 1 Param. - paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter. - double[] thresholds = {0.5, 0.5}; - paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params. - - // One can also combine ParamMaps. - ParamMap paramMap2 = new ParamMap(); - paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name. - ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); - - // Now learn a new model using the paramMapCombined parameters. - // paramMapCombined overrides all parameters set earlier via lr.set* methods. - LogisticRegressionModel model2 = lr.fit(training, paramMapCombined); - System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); - - // Prepare test documents. - List localTest = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - Dataset test = spark.createDataFrame(localTest, LabeledPoint.class); - - // Make predictions on test documents using the Transformer.transform() method. - // LogisticRegressionModel.transform will only use the 'features' column. - // Note that model2.transform() outputs a 'myProbability' column instead of the usual - // 'probability' column since we renamed the lr.probabilityCol parameter previously. - Dataset results = model2.transform(test); - Dataset rows = results.select("features", "label", "myProbability", "prediction"); - for (Row r: rows.collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - - spark.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java deleted file mode 100644 index 7c24c46d2e28..000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; - -/** - * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java - * bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of - * this example {@link SimpleTextClassificationPipeline}. Run with - *
    - * bin/run-example ml.JavaSimpleTextClassificationPipeline
    - * 
    - */ -public class JavaSimpleTextClassificationPipeline { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaSimpleTextClassificationPipeline") - .getOrCreate(); - - // Prepare training documents, which are labeled. - List localTraining = Lists.newArrayList( - new LabeledDocument(0L, "a b c d e spark", 1.0), - new LabeledDocument(1L, "b d", 0.0), - new LabeledDocument(2L, "spark f g h", 1.0), - new LabeledDocument(3L, "hadoop mapreduce", 0.0)); - Dataset training = - spark.createDataFrame(localTraining, LabeledDocument.class); - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. - Tokenizer tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words"); - HashingTF hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol()) - .setOutputCol("features"); - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.001); - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); - - // Fit the pipeline to training documents. - PipelineModel model = pipeline.fit(training); - - // Prepare test documents, which are unlabeled. - List localTest = Lists.newArrayList( - new Document(4L, "spark i j k"), - new Document(5L, "l m n"), - new Document(6L, "spark hadoop spark"), - new Document(7L, "apache hadoop")); - Dataset test = spark.createDataFrame(localTest, Document.class); - - // Make predictions on test documents. - Dataset predictions = model.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - - spark.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java index def599442912..278cce084218 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java @@ -47,7 +47,7 @@ public static void main(String[] args) { .setOutputCol("filtered"); List data = Arrays.asList( - RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) ); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java index 1cc16bb60d17..a206cef4c232 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java @@ -57,17 +57,24 @@ public static void main(String[] args) { Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); - Dataset wordsDataFrame = tokenizer.transform(sentenceDataFrame); - for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) { + RegexTokenizer regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); + + Dataset tokenized = tokenizer.transform(sentenceDataFrame); + for (Row r : tokenized.select("words", "label").takeAsList(3)) { java.util.List words = r.getList(0); for (String word : words) System.out.print(word + " "); System.out.println(); } - RegexTokenizer regexTokenizer = new RegexTokenizer() - .setInputCol("sentence") - .setOutputCol("words") - .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); + Dataset regexTokenized = regexTokenizer.transform(sentenceDataFrame); + for (Row r : regexTokenized.select("words", "label").takeAsList(3)) { + java.util.List words = r.getList(0); + for (String word : words) System.out.print(word + " "); + System.out.println(); + } // $example off$ spark.stop(); } diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java new file mode 100644 index 000000000000..ec02c8bbb8ef --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.sql; + +// $example on:schema_merging$ +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +// $example off:schema_merging$ + +// $example on:basic_parquet_example$ +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +// import org.apache.spark.sql.Encoders; +// $example on:schema_merging$ +// $example on:json_dataset$ +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +// $example off:json_dataset$ +// $example off:schema_merging$ +// $example off:basic_parquet_example$ +import org.apache.spark.sql.SparkSession; + +public class JavaSQLDataSourceExample { + + // $example on:schema_merging$ + public static class Square implements Serializable { + private int value; + private int square; + + // Getters and setters... + // $example off:schema_merging$ + public int getValue() { + return value; + } + + public void setValue(int value) { + this.value = value; + } + + public int getSquare() { + return square; + } + + public void setSquare(int square) { + this.square = square; + } + // $example on:schema_merging$ + } + // $example off:schema_merging$ + + // $example on:schema_merging$ + public static class Cube implements Serializable { + private int value; + private int cube; + + // Getters and setters... + // $example off:schema_merging$ + public int getValue() { + return value; + } + + public void setValue(int value) { + this.value = value; + } + + public int getCube() { + return cube; + } + + public void setCube(int cube) { + this.cube = cube; + } + // $example on:schema_merging$ + } + // $example off:schema_merging$ + + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("Java Spark SQL Data Sources Example") + .config("spark.some.config.option", "some-value") + .getOrCreate(); + + runBasicDataSourceExample(spark); + runBasicParquetExample(spark); + runParquetSchemaMergingExample(spark); + runJsonDatasetExample(spark); + + spark.stop(); + } + + private static void runBasicDataSourceExample(SparkSession spark) { + // $example on:generic_load_save_functions$ + Dataset usersDF = spark.read().load("examples/src/main/resources/users.parquet"); + usersDF.select("name", "favorite_color").write().save("namesAndFavColors.parquet"); + // $example off:generic_load_save_functions$ + // $example on:manual_load_options$ + Dataset peopleDF = + spark.read().format("json").load("examples/src/main/resources/people.json"); + peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet"); + // $example off:manual_load_options$ + // $example on:direct_sql$ + Dataset sqlDF = + spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); + // $example off:direct_sql$ + } + + private static void runBasicParquetExample(SparkSession spark) { + // $example on:basic_parquet_example$ + Dataset peopleDF = spark.read().json("examples/src/main/resources/people.json"); + + // DataFrames can be saved as Parquet files, maintaining the schema information + peopleDF.write().parquet("people.parquet"); + + // Read in the Parquet file created above. + // Parquet files are self-describing so the schema is preserved + // The result of loading a parquet file is also a DataFrame + Dataset parquetFileDF = spark.read().parquet("people.parquet"); + + // Parquet files can also be used to create a temporary view and then used in SQL statements + parquetFileDF.createOrReplaceTempView("parquetFile"); + Dataset namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19"); + Dataset namesDS = namesDF.map(new MapFunction() { + public String call(Row row) { + return "Name: " + row.getString(0); + } + }, Encoders.STRING()); + namesDS.show(); + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + // $example off:basic_parquet_example$ + } + + private static void runParquetSchemaMergingExample(SparkSession spark) { + // $example on:schema_merging$ + List squares = new ArrayList<>(); + for (int value = 1; value <= 5; value++) { + Square square = new Square(); + square.setValue(value); + square.setSquare(value * value); + squares.add(square); + } + + // Create a simple DataFrame, store into a partition directory + Dataset squaresDF = spark.createDataFrame(squares, Square.class); + squaresDF.write().parquet("data/test_table/key=1"); + + List cubes = new ArrayList<>(); + for (int value = 6; value <= 10; value++) { + Cube cube = new Cube(); + cube.setValue(value); + cube.setCube(value * value * value); + cubes.add(cube); + } + + // Create another DataFrame in a new partition directory, + // adding a new column and dropping an existing column + Dataset cubesDF = spark.createDataFrame(cubes, Cube.class); + cubesDF.write().parquet("data/test_table/key=2"); + + // Read the partitioned table + Dataset mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table"); + mergedDF.printSchema(); + + // The final schema consists of all 3 columns in the Parquet files together + // with the partitioning column appeared in the partition directory paths + // root + // |-- value: int (nullable = true) + // |-- square: int (nullable = true) + // |-- cube: int (nullable = true) + // |-- key : int (nullable = true) + // $example off:schema_merging$ + } + + private static void runJsonDatasetExample(SparkSession spark) { + // $example on:json_dataset$ + // A JSON dataset is pointed to by path. + // The path can be either a single text file or a directory storing text files + Dataset people = spark.read().json("examples/src/main/resources/people.json"); + + // The inferred schema can be visualized using the printSchema() method + people.printSchema(); + // root + // |-- age: long (nullable = true) + // |-- name: string (nullable = true) + + // Creates a temporary view using the DataFrame + people.createOrReplaceTempView("people"); + + // SQL statements can be run by using the sql methods provided by spark + Dataset namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19"); + namesDF.show(); + // +------+ + // | name| + // +------+ + // |Justin| + // +------+ + // $example off:json_dataset$ + } + +} diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java deleted file mode 100644 index 7fc6c007b684..000000000000 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.sql; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example on:init_session$ -import org.apache.spark.sql.SparkSession; -// $example off:init_session$ - -public class JavaSparkSQL { - public static class Person implements Serializable { - private String name; - private int age; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public int getAge() { - return age; - } - - public void setAge(int age) { - this.age = age; - } - } - - public static void main(String[] args) throws Exception { - // $example on:init_session$ - SparkSession spark = SparkSession - .builder() - .appName("JavaSparkSQL") - .config("spark.some.config.option", "some-value") - .getOrCreate(); - // $example off:init_session$ - - System.out.println("=== Data source: RDD ==="); - // Load a text file and convert each line to a Java Bean. - String file = "examples/src/main/resources/people.txt"; - JavaRDD people = spark.read().textFile(file).javaRDD().map( - new Function() { - @Override - public Person call(String line) { - String[] parts = line.split(","); - - Person person = new Person(); - person.setName(parts[0]); - person.setAge(Integer.parseInt(parts[1].trim())); - - return person; - } - }); - - // Apply a schema to an RDD of Java Beans and create a temporary view - Dataset schemaPeople = spark.createDataFrame(people, Person.class); - schemaPeople.createOrReplaceTempView("people"); - - // SQL can be run over RDDs which backs a temporary view. - Dataset teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); - - // The results of SQL queries are DataFrames and support all the normal RDD operations. - // The columns of a row in the result can be accessed by ordinal. - List teenagerNames = teenagers.toJavaRDD().map(new Function() { - @Override - public String call(Row row) { - return "Name: " + row.getString(0); - } - }).collect(); - for (String name: teenagerNames) { - System.out.println(name); - } - - System.out.println("=== Data source: Parquet File ==="); - // DataFrames can be saved as parquet files, maintaining the schema information. - schemaPeople.write().parquet("people.parquet"); - - // Read in the parquet file created above. - // Parquet files are self-describing so the schema is preserved. - // The result of loading a parquet file is also a DataFrame. - Dataset parquetFile = spark.read().parquet("people.parquet"); - - // A temporary view can be created by using Parquet files and then used in SQL statements. - parquetFile.createOrReplaceTempView("parquetFile"); - Dataset teenagers2 = - spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); - teenagerNames = teenagers2.toJavaRDD().map(new Function() { - @Override - public String call(Row row) { - return "Name: " + row.getString(0); - } - }).collect(); - for (String name: teenagerNames) { - System.out.println(name); - } - - System.out.println("=== Data source: JSON Dataset ==="); - // A JSON dataset is pointed by path. - // The path can be either a single text file or a directory storing text files. - String path = "examples/src/main/resources/people.json"; - // Create a DataFrame from the file(s) pointed by path - Dataset peopleFromJsonFile = spark.read().json(path); - - // Because the schema of a JSON dataset is automatically inferred, to write queries, - // it is better to take a look at what is the schema. - peopleFromJsonFile.printSchema(); - // The schema of people is ... - // root - // |-- age: IntegerType - // |-- name: StringType - - // Creates a temporary view using the DataFrame - peopleFromJsonFile.createOrReplaceTempView("people"); - - // SQL statements can be run by using the sql methods provided by `spark` - Dataset teenagers3 = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); - - // The results of SQL queries are DataFrame and support all the normal RDD operations. - // The columns of a row in the result can be accessed by ordinal. - teenagerNames = teenagers3.toJavaRDD().map(new Function() { - @Override - public String call(Row row) { return "Name: " + row.getString(0); } - }).collect(); - for (String name: teenagerNames) { - System.out.println(name); - } - - // Alternatively, a DataFrame can be created for a JSON dataset represented by - // a RDD[String] storing one JSON object per string. - List jsonData = Arrays.asList( - "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); - JavaRDD anotherPeopleRDD = spark - .createDataFrame(jsonData, String.class).toJSON().javaRDD(); - Dataset peopleFromJsonRDD = spark.read().json(anotherPeopleRDD); - - // Take a look at the schema of this new DataFrame. - peopleFromJsonRDD.printSchema(); - // The schema of anotherPeople is ... - // root - // |-- address: StructType - // | |-- city: StringType - // | |-- state: StringType - // |-- name: StringType - - peopleFromJsonRDD.createOrReplaceTempView("people2"); - - Dataset peopleWithCity = spark.sql("SELECT name, address.city FROM people2"); - List nameAndCity = peopleWithCity.toJavaRDD().map(new Function() { - @Override - public String call(Row row) { - return "Name: " + row.getString(0) + ", City: " + row.getString(1); - } - }).collect(); - for (String name: nameAndCity) { - System.out.println(name); - } - - spark.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java new file mode 100644 index 000000000000..afc18078d471 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.sql; + +// $example on:programmatic_schema$ +import java.util.ArrayList; +import java.util.List; +// $example off:programmatic_schema$ +// $example on:create_ds$ +import java.util.Arrays; +import java.util.Collections; +import java.io.Serializable; +// $example off:create_ds$ + +// $example on:schema_inferring$ +// $example on:programmatic_schema$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +// $example off:programmatic_schema$ +// $example on:create_ds$ +import org.apache.spark.api.java.function.MapFunction; +// $example on:create_df$ +// $example on:run_sql$ +// $example on:programmatic_schema$ +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +// $example off:programmatic_schema$ +// $example off:create_df$ +// $example off:run_sql$ +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +// $example off:create_ds$ +// $example off:schema_inferring$ +import org.apache.spark.sql.RowFactory; +// $example on:init_session$ +import org.apache.spark.sql.SparkSession; +// $example off:init_session$ +// $example on:programmatic_schema$ +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off:programmatic_schema$ + +// $example on:untyped_ops$ +// col("...") is preferable to df.col("...") +import static org.apache.spark.sql.functions.col; +// $example off:untyped_ops$ + +public class JavaSparkSQLExample { + // $example on:create_ds$ + public static class Person implements Serializable { + private String name; + private int age; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + } + // $example off:create_ds$ + + public static void main(String[] args) { + // $example on:init_session$ + SparkSession spark = SparkSession + .builder() + .appName("Java Spark SQL Example") + .config("spark.some.config.option", "some-value") + .getOrCreate(); + // $example off:init_session$ + + runBasicDataFrameExample(spark); + runDatasetCreationExample(spark); + runInferSchemaExample(spark); + runProgrammaticSchemaExample(spark); + + spark.stop(); + } + + private static void runBasicDataFrameExample(SparkSession spark) { + // $example on:create_df$ + Dataset df = spark.read().json("examples/src/main/resources/people.json"); + + // Displays the content of the DataFrame to stdout + df.show(); + // +----+-------+ + // | age| name| + // +----+-------+ + // |null|Michael| + // | 30| Andy| + // | 19| Justin| + // +----+-------+ + // $example off:create_df$ + + // $example on:untyped_ops$ + // Print the schema in a tree format + df.printSchema(); + // root + // |-- age: long (nullable = true) + // |-- name: string (nullable = true) + + // Select only the "name" column + df.select("name").show(); + // +-------+ + // | name| + // +-------+ + // |Michael| + // | Andy| + // | Justin| + // +-------+ + + // Select everybody, but increment the age by 1 + df.select(col("name"), col("age").plus(1)).show(); + // +-------+---------+ + // | name|(age + 1)| + // +-------+---------+ + // |Michael| null| + // | Andy| 31| + // | Justin| 20| + // +-------+---------+ + + // Select people older than 21 + df.filter(col("age").gt(21)).show(); + // +---+----+ + // |age|name| + // +---+----+ + // | 30|Andy| + // +---+----+ + + // Count people by age + df.groupBy("age").count().show(); + // +----+-----+ + // | age|count| + // +----+-----+ + // | 19| 1| + // |null| 1| + // | 30| 1| + // +----+-----+ + // $example off:untyped_ops$ + + // $example on:run_sql$ + // Register the DataFrame as a SQL temporary view + df.createOrReplaceTempView("people"); + + Dataset sqlDF = spark.sql("SELECT * FROM people"); + sqlDF.show(); + // +----+-------+ + // | age| name| + // +----+-------+ + // |null|Michael| + // | 30| Andy| + // | 19| Justin| + // +----+-------+ + // $example off:run_sql$ + } + + private static void runDatasetCreationExample(SparkSession spark) { + // $example on:create_ds$ + // Create an instance of a Bean class + Person person = new Person(); + person.setName("Andy"); + person.setAge(32); + + // Encoders are created for Java beans + Encoder personEncoder = Encoders.bean(Person.class); + Dataset javaBeanDS = spark.createDataset( + Collections.singletonList(person), + personEncoder + ); + javaBeanDS.show(); + // +---+----+ + // |age|name| + // +---+----+ + // | 32|Andy| + // +---+----+ + + // Encoders for most common types are provided in class Encoders + Encoder integerEncoder = Encoders.INT(); + Dataset primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder); + Dataset transformedDS = primitiveDS.map(new MapFunction() { + @Override + public Integer call(Integer value) throws Exception { + return value + 1; + } + }, integerEncoder); + transformedDS.collect(); // Returns [2, 3, 4] + + // DataFrames can be converted to a Dataset by providing a class. Mapping based on name + String path = "examples/src/main/resources/people.json"; + Dataset peopleDS = spark.read().json(path).as(personEncoder); + peopleDS.show(); + // +----+-------+ + // | age| name| + // +----+-------+ + // |null|Michael| + // | 30| Andy| + // | 19| Justin| + // +----+-------+ + // $example off:create_ds$ + } + + private static void runInferSchemaExample(SparkSession spark) { + // $example on:schema_inferring$ + // Create an RDD of Person objects from a text file + JavaRDD peopleRDD = spark.read() + .textFile("examples/src/main/resources/people.txt") + .javaRDD() + .map(new Function() { + @Override + public Person call(String line) throws Exception { + String[] parts = line.split(","); + Person person = new Person(); + person.setName(parts[0]); + person.setAge(Integer.parseInt(parts[1].trim())); + return person; + } + }); + + // Apply a schema to an RDD of JavaBeans to get a DataFrame + Dataset peopleDF = spark.createDataFrame(peopleRDD, Person.class); + // Register the DataFrame as a temporary view + peopleDF.createOrReplaceTempView("people"); + + // SQL statements can be run by using the sql methods provided by spark + Dataset teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19"); + + // The columns of a row in the result can be accessed by field index + Encoder stringEncoder = Encoders.STRING(); + Dataset teenagerNamesByIndexDF = teenagersDF.map(new MapFunction() { + @Override + public String call(Row row) throws Exception { + return "Name: " + row.getString(0); + } + }, stringEncoder); + teenagerNamesByIndexDF.show(); + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + + // or by field name + Dataset teenagerNamesByFieldDF = teenagersDF.map(new MapFunction() { + @Override + public String call(Row row) throws Exception { + return "Name: " + row.getAs("name"); + } + }, stringEncoder); + teenagerNamesByFieldDF.show(); + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + // $example off:schema_inferring$ + } + + private static void runProgrammaticSchemaExample(SparkSession spark) { + // $example on:programmatic_schema$ + // Create an RDD + JavaRDD peopleRDD = spark.sparkContext() + .textFile("examples/src/main/resources/people.txt", 1) + .toJavaRDD(); + + // The schema is encoded in a string + String schemaString = "name age"; + + // Generate the schema based on the string of schema + List fields = new ArrayList<>(); + for (String fieldName : schemaString.split(" ")) { + StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true); + fields.add(field); + } + StructType schema = DataTypes.createStructType(fields); + + // Convert records of the RDD (people) to Rows + JavaRDD rowRDD = peopleRDD.map(new Function() { + @Override + public Row call(String record) throws Exception { + String[] attributes = record.split(","); + return RowFactory.create(attributes[0], attributes[1].trim()); + } + }); + + // Apply the schema to the RDD + Dataset peopleDataFrame = spark.createDataFrame(rowRDD, schema); + + // Creates a temporary view using the DataFrame + peopleDataFrame.createOrReplaceTempView("people"); + + // SQL can be run over a temporary view created using DataFrames + Dataset results = spark.sql("SELECT name FROM people"); + + // The results of SQL queries are DataFrames and support all the normal RDD operations + // The columns of a row in the result can be accessed by field index or by field name + Dataset namesDS = results.map(new MapFunction() { + @Override + public String call(Row row) throws Exception { + return "Name: " + row.getString(0); + } + }, Encoders.STRING()); + namesDS.show(); + // +-------------+ + // | value| + // +-------------+ + // |Name: Michael| + // | Name: Andy| + // | Name: Justin| + // +-------------+ + // $example off:programmatic_schema$ + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java b/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java new file mode 100644 index 000000000000..76dd160d5568 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.sql.hive; + +// $example on:spark_hive$ +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +// $example off:spark_hive$ + +public class JavaSparkHiveExample { + + // $example on:spark_hive$ + public static class Record implements Serializable { + private int key; + private String value; + + public int getKey() { + return key; + } + + public void setKey(int key) { + this.key = key; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + } + // $example off:spark_hive$ + + public static void main(String[] args) { + // $example on:spark_hive$ + // warehouseLocation points to the default location for managed databases and tables + String warehouseLocation = "file:" + System.getProperty("user.dir") + "spark-warehouse"; + SparkSession spark = SparkSession + .builder() + .appName("Java Spark Hive Example") + .config("spark.sql.warehouse.dir", warehouseLocation) + .enableHiveSupport() + .getOrCreate(); + + spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)"); + spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"); + + // Queries are expressed in HiveQL + spark.sql("SELECT * FROM src").show(); + // +---+-------+ + // |key| value| + // +---+-------+ + // |238|val_238| + // | 86| val_86| + // |311|val_311| + // ... + + // Aggregation queries are also supported. + spark.sql("SELECT COUNT(*) FROM src").show(); + // +--------+ + // |count(1)| + // +--------+ + // | 500 | + // +--------+ + + // The results of SQL queries are themselves DataFrames and support all normal functions. + Dataset sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key"); + + // The items in DaraFrames are of type Row, which lets you to access each column by ordinal. + Dataset stringsDS = sqlDF.map(new MapFunction() { + @Override + public String call(Row row) throws Exception { + return "Key: " + row.get(0) + ", Value: " + row.get(1); + } + }, Encoders.STRING()); + stringsDS.show(); + // +--------------------+ + // | value| + // +--------------------+ + // |Key: 0, Value: val_0| + // |Key: 0, Value: val_0| + // |Key: 0, Value: val_0| + // ... + + // You can also use DataFrames to create temporary views within a SparkSession. + List records = new ArrayList<>(); + for (int key = 1; key < 100; key++) { + Record record = new Record(); + record.setKey(key); + record.setValue("val_" + key); + records.add(record); + } + Dataset recordsDF = spark.createDataFrame(records, Record.class); + recordsDF.createOrReplaceTempView("records"); + + // Queries can then join DataFrames data with data stored in Hive. + spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show(); + // +---+------+---+------+ + // |key| value|key| value| + // +---+------+---+------+ + // | 2| val_2| 2| val_2| + // | 2| val_2| 2| val_2| + // | 4| val_4| 4| val_4| + // ... + // $example off:spark_hive$ + + spark.stop(); + } +} diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index 80290e7de9b0..6d3241876ad5 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -39,7 +39,7 @@ def rmse(R, ms, us): return np.sqrt(np.sum(np.power(diff, 2)) / (M * U)) -def update(i, vec, mat, ratings): +def update(i, mat, ratings): uu = mat.shape[0] ff = mat.shape[1] @@ -88,7 +88,7 @@ def update(i, vec, mat, ratings): for i in range(ITERATIONS): ms = sc.parallelize(range(M), partitions) \ - .map(lambda x: update(x, msb.value[x, :], usb.value, Rb.value)) \ + .map(lambda x: update(x, usb.value, Rb.value)) \ .collect() # collect() returns a list, so array ends up being # a 3-d array, we take the first 2 dims for the matrix @@ -96,7 +96,7 @@ def update(i, vec, mat, ratings): msb = sc.broadcast(ms) us = sc.parallelize(range(U), partitions) \ - .map(lambda x: update(x, usb.value[x, :], msb.value, Rb.value.T)) \ + .map(lambda x: update(x, msb.value, Rb.value.T)) \ .collect() us = matrix(np.array(us)[:, :, 0]) usb = sc.broadcast(us) diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py index 060f0171ffdb..2f0ca995e55c 100644 --- a/examples/src/main/python/ml/aft_survival_regression.py +++ b/examples/src/main/python/ml/aft_survival_regression.py @@ -32,7 +32,7 @@ if __name__ == "__main__": spark = SparkSession \ .builder \ - .appName("PythonAFTSurvivalRegressionExample") \ + .appName("AFTSurvivalRegressionExample") \ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index ee0399ac5eb2..1263cb5d177a 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("PythonBisectingKMeansExample")\ + .appName("BisectingKMeansExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index a41df6cf946f..907eec67a0eb 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -24,7 +24,7 @@ from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.tuning import CrossValidator, ParamGridBuilder # $example off$ -from pyspark.sql import Row, SparkSession +from pyspark.sql import SparkSession """ A simple example demonstrating model selection using CrossValidator. @@ -39,6 +39,7 @@ .builder\ .appName("CrossValidatorExample")\ .getOrCreate() + # $example on$ # Prepare training documents, which are labeled. training = spark.createDataFrame([ diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index c1818d72fe46..109f901012c9 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -34,15 +34,16 @@ if len(sys.argv) > 2: print("Usage: dataframe_example.py ", file=sys.stderr) exit(-1) - spark = SparkSession\ - .builder\ - .appName("DataFrameExample")\ - .getOrCreate() - if len(sys.argv) == 2: + elif len(sys.argv) == 2: input = sys.argv[1] else: input = "data/mllib/sample_libsvm_data.txt" + spark = SparkSession \ + .builder \ + .appName("DataFrameExample") \ + .getOrCreate() + # Load input data print("Loading LIBSVM file with UDT from " + input + ".") df = spark.read.format("libsvm").load(input).cache() diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py index 708f1af6cc6e..d6e2977de008 100644 --- a/examples/src/main/python/ml/decision_tree_classification_example.py +++ b/examples/src/main/python/ml/decision_tree_classification_example.py @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("decision_tree_classification_example")\ + .appName("DecisionTreeClassificationExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py index 3bd3fd30f8e5..eb2105143539 100644 --- a/examples/src/main/python/ml/estimator_transformer_param_example.py +++ b/examples/src/main/python/ml/estimator_transformer_param_example.py @@ -18,6 +18,7 @@ """ Estimator Transformer Param Example. """ +from __future__ import print_function # $example on$ from pyspark.ml.linalg import Vectors @@ -42,7 +43,7 @@ # Create a LogisticRegression instance. This instance is an Estimator. lr = LogisticRegression(maxIter=10, regParam=0.01) # Print out the parameters, documentation, and any default values. - print "LogisticRegression parameters:\n" + lr.explainParams() + "\n" + print("LogisticRegression parameters:\n" + lr.explainParams() + "\n") # Learn a LogisticRegression model. This uses the parameters stored in lr. model1 = lr.fit(training) @@ -51,8 +52,8 @@ # we can view the parameters it used during fit(). # This prints the parameter (name: value) pairs, where names are unique IDs for this # LogisticRegression instance. - print "Model 1 was fit using parameters: " - print model1.extractParamMap() + print("Model 1 was fit using parameters: ") + print(model1.extractParamMap()) # We may alternatively specify parameters using a Python dictionary as a paramMap paramMap = {lr.maxIter: 20} @@ -67,8 +68,8 @@ # Now learn a new model using the paramMapCombined parameters. # paramMapCombined overrides all parameters set earlier via lr.set* methods. model2 = lr.fit(training, paramMapCombined) - print "Model 2 was fit using parameters: " - print model2.extractParamMap() + print("Model 2 was fit using parameters: ") + print(model2.extractParamMap()) # Prepare test data test = spark.createDataFrame([ @@ -81,9 +82,12 @@ # Note that model2.transform() outputs a "myProbability" column instead of the usual # 'probability' column since we renamed the lr.probabilityCol parameter previously. prediction = model2.transform(test) - selected = prediction.select("features", "label", "myProbability", "prediction") - for row in selected.collect(): - print row + result = prediction.select("features", "label", "myProbability", "prediction") \ + .collect() + + for row in result: + print("features=%s, label=%s -> prob=%s, prediction=%s" + % (row.features, row.label, row.myProbability, row.prediction)) # $example off$ spark.stop() diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py index 2ca13d68f689..edc258de0568 100644 --- a/examples/src/main/python/ml/gaussian_mixture_example.py +++ b/examples/src/main/python/ml/gaussian_mixture_example.py @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("PythonGuassianMixtureExample")\ + .appName("GaussianMixtureExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py index 6c2d7e7b810d..c2042fd7b7b0 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("gradient_boosted_tree_classifier_example")\ + .appName("GradientBoostedTreeClassifierExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py index 5dd2272748d7..cc96c973e4b2 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("gradient_boosted_tree_regressor_example")\ + .appName("GradientBoostedTreeRegressorExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py index 1e61bd8eff14..a41b8ffacbde 100644 --- a/examples/src/main/python/ml/isotonic_regression_example.py +++ b/examples/src/main/python/ml/isotonic_regression_example.py @@ -21,7 +21,7 @@ from __future__ import print_function # $example on$ -from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel +from pyspark.ml.regression import IsotonicRegression # $example off$ from pyspark.sql import SparkSession @@ -30,11 +30,11 @@ Run with: bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py """ -if __name__ == "__main__": +if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("PythonIsotonicRegressionExample")\ + .appName("IsotonicRegressionExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 4b8b7291f918..6846ec459971 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -31,12 +31,10 @@ This example requires NumPy (http://www.numpy.org/). """ - if __name__ == "__main__": - spark = SparkSession\ .builder\ - .appName("PythonKMeansExample")\ + .appName("KMeansExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py index 5ce810fccc6f..2dc1742ff7a0 100644 --- a/examples/src/main/python/ml/lda_example.py +++ b/examples/src/main/python/ml/lda_example.py @@ -23,16 +23,13 @@ # $example off$ from pyspark.sql import SparkSession - """ An example demonstrating LDA. Run with: bin/spark-submit examples/src/main/python/ml/lda_example.py """ - if __name__ == "__main__": - # Creates a SparkSession spark = SparkSession \ .builder \ .appName("LDAExample") \ diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py index aa33bef5a3dd..2cc38c285579 100644 --- a/examples/src/main/python/ml/multilayer_perceptron_classification.py +++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py @@ -31,18 +31,23 @@ # Load training data data = spark.read.format("libsvm")\ .load("data/mllib/sample_multiclass_classification_data.txt") + # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) train = splits[0] test = splits[1] + # specify layers for the neural network: # input layer of size 4 (features), two intermediate of size 5 and 4 # and output of size 3 (classes) layers = [4, 5, 4, 3] + # create the trainer and set its parameters trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234) + # train the model model = trainer.fit(train) + # compute accuracy on the test set result = model.transform(test) predictionAndLabels = result.select("prediction", "label") diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py index 9ac07f2c8ee2..55263adb4666 100644 --- a/examples/src/main/python/ml/n_gram_example.py +++ b/examples/src/main/python/ml/n_gram_example.py @@ -34,8 +34,10 @@ (1, ["I", "wish", "Java", "could", "use", "case", "classes"]), (2, ["Logistic", "regression", "models", "are", "neat"]) ], ["label", "words"]) + ngram = NGram(inputCol="words", outputCol="ngrams") ngramDataFrame = ngram.transform(wordDataFrame) + for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3): print(ngrams_label) # $example off$ diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index 8bc32222fe32..aa23f298c8c5 100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -26,13 +26,14 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("naive_bayes_example")\ + .appName("NaiveBayesExample")\ .getOrCreate() # $example on$ # Load training data data = spark.read.format("libsvm") \ .load("data/mllib/sample_libsvm_data.txt") + # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) train = splits[0] @@ -43,6 +44,7 @@ # train the model model = nb.fit(train) + # compute accuracy on the test set result = model.transform(test) predictionAndLabels = result.select("prediction", "label") diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index b82087bebae8..8e00c25d9342 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -30,11 +30,10 @@ bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ - if __name__ == "__main__": spark = SparkSession \ .builder \ - .appName("PythonOneVsRestExample") \ + .appName("OneVsRestExample") \ .getOrCreate() # $example on$ @@ -62,7 +61,7 @@ # compute the classification error on test data. accuracy = evaluator.evaluate(predictions) - print("Test Error : " + str(1 - accuracy)) + print("Test Error = %g" % (1.0 - accuracy)) # $example off$ spark.stop() diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index b9fceef68e70..47faf8d2026f 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -41,6 +41,7 @@ stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = stringIndexer.fit(df) indexed = model.transform(df) + encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec") encoded = encoder.transform(indexed) encoded.select("id", "categoryVec").show() diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index 414629ff88bf..38746aced096 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -34,8 +34,10 @@ (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] df = spark.createDataFrame(data, ["features"]) + pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") model = pca.fit(df) + result = model.transform(df).select("pcaFeatures") result.show(truncate=False) # $example off$ diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py index bd10cfd7a252..2d0865578ac7 100644 --- a/examples/src/main/python/ml/pipeline_example.py +++ b/examples/src/main/python/ml/pipeline_example.py @@ -38,12 +38,13 @@ (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"]) + (3L, "hadoop mapreduce", 0.0) + ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") - lr = LogisticRegression(maxIter=10, regParam=0.01) + lr = LogisticRegression(maxIter=10, regParam=0.001) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. @@ -53,8 +54,9 @@ test = spark.createDataFrame([ (4L, "spark i j k"), (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop")], ["id", "text"]) + (6L, "spark hadoop spark"), + (7L, "apache hadoop") + ], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test) diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py index b46c1ba2f439..b464ee86b6e8 100644 --- a/examples/src/main/python/ml/polynomial_expansion_example.py +++ b/examples/src/main/python/ml/polynomial_expansion_example.py @@ -30,13 +30,15 @@ .getOrCreate() # $example on$ - df = spark\ - .createDataFrame([(Vectors.dense([-2.0, 2.3]),), - (Vectors.dense([0.0, 0.0]),), - (Vectors.dense([0.6, -1.1]),)], - ["features"]) + df = spark.createDataFrame([ + (Vectors.dense([-2.0, 2.3]),), + (Vectors.dense([0.0, 0.0]),), + (Vectors.dense([0.6, -1.1]),) + ], ["features"]) + px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures") polyDF = px.transform(df) + for expanded in polyDF.select("polyFeatures").take(3): print(expanded) # $example off$ diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py index 6f422f840ad2..788a0baffebb 100644 --- a/examples/src/main/python/ml/quantile_discretizer_example.py +++ b/examples/src/main/python/ml/quantile_discretizer_example.py @@ -22,18 +22,22 @@ # $example off$ from pyspark.sql import SparkSession - if __name__ == "__main__": - spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate() + spark = SparkSession\ + .builder\ + .appName("QuantileDiscretizerExample")\ + .getOrCreate() # $example on$ data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)] df = spark.createDataFrame(data, ["id", "hour"]) # $example off$ + # Output of QuantileDiscretizer for such small datasets can depend on the number of # partitions. Here we force a single partition to ensure consistent results. # Note this is not necessary for normal use cases df = df.repartition(1) + # $example on$ discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result") diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py index eb9ded9af555..4eaa94dd7f48 100644 --- a/examples/src/main/python/ml/random_forest_classifier_example.py +++ b/examples/src/main/python/ml/random_forest_classifier_example.py @@ -23,7 +23,7 @@ # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier -from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ from pyspark.sql import SparkSession @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("random_forest_classifier_example")\ + .appName("RandomForestClassifierExample")\ .getOrCreate() # $example on$ @@ -41,6 +41,7 @@ # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) + # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer =\ @@ -52,8 +53,12 @@ # Train a RandomForest model. rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10) + # Convert indexed labels back to original labels. + labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", + labels=labelIndexer.labels) + # Chain indexers and forest in a Pipeline - pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) + pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter]) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) @@ -62,7 +67,7 @@ predictions = model.transform(testData) # Select example rows to display. - predictions.select("prediction", "indexedLabel", "features").show(5) + predictions.select("predictedLabel", "label", "features").show(5) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator( diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py index 3a793737dba8..a34edff2ecaa 100644 --- a/examples/src/main/python/ml/random_forest_regressor_example.py +++ b/examples/src/main/python/ml/random_forest_regressor_example.py @@ -31,7 +31,7 @@ if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("random_forest_regressor_example")\ + .appName("RandomForestRegressorExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py index d5df3ce4f591..6629239db29e 100644 --- a/examples/src/main/python/ml/rformula_example.py +++ b/examples/src/main/python/ml/rformula_example.py @@ -34,10 +34,12 @@ (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0)], ["id", "country", "hour", "clicked"]) + formula = RFormula( formula="clicked ~ country + hour", featuresCol="features", labelCol="label") + output = formula.fit(dataset).transform(dataset) output.select("features", "label").show() # $example off$ diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py deleted file mode 100644 index 2f1eaa6f947f..000000000000 --- a/examples/src/main/python/ml/simple_params_example.py +++ /dev/null @@ -1,95 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import pprint -import sys - -from pyspark.ml.classification import LogisticRegression -from pyspark.ml.linalg import DenseVector -from pyspark.sql import Row, SparkSession - -""" -A simple example demonstrating ways to specify parameters for Estimators and Transformers. -Run with: - bin/spark-submit examples/src/main/python/ml/simple_params_example.py -""" - -if __name__ == "__main__": - spark = SparkSession \ - .builder \ - .appName("SimpleParamsExample") \ - .getOrCreate() - - # prepare training data. - # We create an RDD of LabeledPoints and convert them into a DataFrame. - # A LabeledPoint is an Object with two fields named label and features - # and Spark SQL identifies these fields and creates the schema appropriately. - training = spark.createDataFrame([ - Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])), - Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])), - Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])), - Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))]) - - # Create a LogisticRegression instance with maxIter = 10. - # This instance is an Estimator. - lr = LogisticRegression(maxIter=10) - # Print out the parameters, documentation, and any default values. - print("LogisticRegression parameters:\n" + lr.explainParams() + "\n") - - # We may also set parameters using setter methods. - lr.setRegParam(0.01) - - # Learn a LogisticRegression model. This uses the parameters stored in lr. - model1 = lr.fit(training) - - # Since model1 is a Model (i.e., a Transformer produced by an Estimator), - # we can view the parameters it used during fit(). - # This prints the parameter (name: value) pairs, where names are unique IDs for this - # LogisticRegression instance. - print("Model 1 was fit using parameters:\n") - pprint.pprint(model1.extractParamMap()) - - # We may alternatively specify parameters using a parameter map. - # paramMap overrides all lr parameters set earlier. - paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"} - - # Now learn a new model using the new parameters. - model2 = lr.fit(training, paramMap) - print("Model 2 was fit using parameters:\n") - pprint.pprint(model2.extractParamMap()) - - # prepare test data. - test = spark.createDataFrame([ - Row(label=1.0, features=DenseVector([-1.0, 1.5, 1.3])), - Row(label=0.0, features=DenseVector([3.0, 2.0, -0.1])), - Row(label=0.0, features=DenseVector([0.0, 2.2, -1.5]))]) - - # Make predictions on test data using the Transformer.transform() method. - # LogisticRegressionModel.transform will only use the 'features' column. - # Note that model2.transform() outputs a 'myProbability' column instead of the usual - # 'probability' column since we renamed the lr.probabilityCol parameter previously. - result = model2.transform(test) \ - .select("features", "label", "myProbability", "prediction") \ - .collect() - - for row in result: - print("features=%s,label=%s -> prob=%s, prediction=%s" - % (row.features, row.label, row.myProbability, row.prediction)) - - spark.stop() diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py deleted file mode 100644 index b528b59be962..000000000000 --- a/examples/src/main/python/ml/simple_text_classification_pipeline.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -from pyspark.ml import Pipeline -from pyspark.ml.classification import LogisticRegression -from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import Row, SparkSession - - -""" -A simple text classification pipeline that recognizes "spark" from -input text. This is to show how to create and configure a Spark ML -pipeline in Python. Run with: - - bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py -""" - - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("SimpleTextClassificationPipeline")\ - .getOrCreate() - - # Prepare training documents, which are labeled. - training = spark.createDataFrame([ - (0, "a b c d e spark", 1.0), - (1, "b d", 0.0), - (2, "spark f g h", 1.0), - (3, "hadoop mapreduce", 0.0) - ], ["id", "text", "label"]) - - # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. - tokenizer = Tokenizer(inputCol="text", outputCol="words") - hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features") - lr = LogisticRegression(maxIter=10, regParam=0.001) - pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) - - # Fit the pipeline to training documents. - model = pipeline.fit(training) - - # Prepare test documents, which are unlabeled. - test = spark.createDataFrame([ - (4, "spark i j k"), - (5, "l m n"), - (6, "spark hadoop spark"), - (7, "apache hadoop") - ], ["id", "text"]) - - # Make predictions on test documents and print columns of interest. - prediction = model.transform(test) - selected = prediction.select("id", "text", "prediction") - for row in selected.collect(): - print(row) - - spark.stop() diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py index 395fdeffc537..8a8392cc1fca 100644 --- a/examples/src/main/python/ml/stopwords_remover_example.py +++ b/examples/src/main/python/ml/stopwords_remover_example.py @@ -30,7 +30,7 @@ # $example on$ sentenceData = spark.createDataFrame([ - (0, ["I", "saw", "the", "red", "baloon"]), + (0, ["I", "saw", "the", "red", "balloon"]), (1, ["Mary", "had", "a", "little", "lamb"]) ], ["label", "raw"]) diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py index a328e040f563..2255bfb9c1a6 100644 --- a/examples/src/main/python/ml/string_indexer_example.py +++ b/examples/src/main/python/ml/string_indexer_example.py @@ -32,6 +32,7 @@ df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) + indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") indexed = indexer.fit(df).transform(df) indexed.show() diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py index fb4ad992fb80..4ab7eb696499 100644 --- a/examples/src/main/python/ml/tf_idf_example.py +++ b/examples/src/main/python/ml/tf_idf_example.py @@ -34,8 +34,10 @@ (0, "I wish Java could use case classes"), (1, "Logistic regression models are neat") ], ["label", "sentence"]) + tokenizer = Tokenizer(inputCol="sentence", outputCol="words") wordsData = tokenizer.transform(sentenceData) + hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(wordsData) # alternatively, CountVectorizer can also be used to get term frequency vectors @@ -43,6 +45,7 @@ idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) + for features_label in rescaledData.select("features", "label").take(3): print(features_label) # $example off$ diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py index e61ec920d228..89f506070533 100644 --- a/examples/src/main/python/ml/tokenizer_example.py +++ b/examples/src/main/python/ml/tokenizer_example.py @@ -34,12 +34,19 @@ (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") ], ["label", "sentence"]) + tokenizer = Tokenizer(inputCol="sentence", outputCol="words") - wordsDataFrame = tokenizer.transform(sentenceDataFrame) - for words_label in wordsDataFrame.select("words", "label").take(3): - print(words_label) + regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W") # alternatively, pattern="\\w+", gaps(False) + + tokenized = tokenizer.transform(sentenceDataFrame) + for words_label in tokenized.select("words", "label").take(3): + print(words_label) + + regexTokenized = regexTokenizer.transform(sentenceDataFrame) + for words_label in regexTokenized.select("words", "label").take(3): + print(words_label) # $example off$ spark.stop() diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py index 5f5c52aca8c4..a92b861f8317 100644 --- a/examples/src/main/python/ml/train_validation_split.py +++ b/examples/src/main/python/ml/train_validation_split.py @@ -35,18 +35,21 @@ .builder\ .appName("TrainValidationSplit")\ .getOrCreate() + # $example on$ # Prepare training and test data. data = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") - train, test = data.randomSplit([0.7, 0.3]) - lr = LinearRegression(maxIter=10, regParam=0.1) + train, test = data.randomSplit([0.9, 0.1], seed=12345) + + lr = LinearRegression(maxIter=10) # We use a ParamGridBuilder to construct a grid of parameters to search over. # TrainValidationSplit will try all combinations of values and determine best model using # the evaluator. paramGrid = ParamGridBuilder()\ .addGrid(lr.regParam, [0.1, 0.01]) \ + .addGrid(lr.fitIntercept, [False, True])\ .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\ .build() @@ -60,6 +63,7 @@ # Run TrainValidationSplit, and choose the best set of parameters. model = tvs.fit(train) + # Make predictions on test data. model is the model with combination of parameters # that performed best. prediction = model.transform(test) diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py index bbfc316ff2d3..eac33711adaa 100644 --- a/examples/src/main/python/ml/vector_assembler_example.py +++ b/examples/src/main/python/ml/vector_assembler_example.py @@ -33,9 +33,11 @@ dataset = spark.createDataFrame( [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)], ["id", "hour", "mobile", "userFeatures", "clicked"]) + assembler = VectorAssembler( inputCols=["hour", "mobile", "userFeatures"], outputCol="features") + output = assembler.transform(dataset) print(output.select("features", "clicked").first()) # $example off$ diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 9b00e0f84136..3912c135be2e 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -30,6 +30,7 @@ # $example on$ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10) indexerModel = indexer.fit(data) diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 66500bee152f..78a91c92fc53 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -35,9 +35,11 @@ ("I wish Java could use case classes".split(" "), ), ("Logistic regression models are neat".split(" "), ) ], ["text"]) + # Learn a mapping from words to Vectors. word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result") model = word2Vec.fit(documentDF) + result = model.transform(documentDF) for feature in result.select("result").take(3): print(feature) diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py deleted file mode 100644 index ea11d2c4c7b3..000000000000 --- a/examples/src/main/python/sql.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import os -import sys - -# $example on:init_session$ -from pyspark.sql import SparkSession -# $example off:init_session$ -from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType - - -if __name__ == "__main__": - # $example on:init_session$ - spark = SparkSession\ - .builder\ - .appName("PythonSQL")\ - .config("spark.some.config.option", "some-value")\ - .getOrCreate() - # $example off:init_session$ - - # A list of Rows. Infer schema from the first row, create a DataFrame and print the schema - rows = [Row(name="John", age=19), Row(name="Smith", age=23), Row(name="Sarah", age=18)] - some_df = spark.createDataFrame(rows) - some_df.printSchema() - - # A list of tuples - tuples = [("John", 19), ("Smith", 23), ("Sarah", 18)] - # Schema with two fields - person_name and person_age - schema = StructType([StructField("person_name", StringType(), False), - StructField("person_age", IntegerType(), False)]) - # Create a DataFrame by applying the schema to the RDD and print the schema - another_df = spark.createDataFrame(tuples, schema) - another_df.printSchema() - # root - # |-- age: long (nullable = true) - # |-- name: string (nullable = true) - - # A JSON dataset is pointed to by path. - # The path can be either a single text file or a directory storing text files. - if len(sys.argv) < 2: - path = "file://" + \ - os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json") - else: - path = sys.argv[1] - # Create a DataFrame from the file(s) pointed to by path - people = spark.read.json(path) - # root - # |-- person_name: string (nullable = false) - # |-- person_age: integer (nullable = false) - - # The inferred schema can be visualized using the printSchema() method. - people.printSchema() - # root - # |-- age: long (nullable = true) - # |-- name: string (nullable = true) - - # Creates a temporary view using the DataFrame. - people.createOrReplaceTempView("people") - - # SQL statements can be run by using the sql methods provided by `spark` - teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - - for each in teenagers.collect(): - print(each[0]) - - spark.stop() diff --git a/examples/src/main/python/sql/basic.py b/examples/src/main/python/sql/basic.py new file mode 100644 index 000000000000..74f5009581e4 --- /dev/null +++ b/examples/src/main/python/sql/basic.py @@ -0,0 +1,194 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +# $example on:init_session$ +from pyspark.sql import SparkSession +# $example off:init_session$ + +# $example on:schema_inferring$ +from pyspark.sql import Row +# $example off:schema_inferring$ + +# $example on:programmatic_schema$ +# Import data types +from pyspark.sql.types import * +# $example off:programmatic_schema$ + +""" +A simple example demonstrating basic Spark SQL features. +Run with: + ./bin/spark-submit examples/src/main/python/sql/basic.py +""" + + +def basic_df_example(spark): + # $example on:create_df$ + # spark is an existing SparkSession + df = spark.read.json("examples/src/main/resources/people.json") + # Displays the content of the DataFrame to stdout + df.show() + # +----+-------+ + # | age| name| + # +----+-------+ + # |null|Michael| + # | 30| Andy| + # | 19| Justin| + # +----+-------+ + # $example off:create_df$ + + # $example on:untyped_ops$ + # spark, df are from the previous example + # Print the schema in a tree format + df.printSchema() + # root + # |-- age: long (nullable = true) + # |-- name: string (nullable = true) + + # Select only the "name" column + df.select("name").show() + # +-------+ + # | name| + # +-------+ + # |Michael| + # | Andy| + # | Justin| + # +-------+ + + # Select everybody, but increment the age by 1 + df.select(df['name'], df['age'] + 1).show() + # +-------+---------+ + # | name|(age + 1)| + # +-------+---------+ + # |Michael| null| + # | Andy| 31| + # | Justin| 20| + # +-------+---------+ + + # Select people older than 21 + df.filter(df['age'] > 21).show() + # +---+----+ + # |age|name| + # +---+----+ + # | 30|Andy| + # +---+----+ + + # Count people by age + df.groupBy("age").count().show() + # +----+-----+ + # | age|count| + # +----+-----+ + # | 19| 1| + # |null| 1| + # | 30| 1| + # +----+-----+ + # $example off:untyped_ops$ + + # $example on:run_sql$ + # Register the DataFrame as a SQL temporary view + df.createOrReplaceTempView("people") + + sqlDF = spark.sql("SELECT * FROM people") + sqlDF.show() + # +----+-------+ + # | age| name| + # +----+-------+ + # |null|Michael| + # | 30| Andy| + # | 19| Justin| + # +----+-------+ + # $example off:run_sql$ + + +def schema_inference_example(spark): + # $example on:schema_inferring$ + sc = spark.sparkContext + + # Load a text file and convert each line to a Row. + lines = sc.textFile("examples/src/main/resources/people.txt") + parts = lines.map(lambda l: l.split(",")) + people = parts.map(lambda p: Row(name=p[0], age=int(p[1]))) + + # Infer the schema, and register the DataFrame as a table. + schemaPeople = spark.createDataFrame(people) + schemaPeople.createOrReplaceTempView("people") + + # SQL can be run over DataFrames that have been registered as a table. + teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") + + # The results of SQL queries are Dataframe objects. + # rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`. + teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect() + for name in teenNames: + print(name) + # Name: Justin + # $example off:schema_inferring$ + + +def programmatic_schema_example(spark): + # $example on:programmatic_schema$ + sc = spark.sparkContext + + # Load a text file and convert each line to a Row. + lines = sc.textFile("examples/src/main/resources/people.txt") + parts = lines.map(lambda l: l.split(",")) + # Each line is converted to a tuple. + people = parts.map(lambda p: (p[0], p[1].strip())) + + # The schema is encoded in a string. + schemaString = "name age" + + fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()] + schema = StructType(fields) + + # Apply the schema to the RDD. + schemaPeople = spark.createDataFrame(people, schema) + + # Creates a temporary view using the DataFrame + schemaPeople.createOrReplaceTempView("people") + + # Creates a temporary view using the DataFrame + schemaPeople.createOrReplaceTempView("people") + + # SQL can be run over DataFrames that have been registered as a table. + results = spark.sql("SELECT name FROM people") + + results.show() + # +-------+ + # | name| + # +-------+ + # |Michael| + # | Andy| + # | Justin| + # +-------+ + # $example off:programmatic_schema$ + +if __name__ == "__main__": + # $example on:init_session$ + spark = SparkSession \ + .builder \ + .appName("PythonSQL") \ + .config("spark.some.config.option", "some-value") \ + .getOrCreate() + # $example off:init_session$ + + basic_df_example(spark) + schema_inference_example(spark) + programmatic_schema_example(spark) + + spark.stop() diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py new file mode 100644 index 000000000000..0bdc3d66ff98 --- /dev/null +++ b/examples/src/main/python/sql/datasource.py @@ -0,0 +1,154 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark.sql import SparkSession +# $example on:schema_merging$ +from pyspark.sql import Row +# $example off:schema_merging$ + +""" +A simple example demonstrating Spark SQL data sources. +Run with: + ./bin/spark-submit examples/src/main/python/sql/datasource.py +""" + + +def basic_datasource_example(spark): + # $example on:generic_load_save_functions$ + df = spark.read.load("examples/src/main/resources/users.parquet") + df.select("name", "favorite_color").write.save("namesAndFavColors.parquet") + # $example off:generic_load_save_functions$ + + # $example on:manual_load_options$ + df = spark.read.load("examples/src/main/resources/people.json", format="json") + df.select("name", "age").write.save("namesAndAges.parquet", format="parquet") + # $example off:manual_load_options$ + + # $example on:direct_sql$ + df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") + # $example off:direct_sql$ + + +def parquet_example(spark): + # $example on:basic_parquet_example$ + peopleDF = spark.read.json("examples/src/main/resources/people.json") + + # DataFrames can be saved as Parquet files, maintaining the schema information. + peopleDF.write.parquet("people.parquet") + + # Read in the Parquet file created above. + # Parquet files are self-describing so the schema is preserved. + # The result of loading a parquet file is also a DataFrame. + parquetFile = spark.read.parquet("people.parquet") + + # Parquet files can also be used to create a temporary view and then used in SQL statements. + parquetFile.createOrReplaceTempView("parquetFile") + teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") + teenagers.show() + # +------+ + # | name| + # +------+ + # |Justin| + # +------+ + # $example off:basic_parquet_example$ + + +def parquet_schema_merging_example(spark): + # $example on:schema_merging$ + # spark is from the previous example. + # Create a simple DataFrame, stored into a partition directory + sc = spark.sparkContext + + squaresDF = spark.createDataFrame(sc.parallelize(range(1, 6)) + .map(lambda i: Row(single=i, double=i ** 2))) + squaresDF.write.parquet("data/test_table/key=1") + + # Create another DataFrame in a new partition directory, + # adding a new column and dropping an existing column + cubesDF = spark.createDataFrame(sc.parallelize(range(6, 11)) + .map(lambda i: Row(single=i, triple=i ** 3))) + cubesDF.write.parquet("data/test_table/key=2") + + # Read the partitioned table + mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table") + mergedDF.printSchema() + + # The final schema consists of all 3 columns in the Parquet files together + # with the partitioning column appeared in the partition directory paths. + # root + # |-- double: long (nullable = true) + # |-- single: long (nullable = true) + # |-- triple: long (nullable = true) + # |-- key: integer (nullable = true) + # $example off:schema_merging$ + + +def json_dataset_examplg(spark): + # $example on:json_dataset$ + # spark is from the previous example. + sc = spark.sparkContext + + # A JSON dataset is pointed to by path. + # The path can be either a single text file or a directory storing text files + path = "examples/src/main/resources/people.json" + peopleDF = spark.read.json(path) + + # The inferred schema can be visualized using the printSchema() method + peopleDF.printSchema() + # root + # |-- age: long (nullable = true) + # |-- name: string (nullable = true) + + # Creates a temporary view using the DataFrame + peopleDF.createOrReplaceTempView("people") + + # SQL statements can be run by using the sql methods provided by spark + teenagerNamesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19") + teenagerNamesDF.show() + # +------+ + # | name| + # +------+ + # |Justin| + # +------+ + + # Alternatively, a DataFrame can be created for a JSON dataset represented by + # an RDD[String] storing one JSON object per string + jsonStrings = ['{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}'] + otherPeopleRDD = sc.parallelize(jsonStrings) + otherPeople = spark.read.json(otherPeopleRDD) + otherPeople.show() + # +---------------+----+ + # | address|name| + # +---------------+----+ + # |[Columbus,Ohio]| Yin| + # +---------------+----+ + # $example off:json_dataset$ + +if __name__ == "__main__": + spark = SparkSession \ + .builder \ + .appName("PythonSQL") \ + .getOrCreate() + + basic_datasource_example(spark) + parquet_example(spark) + parquet_schema_merging_example(spark) + json_dataset_examplg(spark) + + spark.stop() diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py new file mode 100644 index 000000000000..d9ce5cef1f2b --- /dev/null +++ b/examples/src/main/python/sql/hive.py @@ -0,0 +1,96 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +# $example on:spark_hive$ +from os.path import expanduser, join + +from pyspark.sql import SparkSession +from pyspark.sql import Row +# $example off:spark_hive$ + +""" +A simple example demonstrating Spark SQL Hive integration. +Run with: + ./bin/spark-submit examples/src/main/python/sql/hive.py +""" + + +if __name__ == "__main__": + # $example on:spark_hive$ + # warehouse_location points to the default location for managed databases and tables + warehouse_location = 'file:${system:user.dir}/spark-warehouse' + + spark = SparkSession \ + .builder \ + .appName("PythonSQL") \ + .config("spark.sql.warehouse.dir", warehouse_location) \ + .enableHiveSupport() \ + .getOrCreate() + + # spark is an existing SparkSession + spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") + + # Queries are expressed in HiveQL + spark.sql("SELECT * FROM src").show() + # +---+-------+ + # |key| value| + # +---+-------+ + # |238|val_238| + # | 86| val_86| + # |311|val_311| + # ... + + # Aggregation queries are also supported. + spark.sql("SELECT COUNT(*) FROM src").show() + # +--------+ + # |count(1)| + # +--------+ + # | 500 | + # +--------+ + + # The results of SQL queries are themselves DataFrames and support all normal functions. + sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key") + + # The items in DaraFrames are of type Row, which allows you to access each column by ordinal. + stringsDS = sqlDF.rdd.map(lambda row: "Key: %d, Value: %s" % (row.key, row.value)) + for record in stringsDS.collect(): + print(record) + # Key: 0, Value: val_0 + # Key: 0, Value: val_0 + # Key: 0, Value: val_0 + # ... + + # You can also use DataFrames to create temporary views within a SparkSession. + Record = Row("key", "value") + recordsDF = spark.createDataFrame(map(lambda i: Record(i, "val_" + str(i)), range(1, 101))) + recordsDF.createOrReplaceTempView("records") + + # Queries can then join DataFrame data with data stored in Hive. + spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show() + # +---+------+---+------+ + # |key| value|key| value| + # +---+------+---+------+ + # | 2| val_2| 2| val_2| + # | 4| val_4| 4| val_4| + # | 5| val_5| 5| val_5| + # ... + # $example off:spark_hive$ + + spark.stop() diff --git a/examples/src/main/python/streaming/queue_stream.py b/examples/src/main/python/streaming/queue_stream.py index b3808907f74a..bdd2d4851949 100644 --- a/examples/src/main/python/streaming/queue_stream.py +++ b/examples/src/main/python/streaming/queue_stream.py @@ -22,7 +22,6 @@ To run this example use `$ bin/spark-submit examples/src/main/python/streaming/queue_stream.py """ -import sys import time from pyspark import SparkContext diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index eba3f1b91e2d..33e88e15fd47 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -144,7 +144,7 @@ write.df(df1, "data/test_table/key=1", "parquet", "overwrite") write.df(df2, "data/test_table/key=2", "parquet", "overwrite") # Read the partitioned table -df3 <- read.df("data/test_table", "parquet", mergeSchema="true") +df3 <- read.df("data/test_table", "parquet", mergeSchema = "true") printSchema(df3) # The final schema consists of all 3 columns in the Parquet files together @@ -195,3 +195,6 @@ results <- collect(sql("FROM src SELECT key, value")) # $example on:jdbc$ df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password") # $example off:jdbc$ + +# Stop the SparkSession now +sparkR.session.stop() diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R index 295f9b427622..82b85f2f590f 100644 --- a/examples/src/main/r/dataframe.R +++ b/examples/src/main/r/dataframe.R @@ -18,7 +18,7 @@ library(SparkR) # Initialize SparkSession -sc <- sparkR.session(appName = "SparkR-DataFrame-example") +sparkR.session(appName = "SparkR-DataFrame-example") # Create a simple local data.frame localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18)) diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index a68fd0285f56..86eed3867c53 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -18,7 +18,6 @@ // scalastyle:off println package org.apache.spark.examples -import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession /** diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala index 2c2bf421bc5d..26095b46f58e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala @@ -33,8 +33,10 @@ import org.apache.spark.sql.SparkSession */ object GaussianMixtureExample { def main(args: Array[String]): Unit = { - // Creates a SparkSession - val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate() + val spark = SparkSession + .builder + .appName(s"${this.getClass.getSimpleName}") + .getOrCreate() // $example on$ // Loads data diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala index 7c5d3f23411f..a840559d2445 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala @@ -33,8 +33,6 @@ import org.apache.spark.sql.SparkSession object IsotonicRegressionExample { def main(args: Array[String]): Unit = { - - // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index 2341b36db240..a1d19e138ded 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.SparkSession object KMeansExample { def main(args: Array[String]): Unit = { - // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala index 75fef2922adb..1cd2641f9a8d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala @@ -46,6 +46,7 @@ object ModelSelectionViaTrainValidationSplitExample { val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) val lr = new LinearRegression() + .setMaxIter(10) // We use a ParamGridBuilder to construct a grid of parameters to search over. // TrainValidationSplit will try all combinations of values and determine best model using diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index e8a9b32da966..a39e3202ba19 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -39,27 +39,33 @@ object MultilayerPerceptronClassifierExample { // Load the data stored in LIBSVM format as a DataFrame. val data = spark.read.format("libsvm") .load("data/mllib/sample_multiclass_classification_data.txt") + // Split the data into train and test val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) val train = splits(0) val test = splits(1) + // specify layers for the neural network: // input layer of size 4 (features), two intermediate of size 5 and 4 // and output of size 3 (classes) val layers = Array[Int](4, 5, 4, 3) + // create the trainer and set its parameters val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setMaxIter(100) + // train the model val model = trainer.fit(train) + // compute accuracy on the test set val result = model.transform(test) val predictionAndLabels = result.select("prediction", "label") val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") + println("Accuracy: " + evaluator.evaluate(predictionAndLabels)) // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 7089a4bc87aa..3ae0623c4c0f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -30,6 +30,7 @@ object NaiveBayesExample { .builder .appName("NaiveBayesExample") .getOrCreate() + // $example on$ // Load the data stored in LIBSVM format as a DataFrame. val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala index b16692b1fa36..12f8663b9ce5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala @@ -54,7 +54,7 @@ object PipelineExample { .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) - .setRegParam(0.01) + .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) @@ -74,7 +74,7 @@ object PipelineExample { val test = spark.createDataFrame(Seq( (4L, "spark i j k"), (5L, "l m n"), - (6L, "mapreduce spark"), + (6L, "spark hadoop spark"), (7L, "apache hadoop") )).toDF("id", "text") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala index 2f7e217b8fe2..aedb9e7d3bb7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala @@ -28,16 +28,16 @@ object QuantileDiscretizerExample { .builder .appName("QuantileDiscretizerExample") .getOrCreate() - import spark.implicits._ // $example on$ val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)) - var df = spark.createDataFrame(data).toDF("id", "hour") + val df = spark.createDataFrame(data).toDF("id", "hour") // $example off$ // Output of QuantileDiscretizer for such small datasets can depend on the number of // partitions. Here we force a single partition to ensure consistent results. // Note this is not necessary for normal use cases .repartition(1) + // $example on$ val discretizer = new QuantileDiscretizer() .setInputCol("hour") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala index 9ea492014644..3498fa8a50c6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala @@ -36,10 +36,12 @@ object RFormulaExample { (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0) )).toDF("id", "country", "hour", "clicked") + val formula = new RFormula() .setFormula("clicked ~ country + hour") .setFeaturesCol("features") .setLabelCol("label") + val output = formula.fit(dataset).transform(dataset) output.select("features", "label").show() // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala deleted file mode 100644 index 29f1f509608a..000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.ml - -import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.{Row, SparkSession} - -/** - * A simple example demonstrating ways to specify parameters for Estimators and Transformers. - * Run with - * {{{ - * bin/run-example ml.SimpleParamsExample - * }}} - */ -object SimpleParamsExample { - - def main(args: Array[String]) { - val spark = SparkSession - .builder - .appName("SimpleParamsExample") - .getOrCreate() - import spark.implicits._ - - // Prepare training data. - // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes - // into DataFrames, where it uses the case class metadata to infer the schema. - val training = spark.createDataFrame(Seq( - LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), - LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), - LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))) - - // Create a LogisticRegression instance. This instance is an Estimator. - val lr = new LogisticRegression() - // Print out the parameters, documentation, and any default values. - println("LogisticRegression parameters:\n" + lr.explainParams() + "\n") - - // We may set parameters using setter methods. - lr.setMaxIter(10) - .setRegParam(0.01) - - // Learn a LogisticRegression model. This uses the parameters stored in lr. - val model1 = lr.fit(training) - // Since model1 is a Model (i.e., a Transformer produced by an Estimator), - // we can view the parameters it used during fit(). - // This prints the parameter (name: value) pairs, where names are unique IDs for this - // LogisticRegression instance. - println("Model 1 was fit using parameters: " + model1.parent.extractParamMap()) - - // We may alternatively specify parameters using a ParamMap, - // which supports several methods for specifying parameters. - val paramMap = ParamMap(lr.maxIter -> 20) - paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. - paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5)) // Specify multiple Params. - - // One can also combine ParamMaps. - val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name - val paramMapCombined = paramMap ++ paramMap2 - - // Now learn a new model using the paramMapCombined parameters. - // paramMapCombined overrides all parameters set earlier via lr.set* methods. - val model2 = lr.fit(training.toDF(), paramMapCombined) - println("Model 2 was fit using parameters: " + model2.parent.extractParamMap()) - - // Prepare test data. - val test = spark.createDataFrame(Seq( - LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) - - // Make predictions on test data using the Transformer.transform() method. - // LogisticRegressionModel.transform will only use the 'features' column. - // Note that model2.transform() outputs a 'myProbability' column instead of the usual - // 'probability' column since we renamed the lr.probabilityCol parameter previously. - model2.transform(test) - .select("features", "label", "myProbability", "prediction") - .collect() - .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => - println(s"($features, $label) -> prob=$prob, prediction=$prediction") - } - - spark.stop() - } -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala deleted file mode 100644 index 0b2a058bb61a..000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.ml - -import scala.beans.BeanInfo - -import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.feature.{HashingTF, Tokenizer} -import org.apache.spark.ml.linalg.Vector -import org.apache.spark.sql.{Row, SparkSession} - -@BeanInfo -case class LabeledDocument(id: Long, text: String, label: Double) - -@BeanInfo -case class Document(id: Long, text: String) - -/** - * A simple text classification pipeline that recognizes "spark" from input text. This is to show - * how to create and configure an ML pipeline. Run with - * {{{ - * bin/run-example ml.SimpleTextClassificationPipeline - * }}} - */ -object SimpleTextClassificationPipeline { - - def main(args: Array[String]) { - val spark = SparkSession - .builder - .appName("SimpleTextClassificationPipeline") - .getOrCreate() - import spark.implicits._ - - // Prepare training documents, which are labeled. - val training = spark.createDataFrame(Seq( - LabeledDocument(0L, "a b c d e spark", 1.0), - LabeledDocument(1L, "b d", 0.0), - LabeledDocument(2L, "spark f g h", 1.0), - LabeledDocument(3L, "hadoop mapreduce", 0.0))) - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. - val tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words") - val hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol) - .setOutputCol("features") - val lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.001) - val pipeline = new Pipeline() - .setStages(Array(tokenizer, hashingTF, lr)) - - // Fit the pipeline to training documents. - val model = pipeline.fit(training.toDF()) - - // Prepare test documents, which are unlabeled. - val test = spark.createDataFrame(Seq( - Document(4L, "spark i j k"), - Document(5L, "l m n"), - Document(6L, "spark hadoop spark"), - Document(7L, "apache hadoop"))) - - // Make predictions on test documents. - model.transform(test.toDF()) - .select("id", "text", "probability", "prediction") - .collect() - .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => - println(s"($id, $text) --> prob=$prob, prediction=$prediction") - } - - spark.stop() - } -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala index fb1a43e962cd..a56de0856dbb 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala @@ -36,7 +36,7 @@ object StopWordsRemoverExample { .setOutputCol("filtered") val dataSet = spark.createDataFrame(Seq( - (0, Seq("I", "saw", "the", "red", "baloon")), + (0, Seq("I", "saw", "the", "red", "balloon")), (1, Seq("Mary", "had", "a", "little", "lamb")) )).toDF("id", "raw") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala index 33b5daec5978..97f6fcce15e3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala @@ -40,13 +40,16 @@ object TfIdfExample { val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") val wordsData = tokenizer.transform(sentenceData) + val hashingTF = new HashingTF() .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20) + val featurizedData = hashingTF.transform(wordsData) // alternatively, CountVectorizer can also be used to get term frequency vectors val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features") val idfModel = idf.fit(featurizedData) + val rescaledData = idfModel.transform(featurizedData) rescaledData.select("features", "label").take(3).foreach(println) // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala index 1c70dc700b91..90d0faaf4750 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -45,6 +45,7 @@ object TokenizerExample { val tokenized = tokenizer.transform(sentenceDataFrame) tokenized.select("words", "label").take(3).foreach(println) + val regexTokenized = regexTokenizer.transform(sentenceDataFrame) regexTokenized.select("words", "label").take(3).foreach(println) // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala index 9ac562360729..5c8bd19f20a8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala @@ -45,6 +45,7 @@ object Word2VecExample { .setVectorSize(3) .setMinCount(0) val model = word2Vec.fit(documentDF) + val result = model.transform(documentDF) result.select("result").take(3).foreach(println) // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala new file mode 100644 index 000000000000..0caba12af0bd --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.sql + +import org.apache.spark.sql.SparkSession + +object SQLDataSourceExample { + + case class Person(name: String, age: Long) + + def main(args: Array[String]) { + val spark = SparkSession + .builder() + .appName("Spark SQL Data Soures Example") + .config("spark.some.config.option", "some-value") + .getOrCreate() + + runBasicDataSourceExample(spark) + runBasicParquetExample(spark) + runParquetSchemaMergingExample(spark) + runJsonDatasetExample(spark) + + spark.stop() + } + + private def runBasicDataSourceExample(spark: SparkSession): Unit = { + // $example on:generic_load_save_functions$ + val usersDF = spark.read.load("examples/src/main/resources/users.parquet") + usersDF.select("name", "favorite_color").write.save("namesAndFavColors.parquet") + // $example off:generic_load_save_functions$ + // $example on:manual_load_options$ + val peopleDF = spark.read.format("json").load("examples/src/main/resources/people.json") + peopleDF.select("name", "age").write.format("parquet").save("namesAndAges.parquet") + // $example off:manual_load_options$ + // $example on:direct_sql$ + val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") + // $example off:direct_sql$ + } + + private def runBasicParquetExample(spark: SparkSession): Unit = { + // $example on:basic_parquet_example$ + // Encoders for most common types are automatically provided by importing spark.implicits._ + import spark.implicits._ + + val peopleDF = spark.read.json("examples/src/main/resources/people.json") + + // DataFrames can be saved as Parquet files, maintaining the schema information + peopleDF.write.parquet("people.parquet") + + // Read in the parquet file created above + // Parquet files are self-describing so the schema is preserved + // The result of loading a Parquet file is also a DataFrame + val parquetFileDF = spark.read.parquet("people.parquet") + + // Parquet files can also be used to create a temporary view and then used in SQL statements + parquetFileDF.createOrReplaceTempView("parquetFile") + val namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19") + namesDF.map(attributes => "Name: " + attributes(0)).show() + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + // $example off:basic_parquet_example$ + } + + private def runParquetSchemaMergingExample(spark: SparkSession): Unit = { + // $example on:schema_merging$ + // This is used to implicitly convert an RDD to a DataFrame. + import spark.implicits._ + + // Create a simple DataFrame, store into a partition directory + val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square") + squaresDF.write.parquet("data/test_table/key=1") + + // Create another DataFrame in a new partition directory, + // adding a new column and dropping an existing column + val cubesDF = spark.sparkContext.makeRDD(6 to 10).map(i => (i, i * i * i)).toDF("value", "cube") + cubesDF.write.parquet("data/test_table/key=2") + + // Read the partitioned table + val mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table") + mergedDF.printSchema() + + // The final schema consists of all 3 columns in the Parquet files together + // with the partitioning column appeared in the partition directory paths + // root + // |-- value: int (nullable = true) + // |-- square: int (nullable = true) + // |-- cube: int (nullable = true) + // |-- key : int (nullable = true) + // $example off:schema_merging$ + } + + private def runJsonDatasetExample(spark: SparkSession): Unit = { + // $example on:json_dataset$ + // A JSON dataset is pointed to by path. + // The path can be either a single text file or a directory storing text files + val path = "examples/src/main/resources/people.json" + val peopleDF = spark.read.json(path) + + // The inferred schema can be visualized using the printSchema() method + peopleDF.printSchema() + // root + // |-- age: long (nullable = true) + // |-- name: string (nullable = true) + + // Creates a temporary view using the DataFrame + peopleDF.createOrReplaceTempView("people") + + // SQL statements can be run by using the sql methods provided by spark + val teenagerNamesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19") + teenagerNamesDF.show() + // +------+ + // | name| + // +------+ + // |Justin| + // +------+ + + // Alternatively, a DataFrame can be created for a JSON dataset represented by + // an RDD[String] storing one JSON object per string + val otherPeopleRDD = spark.sparkContext.makeRDD( + """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil) + val otherPeople = spark.read.json(otherPeopleRDD) + otherPeople.show() + // +---------------+----+ + // | address|name| + // +---------------+----+ + // |[Columbus,Ohio]| Yin| + // +---------------+----+ + // $example off:json_dataset$ + } + +} diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala new file mode 100644 index 000000000000..952c074d0345 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.sql + +// $example on:schema_inferring$ +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.Encoder +// $example off:schema_inferring$ +import org.apache.spark.sql.Row +// $example on:init_session$ +import org.apache.spark.sql.SparkSession +// $example off:init_session$ +// $example on:programmatic_schema$ +// $example on:data_types$ +import org.apache.spark.sql.types._ +// $example off:data_types$ +// $example off:programmatic_schema$ + +object SparkSQLExample { + + // $example on:create_ds$ + // Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit, + // you can use custom classes that implement the Product interface + case class Person(name: String, age: Long) + // $example off:create_ds$ + + def main(args: Array[String]) { + // $example on:init_session$ + val spark = SparkSession + .builder() + .appName("Spark SQL Example") + .config("spark.some.config.option", "some-value") + .getOrCreate() + + // For implicit conversions like converting RDDs to DataFrames + import spark.implicits._ + // $example off:init_session$ + + runBasicDataFrameExample(spark) + runDatasetCreationExample(spark) + runInferSchemaExample(spark) + runProgrammaticSchemaExample(spark) + + spark.stop() + } + + private def runBasicDataFrameExample(spark: SparkSession): Unit = { + // $example on:create_df$ + val df = spark.read.json("examples/src/main/resources/people.json") + + // Displays the content of the DataFrame to stdout + df.show() + // +----+-------+ + // | age| name| + // +----+-------+ + // |null|Michael| + // | 30| Andy| + // | 19| Justin| + // +----+-------+ + // $example off:create_df$ + + // $example on:untyped_ops$ + // This import is needed to use the $-notation + import spark.implicits._ + // Print the schema in a tree format + df.printSchema() + // root + // |-- age: long (nullable = true) + // |-- name: string (nullable = true) + + // Select only the "name" column + df.select("name").show() + // +-------+ + // | name| + // +-------+ + // |Michael| + // | Andy| + // | Justin| + // +-------+ + + // Select everybody, but increment the age by 1 + df.select($"name", $"age" + 1).show() + // +-------+---------+ + // | name|(age + 1)| + // +-------+---------+ + // |Michael| null| + // | Andy| 31| + // | Justin| 20| + // +-------+---------+ + + // Select people older than 21 + df.filter($"age" > 21).show() + // +---+----+ + // |age|name| + // +---+----+ + // | 30|Andy| + // +---+----+ + + // Count people by age + df.groupBy("age").count().show() + // +----+-----+ + // | age|count| + // +----+-----+ + // | 19| 1| + // |null| 1| + // | 30| 1| + // +----+-----+ + // $example off:untyped_ops$ + + // $example on:run_sql$ + // Register the DataFrame as a SQL temporary view + df.createOrReplaceTempView("people") + + val sqlDF = spark.sql("SELECT * FROM people") + sqlDF.show() + // +----+-------+ + // | age| name| + // +----+-------+ + // |null|Michael| + // | 30| Andy| + // | 19| Justin| + // +----+-------+ + // $example off:run_sql$ + } + + private def runDatasetCreationExample(spark: SparkSession): Unit = { + import spark.implicits._ + // $example on:create_ds$ + // Encoders are created for case classes + val caseClassDS = Seq(Person("Andy", 32)).toDS() + caseClassDS.show() + // +----+---+ + // |name|age| + // +----+---+ + // |Andy| 32| + // +----+---+ + + // Encoders for most common types are automatically provided by importing spark.implicits._ + val primitiveDS = Seq(1, 2, 3).toDS() + primitiveDS.map(_ + 1).collect() // Returns: Array(2, 3, 4) + + // DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name + val path = "examples/src/main/resources/people.json" + val peopleDS = spark.read.json(path).as[Person] + peopleDS.show() + // +----+-------+ + // | age| name| + // +----+-------+ + // |null|Michael| + // | 30| Andy| + // | 19| Justin| + // +----+-------+ + // $example off:create_ds$ + } + + private def runInferSchemaExample(spark: SparkSession): Unit = { + // $example on:schema_inferring$ + // For implicit conversions from RDDs to DataFrames + import spark.implicits._ + + // Create an RDD of Person objects from a text file, convert it to a Dataframe + val peopleDF = spark.sparkContext + .textFile("examples/src/main/resources/people.txt") + .map(_.split(",")) + .map(attributes => Person(attributes(0), attributes(1).trim.toInt)) + .toDF() + // Register the DataFrame as a temporary view + peopleDF.createOrReplaceTempView("people") + + // SQL statements can be run by using the sql methods provided by Spark + val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19") + + // The columns of a row in the result can be accessed by field index + teenagersDF.map(teenager => "Name: " + teenager(0)).show() + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + + // or by field name + teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show() + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + + // No pre-defined encoders for Dataset[Map[K,V]], define explicitly + implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]] + // Primitive types and case classes can be also defined as + implicit val stringIntMapEncoder: Encoder[Map[String, Int]] = ExpressionEncoder() + + // row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T] + teenagersDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect() + // Array(Map("name" -> "Justin", "age" -> 19)) + // $example off:schema_inferring$ + } + + private def runProgrammaticSchemaExample(spark: SparkSession): Unit = { + import spark.implicits._ + // $example on:programmatic_schema$ + // Create an RDD + val peopleRDD = spark.sparkContext.textFile("examples/src/main/resources/people.txt") + + // The schema is encoded in a string + val schemaString = "name age" + + // Generate the schema based on the string of schema + val fields = schemaString.split(" ") + .map(fieldName => StructField(fieldName, StringType, nullable = true)) + val schema = StructType(fields) + + // Convert records of the RDD (people) to Rows + val rowRDD = peopleRDD + .map(_.split(",")) + .map(attributes => Row(attributes(0), attributes(1).trim)) + + // Apply the schema to the RDD + val peopleDF = spark.createDataFrame(rowRDD, schema) + + // Creates a temporary view using the DataFrame + peopleDF.createOrReplaceTempView("people") + + // SQL can be run over a temporary view created using DataFrames + val results = spark.sql("SELECT name FROM people") + + // The results of SQL queries are DataFrames and support all the normal RDD operations + // The columns of a row in the result can be accessed by field index or by field name + results.map(attributes => "Name: " + attributes(0)).show() + // +-------------+ + // | value| + // +-------------+ + // |Name: Michael| + // | Name: Andy| + // | Name: Justin| + // +-------------+ + // $example off:programmatic_schema$ + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala deleted file mode 100644 index 2343f98c8d07..000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.sql.hive - -import java.io.File - -import com.google.common.io.{ByteStreams, Files} - -import org.apache.spark.sql._ - -object HiveFromSpark { - case class Record(key: Int, value: String) - - // Copy kv1.txt file from classpath to temporary directory - val kv1Stream = HiveFromSpark.getClass.getResourceAsStream("/kv1.txt") - val kv1File = File.createTempFile("kv1", "txt") - kv1File.deleteOnExit() - ByteStreams.copy(kv1Stream, Files.newOutputStreamSupplier(kv1File)) - - def main(args: Array[String]) { - // When working with Hive, one must instantiate `SparkSession` with Hive support, including - // connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined - // functions. Users who do not have an existing Hive deployment can still enable Hive support. - // When not configured by the hive-site.xml, the context automatically creates `metastore_db` - // in the current directory and creates a directory configured by `spark.sql.warehouse.dir`, - // which defaults to the directory `spark-warehouse` in the current directory that the spark - // application is started. - val spark = SparkSession.builder - .appName("HiveFromSpark") - .enableHiveSupport() - .getOrCreate() - - import spark.implicits._ - import spark.sql - - sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") - sql(s"LOAD DATA LOCAL INPATH '${kv1File.getAbsolutePath}' INTO TABLE src") - - // Queries are expressed in HiveQL - println("Result of 'SELECT *': ") - sql("SELECT * FROM src").collect().foreach(println) - - // Aggregation queries are also supported. - val count = sql("SELECT COUNT(*) FROM src").collect().head.getLong(0) - println(s"COUNT(*): $count") - - // The results of SQL queries are themselves RDDs and support all normal RDD functions. The - // items in the RDD are of type Row, which allows you to access each column by ordinal. - val rddFromSql = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key") - - println("Result of RDD.map:") - val rddAsStrings = rddFromSql.rdd.map { - case Row(key: Int, value: String) => s"Key: $key, Value: $value" - } - - // You can also use RDDs to create temporary views within a HiveContext. - val rdd = spark.sparkContext.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) - rdd.toDF().createOrReplaceTempView("records") - - // Queries can then join RDD data with data stored in Hive. - println("Result of SELECT *:") - sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) - - spark.stop() - } -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala new file mode 100644 index 000000000000..11e84c0e4563 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.sql.hive + +// $example on:spark_hive$ +import org.apache.spark.sql.Row +import org.apache.spark.sql.SparkSession +// $example off:spark_hive$ + +object SparkHiveExample { + + // $example on:spark_hive$ + case class Record(key: Int, value: String) + // $example off:spark_hive$ + + def main(args: Array[String]) { + // When working with Hive, one must instantiate `SparkSession` with Hive support, including + // connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined + // functions. Users who do not have an existing Hive deployment can still enable Hive support. + // When not configured by the hive-site.xml, the context automatically creates `metastore_db` + // in the current directory and creates a directory configured by `spark.sql.warehouse.dir`, + // which defaults to the directory `spark-warehouse` in the current directory that the spark + // application is started. + + // $example on:spark_hive$ + // warehouseLocation points to the default location for managed databases and tables + val warehouseLocation = "file:${system:user.dir}/spark-warehouse" + + val spark = SparkSession + .builder() + .appName("Spark Hive Example") + .config("spark.sql.warehouse.dir", warehouseLocation) + .enableHiveSupport() + .getOrCreate() + + import spark.implicits._ + import spark.sql + + sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") + + // Queries are expressed in HiveQL + sql("SELECT * FROM src").show() + // +---+-------+ + // |key| value| + // +---+-------+ + // |238|val_238| + // | 86| val_86| + // |311|val_311| + // ... + + // Aggregation queries are also supported. + sql("SELECT COUNT(*) FROM src").show() + // +--------+ + // |count(1)| + // +--------+ + // | 500 | + // +--------+ + + // The results of SQL queries are themselves DataFrames and support all normal functions. + val sqlDF = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key") + + // The items in DaraFrames are of type Row, which allows you to access each column by ordinal. + val stringsDS = sqlDF.map { + case Row(key: Int, value: String) => s"Key: $key, Value: $value" + } + stringsDS.show() + // +--------------------+ + // | value| + // +--------------------+ + // |Key: 0, Value: val_0| + // |Key: 0, Value: val_0| + // |Key: 0, Value: val_0| + // ... + + // You can also use DataFrames to create temporary views within a SparkSession. + val recordsDF = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i"))) + recordsDF.createOrReplaceTempView("records") + + // Queries can then join DataFrame data with data stored in Hive. + sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show() + // +---+------+---+------+ + // |key| value|key| value| + // +---+------+---+------+ + // | 2| val_2| 2| val_2| + // | 4| val_4| 4| val_4| + // | 5| val_5| 5| val_5| + // ... + // $example off:spark_hive$ + + spark.stop() + } +} diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 9e0636e63b5d..fb0292a5f11e 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-flume-assembly_2.11 jar Spark Project External Flume Assembly diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 6b788d7da388..5e9275c8e66d 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-flume-sink_2.11 streaming-flume-sink diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 1b1105d763a0..7b68ca7373fe 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-flume_2.11 streaming-flume diff --git a/external/java8-tests/pom.xml b/external/java8-tests/pom.xml index a0923932908e..1bc206e8675f 100644 --- a/external/java8-tests/pom.xml +++ b/external/java8-tests/pom.xml @@ -24,7 +24,6 @@ ../../pom.xml - org.apache.spark java8-tests_2.11 pom Spark Project Java 8 Tests diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 6c1a120c5e0f..4f5045326a00 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-kafka-0-10-assembly_2.11 jar Spark Integration for Kafka 0.10 Assembly diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index b38de96cd56b..60afaa582b61 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-kafka-0-10_2.11 streaming-kafka-0-10 diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 4839ed4f427b..bc02b8a66246 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-kafka-0-8-assembly_2.11 jar Spark Project External Kafka Assembly diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 948b9f500a17..91ccd4a927e9 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-kafka-0-8_2.11 streaming-kafka-0-8 diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 09ca2c3aac0d..df528b359802 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -25,7 +25,6 @@ ../../pom.xml - org.apache.spark spark-streaming-kinesis-asl-assembly_2.11 jar Spark Project Kinesis Assembly diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 3bb80343f028..57809ff692c2 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -25,7 +25,6 @@ - org.apache.spark spark-streaming-kinesis-asl_2.11 jar Spark Kinesis Integration diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 34cbd9a62073..fab409d3e9f9 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -25,7 +25,6 @@ - org.apache.spark spark-ganglia-lgpl_2.11 jar Spark Ganglia Integration diff --git a/graphx/pom.xml b/graphx/pom.xml index c04580cb4696..bd4e53371b86 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-graphx_2.11 graphx diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala old mode 100644 new mode 100755 index 1fa92b019541..e4f80ffcb451 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala @@ -44,6 +44,9 @@ object StronglyConnectedComponents { // graph we are going to work with in our iterations var sccWorkGraph = graph.mapVertices { case (vid, _) => (vid, false) }.cache() + // helper variables to unpersist cached graphs + var prevSccGraph = sccGraph + var numVertices = sccWorkGraph.numVertices var iter = 0 while (sccWorkGraph.numVertices > 0 && iter < numIter) { @@ -64,48 +67,59 @@ object StronglyConnectedComponents { // write values to sccGraph sccGraph = sccGraph.outerJoinVertices(finalVertices) { (vid, scc, opt) => opt.getOrElse(scc) - } + }.cache() + // materialize vertices and edges + sccGraph.vertices.count() + sccGraph.edges.count() + // sccGraph materialized so, unpersist can be done on previous + prevSccGraph.unpersist(blocking = false) + prevSccGraph = sccGraph + // only keep vertices that are not final sccWorkGraph = sccWorkGraph.subgraph(vpred = (vid, data) => !data._2).cache() } while (sccWorkGraph.numVertices < numVertices) - sccWorkGraph = sccWorkGraph.mapVertices{ case (vid, (color, isFinal)) => (vid, isFinal) } + // if iter < numIter at this point sccGraph that is returned + // will not be recomputed and pregel executions are pointless + if (iter < numIter) { + sccWorkGraph = sccWorkGraph.mapVertices { case (vid, (color, isFinal)) => (vid, isFinal) } - // collect min of all my neighbor's scc values, update if it's smaller than mine - // then notify any neighbors with scc values larger than mine - sccWorkGraph = Pregel[(VertexId, Boolean), ED, VertexId]( - sccWorkGraph, Long.MaxValue, activeDirection = EdgeDirection.Out)( - (vid, myScc, neighborScc) => (math.min(myScc._1, neighborScc), myScc._2), - e => { - if (e.srcAttr._1 < e.dstAttr._1) { - Iterator((e.dstId, e.srcAttr._1)) - } else { - Iterator() - } - }, - (vid1, vid2) => math.min(vid1, vid2)) + // collect min of all my neighbor's scc values, update if it's smaller than mine + // then notify any neighbors with scc values larger than mine + sccWorkGraph = Pregel[(VertexId, Boolean), ED, VertexId]( + sccWorkGraph, Long.MaxValue, activeDirection = EdgeDirection.Out)( + (vid, myScc, neighborScc) => (math.min(myScc._1, neighborScc), myScc._2), + e => { + if (e.srcAttr._1 < e.dstAttr._1) { + Iterator((e.dstId, e.srcAttr._1)) + } else { + Iterator() + } + }, + (vid1, vid2) => math.min(vid1, vid2)) - // start at root of SCCs. Traverse values in reverse, notify all my neighbors - // do not propagate if colors do not match! - sccWorkGraph = Pregel[(VertexId, Boolean), ED, Boolean]( - sccWorkGraph, false, activeDirection = EdgeDirection.In)( - // vertex is final if it is the root of a color - // or it has the same color as a neighbor that is final - (vid, myScc, existsSameColorFinalNeighbor) => { - val isColorRoot = vid == myScc._1 - (myScc._1, myScc._2 || isColorRoot || existsSameColorFinalNeighbor) - }, - // activate neighbor if they are not final, you are, and you have the same color - e => { - val sameColor = e.dstAttr._1 == e.srcAttr._1 - val onlyDstIsFinal = e.dstAttr._2 && !e.srcAttr._2 - if (sameColor && onlyDstIsFinal) { - Iterator((e.srcId, e.dstAttr._2)) - } else { - Iterator() - } - }, - (final1, final2) => final1 || final2) + // start at root of SCCs. Traverse values in reverse, notify all my neighbors + // do not propagate if colors do not match! + sccWorkGraph = Pregel[(VertexId, Boolean), ED, Boolean]( + sccWorkGraph, false, activeDirection = EdgeDirection.In)( + // vertex is final if it is the root of a color + // or it has the same color as a neighbor that is final + (vid, myScc, existsSameColorFinalNeighbor) => { + val isColorRoot = vid == myScc._1 + (myScc._1, myScc._2 || isColorRoot || existsSameColorFinalNeighbor) + }, + // activate neighbor if they are not final, you are, and you have the same color + e => { + val sameColor = e.dstAttr._1 == e.srcAttr._1 + val onlyDstIsFinal = e.dstAttr._2 && !e.srcAttr._2 + if (sameColor && onlyDstIsFinal) { + Iterator((e.srcId, e.dstAttr._2)) + } else { + Iterator() + } + }, + (final1, final2) => final1 || final2) + } } sccGraph } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala index 4da1ecb2a9af..2b3e5f98c4fe 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala @@ -119,7 +119,7 @@ object GraphGenerators extends Logging { * A random graph generator using the R-MAT model, proposed in * "R-MAT: A Recursive Model for Graph Mining" by Chakrabarti et al. * - * See [[http://www.cs.cmu.edu/~christos/PUBLICATIONS/siam04.pdf]]. + * See http://www.cs.cmu.edu/~christos/PUBLICATIONS/siam04.pdf. */ def rmatGraph(sc: SparkContext, requestedNumVertices: Int, numEdges: Int): Graph[Int, Int] = { // let N = requestedNumVertices diff --git a/launcher/pom.xml b/launcher/pom.xml index 3db649bd3251..6023cf077186 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -26,7 +26,6 @@ ../pom.xml - org.apache.spark spark-launcher_2.11 jar Spark Project Launcher diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 1bfda289dec3..c0779e1c4e9a 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -21,7 +21,6 @@ import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.ThreadFactory; import java.util.logging.Level; import java.util.logging.Logger; @@ -31,8 +30,6 @@ class ChildProcAppHandle implements SparkAppHandle { private static final Logger LOG = Logger.getLogger(ChildProcAppHandle.class.getName()); - private static final ThreadFactory REDIRECTOR_FACTORY = - new NamedThreadFactory("launcher-proc-%d"); private final String secret; private final LauncherServer server; @@ -127,7 +124,7 @@ String getSecret() { void setChildProc(Process childProc, String loggerName) { this.childProc = childProc; this.redirector = new OutputRedirector(childProc.getInputStream(), loggerName, - REDIRECTOR_FACTORY); + SparkLauncher.REDIRECTOR_FACTORY); } void setConnection(LauncherConnection connection) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java index 08873f581123..41f7f1f3ed5a 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java @@ -24,6 +24,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ThreadFactory; import java.util.concurrent.atomic.AtomicInteger; import static org.apache.spark.launcher.CommandBuilderUtils.*; @@ -82,6 +83,9 @@ public class SparkLauncher { /** Used internally to create unique logger names. */ private static final AtomicInteger COUNTER = new AtomicInteger(); + /** Factory for creating OutputRedirector threads. **/ + static final ThreadFactory REDIRECTOR_FACTORY = new NamedThreadFactory("launcher-proc-%d"); + static final Map launcherConfig = new HashMap<>(); /** @@ -99,6 +103,11 @@ public static void setConfig(String name, String value) { // Visible for testing. final SparkSubmitCommandBuilder builder; + File workingDir; + boolean redirectToLog; + boolean redirectErrorStream; + ProcessBuilder.Redirect errorStream; + ProcessBuilder.Redirect outputStream; public SparkLauncher() { this(null); @@ -358,6 +367,83 @@ public SparkLauncher setVerbose(boolean verbose) { return this; } + /** + * Sets the working directory of spark-submit. + * + * @param dir The directory to set as spark-submit's working directory. + * @return This launcher. + */ + public SparkLauncher directory(File dir) { + workingDir = dir; + return this; + } + + /** + * Specifies that stderr in spark-submit should be redirected to stdout. + * + * @return This launcher. + */ + public SparkLauncher redirectError() { + redirectErrorStream = true; + return this; + } + + /** + * Redirects error output to the specified Redirect. + * + * @param to The method of redirection. + * @return This launcher. + */ + public SparkLauncher redirectError(ProcessBuilder.Redirect to) { + errorStream = to; + return this; + } + + /** + * Redirects standard output to the specified Redirect. + * + * @param to The method of redirection. + * @return This launcher. + */ + public SparkLauncher redirectOutput(ProcessBuilder.Redirect to) { + outputStream = to; + return this; + } + + /** + * Redirects error output to the specified File. + * + * @param errFile The file to which stderr is written. + * @return This launcher. + */ + public SparkLauncher redirectError(File errFile) { + errorStream = ProcessBuilder.Redirect.to(errFile); + return this; + } + + /** + * Redirects error output to the specified File. + * + * @param outFile The file to which stdout is written. + * @return This launcher. + */ + public SparkLauncher redirectOutput(File outFile) { + outputStream = ProcessBuilder.Redirect.to(outFile); + return this; + } + + /** + * Sets all output to be logged and redirected to a logger with the specified name. + * + * @param loggerName The name of the logger to log stdout and stderr. + * @return This launcher. + */ + public SparkLauncher redirectToLog(String loggerName) { + setConf(CHILD_PROCESS_LOGGER_NAME, loggerName); + redirectToLog = true; + return this; + } + /** * Launches a sub-process that will start the configured Spark application. *

    @@ -367,7 +453,12 @@ public SparkLauncher setVerbose(boolean verbose) { * @return A process handle for the Spark app. */ public Process launch() throws IOException { - return createBuilder().start(); + Process childProc = createBuilder().start(); + if (redirectToLog) { + String loggerName = builder.getEffectiveConfig().get(CHILD_PROCESS_LOGGER_NAME); + new OutputRedirector(childProc.getInputStream(), loggerName, REDIRECTOR_FACTORY); + } + return childProc; } /** @@ -383,12 +474,13 @@ public Process launch() throws IOException { * a child process, {@link SparkAppHandle#kill()} can still be used to kill the child process. *

    * Currently, all applications are launched as child processes. The child's stdout and stderr - * are merged and written to a logger (see java.util.logging). The logger's name - * can be defined by setting {@link #CHILD_PROCESS_LOGGER_NAME} in the app's configuration. If - * that option is not set, the code will try to derive a name from the application's name or - * main class / script file. If those cannot be determined, an internal, unique name will be - * used. In all cases, the logger name will start with "org.apache.spark.launcher.app", to fit - * more easily into the configuration of commonly-used logging systems. + * are merged and written to a logger (see java.util.logging) only if redirection + * has not otherwise been configured on this SparkLauncher. The logger's name can be + * defined by setting {@link #CHILD_PROCESS_LOGGER_NAME} in the app's configuration. If that + * option is not set, the code will try to derive a name from the application's name or main + * class / script file. If those cannot be determined, an internal, unique name will be used. + * In all cases, the logger name will start with "org.apache.spark.launcher.app", to fit more + * easily into the configuration of commonly-used logging systems. * * @since 1.6.0 * @param listeners Listeners to add to the handle before the app is launched. @@ -400,27 +492,33 @@ public SparkAppHandle startApplication(SparkAppHandle.Listener... listeners) thr handle.addListener(l); } - String appName = builder.getEffectiveConfig().get(CHILD_PROCESS_LOGGER_NAME); - if (appName == null) { - if (builder.appName != null) { - appName = builder.appName; - } else if (builder.mainClass != null) { - int dot = builder.mainClass.lastIndexOf("."); - if (dot >= 0 && dot < builder.mainClass.length() - 1) { - appName = builder.mainClass.substring(dot + 1, builder.mainClass.length()); + String loggerName = builder.getEffectiveConfig().get(CHILD_PROCESS_LOGGER_NAME); + ProcessBuilder pb = createBuilder(); + // Only setup stderr + stdout to logger redirection if user has not otherwise configured output + // redirection. + if (loggerName == null) { + String appName = builder.getEffectiveConfig().get(CHILD_PROCESS_LOGGER_NAME); + if (appName == null) { + if (builder.appName != null) { + appName = builder.appName; + } else if (builder.mainClass != null) { + int dot = builder.mainClass.lastIndexOf("."); + if (dot >= 0 && dot < builder.mainClass.length() - 1) { + appName = builder.mainClass.substring(dot + 1, builder.mainClass.length()); + } else { + appName = builder.mainClass; + } + } else if (builder.appResource != null) { + appName = new File(builder.appResource).getName(); } else { - appName = builder.mainClass; + appName = String.valueOf(COUNTER.incrementAndGet()); } - } else if (builder.appResource != null) { - appName = new File(builder.appResource).getName(); - } else { - appName = String.valueOf(COUNTER.incrementAndGet()); } + String loggerPrefix = getClass().getPackage().getName(); + loggerName = String.format("%s.app.%s", loggerPrefix, appName); + pb.redirectErrorStream(true); } - String loggerPrefix = getClass().getPackage().getName(); - String loggerName = String.format("%s.app.%s", loggerPrefix, appName); - ProcessBuilder pb = createBuilder().redirectErrorStream(true); pb.environment().put(LauncherProtocol.ENV_LAUNCHER_PORT, String.valueOf(LauncherServer.getServerInstance().getPort())); pb.environment().put(LauncherProtocol.ENV_LAUNCHER_SECRET, handle.getSecret()); @@ -455,6 +553,29 @@ private ProcessBuilder createBuilder() { for (Map.Entry e : builder.childEnv.entrySet()) { pb.environment().put(e.getKey(), e.getValue()); } + + if (workingDir != null) { + pb.directory(workingDir); + } + + // Only one of redirectError and redirectError(...) can be specified. + // Similarly, if redirectToLog is specified, no other redirections should be specified. + checkState(!redirectErrorStream || errorStream == null, + "Cannot specify both redirectError() and redirectError(...) "); + checkState(!redirectToLog || + (!redirectErrorStream && errorStream == null && outputStream == null), + "Cannot used redirectToLog() in conjunction with other redirection methods."); + + if (redirectErrorStream || redirectToLog) { + pb.redirectErrorStream(true); + } + if (errorStream != null) { + pb.redirectError(errorStream); + } + if (outputStream != null) { + pb.redirectOutput(outputStream); + } + return pb; } diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 3816a15b142d..8c985fd13ac0 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-mllib-local_2.11 mllib-local diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala index c74e5d44a328..0659324aad1f 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala @@ -66,7 +66,7 @@ sealed trait Vector extends Serializable { /** * Returns a hash code value for the vector. The hash code is based on its size and its first 128 - * nonzero entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]]. + * nonzero entries, using a hash algorithm similar to `java.util.Arrays.hashCode`. */ override def hashCode(): Int = { // This is a reference implementation. It calls return in foreachActive, which is slow. diff --git a/mllib/pom.xml b/mllib/pom.xml index c7a9036695ad..4484998a49c8 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-mllib_2.11 mllib diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index a1d08b3a6e78..195a93e08672 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -27,7 +27,7 @@ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.param.{Param, ParamMap, Params} import org.apache.spark.ml.util._ @@ -78,7 +78,6 @@ abstract class PipelineStage extends Params with Logging { } /** - * :: Experimental :: * A simple pipeline, which acts as an estimator. A Pipeline consists of a sequence of stages, each * of which is either an [[Estimator]] or a [[Transformer]]. When [[Pipeline#fit]] is called, the * stages are executed in order. If a stage is an [[Estimator]], its [[Estimator#fit]] method will @@ -90,7 +89,6 @@ abstract class PipelineStage extends Params with Logging { * an identity transformer. */ @Since("1.2.0") -@Experimental class Pipeline @Since("1.4.0") ( @Since("1.4.0") override val uid: String) extends Estimator[PipelineModel] with MLWritable { @@ -214,7 +212,7 @@ object Pipeline extends MLReadable[Pipeline] { } } - /** Methods for [[MLReader]] and [[MLWriter]] shared between [[Pipeline]] and [[PipelineModel]] */ + /** Methods for `MLReader` and `MLWriter` shared between [[Pipeline]] and [[PipelineModel]] */ private[ml] object SharedReadWrite { import org.json4s.JsonDSL._ @@ -282,11 +280,9 @@ object Pipeline extends MLReadable[Pipeline] { } /** - * :: Experimental :: * Represents a fitted pipeline. */ @Since("1.2.0") -@Experimental class PipelineModel private[ml] ( @Since("1.4.0") override val uid: String, @Since("1.4.0") val stages: Array[Transformer]) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala index 569a5fb99376..e29d7f48a1d6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala @@ -165,7 +165,7 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, } /** - * Transforms dataset by reading from [[featuresCol]], calling [[predict()]], and storing + * Transforms dataset by reading from [[featuresCol]], calling `predict`, and storing * the predictions as a new column [[predictionCol]]. * * @param dataset input dataset diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index e35b04a1cf42..6decea72719f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -50,7 +50,7 @@ private[spark] trait ClassifierParams * Single-label binary or multiclass classification. * Classes are indexed {0, 1, ..., numClasses - 1}. * - * @tparam FeaturesType Type of input features. E.g., [[Vector]] + * @tparam FeaturesType Type of input features. E.g., `Vector` * @tparam E Concrete Estimator type * @tparam M Concrete Model type */ @@ -134,7 +134,7 @@ abstract class Classifier[ * Model produced by a [[Classifier]]. * Classes are indexed {0, 1, ..., numClasses - 1}. * - * @tparam FeaturesType Type of input features. E.g., [[Vector]] + * @tparam FeaturesType Type of input features. E.g., `Vector` * @tparam M Concrete Model type */ @DeveloperApi @@ -151,7 +151,7 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur * Transforms dataset by reading from [[featuresCol]], and appending new columns as specified by * parameters: * - predicted labels as [[predictionCol]] of type [[Double]] - * - raw predictions (confidences) as [[rawPredictionCol]] of type [[Vector]]. + * - raw predictions (confidences) as [[rawPredictionCol]] of type `Vector`. * * @param dataset input dataset * @return transformed dataset diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index c65d3d5b5442..71293017e052 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamMap @@ -36,14 +36,12 @@ import org.apache.spark.sql.Dataset /** - * :: Experimental :: - * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] learning algorithm + * Decision tree learning algorithm (http://en.wikipedia.org/wiki/Decision_tree_learning) * for classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. */ @Since("1.4.0") -@Experimental class DecisionTreeClassifier @Since("1.4.0") ( @Since("1.4.0") override val uid: String) extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] @@ -127,7 +125,6 @@ class DecisionTreeClassifier @Since("1.4.0") ( } @Since("1.4.0") -@Experimental object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifier] { /** Accessor for supported impurities: entropy, gini */ @Since("1.4.0") @@ -138,13 +135,11 @@ object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifi } /** - * :: Experimental :: - * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] model for classification. + * Decision tree model (http://en.wikipedia.org/wiki/Decision_tree_learning) for classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. */ @Since("1.4.0") -@Experimental class DecisionTreeClassificationModel private[ml] ( @Since("1.4.0")override val uid: String, @Since("1.4.0")override val rootNode: Node, diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 4e534baddc63..ba70293273f9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -21,7 +21,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.feature.LabeledPoint @@ -40,8 +40,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType /** - * :: Experimental :: - * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]] + * Gradient-Boosted Trees (GBTs) (http://en.wikipedia.org/wiki/Gradient_boosting) * learning algorithm for classification. * It supports binary labels, as well as both continuous and categorical features. * Note: Multiclass labels are not currently supported. @@ -57,7 +56,6 @@ import org.apache.spark.sql.types.DoubleType * [https://issues.apache.org/jira/browse/SPARK-4240] */ @Since("1.4.0") -@Experimental class GBTClassifier @Since("1.4.0") ( @Since("1.4.0") override val uid: String) extends Predictor[Vector, GBTClassifier, GBTClassificationModel] @@ -149,7 +147,6 @@ class GBTClassifier @Since("1.4.0") ( } @Since("1.4.0") -@Experimental object GBTClassifier extends DefaultParamsReadable[GBTClassifier] { /** Accessor for supported loss settings: logistic */ @@ -161,8 +158,7 @@ object GBTClassifier extends DefaultParamsReadable[GBTClassifier] { } /** - * :: Experimental :: - * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]] + * Gradient-Boosted Trees (GBTs) (http://en.wikipedia.org/wiki/Gradient_boosting) * model for classification. * It supports binary labels, as well as both continuous and categorical features. * Note: Multiclass labels are not currently supported. @@ -171,7 +167,6 @@ object GBTClassifier extends DefaultParamsReadable[GBTClassifier] { * @param _treeWeights Weights for the decision trees in the ensemble. */ @Since("1.6.0") -@Experimental class GBTClassificationModel private[ml]( @Since("1.6.0") override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], @@ -238,8 +233,8 @@ class GBTClassificationModel private[ml]( * The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. * (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) * and follows the implementation from scikit-learn. - * - * @see [[DecisionTreeClassificationModel.featureImportances]] + + * See `DecisionTreeClassificationModel.featureImportances` */ @Since("2.0.0") lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 9c9f5ced4e35..7694773c816b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -151,13 +151,11 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas } /** - * :: Experimental :: * Logistic regression. * Currently, this class only supports binary classification. It will support multiclass * in the future. */ @Since("1.2.0") -@Experimental class LogisticRegression @Since("1.2.0") ( @Since("1.4.0") override val uid: String) extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] @@ -475,11 +473,9 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { } /** - * :: Experimental :: * Model produced by [[LogisticRegression]]. */ @Since("1.4.0") -@Experimental class LogisticRegressionModel private[spark] ( @Since("1.4.0") override val uid: String, @Since("2.0.0") val coefficients: Vector, @@ -862,10 +858,10 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns the receiver operating characteristic (ROC) curve, * which is a Dataframe having two fields (FPR, TPR) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. + * See http://en.wikipedia.org/wiki/Receiver_operating_characteristic * - * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. - * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic */ @Since("1.5.0") @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") @@ -873,7 +869,7 @@ class BinaryLogisticRegressionSummary private[classification] ( /** * Computes the area under the receiver operating characteristic (ROC) curve. * - * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -883,7 +879,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns the precision-recall curve, which is a Dataframe containing * two fields recall, precision with (0.0, 1.0) prepended to it. * - * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -892,7 +888,7 @@ class BinaryLogisticRegressionSummary private[classification] ( /** * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. * - * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -905,7 +901,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the precision. * - * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -918,7 +914,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the recall. * - * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 76ef32aa3dc1..7264a99b47eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -100,7 +100,7 @@ private[classification] trait MultilayerPerceptronParams extends PredictorParams @Since("2.0.0") final def getInitialWeights: Vector = $(initialWeights) - setDefault(maxIter -> 100, tol -> 1e-4, blockSize -> 128, + setDefault(maxIter -> 100, tol -> 1e-6, blockSize -> 128, solver -> MultilayerPerceptronClassifier.LBFGS, stepSize -> 0.03) } @@ -190,7 +190,7 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( /** * Set the convergence tolerance of iterations. * Smaller value will lead to higher accuracy with the cost of more iterations. - * Default is 1E-4. + * Default is 1E-6. * * @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index c99ae30155e3..ab977c8802e3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.classification import org.apache.hadoop.fs.Path import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators} @@ -63,7 +63,6 @@ private[ml] trait NaiveBayesParams extends PredictorParams { } /** - * :: Experimental :: * Naive Bayes Classifiers. * It supports both Multinomial NB * ([[http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html]]) @@ -74,7 +73,6 @@ private[ml] trait NaiveBayesParams extends PredictorParams { * The input feature values must be nonnegative. */ @Since("1.5.0") -@Experimental class NaiveBayes @Since("1.5.0") ( @Since("1.5.0") override val uid: String) extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] @@ -121,14 +119,12 @@ object NaiveBayes extends DefaultParamsReadable[NaiveBayes] { } /** - * :: Experimental :: * Model produced by [[NaiveBayes]] * @param pi log of class priors, whose dimension is C (number of classes) * @param theta log of class conditional probabilities, whose dimension is C (number of classes) * by D (number of features) */ @Since("1.5.0") -@Experimental class NaiveBayesModel private[ml] ( @Since("1.5.0") override val uid: String, @Since("2.0.0") val pi: Vector, diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index 047a378b79aa..f4ab0a074c42 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -29,7 +29,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.Vector @@ -117,7 +117,6 @@ private[ml] object OneVsRestParams extends ClassifierTypeTrait { } /** - * :: Experimental :: * Model produced by [[OneVsRest]]. * This stores the models resulting from training k binary classifiers: one for each class. * Each example is scored against all k models, and the model with the highest score @@ -130,7 +129,6 @@ private[ml] object OneVsRestParams extends ClassifierTypeTrait { * (taking label 0). */ @Since("1.4.0") -@Experimental final class OneVsRestModel private[ml] ( @Since("1.4.0") override val uid: String, private[ml] val labelMetadata: Metadata, @@ -260,8 +258,6 @@ object OneVsRestModel extends MLReadable[OneVsRestModel] { } /** - * :: Experimental :: - * * Reduction of Multiclass Classification to Binary Classification. * Performs reduction using one against all strategy. * For a multiclass classification with k classes, train k models (one per class). @@ -269,7 +265,6 @@ object OneVsRestModel extends MLReadable[OneVsRestModel] { * is picked to label the example. */ @Since("1.4.0") -@Experimental final class OneVsRest @Since("1.4.0") ( @Since("1.4.0") override val uid: String) extends Estimator[OneVsRestModel] with OneVsRestParams with MLWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 59277d0f42b3..88642abf6322 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -45,7 +45,7 @@ private[classification] trait ProbabilisticClassifierParams * * Single-label binary or multiclass classifier which can output class conditional probabilities. * - * @tparam FeaturesType Type of input features. E.g., [[Vector]] + * @tparam FeaturesType Type of input features. E.g., `Vector` * @tparam E Concrete Estimator type * @tparam M Concrete Model type */ @@ -70,7 +70,7 @@ abstract class ProbabilisticClassifier[ * Model produced by a [[ProbabilisticClassifier]]. * Classes are indexed {0, 1, ..., numClasses - 1}. * - * @tparam FeaturesType Type of input features. E.g., [[Vector]] + * @tparam FeaturesType Type of input features. E.g., `Vector` * @tparam M Concrete Model type */ @DeveloperApi @@ -89,8 +89,8 @@ abstract class ProbabilisticClassificationModel[ * Transforms dataset by reading from [[featuresCol]], and appending new columns as specified by * parameters: * - predicted labels as [[predictionCol]] of type [[Double]] - * - raw predictions (confidences) as [[rawPredictionCol]] of type [[Vector]] - * - probability of each class as [[probabilityCol]] of type [[Vector]]. + * - raw predictions (confidences) as [[rawPredictionCol]] of type `Vector` + * - probability of each class as [[probabilityCol]] of type `Vector`. * * @param dataset input dataset * @return transformed dataset @@ -210,7 +210,7 @@ private[ml] object ProbabilisticClassificationModel { /** * Normalize a vector of raw predictions to be a multinomial probability vector, in place. * - * The input raw predictions should be >= 0. + * The input raw predictions should be nonnegative. * The output vector sums to 1, unless the input vector is all-0 (in which case the output is * all-0 too). * diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 9a26a5c5b143..4ab132e5f294 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.classification import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamMap @@ -36,14 +36,12 @@ import org.apache.spark.sql.functions._ /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] learning algorithm for * classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. */ @Since("1.4.0") -@Experimental class RandomForestClassifier @Since("1.4.0") ( @Since("1.4.0") override val uid: String) extends ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel] @@ -124,7 +122,6 @@ class RandomForestClassifier @Since("1.4.0") ( } @Since("1.4.0") -@Experimental object RandomForestClassifier extends DefaultParamsReadable[RandomForestClassifier] { /** Accessor for supported impurity settings: entropy, gini */ @Since("1.4.0") @@ -140,7 +137,6 @@ object RandomForestClassifier extends DefaultParamsReadable[RandomForestClassifi } /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. @@ -149,7 +145,6 @@ object RandomForestClassifier extends DefaultParamsReadable[RandomForestClassifi * Warning: These have null parents. */ @Since("1.4.0") -@Experimental class RandomForestClassificationModel private[ml] ( @Since("1.5.0") override val uid: String, private val _trees: Array[DecisionTreeClassificationModel], diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 778cd0fee71c..034f2c3fa2fd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -37,7 +37,7 @@ import org.apache.spark.mllib.linalg.MatrixImplicits._ import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} -import org.apache.spark.sql.functions.{col, monotonicallyIncreasingId, udf} +import org.apache.spark.sql.functions.{col, monotonically_increasing_id, udf} import org.apache.spark.sql.types.StructType @@ -888,7 +888,7 @@ object LDA extends DefaultParamsReadable[LDA] { dataset: Dataset[_], featuresCol: String): RDD[(Long, OldVector)] = { dataset - .withColumn("docId", monotonicallyIncreasingId()) + .withColumn("docId", monotonically_increasing_id()) .select("docId", featuresCol) .rdd .map { case Row(docId: Long, features: Vector) => diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala index 5f765c071b9c..e7b949ddce34 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala @@ -30,7 +30,8 @@ import org.apache.spark.sql.Dataset abstract class Evaluator extends Params { /** - * Evaluates model output and returns a scalar metric (larger is better). + * Evaluates model output and returns a scalar metric. + * The value of [[isLargerBetter]] specifies whether larger values are better. * * @param dataset a dataset that contains labels/observations and predictions. * @param paramMap parameter map that specifies the input columns and output metrics @@ -42,7 +43,9 @@ abstract class Evaluator extends Params { } /** - * Evaluates the output. + * Evaluates model output and returns a scalar metric. + * The value of [[isLargerBetter]] specifies whether larger values are better. + * * @param dataset a dataset that contains labels/observations and predictions. * @return metric */ @@ -50,7 +53,7 @@ abstract class Evaluator extends Params { def evaluate(dataset: Dataset[_]): Double /** - * Indicates whether the metric returned by [[evaluate()]] should be maximized (true, default) + * Indicates whether the metric returned by `evaluate` should be maximized (true, default) * or minimized (false). * A given evaluator may support multiple metrics which may be maximized or minimized. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index fa9634fdfa7e..2b0862c60fdf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.linalg._ @@ -31,10 +31,8 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** - * :: Experimental :: * Binarize a column of continuous features given a threshold. */ -@Experimental @Since("1.4.0") final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index caffc39e2be1..100d9e7f6cbc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import java.{util => ju} import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Model import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ @@ -31,10 +31,8 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** - * :: Experimental :: * `Bucketizer` maps a column of continuous features to a column of feature buckets. */ -@Experimental @Since("1.4.0") final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Model[Bucketizer] with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 712634dffbf1..1482eb3d1f7a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.attribute.{AttributeGroup, _} import org.apache.spark.ml.linalg.{Vector, VectorUDT} @@ -42,8 +42,8 @@ private[feature] trait ChiSqSelectorParams extends Params /** * Number of features that selector will select (ordered by statistic value descending). If the - * number of features is < numTopFeatures, then this will select all features. The default value - * of numTopFeatures is 50. + * number of features is less than numTopFeatures, then this will select all features. + * The default value of numTopFeatures is 50. * @group param */ final val numTopFeatures = new IntParam(this, "numTopFeatures", @@ -57,11 +57,9 @@ private[feature] trait ChiSqSelectorParams extends Params } /** - * :: Experimental :: * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. */ -@Experimental @Since("1.6.0") final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams with DefaultParamsWritable { @@ -116,10 +114,8 @@ object ChiSqSelector extends DefaultParamsReadable[ChiSqSelector] { } /** - * :: Experimental :: * Model fitted by [[ChiSqSelector]]. */ -@Experimental @Since("1.6.0") final class ChiSqSelectorModel private[ml] ( @Since("1.6.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 96e6f1c512e9..6299f74a6bf9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vectors, VectorUDT} @@ -116,10 +116,8 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit } /** - * :: Experimental :: * Extracts a vocabulary from document collections and generates a [[CountVectorizerModel]]. */ -@Experimental @Since("1.5.0") class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends Estimator[CountVectorizerModel] with CountVectorizerParams with DefaultParamsWritable { @@ -201,11 +199,9 @@ object CountVectorizer extends DefaultParamsReadable[CountVectorizer] { } /** - * :: Experimental :: * Converts a text document to a sparse vector of token counts. * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. */ -@Experimental @Since("1.5.0") class CountVectorizerModel( @Since("1.5.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala index 9605145e12c2..6ff36b35ca4c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam @@ -27,7 +27,6 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType /** - * :: Experimental :: * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero * padding is performed on the input vector. * It returns a real vector of the same length representing the DCT. The return vector is scaled @@ -35,7 +34,6 @@ import org.apache.spark.sql.types.DataType * * More information on [[https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia]]. */ -@Experimental @Since("1.5.0") class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala index d07833e5805d..f860b3a787b4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param @@ -27,12 +27,10 @@ import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType /** - * :: Experimental :: * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a * provided "weight" vector. In other words, it scales each column of the dataset by a scalar * multiplier. */ -@Experimental @Since("1.4.0") class ElementwiseProduct @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends UnaryTransformer[Vector, Vector, ElementwiseProduct] with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 6ca7336cd048..a8792a35ff4a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ @@ -29,7 +29,6 @@ import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} /** - * :: Experimental :: * Maps a sequence of terms to their term frequencies using the hashing trick. * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32) * to calculate the hash code value for the term object. @@ -37,7 +36,6 @@ import org.apache.spark.sql.types.{ArrayType, StructType} * it is advisable to use a power of two as the numFeatures parameter; * otherwise the features will not be mapped evenly to the columns. */ -@Experimental @Since("1.2.0") class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala index 5d6287f0e3f1..6386dd8a1080 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ @@ -61,10 +61,8 @@ private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol } /** - * :: Experimental :: * Compute the Inverse Document Frequency (IDF) given a collection of documents. */ -@Experimental @Since("1.4.0") final class IDF @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Estimator[IDFModel] with IDFBase with DefaultParamsWritable { @@ -111,10 +109,8 @@ object IDF extends DefaultParamsReadable[IDF] { } /** - * :: Experimental :: * Model fitted by [[IDF]]. */ -@Experimental @Since("1.4.0") class IDFModel private[ml] ( @Since("1.4.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala index dca28b5c5d34..7b11f86279b9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.attribute._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ @@ -32,7 +32,6 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** - * :: Experimental :: * Implements the feature interaction transform. This transformer takes in Double and Vector type * columns and outputs a flattened vector of their feature interactions. To handle interaction, * we first one-hot encode any nominal features. Then, a vector of the feature cross-products is @@ -42,7 +41,6 @@ import org.apache.spark.sql.types._ * `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal * with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`. */ -@Experimental @Since("1.6.0") class Interaction @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala index f7f1d4203959..6cefa7086c88 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala @@ -23,6 +23,8 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.Vector /** + * :: Experimental :: + * * Class that represents the features and labels of a data point. * * @param label Label for this data point. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index d5ad5abced46..9ed8d83324cf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamMap, Params} @@ -74,18 +74,16 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H } /** - * :: Experimental :: * Rescale each feature individually to a common range [min, max] linearly using column summary * statistics, which is also known as min-max normalization or Rescaling. The rescaled value for * feature E is calculated as, * - * Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min + * `Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min` * - * For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min) + * For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`. * Note that since zero values will probably be transformed to non-zero values, output of the * transformer will be DenseVector even for sparse input. */ -@Experimental @Since("1.5.0") class MinMaxScaler @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends Estimator[MinMaxScalerModel] with MinMaxScalerParams with DefaultParamsWritable { @@ -138,7 +136,6 @@ object MinMaxScaler extends DefaultParamsReadable[MinMaxScaler] { } /** - * :: Experimental :: * Model fitted by [[MinMaxScaler]]. * * @param originalMin min value for each original column during fitting @@ -146,7 +143,6 @@ object MinMaxScaler extends DefaultParamsReadable[MinMaxScaler] { * * TODO: The transformer does not yet set the metadata in the output column (SPARK-8529). */ -@Experimental @Since("1.5.0") class MinMaxScalerModel private[ml] ( @Since("1.5.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index 9c1f1ad443bb..4463aea0097e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -17,14 +17,13 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} /** - * :: Experimental :: * A feature transformer that converts the input array of strings into an array of n-grams. Null * values in the input array are ignored. * It returns an array of n-grams where each n-gram is represented by a space-separated string of @@ -34,7 +33,6 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} * When the input array length is less than n (number of elements per n-gram), no n-grams are * returned. */ -@Experimental @Since("1.5.0") class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends UnaryTransformer[Seq[String], Seq[String], NGram] with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala index f9cbad90c9f3..eb0690058013 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} @@ -27,10 +27,8 @@ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType /** - * :: Experimental :: * Normalize a vector to have unit norm using the given p-norm. */ -@Experimental @Since("1.4.0") class Normalizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends UnaryTransformer[Vector, Vector, Normalizer] with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 01828ede6bc6..8b04b5de6fd2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.Vectors @@ -29,7 +29,6 @@ import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} /** - * :: Experimental :: * A one-hot encoder that maps a column of category indices to a column of binary vectors, with * at most a single one-value per row that indicates the input category index. * For example with 5 categories, an input value of 2.0 would map to an output vector of @@ -42,7 +41,6 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} * * @see [[StringIndexer]] for converting categorical values into category indices */ -@Experimental @Since("1.4.0") class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index ef8b08545db2..6b913480fdc2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param._ @@ -59,12 +59,11 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC } } + /** - * :: Experimental :: * PCA trains a model to project vectors to a lower dimensional space of the top [[PCA!.k]] * principal components. */ -@Experimental @Since("1.5.0") class PCA @Since("1.5.0") ( @Since("1.5.0") override val uid: String) @@ -116,14 +115,12 @@ object PCA extends DefaultParamsReadable[PCA] { } /** - * :: Experimental :: * Model fitted by [[PCA]]. Transforms vectors to a lower dimensional space. * * @param pc A principal components Matrix. Each column is one principal component. * @param explainedVariance A vector of proportions of variance explained by * each principal component. */ -@Experimental @Since("1.5.0") class PCAModel private[ml] ( @Since("1.5.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 7b35fdeaf40c..72fb35bd79ad 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import scala.collection.mutable -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} @@ -27,14 +27,12 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType /** - * :: Experimental :: * Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion, * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an * expansion of a product of sums expresses it as a sum of products by using the fact that * multiplication distributes over addition". Take a 2-variable feature vector as an example: * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`. */ -@Experimental @Since("1.4.0") class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends UnaryTransformer[Vector, Vector, PolynomialExpansion] with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 96b8e7d9f7fa..9a636bd8a5e4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.attribute.NominalAttribute @@ -64,7 +64,6 @@ private[feature] trait QuantileDiscretizerBase extends Params } /** - * :: Experimental :: * `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned * categorical features. The number of bins can be set using the `numBuckets` parameter. * The bin ranges are chosen using an approximate algorithm (see the documentation for @@ -73,7 +72,6 @@ private[feature] trait QuantileDiscretizerBase extends Params * `relativeError` parameter. The lower and upper bin bounds will be `-Infinity` and `+Infinity`, * covering all real values. */ -@Experimental @Since("1.6.0") final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala index b8715746fee5..289037640fd4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ @@ -25,7 +25,6 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType /** - * :: Experimental :: * Implements the transformations which are defined by SQL statement. * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...' * where '__THIS__' represents the underlying table of the input dataset. @@ -37,7 +36,6 @@ import org.apache.spark.sql.types.StructType * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 * - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b */ -@Experimental @Since("1.6.0") class SQLTransformer @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Transformer with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index b4be95494fd1..2494cf51a2bd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ @@ -76,7 +76,6 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with } /** - * :: Experimental :: * Standardizes features by removing the mean and scaling to unit variance using column summary * statistics on the samples in the training set. * @@ -85,7 +84,6 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with * corrected sample standard deviation]], * which is computed as the square root of the unbiased sample variance. */ -@Experimental @Since("1.2.0") class StandardScaler @Since("1.4.0") ( @Since("1.4.0") override val uid: String) @@ -138,13 +136,11 @@ object StandardScaler extends DefaultParamsReadable[StandardScaler] { } /** - * :: Experimental :: * Model fitted by [[StandardScaler]]. * * @param std Standard deviation of the StandardScalerModel * @param mean Mean of the StandardScalerModel */ -@Experimental @Since("1.2.0") class StandardScalerModel private[ml] ( @Since("1.4.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 1a6f42f773cd..666070037cdd 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} @@ -27,12 +27,10 @@ import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} /** - * :: Experimental :: * A feature transformer that filters out stop words from input. * Note: null values from input array are preserved unless adding null to stopWords explicitly. * @see [[http://en.wikipedia.org/wiki/Stop_words]] */ -@Experimental @Since("1.5.0") class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 028e540fe535..fe79e2ec808a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model, Transformer} import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param._ @@ -55,7 +55,6 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha } /** - * :: Experimental :: * A label indexer that maps a string column of labels to an ML column of label indices. * If the input column is numeric, we cast it to string and index the string values. * The indices are in [0, numLabels), ordered by label frequencies. @@ -63,7 +62,6 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha * * @see [[IndexToString]] for the inverse transformation */ -@Experimental @Since("1.4.0") class StringIndexer @Since("1.4.0") ( @Since("1.4.0") override val uid: String) extends Estimator[StringIndexerModel] @@ -112,7 +110,6 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] { } /** - * :: Experimental :: * Model fitted by [[StringIndexer]]. * * NOTE: During transformation, if the input column does not exist, @@ -121,7 +118,6 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] { * * @param labels Ordered list of labels, corresponding to indices to be assigned. */ -@Experimental @Since("1.4.0") class StringIndexerModel ( @Since("1.4.0") override val uid: String, @@ -250,7 +246,6 @@ object StringIndexerModel extends MLReadable[StringIndexerModel] { } /** - * :: Experimental :: * A [[Transformer]] that maps a column of indices back to a new column of corresponding * string values. * The index-string mapping is either from the ML attributes of the input column, @@ -258,7 +253,6 @@ object StringIndexerModel extends MLReadable[StringIndexerModel] { * * @see [[StringIndexer]] for converting strings into indices */ -@Experimental @Since("1.5.0") class IndexToString private[ml] (@Since("1.5.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 010c948749f3..45d8fa94a8f8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -17,19 +17,17 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} /** - * :: Experimental :: * A tokenizer that converts the input string to lowercase and then splits it by white spaces. * * @see [[RegexTokenizer]] */ -@Experimental @Since("1.2.0") class Tokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] with DefaultParamsWritable { @@ -59,13 +57,11 @@ object Tokenizer extends DefaultParamsReadable[Tokenizer] { } /** - * :: Experimental :: * A regex based tokenizer that extracts tokens either by using the provided regex pattern to split * the text (default) or repeatedly matching the regex (if `gaps` is false). * Optional parameters also allow filtering tokens using a minimal length. * It returns an array of strings that can be empty. */ -@Experimental @Since("1.4.0") class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends UnaryTransformer[String, Seq[String], RegexTokenizer] with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 4939dabd987e..142a2ae44c69 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute, UnresolvedAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} @@ -32,10 +32,8 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** - * :: Experimental :: * A feature transformer that merges multiple columns into a vector column. */ -@Experimental @Since("1.4.0") class VectorAssembler @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 5656a9f979fc..d1a5c2e82581 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, VectorUDT} @@ -59,7 +59,6 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu } /** - * :: Experimental :: * Class for indexing categorical feature columns in a dataset of [[Vector]]. * * This has 2 usage modes: @@ -93,7 +92,6 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu * - Add warning if a categorical feature has only 1 category. * - Add option for allowing unknown categories. */ -@Experimental @Since("1.4.0") class VectorIndexer @Since("1.4.0") ( @Since("1.4.0") override val uid: String) @@ -247,7 +245,6 @@ object VectorIndexer extends DefaultParamsReadable[VectorIndexer] { } /** - * :: Experimental :: * Model fitted by [[VectorIndexer]]. Transform categorical features to use 0-based indices * instead of their original values. * - Categorical features are mapped to indices. @@ -263,7 +260,6 @@ object VectorIndexer extends DefaultParamsReadable[VectorIndexer] { * Values are maps from original features values to 0-based category indices. * If a feature is not in this map, it is treated as continuous. */ -@Experimental @Since("1.4.0") class VectorIndexerModel private[ml] ( @Since("1.4.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala index 6769e490c51c..966ccb85d0e0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.{Attribute, AttributeGroup} import org.apache.spark.ml.linalg._ @@ -29,7 +29,6 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType /** - * :: Experimental :: * This class takes a feature vector and outputs a new feature vector with a subarray of the * original features. * @@ -40,7 +39,6 @@ import org.apache.spark.sql.types.StructType * The output vector will order features with the selected indices first (in the order given), * followed by the selected names (in the order given). */ -@Experimental @Since("1.5.0") final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 0cac3fa2d7e5..c2b434c3d5cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -19,8 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} import org.apache.spark.ml.param._ @@ -115,11 +114,9 @@ private[feature] trait Word2VecBase extends Params } /** - * :: Experimental :: * Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further * natural language processing or machine learning process. */ -@Experimental @Since("1.4.0") final class Word2Vec @Since("1.4.0") ( @Since("1.4.0") override val uid: String) @@ -202,10 +199,8 @@ object Word2Vec extends DefaultParamsReadable[Word2Vec] { } /** - * :: Experimental :: * Model fitted by [[Word2Vec]]. */ -@Experimental @Since("1.4.0") class Word2VecModel private[ml] ( @Since("1.4.0") override val uid: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala similarity index 92% rename from mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala rename to mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala index 52a6fd25e2fa..a66ba27a7b9c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala @@ -17,15 +17,16 @@ package org.apache.spark.ml.linalg -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.sql.types.DataType /** * :: DeveloperApi :: * SQL data types for vectors and matrices. */ +@Since("2.0.0") @DeveloperApi -object sqlDataTypes { +object SQLDataTypes { /** Data type for [[Vector]]. */ val VectorType: DataType = new VectorUDT diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index ecec61a72f82..9245931b27ca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -28,9 +28,9 @@ import scala.collection.JavaConverters._ import org.json4s._ import org.json4s.jackson.JsonMethods._ -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.ml.linalg.JsonVectorConverter +import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.Identifiable /** @@ -510,11 +510,9 @@ class IntArrayParam(parent: Params, name: String, doc: String, isValid: Array[In } /** - * :: Experimental :: * A param and its value. */ @Since("1.2.0") -@Experimental case class ParamPair[T] @Since("1.2.0") ( @Since("1.2.0") param: Param[T], @Since("1.2.0") value: T) { @@ -554,7 +552,7 @@ trait Params extends Identifiable with Serializable { * * This only needs to check for interactions between parameters. * Parameter value checks which do not depend on other parameters are handled by - * [[Param.validate()]]. This method does not handle input/output column parameters; + * `Param.validate()`. This method does not handle input/output column parameters; * those are checked during schema validation. * @deprecated Will be removed in 2.1.0. All the checks should be merged into transformSchema */ @@ -582,8 +580,7 @@ trait Params extends Identifiable with Serializable { } /** - * Explains all params of this instance. - * @see [[explainParam()]] + * Explains all params of this instance. See `explainParam()`. */ def explainParams(): String = { params.map(explainParam).mkString("\n") @@ -680,7 +677,7 @@ trait Params extends Identifiable with Serializable { /** * Sets default values for a list of params. * - * Note: Java developers should use the single-parameter [[setDefault()]]. + * Note: Java developers should use the single-parameter `setDefault`. * Annotating this with varargs can cause compilation failures due to a Scala compiler bug. * See SPARK-9268. * @@ -714,8 +711,7 @@ trait Params extends Identifiable with Serializable { /** * Creates a copy of this instance with the same UID and some extra params. * Subclasses should implement this method and set the return type properly. - * - * @see [[defaultCopy()]] + * See `defaultCopy()`. */ def copy(extra: ParamMap): Params @@ -732,7 +728,8 @@ trait Params extends Identifiable with Serializable { /** * Extracts the embedded default param values and user-supplied values, and then merges them with * extra values from input into a flat param map, where the latter value is used if there exist - * conflicts, i.e., with ordering: default param values < user-supplied values < extra. + * conflicts, i.e., with ordering: + * default param values less than user-supplied values less than extra. */ final def extractParamMap(extra: ParamMap): ParamMap = { defaultParamMap ++ paramMap ++ extra @@ -797,11 +794,9 @@ trait Params extends Identifiable with Serializable { abstract class JavaParams extends Params /** - * :: Experimental :: * A param to value map. */ @Since("1.2.0") -@Experimental final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) extends Serializable { @@ -952,7 +947,6 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) } @Since("1.2.0") -@Experimental object ParamMap { /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 5dc2433e55c3..02e2384afe53 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -26,12 +26,12 @@ import scala.util.{Sorting, Try} import scala.util.hashing.byteswap64 import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.JsonDSL._ import org.apache.spark.{Dependency, Partitioner, ShuffleDependency, SparkContext} -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -99,7 +99,7 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w with HasPredictionCol with HasCheckpointInterval with HasSeed { /** - * Param for rank of the matrix factorization (>= 1). + * Param for rank of the matrix factorization (positive). * Default: 10 * @group param */ @@ -109,7 +109,7 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w def getRank: Int = $(rank) /** - * Param for number of user blocks (>= 1). + * Param for number of user blocks (positive). * Default: 10 * @group param */ @@ -120,7 +120,7 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w def getNumUserBlocks: Int = $(numUserBlocks) /** - * Param for number of item blocks (>= 1). + * Param for number of item blocks (positive). * Default: 10 * @group param */ @@ -141,7 +141,7 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w def getImplicitPrefs: Boolean = $(implicitPrefs) /** - * Param for the alpha parameter in the implicit preference formulation (>= 0). + * Param for the alpha parameter in the implicit preference formulation (nonnegative). * Default: 1.0 * @group param */ @@ -174,7 +174,7 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w /** * Param for StorageLevel for intermediate datasets. Pass in a string representation of - * [[StorageLevel]]. Cannot be "NONE". + * `StorageLevel`. Cannot be "NONE". * Default: "MEMORY_AND_DISK". * * @group expertParam @@ -188,7 +188,7 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w /** * Param for StorageLevel for ALS model factors. Pass in a string representation of - * [[StorageLevel]]. + * `StorageLevel`. * Default: "MEMORY_AND_DISK". * * @group expertParam @@ -222,14 +222,12 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w } /** - * :: Experimental :: * Model fitted by ALS. * * @param rank rank of the matrix factorization model * @param userFactors a DataFrame that stores user factors in two columns: `id` and `features` * @param itemFactors a DataFrame that stores item factors in two columns: `id` and `features` */ -@Experimental @Since("1.3.0") class ALSModel private[ml] ( @Since("1.4.0") override val uid: String, @@ -333,7 +331,6 @@ object ALSModel extends MLReadable[ALSModel] { } /** - * :: Experimental :: * Alternating Least Squares (ALS) matrix factorization. * * ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices, @@ -354,15 +351,14 @@ object ALSModel extends MLReadable[ALSModel] { * * For implicit preference data, the algorithm used is based on * "Collaborative Filtering for Implicit Feedback Datasets", available at - * [[http://dx.doi.org/10.1109/ICDM.2008.22]], adapted for the blocked approach used here. + * http://dx.doi.org/10.1109/ICDM.2008.22, adapted for the blocked approach used here. * * Essentially instead of finding the low-rank approximations to the rating matrix `R`, * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if - * r > 0 and 0 if r <= 0. The ratings then act as 'confidence' values related to strength of + * r > 0 and 0 if r <= 0. The ratings then act as 'confidence' values related to strength of * indicated user * preferences rather than explicit ratings given to items. */ -@Experimental @Since("1.3.0") class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] with ALSParams with DefaultParamsWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 7c51845a2581..2b9912657f51 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -244,7 +244,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val msg = s"${optimizer.getClass.getName} failed." throw new SparkException(msg) } - state.x.toArray.clone() } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 7ff6d0afd55c..ebc6c12ddcf9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector @@ -38,13 +38,11 @@ import org.apache.spark.sql.functions._ /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] learning algorithm * for regression. * It supports both continuous and categorical features. */ @Since("1.4.0") -@Experimental class DecisionTreeRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Predictor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel] with DecisionTreeRegressorParams with DefaultParamsWritable { @@ -125,7 +123,6 @@ class DecisionTreeRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S } @Since("1.4.0") -@Experimental object DecisionTreeRegressor extends DefaultParamsReadable[DecisionTreeRegressor] { /** Accessor for supported impurities: variance */ final val supportedImpurities: Array[String] = TreeRegressorParams.supportedImpurities @@ -135,13 +132,11 @@ object DecisionTreeRegressor extends DefaultParamsReadable[DecisionTreeRegressor } /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] model for regression. * It supports both continuous and categorical features. * @param rootNode Root of the decision tree */ @Since("1.4.0") -@Experimental class DecisionTreeRegressionModel private[ml] ( override val uid: String, override val rootNode: Node, diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 6223555504d7..ce355938ec1c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -38,7 +38,6 @@ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]] * learning algorithm for regression. * It supports both continuous and categorical features. @@ -56,7 +55,6 @@ import org.apache.spark.sql.functions._ * [https://issues.apache.org/jira/browse/SPARK-4240] */ @Since("1.4.0") -@Experimental class GBTRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Predictor[Vector, GBTRegressor, GBTRegressionModel] with GBTRegressorParams with DefaultParamsWritable with Logging { @@ -135,7 +133,6 @@ class GBTRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) } @Since("1.4.0") -@Experimental object GBTRegressor extends DefaultParamsReadable[GBTRegressor] { /** Accessor for supported loss settings: squared (L2), absolute (L1) */ @@ -147,8 +144,6 @@ object GBTRegressor extends DefaultParamsReadable[GBTRegressor] { } /** - * :: Experimental :: - * * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]] * model for regression. * It supports both continuous and categorical features. @@ -156,7 +151,6 @@ object GBTRegressor extends DefaultParamsReadable[GBTRegressor] { * @param _treeWeights Weights for the decision trees in the ensemble. */ @Since("1.4.0") -@Experimental class GBTRegressionModel private[ml]( override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a23e90d9e125..2bdc09e1db24 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -376,7 +376,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine def deviance(y: Double, mu: Double, weight: Double): Double /** - * Akaike's 'An Information Criterion'(AIC) value of the family for a given dataset. + * Akaike Information Criterion (AIC) value of the family for a given dataset. * * @param predictions an RDD of (y, mu, weight) of instances in evaluation dataset * @param deviance the deviance for the fitted model in evaluation dataset @@ -702,13 +702,13 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - lazy val familyObj = Family.fromName($(family)) - lazy val linkObj = if (isDefined(link)) { + private lazy val familyObj = Family.fromName($(family)) + private lazy val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { familyObj.defaultLink } - lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj) + private lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj) override protected def predict(features: Vector): Double = { val eta = predictLink(features) @@ -1021,7 +1021,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( rss / degreesOfFreedom } - /** Akaike's "An Information Criterion"(AIC) for the fitted model. */ + /** Akaike Information Criterion (AIC) for the fitted model. */ @Since("2.0.0") lazy val aic: Double = { val w = weightCol diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index 9b9429a328d0..35396446edc1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.regression import org.apache.hadoop.fs.Path -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} @@ -120,7 +120,6 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures } /** - * :: Experimental :: * Isotonic regression. * * Currently implemented using parallelized pool adjacent violators algorithm. @@ -129,7 +128,6 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures * Uses [[org.apache.spark.mllib.regression.IsotonicRegression]]. */ @Since("1.5.0") -@Experimental class IsotonicRegression @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends Estimator[IsotonicRegressionModel] with IsotonicRegressionBase with DefaultParamsWritable { @@ -192,7 +190,6 @@ object IsotonicRegression extends DefaultParamsReadable[IsotonicRegression] { } /** - * :: Experimental :: * Model fitted by IsotonicRegression. * Predicts using a piecewise linear function. * @@ -202,7 +199,6 @@ object IsotonicRegression extends DefaultParamsReadable[IsotonicRegression] { * model trained by [[org.apache.spark.mllib.regression.IsotonicRegression]]. */ @Since("1.5.0") -@Experimental class IsotonicRegressionModel private[ml] ( override val uid: String, private val oldModel: MLlibIsotonicRegressionModel) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 6b82ae14e1ce..a0ff7f07aa3d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -54,7 +54,6 @@ private[regression] trait LinearRegressionParams extends PredictorParams with HasFitIntercept with HasStandardization with HasWeightCol with HasSolver /** - * :: Experimental :: * Linear regression. * * The learning objective is to minimize the squared error, with regularization. @@ -68,7 +67,6 @@ private[regression] trait LinearRegressionParams extends PredictorParams * - L2 + L1 (elastic net) */ @Since("1.3.0") -@Experimental class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String) extends Regressor[Vector, LinearRegression, LinearRegressionModel] with LinearRegressionParams with DefaultParamsWritable with Logging { @@ -327,11 +325,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String throw new SparkException(msg) } - if (!state.actuallyConverged) { - logWarning("LinearRegression training fininshed but the result " + - s"is not converged because: ${state.convergedReason.get.reason}") - } - /* The coefficients are trained in the scaled space; we're converting them back to the original space. @@ -387,11 +380,9 @@ object LinearRegression extends DefaultParamsReadable[LinearRegression] { } /** - * :: Experimental :: * Model produced by [[LinearRegression]]. */ @Since("1.3.0") -@Experimental class LinearRegressionModel private[ml] ( @Since("1.4.0") override val uid: String, @Since("2.0.0") val coefficients: Vector, @@ -798,16 +789,16 @@ class LinearRegressionSummary private[regression] ( * * Now, the first derivative of the objective function in scaled space is * {{{ - * \frac{\partial L}{\partial\w_i} = diff/N (x_i - \bar{x_i}) / \hat{x_i} + * \frac{\partial L}{\partial w_i} = diff/N (x_i - \bar{x_i}) / \hat{x_i} * }}} * However, ($x_i - \bar{x_i}$) will densify the computation, so it's not * an ideal formula when the training dataset is sparse format. * - * This can be addressed by adding the dense \bar{x_i} / \har{x_i} terms + * This can be addressed by adding the dense \bar{x_i} / \hat{x_i} terms * in the end by keeping the sum of diff. The first derivative of total * objective function from all the samples is * {{{ - * \frac{\partial L}{\partial\w_i} = + * \frac{\partial L}{\partial w_i} = * 1/N \sum_j diff_j (x_{ij} - \bar{x_i}) / \hat{x_i} * = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) - diffSum \bar{x_i}) / \hat{x_i}) * = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + correction_i) @@ -826,7 +817,7 @@ class LinearRegressionSummary private[regression] ( * the training dataset, which can be easily computed in distributed fashion, and is * sparse format friendly. * {{{ - * \frac{\partial L}{\partial\w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + * \frac{\partial L}{\partial w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) * }}}, * * @param coefficients The coefficients corresponding to the features. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 4f4d3d27841d..0ad00aa6f928 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.regression import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector @@ -37,12 +37,10 @@ import org.apache.spark.sql.functions._ /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] learning algorithm for regression. * It supports both continuous and categorical features. */ @Since("1.4.0") -@Experimental class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Predictor[Vector, RandomForestRegressor, RandomForestRegressionModel] with RandomForestRegressorParams with DefaultParamsWritable { @@ -118,7 +116,6 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S } @Since("1.4.0") -@Experimental object RandomForestRegressor extends DefaultParamsReadable[RandomForestRegressor]{ /** Accessor for supported impurity settings: variance */ @Since("1.4.0") @@ -135,7 +132,6 @@ object RandomForestRegressor extends DefaultParamsReadable[RandomForestRegressor } /** - * :: Experimental :: * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for regression. * It supports both continuous and categorical features. * @@ -143,7 +139,6 @@ object RandomForestRegressor extends DefaultParamsReadable[RandomForestRegressor * @param numFeatures Number of features used by this model */ @Since("1.4.0") -@Experimental class RandomForestRegressionModel private[ml] ( override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala index d5e5c454605b..07e98a142b10 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala @@ -17,17 +17,14 @@ package org.apache.spark.ml.tree -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.tree.impurity.ImpurityCalculator import org.apache.spark.mllib.tree.model.{ImpurityStats, InformationGainStats => OldInformationGainStats, Node => OldNode, Predict => OldPredict} /** - * :: DeveloperApi :: * Decision tree node interface. */ -@DeveloperApi sealed abstract class Node extends Serializable { // TODO: Add aggregate stats (once available). This will happen after we move the DecisionTree @@ -109,12 +106,10 @@ private[ml] object Node { } /** - * :: DeveloperApi :: * Decision tree leaf node. * @param prediction Prediction this node makes * @param impurity Impurity measure at this node (for training data) */ -@DeveloperApi class LeafNode private[ml] ( override val prediction: Double, override val impurity: Double, @@ -147,17 +142,15 @@ class LeafNode private[ml] ( } /** - * :: DeveloperApi :: * Internal Decision Tree node. * @param prediction Prediction this node would make if it were a leaf node * @param impurity Impurity measure at this node (for training data) - * @param gain Information gain value. - * Values < 0 indicate missing values; this quirk will be removed with future updates. + * @param gain Information gain value. Values less than 0 indicate missing values; + * this quirk will be removed with future updates. * @param leftChild Left-hand child node * @param rightChild Right-hand child node * @param split Information about the test used to split to the left or right child. */ -@DeveloperApi class InternalNode private[ml] ( override val prediction: Double, override val impurity: Double, @@ -167,6 +160,9 @@ class InternalNode private[ml] ( val split: Split, override private[ml] val impurityStats: ImpurityCalculator) extends Node { + // Note to developers: The constructor argument impurityStats should be reconsidered before we + // make the constructor public. We may be able to improve the representation. + override def toString: String = { s"InternalNode(prediction = $prediction, impurity = $impurity, split = $split)" } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala index 9704e15cd838..dff44e2d49ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala @@ -19,18 +19,16 @@ package org.apache.spark.ml.tree import java.util.Objects -import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.tree.configuration.{FeatureType => OldFeatureType} import org.apache.spark.mllib.tree.model.{Split => OldSplit} /** - * :: DeveloperApi :: * Interface for a "Split," which specifies a test made at a decision tree node * to choose the left or right path. */ -@DeveloperApi sealed trait Split extends Serializable { /** Index of feature which this split tests */ @@ -67,14 +65,12 @@ private[tree] object Split { } /** - * :: DeveloperApi :: * Split which tests a categorical feature. * @param featureIndex Index of the feature to test * @param _leftCategories If the feature value is in this set of categories, then the split goes * left. Otherwise, it goes right. * @param numCategories Number of categories for this feature. */ -@DeveloperApi class CategoricalSplit private[ml] ( override val featureIndex: Int, _leftCategories: Array[Double], @@ -153,13 +149,11 @@ class CategoricalSplit private[ml] ( } /** - * :: DeveloperApi :: * Split which tests a continuous feature. * @param featureIndex Index of the feature to test - * @param threshold If the feature value is <= this threshold, then the split goes left. - * Otherwise, it goes right. + * @param threshold If the feature value is less than or equal to this threshold, then the + * split goes left. Otherwise, it goes right. */ -@DeveloperApi class ContinuousSplit private[ml] (override val featureIndex: Int, val threshold: Double) extends Split { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index 5b6fcc53c2dd..d3cbc363799a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -415,12 +415,12 @@ private[ml] object EnsembleModelReadWrite { /** * Helper method for loading a tree ensemble from disk. * This reconstructs all trees, returning the root nodes. - * @param path Path given to [[saveImpl()]] + * @param path Path given to `saveImpl` * @param className Class name for ensemble model type * @param treeClassName Class name for tree model type in the ensemble * @return (ensemble metadata, array over trees of (tree metadata, root node)), * where the root node is linked with all descendents - * @see [[saveImpl()]] for how the model was saved + * @see `saveImpl` for how the model was saved */ def loadImpl( path: String, diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 7d42da4a2ffa..520557849b9e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -25,7 +25,7 @@ import com.github.fommil.netlib.F2jBLAS import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.evaluation.Evaluator @@ -55,11 +55,9 @@ private[ml] trait CrossValidatorParams extends ValidatorParams { } /** - * :: Experimental :: * K-fold cross validation. */ @Since("1.2.0") -@Experimental class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) extends Estimator[CrossValidatorModel] with CrossValidatorParams with MLWritable with Logging { @@ -190,7 +188,6 @@ object CrossValidator extends MLReadable[CrossValidator] { } /** - * :: Experimental :: * Model from k-fold cross validation. * * @param bestModel The best model selected from k-fold cross validation. @@ -198,7 +195,6 @@ object CrossValidator extends MLReadable[CrossValidator] { * [[CrossValidator.estimatorParamMaps]], in the corresponding order. */ @Since("1.2.0") -@Experimental class CrossValidatorModel private[ml] ( @Since("1.4.0") override val uid: String, @Since("1.2.0") val bestModel: Model[_], diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala index 7d12f447f796..d369e7a61cdc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala @@ -20,15 +20,13 @@ package org.apache.spark.ml.tuning import scala.annotation.varargs import scala.collection.mutable -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.param._ /** - * :: Experimental :: * Builder for a param grid used in grid search-based model selection. */ @Since("1.2.0") -@Experimental class ParamGridBuilder @Since("1.2.0") { private val paramGrid = mutable.Map.empty[Param[_], Iterable[_]] diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index f6f2bad401a1..0fdba1cb8814 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -25,7 +25,7 @@ import scala.language.existentials import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.evaluation.Evaluator @@ -54,14 +54,12 @@ private[ml] trait TrainValidationSplitParams extends ValidatorParams { } /** - * :: Experimental :: * Validation for hyper-parameter tuning. * Randomly splits the input dataset into train and validation sets, * and uses evaluation metric on the validation set to select the best model. * Similar to [[CrossValidator]], but only splits the set once. */ @Since("1.5.0") -@Experimental class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: String) extends Estimator[TrainValidationSplitModel] with TrainValidationSplitParams with MLWritable with Logging { @@ -188,7 +186,6 @@ object TrainValidationSplit extends MLReadable[TrainValidationSplit] { } /** - * :: Experimental :: * Model from train validation split. * * @param uid Id. @@ -196,7 +193,6 @@ object TrainValidationSplit extends MLReadable[TrainValidationSplit] { * @param validationMetrics Evaluated validation metrics. */ @Since("1.5.0") -@Experimental class TrainValidationSplitModel private[ml] ( @Since("1.5.0") override val uid: String, @Since("1.5.0") val bestModel: Model[_], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 91edcf2a7925..f1664ce4ab3f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -22,7 +22,7 @@ import java.util.Random import scala.annotation.tailrec import scala.collection.mutable -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} @@ -31,8 +31,6 @@ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel /** - * :: Experimental :: - * * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" * by Steinbach, Karypis, and Kumar, with modification to fit Spark. * The algorithm starts from a single cluster that contains all points. @@ -54,7 +52,6 @@ import org.apache.spark.storage.StorageLevel * KDD Workshop on Text Mining, 2000.]] */ @Since("1.6.0") -@Experimental class BisectingKMeans private ( private var k: Int, private var maxIterations: Int, @@ -398,8 +395,6 @@ private object BisectingKMeans extends Serializable { } /** - * :: Experimental :: - * * Represents a node in a clustering tree. * * @param index node index, negative for internal nodes and non-negative for leaf nodes @@ -411,7 +406,6 @@ private object BisectingKMeans extends Serializable { * @param children children nodes */ @Since("1.6.0") -@Experimental private[clustering] class ClusteringTreeNode private[clustering] ( val index: Int, val size: Long, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 11fd940b8b20..8438015ccece 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -23,7 +23,7 @@ import org.json4s.jackson.JsonMethods._ import org.json4s.JsonDSL._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.Vector @@ -32,8 +32,6 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} /** - * :: Experimental :: - * * Clustering model produced by [[BisectingKMeans]]. * The prediction is done level-by-level from the root node to a leaf node, and at each node among * its children the closest to the input point is selected. @@ -41,7 +39,6 @@ import org.apache.spark.sql.{Row, SparkSession} * @param root the root node of the clustering tree */ @Since("1.6.0") -@Experimental class BisectingKMeansModel private[clustering] ( private[clustering] val root: ClusteringTreeNode ) extends Serializable with Saveable with Logging { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index d29582630041..90d8a558f10d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} @@ -426,13 +426,10 @@ class LocalLDAModel private[spark] ( } /** - * :: Experimental :: - * * Local (non-distributed) model fitted by [[LDA]]. * * This model stores the inferred topics only; it does not store info about the training dataset. */ -@Experimental @Since("1.5.0") object LocalLDAModel extends Loader[LocalLDAModel] { @@ -787,7 +784,13 @@ class DistributedLDAModel private[clustering] ( @Since("1.5.0") def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => - val topIndices = argtopk(topicCounts, k) + // TODO: Remove work-around for the breeze bug. + // https://github.com/scalanlp/breeze/issues/561 + val topIndices = if (k == topicCounts.length) { + Seq.range(0, k) + } else { + argtopk(topicCounts, k) + } val sumCounts = sum(topicCounts) val weights = if (sumCounts != 0) { topicCounts(topIndices) / sumCounts @@ -822,15 +825,12 @@ class DistributedLDAModel private[clustering] ( } /** - * :: Experimental :: - * * Distributed model fitted by [[LDA]]. * This type of model is currently only produced by Expectation-Maximization (EM). * * This model stores the inferred topics, the full training dataset, and the topic distribution * for each training document. */ -@Experimental @Since("1.5.0") object DistributedLDAModel extends Loader[DistributedLDAModel] { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 2436efba3248..e2c6aca553c1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -508,8 +508,9 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val weight = rho() val N = gammat.rows.toDouble val alpha = this.alpha.asBreeze.toDenseVector - val logphat: BDM[Double] = sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)) / N - val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat.toDenseVector) + val logphat: BDV[Double] = + sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)).t / N + val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat) val c = N * trigamma(sum(alpha)) val q = -N * trigamma(alpha) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index c8c2823bbaf0..56fb2d33c2ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -173,8 +173,8 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * Creates a ChiSquared feature selector. * @param numTopFeatures number of features that selector will select * (ordered by statistic value descending) - * Note that if the number of features is < numTopFeatures, then this will - * select all features. + * Note that if the number of features is less than numTopFeatures, + * then this will select all features. */ @Since("1.3.0") class ChiSqSelector @Since("1.3.0") ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index 15b72205ac17..aaecfa8d45dc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -70,7 +70,7 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) { } /** - * Java-friendly version of [[fit()]] + * Java-friendly version of `fit()`. */ @Since("1.4.0") def fit(sources: JavaRDD[Vector]): PCAModel = fit(sources.rdd) @@ -91,7 +91,7 @@ class PCAModel private[spark] ( * Transform a vector by computed Principal Components. * * @param vector vector to be transformed. - * Vector must be the same length as the source vectors given to [[PCA.fit()]]. + * Vector must be the same length as the source vectors given to `PCA.fit()`. * @return transformed vector. Vector will be of length k. */ @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index b7d6c6056803..3e86c6c59c95 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -27,9 +27,8 @@ import org.apache.spark.rdd.RDD * Standardizes features by removing the mean and scaling to unit std using column summary * statistics on the samples in the training set. * - * The "unit std" is computed using the - * [[https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation - * corrected sample standard deviation]], + * The "unit std" is computed using the corrected sample standard deviation + * (https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation), * which is computed as the square root of the unbiased sample variance. * * @param withMean False by default. Centers the data with mean before scaling. It will build a diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index f2211df3f943..bc75646d532d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -434,6 +434,9 @@ class Word2Vec extends Serializable with Logging { bcSyn1Global.unpersist(false) } newSentences.unpersist() + expTable.destroy() + bcVocab.destroy() + bcVocabHash.destroy() val wordArray = vocab.map(_.word) new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 9a63cc29dacb..3c26d2670841 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.fpm import scala.collection.JavaConverters._ import scala.reflect.ClassTag -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.internal.Logging @@ -28,14 +28,11 @@ import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset import org.apache.spark.rdd.RDD /** - * :: Experimental :: - * * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates * association rules which have a single item as the consequent. * */ @Since("1.5.0") -@Experimental class AssociationRules private[fpm] ( private var minConfidence: Double) extends Logging with Serializable { @@ -95,8 +92,6 @@ class AssociationRules private[fpm] ( object AssociationRules { /** - * :: Experimental :: - * * An association rule between sets of items. * @param antecedent hypotheses of the rule. Java users should call [[Rule#javaAntecedent]] * instead. @@ -106,7 +101,6 @@ object AssociationRules { * */ @Since("1.5.0") - @Experimental class Rule[Item] private[fpm] ( @Since("1.5.0") val antecedent: Array[Item], @Since("1.5.0") val consequent: Array[Item], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index c13c794775fe..7382000791cf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -30,7 +30,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods.{compact, render} import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.internal.Logging @@ -42,8 +42,6 @@ import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel /** - * :: Experimental :: - * * A parallel PrefixSpan algorithm to mine frequent sequential patterns. * The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns * Efficiently by Prefix-Projected Pattern Growth ([[http://doi.org/10.1109/ICDE.2001.914830]]). @@ -60,7 +58,6 @@ import org.apache.spark.storage.StorageLevel * @see [[https://en.wikipedia.org/wiki/Sequential_Pattern_Mining Sequential Pattern Mining * (Wikipedia)]] */ -@Experimental @Since("1.5.0") class PrefixSpan private ( private var minSupport: Double, @@ -230,7 +227,6 @@ class PrefixSpan private ( } -@Experimental @Since("1.5.0") object PrefixSpan extends Logging { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index 4591cb88ef15..8024b1c0031f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.linalg -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since /** * Represents singular value decomposition (SVD) factors. @@ -26,10 +26,8 @@ import org.apache.spark.annotation.{Experimental, Since} case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType) /** - * :: Experimental :: * Represents QR factors. */ @Since("1.5.0") -@Experimental case class QRDecomposition[QType, RType](Q: QType, R: RType) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 639295c69525..978235058706 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -426,16 +426,21 @@ class BlockMatrix @Since("1.3.0") ( partitioner: GridPartitioner): (BlockDestinations, BlockDestinations) = { val leftMatrix = blockInfo.keys.collect() // blockInfo should already be cached val rightMatrix = other.blocks.keys.collect() + + val rightCounterpartsHelper = rightMatrix.groupBy(_._1).mapValues(_.map(_._2)) val leftDestinations = leftMatrix.map { case (rowIndex, colIndex) => - val rightCounterparts = rightMatrix.filter(_._1 == colIndex) - val partitions = rightCounterparts.map(b => partitioner.getPartition((rowIndex, b._2))) + val rightCounterparts = rightCounterpartsHelper.getOrElse(colIndex, Array()) + val partitions = rightCounterparts.map(b => partitioner.getPartition((rowIndex, b))) ((rowIndex, colIndex), partitions.toSet) }.toMap + + val leftCounterpartsHelper = leftMatrix.groupBy(_._2).mapValues(_.map(_._1)) val rightDestinations = rightMatrix.map { case (rowIndex, colIndex) => - val leftCounterparts = leftMatrix.filter(_._2 == rowIndex) - val partitions = leftCounterparts.map(b => partitioner.getPartition((b._1, colIndex))) + val leftCounterparts = leftCounterpartsHelper.getOrElse(rowIndex, Array()) + val partitions = leftCounterparts.map(b => partitioner.getPartition((b, colIndex))) ((rowIndex, colIndex), partitions.toSet) }.toMap + (leftDestinations, rightDestinations) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 480a64548cb7..f37235500565 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.ArrayBuffer import breeze.linalg.{norm, DenseVector => BDV} -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD @@ -53,11 +53,9 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va } /** - * :: Experimental :: * Set fraction of data to be used for each SGD iteration. * Default 1.0 (corresponding to deterministic/classical gradient descent) */ - @Experimental def setMiniBatchFraction(fraction: Double): this.type = { require(fraction > 0 && fraction <= 1.0, s"Fraction for mini-batch SGD must be in range (0, 1] but got ${fraction}") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala index ec6ffe6e1943..e49363c2c64d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala @@ -212,6 +212,7 @@ object LBFGS extends Logging { state = states.next() } lossHistory += state.value + val weights = Vectors.fromBreeze(state.x) val lossHistoryArray = lossHistory.result() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 274ac7c99553..5d61796f1de6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -45,20 +45,16 @@ trait PMMLExportable { } /** - * :: Experimental :: * Export the model to a local file in PMML format */ - @Experimental @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } /** - * :: Experimental :: * Export the model to a directory on a distributed file system in PMML format */ - @Experimental @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() @@ -66,20 +62,16 @@ trait PMMLExportable { } /** - * :: Experimental :: * Export the model to the OutputStream in PMML format */ - @Experimental @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } /** - * :: Experimental :: * Export the model to a String in PMML format */ - @Experimental @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 1cd6f2a8969a..377326f8739b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -35,6 +35,7 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession +import org.apache.spark.RangePartitioner /** * Regression model for isotonic regression. @@ -408,9 +409,11 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali */ private def parallelPoolAdjacentViolators( input: RDD[(Double, Double, Double)]): Array[(Double, Double, Double)] = { - val parallelStepResult = input - .sortBy(x => (x._2, x._1)) - .glom() + val keyedInput = input.keyBy(_._2) + val parallelStepResult = keyedInput + .partitionBy(new RangePartitioner(keyedInput.getNumPartitions, keyedInput)) + .values + .mapPartitions(p => Iterator(p.toArray.sortBy(x => (x._2, x._1)))) .flatMap(poolAdjacentViolators) .collect() .sortBy(x => (x._2, x._1)) // Sort again because collect() doesn't promise ordering. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index d4de0fd7d5f4..964f419d120d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -47,9 +47,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S private var currM2: Array[Double] = _ private var currL1: Array[Double] = _ private var totalCnt: Long = 0 - private var weightSum: Double = 0.0 + private var totalWeightSum: Double = 0.0 private var weightSquareSum: Double = 0.0 - private var nnz: Array[Double] = _ + private var weightSum: Array[Double] = _ + private var nnz: Array[Long] = _ private var currMax: Array[Double] = _ private var currMin: Array[Double] = _ @@ -74,7 +75,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S currM2n = Array.ofDim[Double](n) currM2 = Array.ofDim[Double](n) currL1 = Array.ofDim[Double](n) - nnz = Array.ofDim[Double](n) + weightSum = Array.ofDim[Double](n) + nnz = Array.ofDim[Long](n) currMax = Array.fill[Double](n)(Double.MinValue) currMin = Array.fill[Double](n)(Double.MaxValue) } @@ -86,7 +88,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S val localCurrM2n = currM2n val localCurrM2 = currM2 val localCurrL1 = currL1 - val localNnz = nnz + val localWeightSum = weightSum + val localNumNonzeros = nnz val localCurrMax = currMax val localCurrMin = currMin instance.foreachActive { (index, value) => @@ -100,16 +103,17 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S val prevMean = localCurrMean(index) val diff = value - prevMean - localCurrMean(index) = prevMean + weight * diff / (localNnz(index) + weight) + localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff localCurrM2(index) += weight * value * value localCurrL1(index) += weight * math.abs(value) - localNnz(index) += weight + localWeightSum(index) += weight + localNumNonzeros(index) += 1 } } - weightSum += weight + totalWeightSum += weight weightSquareSum += weight * weight totalCnt += 1 this @@ -124,17 +128,18 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { - if (this.weightSum != 0.0 && other.weightSum != 0.0) { + if (this.totalWeightSum != 0.0 && other.totalWeightSum != 0.0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + s"Expecting $n but got ${other.n}.") totalCnt += other.totalCnt - weightSum += other.weightSum + totalWeightSum += other.totalWeightSum weightSquareSum += other.weightSquareSum var i = 0 while (i < n) { - val thisNnz = nnz(i) - val otherNnz = other.nnz(i) + val thisNnz = weightSum(i) + val otherNnz = other.weightSum(i) val totalNnz = thisNnz + otherNnz + val totalCnnz = nnz(i) + other.nnz(i) if (totalNnz != 0.0) { val deltaMean = other.currMean(i) - currMean(i) // merge mean together @@ -149,18 +154,20 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S currMax(i) = math.max(currMax(i), other.currMax(i)) currMin(i) = math.min(currMin(i), other.currMin(i)) } - nnz(i) = totalNnz + weightSum(i) = totalNnz + nnz(i) = totalCnnz i += 1 } - } else if (weightSum == 0.0 && other.weightSum != 0.0) { + } else if (totalWeightSum == 0.0 && other.totalWeightSum != 0.0) { this.n = other.n this.currMean = other.currMean.clone() this.currM2n = other.currM2n.clone() this.currM2 = other.currM2.clone() this.currL1 = other.currL1.clone() this.totalCnt = other.totalCnt - this.weightSum = other.weightSum + this.totalWeightSum = other.totalWeightSum this.weightSquareSum = other.weightSquareSum + this.weightSum = other.weightSum.clone() this.nnz = other.nnz.clone() this.currMax = other.currMax.clone() this.currMin = other.currMin.clone() @@ -174,12 +181,12 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def mean: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") val realMean = Array.ofDim[Double](n) var i = 0 while (i < n) { - realMean(i) = currMean(i) * (nnz(i) / weightSum) + realMean(i) = currMean(i) * (weightSum(i) / totalWeightSum) i += 1 } Vectors.dense(realMean) @@ -191,11 +198,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def variance: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") val realVariance = Array.ofDim[Double](n) - val denominator = weightSum - (weightSquareSum / weightSum) + val denominator = totalWeightSum - (weightSquareSum / totalWeightSum) // Sample variance is computed, if the denominator is less than 0, the variance is just 0. if (denominator > 0.0) { @@ -203,8 +210,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S var i = 0 val len = currM2n.length while (i < len) { - realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * - (weightSum - nnz(i)) / weightSum) / denominator + realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * weightSum(i) * + (totalWeightSum - weightSum(i)) / totalWeightSum) / denominator i += 1 } } @@ -224,9 +231,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def numNonzeros: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") - Vectors.dense(nnz) + Vectors.dense(weightSum) } /** @@ -235,11 +242,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def max: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") var i = 0 while (i < n) { - if ((nnz(i) < weightSum) && (currMax(i) < 0.0)) currMax(i) = 0.0 + if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 i += 1 } Vectors.dense(currMax) @@ -251,11 +258,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def min: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") var i = 0 while (i < n) { - if ((nnz(i) < weightSum) && (currMin(i) > 0.0)) currMin(i) = 0.0 + if ((nnz(i) < totalCnt) && (currMin(i) > 0.0)) currMin(i) = 0.0 i += 1 } Vectors.dense(currMin) @@ -267,7 +274,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.2.0") override def normL2: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") val realMagnitude = Array.ofDim[Double](n) @@ -286,7 +293,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.2.0") override def normL1: Vector = { - require(weightSum > 0, s"Nothing has been added to this summarizer.") + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") Vectors.dense(currL1) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala index 4c382d7c2b79..97c032de7a81 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat.test import scala.beans.BeanInfo -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.dstream.DStream @@ -42,7 +42,6 @@ case class BinarySample @Since("1.6.0") ( } /** - * :: Experimental :: * Performs online 2-sample significance testing for a stream of (Boolean, Double) pairs. The * Boolean identifies which sample each observation comes from, and the Double is the numeric value * of the observation. @@ -67,7 +66,6 @@ case class BinarySample @Since("1.6.0") ( * .registerStream(DStream) * }}} */ -@Experimental @Since("1.6.0") class StreamingTest @Since("1.6.0") () extends Logging with Serializable { private var peacePeriod: Int = 0 diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala index 8a29fd39a910..5cfc05a3dd2d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.stat.test -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since /** * Trait for hypothesis test results. @@ -94,10 +94,8 @@ class ChiSqTestResult private[stat] (override val pValue: Double, } /** - * :: Experimental :: * Object containing the test results for the Kolmogorov-Smirnov test. */ -@Experimental @Since("1.5.0") class KolmogorovSmirnovTestResult private[stat] ( @Since("1.5.0") override val pValue: Double, @@ -113,10 +111,8 @@ class KolmogorovSmirnovTestResult private[stat] ( } /** - * :: Experimental :: * Object containing the test results for streaming testing. */ -@Experimental @Since("1.6.0") private[stat] class StreamingTestResult @Since("1.6.0") ( @Since("1.6.0") override val pValue: Double, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index 853c7319ec44..2436ce40866e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -17,14 +17,12 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since /** - * :: Experimental :: * Enum to select the algorithm for the decision tree */ @Since("1.0.0") -@Experimental object Algo extends Enumeration { @Since("1.0.0") type Algo = Value diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala index 3a731f45d6a0..d4448da9eef5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala @@ -17,14 +17,12 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} /** - * :: Experimental :: * Class for calculating entropy during multiclass classification. */ @Since("1.0.0") -@Experimental object Entropy extends Impurity { private[tree] def log2(x: Double) = scala.math.log(x) / scala.math.log(2) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala index 7730c0a8c111..c5e34ffa4f2e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala @@ -17,16 +17,14 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} /** - * :: Experimental :: - * Class for calculating the - * [[http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity Gini impurity]] + * Class for calculating the Gini impurity + * (http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity) * during multiclass classification. */ @Since("1.0.0") -@Experimental object Gini extends Impurity { /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala index 65f0163ec605..a5bdc2c6d2c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala @@ -17,17 +17,15 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} /** - * :: Experimental :: * Trait for calculating information gain. * This trait is used for * (a) setting the impurity parameter in [[org.apache.spark.mllib.tree.configuration.Strategy]] * (b) calculating impurity values from sufficient statistics. */ @Since("1.0.0") -@Experimental trait Impurity extends Serializable { /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala index 2423516123b8..c9bf0db4de3c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala @@ -17,14 +17,12 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Since} /** - * :: Experimental :: * Class for calculating variance during regression */ @Since("1.0.0") -@Experimental object Variance extends Impurity { /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala index 4d71d534a077..c881c8ea50c0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala @@ -45,7 +45,7 @@ trait Saveable { * - human-readable (JSON) model metadata to path/metadata/ * - Parquet formatted data to path/data/ * - * The model may be loaded using [[Loader.load]]. + * The model may be loaded using `Loader.load`. * * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java index ac479c08418c..8c0338e2844f 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java @@ -107,7 +107,11 @@ public VectorPair call(Tuple2 pair) { .fit(df); List result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect(); for (Row r : result) { - Assert.assertEquals(r.get(1), r.get(0)); + Vector calculatedVector = (Vector) r.get(0); + Vector expectedVector = (Vector) r.get(1); + for (int i = 0; i < calculatedVector.size(); i++) { + Assert.assertEquals(calculatedVector.apply(i), expectedVector.apply(i), 1.0e-8); + } } } } diff --git a/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java b/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java index b09e13112f12..bd64a7186eac 100644 --- a/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java @@ -20,7 +20,7 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.ml.linalg.sqlDataTypes.*; +import static org.apache.spark.ml.linalg.SQLDataTypes.*; public class JavaSQLDataTypesSuite { @Test diff --git a/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala index 13bf3d3015f6..0bd0c32f19d0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite class SQLDataTypesSuite extends SparkFunSuite { test("sqlDataTypes") { - assert(sqlDataTypes.VectorType === new VectorUDT) - assert(sqlDataTypes.MatrixType === new MatrixUDT) + assert(SQLDataTypes.VectorType === new VectorUDT) + assert(SQLDataTypes.MatrixType === new MatrixUDT) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index eb050158d48f..211e2bc026c7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -118,8 +118,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { assert(weights.length == 2) val bdvTopicDist = topicDistribution.asBreeze val top2Indices = argtopk(bdvTopicDist, 2) - assert(top2Indices.toArray === indices) - assert(bdvTopicDist(top2Indices).toArray === weights) + assert(top2Indices.toSet === indices.toSet) + assert(bdvTopicDist(top2Indices).toArray.toSet === weights.toSet) } // Check: log probabilities diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala index a8d82932d390..2f90afdcee55 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala @@ -18,9 +18,10 @@ package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { @@ -42,7 +43,9 @@ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() - assert(pca_transform.toSet === mat_multiply.toSet) - assert(pca.explainedVariance === explainedVariance) + pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => + assert(calculated ~== expected relTol 1e-8) + } + assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala index ea4f2865757c..94da626d92eb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala @@ -176,6 +176,17 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w assert(model.predictions === Array(1, 2, 2)) } + test("SPARK-16426 isotonic regression with duplicate features that produce NaNs") { + val trainRDD = sc.parallelize(Seq[(Double, Double, Double)]((2, 1, 1), (1, 1, 1), (0, 2, 1), + (1, 2, 1), (0.5, 3, 1), (0, 3, 1)), + 2) + + val model = new IsotonicRegression().run(trainRDD) + + assert(model.boundaries === Array(1.0, 3.0)) + assert(model.predictions === Array(0.75, 0.75)) + } + test("isotonic regression prediction") { val model = runIsotonicRegression(Seq(1, 2, 7, 1, 2), true) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala index b6d41db69be0..165a3f314a20 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala @@ -245,4 +245,29 @@ class MultivariateOnlineSummarizerSuite extends SparkFunSuite { absTol 1E-8, "normL2 mismatch") assert(summarizer.normL1 ~== Vectors.dense(0.21, 0.4265, 0.61) absTol 1E-10, "normL1 mismatch") } + + test("test min/max with weighted samples (SPARK-16561)") { + val summarizer1 = new MultivariateOnlineSummarizer() + .add(Vectors.dense(10.0, -10.0), 1e10) + .add(Vectors.dense(0.0, 0.0), 1e-7) + + val summarizer2 = new MultivariateOnlineSummarizer() + summarizer2.add(Vectors.dense(10.0, -10.0), 1e10) + for (i <- 1 to 100) { + summarizer2.add(Vectors.dense(0.0, 0.0), 1e-7) + } + + val summarizer3 = new MultivariateOnlineSummarizer() + for (i <- 1 to 100) { + summarizer3.add(Vectors.dense(0.0, 0.0), 1e-7) + } + summarizer3.add(Vectors.dense(10.0, -10.0), 1e10) + + assert(summarizer1.max ~== Vectors.dense(10.0, 0.0) absTol 1e-14) + assert(summarizer1.min ~== Vectors.dense(0.0, -10.0) absTol 1e-14) + assert(summarizer2.max ~== Vectors.dense(10.0, 0.0) absTol 1e-14) + assert(summarizer2.min ~== Vectors.dense(0.0, -10.0) absTol 1e-14) + assert(summarizer3.max ~== Vectors.dense(10.0, 0.0) absTol 1e-14) + assert(summarizer3.min ~== Vectors.dense(0.0, -10.0) absTol 1e-14) + } } diff --git a/pom.xml b/pom.xml index 4c8671a57035..d064cb57dd6c 100644 --- a/pom.xml +++ b/pom.xml @@ -657,7 +657,7 @@ org.scalanlp breeze_${scala.binary.version} - 0.11.2 + 0.12 diff --git a/python/pyspark/ml/__init__.py b/python/pyspark/ml/__init__.py index 05f3be5f0d6e..1d42d49a8816 100644 --- a/python/pyspark/ml/__init__.py +++ b/python/pyspark/ml/__init__.py @@ -16,8 +16,8 @@ # """ -Spark ML is a component that adds a new set of machine learning APIs to let users quickly -assemble and configure practical machine learning pipelines. +DataFrame-based machine learning APIs to let users quickly assemble and configure practical +machine learning pipelines. """ from pyspark.ml.base import Estimator, Model, Transformer from pyspark.ml.pipeline import Pipeline, PipelineModel diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index c035942f7386..9a3c7b15964e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,8 +49,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Logistic regression. Currently, this class only supports binary classification. @@ -216,8 +214,6 @@ def _checkThresholdConsistency(self): class LogisticRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by LogisticRegression. .. versionadded:: 1.3.0 @@ -277,6 +273,8 @@ def evaluate(self, dataset): class LogisticRegressionSummary(JavaWrapper): """ + .. note:: Experimental + Abstraction for Logistic Regression Results for a given model. .. versionadded:: 2.0.0 @@ -321,6 +319,8 @@ def featuresCol(self): @inherit_doc class LogisticRegressionTrainingSummary(LogisticRegressionSummary): """ + .. note:: Experimental + Abstraction for multinomial Logistic Regression Training results. Currently, the training summary ignores the training weights except for the objective trace. @@ -501,8 +501,6 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - `Decision tree `_ learning algorithm for classification. It supports both binary and multiclass labels, as well as both continuous and categorical @@ -599,8 +597,6 @@ def _create_model(self, java_model): @inherit_doc class DecisionTreeClassificationModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by DecisionTreeClassifier. .. versionadded:: 1.4.0 @@ -634,8 +630,6 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred RandomForestParams, TreeClassifierParams, HasCheckpointInterval, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - `Random Forest `_ learning algorithm for classification. It supports both binary and multiclass labels, as well as both continuous and categorical @@ -730,8 +724,6 @@ def _create_model(self, java_model): class RandomForestClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by RandomForestClassifier. .. versionadded:: 1.4.0 @@ -764,8 +756,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. It supports binary labels, as well as both continuous and categorical features. @@ -885,8 +875,6 @@ def getLossType(self): class GBTClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by GBTClassifier. .. versionadded:: 1.4.0 @@ -918,8 +906,6 @@ def trees(self): class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, HasRawPredictionCol, HasThresholds, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Naive Bayes Classifiers. It supports both Multinomial and Bernoulli NB. `Multinomial NB `_ @@ -1043,8 +1029,6 @@ def getModelType(self): class NaiveBayesModel(JavaModel, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by NaiveBayes. .. versionadded:: 1.5.0 @@ -1140,11 +1124,11 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, + maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \ + maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ solver="l-bfgs", initialWeights=None) """ super(MultilayerPerceptronClassifier, self).__init__() @@ -1157,11 +1141,11 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @keyword_only @since("1.6.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, + maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \ + maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ solver="l-bfgs", initialWeights=None) Sets params for MultilayerPerceptronClassifier. """ @@ -1315,7 +1299,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable): >>> [x.coefficients for x in model.models] [DenseVector([3.3925, 1.8785]), DenseVector([-4.3016, -6.3163]), DenseVector([-4.5855, 6.1785])] >>> [x.intercept for x in model.models] - [-3.6474708290602034, 2.5507881951814495, -1.1016513228162115] + [-3.64747..., 2.55078..., -1.10165...] >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0))]).toDF() >>> model.transform(test0).head().prediction 1.0 diff --git a/python/pyspark/ml/common.py b/python/pyspark/ml/common.py index 7d449aaccb44..aec860fca705 100644 --- a/python/pyspark/ml/common.py +++ b/python/pyspark/ml/common.py @@ -51,6 +51,7 @@ def _new_smart_decode(obj): _picklable_classes = [ 'SparseVector', 'DenseVector', + 'SparseMatrix', 'DenseMatrix', ] diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index bbbb94f9a0a0..2881380152c8 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -60,8 +60,6 @@ @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Binarize a column of continuous features given a threshold. >>> df = spark.createDataFrame([(0.5,)], ["values"]) @@ -125,8 +123,6 @@ def getThreshold(self): @inherit_doc class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Maps a column of continuous features to a column of feature buckets. >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"]) @@ -200,8 +196,6 @@ def getSplits(self): @inherit_doc class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`. >>> df = spark.createDataFrame( @@ -348,8 +342,6 @@ def _create_model(self, java_model): class CountVectorizerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`CountVectorizer`. .. versionadded:: 1.6.0 @@ -367,8 +359,6 @@ def vocabulary(self): @inherit_doc class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero padding is performed on the input vector. It returns a real vector of the same length representing the DCT. @@ -439,8 +429,6 @@ def getInverse(self): class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset by a scalar multiplier. @@ -505,8 +493,6 @@ def getScalingVec(self): class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Maps a sequence of terms to their term frequencies using the hashing trick. Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32) to calculate the hash code value for the term object. @@ -576,8 +562,6 @@ def getBinary(self): @inherit_doc class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Compute the Inverse Document Frequency (IDF) given a collection of documents. >>> from pyspark.ml.linalg import DenseVector @@ -653,8 +637,6 @@ def _create_model(self, java_model): class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`IDF`. .. versionadded:: 1.4.0 @@ -752,8 +734,6 @@ def maxAbs(self): @inherit_doc class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Rescale each feature individually to a common range [min, max] linearly using column summary statistics, which is also known as min-max normalization or Rescaling. The rescaled value for feature E is calculated as, @@ -859,8 +839,6 @@ def _create_model(self, java_model): class MinMaxScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`MinMaxScaler`. .. versionadded:: 1.6.0 @@ -887,8 +865,6 @@ def originalMax(self): @ignore_unicode_prefix class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of @@ -965,8 +941,6 @@ def getN(self): @inherit_doc class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Normalize a vector to have unit norm using the given p-norm. >>> from pyspark.ml.linalg import Vectors @@ -1031,8 +1005,6 @@ def getP(self): @inherit_doc class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A one-hot encoder that maps a column of category indices to a column of binary vectors, with at most a single one-value per row that indicates the input category index. @@ -1114,8 +1086,6 @@ def getDropLast(self): class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion `_, "In mathematics, an expansion of a product of sums expresses it as a sum of products by using the fact that @@ -1287,8 +1257,6 @@ def _create_model(self, java_model): @ignore_unicode_prefix class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect) to split the text (default) or repeatedly matching the regex (if gaps is false). @@ -1418,8 +1386,6 @@ def getToLowercase(self): @inherit_doc class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Implements the transforms which are defined by SQL statement. Currently we only support SQL syntax like 'SELECT ... FROM __THIS__' where '__THIS__' represents the underlying table of the input dataset. @@ -1479,8 +1445,6 @@ def getStatement(self): @inherit_doc class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set. @@ -1576,8 +1540,6 @@ def _create_model(self, java_model): class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`StandardScaler`. .. versionadded:: 1.4.0 @@ -1604,8 +1566,6 @@ def mean(self): class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A label indexer that maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it to string and index the string values. The indices are in [0, numLabels), ordered by label frequencies. @@ -1668,8 +1628,6 @@ def _create_model(self, java_model): class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`StringIndexer`. .. versionadded:: 1.4.0 @@ -1687,8 +1645,6 @@ def labels(self): @inherit_doc class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A :py:class:`Transformer` that maps a column of indices back to a new column of corresponding string values. The index-string mapping is either from the ML attributes of the input column, @@ -1741,8 +1697,6 @@ def getLabels(self): class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A feature transformer that filters out stop words from input. Note: null values from input array are preserved unless adding null to stopWords explicitly. @@ -1833,8 +1787,6 @@ def loadDefaultStopWords(language): @ignore_unicode_prefix class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A tokenizer that converts the input string to lowercase and then splits it by white spaces. @@ -1888,8 +1840,6 @@ def setParams(self, inputCol=None, outputCol=None): @inherit_doc class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - A feature transformer that merges multiple columns into a vector column. >>> df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"]) @@ -1934,8 +1884,6 @@ def setParams(self, inputCols=None, outputCol=None): @inherit_doc class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Class for indexing categorical feature columns in a dataset of `Vector`. This has 2 usage modes: @@ -2050,8 +1998,6 @@ def _create_model(self, java_model): class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`VectorIndexer`. Transform categorical features to use 0-based indices instead of their original values. @@ -2089,8 +2035,6 @@ def categoryMaps(self): @inherit_doc class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - This class takes a feature vector and outputs a new feature vector with a subarray of the original features. @@ -2183,8 +2127,6 @@ def getNames(self): class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further natural language processing or machine learning process. @@ -2352,8 +2294,6 @@ def _create_model(self, java_model): class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`Word2Vec`. .. versionadded:: 1.4.0 @@ -2383,8 +2323,6 @@ def findSynonyms(self, word, num): @inherit_doc class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - PCA trains a model to project vectors to a lower dimensional space of the top :py:attr:`k` principal components. @@ -2458,8 +2396,6 @@ def _create_model(self, java_model): class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable): """ - .. note:: Experimental - Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space. .. versionadded:: 1.5.0 diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 8de9ad85311f..d88dc7535359 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -41,8 +41,6 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, HasStandardization, HasSolver, HasWeightCol, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Linear regression. The learning objective is to minimize the squared error, with regularization. @@ -130,8 +128,6 @@ def _create_model(self, java_model): class LinearRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by :class:`LinearRegression`. .. versionadded:: 1.4.0 @@ -411,8 +407,6 @@ def totalIterations(self): class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Currently implemented using parallelized pool adjacent violators algorithm. Only univariate (single feature) algorithm supported. @@ -439,6 +433,8 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti True >>> model.predictions == model2.predictions True + + .. versionadded:: 1.6.0 """ isotonic = \ @@ -505,13 +501,13 @@ def getFeatureIndex(self): class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by :class:`IsotonicRegression`. + + .. versionadded:: 1.6.0 """ @property - @since("2.0.0") + @since("1.6.0") def boundaries(self): """ Boundaries in increasing order for which predictions are known. @@ -519,7 +515,7 @@ def boundaries(self): return self._call_java("boundaries") @property - @since("2.0.0") + @since("1.6.0") def predictions(self): """ Predictions associated with the boundaries at the same index, monotone because of isotonic @@ -642,8 +638,6 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol): """ - .. note:: Experimental - `Decision tree `_ learning algorithm for regression. It supports both continuous and categorical features. @@ -727,8 +721,6 @@ def _create_model(self, java_model): @inherit_doc class DecisionTreeModel(JavaModel): """ - .. note:: Experimental - Abstraction for Decision Tree models. .. versionadded:: 1.5.0 @@ -759,11 +751,9 @@ def __repr__(self): @inherit_doc class TreeEnsembleModels(JavaModel): """ - .. note:: Experimental + (private abstraction) Represents a tree ensemble model. - - .. versionadded:: 1.5.0 """ @property @@ -803,8 +793,6 @@ def __repr__(self): @inherit_doc class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by :class:`DecisionTreeRegressor`. .. versionadded:: 1.4.0 @@ -837,8 +825,6 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi RandomForestParams, TreeRegressorParams, HasCheckpointInterval, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - `Random Forest `_ learning algorithm for regression. It supports both continuous and categorical features. @@ -925,8 +911,6 @@ def _create_model(self, java_model): class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by :class:`RandomForestRegressor`. .. versionadded:: 1.4.0 @@ -959,8 +943,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, JavaMLReadable, TreeRegressorParams): """ - .. note:: Experimental - `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. It supports both continuous and categorical features. @@ -1067,8 +1049,6 @@ def getLossType(self): class GBTRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): """ - .. note:: Experimental - Model fitted by :class:`GBTRegressor`. .. versionadded:: 1.4.0 diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 24efce812b3b..4bcb2c400c4a 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -16,7 +16,7 @@ # """ -Unit tests for Spark ML Python APIs. +Unit tests for MLlib Python DataFrame-based APIs. """ import sys if sys.version > '3': diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index f857c5e8c86b..298314d46caf 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -33,8 +33,6 @@ class ParamGridBuilder(object): r""" - .. note:: Experimental - Builder for a param grid used in grid search-based model selection. >>> from pyspark.ml.classification import LogisticRegression @@ -145,8 +143,6 @@ def getEvaluator(self): class CrossValidator(Estimator, ValidatorParams): """ - .. note:: Experimental - K-fold cross validation. >>> from pyspark.ml.classification import LogisticRegression @@ -264,8 +260,6 @@ def copy(self, extra=None): class CrossValidatorModel(Model, ValidatorParams): """ - .. note:: Experimental - Model from k-fold cross validation. .. versionadded:: 1.4.0 diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index acba3a717d21..ae26521ea96b 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -16,7 +16,10 @@ # """ -Python bindings for MLlib. +RDD-based machine learning APIs for Python (in maintenance mode). + +The `pyspark.mllib` package is in maintenance mode as of the Spark 2.0.0 release to encourage +migration to the DataFrame-based APIs under the `pyspark.ml` package. """ from __future__ import absolute_import diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 3734f87405e5..9f53ed098202 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -48,8 +48,6 @@ def __init__(self, weights, intercept): @since('1.4.0') def setThreshold(self, value): """ - .. note:: Experimental - Sets the threshold that separates positive predictions from negative predictions. An example with prediction score greater than or equal to this threshold is identified as a positive, @@ -62,8 +60,6 @@ def setThreshold(self, value): @since('1.4.0') def threshold(self): """ - .. note:: Experimental - Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. It is used for binary classification only. @@ -73,8 +69,6 @@ def threshold(self): @since('1.4.0') def clearThreshold(self): """ - .. note:: Experimental - Clears the threshold so that `predict` will output raw prediction scores. It is used for binary classification only. """ diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index c38c543972d1..c8c3c42774f2 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -47,8 +47,6 @@ @inherit_doc class BisectingKMeansModel(JavaModelWrapper): """ - .. note:: Experimental - A clustering model derived from the bisecting k-means method. >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) @@ -120,8 +118,6 @@ def computeCost(self, x): class BisectingKMeans(object): """ - .. note:: Experimental - A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark. @@ -366,8 +362,6 @@ def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||" class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ - .. note:: Experimental - A clustering model derived from the Gaussian Mixture Model method. >>> from pyspark.mllib.linalg import Vectors, DenseMatrix @@ -513,8 +507,6 @@ def load(cls, sc, path): class GaussianMixture(object): """ - .. note:: Experimental - Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm. .. versionadded:: 1.3.0 @@ -565,8 +557,6 @@ def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initia class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ - .. note:: Experimental - Model produced by [[PowerIterationClustering]]. >>> import math @@ -645,8 +635,6 @@ def load(cls, sc, path): class PowerIterationClustering(object): """ - .. note:: Experimental - Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very low-dimensional embedding of a @@ -693,8 +681,6 @@ class Assignment(namedtuple("Assignment", ["id", "cluster"])): class StreamingKMeansModel(KMeansModel): """ - .. note:: Experimental - Clustering model which can perform an online update of the centroids. The update formula for each centroid is given by @@ -794,8 +780,6 @@ def update(self, data, decayFactor, timeUnit): class StreamingKMeans(object): """ - .. note:: Experimental - Provides methods to set k, decayFactor, timeUnit to configure the KMeans algorithm for fitting and predicting on incoming dstreams. More details on how the centroids are updated are provided under the diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index aef91a8ddc1f..c8a6e33f4d9a 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -60,8 +60,6 @@ def transform(self, vector): class Normalizer(VectorTransformer): """ - .. note:: Experimental - Normalizes samples individually to unit L\ :sup:`p`\ norm For any 1 <= `p` < float('inf'), normalizes samples using @@ -131,8 +129,6 @@ def transform(self, vector): class StandardScalerModel(JavaVectorTransformer): """ - .. note:: Experimental - Represents a StandardScaler model that can transform vectors. .. versionadded:: 1.2.0 @@ -207,8 +203,6 @@ def mean(self): class StandardScaler(object): """ - .. note:: Experimental - Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set. @@ -262,8 +256,6 @@ def fit(self, dataset): class ChiSqSelectorModel(JavaVectorTransformer): """ - .. note:: Experimental - Represents a Chi Squared selector model. .. versionadded:: 1.4.0 @@ -282,8 +274,6 @@ def transform(self, vector): class ChiSqSelector(object): """ - .. note:: Experimental - Creates a ChiSquared feature selector. :param numTopFeatures: number of features that selector will select. @@ -361,8 +351,6 @@ def fit(self, data): class HashingTF(object): """ - .. note:: Experimental - Maps a sequence of terms to their term frequencies using the hashing trick. @@ -448,8 +436,6 @@ def idf(self): class IDF(object): """ - .. note:: Experimental - Inverse document frequency (IDF). The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`, @@ -697,8 +683,6 @@ def fit(self, data): class ElementwiseProduct(VectorTransformer): """ - .. note:: Experimental - Scales each column of the vector, with the supplied weight vector. i.e the elementwise product. diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index fb226e84e5d5..f58ea5dfb087 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -31,8 +31,6 @@ @ignore_unicode_prefix class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ - .. note:: Experimental - A FP-Growth model for mining frequent itemsets using the Parallel FP-Growth algorithm. @@ -70,8 +68,6 @@ def load(cls, sc, path): class FPGrowth(object): """ - .. note:: Experimental - A Parallel FP-growth algorithm to mine frequent itemsets. .. versionadded:: 1.4.0 @@ -108,8 +104,6 @@ class FreqItemset(namedtuple("FreqItemset", ["items", "freq"])): @ignore_unicode_prefix class PrefixSpanModel(JavaModelWrapper): """ - .. note:: Experimental - Model fitted by PrefixSpan >>> data = [ @@ -133,8 +127,6 @@ def freqSequences(self): class PrefixSpan(object): """ - .. note:: Experimental - A parallel PrefixSpan algorithm to mine frequent sequential patterns. The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns Efficiently by Prefix-Projected Pattern Growth diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 15dc53a959d6..9672dbde823f 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -1338,8 +1338,6 @@ def fromML(mat): class QRDecomposition(object): """ - .. note:: Experimental - Represents QR factors. """ def __init__(self, Q, R): diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index ea4f27cf4ffe..538cada7d163 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -40,8 +40,6 @@ class DistributedMatrix(object): """ - .. note:: Experimental - Represents a distributively stored matrix backed by one or more RDDs. @@ -57,8 +55,6 @@ def numCols(self): class RowMatrix(DistributedMatrix): """ - .. note:: Experimental - Represents a row-oriented distributed Matrix with no meaningful row indices. @@ -306,8 +302,6 @@ def tallSkinnyQR(self, computeQ=False): class IndexedRow(object): """ - .. note:: Experimental - Represents a row of an IndexedRowMatrix. Just a wrapper over a (long, vector) tuple. @@ -334,8 +328,6 @@ def _convert_to_indexed_row(row): class IndexedRowMatrix(DistributedMatrix): """ - .. note:: Experimental - Represents a row-oriented distributed Matrix with indexed rows. :param rows: An RDD of IndexedRows or (long, vector) tuples. @@ -536,8 +528,6 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): class MatrixEntry(object): """ - .. note:: Experimental - Represents an entry of a CoordinateMatrix. Just a wrapper over a (long, long, float) tuple. @@ -566,8 +556,6 @@ def _convert_to_matrix_entry(entry): class CoordinateMatrix(DistributedMatrix): """ - .. note:: Experimental - Represents a matrix in coordinate format. :param entries: An RDD of MatrixEntry inputs or @@ -795,8 +783,6 @@ def _convert_to_matrix_block_tuple(block): class BlockMatrix(DistributedMatrix): """ - .. note:: Experimental - Represents a distributed matrix in blocks of local matrices. :param blocks: An RDD of sub-matrix blocks diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py index 7da921976d4d..3b1c5519bd87 100644 --- a/python/pyspark/mllib/stat/KernelDensity.py +++ b/python/pyspark/mllib/stat/KernelDensity.py @@ -28,8 +28,6 @@ class KernelDensity(object): """ - .. note:: Experimental - Estimate probability density at required points given a RDD of samples from the population. diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index b0a85240b289..67d5f0e44f41 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -160,8 +160,6 @@ def corr(x, y=None, method=None): @ignore_unicode_prefix def chiSqTest(observed, expected=None): """ - .. note:: Experimental - If `observed` is Vector, conduct Pearson's chi-squared goodness of fit test of the observed data against the expected distribution, or againt the uniform distribution (by default), with each category @@ -246,8 +244,6 @@ def chiSqTest(observed, expected=None): @ignore_unicode_prefix def kolmogorovSmirnovTest(data, distName="norm", *params): """ - .. note:: Experimental - Performs the Kolmogorov-Smirnov (KS) test for data sampled from a continuous distribution. It tests the null hypothesis that the data is generated from a particular distribution. diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 8be76fcefe54..b3011d42e56a 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -76,8 +76,6 @@ def toDebugString(self): class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ - .. note:: Experimental - A decision tree model for classification or regression. .. versionadded:: 1.1.0 @@ -130,8 +128,6 @@ def _java_loader_class(cls): class DecisionTree(object): """ - .. note:: Experimental - Learning algorithm for a decision tree model for classification or regression. @@ -283,8 +279,6 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, @inherit_doc class RandomForestModel(TreeEnsembleModel, JavaLoader): """ - .. note:: Experimental - Represents a random forest model. .. versionadded:: 1.2.0 @@ -297,8 +291,6 @@ def _java_loader_class(cls): class RandomForest(object): """ - .. note:: Experimental - Learning algorithm for a random forest model for classification or regression. @@ -486,8 +478,6 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt @inherit_doc class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader): """ - .. note:: Experimental - Represents a gradient-boosted tree model. .. versionadded:: 1.3.0 @@ -500,8 +490,6 @@ def _java_loader_class(cls): class GradientBoostedTrees(object): """ - .. note:: Experimental - Learning algorithm for a gradient boosted trees model for classification or regression. diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 6afe76966222..0508235c1c9e 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1027,20 +1027,20 @@ def histogram(self, buckets): If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), this can be switched from an O(log n) inseration to O(1) per - element(where n = # buckets). + element (where n is the number of buckets). - Buckets must be sorted and not contain any duplicates, must be + Buckets must be sorted, not contain any duplicates, and have at least two elements. - If `buckets` is a number, it will generates buckets which are + If `buckets` is a number, it will generate buckets which are evenly spaced between the minimum and maximum of the RDD. For - example, if the min value is 0 and the max is 100, given buckets - as 2, the resulting buckets will be [0,50) [50,100]. buckets must - be at least 1 If the RDD contains infinity, NaN throws an exception - If the elements in RDD do not vary (max == min) always returns - a single bucket. + example, if the min value is 0 and the max is 100, given `buckets` + as 2, the resulting buckets will be [0,50) [50,100]. `buckets` must + be at least 1. An exception is raised if the RDD contains infinity. + If the elements in the RDD do not vary (max == min), a single bucket + will be used. - It will return a tuple of buckets and histogram. + The return value is a tuple of buckets and histogram. >>> rdd = sc.parallelize(range(51)) >>> rdd.histogram(2) diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index ac5ce87a3f0f..c1917d2be69d 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -49,6 +49,7 @@ spark = SparkSession.builder.getOrCreate() sc = spark.sparkContext +sql = spark.sql atexit.register(lambda: sc.stop()) # for compatibility diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 4cfdf799f6f4..0debcf1de405 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -471,12 +471,11 @@ class HiveContext(SQLContext): .. note:: Deprecated in 2.0.0. Use SparkSession.builder.enableHiveSupport().getOrCreate(). """ - warnings.warn( - "HiveContext is deprecated in Spark 2.0.0. Please use " + - "SparkSession.builder.enableHiveSupport().getOrCreate() instead.", - DeprecationWarning) - def __init__(self, sparkContext, jhiveContext=None): + warnings.warn( + "HiveContext is deprecated in Spark 2.0.0. Please use " + + "SparkSession.builder.enableHiveSupport().getOrCreate() instead.", + DeprecationWarning) if jhiveContext is None: sparkSession = SparkSession.builder.enableHiveSupport().getOrCreate() else: diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index ab41e88620b2..0cbb3adfa88b 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -613,16 +613,16 @@ def alias(self, alias): def join(self, other, on=None, how=None): """Joins with another :class:`DataFrame`, using the given join expression. - The following performs a full outer join between ``df1`` and ``df2``. - :param other: Right side of the join - :param on: a string for join column name, a list of column names, - , a join expression (Column) or a list of Columns. - If `on` is a string or a list of string indicating the name of the join column(s), + :param on: a string for the join column name, a list of column names, + a join expression (Column), or a list of Columns. + If `on` is a string or a list of strings indicating the name of the join column(s), the column(s) must exist on both sides, and this performs an equi-join. :param how: str, default 'inner'. One of `inner`, `outer`, `left_outer`, `right_outer`, `leftsemi`. + The following performs a full outer join between ``df1`` and ``df2``. + >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect() [Row(name=None, height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] @@ -1388,6 +1388,7 @@ def withColumn(self, colName, col): @since(1.3) def withColumnRenamed(self, existing, new): """Returns a new :class:`DataFrame` by renaming an existing column. + This is a no-op if schema doesn't contain the given column name. :param existing: string, name of the existing column to rename. :param col: string, new name of the column. @@ -1399,11 +1400,12 @@ def withColumnRenamed(self, existing, new): @since(1.4) @ignore_unicode_prefix - def drop(self, col): + def drop(self, *cols): """Returns a new :class:`DataFrame` that drops the specified column. + This is a no-op if schema doesn't contain the given column name(s). - :param col: a string name of the column to drop, or a - :class:`Column` to drop. + :param cols: a string name of the column to drop, or a + :class:`Column` to drop, or a list of string name of the columns to drop. >>> df.drop('age').collect() [Row(name=u'Alice'), Row(name=u'Bob')] @@ -1416,13 +1418,24 @@ def drop(self, col): >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect() [Row(age=5, name=u'Bob', height=85)] + + >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect() + [Row(name=u'Bob')] """ - if isinstance(col, basestring): - jdf = self._jdf.drop(col) - elif isinstance(col, Column): - jdf = self._jdf.drop(col._jc) + if len(cols) == 1: + col = cols[0] + if isinstance(col, basestring): + jdf = self._jdf.drop(col) + elif isinstance(col, Column): + jdf = self._jdf.drop(col._jc) + else: + raise TypeError("col should be a string or a Column") else: - raise TypeError("col should be a string or a Column") + for col in cols: + if not isinstance(col, basestring): + raise TypeError("each col in the param list should be a string") + jdf = self._jdf.drop(self._jseq(cols)) + return DataFrame(jdf, self.sql_ctx) @ignore_unicode_prefix diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index a360fbefa492..594f9375f767 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -232,6 +232,12 @@ def sparkContext(self): """Returns the underlying :class:`SparkContext`.""" return self._sc + @property + @since(2.0) + def version(self): + """The version of Spark on which this application is running.""" + return self._jsparkSession.version() + @property @since(2.0) def conf(self): diff --git a/repl/pom.xml b/repl/pom.xml index 5a288b064dee..4e623a6ac1f1 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-repl_2.11 jar Spark Project REPL diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala index e87100417370..16f330a320a4 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -1066,7 +1066,7 @@ class SparkILoop( logWarning("ADD_JARS environment variable is deprecated, use --jar spark submit argument instead") } val jars = { - val userJars = Utils.getUserJars(conf) + val userJars = Utils.getUserJars(conf, isShell = true) if (userJars.isEmpty) { envJars.getOrElse("") } else { diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index 28fe84d6fe9b..5dfe18ad4982 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -54,7 +54,7 @@ object Main extends Logging { // Visible for testing private[repl] def doMain(args: Array[String], _interp: SparkILoop): Unit = { interp = _interp - val jars = Utils.getUserJars(conf).mkString(File.pathSeparator) + val jars = Utils.getUserJars(conf, isShell = true).mkString(File.pathSeparator) val interpArguments = List( "-Yrepl-class-based", "-Yrepl-outdir", s"${outputDir.getAbsolutePath}", diff --git a/sql/README.md b/sql/README.md index b0903980a59f..58e9097ed4db 100644 --- a/sql/README.md +++ b/sql/README.md @@ -1,83 +1,10 @@ Spark SQL ========= -This module provides support for executing relational queries expressed in either SQL or a LINQ-like Scala DSL. +This module provides support for executing relational queries expressed in either SQL or the DataFrame/Dataset API. Spark SQL is broken up into four subprojects: - Catalyst (sql/catalyst) - An implementation-agnostic framework for manipulating trees of relational operators and expressions. - Execution (sql/core) - A query planner / execution engine for translating Catalyst's logical query plans into Spark RDDs. This component also includes a new public interface, SQLContext, that allows users to execute SQL or LINQ statements against existing RDDs and Parquet files. - Hive Support (sql/hive) - Includes an extension of SQLContext called HiveContext that allows users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allows users to run queries that include Hive UDFs, UDAFs, and UDTFs. - HiveServer and CLI support (sql/hive-thriftserver) - Includes support for the SQL CLI (bin/spark-sql) and a HiveServer2 (for JDBC/ODBC) compatible server. - - -Other dependencies for developers ---------------------------------- -In order to create new hive test cases (i.e. a test suite based on `HiveComparisonTest`), -you will need to setup your development environment based on the following instructions. - -If you are working with Hive 0.12.0, you will need to set several environmental variables as follows. - -``` -export HIVE_HOME="/hive/build/dist" -export HIVE_DEV_HOME="/hive/" -export HADOOP_HOME="/hadoop" -``` - -If you are working with Hive 0.13.1, the following steps are needed: - -1. Download Hive's [0.13.1](https://archive.apache.org/dist/hive/hive-0.13.1) and set `HIVE_HOME` with `export HIVE_HOME=""`. Please do not set `HIVE_DEV_HOME` (See [SPARK-4119](https://issues.apache.org/jira/browse/SPARK-4119)). -2. Set `HADOOP_HOME` with `export HADOOP_HOME=""` -3. Download all Hive 0.13.1a jars (Hive jars actually used by Spark) from [here](http://mvnrepository.com/artifact/org.spark-project.hive) and replace corresponding original 0.13.1 jars in `$HIVE_HOME/lib`. -4. Download [Kryo 2.21 jar](http://mvnrepository.com/artifact/com.esotericsoftware.kryo/kryo/2.21) (Note: 2.22 jar does not work) and [Javolution 5.5.1 jar](http://mvnrepository.com/artifact/javolution/javolution/5.5.1) to `$HIVE_HOME/lib`. -5. This step is optional. But, when generating golden answer files, if a Hive query fails and you find that Hive tries to talk to HDFS or you find weird runtime NPEs, set the following in your test suite... - -``` -val testTempDir = Utils.createTempDir() -// We have to use kryo to let Hive correctly serialize some plans. -sql("set hive.plan.serialization.format=kryo") -// Explicitly set fs to local fs. -sql(s"set fs.default.name=file://$testTempDir/") -// Ask Hive to run jobs in-process as a single map and reduce task. -sql("set mapred.job.tracker=local") -``` - -Using the console -================= -An interactive scala console can be invoked by running `build/sbt hive/console`. -From here you can execute queries with HiveQl and manipulate DataFrame by using DSL. - -```scala -$ build/sbt hive/console - -[info] Starting scala interpreter... -import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.dsl._ -import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive._ -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.types._ -Type in expressions to have them evaluated. -Type :help for more information. - -scala> val query = sql("SELECT * FROM (SELECT * FROM src) a") -query: org.apache.spark.sql.DataFrame = [key: int, value: string] -``` - -Query results are `DataFrames` and can be operated as such. -``` -scala> query.collect() -res0: Array[org.apache.spark.sql.Row] = Array([238,val_238], [86,val_86], [311,val_311], [27,val_27]... -``` - -You can also build further queries on top of these `DataFrames` using the query DSL. -``` -scala> query.where(query("key") > 30).select(avg(query("key"))).collect() -res1: Array[org.apache.spark.sql.Row] = Array([274.79025423728814]) -``` diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 34ea0a77f147..82b49ebb21a4 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-catalyst_2.11 jar Spark Project Catalyst diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 7ccbb2dac90c..5e1046293a20 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -170,7 +170,7 @@ unsupportedHiveNativeCommands | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CONCATENATE | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=SET kw4=FILEFORMAT | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=ADD kw4=COLUMNS - | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CHANGE kw4=COLUMNS? + | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CHANGE kw4=COLUMN? | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=REPLACE kw4=COLUMNS | kw1=START kw2=TRANSACTION | kw1=COMMIT diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/FixedLengthRowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/FixedLengthRowBasedKeyValueBatch.java new file mode 100644 index 000000000000..b6130d1f332b --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/FixedLengthRowBasedKeyValueBatch.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.expressions; + +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.sql.types.*; +import org.apache.spark.unsafe.Platform; + +/** + * An implementation of `RowBasedKeyValueBatch` in which all key-value records have same length. + * + * The format for each record looks like this: + * [UnsafeRow for key of length klen] [UnsafeRow for Value of length vlen] + * [8 bytes pointer to next] + * Thus, record length = klen + vlen + 8 + */ +public final class FixedLengthRowBasedKeyValueBatch extends RowBasedKeyValueBatch { + private final int klen; + private final int vlen; + private final int recordLength; + + private final long getKeyOffsetForFixedLengthRecords(int rowId) { + return recordStartOffset + rowId * (long) recordLength; + } + + /** + * Append a key value pair. + * It copies data into the backing MemoryBlock. + * Returns an UnsafeRow pointing to the value if succeeds, otherwise returns null. + */ + @Override + public final UnsafeRow appendRow(Object kbase, long koff, int klen, + Object vbase, long voff, int vlen) { + // if run out of max supported rows or page size, return null + if (numRows >= capacity || page == null || page.size() - pageCursor < recordLength) { + return null; + } + + long offset = page.getBaseOffset() + pageCursor; + final long recordOffset = offset; + Platform.copyMemory(kbase, koff, base, offset, klen); + offset += klen; + Platform.copyMemory(vbase, voff, base, offset, vlen); + offset += vlen; + Platform.putLong(base, offset, 0); + + pageCursor += recordLength; + + keyRowId = numRows; + keyRow.pointTo(base, recordOffset, klen); + valueRow.pointTo(base, recordOffset + klen, vlen + 4); + numRows++; + return valueRow; + } + + /** + * Returns the key row in this batch at `rowId`. Returned key row is reused across calls. + */ + @Override + public final UnsafeRow getKeyRow(int rowId) { + assert(rowId >= 0); + assert(rowId < numRows); + if (keyRowId != rowId) { // if keyRowId == rowId, desired keyRow is already cached + long offset = getKeyOffsetForFixedLengthRecords(rowId); + keyRow.pointTo(base, offset, klen); + // set keyRowId so we can check if desired row is cached + keyRowId = rowId; + } + return keyRow; + } + + /** + * Returns the value row by two steps: + * 1) looking up the key row with the same id (skipped if the key row is cached) + * 2) retrieve the value row by reusing the metadata from step 1) + * In most times, 1) is skipped because `getKeyRow(id)` is often called before `getValueRow(id)`. + */ + @Override + protected final UnsafeRow getValueFromKey(int rowId) { + if (keyRowId != rowId) { + getKeyRow(rowId); + } + assert(rowId >= 0); + valueRow.pointTo(base, keyRow.getBaseOffset() + klen, vlen + 4); + return valueRow; + } + + /** + * Returns an iterator to go through all rows + */ + @Override + public final org.apache.spark.unsafe.KVIterator rowIterator() { + return new org.apache.spark.unsafe.KVIterator() { + private final UnsafeRow key = new UnsafeRow(keySchema.length()); + private final UnsafeRow value = new UnsafeRow(valueSchema.length()); + + private long offsetInPage = 0; + private int recordsInPage = 0; + + private boolean initialized = false; + + private void init() { + if (page != null) { + offsetInPage = page.getBaseOffset(); + recordsInPage = numRows; + } + initialized = true; + } + + @Override + public boolean next() { + if (!initialized) init(); + //searching for the next non empty page is records is now zero + if (recordsInPage == 0) { + freeCurrentPage(); + return false; + } + + key.pointTo(base, offsetInPage, klen); + value.pointTo(base, offsetInPage + klen, vlen + 4); + + offsetInPage += recordLength; + recordsInPage -= 1; + return true; + } + + @Override + public UnsafeRow getKey() { + return key; + } + + @Override + public UnsafeRow getValue() { + return value; + } + + @Override + public void close() { + // do nothing + } + + private void freeCurrentPage() { + if (page != null) { + freePage(page); + page = null; + } + } + }; + } + + protected FixedLengthRowBasedKeyValueBatch(StructType keySchema, StructType valueSchema, + int maxRows, TaskMemoryManager manager) { + super(keySchema, valueSchema, maxRows, manager); + klen = keySchema.defaultSize() + + UnsafeRow.calculateBitSetWidthInBytes(keySchema.length()); + vlen = valueSchema.defaultSize() + + UnsafeRow.calculateBitSetWidthInBytes(valueSchema.length()); + recordLength = klen + vlen + 8; + } +} diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java new file mode 100644 index 000000000000..cea9d5d5bc3a --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.expressions; + +import java.io.IOException; + +import org.apache.spark.memory.MemoryConsumer; +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.sql.types.*; +import org.apache.spark.unsafe.memory.MemoryBlock; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * RowBasedKeyValueBatch stores key value pairs in contiguous memory region. + * + * Each key or value is stored as a single UnsafeRow. Each record contains one key and one value + * and some auxiliary data, which differs based on implementation: + * i.e., `FixedLengthRowBasedKeyValueBatch` and `VariableLengthRowBasedKeyValueBatch`. + * + * We use `FixedLengthRowBasedKeyValueBatch` if all fields in the key and the value are fixed-length + * data types. Otherwise we use `VariableLengthRowBasedKeyValueBatch`. + * + * RowBasedKeyValueBatch is backed by a single page / MemoryBlock (defaults to 64MB). If the page + * is full, the aggregate logic should fallback to a second level, larger hash map. We intentionally + * use the single-page design because it simplifies memory address encoding & decoding for each + * key-value pair. Because the maximum capacity for RowBasedKeyValueBatch is only 2^16, it is + * unlikely we need a second page anyway. Filling the page requires an average size for key value + * pairs to be larger than 1024 bytes. + * + */ +public abstract class RowBasedKeyValueBatch extends MemoryConsumer { + protected final Logger logger = LoggerFactory.getLogger(RowBasedKeyValueBatch.class); + + private static final int DEFAULT_CAPACITY = 1 << 16; + private static final long DEFAULT_PAGE_SIZE = 64 * 1024 * 1024; + + protected final StructType keySchema; + protected final StructType valueSchema; + protected final int capacity; + protected int numRows = 0; + + // ids for current key row and value row being retrieved + protected int keyRowId = -1; + + // placeholder for key and value corresponding to keyRowId. + protected final UnsafeRow keyRow; + protected final UnsafeRow valueRow; + + protected MemoryBlock page = null; + protected Object base = null; + protected final long recordStartOffset; + protected long pageCursor = 0; + + public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, + TaskMemoryManager manager) { + return allocate(keySchema, valueSchema, manager, DEFAULT_CAPACITY); + } + + public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, + TaskMemoryManager manager, int maxRows) { + boolean allFixedLength = true; + // checking if there is any variable length fields + // there is probably a more succinct impl of this + for (String name : keySchema.fieldNames()) { + allFixedLength = allFixedLength + && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); + } + for (String name : valueSchema.fieldNames()) { + allFixedLength = allFixedLength + && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); + } + + if (allFixedLength) { + return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); + } else { + return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); + } + } + + protected RowBasedKeyValueBatch(StructType keySchema, StructType valueSchema, int maxRows, + TaskMemoryManager manager) { + super(manager, manager.pageSizeBytes(), manager.getTungstenMemoryMode()); + + this.keySchema = keySchema; + this.valueSchema = valueSchema; + this.capacity = maxRows; + + this.keyRow = new UnsafeRow(keySchema.length()); + this.valueRow = new UnsafeRow(valueSchema.length()); + + if (!acquirePage(DEFAULT_PAGE_SIZE)) { + page = null; + recordStartOffset = 0; + } else { + base = page.getBaseObject(); + recordStartOffset = page.getBaseOffset(); + } + } + + public final int numRows() { return numRows; } + + public final void close() { + if (page != null) { + freePage(page); + page = null; + } + } + + private final boolean acquirePage(long requiredSize) { + try { + page = allocatePage(requiredSize); + } catch (OutOfMemoryError e) { + logger.warn("Failed to allocate page ({} bytes).", requiredSize); + return false; + } + base = page.getBaseObject(); + pageCursor = 0; + return true; + } + + /** + * Append a key value pair. + * It copies data into the backing MemoryBlock. + * Returns an UnsafeRow pointing to the value if succeeds, otherwise returns null. + */ + public abstract UnsafeRow appendRow(Object kbase, long koff, int klen, + Object vbase, long voff, int vlen); + + /** + * Returns the key row in this batch at `rowId`. Returned key row is reused across calls. + */ + public abstract UnsafeRow getKeyRow(int rowId); + + /** + * Returns the value row in this batch at `rowId`. Returned value row is reused across calls. + * Because `getValueRow(id)` is always called after `getKeyRow(id)` with the same id, we use + * `getValueFromKey(id) to retrieve value row, which reuses metadata from the cached key. + */ + public final UnsafeRow getValueRow(int rowId) { + return getValueFromKey(rowId); + } + + /** + * Returns the value row by two steps: + * 1) looking up the key row with the same id (skipped if the key row is cached) + * 2) retrieve the value row by reusing the metadata from step 1) + * In most times, 1) is skipped because `getKeyRow(id)` is often called before `getValueRow(id)`. + */ + protected abstract UnsafeRow getValueFromKey(int rowId); + + /** + * Sometimes the TaskMemoryManager may call spill() on its associated MemoryConsumers to make + * space for new consumers. For RowBasedKeyValueBatch, we do not actually spill and return 0. + * We should not throw OutOfMemory exception here because other associated consumers might spill + */ + public final long spill(long size, MemoryConsumer trigger) throws IOException { + logger.warn("Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0."); + return 0; + } + + /** + * Returns an iterator to go through all rows + */ + public abstract org.apache.spark.unsafe.KVIterator rowIterator(); +} diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java new file mode 100644 index 000000000000..f4002ee0d50d --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.expressions; + +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.sql.types.*; +import org.apache.spark.unsafe.Platform; + +/** + * An implementation of `RowBasedKeyValueBatch` in which key-value records have variable lengths. + * + * The format for each record looks like this: + * [4 bytes total size = (klen + vlen + 4)] [4 bytes key size = klen] + * [UnsafeRow for key of length klen] [UnsafeRow for Value of length vlen] + * [8 bytes pointer to next] + * Thus, record length = 4 + 4 + klen + vlen + 8 + */ +public final class VariableLengthRowBasedKeyValueBatch extends RowBasedKeyValueBatch { + // full addresses for key rows and value rows + private final long[] keyOffsets; + + /** + * Append a key value pair. + * It copies data into the backing MemoryBlock. + * Returns an UnsafeRow pointing to the value if succeeds, otherwise returns null. + */ + @Override + public final UnsafeRow appendRow(Object kbase, long koff, int klen, + Object vbase, long voff, int vlen) { + final long recordLength = 8 + klen + vlen + 8; + // if run out of max supported rows or page size, return null + if (numRows >= capacity || page == null || page.size() - pageCursor < recordLength) { + return null; + } + + long offset = page.getBaseOffset() + pageCursor; + final long recordOffset = offset; + Platform.putInt(base, offset, klen + vlen + 4); + Platform.putInt(base, offset + 4, klen); + + offset += 8; + Platform.copyMemory(kbase, koff, base, offset, klen); + offset += klen; + Platform.copyMemory(vbase, voff, base, offset, vlen); + offset += vlen; + Platform.putLong(base, offset, 0); + + pageCursor += recordLength; + + keyOffsets[numRows] = recordOffset + 8; + + keyRowId = numRows; + keyRow.pointTo(base, recordOffset + 8, klen); + valueRow.pointTo(base, recordOffset + 8 + klen, vlen + 4); + numRows++; + return valueRow; + } + + /** + * Returns the key row in this batch at `rowId`. Returned key row is reused across calls. + */ + @Override + public UnsafeRow getKeyRow(int rowId) { + assert(rowId >= 0); + assert(rowId < numRows); + if (keyRowId != rowId) { // if keyRowId == rowId, desired keyRow is already cached + long offset = keyOffsets[rowId]; + int klen = Platform.getInt(base, offset - 4); + keyRow.pointTo(base, offset, klen); + // set keyRowId so we can check if desired row is cached + keyRowId = rowId; + } + return keyRow; + } + + /** + * Returns the value row by two steps: + * 1) looking up the key row with the same id (skipped if the key row is cached) + * 2) retrieve the value row by reusing the metadata from step 1) + * In most times, 1) is skipped because `getKeyRow(id)` is often called before `getValueRow(id)`. + */ + @Override + public final UnsafeRow getValueFromKey(int rowId) { + if (keyRowId != rowId) { + getKeyRow(rowId); + } + assert(rowId >= 0); + long offset = keyRow.getBaseOffset(); + int klen = keyRow.getSizeInBytes(); + int vlen = Platform.getInt(base, offset - 8) - klen - 4; + valueRow.pointTo(base, offset + klen, vlen + 4); + return valueRow; + } + + /** + * Returns an iterator to go through all rows + */ + @Override + public final org.apache.spark.unsafe.KVIterator rowIterator() { + return new org.apache.spark.unsafe.KVIterator() { + private final UnsafeRow key = new UnsafeRow(keySchema.length()); + private final UnsafeRow value = new UnsafeRow(valueSchema.length()); + + private long offsetInPage = 0; + private int recordsInPage = 0; + + private int currentklen; + private int currentvlen; + private int totalLength; + + private boolean initialized = false; + + private void init() { + if (page != null) { + offsetInPage = page.getBaseOffset(); + recordsInPage = numRows; + } + initialized = true; + } + + @Override + public boolean next() { + if (!initialized) init(); + //searching for the next non empty page is records is now zero + if (recordsInPage == 0) { + freeCurrentPage(); + return false; + } + + totalLength = Platform.getInt(base, offsetInPage) - 4; + currentklen = Platform.getInt(base, offsetInPage + 4); + currentvlen = totalLength - currentklen; + + key.pointTo(base, offsetInPage + 8, currentklen); + value.pointTo(base, offsetInPage + 8 + currentklen, currentvlen + 4); + + offsetInPage += 8 + totalLength + 8; + recordsInPage -= 1; + return true; + } + + @Override + public UnsafeRow getKey() { + return key; + } + + @Override + public UnsafeRow getValue() { + return value; + } + + @Override + public void close() { + // do nothing + } + + private void freeCurrentPage() { + if (page != null) { + freePage(page); + page = null; + } + } + }; + } + + protected VariableLengthRowBasedKeyValueBatch(StructType keySchema, StructType valueSchema, + int maxRows, TaskMemoryManager manager) { + super(keySchema, valueSchema, maxRows, manager); + this.keyOffsets = new long[maxRows]; + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 65a90d8099b7..65168998c8ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -228,6 +228,7 @@ object FunctionRegistry { expression[Signum]("signum"), expression[Sin]("sin"), expression[Sinh]("sinh"), + expression[StringToMap]("str_to_map"), expression[Sqrt]("sqrt"), expression[Tan]("tan"), expression[Tanh]("tanh"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index baec6d14a212..9a040f8644fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -100,7 +100,7 @@ object TypeCoercion { } /** Similar to [[findTightestCommonType]], but can promote all the way to StringType. */ - private def findTightestCommonTypeToString(left: DataType, right: DataType): Option[DataType] = { + def findTightestCommonTypeToString(left: DataType, right: DataType): Option[DataType] = { findTightestCommonTypeOfTwo(left, right).orElse((left, right) match { case (StringType, t2: AtomicType) if t2 != BinaryType && t2 != BooleanType => Some(StringType) case (t1: AtomicType, StringType) if t1 != BinaryType && t1 != BooleanType => Some(StringType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index b12606e17d38..710bce5da981 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} +import org.apache.spark.sql.catalyst.util.quoteIdentifier /** @@ -49,12 +50,12 @@ case class CatalogStorageFormat( outputFormat: Option[String], serde: Option[String], compressed: Boolean, - serdeProperties: Map[String, String]) { + properties: Map[String, String]) { override def toString: String = { val serdePropsToString = - if (serdeProperties.nonEmpty) { - s"Properties: " + serdeProperties.map(p => p._1 + "=" + p._2).mkString("[", ", ", "]") + if (properties.nonEmpty) { + s"Properties: " + properties.map(p => p._1 + "=" + p._2).mkString("[", ", ", "]") } else { "" } @@ -73,7 +74,7 @@ case class CatalogStorageFormat( object CatalogStorageFormat { /** Empty storage format for default values and copies. */ val empty = CatalogStorageFormat(locationUri = None, inputFormat = None, - outputFormat = None, serde = None, compressed = false, serdeProperties = Map.empty) + outputFormat = None, serde = None, compressed = false, properties = Map.empty) } /** @@ -81,9 +82,9 @@ object CatalogStorageFormat { */ case class CatalogColumn( name: String, - // This may be null when used to create views. TODO: make this type-safe; this is left - // as a string due to issues in converting Hive varchars to and from SparkSQL strings. - @Nullable dataType: String, + // TODO: make this type-safe; this is left as a string due to issues in converting Hive + // varchars to and from SparkSQL strings. + dataType: String, nullable: Boolean = true, comment: Option[String] = None) { @@ -109,6 +110,24 @@ case class CatalogTablePartition( storage: CatalogStorageFormat) +/** + * A container for bucketing information. + * Bucketing is a technology for decomposing data sets into more manageable parts, and the number + * of buckets is fixed so it does not fluctuate with data. + * + * @param numBuckets number of buckets. + * @param bucketColumnNames the names of the columns that used to generate the bucket id. + * @param sortColumnNames the names of the columns that used to sort data in each bucket. + */ +case class BucketSpec( + numBuckets: Int, + bucketColumnNames: Seq[String], + sortColumnNames: Seq[String]) { + if (numBuckets <= 0) { + throw new AnalysisException(s"Expected positive number of buckets, but got `$numBuckets`.") + } +} + /** * A table defined in the catalog. * @@ -124,9 +143,7 @@ case class CatalogTable( storage: CatalogStorageFormat, schema: Seq[CatalogColumn], partitionColumnNames: Seq[String] = Seq.empty, - sortColumnNames: Seq[String] = Seq.empty, - bucketColumnNames: Seq[String] = Seq.empty, - numBuckets: Int = -1, + bucketSpec: Option[BucketSpec] = None, owner: String = "", createTime: Long = System.currentTimeMillis, lastAccessTime: Long = -1, @@ -143,8 +160,8 @@ case class CatalogTable( s"must be a subset of schema (${colNames.mkString(", ")}) in table '$identifier'") } requireSubsetOfSchema(partitionColumnNames, "partition") - requireSubsetOfSchema(sortColumnNames, "sort") - requireSubsetOfSchema(bucketColumnNames, "bucket") + requireSubsetOfSchema(bucketSpec.map(_.sortColumnNames).getOrElse(Nil), "sort") + requireSubsetOfSchema(bucketSpec.map(_.bucketColumnNames).getOrElse(Nil), "bucket") /** Columns this table is partitioned by. */ def partitionColumns: Seq[CatalogColumn] = @@ -165,16 +182,26 @@ case class CatalogTable( outputFormat: Option[String] = storage.outputFormat, compressed: Boolean = false, serde: Option[String] = storage.serde, - serdeProperties: Map[String, String] = storage.serdeProperties): CatalogTable = { + serdeProperties: Map[String, String] = storage.properties): CatalogTable = { copy(storage = CatalogStorageFormat( locationUri, inputFormat, outputFormat, serde, compressed, serdeProperties)) } override def toString: String = { val tableProperties = properties.map(p => p._1 + "=" + p._2).mkString("[", ", ", "]") - val partitionColumns = partitionColumnNames.map("`" + _ + "`").mkString("[", ", ", "]") - val sortColumns = sortColumnNames.map("`" + _ + "`").mkString("[", ", ", "]") - val bucketColumns = bucketColumnNames.map("`" + _ + "`").mkString("[", ", ", "]") + val partitionColumns = partitionColumnNames.map(quoteIdentifier).mkString("[", ", ", "]") + val bucketStrings = bucketSpec match { + case Some(BucketSpec(numBuckets, bucketColumnNames, sortColumnNames)) => + val bucketColumnsString = bucketColumnNames.map(quoteIdentifier).mkString("[", ", ", "]") + val sortColumnsString = sortColumnNames.map(quoteIdentifier).mkString("[", ", ", "]") + Seq( + s"Num Buckets: $numBuckets", + if (bucketColumnNames.nonEmpty) s"Bucket Columns: $bucketColumnsString" else "", + if (sortColumnNames.nonEmpty) s"Sort Columns: $sortColumnsString" else "" + ) + + case _ => Nil + } val output = Seq(s"Table: ${identifier.quotedString}", @@ -183,10 +210,8 @@ case class CatalogTable( s"Last Access: ${new Date(lastAccessTime).toString}", s"Type: ${tableType.name}", if (schema.nonEmpty) s"Schema: ${schema.mkString("[", ", ", "]")}" else "", - if (partitionColumnNames.nonEmpty) s"Partition Columns: $partitionColumns" else "", - if (numBuckets != -1) s"Num Buckets: $numBuckets" else "", - if (bucketColumnNames.nonEmpty) s"Bucket Columns: $bucketColumns" else "", - if (sortColumnNames.nonEmpty) s"Sort Columns: $sortColumns" else "", + if (partitionColumnNames.nonEmpty) s"Partition Columns: $partitionColumns" else "" + ) ++ bucketStrings ++ Seq( viewOriginalText.map("Original View: " + _).getOrElse(""), viewText.map("View: " + _).getOrElse(""), comment.map("Comment: " + _).getOrElse(""), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 10a141254f54..1f37b68846ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -377,6 +377,7 @@ abstract class UnaryExpression extends Expression { """) } else { ev.copy(code = s""" + boolean ${ev.isNull} = false; ${childGen.code} ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; $resultCode""", isNull = "false") @@ -475,6 +476,7 @@ abstract class BinaryExpression extends Expression { """) } else { ev.copy(code = s""" + boolean ${ev.isNull} = false; ${leftGen.code} ${rightGen.code} ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; @@ -617,6 +619,7 @@ abstract class TernaryExpression extends Expression { $nullSafeEval""") } else { ev.copy(code = s""" + boolean ${ev.isNull} = false; ${leftGen.code} ${midGen.code} ${rightGen.code} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala index 946b3d446a40..d702c08cfd34 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala @@ -43,7 +43,7 @@ case class First(child: Expression, ignoreNullsExpr: Expression) extends Declara throw new AnalysisException("The second argument of First should be a boolean literal.") } - override def children: Seq[Expression] = child :: Nil + override def children: Seq[Expression] = child :: ignoreNullsExpr :: Nil override def nullable: Boolean = true @@ -54,7 +54,7 @@ case class First(child: Expression, ignoreNullsExpr: Expression) extends Declara override def dataType: DataType = child.dataType // Expected input data type. - override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType) + override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, BooleanType) private lazy val first = AttributeReference("first", child.dataType)() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala index 53b4b761ae51..af8840305805 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala @@ -40,7 +40,7 @@ case class Last(child: Expression, ignoreNullsExpr: Expression) extends Declarat throw new AnalysisException("The second argument of First should be a boolean literal.") } - override def children: Seq[Expression] = child :: Nil + override def children: Seq[Expression] = child :: ignoreNullsExpr :: Nil override def nullable: Boolean = true @@ -51,7 +51,7 @@ case class Last(child: Expression, ignoreNullsExpr: Expression) extends Declarat override def dataType: DataType = child.dataType // Expected input data type. - override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType) + override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, BooleanType) private lazy val last = AttributeReference("last", child.dataType)() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index d603d3c73ecb..0ca715f42472 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, TypeUtils} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -393,3 +393,53 @@ case class CreateNamedStructUnsafe(children: Seq[Expression]) extends Expression override def prettyName: String = "named_struct_unsafe" } + +/** + * Creates a map after splitting the input text into key/value pairs using delimiters + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(text[, pairDelim, keyValueDelim]) - Creates a map after splitting the text into key/value pairs using delimiters. Default delimiters are ',' for pairDelim and ':' for keyValueDelim.", + extended = """ > SELECT _FUNC_('a:1,b:2,c:3',',',':');\n map("a":"1","b":"2","c":"3") """) +// scalastyle:on line.size.limit +case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: Expression) + extends TernaryExpression with CodegenFallback with ExpectsInputTypes { + + def this(child: Expression, pairDelim: Expression) = { + this(child, pairDelim, Literal(":")) + } + + def this(child: Expression) = { + this(child, Literal(","), Literal(":")) + } + + override def children: Seq[Expression] = Seq(text, pairDelim, keyValueDelim) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) + + override def dataType: DataType = MapType(StringType, StringType, valueContainsNull = false) + + override def checkInputDataTypes(): TypeCheckResult = { + if (Seq(pairDelim, keyValueDelim).exists(! _.foldable)) { + TypeCheckResult.TypeCheckFailure(s"$prettyName's delimiters must be foldable.") + } else { + super.checkInputDataTypes() + } + } + + override def nullSafeEval(str: Any, delim1: Any, delim2: Any): Any = { + val array = str.asInstanceOf[UTF8String] + .split(delim1.asInstanceOf[UTF8String], -1) + .map { kv => + val arr = kv.split(delim2.asInstanceOf[UTF8String], 2) + if (arr.length < 2) { + Array(arr(0), null) + } else { + arr + } + } + ArrayBasedMapData(array.map(_ (0)), array.map(_ (1))) + } + + override def prettyName: String = "str_to_map" +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 523fb053972d..1c18265e0fed 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -134,7 +134,7 @@ case class Nvl(left: Expression, right: Expression) extends RuntimeReplaceable { override def replaceForTypeCoercion(): Expression = { if (left.dataType != right.dataType) { - TypeCoercion.findTightestCommonTypeOfTwo(left.dataType, right.dataType).map { dtype => + TypeCoercion.findTightestCommonTypeToString(left.dataType, right.dataType).map { dtype => copy(left = Cast(left, dtype), right = Cast(right, dtype)) }.getOrElse(this) } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index ea4dee174e74..d6863ed2fdfa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -134,31 +134,42 @@ case class Invoke( val argGen = arguments.map(_.genCode(ctx)) val argString = argGen.map(_.value).mkString(", ") - val callFunc = if (method.isDefined && method.get.getReturnType.isPrimitive) { - s"${obj.value}.$functionName($argString)" - } else { - s"(${ctx.boxedType(javaType)}) ${obj.value}.$functionName($argString)" - } + val returnPrimitive = method.isDefined && method.get.getReturnType.isPrimitive + val needTryCatch = method.isDefined && method.get.getExceptionTypes.nonEmpty - val setIsNull = if (propagateNull && arguments.nonEmpty) { - s"boolean ${ev.isNull} = ${obj.isNull} || ${argGen.map(_.isNull).mkString(" || ")};" + def getFuncResult(resultVal: String, funcCall: String): String = if (needTryCatch) { + s""" + try { + $resultVal = $funcCall; + } catch (Exception e) { + org.apache.spark.unsafe.Platform.throwException(e); + } + """ } else { - s"boolean ${ev.isNull} = ${obj.isNull};" + s"$resultVal = $funcCall;" } - val evaluate = if (method.forall(_.getExceptionTypes.isEmpty)) { - s"final $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : $callFunc;" + val evaluate = if (returnPrimitive) { + getFuncResult(ev.value, s"${obj.value}.$functionName($argString)") } else { + val funcResult = ctx.freshName("funcResult") s""" - $javaType ${ev.value} = ${ctx.defaultValue(javaType)}; - try { - ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(javaType)} : $callFunc; - } catch (Exception e) { - org.apache.spark.unsafe.Platform.throwException(e); + Object $funcResult = null; + ${getFuncResult(funcResult, s"${obj.value}.$functionName($argString)")} + if ($funcResult == null) { + ${ev.isNull} = true; + } else { + ${ev.value} = (${ctx.boxedType(javaType)}) $funcResult; } """ } + val setIsNull = if (propagateNull && arguments.nonEmpty) { + s"boolean ${ev.isNull} = ${obj.isNull} || ${argGen.map(_.isNull).mkString(" || ")};" + } else { + s"boolean ${ev.isNull} = ${obj.isNull};" + } + // If the function can return null, we do an extra check to make sure our null bit is still set // correctly. val postNullCheck = if (ctx.defaultValue(dataType) == "null") { @@ -166,12 +177,14 @@ case class Invoke( } else { "" } - val code = s""" ${obj.code} ${argGen.map(_.code).mkString("\n")} $setIsNull - $evaluate + $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; + if (!${ev.isNull}) { + $evaluate + } $postNullCheck """ ev.copy(code = code) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 61549c9a2368..fc13845a7f6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -171,7 +171,7 @@ case class ConcatWs(children: Seq[Expression]) usage = "_FUNC_(n, str1, str2, ...) - returns the n-th string, e.g. returns str2 when n is 2", extended = "> SELECT _FUNC_(1, 'scala', 'java') FROM src LIMIT 1;\n" + "'scala'") case class Elt(children: Seq[Expression]) - extends Expression with ImplicitCastInputTypes with CodegenFallback { + extends Expression with ImplicitCastInputTypes { private lazy val indexExpr = children.head private lazy val stringExprs = children.tail.toArray @@ -204,6 +204,29 @@ case class Elt(children: Seq[Expression]) } } } + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val index = indexExpr.genCode(ctx) + val strings = stringExprs.map(_.genCode(ctx)) + val assignStringValue = strings.zipWithIndex.map { case (eval, index) => + s""" + case ${index + 1}: + ${ev.value} = ${eval.isNull} ? null : ${eval.value}; + break; + """ + }.mkString("\n") + val indexVal = ctx.freshName("index") + val stringArray = ctx.freshName("strings"); + + ev.copy(index.code + "\n" + strings.map(_.code).mkString("\n") + s""" + final int $indexVal = ${index.value}; + UTF8String ${ev.value} = null; + switch ($indexVal) { + $assignStringValue + } + final boolean ${ev.isNull} = ${ev.value} == null; + """) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index c0b453dccf5e..e35192ca2d26 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -82,16 +82,16 @@ case class WindowSpecDefinition( val partition = if (partitionSpec.isEmpty) { "" } else { - "PARTITION BY " + partitionSpec.map(_.sql).mkString(", ") + "PARTITION BY " + partitionSpec.map(_.sql).mkString(", ") + " " } val order = if (orderSpec.isEmpty) { "" } else { - "ORDER BY " + orderSpec.map(_.sql).mkString(", ") + "ORDER BY " + orderSpec.map(_.sql).mkString(", ") + " " } - s"($partition $order ${frameSpecification.toString})" + s"($partition$order${frameSpecification.toString})" } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 368e9a539630..c8e9d8e2f9dd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -165,36 +165,49 @@ object PushProjectThroughSample extends Rule[LogicalPlan] { * but can also benefit other operators. */ object RemoveAliasOnlyProject extends Rule[LogicalPlan] { - // Check if projectList in the Project node has the same attribute names and ordering - // as its child node. + /** + * Returns true if the project list is semantically same as child output, after strip alias on + * attribute. + */ private def isAliasOnly( projectList: Seq[NamedExpression], childOutput: Seq[Attribute]): Boolean = { - if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != childOutput.length) { + if (projectList.length != childOutput.length) { false } else { - projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case (a, o) => - a.child match { - case attr: Attribute if a.name == attr.name && attr.semanticEquals(o) => true - case _ => false - } + stripAliasOnAttribute(projectList).zip(childOutput).forall { + case (a: Attribute, o) if a semanticEquals o => true + case _ => false } } } + private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = { + projectList.map { + // Alias with metadata can not be stripped, or the metadata will be lost. + // If the alias name is different from attribute name, we can't strip it either, or we may + // accidentally change the output schema name of the root plan. + case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && name == attr.name => + attr + case other => other + } + } + def apply(plan: LogicalPlan): LogicalPlan = { - val aliasOnlyProject = plan.find { - case Project(pList, child) if isAliasOnly(pList, child.output) => true - case _ => false + val aliasOnlyProject = plan.collectFirst { + case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p } - aliasOnlyProject.map { case p: Project => - val aliases = p.projectList.map(_.asInstanceOf[Alias]) - val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child))) - plan.transformAllExpressions { - case a: Attribute if attrMap.contains(a) => attrMap(a) - }.transform { - case op: Project if op.eq(p) => op.child + aliasOnlyProject.map { case proj => + val attributesToReplace = proj.output.zip(proj.child.output).filterNot { + case (a1, a2) => a1 semanticEquals a2 + } + val attrMap = AttributeMap(attributesToReplace) + plan transform { + case plan: Project if plan eq proj => plan.child + case plan => plan transformExpressions { + case a: Attribute if attrMap.contains(a) => attrMap(a) + } } }.getOrElse(plan) } @@ -1128,19 +1141,23 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild)) // Push [[Filter]] operators through [[Window]] operators. Parts of the predicate that can be - // pushed beneath must satisfy the following two conditions: + // pushed beneath must satisfy the following conditions: // 1. All the expressions are part of window partitioning key. The expressions can be compound. - // 2. Deterministic + // 2. Deterministic. + // 3. Placed before any non-deterministic predicates. case filter @ Filter(condition, w: Window) if w.partitionSpec.forall(_.isInstanceOf[AttributeReference]) => val partitionAttrs = AttributeSet(w.partitionSpec.flatMap(_.references)) - val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => - cond.references.subsetOf(partitionAttrs) && cond.deterministic && - // This is for ensuring all the partitioning expressions have been converted to alias - // in Analyzer. Thus, we do not need to check if the expressions in conditions are - // the same as the expressions used in partitioning columns. - partitionAttrs.forall(_.isInstanceOf[Attribute]) + + val (candidates, containingNonDeterministic) = + splitConjunctivePredicates(condition).span(_.deterministic) + + val (pushDown, rest) = candidates.partition { cond => + cond.references.subsetOf(partitionAttrs) } + + val stayUp = rest ++ containingNonDeterministic + if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduce(And) val newWindow = w.copy(child = Filter(pushDownPredicate, w.child)) @@ -1159,11 +1176,16 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // For each filter, expand the alias and check if the filter can be evaluated using // attributes produced by the aggregate operator's child operator. - val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => + val (candidates, containingNonDeterministic) = + splitConjunctivePredicates(condition).span(_.deterministic) + + val (pushDown, rest) = candidates.partition { cond => val replaced = replaceAlias(cond, aliasMap) - replaced.references.subsetOf(aggregate.child.outputSet) && replaced.deterministic + replaced.references.subsetOf(aggregate.child.outputSet) } + val stayUp = rest ++ containingNonDeterministic + if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduce(And) val replaced = replaceAlias(pushDownPredicate, aliasMap) @@ -1177,9 +1199,8 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { case filter @ Filter(condition, union: Union) => // Union could change the rows, so non-deterministic predicate can't be pushed down - val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => - cond.deterministic - } + val (pushDown, stayUp) = splitConjunctivePredicates(condition).span(_.deterministic) + if (pushDown.nonEmpty) { val pushDownCond = pushDown.reduceLeft(And) val output = union.output @@ -1219,9 +1240,15 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // come from grandchild. // TODO: non-deterministic predicates could be pushed through some operators that do not change // the rows. - val (pushDown, stayUp) = splitConjunctivePredicates(filter.condition).partition { cond => - cond.deterministic && cond.references.subsetOf(grandchild.outputSet) + val (candidates, containingNonDeterministic) = + splitConjunctivePredicates(filter.condition).span(_.deterministic) + + val (pushDown, rest) = candidates.partition { cond => + cond.references.subsetOf(grandchild.outputSet) } + + val stayUp = rest ++ containingNonDeterministic + if (pushDown.nonEmpty) { val newChild = insertFilter(pushDown.reduceLeft(And)) if (stayUp.nonEmpty) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index d0b2b5d7b2df..6d7799151d93 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -127,7 +127,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { */ def resolve(schema: StructType, resolver: Resolver): Seq[Attribute] = { schema.map { field => - resolveQuoted(field.name, resolver).map { + resolve(field.name :: Nil, resolver).map { case a: AttributeReference => a case other => sys.error(s"can not handle nested schema yet... plan $this") }.getOrElse { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c0e400f61777..b31f5aa11c22 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -483,8 +483,10 @@ case class Aggregate( override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) override def maxRows: Option[Long] = child.maxRows - override def validConstraints: Set[Expression] = - child.constraints.union(getAliasedConstraints(aggregateExpressions)) + override def validConstraints: Set[Expression] = { + val nonAgg = aggregateExpressions.filter(_.find(_.isInstanceOf[AggregateExpression]).isEmpty) + child.constraints.union(getAliasedConstraints(nonAgg)) + } override lazy val statistics: Statistics = { if (groupingExpressions.isEmpty) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 3a665d370830..7ee9581b63af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -23,6 +23,16 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.{DataType, Decimal} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +private object GenericArrayData { + + // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. + def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { + case seq: Seq[Any] => seq + case array: Array[_] => array.toSeq + } + +} + class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(seq: Seq[Any]) = this(seq.toArray) @@ -37,10 +47,7 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(primitiveArray: Array[Byte]) = this(primitiveArray.toSeq) def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq) - def this(seqOrArray: Any) = this(seqOrArray match { - case seq: Seq[Any] => seq - case array: Array[_] => array.toSeq - }) + def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) override def copy(): ArrayData = new GenericArrayData(array.clone()) diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java new file mode 100644 index 000000000000..0dd129cea7b3 --- /dev/null +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.SparkConf; +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.memory.TestMemoryManager; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder; +import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter; +import org.apache.spark.unsafe.types.UTF8String; + +import java.util.Random; + +public class RowBasedKeyValueBatchSuite { + + private final Random rand = new Random(42); + + private TestMemoryManager memoryManager; + private TaskMemoryManager taskMemoryManager; + private StructType keySchema = new StructType().add("k1", DataTypes.LongType) + .add("k2", DataTypes.StringType); + private StructType fixedKeySchema = new StructType().add("k1", DataTypes.LongType) + .add("k2", DataTypes.LongType); + private StructType valueSchema = new StructType().add("count", DataTypes.LongType) + .add("sum", DataTypes.LongType); + private int DEFAULT_CAPACITY = 1 << 16; + + private String getRandomString(int length) { + Assert.assertTrue(length >= 0); + final byte[] bytes = new byte[length]; + rand.nextBytes(bytes); + return new String(bytes); + } + + private UnsafeRow makeKeyRow(long k1, String k2) { + UnsafeRow row = new UnsafeRow(2); + BufferHolder holder = new BufferHolder(row, 32); + UnsafeRowWriter writer = new UnsafeRowWriter(holder, 2); + holder.reset(); + writer.write(0, k1); + writer.write(1, UTF8String.fromString(k2)); + row.setTotalSize(holder.totalSize()); + return row; + } + + private UnsafeRow makeKeyRow(long k1, long k2) { + UnsafeRow row = new UnsafeRow(2); + BufferHolder holder = new BufferHolder(row, 0); + UnsafeRowWriter writer = new UnsafeRowWriter(holder, 2); + holder.reset(); + writer.write(0, k1); + writer.write(1, k2); + row.setTotalSize(holder.totalSize()); + return row; + } + + private UnsafeRow makeValueRow(long v1, long v2) { + UnsafeRow row = new UnsafeRow(2); + BufferHolder holder = new BufferHolder(row, 0); + UnsafeRowWriter writer = new UnsafeRowWriter(holder, 2); + holder.reset(); + writer.write(0, v1); + writer.write(1, v2); + row.setTotalSize(holder.totalSize()); + return row; + } + + private UnsafeRow appendRow(RowBasedKeyValueBatch batch, UnsafeRow key, UnsafeRow value) { + return batch.appendRow(key.getBaseObject(), key.getBaseOffset(), key.getSizeInBytes(), + value.getBaseObject(), value.getBaseOffset(), value.getSizeInBytes()); + } + + private void updateValueRow(UnsafeRow row, long v1, long v2) { + row.setLong(0, v1); + row.setLong(1, v2); + } + + private boolean checkKey(UnsafeRow row, long k1, String k2) { + return (row.getLong(0) == k1) + && (row.getUTF8String(1).equals(UTF8String.fromString(k2))); + } + + private boolean checkKey(UnsafeRow row, long k1, long k2) { + return (row.getLong(0) == k1) + && (row.getLong(1) == k2); + } + + private boolean checkValue(UnsafeRow row, long v1, long v2) { + return (row.getLong(0) == v1) && (row.getLong(1) == v2); + } + + @Before + public void setup() { + memoryManager = new TestMemoryManager(new SparkConf() + .set("spark.memory.offHeap.enabled", "false") + .set("spark.shuffle.spill.compress", "false") + .set("spark.shuffle.compress", "false")); + taskMemoryManager = new TaskMemoryManager(memoryManager, 0); + } + + @After + public void tearDown() { + if (taskMemoryManager != null) { + Assert.assertEquals(0L, taskMemoryManager.cleanUpAllAllocatedMemory()); + long leakedMemory = taskMemoryManager.getMemoryConsumptionForThisTask(); + taskMemoryManager = null; + Assert.assertEquals(0L, leakedMemory); + } + } + + + @Test + public void emptyBatch() throws Exception { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + Assert.assertEquals(0, batch.numRows()); + try { + batch.getKeyRow(-1); + Assert.fail("Should not be able to get row -1"); + } catch (AssertionError e) { + // Expected exception; do nothing. + } + try { + batch.getValueRow(-1); + Assert.fail("Should not be able to get row -1"); + } catch (AssertionError e) { + // Expected exception; do nothing. + } + try { + batch.getKeyRow(0); + Assert.fail("Should not be able to get row 0 when batch is empty"); + } catch (AssertionError e) { + // Expected exception; do nothing. + } + try { + batch.getValueRow(0); + Assert.fail("Should not be able to get row 0 when batch is empty"); + } catch (AssertionError e) { + // Expected exception; do nothing. + } + Assert.assertFalse(batch.rowIterator().next()); + } finally { + batch.close(); + } + } + + @Test + public void batchType() throws Exception { + RowBasedKeyValueBatch batch1 = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + RowBasedKeyValueBatch batch2 = RowBasedKeyValueBatch.allocate(fixedKeySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + Assert.assertEquals(batch1.getClass(), VariableLengthRowBasedKeyValueBatch.class); + Assert.assertEquals(batch2.getClass(), FixedLengthRowBasedKeyValueBatch.class); + } finally { + batch1.close(); + batch2.close(); + } + } + + @Test + public void setAndRetrieve() { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + UnsafeRow ret1 = appendRow(batch, makeKeyRow(1, "A"), makeValueRow(1, 1)); + Assert.assertTrue(checkValue(ret1, 1, 1)); + UnsafeRow ret2 = appendRow(batch, makeKeyRow(2, "B"), makeValueRow(2, 2)); + Assert.assertTrue(checkValue(ret2, 2, 2)); + UnsafeRow ret3 = appendRow(batch, makeKeyRow(3, "C"), makeValueRow(3, 3)); + Assert.assertTrue(checkValue(ret3, 3, 3)); + Assert.assertEquals(3, batch.numRows()); + UnsafeRow retrievedKey1 = batch.getKeyRow(0); + Assert.assertTrue(checkKey(retrievedKey1, 1, "A")); + UnsafeRow retrievedKey2 = batch.getKeyRow(1); + Assert.assertTrue(checkKey(retrievedKey2, 2, "B")); + UnsafeRow retrievedValue1 = batch.getValueRow(1); + Assert.assertTrue(checkValue(retrievedValue1, 2, 2)); + UnsafeRow retrievedValue2 = batch.getValueRow(2); + Assert.assertTrue(checkValue(retrievedValue2, 3, 3)); + try { + batch.getKeyRow(3); + Assert.fail("Should not be able to get row 3"); + } catch (AssertionError e) { + // Expected exception; do nothing. + } + try { + batch.getValueRow(3); + Assert.fail("Should not be able to get row 3"); + } catch (AssertionError e) { + // Expected exception; do nothing. + } + } finally { + batch.close(); + } + } + + @Test + public void setUpdateAndRetrieve() { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + appendRow(batch, makeKeyRow(1, "A"), makeValueRow(1, 1)); + Assert.assertEquals(1, batch.numRows()); + UnsafeRow retrievedValue = batch.getValueRow(0); + updateValueRow(retrievedValue, 2, 2); + UnsafeRow retrievedValue2 = batch.getValueRow(0); + Assert.assertTrue(checkValue(retrievedValue2, 2, 2)); + } finally { + batch.close(); + } + } + + + @Test + public void iteratorTest() throws Exception { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + appendRow(batch, makeKeyRow(1, "A"), makeValueRow(1, 1)); + appendRow(batch, makeKeyRow(2, "B"), makeValueRow(2, 2)); + appendRow(batch, makeKeyRow(3, "C"), makeValueRow(3, 3)); + Assert.assertEquals(3, batch.numRows()); + org.apache.spark.unsafe.KVIterator iterator + = batch.rowIterator(); + Assert.assertTrue(iterator.next()); + UnsafeRow key1 = iterator.getKey(); + UnsafeRow value1 = iterator.getValue(); + Assert.assertTrue(checkKey(key1, 1, "A")); + Assert.assertTrue(checkValue(value1, 1, 1)); + Assert.assertTrue(iterator.next()); + UnsafeRow key2 = iterator.getKey(); + UnsafeRow value2 = iterator.getValue(); + Assert.assertTrue(checkKey(key2, 2, "B")); + Assert.assertTrue(checkValue(value2, 2, 2)); + Assert.assertTrue(iterator.next()); + UnsafeRow key3 = iterator.getKey(); + UnsafeRow value3 = iterator.getValue(); + Assert.assertTrue(checkKey(key3, 3, "C")); + Assert.assertTrue(checkValue(value3, 3, 3)); + Assert.assertFalse(iterator.next()); + } finally { + batch.close(); + } + } + + @Test + public void fixedLengthTest() throws Exception { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(fixedKeySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + appendRow(batch, makeKeyRow(11, 11), makeValueRow(1, 1)); + appendRow(batch, makeKeyRow(22, 22), makeValueRow(2, 2)); + appendRow(batch, makeKeyRow(33, 33), makeValueRow(3, 3)); + UnsafeRow retrievedKey1 = batch.getKeyRow(0); + Assert.assertTrue(checkKey(retrievedKey1, 11, 11)); + UnsafeRow retrievedKey2 = batch.getKeyRow(1); + Assert.assertTrue(checkKey(retrievedKey2, 22, 22)); + UnsafeRow retrievedValue1 = batch.getValueRow(1); + Assert.assertTrue(checkValue(retrievedValue1, 2, 2)); + UnsafeRow retrievedValue2 = batch.getValueRow(2); + Assert.assertTrue(checkValue(retrievedValue2, 3, 3)); + Assert.assertEquals(3, batch.numRows()); + org.apache.spark.unsafe.KVIterator iterator + = batch.rowIterator(); + Assert.assertTrue(iterator.next()); + UnsafeRow key1 = iterator.getKey(); + UnsafeRow value1 = iterator.getValue(); + Assert.assertTrue(checkKey(key1, 11, 11)); + Assert.assertTrue(checkValue(value1, 1, 1)); + Assert.assertTrue(iterator.next()); + UnsafeRow key2 = iterator.getKey(); + UnsafeRow value2 = iterator.getValue(); + Assert.assertTrue(checkKey(key2, 22, 22)); + Assert.assertTrue(checkValue(value2, 2, 2)); + Assert.assertTrue(iterator.next()); + UnsafeRow key3 = iterator.getKey(); + UnsafeRow value3 = iterator.getValue(); + Assert.assertTrue(checkKey(key3, 33, 33)); + Assert.assertTrue(checkValue(value3, 3, 3)); + Assert.assertFalse(iterator.next()); + } finally { + batch.close(); + } + } + + @Test + public void appendRowUntilExceedingCapacity() throws Exception { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, 10); + try { + UnsafeRow key = makeKeyRow(1, "A"); + UnsafeRow value = makeValueRow(1, 1); + for (int i = 0; i < 10; i++) { + appendRow(batch, key, value); + } + UnsafeRow ret = appendRow(batch, key, value); + Assert.assertEquals(batch.numRows(), 10); + Assert.assertNull(ret); + org.apache.spark.unsafe.KVIterator iterator + = batch.rowIterator(); + for (int i = 0; i < 10; i++) { + Assert.assertTrue(iterator.next()); + UnsafeRow key1 = iterator.getKey(); + UnsafeRow value1 = iterator.getValue(); + Assert.assertTrue(checkKey(key1, 1, "A")); + Assert.assertTrue(checkValue(value1, 1, 1)); + } + Assert.assertFalse(iterator.next()); + } finally { + batch.close(); + } + } + + @Test + public void appendRowUntilExceedingPageSize() throws Exception { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, 64 * 1024 * 1024); //enough capacity + try { + UnsafeRow key = makeKeyRow(1, "A"); + UnsafeRow value = makeValueRow(1, 1); + int recordLength = 8 + key.getSizeInBytes() + value.getSizeInBytes() + 8; + int totalSize = 4; + int numRows = 0; + while (totalSize + recordLength < 64 * 1024 * 1024) { // default page size + appendRow(batch, key, value); + totalSize += recordLength; + numRows++; + } + UnsafeRow ret = appendRow(batch, key, value); + Assert.assertEquals(batch.numRows(), numRows); + Assert.assertNull(ret); + org.apache.spark.unsafe.KVIterator iterator + = batch.rowIterator(); + for (int i = 0; i < numRows; i++) { + Assert.assertTrue(iterator.next()); + UnsafeRow key1 = iterator.getKey(); + UnsafeRow value1 = iterator.getValue(); + Assert.assertTrue(checkKey(key1, 1, "A")); + Assert.assertTrue(checkValue(value1, 1, 1)); + } + Assert.assertFalse(iterator.next()); + } finally { + batch.close(); + } + } + + @Test + public void failureToAllocateFirstPage() throws Exception { + memoryManager.limit(1024); + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + try { + UnsafeRow key = makeKeyRow(1, "A"); + UnsafeRow value = makeValueRow(11, 11); + UnsafeRow ret = appendRow(batch, key, value); + Assert.assertNull(ret); + Assert.assertFalse(batch.rowIterator().next()); + } finally { + batch.close(); + } + } + + @Test + public void randomizedTest() { + RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + int numEntry = 100; + long[] expectedK1 = new long[numEntry]; + String[] expectedK2 = new String[numEntry]; + long[] expectedV1 = new long[numEntry]; + long[] expectedV2 = new long[numEntry]; + + for (int i = 0; i < numEntry; i++) { + long k1 = rand.nextLong(); + String k2 = getRandomString(rand.nextInt(256)); + long v1 = rand.nextLong(); + long v2 = rand.nextLong(); + appendRow(batch, makeKeyRow(k1, k2), makeValueRow(v1, v2)); + expectedK1[i] = k1; + expectedK2[i] = k2; + expectedV1[i] = v1; + expectedV2[i] = v2; + } + try { + for (int j = 0; j < 10000; j++) { + int rowId = rand.nextInt(numEntry); + if (rand.nextBoolean()) { + UnsafeRow key = batch.getKeyRow(rowId); + Assert.assertTrue(checkKey(key, expectedK1[rowId], expectedK2[rowId])); + } + if (rand.nextBoolean()) { + UnsafeRow value = batch.getValueRow(rowId); + Assert.assertTrue(checkValue(value, expectedV1[rowId], expectedV2[rowId])); + } + } + } finally { + batch.close(); + } + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index a9268535c40a..3a0dcea903db 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -399,11 +399,11 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac // alter other storage information catalog.alterPartitions("db2", "tbl2", Seq( oldPart1.copy(storage = storageFormat.copy(serde = Some(newSerde))), - oldPart2.copy(storage = storageFormat.copy(serdeProperties = newSerdeProps)))) + oldPart2.copy(storage = storageFormat.copy(properties = newSerdeProps)))) val newPart1b = catalog.getPartition("db2", "tbl2", part1.spec) val newPart2b = catalog.getPartition("db2", "tbl2", part2.spec) assert(newPart1b.storage.serde == Some(newSerde)) - assert(newPart2b.storage.serdeProperties == newSerdeProps) + assert(newPart2b.storage.properties == newSerdeProps) // alter but change spec, should fail because new partition specs do not exist yet val badPart1 = part1.copy(spec = Map("a" -> "v1", "b" -> "v2")) val badPart2 = part2.copy(spec = Map("a" -> "v3", "b" -> "v4")) @@ -634,7 +634,7 @@ abstract class CatalogTestUtils { outputFormat = Some(tableOutputFormat), serde = None, compressed = false, - serdeProperties = Map.empty) + properties = Map.empty) lazy val part1 = CatalogTablePartition(Map("a" -> "1", "b" -> "2"), storageFormat) lazy val part2 = CatalogTablePartition(Map("a" -> "3", "b" -> "4"), storageFormat) lazy val part3 = CatalogTablePartition(Map("a" -> "5", "b" -> "6"), storageFormat) @@ -692,7 +692,7 @@ abstract class CatalogTestUtils { CatalogColumn("a", "int"), CatalogColumn("b", "string")), partitionColumnNames = Seq("a", "b"), - bucketColumnNames = Seq("col1")) + bucketSpec = Some(BucketSpec(4, Seq("col1"), Nil))) } def newFunc(name: String, database: Option[String] = None): CatalogFunction = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index ec7be4d4b849..0c307b2b8576 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -246,4 +246,40 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { checkMetadata(CreateStructUnsafe(Seq(a, b))) checkMetadata(CreateNamedStructUnsafe(Seq("a", a, "b", b))) } + + test("StringToMap") { + val s0 = Literal("a:1,b:2,c:3") + val m0 = Map("a" -> "1", "b" -> "2", "c" -> "3") + checkEvaluation(new StringToMap(s0), m0) + + val s1 = Literal("a: ,b:2") + val m1 = Map("a" -> " ", "b" -> "2") + checkEvaluation(new StringToMap(s1), m1) + + val s2 = Literal("a=1,b=2,c=3") + val m2 = Map("a" -> "1", "b" -> "2", "c" -> "3") + checkEvaluation(StringToMap(s2, Literal(","), Literal("=")), m2) + + val s3 = Literal("") + val m3 = Map[String, String]("" -> null) + checkEvaluation(StringToMap(s3, Literal(","), Literal("=")), m3) + + val s4 = Literal("a:1_b:2_c:3") + val m4 = Map("a" -> "1", "b" -> "2", "c" -> "3") + checkEvaluation(new StringToMap(s4, Literal("_")), m4) + + // arguments checking + assert(new StringToMap(Literal("a:1,b:2,c:3")).checkInputDataTypes().isSuccess) + assert(new StringToMap(Literal(null)).checkInputDataTypes().isFailure) + assert(new StringToMap(Literal("a:1,b:2,c:3"), Literal(null)).checkInputDataTypes().isFailure) + assert(StringToMap(Literal("a:1,b:2,c:3"), Literal(null), Literal(null)) + .checkInputDataTypes().isFailure) + assert(new StringToMap(Literal(null), Literal(null)).checkInputDataTypes().isFailure) + + assert(new StringToMap(Literal("a:1_b:2_c:3"), NonFoldableLiteral("_")) + .checkInputDataTypes().isFailure) + assert( + new StringToMap(Literal("a=1_b=2_c=3"), Literal("_"), NonFoldableLiteral("=")) + .checkInputDataTypes().isFailure) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala index ace6c15dc841..712fe35f477b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala @@ -77,6 +77,21 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("SPARK-16602 Nvl should support numeric-string cases") { + val intLit = Literal.create(1, IntegerType) + val doubleLit = Literal.create(2.2, DoubleType) + val stringLit = Literal.create("c", StringType) + val nullLit = Literal.create(null, NullType) + + assert(Nvl(intLit, doubleLit).replaceForTypeCoercion().dataType == DoubleType) + assert(Nvl(intLit, stringLit).replaceForTypeCoercion().dataType == StringType) + assert(Nvl(stringLit, doubleLit).replaceForTypeCoercion().dataType == StringType) + + assert(Nvl(nullLit, intLit).replaceForTypeCoercion().dataType == IntegerType) + assert(Nvl(doubleLit, nullLit).replaceForTypeCoercion().dataType == DoubleType) + assert(Nvl(nullLit, stringLit).replaceForTypeCoercion().dataType == StringType) + } + test("AtLeastNNonNulls") { val mix = Seq(Literal("x"), Literal.create(null, StringType), diff --git a/dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala similarity index 51% rename from dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index 9f7ae75d0b47..ee65826cd57f 100644 --- a/dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -15,27 +15,21 @@ * limitations under the License. */ -// scalastyle:off println -package main.scala +package org.apache.spark.sql.catalyst.expressions -import scala.util.Try +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.objects.Invoke +import org.apache.spark.sql.types.{IntegerType, ObjectType} -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -object SimpleApp { - def main(args: Array[String]) { - // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue - val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess - val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess - if (!foundConsole) { - println("Console sink not loaded via spark-core") - System.exit(-1) - } - if (!foundGanglia) { - println("Ganglia sink not loaded via spark-ganglia-lgpl") - System.exit(-1) - } +class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { + + test("SPARK-16622: The returned value of the called method in Invoke can be null") { + val inputRow = InternalRow.fromSeq(Seq((false, null))) + val cls = classOf[Tuple2[Boolean, java.lang.Integer]] + val inputObject = BoundReference(0, ObjectType(cls), nullable = true) + val invoke = Invoke(inputObject, "_2", IntegerType) + checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) } } -// scalastyle:on println diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala new file mode 100644 index 000000000000..5299549e7b4d --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.types.StringType + +/** + * Unit tests for regular expression (regexp) related SQL expressions. + */ +class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { + + test("LIKE literal Regular Expression") { + checkEvaluation(Literal.create(null, StringType).like("a"), null) + checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null) + checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null) + checkEvaluation( + Literal.create("a", StringType).like(NonFoldableLiteral.create("a", StringType)), true) + checkEvaluation( + Literal.create("a", StringType).like(NonFoldableLiteral.create(null, StringType)), null) + checkEvaluation( + Literal.create(null, StringType).like(NonFoldableLiteral.create("a", StringType)), null) + checkEvaluation( + Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null) + + checkEvaluation("abdef" like "abdef", true) + checkEvaluation("a_%b" like "a\\__b", true) + checkEvaluation("addb" like "a_%b", true) + checkEvaluation("addb" like "a\\__b", false) + checkEvaluation("addb" like "a%\\%b", false) + checkEvaluation("a_%b" like "a%\\%b", true) + checkEvaluation("addb" like "a%", true) + checkEvaluation("addb" like "**", false) + checkEvaluation("abc" like "a%", true) + checkEvaluation("abc" like "b%", false) + checkEvaluation("abc" like "bc%", false) + checkEvaluation("a\nb" like "a_b", true) + checkEvaluation("ab" like "a%b", true) + checkEvaluation("a\nb" like "a%b", true) + } + + test("LIKE Non-literal Regular Expression") { + val regEx = 'a.string.at(0) + checkEvaluation("abcd" like regEx, null, create_row(null)) + checkEvaluation("abdef" like regEx, true, create_row("abdef")) + checkEvaluation("a_%b" like regEx, true, create_row("a\\__b")) + checkEvaluation("addb" like regEx, true, create_row("a_%b")) + checkEvaluation("addb" like regEx, false, create_row("a\\__b")) + checkEvaluation("addb" like regEx, false, create_row("a%\\%b")) + checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b")) + checkEvaluation("addb" like regEx, true, create_row("a%")) + checkEvaluation("addb" like regEx, false, create_row("**")) + checkEvaluation("abc" like regEx, true, create_row("a%")) + checkEvaluation("abc" like regEx, false, create_row("b%")) + checkEvaluation("abc" like regEx, false, create_row("bc%")) + checkEvaluation("a\nb" like regEx, true, create_row("a_b")) + checkEvaluation("ab" like regEx, true, create_row("a%b")) + checkEvaluation("a\nb" like regEx, true, create_row("a%b")) + + checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%")) + } + + test("RLIKE literal Regular Expression") { + checkEvaluation(Literal.create(null, StringType) rlike "abdef", null) + checkEvaluation("abdef" rlike Literal.create(null, StringType), null) + checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null) + checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true) + checkEvaluation("abdef" rlike NonFoldableLiteral.create(null, StringType), null) + checkEvaluation( + Literal.create(null, StringType) rlike NonFoldableLiteral.create("abdef", StringType), null) + checkEvaluation( + Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null) + + checkEvaluation("abdef" rlike "abdef", true) + checkEvaluation("abbbbc" rlike "a.*c", true) + + checkEvaluation("fofo" rlike "^fo", true) + checkEvaluation("fo\no" rlike "^fo\no$", true) + checkEvaluation("Bn" rlike "^Ba*n", true) + checkEvaluation("afofo" rlike "fo", true) + checkEvaluation("afofo" rlike "^fo", false) + checkEvaluation("Baan" rlike "^Ba?n", false) + checkEvaluation("axe" rlike "pi|apa", false) + checkEvaluation("pip" rlike "^(pi)*$", false) + + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) + + intercept[java.util.regex.PatternSyntaxException] { + evaluate("abbbbc" rlike "**") + } + } + + test("RLIKE Non-literal Regular Expression") { + val regEx = 'a.string.at(0) + checkEvaluation("abdef" rlike regEx, true, create_row("abdef")) + checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c")) + checkEvaluation("fofo" rlike regEx, true, create_row("^fo")) + checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$")) + checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n")) + + intercept[java.util.regex.PatternSyntaxException] { + evaluate("abbbbc" rlike regEx, create_row("**")) + } + } + + + test("RegexReplace") { + val row1 = create_row("100-200", "(\\d+)", "num") + val row2 = create_row("100-200", "(\\d+)", "###") + val row3 = create_row("100-200", "(-)", "###") + val row4 = create_row(null, "(\\d+)", "###") + val row5 = create_row("100-200", null, "###") + val row6 = create_row("100-200", "(-)", null) + + val s = 's.string.at(0) + val p = 'p.string.at(1) + val r = 'r.string.at(2) + + val expr = RegExpReplace(s, p, r) + checkEvaluation(expr, "num-num", row1) + checkEvaluation(expr, "###-###", row2) + checkEvaluation(expr, "100###200", row3) + checkEvaluation(expr, null, row4) + checkEvaluation(expr, null, row5) + checkEvaluation(expr, null, row6) + + val nonNullExpr = RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num")) + checkEvaluation(nonNullExpr, "num-num", row1) + } + + test("RegexExtract") { + val row1 = create_row("100-200", "(\\d+)-(\\d+)", 1) + val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2) + val row3 = create_row("100-200", "(\\d+).*", 1) + val row4 = create_row("100-200", "([a-z])", 1) + val row5 = create_row(null, "([a-z])", 1) + val row6 = create_row("100-200", null, 1) + val row7 = create_row("100-200", "([a-z])", null) + + val s = 's.string.at(0) + val p = 'p.string.at(1) + val r = 'r.int.at(2) + + val expr = RegExpExtract(s, p, r) + checkEvaluation(expr, "100", row1) + checkEvaluation(expr, "200", row2) + checkEvaluation(expr, "100", row3) + checkEvaluation(expr, "", row4) // will not match anything, empty string get + checkEvaluation(expr, null, row5) + checkEvaluation(expr, null, row6) + checkEvaluation(expr, null, row7) + + val expr1 = new RegExpExtract(s, p) + checkEvaluation(expr1, "100", row1) + + val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1)) + checkEvaluation(nonNullExpr, "100", row1) + } + + test("SPLIT") { + val s1 = 'a.string.at(0) + val s2 = 'b.string.at(1) + val row1 = create_row("aa2bb3cc", "[1-9]+") + val row2 = create_row(null, "[1-9]+") + val row3 = create_row("aa2bb3cc", null) + + checkEvaluation( + StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1) + checkEvaluation( + StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1) + checkEvaluation(StringSplit(s1, s2), null, row2) + checkEvaluation(StringSplit(s1, s2), null, row3) + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 5b9ed83a4eb3..fdb9fa31f09c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -254,102 +254,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { SubstringIndex(Literal("www||apache||org"), Literal( "||"), Literal(2)), "www||apache") } - test("LIKE literal Regular Expression") { - checkEvaluation(Literal.create(null, StringType).like("a"), null) - checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null) - checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null) - checkEvaluation( - Literal.create("a", StringType).like(NonFoldableLiteral.create("a", StringType)), true) - checkEvaluation( - Literal.create("a", StringType).like(NonFoldableLiteral.create(null, StringType)), null) - checkEvaluation( - Literal.create(null, StringType).like(NonFoldableLiteral.create("a", StringType)), null) - checkEvaluation( - Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null) - - checkEvaluation("abdef" like "abdef", true) - checkEvaluation("a_%b" like "a\\__b", true) - checkEvaluation("addb" like "a_%b", true) - checkEvaluation("addb" like "a\\__b", false) - checkEvaluation("addb" like "a%\\%b", false) - checkEvaluation("a_%b" like "a%\\%b", true) - checkEvaluation("addb" like "a%", true) - checkEvaluation("addb" like "**", false) - checkEvaluation("abc" like "a%", true) - checkEvaluation("abc" like "b%", false) - checkEvaluation("abc" like "bc%", false) - checkEvaluation("a\nb" like "a_b", true) - checkEvaluation("ab" like "a%b", true) - checkEvaluation("a\nb" like "a%b", true) - } - - test("LIKE Non-literal Regular Expression") { - val regEx = 'a.string.at(0) - checkEvaluation("abcd" like regEx, null, create_row(null)) - checkEvaluation("abdef" like regEx, true, create_row("abdef")) - checkEvaluation("a_%b" like regEx, true, create_row("a\\__b")) - checkEvaluation("addb" like regEx, true, create_row("a_%b")) - checkEvaluation("addb" like regEx, false, create_row("a\\__b")) - checkEvaluation("addb" like regEx, false, create_row("a%\\%b")) - checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b")) - checkEvaluation("addb" like regEx, true, create_row("a%")) - checkEvaluation("addb" like regEx, false, create_row("**")) - checkEvaluation("abc" like regEx, true, create_row("a%")) - checkEvaluation("abc" like regEx, false, create_row("b%")) - checkEvaluation("abc" like regEx, false, create_row("bc%")) - checkEvaluation("a\nb" like regEx, true, create_row("a_b")) - checkEvaluation("ab" like regEx, true, create_row("a%b")) - checkEvaluation("a\nb" like regEx, true, create_row("a%b")) - - checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%")) - } - - test("RLIKE literal Regular Expression") { - checkEvaluation(Literal.create(null, StringType) rlike "abdef", null) - checkEvaluation("abdef" rlike Literal.create(null, StringType), null) - checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null) - checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true) - checkEvaluation("abdef" rlike NonFoldableLiteral.create(null, StringType), null) - checkEvaluation( - Literal.create(null, StringType) rlike NonFoldableLiteral.create("abdef", StringType), null) - checkEvaluation( - Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null) - - checkEvaluation("abdef" rlike "abdef", true) - checkEvaluation("abbbbc" rlike "a.*c", true) - - checkEvaluation("fofo" rlike "^fo", true) - checkEvaluation("fo\no" rlike "^fo\no$", true) - checkEvaluation("Bn" rlike "^Ba*n", true) - checkEvaluation("afofo" rlike "fo", true) - checkEvaluation("afofo" rlike "^fo", false) - checkEvaluation("Baan" rlike "^Ba?n", false) - checkEvaluation("axe" rlike "pi|apa", false) - checkEvaluation("pip" rlike "^(pi)*$", false) - - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) - - intercept[java.util.regex.PatternSyntaxException] { - evaluate("abbbbc" rlike "**") - } - } - - test("RLIKE Non-literal Regular Expression") { - val regEx = 'a.string.at(0) - checkEvaluation("abdef" rlike regEx, true, create_row("abdef")) - checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c")) - checkEvaluation("fofo" rlike regEx, true, create_row("^fo")) - checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$")) - checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n")) - - intercept[java.util.regex.PatternSyntaxException] { - evaluate("abbbbc" rlike regEx, create_row("**")) - } - } - test("ascii for string") { val a = 'a.string.at(0) checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef")) @@ -612,74 +516,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringSpace(s1), null, row2) } - test("RegexReplace") { - val row1 = create_row("100-200", "(\\d+)", "num") - val row2 = create_row("100-200", "(\\d+)", "###") - val row3 = create_row("100-200", "(-)", "###") - val row4 = create_row(null, "(\\d+)", "###") - val row5 = create_row("100-200", null, "###") - val row6 = create_row("100-200", "(-)", null) - - val s = 's.string.at(0) - val p = 'p.string.at(1) - val r = 'r.string.at(2) - - val expr = RegExpReplace(s, p, r) - checkEvaluation(expr, "num-num", row1) - checkEvaluation(expr, "###-###", row2) - checkEvaluation(expr, "100###200", row3) - checkEvaluation(expr, null, row4) - checkEvaluation(expr, null, row5) - checkEvaluation(expr, null, row6) - - val nonNullExpr = RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num")) - checkEvaluation(nonNullExpr, "num-num", row1) - } - - test("RegexExtract") { - val row1 = create_row("100-200", "(\\d+)-(\\d+)", 1) - val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2) - val row3 = create_row("100-200", "(\\d+).*", 1) - val row4 = create_row("100-200", "([a-z])", 1) - val row5 = create_row(null, "([a-z])", 1) - val row6 = create_row("100-200", null, 1) - val row7 = create_row("100-200", "([a-z])", null) - - val s = 's.string.at(0) - val p = 'p.string.at(1) - val r = 'r.int.at(2) - - val expr = RegExpExtract(s, p, r) - checkEvaluation(expr, "100", row1) - checkEvaluation(expr, "200", row2) - checkEvaluation(expr, "100", row3) - checkEvaluation(expr, "", row4) // will not match anything, empty string get - checkEvaluation(expr, null, row5) - checkEvaluation(expr, null, row6) - checkEvaluation(expr, null, row7) - - val expr1 = new RegExpExtract(s, p) - checkEvaluation(expr1, "100", row1) - - val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1)) - checkEvaluation(nonNullExpr, "100", row1) - } - - test("SPLIT") { - val s1 = 'a.string.at(0) - val s2 = 'b.string.at(1) - val row1 = create_row("aa2bb3cc", "[1-9]+") - val row2 = create_row(null, "[1-9]+") - val row3 = create_row("aa2bb3cc", null) - - checkEvaluation( - StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1) - checkEvaluation( - StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1) - checkEvaluation(StringSplit(s1, s2), null, row2) - checkEvaluation(StringSplit(s1, s2), null, row3) - } - test("length for string / binary") { val a = 'a.string.at(0) val b = 'b.binary.at(0) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 9cb49e74ad34..780e78ed1cf2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -531,14 +531,14 @@ class FilterPushdownSuite extends PlanTest { val originalQuery = { testRelationWithArrayType .generate(Explode('c_arr), true, false, Some("arr")) - .where(('b >= 5) && ('a + Rand(10).as("rnd") > 6)) + .where(('b >= 5) && ('a + Rand(10).as("rnd") > 6) && ('c > 6)) } val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = { testRelationWithArrayType .where('b >= 5) .generate(Explode('c_arr), true, false, Some("arr")) - .where('a + Rand(10).as("rnd") > 6) + .where('a + Rand(10).as("rnd") > 6 && 'c > 6) .analyze } @@ -715,14 +715,14 @@ class FilterPushdownSuite extends PlanTest { val testRelation2 = LocalRelation('d.int, 'e.int, 'f.int) val originalQuery = Union(Seq(testRelation, testRelation2)) - .where('a === 2L && 'b + Rand(10).as("rnd") === 3) + .where('a === 2L && 'b + Rand(10).as("rnd") === 3 && 'c > 5L) val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = Union(Seq( testRelation.where('a === 2L), testRelation2.where('d === 2L))) - .where('b + Rand(10).as("rnd") === 3) + .where('b + Rand(10).as("rnd") === 3 && 'c > 5L) .analyze comparePlans(optimized, correctAnswer) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala new file mode 100644 index 000000000000..7c26cb5598b3 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.types.MetadataBuilder + +class RemoveAliasOnlyProjectSuite extends PlanTest with PredicateHelper { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("RemoveAliasOnlyProject", FixedPoint(50), RemoveAliasOnlyProject) :: Nil + } + + test("all expressions in project list are aliased child output") { + val relation = LocalRelation('a.int, 'b.int) + val query = relation.select('a as 'a, 'b as 'b).analyze + val optimized = Optimize.execute(query) + comparePlans(optimized, relation) + } + + test("all expressions in project list are aliased child output but with different order") { + val relation = LocalRelation('a.int, 'b.int) + val query = relation.select('b as 'b, 'a as 'a).analyze + val optimized = Optimize.execute(query) + comparePlans(optimized, query) + } + + test("some expressions in project list are aliased child output") { + val relation = LocalRelation('a.int, 'b.int) + val query = relation.select('a as 'a, 'b).analyze + val optimized = Optimize.execute(query) + comparePlans(optimized, relation) + } + + test("some expressions in project list are aliased child output but with different order") { + val relation = LocalRelation('a.int, 'b.int) + val query = relation.select('b as 'b, 'a).analyze + val optimized = Optimize.execute(query) + comparePlans(optimized, query) + } + + test("some expressions in project list are not Alias or Attribute") { + val relation = LocalRelation('a.int, 'b.int) + val query = relation.select('a as 'a, 'b + 1).analyze + val optimized = Optimize.execute(query) + comparePlans(optimized, query) + } + + test("some expressions in project list are aliased child output but with metadata") { + val relation = LocalRelation('a.int, 'b.int) + val metadata = new MetadataBuilder().putString("x", "y").build() + val aliasWithMeta = Alias('a, "a")(explicitMetadata = Some(metadata)) + val query = relation.select(aliasWithMeta, 'b).analyze + val optimized = Optimize.execute(query) + comparePlans(optimized, query) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 0b73b5e009b7..5a76969235ac 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -79,13 +79,15 @@ class ConstraintPropagationSuite extends SparkFunSuite { assert(tr.analyze.constraints.isEmpty) val aliasedRelation = tr.where('c.attr > 10 && 'a.attr < 5) - .groupBy('a, 'c, 'b)('a, 'c.as("c1"), count('a).as("a3")).select('c1, 'a).analyze + .groupBy('a, 'c, 'b)('a, 'c.as("c1"), count('a).as("a3")).select('c1, 'a, 'a3).analyze + // SPARK-16644: aggregate expression count(a) should not appear in the constraints. verifyConstraints(aliasedRelation.analyze.constraints, ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "c1") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "c1")), resolveColumn(aliasedRelation.analyze, "a") < 5, - IsNotNull(resolveColumn(aliasedRelation.analyze, "a"))))) + IsNotNull(resolveColumn(aliasedRelation.analyze, "a")), + IsNotNull(resolveColumn(aliasedRelation.analyze, "a3"))))) } test("propagating constraints in expand") { diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 627558dd32c3..b2752638bebd 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-sql_2.11 jar Spark Project SQL diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index d823275d857b..04752ec5fe7b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -60,6 +60,7 @@ import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Types; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; /** * Base class for custom RecordReaders for Parquet that directly materialize to `T`. @@ -136,7 +137,9 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont ReadSupport.ReadContext readContext = readSupport.init(new InitContext( taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema)); this.requestedSchema = readContext.getRequestedSchema(); - this.sparkSchema = new ParquetSchemaConverter(configuration).convert(requestedSchema); + String sparkRequestedSchemaString = + configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA()); + this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString); this.reader = new ParquetFileReader(configuration, file, blocks, requestedSchema.getColumns()); for (BlockMetaData block : blocks) { this.totalRowCount += block.getRowCount(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index a18b881c78a0..6c47dc09a863 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -59,7 +59,7 @@ public class VectorizedColumnReader { /** * If true, the current page is dictionary encoded. */ - private boolean useDictionary; + private boolean isCurrentPageDictionaryEncoded; /** * Maximum definition level for this column. @@ -100,13 +100,13 @@ public VectorizedColumnReader(ColumnDescriptor descriptor, PageReader pageReader if (dictionaryPage != null) { try { this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage); - this.useDictionary = true; + this.isCurrentPageDictionaryEncoded = true; } catch (IOException e) { throw new IOException("could not decode the dictionary for " + descriptor, e); } } else { this.dictionary = null; - this.useDictionary = false; + this.isCurrentPageDictionaryEncoded = false; } this.totalValueCount = pageReader.getTotalValueCount(); if (totalValueCount == 0) { @@ -136,6 +136,13 @@ private boolean next() throws IOException { */ void readBatch(int total, ColumnVector column) throws IOException { int rowId = 0; + ColumnVector dictionaryIds = null; + if (dictionary != null) { + // SPARK-16334: We only maintain a single dictionary per row batch, so that it can be used to + // decode all previous dictionary encoded pages if we ever encounter a non-dictionary encoded + // page. + dictionaryIds = column.reserveDictionaryIds(total); + } while (total > 0) { // Compute the number of values we want to read in this page. int leftInPage = (int) (endOfPageValueCount - valuesRead); @@ -144,12 +151,10 @@ void readBatch(int total, ColumnVector column) throws IOException { leftInPage = (int) (endOfPageValueCount - valuesRead); } int num = Math.min(total, leftInPage); - if (useDictionary) { + if (isCurrentPageDictionaryEncoded) { // Read and decode dictionary ids. - ColumnVector dictionaryIds = column.reserveDictionaryIds(total); defColumn.readIntegers( num, dictionaryIds, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); - if (column.hasDictionary() || (rowId == 0 && (descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT32 || descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT64 || @@ -461,13 +466,13 @@ private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset) thr throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); } this.dataColumn = new VectorizedRleValuesReader(); - this.useDictionary = true; + this.isCurrentPageDictionaryEncoded = true; } else { if (dataEncoding != Encoding.PLAIN) { throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); } this.dataColumn = new VectorizedPlainValuesReader(); - this.useDictionary = false; + this.isCurrentPageDictionaryEncoded = false; } try { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 12b304623d30..44189881ddd0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -23,8 +23,9 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Project} -import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsingAsSelect, DataSource, HadoopFsRelation} +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable +import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, DataSource, HadoopFsRelation} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils /** @@ -387,12 +388,25 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash * your external database systems. * + * You can set the following JDBC-specific option(s) for storing JDBC: + *

  • `truncate` (default `false`): use `TRUNCATE TABLE` instead of `DROP TABLE`.
  • + * + * In case of failures, users should turn off `truncate` option to use `DROP TABLE` again. Also, + * due to the different behavior of `TRUNCATE TABLE` among DBMS, it's not always safe to use this. + * MySQLDialect, DB2Dialect, MsSqlServerDialect, DerbyDialect, and OracleDialect supports this + * while PostgresDialect and default JDBCDirect doesn't. For unknown and unsupported JDBCDirect, + * the user option `truncate` is ignored. + * * @param url JDBC database url of the form `jdbc:subprotocol:subname` * @param table Name of the table in the external database. * @param connectionProperties JDBC database connection arguments, a list of arbitrary string * tag/value. Normally at least a "user" and "password" property * should be included. "batchsize" can be used to control the - * number of rows per insert. + * number of rows per insert. "isolationLevel" can be one of + * "NONE", "READ_COMMITTED", "READ_UNCOMMITTED", "REPEATABLE_READ", + * or "SERIALIZABLE", corresponding to standard transaction + * isolation levels defined by JDBC's Connection object, with default + * of "READ_UNCOMMITTED". * @since 1.4.0 */ def jdbc(url: String, table: String, connectionProperties: Properties): Unit = { @@ -419,8 +433,13 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } if (mode == SaveMode.Overwrite && tableExists) { - JdbcUtils.dropTable(conn, table) - tableExists = false + if (extraOptions.getOrElse("truncate", "false").toBoolean && + JdbcUtils.isCascadingTruncateTable(url) == Some(false)) { + JdbcUtils.truncateTable(conn, table) + } else { + JdbcUtils.dropTable(conn, table) + tableExists = false + } } // Create the table if the table didn't exist. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index ed4ccdb4c8d4..8b6443c8b96f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2421,13 +2421,7 @@ class Dataset[T] private[sql]( */ @throws[AnalysisException] def createTempView(viewName: String): Unit = withPlan { - val tableDesc = CatalogTable( - identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName), - tableType = CatalogTableType.VIEW, - schema = Seq.empty[CatalogColumn], - storage = CatalogStorageFormat.empty) - CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = false, - isTemporary = true) + createViewCommand(viewName, replace = false) } /** @@ -2438,12 +2432,19 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def createOrReplaceTempView(viewName: String): Unit = withPlan { - val tableDesc = CatalogTable( - identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName), - tableType = CatalogTableType.VIEW, - schema = Seq.empty[CatalogColumn], - storage = CatalogStorageFormat.empty) - CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = true, + createViewCommand(viewName, replace = true) + } + + private def createViewCommand(viewName: String, replace: Boolean): CreateViewCommand = { + CreateViewCommand( + name = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName), + userSpecifiedColumns = Nil, + comment = None, + properties = Map.empty, + originalText = None, + child = logicalPlan, + allowExisting = false, + replace = replace, isTemporary = true) } @@ -2489,12 +2490,12 @@ class Dataset[T] private[sql]( val rdd: RDD[String] = queryExecution.toRdd.mapPartitions { iter => val writer = new CharArrayWriter() // create the Generator without separator inserted between 2 records - val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + val gen = new JacksonGenerator(rowSchema, writer) new Iterator[String] { override def hasNext: Boolean = iter.hasNext override def next(): String = { - JacksonGenerator(rowSchema, gen)(iter.next()) + gen.write(iter.next()) gen.flush() val json = writer.toString diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 1271d1c55bb3..2ade36d07502 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -25,7 +25,7 @@ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.{SPARK_VERSION, SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.JavaRDD import org.apache.spark.internal.Logging @@ -79,6 +79,13 @@ class SparkSession private( sparkContext.assertNotStopped() + /** + * The version of Spark on which this application is running. + * + * @since 2.0.0 + */ + def version: String = SPARK_VERSION + /* ----------------------- * | Session-related state | * ----------------------- */ @@ -108,9 +115,11 @@ class SparkSession private( /** * A wrapped version of this session in the form of a [[SQLContext]], for backward compatibility. + * + * @since 2.0.0 */ @transient - private[spark] val sqlContext: SQLContext = new SQLContext(this) + val sqlContext: SQLContext = new SQLContext(this) /** * Runtime configuration interface for Spark. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index c5f4d58da43a..5e1ad9b885b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -344,6 +344,11 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { table, provider, partitionColumnNames, bucketSpec, mode, options, query) } else { val struct = Option(ctx.colTypeList()).map(createStructType) + if (struct.isEmpty && bucketSpec.nonEmpty) { + throw new ParseException( + "Expected explicit specification of table schema when using CLUSTERED BY clause.", ctx) + } + CreateTableUsing( table, struct, @@ -963,7 +968,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { // whether to convert a table created by CTAS to a datasource table. serde = None, compressed = false, - serdeProperties = Map()) + properties = Map()) } validateRowFormatFileFormat(ctx.rowFormat, ctx.createFileFormat, ctx) val fileStorage = Option(ctx.createFileFormat).map(visitCreateFileFormat) @@ -981,7 +986,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { outputFormat = fileStorage.outputFormat.orElse(defaultStorage.outputFormat), serde = rowStorage.serde.orElse(fileStorage.serde).orElse(defaultStorage.serde), compressed = false, - serdeProperties = rowStorage.serdeProperties ++ fileStorage.serdeProperties) + properties = rowStorage.properties ++ fileStorage.properties) // If location is defined, we'll assume this is an external table. // Otherwise, we may accidentally delete existing data. val tableType = if (external || location.isDefined) { @@ -1140,7 +1145,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { import ctx._ CatalogStorageFormat.empty.copy( serde = Option(string(name)), - serdeProperties = Option(tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)) + properties = Option(tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)) } /** @@ -1168,7 +1173,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { ctx) "line.delim" -> value } - CatalogStorageFormat.empty.copy(serdeProperties = entries.toMap) + CatalogStorageFormat.empty.copy(properties = entries.toMap) } /** @@ -1230,20 +1235,21 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { if (ctx.identifierList != null) { operationNotAllowed("CREATE VIEW ... PARTITIONED ON", ctx) } else { - val identifiers = Option(ctx.identifierCommentList).toSeq.flatMap(_.identifierComment.asScala) - val schema = identifiers.map { ic => - CatalogColumn(ic.identifier.getText, null, nullable = true, Option(ic.STRING).map(string)) + val userSpecifiedColumns = Option(ctx.identifierCommentList).toSeq.flatMap { icl => + icl.identifierComment.asScala.map { ic => + ic.identifier.getText -> Option(ic.STRING).map(string) + } } createView( ctx, ctx.tableIdentifier, comment = Option(ctx.STRING).map(string), - schema, + userSpecifiedColumns, ctx.query, Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty), - ctx.EXISTS != null, - ctx.REPLACE != null, - ctx.TEMPORARY != null + allowExisting = ctx.EXISTS != null, + replace = ctx.REPLACE != null, + isTemporary = ctx.TEMPORARY != null ) } } @@ -1254,12 +1260,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { override def visitAlterViewQuery(ctx: AlterViewQueryContext): LogicalPlan = withOrigin(ctx) { createView( ctx, - ctx.tableIdentifier, + name = ctx.tableIdentifier, comment = None, - Seq.empty, - ctx.query, - Map.empty, - allowExist = false, + userSpecifiedColumns = Seq.empty, + query = ctx.query, + properties = Map.empty, + allowExisting = false, replace = true, isTemporary = false) } @@ -1271,23 +1277,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { ctx: ParserRuleContext, name: TableIdentifierContext, comment: Option[String], - schema: Seq[CatalogColumn], + userSpecifiedColumns: Seq[(String, Option[String])], query: QueryContext, properties: Map[String, String], - allowExist: Boolean, + allowExisting: Boolean, replace: Boolean, isTemporary: Boolean): LogicalPlan = { - val sql = Option(source(query)) - val tableDesc = CatalogTable( - identifier = visitTableIdentifier(name), - tableType = CatalogTableType.VIEW, - schema = schema, - storage = CatalogStorageFormat.empty, - properties = properties, - viewOriginalText = sql, - viewText = sql, - comment = comment) - CreateViewCommand(tableDesc, plan(query), allowExist, replace, isTemporary) + val originalText = source(query) + CreateViewCommand( + visitTableIdentifier(name), + userSpecifiedColumns, + comment, + properties, + Some(originalText), + plan(query), + allowExisting = allowExisting, + replace = replace, + isTemporary = isTemporary) } /** @@ -1325,7 +1331,10 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { // Decode and input/output format. type Format = (Seq[(String, String)], Option[String], Seq[(String, String)], Option[String]) - def format(fmt: RowFormatContext, configKey: String): Format = fmt match { + def format( + fmt: RowFormatContext, + configKey: String, + defaultConfigValue: String): Format = fmt match { case c: RowFormatDelimitedContext => // TODO we should use the visitRowFormatDelimited function here. However HiveScriptIOSchema // expects a seq of pairs in which the old parsers' token names are used as keys. @@ -1348,7 +1357,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { // SPARK-10310: Special cases LazySimpleSerDe val recordHandler = if (name == "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") { - Try(conf.getConfString(configKey)).toOption + Option(conf.getConfString(configKey, defaultConfigValue)) } else { None } @@ -1359,15 +1368,18 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { val name = conf.getConfString("hive.script.serde", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") val props = Seq("field.delim" -> "\t") - val recordHandler = Try(conf.getConfString(configKey)).toOption + val recordHandler = Option(conf.getConfString(configKey, defaultConfigValue)) (Nil, Option(name), props, recordHandler) } val (inFormat, inSerdeClass, inSerdeProps, reader) = - format(inRowFormat, "hive.script.recordreader") + format( + inRowFormat, "hive.script.recordreader", "org.apache.hadoop.hive.ql.exec.TextRecordReader") val (outFormat, outSerdeClass, outSerdeProps, writer) = - format(outRowFormat, "hive.script.recordwriter") + format( + outRowFormat, "hive.script.recordwriter", + "org.apache.hadoop.hive.ql.exec.TextRecordWriter") ScriptInputOutputSchema( inFormat, outFormat, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala new file mode 100644 index 000000000000..90deb20e9724 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, DeclarativeAggregate} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.types._ + +/** + * This is a helper class to generate an append-only row-based hash map that can act as a 'cache' + * for extremely fast key-value lookups while evaluating aggregates (and fall back to the + * `BytesToBytesMap` if a given key isn't found). This is 'codegened' in HashAggregate to speed + * up aggregates w/ key. + * + * NOTE: the generated hash map currently doesn't support nullable keys and falls back to the + * `BytesToBytesMap` to store them. + */ +abstract class HashMapGenerator( + ctx: CodegenContext, + aggregateExpressions: Seq[AggregateExpression], + generatedClassName: String, + groupingKeySchema: StructType, + bufferSchema: StructType) { + case class Buffer(dataType: DataType, name: String) + + val groupingKeys = groupingKeySchema.map(k => Buffer(k.dataType, ctx.freshName("key"))) + val bufferValues = bufferSchema.map(k => Buffer(k.dataType, ctx.freshName("value"))) + val groupingKeySignature = + groupingKeys.map(key => s"${ctx.javaType(key.dataType)} ${key.name}").mkString(", ") + val buffVars: Seq[ExprCode] = { + val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) + val initExpr = functions.flatMap(f => f.initialValues) + initExpr.map { e => + val isNull = ctx.freshName("bufIsNull") + val value = ctx.freshName("bufValue") + ctx.addMutableState("boolean", isNull, "") + ctx.addMutableState(ctx.javaType(e.dataType), value, "") + val ev = e.genCode(ctx) + val initVars = + s""" + | $isNull = ${ev.isNull}; + | $value = ${ev.value}; + """.stripMargin + ExprCode(ev.code + initVars, isNull, value) + } + } + + def generate(): String = { + s""" + |public class $generatedClassName { + |${initializeAggregateHashMap()} + | + |${generateFindOrInsert()} + | + |${generateEquals()} + | + |${generateHashFunction()} + | + |${generateRowIterator()} + | + |${generateClose()} + |} + """.stripMargin + } + + protected def initializeAggregateHashMap(): String + + /** + * Generates a method that computes a hash by currently xor-ing all individual group-by keys. For + * instance, if we have 2 long group-by keys, the generated function would be of the form: + * + * {{{ + * private long hash(long agg_key, long agg_key1) { + * return agg_key ^ agg_key1; + * } + * }}} + */ + protected final def generateHashFunction(): String = { + val hash = ctx.freshName("hash") + + def genHashForKeys(groupingKeys: Seq[Buffer]): String = { + groupingKeys.map { key => + val result = ctx.freshName("result") + s""" + |${genComputeHash(ctx, key.name, key.dataType, result)} + |$hash = ($hash ^ (0x9e3779b9)) + $result + ($hash << 6) + ($hash >>> 2); + """.stripMargin + }.mkString("\n") + } + + s""" + |private long hash($groupingKeySignature) { + | long $hash = 0; + | ${genHashForKeys(groupingKeys)} + | return $hash; + |} + """.stripMargin + } + + /** + * Generates a method that returns true if the group-by keys exist at a given index. + */ + protected def generateEquals(): String + + /** + * Generates a method that returns a row which keeps track of the + * aggregate value(s) for a given set of keys. If the corresponding row doesn't exist, the + * generated method adds the corresponding row in the associated key value batch. + */ + protected def generateFindOrInsert(): String + + protected def generateRowIterator(): String + + protected final def generateClose(): String = { + s""" + |public void close() { + | batch.close(); + |} + """.stripMargin + } + + protected final def genComputeHash( + ctx: CodegenContext, + input: String, + dataType: DataType, + result: String): String = { + def hashInt(i: String): String = s"int $result = $i;" + def hashLong(l: String): String = s"long $result = $l;" + def hashBytes(b: String): String = { + val hash = ctx.freshName("hash") + val bytes = ctx.freshName("bytes") + s""" + |int $result = 0; + |byte[] $bytes = $b; + |for (int i = 0; i < $bytes.length; i++) { + | ${genComputeHash(ctx, s"$bytes[i]", ByteType, hash)} + | $result = ($result ^ (0x9e3779b9)) + $hash + ($result << 6) + ($result >>> 2); + |} + """.stripMargin + } + + dataType match { + case BooleanType => hashInt(s"$input ? 1 : 0") + case ByteType | ShortType | IntegerType | DateType => hashInt(input) + case LongType | TimestampType => hashLong(input) + case FloatType => hashInt(s"Float.floatToIntBits($input)") + case DoubleType => hashLong(s"Double.doubleToLongBits($input)") + case d: DecimalType => + if (d.precision <= Decimal.MAX_LONG_DIGITS) { + hashLong(s"$input.toUnscaledLong()") + } else { + val bytes = ctx.freshName("bytes") + s""" + final byte[] $bytes = $input.toJavaBigDecimal().unscaledValue().toByteArray(); + ${hashBytes(bytes)} + """ + } + case StringType => hashBytes(s"$input.getBytes()") + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala new file mode 100644 index 000000000000..1dea33037c85 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext} +import org.apache.spark.sql.types._ + +/** + * This is a helper class to generate an append-only row-based hash map that can act as a 'cache' + * for extremely fast key-value lookups while evaluating aggregates (and fall back to the + * `BytesToBytesMap` if a given key isn't found). This is 'codegened' in HashAggregate to speed + * up aggregates w/ key. + * + * We also have VectorizedHashMapGenerator, which generates a append-only vectorized hash map. + * We choose one of the two as the 1st level, fast hash map during aggregation. + * + * NOTE: This row-based hash map currently doesn't support nullable keys and falls back to the + * `BytesToBytesMap` to store them. + */ +class RowBasedHashMapGenerator( + ctx: CodegenContext, + aggregateExpressions: Seq[AggregateExpression], + generatedClassName: String, + groupingKeySchema: StructType, + bufferSchema: StructType) + extends HashMapGenerator (ctx, aggregateExpressions, generatedClassName, + groupingKeySchema, bufferSchema) { + + protected def initializeAggregateHashMap(): String = { + val generatedKeySchema: String = + s"new org.apache.spark.sql.types.StructType()" + + groupingKeySchema.map { key => + key.dataType match { + case d: DecimalType => + s""".add("${key.name}", org.apache.spark.sql.types.DataTypes.createDecimalType( + |${d.precision}, ${d.scale}))""".stripMargin + case _ => + s""".add("${key.name}", org.apache.spark.sql.types.DataTypes.${key.dataType})""" + } + }.mkString("\n").concat(";") + + val generatedValueSchema: String = + s"new org.apache.spark.sql.types.StructType()" + + bufferSchema.map { key => + key.dataType match { + case d: DecimalType => + s""".add("${key.name}", org.apache.spark.sql.types.DataTypes.createDecimalType( + |${d.precision}, ${d.scale}))""".stripMargin + case _ => + s""".add("${key.name}", org.apache.spark.sql.types.DataTypes.${key.dataType})""" + } + }.mkString("\n").concat(";") + + s""" + | private org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch batch; + | private int[] buckets; + | private int capacity = 1 << 16; + | private double loadFactor = 0.5; + | private int numBuckets = (int) (capacity / loadFactor); + | private int maxSteps = 2; + | private int numRows = 0; + | private org.apache.spark.sql.types.StructType keySchema = $generatedKeySchema + | private org.apache.spark.sql.types.StructType valueSchema = $generatedValueSchema + | private Object emptyVBase; + | private long emptyVOff; + | private int emptyVLen; + | private boolean isBatchFull = false; + | + | + | public $generatedClassName( + | org.apache.spark.memory.TaskMemoryManager taskMemoryManager, + | InternalRow emptyAggregationBuffer) { + | batch = org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch + | .allocate(keySchema, valueSchema, taskMemoryManager, capacity); + | + | final UnsafeProjection valueProjection = UnsafeProjection.create(valueSchema); + | final byte[] emptyBuffer = valueProjection.apply(emptyAggregationBuffer).getBytes(); + | + | emptyVBase = emptyBuffer; + | emptyVOff = Platform.BYTE_ARRAY_OFFSET; + | emptyVLen = emptyBuffer.length; + | + | buckets = new int[numBuckets]; + | java.util.Arrays.fill(buckets, -1); + | } + """.stripMargin + } + + /** + * Generates a method that returns true if the group-by keys exist at a given index in the + * associated [[org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch]]. + * + */ + protected def generateEquals(): String = { + + def genEqualsForKeys(groupingKeys: Seq[Buffer]): String = { + groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => + s"""(${ctx.genEqual(key.dataType, ctx.getValue("row", + key.dataType, ordinal.toString()), key.name)})""" + }.mkString(" && ") + } + + s""" + |private boolean equals(int idx, $groupingKeySignature) { + | UnsafeRow row = batch.getKeyRow(buckets[idx]); + | return ${genEqualsForKeys(groupingKeys)}; + |} + """.stripMargin + } + + /** + * Generates a method that returns a + * [[org.apache.spark.sql.catalyst.expressions.UnsafeRow]] which keeps track of the + * aggregate value(s) for a given set of keys. If the corresponding row doesn't exist, the + * generated method adds the corresponding row in the associated + * [[org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch]]. + * + */ + protected def generateFindOrInsert(): String = { + val numVarLenFields = groupingKeys.map(_.dataType).count { + case dt if UnsafeRow.isFixedLength(dt) => false + // TODO: consider large decimal and interval type + case _ => true + } + + val createUnsafeRowForKey = groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => + s"agg_rowWriter.write(${ordinal}, ${key.name})"} + .mkString(";\n") + + s""" + |public org.apache.spark.sql.catalyst.expressions.UnsafeRow findOrInsert(${ + groupingKeySignature}) { + | long h = hash(${groupingKeys.map(_.name).mkString(", ")}); + | int step = 0; + | int idx = (int) h & (numBuckets - 1); + | while (step < maxSteps) { + | // Return bucket index if it's either an empty slot or already contains the key + | if (buckets[idx] == -1) { + | if (numRows < capacity && !isBatchFull) { + | // creating the unsafe for new entry + | UnsafeRow agg_result = new UnsafeRow(${groupingKeySchema.length}); + | org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder agg_holder + | = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(agg_result, + | ${numVarLenFields * 32}); + | org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter agg_rowWriter + | = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter( + | agg_holder, + | ${groupingKeySchema.length}); + | agg_holder.reset(); //TODO: investigate if reset or zeroout are actually needed + | agg_rowWriter.zeroOutNullBytes(); + | ${createUnsafeRowForKey}; + | agg_result.setTotalSize(agg_holder.totalSize()); + | Object kbase = agg_result.getBaseObject(); + | long koff = agg_result.getBaseOffset(); + | int klen = agg_result.getSizeInBytes(); + | + | UnsafeRow vRow + | = batch.appendRow(kbase, koff, klen, emptyVBase, emptyVOff, emptyVLen); + | if (vRow == null) { + | isBatchFull = true; + | } else { + | buckets[idx] = numRows++; + | } + | return vRow; + | } else { + | // No more space + | return null; + | } + | } else if (equals(idx, ${groupingKeys.map(_.name).mkString(", ")})) { + | return batch.getValueRow(buckets[idx]); + | } + | idx = (idx + 1) & (numBuckets - 1); + | step++; + | } + | // Didn't find it + | return null; + |} + """.stripMargin + } + + protected def generateRowIterator(): String = { + s""" + |public org.apache.spark.unsafe.KVIterator rowIterator() { + | return batch.rowIterator(); + |} + """.stripMargin + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 8a3f466ccfef..7418df90b824 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.execution.aggregate -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, DeclarativeAggregate} -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext} import org.apache.spark.sql.types._ /** @@ -44,49 +44,11 @@ class VectorizedHashMapGenerator( aggregateExpressions: Seq[AggregateExpression], generatedClassName: String, groupingKeySchema: StructType, - bufferSchema: StructType) { - case class Buffer(dataType: DataType, name: String) - val groupingKeys = groupingKeySchema.map(k => Buffer(k.dataType, ctx.freshName("key"))) - val bufferValues = bufferSchema.map(k => Buffer(k.dataType, ctx.freshName("value"))) - val groupingKeySignature = - groupingKeys.map(key => s"${ctx.javaType(key.dataType)} ${key.name}").mkString(", ") - val buffVars: Seq[ExprCode] = { - val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) - val initExpr = functions.flatMap(f => f.initialValues) - initExpr.map { e => - val isNull = ctx.freshName("bufIsNull") - val value = ctx.freshName("bufValue") - ctx.addMutableState("boolean", isNull, "") - ctx.addMutableState(ctx.javaType(e.dataType), value, "") - val ev = e.genCode(ctx) - val initVars = - s""" - | $isNull = ${ev.isNull}; - | $value = ${ev.value}; - """.stripMargin - ExprCode(ev.code + initVars, isNull, value) - } - } - - def generate(): String = { - s""" - |public class $generatedClassName { - |${initializeAggregateHashMap()} - | - |${generateFindOrInsert()} - | - |${generateEquals()} - | - |${generateHashFunction()} - | - |${generateRowIterator()} - | - |${generateClose()} - |} - """.stripMargin - } + bufferSchema: StructType) + extends HashMapGenerator (ctx, aggregateExpressions, generatedClassName, + groupingKeySchema, bufferSchema) { - private def initializeAggregateHashMap(): String = { + protected def initializeAggregateHashMap(): String = { val generatedSchema: String = s"new org.apache.spark.sql.types.StructType()" + (groupingKeySchema ++ bufferSchema).map { key => @@ -140,37 +102,6 @@ class VectorizedHashMapGenerator( """.stripMargin } - /** - * Generates a method that computes a hash by currently xor-ing all individual group-by keys. For - * instance, if we have 2 long group-by keys, the generated function would be of the form: - * - * {{{ - * private long hash(long agg_key, long agg_key1) { - * return agg_key ^ agg_key1; - * } - * }}} - */ - private def generateHashFunction(): String = { - val hash = ctx.freshName("hash") - - def genHashForKeys(groupingKeys: Seq[Buffer]): String = { - groupingKeys.map { key => - val result = ctx.freshName("result") - s""" - |${genComputeHash(ctx, key.name, key.dataType, result)} - |$hash = ($hash ^ (0x9e3779b9)) + $result + ($hash << 6) + ($hash >>> 2); - """.stripMargin - }.mkString("\n") - } - - s""" - |private long hash($groupingKeySignature) { - | long $hash = 0; - | ${genHashForKeys(groupingKeys)} - | return $hash; - |} - """.stripMargin - } /** * Generates a method that returns true if the group-by keys exist at a given index in the @@ -184,7 +115,7 @@ class VectorizedHashMapGenerator( * } * }}} */ - private def generateEquals(): String = { + protected def generateEquals(): String = { def genEqualsForKeys(groupingKeys: Seq[Buffer]): String = { groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => @@ -233,7 +164,7 @@ class VectorizedHashMapGenerator( * } * }}} */ - private def generateFindOrInsert(): String = { + protected def generateFindOrInsert(): String = { def genCodeToSetKeys(groupingKeys: Seq[Buffer]): Seq[String] = { groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => @@ -287,7 +218,7 @@ class VectorizedHashMapGenerator( """.stripMargin } - private def generateRowIterator(): String = { + protected def generateRowIterator(): String = { s""" |public java.util.Iterator | rowIterator() { @@ -295,50 +226,4 @@ class VectorizedHashMapGenerator( |} """.stripMargin } - - private def generateClose(): String = { - s""" - |public void close() { - | batch.close(); - |} - """.stripMargin - } - - private def genComputeHash( - ctx: CodegenContext, - input: String, - dataType: DataType, - result: String): String = { - def hashInt(i: String): String = s"int $result = $i;" - def hashLong(l: String): String = s"long $result = $l;" - def hashBytes(b: String): String = { - val hash = ctx.freshName("hash") - s""" - |int $result = 0; - |for (int i = 0; i < $b.length; i++) { - | ${genComputeHash(ctx, s"$b[i]", ByteType, hash)} - | $result = ($result ^ (0x9e3779b9)) + $hash + ($result << 6) + ($result >>> 2); - |} - """.stripMargin - } - - dataType match { - case BooleanType => hashInt(s"$input ? 1 : 0") - case ByteType | ShortType | IntegerType | DateType => hashInt(input) - case LongType | TimestampType => hashLong(input) - case FloatType => hashInt(s"Float.floatToIntBits($input)") - case DoubleType => hashLong(s"Double.doubleToLongBits($input)") - case d: DecimalType => - if (d.precision <= Decimal.MAX_LONG_DIGITS) { - hashLong(s"$input.toUnscaledLong()") - } else { - val bytes = ctx.freshName("bytes") - s""" - final byte[] $bytes = $input.toJavaBigDecimal().unscaledValue().toByteArray(); - ${hashBytes(bytes)} - """ - } - case StringType => hashBytes(s"$input.getBytes()") - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index c38eca5156e5..5e3cd9f8950f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -392,7 +392,7 @@ object CreateDataSourceTableUtils extends Logging { outputFormat = None, serde = None, compressed = false, - serdeProperties = options + properties = options ), properties = tableProperties.toMap) } @@ -412,7 +412,7 @@ object CreateDataSourceTableUtils extends Logging { outputFormat = serde.outputFormat, serde = serde.serde, compressed = false, - serdeProperties = options + properties = options ), schema = relation.schema.map { f => CatalogColumn(f.name, f.dataType.catalogString) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index a3a057a5628f..03f81c46a817 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -21,12 +21,11 @@ import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogTable} import org.apache.spark.sql.catalyst.catalog.{CatalogTablePartition, CatalogTableType, SessionCatalog} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ -import org.apache.spark.sql.execution.datasources.BucketSpec import org.apache.spark.sql.types._ @@ -320,14 +319,14 @@ case class AlterTableSerDePropertiesCommand( if (partSpec.isEmpty) { val newTable = table.withNewStorage( serde = serdeClassName.orElse(table.storage.serde), - serdeProperties = table.storage.serdeProperties ++ serdeProperties.getOrElse(Map())) + serdeProperties = table.storage.properties ++ serdeProperties.getOrElse(Map())) catalog.alterTable(newTable) } else { val spec = partSpec.get val part = catalog.getPartition(tableName, spec) val newPart = part.copy(storage = part.storage.copy( serde = serdeClassName.orElse(part.storage.serde), - serdeProperties = part.storage.serdeProperties ++ serdeProperties.getOrElse(Map()))) + properties = part.storage.properties ++ serdeProperties.getOrElse(Map()))) catalog.alterPartitions(tableName, Seq(newPart)) } Seq.empty[Row] @@ -466,7 +465,7 @@ case class AlterTableSetLocationCommand( if (DDLUtils.isDatasourceTable(table)) { table.withNewStorage( locationUri = Some(location), - serdeProperties = table.storage.serdeProperties ++ Map("path" -> location)) + serdeProperties = table.storage.properties ++ Map("path" -> location)) } else { table.withNewStorage(locationUri = Some(location)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 5c815df0deb9..8f3adadbf3b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogColumn, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.catalog.CatalogTableType._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} @@ -120,7 +120,7 @@ case class CreateTableCommand(table: CatalogTable, ifNotExists: Boolean) extends override def run(sparkSession: SparkSession): Seq[Row] = { DDLUtils.verifyTableProperties(table.properties.keys.toSeq, "CREATE TABLE") - DDLUtils.verifyTableProperties(table.storage.serdeProperties.keys.toSeq, "CREATE TABLE") + DDLUtils.verifyTableProperties(table.storage.properties.keys.toSeq, "CREATE TABLE") sparkSession.sessionState.catalog.createTable(table, ifNotExists) Seq.empty[Row] } @@ -167,7 +167,7 @@ case class AlterTableRenameCommand( if (DDLUtils.isDatasourceTable(table) && table.tableType == CatalogTableType.MANAGED) { val newPath = catalog.defaultTablePath(newName) val newTable = table.withNewStorage( - serdeProperties = table.storage.serdeProperties ++ Map("path" -> newPath)) + serdeProperties = table.storage.properties ++ Map("path" -> newPath)) catalog.alterTable(newTable) } // Invalidate the table last, otherwise uncaching the table would load the logical plan @@ -349,7 +349,7 @@ case class TruncateTableCommand( } val locations = if (isDatasourceTable) { - Seq(table.storage.serdeProperties.get("path")) + Seq(table.storage.properties.get("path")) } else if (table.partitionColumnNames.isEmpty) { Seq(table.storage.locationUri) } else { @@ -413,29 +413,29 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF } else { val metadata = catalog.getTableMetadata(table) + if (DDLUtils.isDatasourceTable(metadata)) { + DDLUtils.getSchemaFromTableProperties(metadata) match { + case Some(userSpecifiedSchema) => describeSchema(userSpecifiedSchema, result) + case None => describeSchema(catalog.lookupRelation(table).schema, result) + } + } else { + describeSchema(metadata.schema, result) + } + if (isExtended) { describeExtended(metadata, result) } else if (isFormatted) { describeFormatted(metadata, result) } else { - describe(metadata, result) + describePartitionInfo(metadata, result) } } result } - // Shows data columns and partitioned columns (if any) - private def describe(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { + private def describePartitionInfo(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { if (DDLUtils.isDatasourceTable(table)) { - val schema = DDLUtils.getSchemaFromTableProperties(table) - - if (schema.isEmpty) { - append(buffer, "# Schema of this table is inferred at runtime", "", "") - } else { - schema.foreach(describeSchema(_, buffer)) - } - val partCols = DDLUtils.getPartitionColumnsFromTableProperties(table) if (partCols.nonEmpty) { append(buffer, "# Partition Information", "", "") @@ -443,8 +443,6 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF partCols.foreach(col => append(buffer, col, "", "")) } } else { - describeSchema(table.schema, buffer) - if (table.partitionColumns.nonEmpty) { append(buffer, "# Partition Information", "", "") append(buffer, s"# ${output.head.name}", output(1).name, output(2).name) @@ -454,14 +452,14 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF } private def describeExtended(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { - describe(table, buffer) + describePartitionInfo(table, buffer) append(buffer, "", "", "") append(buffer, "# Detailed Table Information", table.toString, "") } private def describeFormatted(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { - describe(table, buffer) + describePartitionInfo(table, buffer) append(buffer, "", "", "") append(buffer, "# Detailed Table Information", "", "") @@ -494,29 +492,25 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF describeBucketingInfo(metadata, buffer) append(buffer, "Storage Desc Parameters:", "", "") - metadata.storage.serdeProperties.foreach { case (key, value) => + metadata.storage.properties.foreach { case (key, value) => append(buffer, s" $key", value, "") } } private def describeBucketingInfo(metadata: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { - def appendBucketInfo(numBuckets: Int, bucketColumns: Seq[String], sortColumns: Seq[String]) = { - append(buffer, "Num Buckets:", numBuckets.toString, "") - append(buffer, "Bucket Columns:", bucketColumns.mkString("[", ", ", "]"), "") - append(buffer, "Sort Columns:", sortColumns.mkString("[", ", ", "]"), "") + def appendBucketInfo(bucketSpec: Option[BucketSpec]) = bucketSpec match { + case Some(BucketSpec(numBuckets, bucketColumnNames, sortColumnNames)) => + append(buffer, "Num Buckets:", numBuckets.toString, "") + append(buffer, "Bucket Columns:", bucketColumnNames.mkString("[", ", ", "]"), "") + append(buffer, "Sort Columns:", sortColumnNames.mkString("[", ", ", "]"), "") + + case _ => } - DDLUtils.getBucketSpecFromTableProperties(metadata) match { - case Some(bucketSpec) => - appendBucketInfo( - bucketSpec.numBuckets, - bucketSpec.bucketColumnNames, - bucketSpec.sortColumnNames) - case None => - appendBucketInfo( - metadata.numBuckets, - metadata.bucketColumnNames, - metadata.sortColumnNames) + if (DDLUtils.isDatasourceTable(metadata)) { + appendBucketInfo(DDLUtils.getBucketSpecFromTableProperties(metadata)) + } else { + appendBucketInfo(metadata.bucketSpec) } } @@ -624,9 +618,8 @@ case class ShowTablePropertiesCommand(table: TableIdentifier, propertyKey: Optio * }}} */ case class ShowColumnsCommand(table: TableIdentifier) extends RunnableCommand { - // The result of SHOW COLUMNS has one column called 'result' override val output: Seq[Attribute] = { - AttributeReference("result", StringType, nullable = false)() :: Nil + AttributeReference("col_name", StringType, nullable = false)() :: Nil } override def run(sparkSession: SparkSession): Seq[Row] = { @@ -654,9 +647,8 @@ case class ShowColumnsCommand(table: TableIdentifier) extends RunnableCommand { case class ShowPartitionsCommand( table: TableIdentifier, spec: Option[TablePartitionSpec]) extends RunnableCommand { - // The result of SHOW PARTITIONS has one column called 'result' override val output: Seq[Attribute] = { - AttributeReference("result", StringType, nullable = false)() :: Nil + AttributeReference("partition", StringType, nullable = false)() :: Nil } private def getPartName(spec: TablePartitionSpec, partColNames: Seq[String]): String = { @@ -812,7 +804,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman builder ++= partCols.mkString("PARTITIONED BY (", ", ", ")\n") } - if (metadata.bucketColumnNames.nonEmpty) { + if (metadata.bucketSpec.isDefined) { throw new UnsupportedOperationException( "Creating Hive table with bucket spec is not supported yet.") } @@ -824,7 +816,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman storage.serde.foreach { serde => builder ++= s"ROW FORMAT SERDE '$serde'\n" - val serdeProps = metadata.storage.serdeProperties.map { + val serdeProps = metadata.storage.properties.map { case (key, value) => s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } @@ -894,7 +886,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman builder ++= s"USING ${props(CreateDataSourceTableUtils.DATASOURCE_PROVIDER)}\n" - val dataSourceOptions = metadata.storage.serdeProperties.filterNot { + val dataSourceOptions = metadata.storage.properties.filterNot { case (key, value) => // If it's a managed table, omit PATH option. Spark SQL always creates external table // when the table creation DDL contains the PATH option. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 16b333a40288..312a1f691b2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -21,7 +21,7 @@ import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.{SQLBuilder, TableIdentifier} -import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} @@ -31,7 +31,13 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} * Create Hive view on non-hive-compatible tables by specifying schema ourselves instead of * depending on Hive meta-store. * - * @param tableDesc the catalog table + * @param name the name of this view. + * @param userSpecifiedColumns the output column names and optional comments specified by users, + * can be Nil if not specified. + * @param comment the comment of this view. + * @param properties the properties of this view. + * @param originalText the original SQL text of this view, can be None if this view is created via + * Dataset API. * @param child the logical plan that represents the view; this is used to generate a canonicalized * version of the SQL that can be saved in the catalog. * @param allowExisting if true, and if the view already exists, noop; if false, and if the view @@ -44,7 +50,11 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} * unless they are specified with full qualified table name with database prefix. */ case class CreateViewCommand( - tableDesc: CatalogTable, + name: TableIdentifier, + userSpecifiedColumns: Seq[(String, Option[String])], + comment: Option[String], + properties: Map[String, String], + originalText: Option[String], child: LogicalPlan, allowExisting: Boolean, replace: Boolean, @@ -58,11 +68,9 @@ case class CreateViewCommand( override def output: Seq[Attribute] = Seq.empty[Attribute] - require(tableDesc.tableType == CatalogTableType.VIEW, - "The type of the table to created with CREATE VIEW must be 'CatalogTableType.VIEW'.") if (!isTemporary) { - require(tableDesc.viewText.isDefined, - "The table to created with CREATE VIEW must have 'viewText'.") + require(originalText.isDefined, + "The table to created with CREATE VIEW must have 'originalText'.") } if (allowExisting && replace) { @@ -76,8 +84,8 @@ case class CreateViewCommand( } // Temporary view names should NOT contain database prefix like "database.table" - if (isTemporary && tableDesc.identifier.database.isDefined) { - val database = tableDesc.identifier.database.get + if (isTemporary && name.database.isDefined) { + val database = name.database.get throw new AnalysisException( s"It is not allowed to add database prefix `$database` for the TEMPORARY view name.") } @@ -88,23 +96,23 @@ case class CreateViewCommand( qe.assertAnalyzed() val analyzedPlan = qe.analyzed - if (tableDesc.schema != Nil && tableDesc.schema.length != analyzedPlan.output.length) { + if (userSpecifiedColumns.nonEmpty && + userSpecifiedColumns.length != analyzedPlan.output.length) { throw new AnalysisException(s"The number of columns produced by the SELECT clause " + s"(num: `${analyzedPlan.output.length}`) does not match the number of column names " + - s"specified by CREATE VIEW (num: `${tableDesc.schema.length}`).") + s"specified by CREATE VIEW (num: `${userSpecifiedColumns.length}`).") } val sessionState = sparkSession.sessionState if (isTemporary) { - createTemporaryView(tableDesc.identifier, sparkSession, analyzedPlan) + createTemporaryView(sparkSession, analyzedPlan) } else { // Adds default database for permanent table if it doesn't exist, so that tableExists() // only check permanent tables. - val database = tableDesc.identifier.database.getOrElse( - sessionState.catalog.getCurrentDatabase) - val tableIdentifier = tableDesc.identifier.copy(database = Option(database)) + val database = name.database.getOrElse(sessionState.catalog.getCurrentDatabase) + val qualifiedName = name.copy(database = Option(database)) - if (sessionState.catalog.tableExists(tableIdentifier)) { + if (sessionState.catalog.tableExists(qualifiedName)) { if (allowExisting) { // Handles `CREATE VIEW IF NOT EXISTS v0 AS SELECT ...`. Does nothing when the target view // already exists. @@ -115,7 +123,7 @@ case class CreateViewCommand( // Handles `CREATE VIEW v0 AS SELECT ...`. Throws exception when the target view already // exists. throw new AnalysisException( - s"View $tableIdentifier already exists. If you want to update the view definition, " + + s"View $qualifiedName already exists. If you want to update the view definition, " + "please use ALTER VIEW AS or CREATE OR REPLACE VIEW AS") } } else { @@ -127,25 +135,20 @@ case class CreateViewCommand( Seq.empty[Row] } - private def createTemporaryView( - table: TableIdentifier, sparkSession: SparkSession, analyzedPlan: LogicalPlan): Unit = { - - val sessionState = sparkSession.sessionState - val catalog = sessionState.catalog + private def createTemporaryView(sparkSession: SparkSession, analyzedPlan: LogicalPlan): Unit = { + val catalog = sparkSession.sessionState.catalog // Projects column names to alias names - val logicalPlan = { - if (tableDesc.schema.isEmpty) { - analyzedPlan - } else { - val projectList = analyzedPlan.output.zip(tableDesc.schema).map { - case (attr, col) => Alias(attr, col.name)() - } - sparkSession.sessionState.executePlan(Project(projectList, analyzedPlan)).analyzed + val logicalPlan = if (userSpecifiedColumns.isEmpty) { + analyzedPlan + } else { + val projectList = analyzedPlan.output.zip(userSpecifiedColumns).map { + case (attr, (colName, _)) => Alias(attr, colName)() } + sparkSession.sessionState.executePlan(Project(projectList, analyzedPlan)).analyzed } - catalog.createTempView(table.table, logicalPlan, replace) + catalog.createTempView(name.table, logicalPlan, replace) } /** @@ -154,15 +157,14 @@ case class CreateViewCommand( */ private def prepareTable(sparkSession: SparkSession, analyzedPlan: LogicalPlan): CatalogTable = { val viewSQL: String = { - val logicalPlan = - if (tableDesc.schema.isEmpty) { - analyzedPlan - } else { - val projectList = analyzedPlan.output.zip(tableDesc.schema).map { - case (attr, col) => Alias(attr, col.name)() - } - sparkSession.sessionState.executePlan(Project(projectList, analyzedPlan)).analyzed + val logicalPlan = if (userSpecifiedColumns.isEmpty) { + analyzedPlan + } else { + val projectList = analyzedPlan.output.zip(userSpecifiedColumns).map { + case (attr, (colName, _)) => Alias(attr, colName)() } + sparkSession.sessionState.executePlan(Project(projectList, analyzedPlan)).analyzed + } new SQLBuilder(logicalPlan).toSQL } @@ -176,21 +178,26 @@ case class CreateViewCommand( "Failed to analyze the canonicalized SQL. It is possible there is a bug in Spark.", e) } - val viewSchema: Seq[CatalogColumn] = { - if (tableDesc.schema.isEmpty) { - analyzedPlan.output.map { a => - CatalogColumn(a.name, a.dataType.catalogString) - } - } else { - analyzedPlan.output.zip(tableDesc.schema).map { case (a, col) => - CatalogColumn(col.name, a.dataType.catalogString, nullable = true, col.comment) - } + val viewSchema = if (userSpecifiedColumns.isEmpty) { + analyzedPlan.output.map { a => + CatalogColumn(a.name, a.dataType.catalogString) + } + } else { + analyzedPlan.output.zip(userSpecifiedColumns).map { + case (a, (name, comment)) => + CatalogColumn(name, a.dataType.catalogString, comment = comment) } } - tableDesc.copy(schema = viewSchema, viewText = Some(viewSQL)) + CatalogTable( + identifier = name, + tableType = CatalogTableType.VIEW, + storage = CatalogStorageFormat.empty, + schema = viewSchema, + properties = properties, + viewOriginalText = originalText, + viewText = Some(viewSQL), + comment = comment + ) } - - /** Escape backtick with double-backtick in column name and wrap it with backtick. */ - private def quote(name: String) = s"`${name.replaceAll("`", "``")}`" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BucketingUtils.scala similarity index 77% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BucketingUtils.scala index 6008d73717f7..377b81809675 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BucketingUtils.scala @@ -17,20 +17,6 @@ package org.apache.spark.sql.execution.datasources -/** - * A container for bucketing information. - * Bucketing is a technology for decomposing data sets into more manageable parts, and the number - * of buckets is fixed so it does not fluctuate with data. - * - * @param numBuckets number of buckets. - * @param bucketColumnNames the names of the columns that used to generate the bucket id. - * @param sortColumnNames the names of the columns that used to sort data in each bucket. - */ -private[sql] case class BucketSpec( - numBuckets: Int, - bucketColumnNames: Seq[String], - sortColumnNames: Seq[String]) - private[sql] object BucketingUtils { // The file name of bucketed data should have 3 parts: // 1. some other information in the head of file name diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index f572b93991e0..79024fda2f8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -30,6 +30,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 0841636d3309..8ffdc507db52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -214,7 +214,7 @@ private[sql] class FindDataSourceTable(sparkSession: SparkSession) extends Rule[ val bucketSpec = DDLUtils.getBucketSpecFromTableProperties(table) - val options = table.storage.serdeProperties + val options = table.storage.properties val dataSource = DataSource( sparkSession, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 04f166f8ff45..32aa4713ebdb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -25,6 +25,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{expressions, InternalRow} +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 1426dcf4697f..b49525c8ceda 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -19,13 +19,13 @@ package org.apache.spark.sql.execution.datasources import java.io.IOException -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.spark._ import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.InternalRow diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 9a0b46c1a4a5..c801436b0a64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -28,6 +28,7 @@ import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.catalyst.InternalRow diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index 31a2075d2ff9..18369b51b930 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index d238da242f3f..5ce8350de207 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable -import scala.util.Try import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ @@ -30,6 +29,7 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.FileRelation diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 24e2c1a5fd2f..4c9843036311 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -28,7 +28,7 @@ import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.catalyst.expressions.{MutableRow, SpecificMutableRow} import org.apache.spark.sql.catalyst.util.{DateTimeUtils, GenericArrayData} import org.apache.spark.sql.jdbc.JdbcDialects import org.apache.spark.sql.sources._ @@ -322,43 +322,134 @@ private[sql] class JDBCRDD( } } - // Each JDBC-to-Catalyst conversion corresponds to a tag defined here so that - // we don't have to potentially poke around in the Metadata once for every - // row. - // Is there a better way to do this? I'd rather be using a type that - // contains only the tags I define. - abstract class JDBCConversion - case object BooleanConversion extends JDBCConversion - case object DateConversion extends JDBCConversion - case class DecimalConversion(precision: Int, scale: Int) extends JDBCConversion - case object DoubleConversion extends JDBCConversion - case object FloatConversion extends JDBCConversion - case object IntegerConversion extends JDBCConversion - case object LongConversion extends JDBCConversion - case object BinaryLongConversion extends JDBCConversion - case object StringConversion extends JDBCConversion - case object TimestampConversion extends JDBCConversion - case object BinaryConversion extends JDBCConversion - case class ArrayConversion(elementConversion: JDBCConversion) extends JDBCConversion + // A `JDBCValueSetter` is responsible for converting and setting a value from `ResultSet` + // into a field for `MutableRow`. The last argument `Int` means the index for the + // value to be set in the row and also used for the value to retrieve from `ResultSet`. + private type JDBCValueSetter = (ResultSet, MutableRow, Int) => Unit /** - * Maps a StructType to a type tag list. + * Creates `JDBCValueSetter`s according to [[StructType]], which can set + * each value from `ResultSet` to each field of [[MutableRow]] correctly. */ - def getConversions(schema: StructType): Array[JDBCConversion] = - schema.fields.map(sf => getConversions(sf.dataType, sf.metadata)) - - private def getConversions(dt: DataType, metadata: Metadata): JDBCConversion = dt match { - case BooleanType => BooleanConversion - case DateType => DateConversion - case DecimalType.Fixed(p, s) => DecimalConversion(p, s) - case DoubleType => DoubleConversion - case FloatType => FloatConversion - case IntegerType => IntegerConversion - case LongType => if (metadata.contains("binarylong")) BinaryLongConversion else LongConversion - case StringType => StringConversion - case TimestampType => TimestampConversion - case BinaryType => BinaryConversion - case ArrayType(et, _) => ArrayConversion(getConversions(et, metadata)) + def makeSetters(schema: StructType): Array[JDBCValueSetter] = + schema.fields.map(sf => makeSetter(sf.dataType, sf.metadata)) + + private def makeSetter(dt: DataType, metadata: Metadata): JDBCValueSetter = dt match { + case BooleanType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setBoolean(pos, rs.getBoolean(pos + 1)) + + case DateType => + (rs: ResultSet, row: MutableRow, pos: Int) => + // DateTimeUtils.fromJavaDate does not handle null value, so we need to check it. + val dateVal = rs.getDate(pos + 1) + if (dateVal != null) { + row.setInt(pos, DateTimeUtils.fromJavaDate(dateVal)) + } else { + row.update(pos, null) + } + + // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal + // object returned by ResultSet.getBigDecimal is not correctly matched to the table + // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale. + // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through + // a BigDecimal object with scale as 0. But the dataframe schema has correct type as + // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then + // retrieve it, you will get wrong result 199.99. + // So it is needed to set precision and scale for Decimal based on JDBC metadata. + case DecimalType.Fixed(p, s) => + (rs: ResultSet, row: MutableRow, pos: Int) => + val decimal = + nullSafeConvert[java.math.BigDecimal](rs.getBigDecimal(pos + 1), d => Decimal(d, p, s)) + row.update(pos, decimal) + + case DoubleType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setDouble(pos, rs.getDouble(pos + 1)) + + case FloatType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setFloat(pos, rs.getFloat(pos + 1)) + + case IntegerType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setInt(pos, rs.getInt(pos + 1)) + + case LongType if metadata.contains("binarylong") => + (rs: ResultSet, row: MutableRow, pos: Int) => + val bytes = rs.getBytes(pos + 1) + var ans = 0L + var j = 0 + while (j < bytes.size) { + ans = 256 * ans + (255 & bytes(j)) + j = j + 1 + } + row.setLong(pos, ans) + + case LongType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setLong(pos, rs.getLong(pos + 1)) + + case StringType => + (rs: ResultSet, row: MutableRow, pos: Int) => + // TODO(davies): use getBytes for better performance, if the encoding is UTF-8 + row.update(pos, UTF8String.fromString(rs.getString(pos + 1))) + + case TimestampType => + (rs: ResultSet, row: MutableRow, pos: Int) => + val t = rs.getTimestamp(pos + 1) + if (t != null) { + row.setLong(pos, DateTimeUtils.fromJavaTimestamp(t)) + } else { + row.update(pos, null) + } + + case BinaryType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.update(pos, rs.getBytes(pos + 1)) + + case ArrayType(et, _) => + val elementConversion = et match { + case TimestampType => + (array: Object) => + array.asInstanceOf[Array[java.sql.Timestamp]].map { timestamp => + nullSafeConvert(timestamp, DateTimeUtils.fromJavaTimestamp) + } + + case StringType => + (array: Object) => + array.asInstanceOf[Array[java.lang.String]] + .map(UTF8String.fromString) + + case DateType => + (array: Object) => + array.asInstanceOf[Array[java.sql.Date]].map { date => + nullSafeConvert(date, DateTimeUtils.fromJavaDate) + } + + case dt: DecimalType => + (array: Object) => + array.asInstanceOf[Array[java.math.BigDecimal]].map { decimal => + nullSafeConvert[java.math.BigDecimal]( + decimal, d => Decimal(d, dt.precision, dt.scale)) + } + + case LongType if metadata.contains("binarylong") => + throw new IllegalArgumentException(s"Unsupported array element " + + s"type ${dt.simpleString} based on binary") + + case ArrayType(_, _) => + throw new IllegalArgumentException("Nested arrays unsupported") + + case _ => (array: Object) => array.asInstanceOf[Array[Any]] + } + + (rs: ResultSet, row: MutableRow, pos: Int) => + val array = nullSafeConvert[Object]( + rs.getArray(pos + 1).getArray, + array => new GenericArrayData(elementConversion.apply(array))) + row.update(pos, array) + case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") } @@ -398,93 +489,15 @@ private[sql] class JDBCRDD( stmt.setFetchSize(fetchSize) val rs = stmt.executeQuery() - val conversions = getConversions(schema) + val setters: Array[JDBCValueSetter] = makeSetters(schema) val mutableRow = new SpecificMutableRow(schema.fields.map(x => x.dataType)) def getNext(): InternalRow = { if (rs.next()) { inputMetrics.incRecordsRead(1) var i = 0 - while (i < conversions.length) { - val pos = i + 1 - conversions(i) match { - case BooleanConversion => mutableRow.setBoolean(i, rs.getBoolean(pos)) - case DateConversion => - // DateTimeUtils.fromJavaDate does not handle null value, so we need to check it. - val dateVal = rs.getDate(pos) - if (dateVal != null) { - mutableRow.setInt(i, DateTimeUtils.fromJavaDate(dateVal)) - } else { - mutableRow.update(i, null) - } - // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal - // object returned by ResultSet.getBigDecimal is not correctly matched to the table - // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale. - // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through - // a BigDecimal object with scale as 0. But the dataframe schema has correct type as - // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then - // retrieve it, you will get wrong result 199.99. - // So it is needed to set precision and scale for Decimal based on JDBC metadata. - case DecimalConversion(p, s) => - val decimalVal = rs.getBigDecimal(pos) - if (decimalVal == null) { - mutableRow.update(i, null) - } else { - mutableRow.update(i, Decimal(decimalVal, p, s)) - } - case DoubleConversion => mutableRow.setDouble(i, rs.getDouble(pos)) - case FloatConversion => mutableRow.setFloat(i, rs.getFloat(pos)) - case IntegerConversion => mutableRow.setInt(i, rs.getInt(pos)) - case LongConversion => mutableRow.setLong(i, rs.getLong(pos)) - // TODO(davies): use getBytes for better performance, if the encoding is UTF-8 - case StringConversion => mutableRow.update(i, UTF8String.fromString(rs.getString(pos))) - case TimestampConversion => - val t = rs.getTimestamp(pos) - if (t != null) { - mutableRow.setLong(i, DateTimeUtils.fromJavaTimestamp(t)) - } else { - mutableRow.update(i, null) - } - case BinaryConversion => mutableRow.update(i, rs.getBytes(pos)) - case BinaryLongConversion => - val bytes = rs.getBytes(pos) - var ans = 0L - var j = 0 - while (j < bytes.size) { - ans = 256 * ans + (255 & bytes(j)) - j = j + 1 - } - mutableRow.setLong(i, ans) - case ArrayConversion(elementConversion) => - val array = rs.getArray(pos).getArray - if (array != null) { - val data = elementConversion match { - case TimestampConversion => - array.asInstanceOf[Array[java.sql.Timestamp]].map { timestamp => - nullSafeConvert(timestamp, DateTimeUtils.fromJavaTimestamp) - } - case StringConversion => - array.asInstanceOf[Array[java.lang.String]] - .map(UTF8String.fromString) - case DateConversion => - array.asInstanceOf[Array[java.sql.Date]].map { date => - nullSafeConvert(date, DateTimeUtils.fromJavaDate) - } - case DecimalConversion(p, s) => - array.asInstanceOf[Array[java.math.BigDecimal]].map { decimal => - nullSafeConvert[java.math.BigDecimal](decimal, d => Decimal(d, p, s)) - } - case BinaryLongConversion => - throw new IllegalArgumentException(s"Unsupported array element conversion $i") - case _: ArrayConversion => - throw new IllegalArgumentException("Nested arrays unsupported") - case _ => array.asInstanceOf[Array[Any]] - } - mutableRow.update(i, new GenericArrayData(data)) - } else { - mutableRow.update(i, null) - } - } + while (i < setters.length) { + setters(i).apply(rs, mutableRow, i) if (rs.wasNull) mutableRow.setNullAt(i) i = i + 1 } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index d3e1efc56277..cb474cbd0ae7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -37,6 +37,7 @@ object JdbcUtils extends Logging { // the property names are case sensitive val JDBC_BATCH_FETCH_SIZE = "fetchsize" val JDBC_BATCH_INSERT_SIZE = "batchsize" + val JDBC_TXN_ISOLATION_LEVEL = "isolationLevel" /** * Returns a factory for creating connections to the given JDBC URL. @@ -97,6 +98,22 @@ object JdbcUtils extends Logging { } } + /** + * Truncates a table from the JDBC database. + */ + def truncateTable(conn: Connection, table: String): Unit = { + val statement = conn.createStatement + try { + statement.executeUpdate(s"TRUNCATE TABLE $table") + } finally { + statement.close() + } + } + + def isCascadingTruncateTable(url: String): Option[Boolean] = { + JdbcDialects.get(url).isCascadingTruncateTable() + } + /** * Returns a PreparedStatement that inserts a row into table via conn. */ @@ -139,8 +156,8 @@ object JdbcUtils extends Logging { /** * Saves a partition of a DataFrame to the JDBC database. This is done in - * a single database transaction in order to avoid repeatedly inserting - * data as much as possible. + * a single database transaction (unless isolation level is "NONE") + * in order to avoid repeatedly inserting data as much as possible. * * It is still theoretically possible for rows in a DataFrame to be * inserted into the database more than once if a stage somehow fails after @@ -158,25 +175,44 @@ object JdbcUtils extends Logging { rddSchema: StructType, nullTypes: Array[Int], batchSize: Int, - dialect: JdbcDialect): Iterator[Byte] = { + dialect: JdbcDialect, + isolationLevel: Int): Iterator[Byte] = { require(batchSize >= 1, s"Invalid value `${batchSize.toString}` for parameter " + s"`${JdbcUtils.JDBC_BATCH_INSERT_SIZE}`. The minimum value is 1.") val conn = getConnection() var committed = false - val supportsTransactions = try { - conn.getMetaData().supportsDataManipulationTransactionsOnly() || - conn.getMetaData().supportsDataDefinitionAndDataManipulationTransactions() - } catch { - case NonFatal(e) => - logWarning("Exception while detecting transaction support", e) - true + + var finalIsolationLevel = Connection.TRANSACTION_NONE + if (isolationLevel != Connection.TRANSACTION_NONE) { + try { + val metadata = conn.getMetaData + if (metadata.supportsTransactions()) { + // Update to at least use the default isolation, if any transaction level + // has been chosen and transactions are supported + val defaultIsolation = metadata.getDefaultTransactionIsolation + finalIsolationLevel = defaultIsolation + if (metadata.supportsTransactionIsolationLevel(isolationLevel)) { + // Finally update to actually requested level if possible + finalIsolationLevel = isolationLevel + } else { + logWarning(s"Requested isolation level $isolationLevel is not supported; " + + s"falling back to default isolation level $defaultIsolation") + } + } else { + logWarning(s"Requested isolation level $isolationLevel, but transactions are unsupported") + } + } catch { + case NonFatal(e) => logWarning("Exception while detecting transaction support", e) + } } + val supportsTransactions = finalIsolationLevel != Connection.TRANSACTION_NONE try { if (supportsTransactions) { conn.setAutoCommit(false) // Everything in the same db transaction. + conn.setTransactionIsolation(finalIsolationLevel) } val stmt = insertStatement(conn, table, rddSchema, dialect) try { @@ -285,9 +321,17 @@ object JdbcUtils extends Logging { val rddSchema = df.schema val getConnection: () => Connection = createConnectionFactory(url, properties) val batchSize = properties.getProperty(JDBC_BATCH_INSERT_SIZE, "1000").toInt - df.foreachPartition { iterator => - savePartition(getConnection, table, iterator, rddSchema, nullTypes, batchSize, dialect) - } + val isolationLevel = + properties.getProperty(JDBC_TXN_ISOLATION_LEVEL, "READ_UNCOMMITTED") match { + case "NONE" => Connection.TRANSACTION_NONE + case "READ_UNCOMMITTED" => Connection.TRANSACTION_READ_UNCOMMITTED + case "READ_COMMITTED" => Connection.TRANSACTION_READ_COMMITTED + case "REPEATABLE_READ" => Connection.TRANSACTION_REPEATABLE_READ + case "SERIALIZABLE" => Connection.TRANSACTION_SERIALIZABLE + } + df.foreachPartition(iterator => savePartition( + getConnection, table, iterator, rddSchema, nullTypes, batchSize, dialect, isolationLevel) + ) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 8b920ecafaee..23f4a55491d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -17,74 +17,174 @@ package org.apache.spark.sql.execution.datasources.json +import java.io.Writer + import com.fasterxml.jackson.core._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecializedGetters import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils, MapData} import org.apache.spark.sql.types._ -private[sql] object JacksonGenerator { - /** Transforms a single InternalRow to JSON using Jackson - * - * TODO: make the code shared with the other apply method. +private[sql] class JacksonGenerator(schema: StructType, writer: Writer) { + // A `ValueWriter` is responsible for writing a field of an `InternalRow` to appropriate + // JSON data. Here we are using `SpecializedGetters` rather than `InternalRow` so that + // we can directly access data in `ArrayData` without the help of `SpecificMutableRow`. + private type ValueWriter = (SpecializedGetters, Int) => Unit + + // `ValueWriter`s for all fields of the schema + private val rootFieldWriters: Array[ValueWriter] = schema.map(_.dataType).map(makeWriter).toArray + + private val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + + private def makeWriter(dataType: DataType): ValueWriter = dataType match { + case NullType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNull() + + case BooleanType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeBoolean(row.getBoolean(ordinal)) + + case ByteType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getByte(ordinal)) + + case ShortType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getShort(ordinal)) + + case IntegerType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getInt(ordinal)) + + case LongType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getLong(ordinal)) + + case FloatType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getFloat(ordinal)) + + case DoubleType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getDouble(ordinal)) + + case StringType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeString(row.getUTF8String(ordinal).toString) + + case TimestampType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeString(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)).toString) + + case DateType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeString(DateTimeUtils.toJavaDate(row.getInt(ordinal)).toString) + + case BinaryType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeBinary(row.getBinary(ordinal)) + + case dt: DecimalType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeNumber(row.getDecimal(ordinal, dt.precision, dt.scale).toJavaBigDecimal) + + case st: StructType => + val fieldWriters = st.map(_.dataType).map(makeWriter) + (row: SpecializedGetters, ordinal: Int) => + writeObject(writeFields(row.getStruct(ordinal, st.length), st, fieldWriters)) + + case at: ArrayType => + val elementWriter = makeWriter(at.elementType) + (row: SpecializedGetters, ordinal: Int) => + writeArray(writeArrayData(row.getArray(ordinal), elementWriter)) + + case mt: MapType => + val valueWriter = makeWriter(mt.valueType) + (row: SpecializedGetters, ordinal: Int) => + writeObject(writeMapData(row.getMap(ordinal), mt, valueWriter)) + + // For UDT values, they should be in the SQL type's corresponding value type. + // We should not see values in the user-defined class at here. + // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is + // an ArrayData at here, instead of a Vector. + case t: UserDefinedType[_] => + makeWriter(t.sqlType) + + case _ => + (row: SpecializedGetters, ordinal: Int) => + val v = row.get(ordinal, dataType) + sys.error(s"Failed to convert value $v (class of ${v.getClass}}) " + + s"with the type of $dataType to JSON.") + } + + private def writeObject(f: => Unit): Unit = { + gen.writeStartObject() + f + gen.writeEndObject() + } + + private def writeFields( + row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = { + var i = 0 + while (i < row.numFields) { + val field = schema(i) + if (!row.isNullAt(i)) { + gen.writeFieldName(field.name) + fieldWriters(i).apply(row, i) + } + i += 1 + } + } + + private def writeArray(f: => Unit): Unit = { + gen.writeStartArray() + f + gen.writeEndArray() + } + + private def writeArrayData( + array: ArrayData, fieldWriter: ValueWriter): Unit = { + var i = 0 + while (i < array.numElements()) { + if (!array.isNullAt(i)) { + fieldWriter.apply(array, i) + } else { + gen.writeNull() + } + i += 1 + } + } + + private def writeMapData( + map: MapData, mapType: MapType, fieldWriter: ValueWriter): Unit = { + val keyArray = map.keyArray() + val valueArray = map.valueArray() + var i = 0 + while (i < map.numElements()) { + gen.writeFieldName(keyArray.get(i, mapType.keyType).toString) + if (!valueArray.isNullAt(i)) { + fieldWriter.apply(valueArray, i) + } else { + gen.writeNull() + } + i += 1 + } + } + + def close(): Unit = gen.close() + + def flush(): Unit = gen.flush() + + /** + * Transforms a single InternalRow to JSON using Jackson * - * @param rowSchema the schema object used for conversion - * @param gen a JsonGenerator object * @param row The row to convert */ - def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = { - def valWriter: (DataType, Any) => Unit = { - case (_, null) | (NullType, _) => gen.writeNull() - case (StringType, v) => gen.writeString(v.toString) - case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString) - case (IntegerType, v: Int) => gen.writeNumber(v) - case (ShortType, v: Short) => gen.writeNumber(v) - case (FloatType, v: Float) => gen.writeNumber(v) - case (DoubleType, v: Double) => gen.writeNumber(v) - case (LongType, v: Long) => gen.writeNumber(v) - case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal) - case (ByteType, v: Byte) => gen.writeNumber(v.toInt) - case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) - case (BooleanType, v: Boolean) => gen.writeBoolean(v) - case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString) - // For UDT values, they should be in the SQL type's corresponding value type. - // We should not see values in the user-defined class at here. - // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is - // an ArrayData at here, instead of a Vector. - case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v) - - case (ArrayType(ty, _), v: ArrayData) => - gen.writeStartArray() - v.foreach(ty, (_, value) => valWriter(ty, value)) - gen.writeEndArray() - - case (MapType(kt, vt, _), v: MapData) => - gen.writeStartObject() - v.foreach(kt, vt, { (k, v) => - gen.writeFieldName(k.toString) - valWriter(vt, v) - }) - gen.writeEndObject() - - case (StructType(ty), v: InternalRow) => - gen.writeStartObject() - var i = 0 - while (i < ty.length) { - val field = ty(i) - val value = v.get(i, field.dataType) - if (value != null) { - gen.writeFieldName(field.name) - valWriter(field.dataType, value) - } - i += 1 - } - gen.writeEndObject() - - case (dt, v) => - sys.error( - s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.") + def write(row: InternalRow): Unit = { + writeObject { + writeFields(row, schema, rootFieldWriters) } - - valWriter(rowSchema, row) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 86aef1f7d441..adca8d7af0bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources.json import java.io.CharArrayWriter -import com.fasterxml.jackson.core.JsonFactory import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{LongWritable, NullWritable, Text} @@ -162,7 +161,7 @@ private[json] class JsonOutputWriter( private[this] val writer = new CharArrayWriter() // create the Generator without separator inserted between 2 records - private[this] val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + private[this] val gen = new JacksonGenerator(dataSchema, writer) private[this] val result = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { @@ -181,7 +180,7 @@ private[json] class JsonOutputWriter( override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") override protected[sql] def writeInternal(row: InternalRow): Unit = { - JacksonGenerator(dataSchema, gen)(row) + gen.write(row) gen.flush() result.set(writer.toString) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 76d7f5cbc3db..772e031ea77d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -780,8 +780,7 @@ private[sql] object ParquetFileFormat extends Logging { val assumeBinaryIsString = sparkSession.sessionState.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sparkSession.sessionState.conf.isParquetINT96AsTimestamp val writeLegacyParquetFormat = sparkSession.sessionState.conf.writeLegacyParquetFormat - val serializedConf = - new SerializableConfiguration(sparkSession.sessionState.newHadoopConf()) + val serializedConf = new SerializableConfiguration(sparkSession.sessionState.newHadoopConf()) // !! HACK ALERT !! // diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index e6ef63442128..8a2e0d7995bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -94,7 +94,8 @@ private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with new ParquetRecordMaterializer( parquetRequestedSchema, - ParquetReadSupport.expandUDT(catalystRequestedSchema)) + ParquetReadSupport.expandUDT(catalystRequestedSchema), + new ParquetSchemaConverter(conf)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala index 0818d802b077..d12e7805281a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala @@ -28,12 +28,14 @@ import org.apache.spark.sql.types.StructType * * @param parquetSchema Parquet schema of the records to be read * @param catalystSchema Catalyst schema of the rows to be constructed + * @param schemaConverter A Parquet-Catalyst schema converter that helps initializing row converters */ private[parquet] class ParquetRecordMaterializer( - parquetSchema: MessageType, catalystSchema: StructType) + parquetSchema: MessageType, catalystSchema: StructType, schemaConverter: ParquetSchemaConverter) extends RecordMaterializer[InternalRow] { - private val rootConverter = new ParquetRowConverter(parquetSchema, catalystSchema, NoopUpdater) + private val rootConverter = + new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, NoopUpdater) override def getCurrentRecord: InternalRow = rootConverter.currentRecord diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 9dad59647e0d..9ffc2b5dd8a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -25,7 +25,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.parquet.column.Dictionary import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} -import org.apache.parquet.schema.{GroupType, MessageType, PrimitiveType, Type} +import org.apache.parquet.schema.{GroupType, MessageType, Type} import org.apache.parquet.schema.OriginalType.{INT_32, LIST, UTF8} import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, DOUBLE, FIXED_LEN_BYTE_ARRAY, INT32, INT64} @@ -113,12 +113,14 @@ private[parquet] class ParquetPrimitiveConverter(val updater: ParentContainerUpd * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have * any "parent" container. * + * @param schemaConverter A utility converter used to convert Parquet types to Catalyst types. * @param parquetType Parquet schema of Parquet records * @param catalystType Spark SQL schema that corresponds to the Parquet record type. User-defined * types should have been expanded. * @param updater An updater which propagates converted field values to the parent container */ private[parquet] class ParquetRowConverter( + schemaConverter: ParquetSchemaConverter, parquetType: GroupType, catalystType: StructType, updater: ParentContainerUpdater) @@ -292,9 +294,10 @@ private[parquet] class ParquetRowConverter( new ParquetMapConverter(parquetType.asGroupType(), t, updater) case t: StructType => - new ParquetRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater { - override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) - }) + new ParquetRowConverter( + schemaConverter, parquetType.asGroupType(), t, new ParentContainerUpdater { + override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) + }) case t => throw new RuntimeException( @@ -442,13 +445,46 @@ private[parquet] class ParquetRowConverter( private val elementConverter: Converter = { val repeatedType = parquetSchema.getType(0) val elementType = catalystSchema.elementType - val parentName = parquetSchema.getName - if (isElementType(repeatedType, elementType, parentName)) { + // At this stage, we're not sure whether the repeated field maps to the element type or is + // just the syntactic repeated group of the 3-level standard LIST layout. Take the following + // Parquet LIST-annotated group type as an example: + // + // optional group f (LIST) { + // repeated group list { + // optional group element { + // optional int32 element; + // } + // } + // } + // + // This type is ambiguous: + // + // 1. When interpreted as a standard 3-level layout, the `list` field is just the syntactic + // group, and the entire type should be translated to: + // + // ARRAY> + // + // 2. On the other hand, when interpreted as a non-standard 2-level layout, the `list` field + // represents the element type, and the entire type should be translated to: + // + // ARRAY>> + // + // Here we try to convert field `list` into a Catalyst type to see whether the converted type + // matches the Catalyst array element type. If it doesn't match, then it's case 1; otherwise, + // it's case 2. + val guessedElementType = schemaConverter.convertField(repeatedType) + + if (DataType.equalsIgnoreCompatibleNullability(guessedElementType, elementType)) { + // If the repeated field corresponds to the element type, creates a new converter using the + // type of the repeated field. newConverter(repeatedType, elementType, new ParentContainerUpdater { override def set(value: Any): Unit = currentArray += value }) } else { + // If the repeated field corresponds to the syntactic group in the standard 3-level Parquet + // LIST layout, creates a new converter using the only child field of the repeated field. + assert(!repeatedType.isPrimitive && repeatedType.asGroupType().getFieldCount == 1) new ElementConverter(repeatedType.asGroupType().getType(0), elementType) } } @@ -462,37 +498,6 @@ private[parquet] class ParquetRowConverter( // in row cells. override def start(): Unit = currentArray = ArrayBuffer.empty[Any] - // scalastyle:off - /** - * Returns whether the given type is the element type of a list or is a syntactic group with - * one field that is the element type. This is determined by checking whether the type can be - * a syntactic group and by checking whether a potential syntactic group matches the expected - * schema. - * {{{ - * group (LIST) { - * repeated group list { <-- repeatedType points here - * element; - * } - * } - * }}} - * In short, here we handle Parquet list backwards-compatibility rules on the read path. This - * method is based on `AvroIndexedRecordConverter.isElementType`. - * - * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules - */ - // scalastyle:on - private def isElementType( - parquetRepeatedType: Type, catalystElementType: DataType, parentName: String): Boolean = { - (parquetRepeatedType, catalystElementType) match { - case (t: PrimitiveType, _) => true - case (t: GroupType, _) if t.getFieldCount > 1 => true - case (t: GroupType, _) if t.getFieldCount == 1 && t.getName == "array" => true - case (t: GroupType, _) if t.getFieldCount == 1 && t.getName == parentName + "_tuple" => true - case (t: GroupType, StructType(Array(f))) if f.name == t.getFieldName(0) => true - case _ => false - } - } - /** Array element converter */ private final class ElementConverter(parquetType: Type, catalystType: DataType) extends GroupConverter { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala index bcf535d45521..c81a65f4973e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala @@ -260,7 +260,7 @@ private[parquet] class ParquetSchemaConverter( { // For legacy 2-level list types with primitive element type, e.g.: // - // // List (nullable list, non-null elements) + // // ARRAY (nullable list, non-null elements) // optional group my_list (LIST) { // repeated int32 element; // } @@ -270,7 +270,7 @@ private[parquet] class ParquetSchemaConverter( // For legacy 2-level list types whose element type is a group type with 2 or more fields, // e.g.: // - // // List> (nullable list, non-null elements) + // // ARRAY> (nullable list, non-null elements) // optional group my_list (LIST) { // repeated group element { // required binary str (UTF8); @@ -282,7 +282,7 @@ private[parquet] class ParquetSchemaConverter( } || { // For legacy 2-level list types generated by parquet-avro (Parquet version < 1.6.0), e.g.: // - // // List> (nullable list, non-null elements) + // // ARRAY> (nullable list, non-null elements) // optional group my_list (LIST) { // repeated group array { // required binary str (UTF8); @@ -293,7 +293,7 @@ private[parquet] class ParquetSchemaConverter( } || { // For Parquet data generated by parquet-thrift, e.g.: // - // // List> (nullable list, non-null elements) + // // ARRAY> (nullable list, non-null elements) // optional group my_list (LIST) { // repeated group my_list_tuple { // required binary str (UTF8); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index edfdf7cd6b7f..9817a56f499a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -101,7 +101,9 @@ private[sql] object SQLMetrics { */ def stringValue(metricsType: String, values: Seq[Long]): String = { if (metricsType == SUM_METRIC) { - NumberFormat.getInstance().format(values.sum) + val numberFormat = NumberFormat.getInstance() + numberFormat.setGroupingUsed(false) + numberFormat.format(values.sum) } else { val strFormat: Long => String = if (metricsType == SIZE_METRIC) { Utils.bytesToString diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 45d5d05d9f3f..93af8456c4f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -978,6 +978,7 @@ object functions { * @group normal_funcs * @since 1.4.0 */ + @deprecated("Use monotonically_increasing_id()", "2.0.0") def monotonicallyIncreasingId(): Column = monotonically_increasing_id() /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 1ae9b5524cec..05dfb8cb22b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -153,7 +153,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { private def listColumns(tableIdentifier: TableIdentifier): Dataset[Column] = { val tableMetadata = sessionCatalog.getTableMetadata(tableIdentifier) val partitionColumnNames = tableMetadata.partitionColumnNames.toSet - val bucketColumnNames = tableMetadata.bucketColumnNames.toSet + val bucketColumnNames = tableMetadata.bucketSpec.map(_.bucketColumnNames).getOrElse(Nil).toSet val columns = tableMetadata.schema.map { c => new Column( name = c.name, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 14a1680fafa3..12a11ad74621 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -677,9 +677,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def variableSubstituteDepth: Int = getConf(VARIABLE_SUBSTITUTE_DEPTH) - def warehousePath: String = { - getConf(WAREHOUSE_PATH).replace("${system:user.dir}", System.getProperty("user.dir")) - } + def warehousePath: String = getConf(WAREHOUSE_PATH) override def orderByOrdinal: Boolean = getConf(ORDER_BY_ORDINAL) @@ -738,8 +736,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { */ def getConf[T](entry: ConfigEntry[T]): T = { require(sqlConfEntries.get(entry.key) == entry, s"$entry is not registered") - Option(settings.get(entry.key)).map(entry.valueConverter).orElse(entry.defaultValue). - getOrElse(throw new NoSuchElementException(entry.key)) + entry.readFrom(settings, System.getenv) } /** @@ -748,7 +745,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { */ def getConf[T](entry: OptionalConfigEntry[T]): Option[T] = { require(sqlConfEntries.get(entry.key) == entry, s"$entry is not registered") - Option(settings.get(entry.key)).map(entry.rawValueConverter) + entry.readFrom(settings, System.getenv) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala index f12b6ca9d6ad..190463df0d92 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala @@ -28,4 +28,6 @@ private object DB2Dialect extends JdbcDialect { case BooleanType => Option(JdbcType("CHAR(1)", java.sql.Types.CHAR)) case _ => None } + + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 948106fd062a..78107809a1cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -108,6 +108,13 @@ abstract class JdbcDialect extends Serializable { def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = { } + /** + * Return Some[true] iff `TRUNCATE TABLE` causes cascading default. + * Some[true] : TRUNCATE TABLE causes cascading. + * Some[false] : TRUNCATE TABLE does not cause cascading. + * None: The behavior of TRUNCATE TABLE is unknown (default). + */ + def isCascadingTruncateTable(): Option[Boolean] = None } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala index 3eb722b070d5..70122f259914 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala @@ -38,4 +38,6 @@ private object MsSqlServerDialect extends JdbcDialect { case TimestampType => Some(JdbcType("DATETIME", java.sql.Types.TIMESTAMP)) case _ => None } + + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala index e1717049f383..b2cff7877d8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala @@ -44,4 +44,6 @@ private case object MySQLDialect extends JdbcDialect { override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } + + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index b795e8b42df0..ce8731efd166 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -53,4 +53,6 @@ private case object OracleDialect extends JdbcDialect { case StringType => Some(JdbcType("VARCHAR2(255)", java.sql.Types.VARCHAR)) case _ => None } + + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index 6baf1b6f16cd..fb959d881e9d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -94,4 +94,6 @@ private object PostgresDialect extends JdbcDialect { } } + + override def isCascadingTruncateTable(): Option[Boolean] = Some(true) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 6f6abfa93c1d..f42402e1cc7d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -73,7 +73,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext } test("cache temp table") { - withTempTable("tempTable") { + withTempView("tempTable") { testData.select('key).createOrReplaceTempView("tempTable") assertCached(sql("SELECT COUNT(*) FROM tempTable"), 0) spark.catalog.cacheTable("tempTable") @@ -97,7 +97,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext } test("cache table as select") { - withTempTable("tempTable") { + withTempView("tempTable") { sql("CACHE TABLE tempTable AS SELECT key FROM testData") assertCached(sql("SELECT COUNT(*) FROM tempTable")) spark.catalog.uncacheTable("tempTable") @@ -227,7 +227,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext } test("CACHE TABLE tableName AS SELECT * FROM anotherTable") { - withTempTable("testCacheTable") { + withTempView("testCacheTable") { sql("CACHE TABLE testCacheTable AS SELECT * FROM testData") assertCached(spark.table("testCacheTable")) @@ -244,7 +244,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext } test("CACHE TABLE tableName AS SELECT ...") { - withTempTable("testCacheTable") { + withTempView("testCacheTable") { sql("CACHE TABLE testCacheTable AS SELECT key FROM testData LIMIT 10") assertCached(spark.table("testCacheTable")) @@ -413,7 +413,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext // Set up two tables distributed in the same way. Try this with the data distributed into // different number of partitions. for (numPartitions <- 1 until 10 by 4) { - withTempTable("t1", "t2") { + withTempView("t1", "t2") { testData.repartition(numPartitions, $"key").createOrReplaceTempView("t1") testData2.repartition(numPartitions, $"a").createOrReplaceTempView("t2") spark.catalog.cacheTable("t1") @@ -435,7 +435,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext } // Distribute the tables into non-matching number of partitions. Need to shuffle one side. - withTempTable("t1", "t2") { + withTempView("t1", "t2") { testData.repartition(6, $"key").createOrReplaceTempView("t1") testData2.repartition(3, $"a").createOrReplaceTempView("t2") spark.catalog.cacheTable("t1") @@ -452,7 +452,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext } // One side of join is not partitioned in the desired way. Need to shuffle one side. - withTempTable("t1", "t2") { + withTempView("t1", "t2") { testData.repartition(6, $"value").createOrReplaceTempView("t1") testData2.repartition(6, $"a").createOrReplaceTempView("t2") spark.catalog.cacheTable("t1") @@ -468,7 +468,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext spark.catalog.uncacheTable("t2") } - withTempTable("t1", "t2") { + withTempView("t1", "t2") { testData.repartition(6, $"value").createOrReplaceTempView("t1") testData2.repartition(12, $"a").createOrReplaceTempView("t2") spark.catalog.cacheTable("t1") @@ -487,7 +487,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext // One side of join is not partitioned in the desired way. Since the number of partitions of // the side that has already partitioned is smaller than the side that is not partitioned, // we shuffle both side. - withTempTable("t1", "t2") { + withTempView("t1", "t2") { testData.repartition(6, $"value").createOrReplaceTempView("t1") testData2.repartition(3, $"a").createOrReplaceTempView("t2") spark.catalog.cacheTable("t1") @@ -504,7 +504,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext // repartition's column ordering is different from group by column ordering. // But they use the same set of columns. - withTempTable("t1") { + withTempView("t1") { testData.repartition(6, $"value", $"key").createOrReplaceTempView("t1") spark.catalog.cacheTable("t1") @@ -520,7 +520,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext // We will still shuffle because hashcodes of a row depend on the column ordering. // If we do not shuffle, we may actually partition two tables in totally two different way. // See PartitioningSuite for more details. - withTempTable("t1", "t2") { + withTempView("t1", "t2") { val df1 = testData df1.repartition(6, $"value", $"key").createOrReplaceTempView("t1") val df2 = testData2.select($"a", $"b".cast("string")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index a170fae577c1..26e1a9f75da1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -508,18 +508,17 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { Row("ab", "cde")) } - test("monotonicallyIncreasingId") { + test("monotonically_increasing_id") { // Make sure we have 2 partitions, each with 2 records. val df = sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ => Iterator(Tuple1(1), Tuple1(2)) }.toDF("a") checkAnswer( - df.select(monotonicallyIncreasingId()), - Row(0L) :: Row(1L) :: Row((1L << 33) + 0L) :: Row((1L << 33) + 1L) :: Nil - ) - checkAnswer( - df.select(expr("monotonically_increasing_id()")), - Row(0L) :: Row(1L) :: Row((1L << 33) + 0L) :: Row((1L << 33) + 1L) :: Nil + df.select(monotonically_increasing_id(), expr("monotonically_increasing_id()")), + Row(0L, 0L) :: + Row(1L, 1L) :: + Row((1L << 33) + 0L, (1L << 33) + 0L) :: + Row((1L << 33) + 1L, (1L << 33) + 1L) :: Nil ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala index a15b4e1221d3..4296ec543e27 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala @@ -29,16 +29,6 @@ class DataFrameTimeWindowingSuite extends QueryTest with SharedSQLContext with B import testImplicits._ - override def beforeEach(): Unit = { - super.beforeEach() - TimeZone.setDefault(TimeZone.getTimeZone("UTC")) - } - - override def afterEach(): Unit = { - super.beforeEach() - TimeZone.setDefault(null) - } - test("tumbling window groupBy statement") { val df = Seq( ("2016-03-27 19:39:34", 1, "a"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MetadataCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MetadataCacheSuite.scala index 3f8cc8164d04..eacf254cd183 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MetadataCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MetadataCacheSuite.scala @@ -60,7 +60,7 @@ class MetadataCacheSuite extends QueryTest with SharedSQLContext { } test("SPARK-16337 temporary view refresh") { - withTempTable("view_refresh") { withTempPath { (location: File) => + withTempView("view_refresh") { withTempPath { (location: File) => // Create a Parquet directory spark.range(start = 0, end = 100, step = 1, numPartitions = 3) .write.parquet(location.getAbsolutePath) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index eeaa0103a08e..06cc2a505758 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -805,7 +805,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("count of empty table") { - withTempTable("t") { + withTempView("t") { Seq.empty[(Int, Int)].toDF("a", "b").createOrReplaceTempView("t") checkAnswer( sql("select count(a) from t"), @@ -1671,7 +1671,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-7952: fix the equality check between boolean and numeric types") { - withTempTable("t") { + withTempView("t") { // numeric field i, boolean field j, result of i = j, result of i <=> j Seq[(Integer, java.lang.Boolean, java.lang.Boolean, java.lang.Boolean)]( (1, true, true, true), @@ -1691,7 +1691,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-7067: order by queries for complex ExtractValue chain") { - withTempTable("t") { + withTempView("t") { spark.read.json(sparkContext.makeRDD( """{"a": {"b": [{"c": 1}]}, "b": [{"d": 1}]}""" :: Nil)).createOrReplaceTempView("t") checkAnswer(sql("SELECT a.b FROM t ORDER BY b[0].d"), Row(Seq(Row(1)))) @@ -1699,14 +1699,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-8782: ORDER BY NULL") { - withTempTable("t") { + withTempView("t") { Seq((1, 2), (1, 2)).toDF("a", "b").createOrReplaceTempView("t") checkAnswer(sql("SELECT * FROM t ORDER BY NULL"), Seq(Row(1, 2), Row(1, 2))) } } test("SPARK-8837: use keyword in column name") { - withTempTable("t") { + withTempView("t") { val df = Seq(1 -> "a").toDF("count", "sort") checkAnswer(df.filter("count > 0"), Row(1, "a")) df.createOrReplaceTempView("t") @@ -1820,7 +1820,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-9511: error with table starting with number") { - withTempTable("1one") { + withTempView("1one") { sparkContext.parallelize(1 to 10).map(i => (i, i.toString)) .toDF("num", "str") .createOrReplaceTempView("1one") @@ -1864,7 +1864,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-10130 type coercion for IF should have children resolved first") { - withTempTable("src") { + withTempView("src") { Seq((1, 1), (-1, 1)).toDF("key", "value").createOrReplaceTempView("src") checkAnswer( sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) @@ -1872,7 +1872,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-10389: order by non-attribute grouping expression on Aggregate") { - withTempTable("src") { + withTempView("src") { Seq((1, 1), (-1, 1)).toDF("key", "value").createOrReplaceTempView("src") checkAnswer(sql("SELECT MAX(value) FROM src GROUP BY key + 1 ORDER BY key + 1"), Seq(Row(1), Row(1))) @@ -1976,7 +1976,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-11032: resolve having correctly") { - withTempTable("src") { + withTempView("src") { Seq(1 -> "a").toDF("i", "j").createOrReplaceTempView("src") checkAnswer( sql("SELECT MIN(t.i) FROM (SELECT * FROM src WHERE i > 0) t HAVING(COUNT(1) > 0)"), @@ -2081,7 +2081,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) // Try with a temporary view - withTempTable("nestedStructTable") { + withTempView("nestedStructTable") { nestedStructData.createOrReplaceTempView("nestedStructTable") checkAnswer( sql("SELECT record.* FROM nestedStructTable"), @@ -2104,7 +2104,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { | SELECT struct(`col$.a_`, `a.b.c.`) as `r&&b.c` FROM | (SELECT struct(a, b) as `col$.a_`, struct(b, a) as `a.b.c.` FROM testData2) tmp """.stripMargin) - withTempTable("specialCharacterTable") { + withTempView("specialCharacterTable") { specialCharacterPath.createOrReplaceTempView("specialCharacterTable") checkAnswer( specialCharacterPath.select($"`r&&b.c`.*"), @@ -2128,7 +2128,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("Struct Star Expansion - Name conflict") { // Create a data set that contains a naming conflict val nameConflict = sql("SELECT struct(a, b) as nameConflict, a as a FROM testData2") - withTempTable("nameConflict") { + withTempView("nameConflict") { nameConflict.createOrReplaceTempView("nameConflict") // Unqualified should resolve to table. checkAnswer(sql("SELECT nameConflict.* FROM nameConflict"), @@ -2149,7 +2149,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("Star Expansion - table with zero column") { - withTempTable("temp_table_no_cols") { + withTempView("temp_table_no_cols") { val rddNoCols = sparkContext.parallelize(1 to 10).map(_ => Row.empty) val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty)) dfNoCols.createTempView("temp_table_no_cols") @@ -2464,7 +2464,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-13056: Null in map value causes NPE") { val df = Seq(1 -> Map("abc" -> "somestring", "cba" -> null)).toDF("key", "value") - withTempTable("maptest") { + withTempView("maptest") { df.createOrReplaceTempView("maptest") // local optimization will by pass codegen code, so we should keep the filter `key=1` checkAnswer(sql("SELECT value['abc'] FROM maptest where key = 1"), Row("somestring")) @@ -2474,7 +2474,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("hash function") { val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j") - withTempTable("tbl") { + withTempView("tbl") { df.createOrReplaceTempView("tbl") checkAnswer( df.select(hash($"i", $"j")), @@ -2526,7 +2526,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("natural join") { val df1 = Seq(("one", 1), ("two", 2), ("three", 3)).toDF("k", "v1") val df2 = Seq(("one", 1), ("two", 22), ("one", 5)).toDF("k", "v2") - withTempTable("nt1", "nt2") { + withTempView("nt1", "nt2") { df1.createOrReplaceTempView("nt1") df2.createOrReplaceTempView("nt2") checkAnswer( @@ -2554,7 +2554,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { ("r2c1", "r2c2", "t2r2c3"), ("r3c1y", "r3c2", "t2r3c3")).toDF("c1", "c2", "c3") val df3 = Seq((null, "r1c2", "t3r1c3"), ("r2c1", "r2c2", "t3r2c3"), ("r3c1y", "r3c2", "t3r3c3")).toDF("c1", "c2", "c3") - withTempTable("t1", "t2", "t3") { + withTempView("t1", "t2", "t3") { df1.createOrReplaceTempView("t1") df2.createOrReplaceTempView("t2") df3.createOrReplaceTempView("t3") @@ -2965,4 +2965,36 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-16644: Aggregate should not put aggregate expressions to constraints") { + withTable("tbl") { + sql("CREATE TABLE tbl(a INT, b INT) USING parquet") + checkAnswer(sql( + """ + |SELECT + | a, + | MAX(b) AS c1, + | b AS c2 + |FROM tbl + |WHERE a = b + |GROUP BY a, b + |HAVING c1 = 1 + """.stripMargin), Nil) + } + } + + test("SPARK-16674: field names containing dots for both fields and partitioned fields") { + withTempPath { path => + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) + .toDF("col.1", "col.2", "part.col1", "part.col2") + data.write + .format("parquet") + .partitionBy("part.col1", "part.col2") + .save(path.getCanonicalPath) + val readBack = spark.read.format("parquet").load(path.getCanonicalPath) + checkAnswer( + readBack.selectExpr("`part.col1`", "`col.1`"), + data.selectExpr("`part.col1`", "`col.1`")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala index ab55242ec068..2c81cbf15f08 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala @@ -34,7 +34,7 @@ class StatisticsSuite extends QueryTest with SharedSQLContext { } test("estimates the size of limit") { - withTempTable("test") { + withTempView("test") { Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") .createOrReplaceTempView("test") Seq((0, 1), (1, 24), (2, 48)).foreach { case (limit, expected) => @@ -58,7 +58,7 @@ class StatisticsSuite extends QueryTest with SharedSQLContext { } test("estimates the size of a limit 0 on outer join") { - withTempTable("test") { + withTempView("test") { Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") .createOrReplaceTempView("test") val df1 = spark.table("test") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index f509551b1ed8..524926e1e9b6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -384,4 +384,27 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { }.getMessage assert(m.contains("Invalid number of arguments for function sentences")) } + + test("str_to_map function") { + val df1 = Seq( + ("a=1,b=2", "y"), + ("a=1,b=2,c=3", "y") + ).toDF("a", "b") + + checkAnswer( + df1.selectExpr("str_to_map(a,',','=')"), + Seq( + Row(Map("a" -> "1", "b" -> "2")), + Row(Map("a" -> "1", "b" -> "2", "c" -> "3")) + ) + ) + + val df2 = Seq(("a:1,b:2,c:3", "y")).toDF("a", "b") + + checkAnswer( + df2.selectExpr("str_to_map(a)"), + Seq(Row(Map("a" -> "1", "b" -> "2", "c" -> "3"))) + ) + + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 1d9ff21dbf5d..afed342ff8e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -128,7 +128,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { } test("SPARK-15677: Queries against local relations with scalar subquery in Select list") { - withTempTable("t1", "t2") { + withTempView("t1", "t2") { Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t1") Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t2") @@ -267,7 +267,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { } test("SPARK-15832: Test embedded existential predicate sub-queries") { - withTempTable("t1", "t2", "t3", "t4", "t5") { + withTempView("t1", "t2", "t3", "t4", "t5") { Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t1") Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t2") Seq((1, 1), (2, 2), (1, 2)).toDF("c1", "c2").createOrReplaceTempView("t3") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index c96239e68201..13490c35679a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -71,7 +71,7 @@ class PlannerSuite extends SharedSQLContext { test("sizeInBytes estimation of limit operator for broadcast hash join optimization") { def checkPlan(fieldTypes: Seq[DataType]): Unit = { - withTempTable("testLimit") { + withTempView("testLimit") { val fields = fieldTypes.zipWithIndex.map { case (dataType, index) => StructField(s"c${index}", dataType, true) } :+ StructField("key", IntegerType, true) @@ -131,7 +131,7 @@ class PlannerSuite extends SharedSQLContext { test("InMemoryRelation statistics propagation") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "81920") { - withTempTable("tiny") { + withTempView("tiny") { testData.limit(3).createOrReplaceTempView("tiny") sql("CACHE TABLE tiny") @@ -157,7 +157,7 @@ class PlannerSuite extends SharedSQLContext { val df = spark.read.parquet(path) df.createOrReplaceTempView("testPushed") - withTempTable("testPushed") { + withTempView("testPushed") { val exp = sql("select * from testPushed where key = 15").queryExecution.sparkPlan assert(exp.toString.contains("PushedFilters: [IsNotNull(key), EqualTo(key,15)]")) } @@ -198,7 +198,7 @@ class PlannerSuite extends SharedSQLContext { } test("PartitioningCollection") { - withTempTable("normal", "small", "tiny") { + withTempView("normal", "small", "tiny") { testData.createOrReplaceTempView("normal") testData.limit(10).createOrReplaceTempView("small") testData.limit(3).createOrReplaceTempView("tiny") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index b170a3a77ee0..999afc9751fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -20,15 +20,15 @@ package org.apache.spark.sql.execution.command import scala.reflect.{classTag, ClassTag} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, FunctionResource} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTableType, FunctionResource} import org.apache.spark.sql.catalyst.catalog.FunctionResourceType import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser -import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} +import org.apache.spark.sql.execution.datasources.CreateTableUsing import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} -import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType, StructType} +import org.apache.spark.sql.types.{IntegerType, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 169250d9bb1c..a354594a6d17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -26,12 +26,12 @@ import org.apache.spark.internal.config._ import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{DatabaseAlreadyExistsException, FunctionRegistry, NoSuchPartitionException, NoSuchTableException, TempTableAlreadyExistsException} -import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogStorageFormat} import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.catalog.{CatalogTablePartition, SessionCatalog} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ -import org.apache.spark.sql.execution.datasources.BucketSpec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StructType} @@ -84,7 +84,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { outputFormat = None, serde = None, compressed = false, - serdeProperties = Map()) + properties = Map()) CatalogTable( identifier = name, tableType = CatalogTableType.EXTERNAL, @@ -497,7 +497,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } test("rename temporary table - destination table with database name") { - withTempTable("tab1") { + withTempView("tab1") { sql( """ |CREATE TEMPORARY TABLE tab1 @@ -522,7 +522,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } test("rename temporary table - destination table already exists") { - withTempTable("tab1", "tab2") { + withTempView("tab1", "tab2") { sql( """ |CREATE TEMPORARY TABLE tab1 @@ -677,7 +677,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } test("show tables") { - withTempTable("show1a", "show2b") { + withTempView("show1a", "show2b") { sql( """ |CREATE TEMPORARY TABLE show1a @@ -891,9 +891,9 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { convertToDatasourceTable(catalog, tableIdent) } assert(catalog.getTableMetadata(tableIdent).storage.locationUri.isDefined) - assert(catalog.getTableMetadata(tableIdent).storage.serdeProperties.isEmpty) + assert(catalog.getTableMetadata(tableIdent).storage.properties.isEmpty) assert(catalog.getPartition(tableIdent, partSpec).storage.locationUri.isEmpty) - assert(catalog.getPartition(tableIdent, partSpec).storage.serdeProperties.isEmpty) + assert(catalog.getPartition(tableIdent, partSpec).storage.properties.isEmpty) // Verify that the location is set to the expected string def verifyLocation(expected: String, spec: Option[TablePartitionSpec] = None): Unit = { val storageFormat = spec @@ -901,10 +901,10 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { .getOrElse { catalog.getTableMetadata(tableIdent).storage } if (isDatasourceTable) { if (spec.isDefined) { - assert(storageFormat.serdeProperties.isEmpty) + assert(storageFormat.properties.isEmpty) assert(storageFormat.locationUri.isEmpty) } else { - assert(storageFormat.serdeProperties.get("path") === Some(expected)) + assert(storageFormat.properties.get("path") === Some(expected)) assert(storageFormat.locationUri === Some(expected)) } } else { @@ -947,7 +947,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { convertToDatasourceTable(catalog, tableIdent) } assert(catalog.getTableMetadata(tableIdent).storage.serde.isEmpty) - assert(catalog.getTableMetadata(tableIdent).storage.serdeProperties.isEmpty) + assert(catalog.getTableMetadata(tableIdent).storage.properties.isEmpty) // set table serde and/or properties (should fail on datasource tables) if (isDatasourceTable) { val e1 = intercept[AnalysisException] { @@ -962,21 +962,21 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } else { sql("ALTER TABLE dbx.tab1 SET SERDE 'org.apache.jadoop'") assert(catalog.getTableMetadata(tableIdent).storage.serde == Some("org.apache.jadoop")) - assert(catalog.getTableMetadata(tableIdent).storage.serdeProperties.isEmpty) + assert(catalog.getTableMetadata(tableIdent).storage.properties.isEmpty) sql("ALTER TABLE dbx.tab1 SET SERDE 'org.apache.madoop' " + "WITH SERDEPROPERTIES ('k' = 'v', 'kay' = 'vee')") assert(catalog.getTableMetadata(tableIdent).storage.serde == Some("org.apache.madoop")) - assert(catalog.getTableMetadata(tableIdent).storage.serdeProperties == + assert(catalog.getTableMetadata(tableIdent).storage.properties == Map("k" -> "v", "kay" -> "vee")) } // set serde properties only sql("ALTER TABLE dbx.tab1 SET SERDEPROPERTIES ('k' = 'vvv', 'kay' = 'vee')") - assert(catalog.getTableMetadata(tableIdent).storage.serdeProperties == + assert(catalog.getTableMetadata(tableIdent).storage.properties == Map("k" -> "vvv", "kay" -> "vee")) // set things without explicitly specifying database catalog.setCurrentDatabase("dbx") sql("ALTER TABLE tab1 SET SERDEPROPERTIES ('kay' = 'veee')") - assert(catalog.getTableMetadata(tableIdent).storage.serdeProperties == + assert(catalog.getTableMetadata(tableIdent).storage.properties == Map("k" -> "vvv", "kay" -> "veee")) // table to alter does not exist intercept[AnalysisException] { @@ -1003,7 +1003,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { convertToDatasourceTable(catalog, tableIdent) } assert(catalog.getPartition(tableIdent, spec).storage.serde.isEmpty) - assert(catalog.getPartition(tableIdent, spec).storage.serdeProperties.isEmpty) + assert(catalog.getPartition(tableIdent, spec).storage.properties.isEmpty) // set table serde and/or properties (should fail on datasource tables) if (isDatasourceTable) { val e1 = intercept[AnalysisException] { @@ -1018,25 +1018,25 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } else { sql("ALTER TABLE dbx.tab1 PARTITION (a=1, b=2) SET SERDE 'org.apache.jadoop'") assert(catalog.getPartition(tableIdent, spec).storage.serde == Some("org.apache.jadoop")) - assert(catalog.getPartition(tableIdent, spec).storage.serdeProperties.isEmpty) + assert(catalog.getPartition(tableIdent, spec).storage.properties.isEmpty) sql("ALTER TABLE dbx.tab1 PARTITION (a=1, b=2) SET SERDE 'org.apache.madoop' " + "WITH SERDEPROPERTIES ('k' = 'v', 'kay' = 'vee')") assert(catalog.getPartition(tableIdent, spec).storage.serde == Some("org.apache.madoop")) - assert(catalog.getPartition(tableIdent, spec).storage.serdeProperties == + assert(catalog.getPartition(tableIdent, spec).storage.properties == Map("k" -> "v", "kay" -> "vee")) } // set serde properties only maybeWrapException(isDatasourceTable) { sql("ALTER TABLE dbx.tab1 PARTITION (a=1, b=2) " + "SET SERDEPROPERTIES ('k' = 'vvv', 'kay' = 'vee')") - assert(catalog.getPartition(tableIdent, spec).storage.serdeProperties == + assert(catalog.getPartition(tableIdent, spec).storage.properties == Map("k" -> "vvv", "kay" -> "vee")) } // set things without explicitly specifying database catalog.setCurrentDatabase("dbx") maybeWrapException(isDatasourceTable) { sql("ALTER TABLE tab1 PARTITION (a=1, b=2) SET SERDEPROPERTIES ('kay' = 'veee')") - assert(catalog.getPartition(tableIdent, spec).storage.serdeProperties == + assert(catalog.getPartition(tableIdent, spec).storage.properties == Map("k" -> "vvv", "kay" -> "veee")) } // table to alter does not exist @@ -1264,6 +1264,29 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } + test("create table using CLUSTERED BY without schema specification") { + import testImplicits._ + withTempPath { tempDir => + withTable("jsonTable") { + (("a", "b") :: Nil).toDF().write.json(tempDir.getCanonicalPath) + + val e = intercept[ParseException] { + sql( + s""" + |CREATE TABLE jsonTable + |USING org.apache.spark.sql.json + |OPTIONS ( + | path '${tempDir.getCanonicalPath}' + |) + |CLUSTERED BY (inexistentColumnA) SORTED BY (inexistentColumnB) INTO 2 BUCKETS + """.stripMargin) + }.getMessage + assert(e.contains( + "Expected explicit specification of table schema when using CLUSTERED BY clause")) + } + } + } + test("create table with datasource properties (not allowed)") { assertUnsupported("CREATE TABLE my_tab TBLPROPERTIES ('spark.sql.sources.me'='anything')") assertUnsupported("CREATE TABLE my_tab ROW FORMAT SERDE 'serde' " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index 8d8a18fa9332..ddcc24a7f56b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -27,6 +27,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.spark.SparkConf import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionSet, PredicateHelper} import org.apache.spark.sql.catalyst.util import org.apache.spark.sql.execution.DataSourceScanExec diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 6c72019702c3..177fc04b02e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -21,10 +21,7 @@ import java.io.{File, StringWriter} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} -import scala.collection.JavaConverters._ - import com.fasterxml.jackson.core.JsonFactory -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec @@ -1082,7 +1079,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { test("Corrupt records: PERMISSIVE mode") { // Test if we can query corrupt records. withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { - withTempTable("jsonTable") { + withTempView("jsonTable") { val jsonDF = spark.read.json(corruptRecords) jsonDF.createOrReplaceTempView("jsonTable") val schema = StructType( @@ -1518,7 +1515,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { test("SPARK-12057 additional corrupt records do not throw exceptions") { // Test if we can query corrupt records. withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { - withTempTable("jsonTable") { + withTempView("jsonTable") { val schema = StructType( StructField("_unparsed", StringType, true) :: StructField("dummy", StringType, true) :: Nil) @@ -1635,7 +1632,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Casting long as timestamp") { - withTempTable("jsonTable") { + withTempView("jsonTable") { val schema = (new StructType).add("ts", TimestampType) val jsonDF = spark.read.schema(schema).json(timestampAsLong) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 88fcfce0ec1b..c7541889f202 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -16,6 +16,10 @@ */ package org.apache.spark.sql.execution.datasources.parquet +import scala.collection.JavaConverters._ + +import org.apache.parquet.hadoop.ParquetOutputFormat + import org.apache.spark.sql.test.SharedSQLContext // TODO: this needs a lot more testing but it's currently not easy to test with the parquet @@ -78,4 +82,29 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSQLContex }} } } + + test("Read row group containing both dictionary and plain encoded pages") { + withSQLConf(ParquetOutputFormat.DICTIONARY_PAGE_SIZE -> "2048", + ParquetOutputFormat.PAGE_SIZE -> "4096") { + withTempPath { dir => + // In order to explicitly test for SPARK-14217, we set the parquet dictionary and page size + // such that the following data spans across 3 pages (within a single row group) where the + // first page is dictionary encoded and the remaining two are plain encoded. + val data = (0 until 512).flatMap(i => Seq.fill(3)(i.toString)) + data.toDF("f").coalesce(1).write.parquet(dir.getCanonicalPath) + val file = SpecificParquetRecordReaderBase.listDirectory(dir).asScala.head + + val reader = new VectorizedParquetRecordReader + reader.initialize(file, null /* set columns to null to project all columns */) + val column = reader.resultBatch().column(0) + assert(reader.nextBatch()) + + (0 until 512).foreach { i => + assert(column.getUTF8String(3 * i).toString == i.toString) + assert(column.getUTF8String(3 * i + 1).toString == i.toString) + assert(column.getUTF8String(3 * i + 2).toString == i.toString) + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 133ffedf1281..8d18be9300f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -404,7 +404,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha spark.read.parquet(base.getCanonicalPath).createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer( sql("SELECT * FROM t"), for { @@ -488,7 +488,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha spark.read.parquet(base.getCanonicalPath).createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer( sql("SELECT * FROM t"), for { @@ -537,7 +537,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha val parquetRelation = spark.read.format("parquet").load(base.getCanonicalPath) parquetRelation.createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer( sql("SELECT * FROM t"), for { @@ -577,7 +577,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha val parquetRelation = spark.read.format("parquet").load(base.getCanonicalPath) parquetRelation.createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer( sql("SELECT * FROM t"), for { @@ -613,7 +613,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha .load(base.getCanonicalPath) .createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer( sql("SELECT * FROM t"), (1 to 10).map(i => Row(i, null, 1)) ++ (1 to 10).map(i => Row(i, i.toString, 2))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 7c394e0b0ccb..7e83bcbb6e24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow import org.apache.spark.sql.execution.BatchedDataSourceScanExec -import org.apache.spark.sql.execution.datasources.parquet.TestingUDT.{NestedStruct, NestedStructUDT} +import org.apache.spark.sql.execution.datasources.parquet.TestingUDT.{NestedStruct, NestedStructUDT, SingleElement} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -668,9 +668,47 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-16344: array of struct with a single field named 'element'") { + withTempPath { dir => + val path = dir.getCanonicalPath + Seq(Tuple1(Array(SingleElement(42)))).toDF("f").write.parquet(path) + + checkAnswer( + sqlContext.read.parquet(path), + Row(Array(Row(42))) + ) + } + } + + test("SPARK-16632: read Parquet int32 as ByteType and ShortType") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { + withTempPath { dir => + val path = dir.getCanonicalPath + + // When being written to Parquet, `TINYINT` and `SMALLINT` should be converted into + // `int32 (INT_8)` and `int32 (INT_16)` respectively. However, Hive doesn't add the `INT_8` + // and `INT_16` annotation properly (HIVE-14294). Thus, when reading files written by Hive + // using Spark with the vectorized Parquet reader enabled, we may hit error due to type + // mismatch. + // + // Here we are simulating Hive's behavior by writing a single `INT` field and then read it + // back as `TINYINT` and `SMALLINT` in Spark to verify this issue. + Seq(1).toDF("f").write.parquet(path) + + val withByteField = new StructType().add("f", ByteType) + checkAnswer(spark.read.schema(withByteField).parquet(path), Row(1: Byte)) + + val withShortField = new StructType().add("f", ShortType) + checkAnswer(spark.read.schema(withShortField).parquet(path), Row(1: Short)) + } + } + } } object TestingUDT { + case class SingleElement(element: Long) + @SQLUserDefinedType(udt = classOf[NestedStructUDT]) case class NestedStruct(a: Integer, b: Long, c: Double) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala index 9fb34e03cb20..85efca3c4b24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala @@ -91,7 +91,7 @@ private[sql] trait ParquetTest extends SQLTestUtils { (f: => Unit): Unit = { withParquetDataFrame(data, testVectorized) { df => df.createOrReplaceTempView(tableName) - withTempTable(tableName)(f) + withTempView(tableName)(f) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 579a095ff000..bba40c6510cf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -133,7 +133,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { // test should use the deterministic number of partitions. val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) testDataForJoin.createOrReplaceTempView("testDataForJoin") - withTempTable("testDataForJoin") { + withTempView("testDataForJoin") { // Assume the execution plan is // ... -> SortMergeJoin(nodeId = 1) -> TungstenProject(nodeId = 0) val df = spark.sql( @@ -151,7 +151,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { // this test should use the deterministic number of partitions. val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) testDataForJoin.createOrReplaceTempView("testDataForJoin") - withTempTable("testDataForJoin") { + withTempView("testDataForJoin") { // Assume the execution plan is // ... -> SortMergeJoin(nodeId = 1) -> TungstenProject(nodeId = 0) val df = spark.sql( @@ -206,7 +206,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) testDataForJoin.createOrReplaceTempView("testDataForJoin") withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { - withTempTable("testDataForJoin") { + withTempView("testDataForJoin") { // Assume the execution plan is // ... -> BroadcastNestedLoopJoin(nodeId = 1) -> TungstenProject(nodeId = 0) val df = spark.sql( @@ -236,7 +236,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) testDataForJoin.createOrReplaceTempView("testDataForJoin") - withTempTable("testDataForJoin") { + withTempView("testDataForJoin") { // Assume the execution plan is // ... -> CartesianProduct(nodeId = 1) -> TungstenProject(nodeId = 0) val df = spark.sql( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index 07aeaeb695a4..8aa81854b22e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -90,11 +90,12 @@ class CatalogSuite .getOrElse { spark.catalog.listColumns(tableName) } assume(tableMetadata.schema.nonEmpty, "bad test") assume(tableMetadata.partitionColumnNames.nonEmpty, "bad test") - assume(tableMetadata.bucketColumnNames.nonEmpty, "bad test") + assume(tableMetadata.bucketSpec.isDefined, "bad test") assert(columns.collect().map(_.name).toSet == tableMetadata.schema.map(_.name).toSet) + val bucketColumnNames = tableMetadata.bucketSpec.map(_.bucketColumnNames).getOrElse(Nil).toSet columns.collect().foreach { col => assert(col.isPartition == tableMetadata.partitionColumnNames.contains(col.name)) - assert(col.isBucket == tableMetadata.bucketColumnNames.contains(col.name)) + assert(col.isBucket == bucketColumnNames.contains(col.name)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 228e4250f3c6..995b1200a229 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -752,7 +752,7 @@ class JDBCSuite extends SparkFunSuite assertEmptyQuery(s"SELECT * FROM foobar WHERE $FALSE1 AND ($FALSE2 OR $TRUE)") // Tests JDBCPartition whereClause clause push down. - withTempTable("tempFrame") { + withTempView("tempFrame") { val jdbcPartitionWhereClause = s"$FALSE1 OR $TRUE" val df = spark.read.jdbc( urlWithUserAndPass, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 2c6449fa6870..d99b3cf975f4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -40,6 +40,14 @@ class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter { properties.setProperty("password", "testPass") properties.setProperty("rowId", "false") + val testH2Dialect = new JdbcDialect { + override def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2") + override def getCatalystType( + sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = + Some(StringType) + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) + } + before { Utils.classForName("org.h2.Driver") conn = DriverManager.getConnection(url) @@ -145,14 +153,25 @@ class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter { assert(2 === spark.read.jdbc(url, "TEST.APPENDTEST", new Properties()).collect()(0).length) } - test("CREATE then INSERT to truncate") { + test("Truncate") { + JdbcDialects.registerDialect(testH2Dialect) val df = spark.createDataFrame(sparkContext.parallelize(arr2x2), schema2) val df2 = spark.createDataFrame(sparkContext.parallelize(arr1x2), schema2) + val df3 = spark.createDataFrame(sparkContext.parallelize(arr2x3), schema3) df.write.jdbc(url1, "TEST.TRUNCATETEST", properties) - df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.TRUNCATETEST", properties) + df2.write.mode(SaveMode.Overwrite).option("truncate", true) + .jdbc(url1, "TEST.TRUNCATETEST", properties) assert(1 === spark.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count()) assert(2 === spark.read.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length) + + val m = intercept[SparkException] { + df3.write.mode(SaveMode.Overwrite).option("truncate", true) + .jdbc(url1, "TEST.TRUNCATETEST", properties) + }.getMessage + assert(m.contains("Column \"seq\" not found")) + assert(0 === spark.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count()) + JdbcDialects.unregisterDialect(testH2Dialect) } test("Incompatible INSERT to append") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index f9a07dbdf0be..49153f77362b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -19,24 +19,27 @@ package org.apache.spark.sql.sources import java.io.File -import org.scalatest.BeforeAndAfter +import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkException +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.command.DDLUtils -import org.apache.spark.sql.execution.datasources.BucketSpec import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils -class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with BeforeAndAfter { +class CreateTableAsSelectSuite + extends DataSourceTest + with SharedSQLContext + with BeforeAndAfterEach { protected override lazy val sql = spark.sql _ private var path: File = null override def beforeAll(): Unit = { super.beforeAll() - path = Utils.createTempDir() val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) spark.read.json(rdd).createOrReplaceTempView("jt") } @@ -44,18 +47,21 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with override def afterAll(): Unit = { try { spark.catalog.dropTempView("jt") - if (path.exists()) { - Utils.deleteRecursively(path) - } + Utils.deleteRecursively(path) } finally { super.afterAll() } } - before { - if (path.exists()) { - Utils.deleteRecursively(path) - } + override def beforeEach(): Unit = { + super.beforeEach() + path = Utils.createTempDir() + path.delete() + } + + override def afterEach(): Unit = { + Utils.deleteRecursively(path) + super.afterEach() } test("CREATE TABLE USING AS SELECT") { @@ -199,7 +205,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with } } - test("create table using as select - with bucket") { + test("create table using as select - with non-zero buckets") { val catalog = spark.sessionState.catalog withTable("t") { sql( @@ -212,7 +218,23 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with ) val table = catalog.getTableMetadata(TableIdentifier("t")) assert(DDLUtils.getBucketSpecFromTableProperties(table) == - Some(BucketSpec(5, Seq("a"), Seq("b")))) + Option(BucketSpec(5, Seq("a"), Seq("b")))) + } + } + + test("create table using as select - with zero buckets") { + withTable("t") { + val e = intercept[AnalysisException] { + sql( + s""" + |CREATE TABLE t USING PARQUET + |OPTIONS (PATH '${path.toString}') + |CLUSTERED BY (a) SORTED BY (b) INTO 0 BUCKETS + |AS SELECT 1 AS a, 2 AS b + """.stripMargin + ) + }.getMessage + assert(e.contains("Expected positive number of buckets, but got `0`")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index 206d03ea98e6..cc77d3c4b91a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.sources import org.apache.spark.sql._ -import org.apache.spark.sql.internal.SQLConf private[sql] abstract class DataSourceTest extends QueryTest { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index f9496520f383..af2b58116b2a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -189,7 +189,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { } def apply(message: String)(condition: StreamExecution => Unit): AssertOnQuery = { - new AssertOnQuery(s => { condition; true }, message) + new AssertOnQuery(s => { condition(s); true }, message) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 853dd0ff3f60..5286ee5bc23d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -29,6 +29,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ @@ -149,7 +150,7 @@ private[sql] trait SQLTestUtils /** * Drops temporary table `tableName` after calling `f`. */ - protected def withTempTable(tableNames: String*)(f: => Unit): Unit = { + protected def withTempView(tableNames: String*)(f: => Unit): Unit = { try f finally { // If the test failed part way, we don't want to mask the failure by failing to remove // temp tables that never got created. @@ -196,7 +197,12 @@ private[sql] trait SQLTestUtils fail("Failed to create temporary database", cause) } - try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") + try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { + spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") + } } /** diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 9ee8d6a36c42..819897cd4685 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-hive-thriftserver_2.11 jar Spark Project Hive Thrift Server diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index d9d8b2e55960..2be99cb1046f 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -26,7 +26,6 @@ ../../pom.xml - org.apache.spark spark-hive_2.11 jar Spark Project Hive diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 2be51ed0e87e..db970785a716 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -119,7 +119,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log BucketSpec(n.toInt, getColumnNames("bucket"), getColumnNames("sort")) } - val options = table.storage.serdeProperties + val options = table.storage.properties val dataSource = DataSource( sparkSession, @@ -171,8 +171,6 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } else if (table.tableType == CatalogTableType.VIEW) { val viewText = table.viewText.getOrElse(sys.error("Invalid view without text.")) alias match { - // because hive use things like `_c0` to build the expanded text - // currently we cannot support view from "create view v1(c1) as ..." case None => SubqueryAlias(table.identifier.table, sparkSession.sessionState.sqlParser.parsePlan(viewText)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index b8a75850b1d5..c59ac3dcafea 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -238,7 +238,6 @@ private[sql] class HiveSessionCatalog( "hash", "histogram_numeric", "percentile", - "percentile_approx", - "str_to_map" + "percentile_approx" ) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 9ed357c587c3..bdec611453b2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -97,10 +97,11 @@ private[spark] object HiveUtils extends Logging { .createWithDefault(false) val CONVERT_METASTORE_ORC = SQLConfigBuilder("spark.sql.hive.convertMetastoreOrc") + .internal() .doc("When set to false, Spark SQL will use the Hive SerDe for ORC tables instead of " + "the built in support.") .booleanConf - .createWithDefault(true) + .createWithDefault(false) val HIVE_METASTORE_SHARED_PREFIXES = SQLConfigBuilder("spark.sql.hive.metastore.sharedPrefixes") .doc("A comma separated list of class prefixes that should be loaded using the classloader " + diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 3ab1bdabb99b..f3c849b9f282 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -103,7 +103,7 @@ private[hive] case class MetastoreRelation( sd.setSerdeInfo(serdeInfo) val serdeParameters = new java.util.HashMap[String, String]() - catalogTable.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } + catalogTable.storage.properties.foreach { case (k, v) => serdeParameters.put(k, v) } serdeInfo.setParameters(serdeParameters) new HiveTable(tTable) @@ -173,8 +173,8 @@ private[hive] case class MetastoreRelation( p.storage.serde.foreach(serdeInfo.setSerializationLib) val serdeParameters = new java.util.HashMap[String, String]() - catalogTable.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } - p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } + catalogTable.storage.properties.foreach { case (k, v) => serdeParameters.put(k, v) } + p.storage.properties.foreach { case (k, v) => serdeParameters.put(k, v) } serdeInfo.setParameters(serdeParameters) new Partition(hiveQlTable, tPartition) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 7e0cef3e355d..2392cc0bdd8d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -293,7 +293,7 @@ private[hive] class HiveClientImpl( database.name, database.description, database.locationUri, - database.properties.asJava), + Option(database.properties).map(_.asJava).orNull), ignoreIfExists) } @@ -311,7 +311,7 @@ private[hive] class HiveClientImpl( database.name, database.description, database.locationUri, - database.properties.asJava)) + Option(database.properties).map(_.asJava).orNull)) } override def getDatabaseOption(name: String): Option[CatalogDatabase] = withHiveState { @@ -320,7 +320,7 @@ private[hive] class HiveClientImpl( name = d.getName, description = d.getDescription, locationUri = d.getLocationUri, - properties = d.getParameters.asScala.toMap) + properties = Option(d.getParameters).map(_.asScala.toMap).orNull) } } @@ -353,7 +353,7 @@ private[hive] class HiveClientImpl( unsupportedFeatures += "bucketing" } - val properties = h.getParameters.asScala.toMap + val properties = Option(h.getParameters).map(_.asScala.toMap).orNull CatalogTable( identifier = TableIdentifier(h.getTableName, Option(h.getDbName)), @@ -365,9 +365,9 @@ private[hive] class HiveClientImpl( }, schema = schema, partitionColumnNames = partCols.map(_.name), - sortColumnNames = Seq(), // TODO: populate this - bucketColumnNames = h.getBucketCols.asScala, - numBuckets = h.getNumBuckets, + // We can not populate bucketing information for Hive tables as Spark SQL has a different + // implementation of hash function from Hive. + bucketSpec = None, owner = h.getOwner, createTime = h.getTTable.getCreateTime.toLong * 1000, lastAccessTime = h.getLastAccessTime.toLong * 1000, @@ -390,7 +390,8 @@ private[hive] class HiveClientImpl( outputFormat = Option(h.getOutputFormatClass).map(_.getName), serde = Option(h.getSerializationLib), compressed = h.getTTable.getSd.isCompressed, - serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.asScala.toMap + properties = Option(h.getTTable.getSd.getSerdeInfo.getParameters) + .map(_.asScala.toMap).orNull ), properties = properties, viewOriginalText = Option(h.getViewOriginalText), @@ -763,10 +764,7 @@ private[hive] class HiveClientImpl( hiveTable.setFields(schema.asJava) } hiveTable.setPartCols(partCols.asJava) - // TODO: set sort columns here too - hiveTable.setBucketCols(table.bucketColumnNames.asJava) hiveTable.setOwner(conf.getUser) - hiveTable.setNumBuckets(table.numBuckets) hiveTable.setCreateTime((table.createTime / 1000).toInt) hiveTable.setLastAccessTime((table.lastAccessTime / 1000).toInt) table.storage.locationUri.foreach { loc => shim.setDataLocation(hiveTable, loc) } @@ -774,7 +772,7 @@ private[hive] class HiveClientImpl( table.storage.outputFormat.map(toOutputFormat).foreach(hiveTable.setOutputFormatClass) hiveTable.setSerializationLib( table.storage.serde.getOrElse("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - table.storage.serdeProperties.foreach { case (k, v) => hiveTable.setSerdeParam(k, v) } + table.storage.properties.foreach { case (k, v) => hiveTable.setSerdeParam(k, v) } table.properties.foreach { case (k, v) => hiveTable.setProperty(k, v) } table.comment.foreach { c => hiveTable.setProperty("comment", c) } table.viewOriginalText.foreach { t => hiveTable.setViewOriginalText(t) } @@ -798,7 +796,7 @@ private[hive] class HiveClientImpl( p.storage.inputFormat.foreach(storageDesc.setInputFormat) p.storage.outputFormat.foreach(storageDesc.setOutputFormat) p.storage.serde.foreach(serdeInfo.setSerializationLib) - serdeInfo.setParameters(p.storage.serdeProperties.asJava) + serdeInfo.setParameters(p.storage.properties.asJava) storageDesc.setSerdeInfo(serdeInfo) tpart.setDbName(ht.getDbName) tpart.setTableName(ht.getTableName) @@ -817,6 +815,7 @@ private[hive] class HiveClientImpl( outputFormat = Option(apiPartition.getSd.getOutputFormat), serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib), compressed = apiPartition.getSd.isCompressed, - serdeProperties = apiPartition.getSd.getSerdeInfo.getParameters.asScala.toMap)) + properties = Option(apiPartition.getSd.getSerdeInfo.getParameters) + .map(_.asScala.toMap).orNull)) } } diff --git a/sql/hive/src/test/resources/sqlgen/agg1.sql b/sql/hive/src/test/resources/sqlgen/agg1.sql new file mode 100644 index 000000000000..9953640a6fa1 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/agg1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` HAVING (`gen_attr` > CAST(0 AS BIGINT))) AS gen_subquery_1) AS gen_subquery_2 diff --git a/sql/hive/src/test/resources/sqlgen/agg2.sql b/sql/hive/src/test/resources/sqlgen/agg2.sql new file mode 100644 index 000000000000..0caea28b0240 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/agg2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY MAX(key) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` ORDER BY `gen_attr` ASC) AS gen_subquery_1) AS gen_subquery_2 diff --git a/sql/hive/src/test/resources/sqlgen/agg3.sql b/sql/hive/src/test/resources/sqlgen/agg3.sql new file mode 100644 index 000000000000..437afa779077 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/agg3.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key, MAX(key) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, `gen_attr` AS `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1) AS gen_subquery_2 diff --git a/sql/hive/src/test/resources/sqlgen/aggregate_functions_and_window.sql b/sql/hive/src/test/resources/sqlgen/aggregate_functions_and_window.sql new file mode 100644 index 000000000000..c94f53b0a9f4 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/aggregate_functions_and_window.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT MAX(c) + COUNT(a) OVER () FROM parquet_t2 GROUP BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(max(c) + count(a) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))` FROM (SELECT (`gen_attr` + `gen_attr`) AS `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, count(`gen_attr`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `gen_attr` FROM (SELECT max(`gen_attr`) AS `gen_attr`, `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr`) AS gen_subquery_1) AS gen_subquery_2) AS gen_subquery_3 diff --git a/sql/hive/src/test/resources/sqlgen/case.sql b/sql/hive/src/test/resources/sqlgen/case.sql new file mode 100644 index 000000000000..f6a8c32cf012 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/case.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `CASE WHEN ((id % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 WHEN ((id % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT)) THEN 1 END` FROM (SELECT CASE WHEN ((`gen_attr` % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 WHEN ((`gen_attr` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT)) THEN 1 END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/case_with_else.sql b/sql/hive/src/test/resources/sqlgen/case_with_else.sql new file mode 100644 index 000000000000..8f1595d4b8ce --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/case_with_else.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT CASE WHEN id % 2 > 0 THEN 0 ELSE 1 END FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `CASE WHEN ((id % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 ELSE 1 END` FROM (SELECT CASE WHEN ((`gen_attr` % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 ELSE 1 END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/case_with_key.sql b/sql/hive/src/test/resources/sqlgen/case_with_key.sql new file mode 100644 index 000000000000..88353e321e1d --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/case_with_key.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' END FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `CASE WHEN (id = CAST(0 AS BIGINT)) THEN foo WHEN (id = CAST(1 AS BIGINT)) THEN bar END` FROM (SELECT CASE WHEN (`gen_attr` = CAST(0 AS BIGINT)) THEN "foo" WHEN (`gen_attr` = CAST(1 AS BIGINT)) THEN "bar" END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/case_with_key_and_else.sql b/sql/hive/src/test/resources/sqlgen/case_with_key_and_else.sql new file mode 100644 index 000000000000..bc44f25d1ff8 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/case_with_key_and_else.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' ELSE 'baz' END FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `CASE WHEN (id = CAST(0 AS BIGINT)) THEN foo WHEN (id = CAST(1 AS BIGINT)) THEN bar ELSE baz END` FROM (SELECT CASE WHEN (`gen_attr` = CAST(0 AS BIGINT)) THEN "foo" WHEN (`gen_attr` = CAST(1 AS BIGINT)) THEN "bar" ELSE "baz" END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/cluster_by.sql b/sql/hive/src/test/resources/sqlgen/cluster_by.sql new file mode 100644 index 000000000000..3bc1dfc58188 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/cluster_by.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_t0 CLUSTER BY id +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0 CLUSTER BY `gen_attr`) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/data_source_json_parquet_t0.sql b/sql/hive/src/test/resources/sqlgen/data_source_json_parquet_t0.sql new file mode 100644 index 000000000000..a4f46da9ffb7 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/data_source_json_parquet_t0.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM json_parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`json_parquet_t0`) AS gen_subquery_0) AS json_parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/data_source_orc_parquet_t0.sql b/sql/hive/src/test/resources/sqlgen/data_source_orc_parquet_t0.sql new file mode 100644 index 000000000000..214dbcf23625 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/data_source_orc_parquet_t0.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM orc_parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`orc_parquet_t0`) AS gen_subquery_0) AS orc_parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/data_source_parquet_parquet_t0.sql b/sql/hive/src/test/resources/sqlgen/data_source_parquet_parquet_t0.sql new file mode 100644 index 000000000000..eccf03eb122c --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/data_source_parquet_parquet_t0.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_parquet_t0`) AS gen_subquery_0) AS parquet_parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/distinct_aggregation.sql b/sql/hive/src/test/resources/sqlgen/distinct_aggregation.sql new file mode 100644 index 000000000000..0fa21ad3ba42 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/distinct_aggregation.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT COUNT(DISTINCT id) FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `count(DISTINCT id)` FROM (SELECT count(DISTINCT `gen_attr`) AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/distribute_by.sql b/sql/hive/src/test/resources/sqlgen/distribute_by.sql new file mode 100644 index 000000000000..75a1a2a6b60b --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/distribute_by.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_t0 DISTRIBUTE BY id +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0 DISTRIBUTE BY `gen_attr`) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/distribute_by_with_sort_by.sql b/sql/hive/src/test/resources/sqlgen/distribute_by_with_sort_by.sql new file mode 100644 index 000000000000..a8e766e81db9 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/distribute_by_with_sort_by.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_t0 DISTRIBUTE BY id SORT BY id +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0 CLUSTER BY `gen_attr`) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/except.sql b/sql/hive/src/test/resources/sqlgen/except.sql new file mode 100644 index 000000000000..b1c64288d6e6 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/except.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM t0 EXCEPT SELECT * FROM t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM ((SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0 ) EXCEPT ( SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_1)) AS t0 diff --git a/sql/hive/src/test/resources/sqlgen/filter_after_subquery.sql b/sql/hive/src/test/resources/sqlgen/filter_after_subquery.sql new file mode 100644 index 000000000000..0156dffd0a9f --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/filter_after_subquery.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a FROM (SELECT key + 1 AS a FROM parquet_t1) t WHERE a > 5 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a` FROM (SELECT `gen_attr` FROM (SELECT (`gen_attr` + CAST(1 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t WHERE (`gen_attr` > CAST(5 AS BIGINT))) AS t diff --git a/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql b/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql new file mode 100644 index 000000000000..15d23c67aec3 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql @@ -0,0 +1,8 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(arr) AS val, id +FROM parquet_t3 +WHERE id > 2 +ORDER BY val, id +LIMIT 5 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_0.`gen_attr`, gen_subquery_0.`gen_attr`, gen_subquery_0.`gen_attr`, gen_subquery_0.`gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 WHERE (`gen_attr` > CAST(2 AS BIGINT))) AS gen_subquery_1 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr` ORDER BY `gen_attr` ASC, `gen_attr` ASC LIMIT 5) AS parquet_t3 diff --git a/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql b/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql new file mode 100644 index 000000000000..e2b94ce6739a --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql @@ -0,0 +1,10 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT val, id +FROM parquet_t3 +LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array +LATERAL VIEW EXPLODE(nested_array) exp1 AS val +WHERE val > 2 +ORDER BY val, id +LIMIT 5 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr` LATERAL VIEW explode(`gen_attr`) gen_subquery_3 AS `gen_attr` WHERE (`gen_attr` > CAST(2 AS BIGINT)) ORDER BY `gen_attr` ASC, `gen_attr` ASC LIMIT 5) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_in_lateral_view_1.sql b/sql/hive/src/test/resources/sqlgen/generator_in_lateral_view_1.sql new file mode 100644 index 000000000000..c62b895c6740 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_in_lateral_view_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT val, id FROM parquet_t3 LATERAL VIEW EXPLODE(arr) exp AS val +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_in_lateral_view_2.sql b/sql/hive/src/test/resources/sqlgen/generator_in_lateral_view_2.sql new file mode 100644 index 000000000000..4c68eef214fb --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_in_lateral_view_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT val, id FROM parquet_t3 LATERAL VIEW OUTER EXPLODE(arr) exp AS val +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW OUTER explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_non_referenced_table_1.sql b/sql/hive/src/test/resources/sqlgen/generator_non_referenced_table_1.sql new file mode 100644 index 000000000000..059c45ec47c0 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_non_referenced_table_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(ARRAY(1,2,3)) FROM t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `col` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0 LATERAL VIEW explode(array(1, 2, 3)) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_non_referenced_table_2.sql b/sql/hive/src/test/resources/sqlgen/generator_non_referenced_table_2.sql new file mode 100644 index 000000000000..8f03dedbf971 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_non_referenced_table_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(ARRAY(1,2,3)) AS val FROM t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0 LATERAL VIEW explode(array(1, 2, 3)) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_non_udtf_1.sql b/sql/hive/src/test/resources/sqlgen/generator_non_udtf_1.sql new file mode 100644 index 000000000000..0bfe38036ebf --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_non_udtf_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(arr), id FROM parquet_t3 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `col`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_1 AS `gen_attr`) AS parquet_t3 diff --git a/sql/hive/src/test/resources/sqlgen/generator_non_udtf_2.sql b/sql/hive/src/test/resources/sqlgen/generator_non_udtf_2.sql new file mode 100644 index 000000000000..e4edb6876cac --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_non_udtf_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(arr) AS val, id as a FROM parquet_t3 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `a` FROM (SELECT `gen_attr`, `gen_attr` AS `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_referenced_table_1.sql b/sql/hive/src/test/resources/sqlgen/generator_referenced_table_1.sql new file mode 100644 index 000000000000..5cf57f82d2df --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_referenced_table_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(arr) FROM parquet_t3 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `col` FROM (SELECT `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_referenced_table_2.sql b/sql/hive/src/test/resources/sqlgen/generator_referenced_table_2.sql new file mode 100644 index 000000000000..ed42894807d2 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_referenced_table_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(arr) AS val FROM parquet_t3 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val` FROM (SELECT `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_with_ambiguous_names_1.sql b/sql/hive/src/test/resources/sqlgen/generator_with_ambiguous_names_1.sql new file mode 100644 index 000000000000..5a4aec961a5e --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_with_ambiguous_names_1.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT exp.id, parquet_t3.id +FROM parquet_t3 +LATERAL VIEW EXPLODE(arr) exp AS id +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_with_ambiguous_names_2.sql b/sql/hive/src/test/resources/sqlgen/generator_with_ambiguous_names_2.sql new file mode 100644 index 000000000000..3c3e858deda7 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_with_ambiguous_names_2.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT exp.id, parquet_t3.id +FROM parquet_t3 +LATERAL VIEW OUTER EXPLODE(arr) exp AS id +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW OUTER explode(`gen_attr`) gen_subquery_2 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/generator_without_from_1.sql b/sql/hive/src/test/resources/sqlgen/generator_without_from_1.sql new file mode 100644 index 000000000000..d56e89c8bb45 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_without_from_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(ARRAY(1,2,3)) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `col` FROM (SELECT `gen_attr` FROM (SELECT 1) gen_subquery_1 LATERAL VIEW explode(array(1, 2, 3)) gen_subquery_2 AS `gen_attr`) AS gen_subquery_0 diff --git a/sql/hive/src/test/resources/sqlgen/generator_without_from_2.sql b/sql/hive/src/test/resources/sqlgen/generator_without_from_2.sql new file mode 100644 index 000000000000..d7acb6924076 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/generator_without_from_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT EXPLODE(ARRAY(1,2,3)) AS val +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val` FROM (SELECT `gen_attr` FROM (SELECT 1) gen_subquery_1 LATERAL VIEW explode(array(1, 2, 3)) gen_subquery_2 AS `gen_attr`) AS gen_subquery_0 diff --git a/sql/hive/src/test/resources/sqlgen/grouping_sets_1.sql b/sql/hive/src/test/resources/sqlgen/grouping_sets_1.sql new file mode 100644 index 000000000000..72e429fe05b3 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/grouping_sets_1.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id() AS k3 +FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 +GROUPING SETS (key % 5, key - 5) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `k3` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` % CAST(2 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT))), ((`gen_attr` - CAST(5 AS BIGINT))))) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/grouping_sets_2_1.sql b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_1.sql new file mode 100644 index 000000000000..25e86e73cbb7 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a, b) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`), (`gen_attr`)) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/grouping_sets_2_2.sql b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_2.sql new file mode 100644 index 000000000000..24bf02043488 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`)) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/grouping_sets_2_3.sql b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_3.sql new file mode 100644 index 000000000000..f72bc8acd69a --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_3.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (b) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`)) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/grouping_sets_2_4.sql b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_4.sql new file mode 100644 index 000000000000..2fac7f661809 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_4.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (()) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS(()) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/grouping_sets_2_5.sql b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_5.sql new file mode 100644 index 000000000000..2e60c9c4d2c6 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/grouping_sets_2_5.sql @@ -0,0 +1,5 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b +GROUPING SETS ((), (a), (a, b)) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((), (`gen_attr`), (`gen_attr`, `gen_attr`)) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/in.sql b/sql/hive/src/test/resources/sqlgen/in.sql new file mode 100644 index 000000000000..1e0251e586e2 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/in.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_t0 WHERE id IN (1, 2, 3) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0 WHERE (CAST(`gen_attr` AS BIGINT) IN (CAST(1 AS BIGINT), CAST(2 AS BIGINT), CAST(3 AS BIGINT)))) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/intersect.sql b/sql/hive/src/test/resources/sqlgen/intersect.sql new file mode 100644 index 000000000000..608fa59f8afb --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/intersect.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM t0 INTERSECT SELECT * FROM t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM ((SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0 ) INTERSECT ( SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_1)) AS t0 diff --git a/sql/hive/src/test/resources/sqlgen/join_2_tables.sql b/sql/hive/src/test/resources/sqlgen/join_2_tables.sql new file mode 100644 index 000000000000..bb99797b8bcc --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/join_2_tables.sql @@ -0,0 +1,7 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT COUNT(a.value), b.KEY, a.KEY +FROM parquet_t1 a, parquet_t1 b +GROUP BY a.KEY, b.KEY +HAVING MAX(a.KEY) > 0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `count(value)`, `gen_attr` AS `KEY`, `gen_attr` AS `KEY` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, `gen_attr`, `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_1 GROUP BY `gen_attr`, `gen_attr` HAVING (`gen_attr` > CAST(0 AS BIGINT))) AS gen_subquery_2) AS gen_subquery_3 diff --git a/sql/hive/src/test/resources/sqlgen/json_tuple_generator_1.sql b/sql/hive/src/test/resources/sqlgen/json_tuple_generator_1.sql new file mode 100644 index 000000000000..9e5fe1156e5d --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/json_tuple_generator_1.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT c0, c1, c2 +FROM parquet_t3 +LATERAL VIEW JSON_TUPLE(json, 'f1', 'f2', 'f3') jt +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `c0`, `gen_attr` AS `c1`, `gen_attr` AS `c2` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW json_tuple(`gen_attr`, "f1", "f2", "f3") gen_subquery_1 AS `gen_attr`, `gen_attr`, `gen_attr`) AS jt diff --git a/sql/hive/src/test/resources/sqlgen/json_tuple_generator_2.sql b/sql/hive/src/test/resources/sqlgen/json_tuple_generator_2.sql new file mode 100644 index 000000000000..55dd491a32a3 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/json_tuple_generator_2.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, c +FROM parquet_t3 +LATERAL VIEW JSON_TUPLE(json, 'f1', 'f2', 'f3') jt AS a, b, c +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `c` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW json_tuple(`gen_attr`, "f1", "f2", "f3") gen_subquery_1 AS `gen_attr`, `gen_attr`, `gen_attr`) AS jt diff --git a/sql/hive/src/test/resources/sqlgen/multi_distinct.sql b/sql/hive/src/test/resources/sqlgen/multi_distinct.sql new file mode 100644 index 000000000000..b5c7cf0b0ce1 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/multi_distinct.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, COUNT(DISTINCT b), COUNT(DISTINCT c), SUM(d) FROM parquet_t2 GROUP BY a +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `count(DISTINCT b)`, `gen_attr` AS `count(DISTINCT c)`, `gen_attr` AS `sum(d)` FROM (SELECT `gen_attr`, count(DISTINCT `gen_attr`) AS `gen_attr`, count(DISTINCT `gen_attr`) AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`) AS parquet_t2 diff --git a/sql/hive/src/test/resources/sqlgen/nested_generator_in_lateral_view_1.sql b/sql/hive/src/test/resources/sqlgen/nested_generator_in_lateral_view_1.sql new file mode 100644 index 000000000000..3d79f9565458 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/nested_generator_in_lateral_view_1.sql @@ -0,0 +1,7 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT val, id +FROM parquet_t3 +LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array +LATERAL VIEW EXPLODE(nested_array) exp1 AS val +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr` LATERAL VIEW explode(`gen_attr`) gen_subquery_3 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/nested_generator_in_lateral_view_2.sql b/sql/hive/src/test/resources/sqlgen/nested_generator_in_lateral_view_2.sql new file mode 100644 index 000000000000..18b29a7bc5ee --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/nested_generator_in_lateral_view_2.sql @@ -0,0 +1,7 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT val, id +FROM parquet_t3 +LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array +LATERAL VIEW OUTER EXPLODE(nested_array) exp1 AS val +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `val`, `gen_attr` AS `id` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `arr` AS `gen_attr`, `arr2` AS `gen_attr`, `json` AS `gen_attr`, `id` AS `gen_attr` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr`) gen_subquery_2 AS `gen_attr` LATERAL VIEW OUTER explode(`gen_attr`) gen_subquery_3 AS `gen_attr`) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/not_in.sql b/sql/hive/src/test/resources/sqlgen/not_in.sql new file mode 100644 index 000000000000..a50ee282e534 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/not_in.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM t0 WHERE id NOT IN (1, 2, 3) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0 WHERE (NOT (CAST(`gen_attr` AS BIGINT) IN (CAST(1 AS BIGINT), CAST(2 AS BIGINT), CAST(3 AS BIGINT))))) AS t0 diff --git a/sql/hive/src/test/resources/sqlgen/not_like.sql b/sql/hive/src/test/resources/sqlgen/not_like.sql new file mode 100644 index 000000000000..2f66695189cb --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/not_like.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM t0 WHERE id + 5 NOT LIKE '1%' +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0 WHERE (NOT CAST((`gen_attr` + CAST(5 AS BIGINT)) AS STRING) LIKE "1%")) AS t0 diff --git a/sql/hive/src/test/resources/sqlgen/regular_expressions_and_window.sql b/sql/hive/src/test/resources/sqlgen/regular_expressions_and_window.sql new file mode 100644 index 000000000000..a7909485fbc5 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/regular_expressions_and_window.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT MAX(key) OVER (PARTITION BY key % 3) + key FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(max(key) OVER (PARTITION BY (key % CAST(3 AS BIGINT)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + key)` FROM (SELECT (`gen_attr` + `gen_attr`) AS `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, max(`gen_attr`) OVER (PARTITION BY `gen_attr` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` % CAST(3 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1) AS gen_subquery_2) AS gen_subquery_3 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_1_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_1_1.sql new file mode 100644 index 000000000000..4b2c78ea40c7 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_1_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `(key % CAST(5 AS BIGINT))`, `gen_attr` AS `grouping_id()` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY (`gen_attr` % CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_1_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_1_2.sql new file mode 100644 index 000000000000..62cf1bb9d8e0 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_1_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `(key % CAST(5 AS BIGINT))`, `gen_attr` AS `grouping_id()` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY (`gen_attr` % CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_2_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_2_1.sql new file mode 100644 index 000000000000..d58910af6939 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_2_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, count(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_2_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_2_2.sql new file mode 100644 index 000000000000..9a58c15203d1 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_2_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, count(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_3_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_3_1.sql new file mode 100644 index 000000000000..cd4d428b07ed --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_3_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `count(value)`, `gen_attr` AS `grouping_id()` FROM (SELECT `gen_attr` AS `gen_attr`, count(`gen_attr`) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_3_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_3_2.sql new file mode 100644 index 000000000000..4f42d7eda890 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_3_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `count(value)`, `gen_attr` AS `grouping_id()` FROM (SELECT `gen_attr` AS `gen_attr`, count(`gen_attr`) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_4_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_1.sql new file mode 100644 index 000000000000..be4908ea16e5 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_1.sql @@ -0,0 +1,5 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 +GROUP BY key % 5, key - 5 WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `grouping_id()` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT))), ((`gen_attr` % CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql new file mode 100644 index 000000000000..eebef6a5f3b8 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_4_2.sql @@ -0,0 +1,5 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 +GROUP BY key % 5, key - 5 WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `grouping_id()` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT))), ((`gen_attr` % CAST(5 AS BIGINT))), ((`gen_attr` - CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql new file mode 100644 index 000000000000..9474233cb752 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_1.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3 +FROM (SELECT key, key%2, key - 5 FROM parquet_t1) t GROUP BY key%5, key-5 +WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `k3` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` % CAST(2 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT))), ((`gen_attr` % CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql new file mode 100644 index 000000000000..d36f43d665cd --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_5_2.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3 +FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 +WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `cnt`, `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `k3` FROM (SELECT count(1) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` % CAST(2 AS BIGINT)) AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY (`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS(((`gen_attr` % CAST(5 AS BIGINT)), (`gen_attr` - CAST(5 AS BIGINT))), ((`gen_attr` % CAST(5 AS BIGINT))), ((`gen_attr` - CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_6_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_1.sql new file mode 100644 index 000000000000..de980784c721 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), ()) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_6_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_2.sql new file mode 100644 index 000000000000..8956ac4c42f8 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(c) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(c)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ()) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_6_3.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_3.sql new file mode 100644 index 000000000000..b9cef62dcfdf --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_3.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(a) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(a)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), ()) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_6_4.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_4.sql new file mode 100644 index 000000000000..ba6457ab1a54 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_4.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, sum(a) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `sum(a)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ()) ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_6_5.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_5.sql new file mode 100644 index 000000000000..763fecd9517f --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_5.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(a + b)`, `gen_attr` AS `b`, `gen_attr` AS `sum((a - b))` FROM (SELECT (`gen_attr` + `gen_attr`) AS `gen_attr`, `gen_attr` AS `gen_attr`, sum((`gen_attr` - `gen_attr`)) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY (`gen_attr` + `gen_attr`), `gen_attr` GROUPING SETS(((`gen_attr` + `gen_attr`), `gen_attr`), ((`gen_attr` + `gen_attr`)), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_6_6.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_6.sql new file mode 100644 index 000000000000..00ecbf6078e0 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_6_6.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(a + b)`, `gen_attr` AS `b`, `gen_attr` AS `sum((a - b))` FROM (SELECT (`gen_attr` + `gen_attr`) AS `gen_attr`, `gen_attr` AS `gen_attr`, sum((`gen_attr` - `gen_attr`)) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY (`gen_attr` + `gen_attr`), `gen_attr` GROUPING SETS(((`gen_attr` + `gen_attr`), `gen_attr`), ((`gen_attr` + `gen_attr`)), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_7_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_7_1.sql new file mode 100644 index 000000000000..6f57130f3ea2 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_7_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, grouping_id(a, b) FROM parquet_t2 GROUP BY cube(a, b) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `grouping_id(a, b)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, grouping_id() AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_7_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_7_2.sql new file mode 100644 index 000000000000..dab7852a3e96 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_7_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, grouping(b) FROM parquet_t2 GROUP BY cube(a, b) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `grouping(b)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, grouping(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_7_3.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_7_3.sql new file mode 100644 index 000000000000..50964e5b01b6 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_7_3.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT a, b, grouping(a) FROM parquet_t2 GROUP BY cube(a, b) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `a`, `gen_attr` AS `b`, `gen_attr` AS `grouping(a)` FROM (SELECT `gen_attr` AS `gen_attr`, `gen_attr` AS `gen_attr`, grouping(`gen_attr`) AS `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` GROUPING SETS((`gen_attr`, `gen_attr`), (`gen_attr`), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_8_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_8_1.sql new file mode 100644 index 000000000000..84af580a3625 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_8_1.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid +FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5 +WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `hgid` FROM (SELECT `gen_attr` AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, hash(grouping_id()) AS `gen_attr` FROM (SELECT hash(`gen_attr`) AS `gen_attr`, `gen_attr` AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS((`gen_attr`, (`gen_attr` - CAST(5 AS BIGINT))), (`gen_attr`), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_8_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_8_2.sql new file mode 100644 index 000000000000..b0f65b7d41bf --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_8_2.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid +FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5 +WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `k1`, `gen_attr` AS `k2`, `gen_attr` AS `hgid` FROM (SELECT `gen_attr` AS `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, hash(grouping_id()) AS `gen_attr` FROM (SELECT hash(`gen_attr`) AS `gen_attr`, `gen_attr` AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t GROUP BY `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS((`gen_attr`, (`gen_attr` - CAST(5 AS BIGINT))), (`gen_attr`), ((`gen_attr` - CAST(5 AS BIGINT))), ())) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_9_1.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_9_1.sql new file mode 100644 index 000000000000..171400dd78cd --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_9_1.sql @@ -0,0 +1,8 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT t.key - 5, cnt, SUM(cnt) +FROM (SELECT x.key, COUNT(*) as cnt +FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t +GROUP BY cnt, t.key - 5 +WITH ROLLUP +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(key - CAST(5 AS BIGINT))`, `gen_attr` AS `cnt`, `gen_attr` AS `sum(cnt)` FROM (SELECT (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `gen_attr`, count(1) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_1 ON (`gen_attr` = `gen_attr`) GROUP BY `gen_attr`) AS t GROUP BY `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS((`gen_attr`, (`gen_attr` - CAST(5 AS BIGINT))), (`gen_attr`), ())) AS gen_subquery_2 diff --git a/sql/hive/src/test/resources/sqlgen/rollup_cube_9_2.sql b/sql/hive/src/test/resources/sqlgen/rollup_cube_9_2.sql new file mode 100644 index 000000000000..fe98b6c343cc --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/rollup_cube_9_2.sql @@ -0,0 +1,8 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT t.key - 5, cnt, SUM(cnt) +FROM (SELECT x.key, COUNT(*) as cnt +FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t +GROUP BY cnt, t.key - 5 +WITH CUBE +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(key - CAST(5 AS BIGINT))`, `gen_attr` AS `cnt`, `gen_attr` AS `sum(cnt)` FROM (SELECT (`gen_attr` - CAST(5 AS BIGINT)) AS `gen_attr`, `gen_attr` AS `gen_attr`, sum(`gen_attr`) AS `gen_attr` FROM (SELECT `gen_attr`, count(1) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_1 ON (`gen_attr` = `gen_attr`) GROUP BY `gen_attr`) AS t GROUP BY `gen_attr`, (`gen_attr` - CAST(5 AS BIGINT)) GROUPING SETS((`gen_attr`, (`gen_attr` - CAST(5 AS BIGINT))), (`gen_attr`), ((`gen_attr` - CAST(5 AS BIGINT))), ())) AS gen_subquery_2 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_1.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_1.sql new file mode 100644 index 000000000000..75e43d0dc1cf --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (a, b, c, d) USING 'cat' FROM parquet_t2 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value` FROM (SELECT TRANSFORM (`gen_attr`, `gen_attr`, `gen_attr`, `gen_attr`) USING 'cat' AS (`gen_attr` string, `gen_attr` string) FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_2.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_2.sql new file mode 100644 index 000000000000..6a68edc72ebc --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (*) USING 'cat' FROM parquet_t2 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value` FROM (SELECT TRANSFORM (`gen_attr`, `gen_attr`, `gen_attr`, `gen_attr`) USING 'cat' AS (`gen_attr` string, `gen_attr` string) FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_alias_list.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_alias_list.sql new file mode 100644 index 000000000000..111df4bdf229 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_alias_list.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (a, b, c, d) USING 'cat' AS (d1, d2, d3, d4) FROM parquet_t2 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `d1`, `gen_attr` AS `d2`, `gen_attr` AS `d3`, `gen_attr` AS `d4` FROM (SELECT TRANSFORM (`gen_attr`, `gen_attr`, `gen_attr`, `gen_attr`) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES('field.delim' = ' ') USING 'cat' AS (`gen_attr` string, `gen_attr` string, `gen_attr` string, `gen_attr` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES('field.delim' = ' ') FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_alias_list_with_type.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_alias_list_with_type.sql new file mode 100644 index 000000000000..4ab412cfd171 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_alias_list_with_type.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +FROM +(FROM parquet_t1 SELECT TRANSFORM(key, value) USING 'cat' AS (thing1 int, thing2 string)) t +SELECT thing1 + 1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `(thing1 + 1)` FROM (SELECT (`gen_attr` + 1) AS `gen_attr` FROM (SELECT TRANSFORM (`gen_attr`, `gen_attr`) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES('field.delim' = ' ') USING 'cat' AS (`gen_attr` int, `gen_attr` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES('field.delim' = ' ') FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS t) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_multiple.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_multiple.sql new file mode 100644 index 000000000000..1e30e45d45d3 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_multiple.sql @@ -0,0 +1,8 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (key) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t' +USING 'cat' AS (tKey) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t' +FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `tKey` FROM (SELECT TRANSFORM (`gen_attr`) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t' USING 'cat' AS (`gen_attr` string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t' FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_one.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_one.sql new file mode 100644 index 000000000000..dd622898780a --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_one.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (key) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +USING 'cat' AS (tKey) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `tKey` FROM (SELECT TRANSFORM (`gen_attr`) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' USING 'cat' AS (`gen_attr` string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_serde.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_serde.sql new file mode 100644 index 000000000000..2ad369815742 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_serde.sql @@ -0,0 +1,10 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (key, value) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES('field.delim' = '|') +USING 'cat' AS (tKey, tValue) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES('field.delim' = '|') +FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `tKey`, `gen_attr` AS `tValue` FROM (SELECT TRANSFORM (`gen_attr`, `gen_attr`) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES('field.delim' = '|') USING 'cat' AS (`gen_attr` string, `gen_attr` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES('field.delim' = '|') FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_without_serde.sql b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_without_serde.sql new file mode 100644 index 000000000000..a90b42d9b4ae --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/script_transformation_row_format_without_serde.sql @@ -0,0 +1,8 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT TRANSFORM (key, value) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +USING 'cat' AS (tKey, tValue) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `tKey`, `gen_attr` AS `tValue` FROM (SELECT TRANSFORM (`gen_attr`, `gen_attr`) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' USING 'cat' AS (`gen_attr` string, `gen_attr` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/select_distinct.sql b/sql/hive/src/test/resources/sqlgen/select_distinct.sql new file mode 100644 index 000000000000..3bc8e5537928 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/select_distinct.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT DISTINCT id FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT DISTINCT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/select_orc_table.sql b/sql/hive/src/test/resources/sqlgen/select_orc_table.sql new file mode 100644 index 000000000000..eae67f954038 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/select_orc_table.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +select * from orc_t +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `c1`, `gen_attr` AS `c2` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `c1` AS `gen_attr`, `c2` AS `gen_attr` FROM `default`.`orc_t`) AS gen_subquery_0) AS orc_t diff --git a/sql/hive/src/test/resources/sqlgen/select_parquet_table.sql b/sql/hive/src/test/resources/sqlgen/select_parquet_table.sql new file mode 100644 index 000000000000..a085bab84c9d --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/select_parquet_table.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +select * from parquet_t +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `c1`, `gen_attr` AS `c2` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `c1` AS `gen_attr`, `c2` AS `gen_attr` FROM `default`.`parquet_t`) AS gen_subquery_0) AS parquet_t diff --git a/sql/hive/src/test/resources/sqlgen/self_join.sql b/sql/hive/src/test/resources/sqlgen/self_join.sql new file mode 100644 index 000000000000..8947ccda820a --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/self_join.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT x.key FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key` FROM (SELECT `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_1 ON (`gen_attr` = `gen_attr`)) AS x diff --git a/sql/hive/src/test/resources/sqlgen/self_join_with_group_by.sql b/sql/hive/src/test/resources/sqlgen/self_join_with_group_by.sql new file mode 100644 index 000000000000..6db053f28602 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/self_join_with_group_by.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT x.key, COUNT(*) FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key group by x.key +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `count(1)` FROM (SELECT `gen_attr`, count(1) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_1 ON (`gen_attr` = `gen_attr`) GROUP BY `gen_attr`) AS x diff --git a/sql/hive/src/test/resources/sqlgen/sort_by_after_having.sql b/sql/hive/src/test/resources/sqlgen/sort_by_after_having.sql new file mode 100644 index 000000000000..e9a6afdc7eda --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/sort_by_after_having.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0 SORT BY key +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, max(`gen_attr`) AS `gen_attr`, `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` HAVING (`gen_attr` > CAST(0 AS BIGINT))) AS gen_subquery_1 SORT BY `gen_attr` ASC) AS gen_subquery_2) AS gen_subquery_3 diff --git a/sql/hive/src/test/resources/sqlgen/tablesample_1.sql b/sql/hive/src/test/resources/sqlgen/tablesample_1.sql new file mode 100644 index 000000000000..54c8deeb6567 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/tablesample_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT s.id FROM parquet_t0 TABLESAMPLE(100 PERCENT) s +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0` TABLESAMPLE(100.0 PERCENT)) AS gen_subquery_0) AS s diff --git a/sql/hive/src/test/resources/sqlgen/tablesample_2.sql b/sql/hive/src/test/resources/sqlgen/tablesample_2.sql new file mode 100644 index 000000000000..13dcadb41a87 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/tablesample_2.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM parquet_t0 TABLESAMPLE(100 PERCENT) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0` TABLESAMPLE(100.0 PERCENT)) AS gen_subquery_0) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/tablesample_3.sql b/sql/hive/src/test/resources/sqlgen/tablesample_3.sql new file mode 100644 index 000000000000..c1ed57f5e3f2 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/tablesample_3.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT s.id FROM t0 TABLESAMPLE(100 PERCENT) s +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0` TABLESAMPLE(100.0 PERCENT)) AS gen_subquery_0) AS s diff --git a/sql/hive/src/test/resources/sqlgen/tablesample_4.sql b/sql/hive/src/test/resources/sqlgen/tablesample_4.sql new file mode 100644 index 000000000000..c22b20802c41 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/tablesample_4.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM t0 TABLESAMPLE(100 PERCENT) +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0` TABLESAMPLE(100.0 PERCENT)) AS gen_subquery_0) AS t0 diff --git a/sql/hive/src/test/resources/sqlgen/tablesample_5.sql b/sql/hive/src/test/resources/sqlgen/tablesample_5.sql new file mode 100644 index 000000000000..fa69dd46c746 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/tablesample_5.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT s.id FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) s WHERE 1=0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0` TABLESAMPLE(0.1 PERCENT)) AS gen_subquery_0 WHERE (1 = 0)) AS s diff --git a/sql/hive/src/test/resources/sqlgen/tablesample_6.sql b/sql/hive/src/test/resources/sqlgen/tablesample_6.sql new file mode 100644 index 000000000000..bc72560c4290 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/tablesample_6.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) WHERE 1=0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0` TABLESAMPLE(0.1 PERCENT)) AS gen_subquery_0 WHERE (1 = 0)) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/three_child_union.sql b/sql/hive/src/test/resources/sqlgen/three_child_union.sql new file mode 100644 index 000000000000..c3781117d814 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/three_child_union.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_t0 +UNION ALL SELECT id FROM parquet_t0 +UNION ALL SELECT id FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM ((SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) UNION ALL (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_1) UNION ALL (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_2)) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/type_widening.sql b/sql/hive/src/test/resources/sqlgen/type_widening.sql new file mode 100644 index 000000000000..96e6cc2fec90 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/type_widening.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT id FROM parquet_t0 UNION ALL SELECT CAST(id AS INT) AS id FROM parquet_t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM ((SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) UNION ALL (SELECT CAST(CAST(`gen_attr` AS INT) AS BIGINT) AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_1)) AS parquet_t0 diff --git a/sql/hive/src/test/resources/sqlgen/union_distinct.sql b/sql/hive/src/test/resources/sqlgen/union_distinct.sql new file mode 100644 index 000000000000..c5895ae766d2 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/union_distinct.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM t0 UNION SELECT * FROM t0 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `id` FROM ((SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_0) UNION DISTINCT (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`t0`) AS gen_subquery_1)) AS t0 diff --git a/sql/hive/src/test/resources/sqlgen/window_basic_1.sql b/sql/hive/src/test/resources/sqlgen/window_basic_1.sql new file mode 100644 index 000000000000..f86ca5e086b5 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_basic_1.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT MAX(value) OVER (PARTITION BY key % 3) FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `max(value) OVER (PARTITION BY (key % CAST(3 AS BIGINT)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)` FROM (SELECT `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, max(`gen_attr`) OVER (PARTITION BY `gen_attr` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` % CAST(3 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1) AS gen_subquery_2) AS gen_subquery_3 diff --git a/sql/hive/src/test/resources/sqlgen/window_basic_2.sql b/sql/hive/src/test/resources/sqlgen/window_basic_2.sql new file mode 100644 index 000000000000..15559ac8e15d --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_basic_2.sql @@ -0,0 +1,5 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, ROUND(AVG(key) OVER (), 2) +FROM parquet_t1 ORDER BY key +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `round(avg(key) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), 2)` FROM (SELECT `gen_attr`, `gen_attr`, round(`gen_attr`, 2) AS `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, avg(`gen_attr`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `gen_attr` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1) AS gen_subquery_2 ORDER BY `gen_attr` ASC) AS parquet_t1 diff --git a/sql/hive/src/test/resources/sqlgen/window_basic_3.sql b/sql/hive/src/test/resources/sqlgen/window_basic_3.sql new file mode 100644 index 000000000000..43f992882ce2 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_basic_3.sql @@ -0,0 +1,5 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT value, MAX(key + 1) OVER (PARTITION BY key % 5 ORDER BY key % 7) AS max +FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `value`, `gen_attr` AS `max` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, max(`gen_attr`) OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` ASC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr` FROM (SELECT `gen_attr`, (`gen_attr` + CAST(1 AS BIGINT)) AS `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr`, (`gen_attr` % CAST(7 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1) AS gen_subquery_2) AS parquet_t1 diff --git a/sql/hive/src/test/resources/sqlgen/window_with_join.sql b/sql/hive/src/test/resources/sqlgen/window_with_join.sql new file mode 100644 index 000000000000..00c45c8feafa --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_with_join.sql @@ -0,0 +1,5 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT x.key, MAX(y.key) OVER (PARTITION BY x.key % 5 ORDER BY x.key) +FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `max(key) OVER (PARTITION BY (key % CAST(5 AS BIGINT)) ORDER BY key ASC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_2.`gen_attr`, gen_subquery_2.`gen_attr`, gen_subquery_2.`gen_attr`, max(`gen_attr`) OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` ASC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr` FROM (SELECT `gen_attr`, `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_1 ON (`gen_attr` = `gen_attr`)) AS gen_subquery_2) AS gen_subquery_3) AS x diff --git a/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg.sql b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg.sql new file mode 100644 index 000000000000..65d3d3aeb623 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg.sql @@ -0,0 +1,7 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, +DENSE_RANK() OVER (DISTRIBUTE BY key SORT BY key, value) AS dr, +COUNT(key) +FROM parquet_t1 GROUP BY key, value +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `dr`, `gen_attr` AS `count(key)` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, DENSE_RANK() OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` ASC, `gen_attr` ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr` FROM (SELECT `gen_attr`, `gen_attr`, count(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr`) AS gen_subquery_1) AS gen_subquery_2) AS parquet_t1 diff --git a/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_filter.sql b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_filter.sql new file mode 100644 index 000000000000..03e0962f4494 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_filter.sql @@ -0,0 +1,7 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, +DENSE_RANK() OVER (DISTRIBUTE BY key SORT BY key, value) AS dr, +COUNT(key) OVER(DISTRIBUTE BY key SORT BY key, value) AS ca +FROM parquet_t1 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `dr`, `gen_attr` AS `ca` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, DENSE_RANK() OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` ASC, `gen_attr` ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr`, count(`gen_attr`) OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` ASC, `gen_attr` ASC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr` FROM (SELECT `gen_attr`, `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0) AS gen_subquery_1) AS gen_subquery_2) AS parquet_t1 diff --git a/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_functions.sql b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_functions.sql new file mode 100644 index 000000000000..090207f9b835 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_functions.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, +MAX(value) OVER (PARTITION BY key % 5 ORDER BY key) AS max +FROM parquet_t1 GROUP BY key, value +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `max` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, max(`gen_attr`) OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` ASC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr` FROM (SELECT `gen_attr`, `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr`) AS gen_subquery_1) AS gen_subquery_2) AS parquet_t1 diff --git a/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_having.sql b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_having.sql new file mode 100644 index 000000000000..fcc2cf7de80a --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/window_with_the_same_window_with_agg_having.sql @@ -0,0 +1,6 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT key, value, +MAX(value) OVER (PARTITION BY key % 5 ORDER BY key DESC) AS max +FROM parquet_t1 GROUP BY key, value HAVING key > 5 +-------------------------------------------------------------------------------- +SELECT `gen_attr` AS `key`, `gen_attr` AS `value`, `gen_attr` AS `max` FROM (SELECT `gen_attr`, `gen_attr`, `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, max(`gen_attr`) OVER (PARTITION BY `gen_attr` ORDER BY `gen_attr` DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `gen_attr` FROM (SELECT `gen_attr`, `gen_attr`, (`gen_attr` % CAST(5 AS BIGINT)) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr` HAVING (`gen_attr` > CAST(5 AS BIGINT))) AS gen_subquery_1) AS gen_subquery_2) AS parquet_t1 diff --git a/sql/hive/src/test/resources/test_script.sh b/sql/hive/src/test/resources/test_script.sh new file mode 100755 index 000000000000..eb0c50e98292 --- /dev/null +++ b/sql/hive/src/test/resources/test_script.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +while read line +do + echo "$line" | sed $'s/\t/_/' +done < /dev/stdin diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionSQLBuilderSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionSQLBuilderSuite.scala index c8bf20d13bdb..fef726c5d801 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionSQLBuilderSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionSQLBuilderSuite.scala @@ -20,8 +20,7 @@ package org.apache.spark.sql.catalyst import java.sql.Timestamp import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions.{If, Literal} - +import org.apache.spark.sql.catalyst.expressions.{If, Literal, SpecifiedWindowFrame, WindowSpecDefinition} class ExpressionSQLBuilderSuite extends SQLBuilderTest { test("literal") { @@ -79,4 +78,36 @@ class ExpressionSQLBuilderSuite extends SQLBuilderTest { checkSQL(-'a.int, "(-`a`)") checkSQL(-('a.int + 'b.int), "(-(`a` + `b`))") } + + test("window specification") { + val frame = SpecifiedWindowFrame.defaultWindowFrame( + hasOrderSpecification = true, + acceptWindowFrame = true + ) + + checkSQL( + WindowSpecDefinition('a.int :: Nil, Nil, frame), + s"(PARTITION BY `a` $frame)" + ) + + checkSQL( + WindowSpecDefinition('a.int :: 'b.string :: Nil, Nil, frame), + s"(PARTITION BY `a`, `b` $frame)" + ) + + checkSQL( + WindowSpecDefinition(Nil, 'a.int.asc :: Nil, frame), + s"(ORDER BY `a` ASC $frame)" + ) + + checkSQL( + WindowSpecDefinition(Nil, 'a.int.asc :: 'b.string.desc :: Nil, frame), + s"(ORDER BY `a` ASC, `b` DESC $frame)" + ) + + checkSQL( + WindowSpecDefinition('a.int :: 'b.string :: Nil, 'c.int.asc :: 'd.string.desc :: Nil, frame), + s"(PARTITION BY `a`, `b` ORDER BY `c` ASC, `d` DESC $frame)" + ) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala index 0827b04252bc..1f5078dc9c7d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala @@ -17,15 +17,33 @@ package org.apache.spark.sql.catalyst +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, NoSuchFileException, Paths} + import scala.util.control.NonFatal import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SQLTestUtils +/** + * A test suite for LogicalPlan-to-SQL conversion. + * + * Each query has a golden generated SQL file in test/resources/sqlgen. The test suite also has + * built-in functionality to automatically generate these golden files. + * + * To re-generate golden files, run: + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "hive/test-only *LogicalPlanToSQLSuite" + */ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { import testImplicits._ + // Used for generating new query answer files by saving + private val regenerateGoldenFiles: Boolean = + Option(System.getenv("SPARK_GENERATE_GOLDEN_FILES")) == Some("1") + private val goldenSQLPath = "src/test/resources/sqlgen/" + protected override def beforeAll(): Unit = { super.beforeAll() sql("DROP TABLE IF EXISTS parquet_t0") @@ -76,22 +94,60 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } } - private def checkHiveQl(hiveQl: String): Unit = { - val df = sql(hiveQl) + /** + * Compare the generated SQL with the expected answer string. + * Note that there exists a normalization for both arguments for the convenience. + * - Remove the id from the generated attributes, e.g., `gen_attr_1` -> `gen_attr`. + */ + private def checkSQLStructure(originalSQL: String, convertedSQL: String, answerFile: String) = { + val normalizedGenSQL = convertedSQL.replaceAll("`gen_attr_\\d+`", "`gen_attr`") + if (answerFile != null) { + val separator = "-" * 80 + if (regenerateGoldenFiles) { + val path = Paths.get(s"$goldenSQLPath/$answerFile.sql") + val header = "-- This file is automatically generated by LogicalPlanToSQLSuite." + val answerText = s"$header\n${originalSQL.trim()}\n${separator}\n$normalizedGenSQL\n" + Files.write(path, answerText.getBytes(StandardCharsets.UTF_8)) + } else { + val goldenFileName = s"sqlgen/$answerFile.sql" + val resourceFile = getClass.getClassLoader.getResource(goldenFileName) + if (resourceFile == null) { + throw new NoSuchFileException(goldenFileName) + } + val path = resourceFile.getPath + val answerText = new String(Files.readAllBytes(Paths.get(path)), StandardCharsets.UTF_8) + val sqls = answerText.split(separator) + assert(sqls.length == 2, "Golden sql files should have a separator.") + val normalizedExpectSQL = sqls(1).trim() + assert(normalizedGenSQL == normalizedExpectSQL) + } + } + } + + /** + * 1. Checks if SQL parsing succeeds. + * 2. Checks if SQL generation succeeds. + * 3. Checks the generated SQL against golden files. + * 4. Verifies the execution result stays the same. + */ + private def checkSQL(sqlString: String, answerFile: String = null): Unit = { + val df = sql(sqlString) val convertedSQL = try new SQLBuilder(df).toSQL catch { case NonFatal(e) => fail( - s"""Cannot convert the following HiveQL query plan back to SQL query string: + s"""Cannot convert the following SQL query plan back to SQL query string: | - |# Original HiveQL query string: - |$hiveQl + |# Original SQL query string: + |$sqlString | |# Resolved query plan: |${df.queryExecution.analyzed.treeString} """.stripMargin, e) } + checkSQLStructure(sqlString, convertedSQL, answerFile) + try { checkAnswer(sql(convertedSQL), df) } catch { case cause: Throwable => @@ -101,8 +157,8 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { |# Converted SQL query string: |$convertedSQL | - |# Original HiveQL query string: - |$hiveQl + |# Original SQL query string: + |$sqlString | |# Resolved query plan: |${df.queryExecution.analyzed.treeString} @@ -110,24 +166,57 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } } + // When saving golden files, these tests should be ignored to prevent making files. + if (!regenerateGoldenFiles) { + test("Test should fail if the SQL query cannot be parsed") { + val m = intercept[ParseException] { + checkSQL("SELE", "NOT_A_FILE") + }.getMessage + assert(m.contains("mismatched input")) + } + + test("Test should fail if the golden file cannot be found") { + val m2 = intercept[NoSuchFileException] { + checkSQL("SELECT 1", "NOT_A_FILE") + }.getMessage + assert(m2.contains("NOT_A_FILE")) + } + + test("Test should fail if the SQL query cannot be regenerated") { + spark.range(10).createOrReplaceTempView("not_sql_gen_supported_table_so_far") + sql("select * from not_sql_gen_supported_table_so_far") + val m3 = intercept[org.scalatest.exceptions.TestFailedException] { + checkSQL("select * from not_sql_gen_supported_table_so_far", "in") + }.getMessage + assert(m3.contains("Cannot convert the following SQL query plan back to SQL query string")) + } + + test("Test should fail if the SQL query did not equal to the golden SQL") { + val m4 = intercept[org.scalatest.exceptions.TestFailedException] { + checkSQL("SELECT 1", "in") + }.getMessage + assert(m4.contains("did not equal")) + } + } + test("in") { - checkHiveQl("SELECT id FROM parquet_t0 WHERE id IN (1, 2, 3)") + checkSQL("SELECT id FROM parquet_t0 WHERE id IN (1, 2, 3)", "in") } test("not in") { - checkHiveQl("SELECT id FROM t0 WHERE id NOT IN (1, 2, 3)") + checkSQL("SELECT id FROM t0 WHERE id NOT IN (1, 2, 3)", "not_in") } test("not like") { - checkHiveQl("SELECT id FROM t0 WHERE id + 5 NOT LIKE '1%'") + checkSQL("SELECT id FROM t0 WHERE id + 5 NOT LIKE '1%'", "not_like") } test("aggregate function in having clause") { - checkHiveQl("SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0") + checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0", "agg1") } test("aggregate function in order by clause") { - checkHiveQl("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY MAX(key)") + checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY MAX(key)", "agg2") } // When there are multiple aggregate functions in ORDER BY clause, all of them are extracted into @@ -135,61 +224,67 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { // execution since these aliases have different expression ID. But this introduces name collision // when converting resolved plans back to SQL query strings as expression IDs are stripped. test("aggregate function in order by clause with multiple order keys") { - checkHiveQl("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key, MAX(key)") + checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key, MAX(key)", "agg3") } test("type widening in union") { - checkHiveQl("SELECT id FROM parquet_t0 UNION ALL SELECT CAST(id AS INT) AS id FROM parquet_t0") + checkSQL("SELECT id FROM parquet_t0 UNION ALL SELECT CAST(id AS INT) AS id FROM parquet_t0", + "type_widening") } test("union distinct") { - checkHiveQl("SELECT * FROM t0 UNION SELECT * FROM t0") + checkSQL("SELECT * FROM t0 UNION SELECT * FROM t0", "union_distinct") } test("three-child union") { - checkHiveQl( + checkSQL( """ |SELECT id FROM parquet_t0 |UNION ALL SELECT id FROM parquet_t0 |UNION ALL SELECT id FROM parquet_t0 - """.stripMargin) + """.stripMargin, + "three_child_union") } test("intersect") { - checkHiveQl("SELECT * FROM t0 INTERSECT SELECT * FROM t0") + checkSQL("SELECT * FROM t0 INTERSECT SELECT * FROM t0", "intersect") } test("except") { - checkHiveQl("SELECT * FROM t0 EXCEPT SELECT * FROM t0") + checkSQL("SELECT * FROM t0 EXCEPT SELECT * FROM t0", "except") } test("self join") { - checkHiveQl("SELECT x.key FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key") + checkSQL("SELECT x.key FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key", "self_join") } test("self join with group by") { - checkHiveQl( - "SELECT x.key, COUNT(*) FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key group by x.key") + checkSQL( + "SELECT x.key, COUNT(*) FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key group by x.key", + "self_join_with_group_by") } test("case") { - checkHiveQl("SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM parquet_t0") + checkSQL("SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM parquet_t0", + "case") } test("case with else") { - checkHiveQl("SELECT CASE WHEN id % 2 > 0 THEN 0 ELSE 1 END FROM parquet_t0") + checkSQL("SELECT CASE WHEN id % 2 > 0 THEN 0 ELSE 1 END FROM parquet_t0", "case_with_else") } test("case with key") { - checkHiveQl("SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' END FROM parquet_t0") + checkSQL("SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' END FROM parquet_t0", + "case_with_key") } test("case with key and else") { - checkHiveQl("SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' ELSE 'baz' END FROM parquet_t0") + checkSQL("SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' ELSE 'baz' END FROM parquet_t0", + "case_with_key_and_else") } test("select distinct without aggregate functions") { - checkHiveQl("SELECT DISTINCT id FROM parquet_t0") + checkSQL("SELECT DISTINCT id FROM parquet_t0", "select_distinct") } test("rollup/cube #1") { @@ -213,146 +308,195 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { // FROM `default`.`t1` // GROUP BY (`t1`.`key` % CAST(5 AS BIGINT)) // GROUPING SETS (((`t1`.`key` % CAST(5 AS BIGINT))), ()) - checkHiveQl( - "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH ROLLUP") - checkHiveQl( - "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH CUBE") + checkSQL( + "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH ROLLUP", + "rollup_cube_1_1") + + checkSQL( + "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH CUBE", + "rollup_cube_1_2") } test("rollup/cube #2") { - checkHiveQl("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH ROLLUP") - checkHiveQl("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH CUBE") + checkSQL("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH ROLLUP", + "rollup_cube_2_1") + + checkSQL("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH CUBE", + "rollup_cube_2_2") } test("rollup/cube #3") { - checkHiveQl( - "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH ROLLUP") - checkHiveQl( - "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH CUBE") + checkSQL( + "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH ROLLUP", + "rollup_cube_3_1") + + checkSQL( + "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH CUBE", + "rollup_cube_3_2") } test("rollup/cube #4") { - checkHiveQl( + checkSQL( s""" |SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 |GROUP BY key % 5, key - 5 WITH ROLLUP - """.stripMargin) - checkHiveQl( + """.stripMargin, + "rollup_cube_4_1") + + checkSQL( s""" |SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 |GROUP BY key % 5, key - 5 WITH CUBE - """.stripMargin) + """.stripMargin, + "rollup_cube_4_2") } test("rollup/cube #5") { - checkHiveQl( + checkSQL( s""" |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3 |FROM (SELECT key, key%2, key - 5 FROM parquet_t1) t GROUP BY key%5, key-5 |WITH ROLLUP - """.stripMargin) - checkHiveQl( + """.stripMargin, + "rollup_cube_5_1") + + checkSQL( s""" |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3 |FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 |WITH CUBE - """.stripMargin) + """.stripMargin, + "rollup_cube_5_2") } test("rollup/cube #6") { - checkHiveQl("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b") - checkHiveQl("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b") - checkHiveQl("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b") - checkHiveQl("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b") - checkHiveQl("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH ROLLUP") - checkHiveQl("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH CUBE") + checkSQL("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b", + "rollup_cube_6_1") + + checkSQL("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b", + "rollup_cube_6_2") + + checkSQL("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b", + "rollup_cube_6_3") + + checkSQL("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b", + "rollup_cube_6_4") + + checkSQL("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH ROLLUP", + "rollup_cube_6_5") + + checkSQL("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH CUBE", + "rollup_cube_6_6") } test("rollup/cube #7") { - checkHiveQl("SELECT a, b, grouping_id(a, b) FROM parquet_t2 GROUP BY cube(a, b)") - checkHiveQl("SELECT a, b, grouping(b) FROM parquet_t2 GROUP BY cube(a, b)") - checkHiveQl("SELECT a, b, grouping(a) FROM parquet_t2 GROUP BY cube(a, b)") + checkSQL("SELECT a, b, grouping_id(a, b) FROM parquet_t2 GROUP BY cube(a, b)", + "rollup_cube_7_1") + + checkSQL("SELECT a, b, grouping(b) FROM parquet_t2 GROUP BY cube(a, b)", + "rollup_cube_7_2") + + checkSQL("SELECT a, b, grouping(a) FROM parquet_t2 GROUP BY cube(a, b)", + "rollup_cube_7_3") } test("rollup/cube #8") { // grouping_id() is part of another expression - checkHiveQl( + checkSQL( s""" |SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid |FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5 |WITH ROLLUP - """.stripMargin) - checkHiveQl( + """.stripMargin, + "rollup_cube_8_1") + + checkSQL( s""" |SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid |FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5 |WITH CUBE - """.stripMargin) + """.stripMargin, + "rollup_cube_8_2") } test("rollup/cube #9") { // self join is used as the child node of ROLLUP/CUBE with replaced quantifiers - checkHiveQl( + checkSQL( s""" |SELECT t.key - 5, cnt, SUM(cnt) |FROM (SELECT x.key, COUNT(*) as cnt |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t |GROUP BY cnt, t.key - 5 |WITH ROLLUP - """.stripMargin) - checkHiveQl( + """.stripMargin, + "rollup_cube_9_1") + + checkSQL( s""" |SELECT t.key - 5, cnt, SUM(cnt) |FROM (SELECT x.key, COUNT(*) as cnt |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t |GROUP BY cnt, t.key - 5 |WITH CUBE - """.stripMargin) + """.stripMargin, + "rollup_cube_9_2") } test("grouping sets #1") { - checkHiveQl( + checkSQL( s""" |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id() AS k3 |FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 |GROUPING SETS (key % 5, key - 5) - """.stripMargin) + """.stripMargin, + "grouping_sets_1") } test("grouping sets #2") { - checkHiveQl( - "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a, b) ORDER BY a, b") - checkHiveQl( - "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a) ORDER BY a, b") - checkHiveQl( - "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (b) ORDER BY a, b") - checkHiveQl( - "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (()) ORDER BY a, b") - checkHiveQl( + checkSQL( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a, b) ORDER BY a, b", + "grouping_sets_2_1") + + checkSQL( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a) ORDER BY a, b", + "grouping_sets_2_2") + + checkSQL( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (b) ORDER BY a, b", + "grouping_sets_2_3") + + checkSQL( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (()) ORDER BY a, b", + "grouping_sets_2_4") + + checkSQL( s""" |SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b |GROUPING SETS ((), (a), (a, b)) ORDER BY a, b - """.stripMargin) + """.stripMargin, + "grouping_sets_2_5") } test("cluster by") { - checkHiveQl("SELECT id FROM parquet_t0 CLUSTER BY id") + checkSQL("SELECT id FROM parquet_t0 CLUSTER BY id", "cluster_by") } test("distribute by") { - checkHiveQl("SELECT id FROM parquet_t0 DISTRIBUTE BY id") + checkSQL("SELECT id FROM parquet_t0 DISTRIBUTE BY id", "distribute_by") } test("distribute by with sort by") { - checkHiveQl("SELECT id FROM parquet_t0 DISTRIBUTE BY id SORT BY id") + checkSQL("SELECT id FROM parquet_t0 DISTRIBUTE BY id SORT BY id", + "distribute_by_with_sort_by") } test("SPARK-13720: sort by after having") { - checkHiveQl("SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0 SORT BY key") + checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0 SORT BY key", + "sort_by_after_having") } test("distinct aggregation") { - checkHiveQl("SELECT COUNT(DISTINCT id) FROM parquet_t0") + checkSQL("SELECT COUNT(DISTINCT id) FROM parquet_t0", "distinct_aggregation") } test("TABLESAMPLE") { @@ -361,33 +505,34 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { // +- Subquery s // +- Subquery parquet_t0 // +- Relation[id#2L] ParquetRelation - checkHiveQl("SELECT s.id FROM parquet_t0 TABLESAMPLE(100 PERCENT) s") + checkSQL("SELECT s.id FROM parquet_t0 TABLESAMPLE(100 PERCENT) s", "tablesample_1") // Project [id#2L] // +- Sample 0.0, 1.0, false, ... // +- Subquery parquet_t0 // +- Relation[id#2L] ParquetRelation - checkHiveQl("SELECT * FROM parquet_t0 TABLESAMPLE(100 PERCENT)") + checkSQL("SELECT * FROM parquet_t0 TABLESAMPLE(100 PERCENT)", "tablesample_2") // Project [id#21L] // +- Sample 0.0, 1.0, false, ... // +- MetastoreRelation default, t0, Some(s) - checkHiveQl("SELECT s.id FROM t0 TABLESAMPLE(100 PERCENT) s") + checkSQL("SELECT s.id FROM t0 TABLESAMPLE(100 PERCENT) s", "tablesample_3") // Project [id#24L] // +- Sample 0.0, 1.0, false, ... // +- MetastoreRelation default, t0, None - checkHiveQl("SELECT * FROM t0 TABLESAMPLE(100 PERCENT)") + checkSQL("SELECT * FROM t0 TABLESAMPLE(100 PERCENT)", "tablesample_4") // When a sampling fraction is not 100%, the returned results are random. // Thus, added an always-false filter here to check if the generated plan can be successfully // executed. - checkHiveQl("SELECT s.id FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) s WHERE 1=0") - checkHiveQl("SELECT * FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) WHERE 1=0") + checkSQL("SELECT s.id FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) s WHERE 1=0", "tablesample_5") + checkSQL("SELECT * FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) WHERE 1=0", "tablesample_6") } test("multi-distinct columns") { - checkHiveQl("SELECT a, COUNT(DISTINCT b), COUNT(DISTINCT c), SUM(d) FROM parquet_t2 GROUP BY a") + checkSQL("SELECT a, COUNT(DISTINCT b), COUNT(DISTINCT c), SUM(d) FROM parquet_t2 GROUP BY a", + "multi_distinct") } test("persisted data source relations") { @@ -395,48 +540,54 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { val tableName = s"${format}_parquet_t0" withTable(tableName) { spark.range(10).write.format(format).saveAsTable(tableName) - checkHiveQl(s"SELECT id FROM $tableName") + checkSQL(s"SELECT id FROM $tableName", s"data_source_$tableName") } } } test("script transformation - schemaless") { - checkHiveQl("SELECT TRANSFORM (a, b, c, d) USING 'cat' FROM parquet_t2") - checkHiveQl("SELECT TRANSFORM (*) USING 'cat' FROM parquet_t2") + checkSQL("SELECT TRANSFORM (a, b, c, d) USING 'cat' FROM parquet_t2", + "script_transformation_1") + checkSQL("SELECT TRANSFORM (*) USING 'cat' FROM parquet_t2", + "script_transformation_2") } test("script transformation - alias list") { - checkHiveQl("SELECT TRANSFORM (a, b, c, d) USING 'cat' AS (d1, d2, d3, d4) FROM parquet_t2") + checkSQL("SELECT TRANSFORM (a, b, c, d) USING 'cat' AS (d1, d2, d3, d4) FROM parquet_t2", + "script_transformation_alias_list") } test("script transformation - alias list with type") { - checkHiveQl( + checkSQL( """FROM |(FROM parquet_t1 SELECT TRANSFORM(key, value) USING 'cat' AS (thing1 int, thing2 string)) t |SELECT thing1 + 1 - """.stripMargin) + """.stripMargin, + "script_transformation_alias_list_with_type") } test("script transformation - row format delimited clause with only one format property") { - checkHiveQl( + checkSQL( """SELECT TRANSFORM (key) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' |USING 'cat' AS (tKey) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' |FROM parquet_t1 - """.stripMargin) + """.stripMargin, + "script_transformation_row_format_one") } test("script transformation - row format delimited clause with multiple format properties") { - checkHiveQl( + checkSQL( """SELECT TRANSFORM (key) |ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t' |USING 'cat' AS (tKey) |ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t' |FROM parquet_t1 - """.stripMargin) + """.stripMargin, + "script_transformation_row_format_multiple") } test("script transformation - row format serde clauses with SERDEPROPERTIES") { - checkHiveQl( + checkSQL( """SELECT TRANSFORM (key, value) |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |WITH SERDEPROPERTIES('field.delim' = '|') @@ -444,17 +595,19 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |WITH SERDEPROPERTIES('field.delim' = '|') |FROM parquet_t1 - """.stripMargin) + """.stripMargin, + "script_transformation_row_format_serde") } test("script transformation - row format serde clauses without SERDEPROPERTIES") { - checkHiveQl( + checkSQL( """SELECT TRANSFORM (key, value) |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |USING 'cat' AS (tKey, tValue) |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |FROM parquet_t1 - """.stripMargin) + """.stripMargin, + "script_transformation_row_format_without_serde") } test("plans with non-SQL expressions") { @@ -464,7 +617,7 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { test("named expression in column names shouldn't be quoted") { def checkColumnNames(query: String, expectedColNames: String*): Unit = { - checkHiveQl(query) + checkSQL(query) assert(sql(query).columns === expectedColNames) } @@ -521,21 +674,25 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } test("window basic") { - checkHiveQl("SELECT MAX(value) OVER (PARTITION BY key % 3) FROM parquet_t1") - checkHiveQl( + checkSQL("SELECT MAX(value) OVER (PARTITION BY key % 3) FROM parquet_t1", "window_basic_1") + + checkSQL( """ |SELECT key, value, ROUND(AVG(key) OVER (), 2) |FROM parquet_t1 ORDER BY key - """.stripMargin) - checkHiveQl( + """.stripMargin, + "window_basic_2") + + checkSQL( """ |SELECT value, MAX(key + 1) OVER (PARTITION BY key % 5 ORDER BY key % 7) AS max |FROM parquet_t1 - """.stripMargin) + """.stripMargin, + "window_basic_3") } test("multiple window functions in one expression") { - checkHiveQl( + checkSQL( """ |SELECT | MAX(key) OVER (ORDER BY key DESC, value) / MIN(key) OVER (PARTITION BY key % 3) @@ -544,15 +701,17 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } test("regular expressions and window functions in one expression") { - checkHiveQl("SELECT MAX(key) OVER (PARTITION BY key % 3) + key FROM parquet_t1") + checkSQL("SELECT MAX(key) OVER (PARTITION BY key % 3) + key FROM parquet_t1", + "regular_expressions_and_window") } test("aggregate functions and window functions in one expression") { - checkHiveQl("SELECT MAX(c) + COUNT(a) OVER () FROM parquet_t2 GROUP BY a, b") + checkSQL("SELECT MAX(c) + COUNT(a) OVER () FROM parquet_t2 GROUP BY a, b", + "aggregate_functions_and_window") } test("window with different window specification") { - checkHiveQl( + checkSQL( """ |SELECT key, value, |DENSE_RANK() OVER (ORDER BY key, value) AS dr, @@ -562,45 +721,49 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } test("window with the same window specification with aggregate + having") { - checkHiveQl( + checkSQL( """ |SELECT key, value, |MAX(value) OVER (PARTITION BY key % 5 ORDER BY key DESC) AS max |FROM parquet_t1 GROUP BY key, value HAVING key > 5 - """.stripMargin) + """.stripMargin, + "window_with_the_same_window_with_agg_having") } test("window with the same window specification with aggregate functions") { - checkHiveQl( + checkSQL( """ |SELECT key, value, |MAX(value) OVER (PARTITION BY key % 5 ORDER BY key) AS max |FROM parquet_t1 GROUP BY key, value - """.stripMargin) + """.stripMargin, + "window_with_the_same_window_with_agg_functions") } test("window with the same window specification with aggregate") { - checkHiveQl( + checkSQL( """ |SELECT key, value, |DENSE_RANK() OVER (DISTRIBUTE BY key SORT BY key, value) AS dr, |COUNT(key) |FROM parquet_t1 GROUP BY key, value - """.stripMargin) + """.stripMargin, + "window_with_the_same_window_with_agg") } test("window with the same window specification without aggregate and filter") { - checkHiveQl( + checkSQL( """ |SELECT key, value, |DENSE_RANK() OVER (DISTRIBUTE BY key SORT BY key, value) AS dr, |COUNT(key) OVER(DISTRIBUTE BY key SORT BY key, value) AS ca |FROM parquet_t1 - """.stripMargin) + """.stripMargin, + "window_with_the_same_window_with_agg_filter") } test("window clause") { - checkHiveQl( + checkSQL( """ |SELECT key, MAX(value) OVER w1 AS MAX, MIN(value) OVER w2 AS min |FROM parquet_t1 @@ -609,7 +772,7 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } test("special window functions") { - checkHiveQl( + checkSQL( """ |SELECT | RANK() OVER w, @@ -626,107 +789,120 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { } test("window with join") { - checkHiveQl( + checkSQL( """ |SELECT x.key, MAX(y.key) OVER (PARTITION BY x.key % 5 ORDER BY x.key) |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key - """.stripMargin) + """.stripMargin, + "window_with_join") } test("join 2 tables and aggregate function in having clause") { - checkHiveQl( + checkSQL( """ |SELECT COUNT(a.value), b.KEY, a.KEY |FROM parquet_t1 a, parquet_t1 b |GROUP BY a.KEY, b.KEY |HAVING MAX(a.KEY) > 0 - """.stripMargin) + """.stripMargin, + "join_2_tables") } test("generator in project list without FROM clause") { - checkHiveQl("SELECT EXPLODE(ARRAY(1,2,3))") - checkHiveQl("SELECT EXPLODE(ARRAY(1,2,3)) AS val") + checkSQL("SELECT EXPLODE(ARRAY(1,2,3))", "generator_without_from_1") + checkSQL("SELECT EXPLODE(ARRAY(1,2,3)) AS val", "generator_without_from_2") } test("generator in project list with non-referenced table") { - checkHiveQl("SELECT EXPLODE(ARRAY(1,2,3)) FROM t0") - checkHiveQl("SELECT EXPLODE(ARRAY(1,2,3)) AS val FROM t0") + checkSQL("SELECT EXPLODE(ARRAY(1,2,3)) FROM t0", "generator_non_referenced_table_1") + checkSQL("SELECT EXPLODE(ARRAY(1,2,3)) AS val FROM t0", "generator_non_referenced_table_2") } test("generator in project list with referenced table") { - checkHiveQl("SELECT EXPLODE(arr) FROM parquet_t3") - checkHiveQl("SELECT EXPLODE(arr) AS val FROM parquet_t3") + checkSQL("SELECT EXPLODE(arr) FROM parquet_t3", "generator_referenced_table_1") + checkSQL("SELECT EXPLODE(arr) AS val FROM parquet_t3", "generator_referenced_table_2") } test("generator in project list with non-UDTF expressions") { - checkHiveQl("SELECT EXPLODE(arr), id FROM parquet_t3") - checkHiveQl("SELECT EXPLODE(arr) AS val, id as a FROM parquet_t3") + checkSQL("SELECT EXPLODE(arr), id FROM parquet_t3", "generator_non_udtf_1") + checkSQL("SELECT EXPLODE(arr) AS val, id as a FROM parquet_t3", "generator_non_udtf_2") } test("generator in lateral view") { - checkHiveQl("SELECT val, id FROM parquet_t3 LATERAL VIEW EXPLODE(arr) exp AS val") - checkHiveQl("SELECT val, id FROM parquet_t3 LATERAL VIEW OUTER EXPLODE(arr) exp AS val") + checkSQL("SELECT val, id FROM parquet_t3 LATERAL VIEW EXPLODE(arr) exp AS val", + "generator_in_lateral_view_1") + checkSQL("SELECT val, id FROM parquet_t3 LATERAL VIEW OUTER EXPLODE(arr) exp AS val", + "generator_in_lateral_view_2") } test("generator in lateral view with ambiguous names") { - checkHiveQl( + checkSQL( """ |SELECT exp.id, parquet_t3.id |FROM parquet_t3 |LATERAL VIEW EXPLODE(arr) exp AS id - """.stripMargin) - checkHiveQl( + """.stripMargin, + "generator_with_ambiguous_names_1") + + checkSQL( """ |SELECT exp.id, parquet_t3.id |FROM parquet_t3 |LATERAL VIEW OUTER EXPLODE(arr) exp AS id - """.stripMargin) + """.stripMargin, + "generator_with_ambiguous_names_2") } test("use JSON_TUPLE as generator") { - checkHiveQl( + checkSQL( """ |SELECT c0, c1, c2 |FROM parquet_t3 |LATERAL VIEW JSON_TUPLE(json, 'f1', 'f2', 'f3') jt - """.stripMargin) - checkHiveQl( + """.stripMargin, + "json_tuple_generator_1") + + checkSQL( """ |SELECT a, b, c |FROM parquet_t3 |LATERAL VIEW JSON_TUPLE(json, 'f1', 'f2', 'f3') jt AS a, b, c - """.stripMargin) + """.stripMargin, + "json_tuple_generator_2") } test("nested generator in lateral view") { - checkHiveQl( + checkSQL( """ |SELECT val, id |FROM parquet_t3 |LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array |LATERAL VIEW EXPLODE(nested_array) exp1 AS val - """.stripMargin) + """.stripMargin, + "nested_generator_in_lateral_view_1") - checkHiveQl( + checkSQL( """ |SELECT val, id |FROM parquet_t3 |LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array |LATERAL VIEW OUTER EXPLODE(nested_array) exp1 AS val - """.stripMargin) + """.stripMargin, + "nested_generator_in_lateral_view_2") } test("generate with other operators") { - checkHiveQl( + checkSQL( """ |SELECT EXPLODE(arr) AS val, id |FROM parquet_t3 |WHERE id > 2 |ORDER BY val, id |LIMIT 5 - """.stripMargin) + """.stripMargin, + "generate_with_other_1") - checkHiveQl( + checkSQL( """ |SELECT val, id |FROM parquet_t3 @@ -735,24 +911,26 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { |WHERE val > 2 |ORDER BY val, id |LIMIT 5 - """.stripMargin) + """.stripMargin, + "generate_with_other_2") } test("filter after subquery") { - checkHiveQl("SELECT a FROM (SELECT key + 1 AS a FROM parquet_t1) t WHERE a > 5") + checkSQL("SELECT a FROM (SELECT key + 1 AS a FROM parquet_t1) t WHERE a > 5", + "filter_after_subquery") } test("SPARK-14933 - select parquet table") { withTable("parquet_t") { sql("create table parquet_t stored as parquet as select 1 as c1, 'abc' as c2") - checkHiveQl("select * from parquet_t") + checkSQL("select * from parquet_t", "select_parquet_table") } } test("SPARK-14933 - select orc table") { withTable("orc_t") { sql("create table orc_t stored as orc as select 1 as c1, 'abc' as c2") - checkHiveQl("select * from orc_t") + checkSQL("select * from orc_t", "select_orc_table") } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index f7c3e347b61e..7d4ef6f26a60 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -129,7 +129,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("CACHE TABLE tableName AS SELECT * FROM anotherTable") { - withTempTable("testCacheTable") { + withTempView("testCacheTable") { sql("CACHE TABLE testCacheTable AS SELECT * FROM src") assertCached(table("testCacheTable")) @@ -144,7 +144,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("CACHE TABLE tableName AS SELECT ...") { - withTempTable("testCacheTable") { + withTempView("testCacheTable") { sql("CACHE TABLE testCacheTable AS SELECT key FROM src LIMIT 10") assertCached(table("testCacheTable")) @@ -177,7 +177,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("CACHE TABLE with Hive UDF") { - withTempTable("udfTest") { + withTempView("udfTest") { sql("CACHE TABLE udfTest AS SELECT * FROM src WHERE floor(key) = 1") assertCached(table("udfTest")) uncacheTable("udfTest") @@ -276,7 +276,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto test("Cache/Uncache Qualified Tables") { withTempDatabase { db => - withTempTable("cachedTable") { + withTempView("cachedTable") { sql(s"CREATE TABLE $db.cachedTable STORED AS PARQUET AS SELECT 1") sql(s"CACHE TABLE $db.cachedTable") assertCached(spark.table(s"$db.cachedTable")) @@ -298,7 +298,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto test("Cache Table As Select - having database name") { withTempDatabase { db => - withTempTable("cachedTable") { + withTempView("cachedTable") { val e = intercept[ParseException] { sql(s"CACHE TABLE $db.cachedTable AS SELECT 1") }.getMessage diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala index 867aadb5f556..5450fba7533e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala @@ -37,7 +37,6 @@ class HiveDDLCommandSuite extends PlanTest { parser.parsePlan(sql).collect { case c: CreateTableCommand => (c.table, c.ifNotExists) case c: CreateHiveTableAsSelectLogicalPlan => (c.tableDesc, c.allowExisting) - case c: CreateViewCommand => (c.tableDesc, c.allowExisting) }.head } @@ -100,7 +99,7 @@ class HiveDDLCommandSuite extends PlanTest { assert(desc.viewText.isEmpty) assert(desc.viewOriginalText.isEmpty) assert(desc.partitionColumns == Seq.empty[CatalogColumn]) - assert(desc.storage.serdeProperties == Map()) + assert(desc.storage.properties == Map()) assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) @@ -118,7 +117,7 @@ class HiveDDLCommandSuite extends PlanTest { assert(desc.schema == Seq.empty[CatalogColumn]) assert(desc.viewText == None) // TODO will be SQLText assert(desc.viewOriginalText.isEmpty) - assert(desc.storage.serdeProperties == Map()) + assert(desc.storage.properties == Map()) assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.TextInputFormat")) assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) @@ -154,7 +153,7 @@ class HiveDDLCommandSuite extends PlanTest { assert(desc.schema == Seq.empty[CatalogColumn]) assert(desc.viewText == None) // TODO will be SQLText assert(desc.viewOriginalText.isEmpty) - assert(desc.storage.serdeProperties == Map(("serde_p1" -> "p1"), ("serde_p2" -> "p2"))) + assert(desc.storage.properties == Map(("serde_p1" -> "p1"), ("serde_p2" -> "p2"))) assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe")) @@ -294,9 +293,7 @@ class HiveDDLCommandSuite extends PlanTest { assert(desc.tableType == CatalogTableType.MANAGED) assert(desc.schema == Seq(CatalogColumn("id", "int"), CatalogColumn("name", "string"))) assert(desc.partitionColumnNames.isEmpty) - assert(desc.sortColumnNames.isEmpty) - assert(desc.bucketColumnNames.isEmpty) - assert(desc.numBuckets == -1) + assert(desc.bucketSpec.isEmpty) assert(desc.viewText.isEmpty) assert(desc.viewOriginalText.isEmpty) assert(desc.storage.locationUri.isEmpty) @@ -305,7 +302,7 @@ class HiveDDLCommandSuite extends PlanTest { assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) assert(desc.storage.serde.isEmpty) - assert(desc.storage.serdeProperties.isEmpty) + assert(desc.storage.properties.isEmpty) assert(desc.properties.isEmpty) assert(desc.comment.isEmpty) } @@ -391,10 +388,10 @@ class HiveDDLCommandSuite extends PlanTest { val (desc2, _) = extractTableDesc(query2) val (desc3, _) = extractTableDesc(query3) assert(desc1.storage.serde == Some("org.apache.poof.serde.Baff")) - assert(desc1.storage.serdeProperties.isEmpty) + assert(desc1.storage.properties.isEmpty) assert(desc2.storage.serde == Some("org.apache.poof.serde.Baff")) - assert(desc2.storage.serdeProperties == Map("k1" -> "v1")) - assert(desc3.storage.serdeProperties == Map( + assert(desc2.storage.properties == Map("k1" -> "v1")) + assert(desc3.storage.properties == Map( "field.delim" -> "x", "escape.delim" -> "y", "serialization.format" -> "x", @@ -454,63 +451,44 @@ class HiveDDLCommandSuite extends PlanTest { CatalogColumn("name", "string"), CatalogColumn("month", "int"))) assert(desc.partitionColumnNames == Seq("month")) - assert(desc.sortColumnNames.isEmpty) - assert(desc.bucketColumnNames.isEmpty) - assert(desc.numBuckets == -1) + assert(desc.bucketSpec.isEmpty) assert(desc.viewText.isEmpty) assert(desc.viewOriginalText.isEmpty) assert(desc.storage.locationUri == Some("/path/to/mercury")) assert(desc.storage.inputFormat == Some("winput")) assert(desc.storage.outputFormat == Some("wowput")) assert(desc.storage.serde == Some("org.apache.poof.serde.Baff")) - assert(desc.storage.serdeProperties == Map("k1" -> "v1")) + assert(desc.storage.properties == Map("k1" -> "v1")) assert(desc.properties == Map("k1" -> "v1", "k2" -> "v2")) assert(desc.comment == Some("no comment")) } test("create view -- basic") { val v1 = "CREATE VIEW view1 AS SELECT * FROM tab1" - val (desc, exists) = extractTableDesc(v1) - assert(!exists) - assert(desc.identifier.database.isEmpty) - assert(desc.identifier.table == "view1") - assert(desc.tableType == CatalogTableType.VIEW) - assert(desc.storage.locationUri.isEmpty) - assert(desc.schema == Seq.empty[CatalogColumn]) - assert(desc.viewText == Option("SELECT * FROM tab1")) - assert(desc.viewOriginalText == Option("SELECT * FROM tab1")) - assert(desc.storage.serdeProperties == Map()) - assert(desc.storage.inputFormat.isEmpty) - assert(desc.storage.outputFormat.isEmpty) - assert(desc.storage.serde.isEmpty) - assert(desc.properties == Map()) + val command = parser.parsePlan(v1).asInstanceOf[CreateViewCommand] + assert(!command.allowExisting) + assert(command.name.database.isEmpty) + assert(command.name.table == "view1") + assert(command.originalText == Some("SELECT * FROM tab1")) + assert(command.userSpecifiedColumns.isEmpty) } test("create view - full") { val v1 = """ |CREATE OR REPLACE VIEW view1 - |(col1, col3) + |(col1, col3 COMMENT 'hello') |COMMENT 'BLABLA' |TBLPROPERTIES('prop1Key'="prop1Val") |AS SELECT * FROM tab1 """.stripMargin - val (desc, exists) = extractTableDesc(v1) - assert(desc.identifier.database.isEmpty) - assert(desc.identifier.table == "view1") - assert(desc.tableType == CatalogTableType.VIEW) - assert(desc.storage.locationUri.isEmpty) - assert(desc.schema == - CatalogColumn("col1", null, nullable = true, None) :: - CatalogColumn("col3", null, nullable = true, None) :: Nil) - assert(desc.viewText == Option("SELECT * FROM tab1")) - assert(desc.viewOriginalText == Option("SELECT * FROM tab1")) - assert(desc.storage.serdeProperties == Map()) - assert(desc.storage.inputFormat.isEmpty) - assert(desc.storage.outputFormat.isEmpty) - assert(desc.storage.serde.isEmpty) - assert(desc.properties == Map("prop1Key" -> "prop1Val")) - assert(desc.comment == Option("BLABLA")) + val command = parser.parsePlan(v1).asInstanceOf[CreateViewCommand] + assert(command.name.database.isEmpty) + assert(command.name.table == "view1") + assert(command.userSpecifiedColumns == Seq("col1" -> None, "col3" -> Some("hello"))) + assert(command.originalText == Some("SELECT * FROM tab1")) + assert(command.properties == Map("prop1Key" -> "prop1Val")) + assert(command.comment == Some("BLABLA")) } test("create view -- partitioned view") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index 5714d06f0fe7..3414f5e0409a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.test.SQLTestUtils class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-16337 temporary view refresh") { - withTempTable("view_refresh") { + withTempView("view_refresh") { withTable("view_table") { // Create a Parquet directory spark.range(start = 0, end = 100, step = 1, numPartitions = 3) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala index 33252ad07add..09c15473b21c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala @@ -52,7 +52,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton withTempPath { dir => sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath) spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("p") - withTempTable("p") { + withTempView("p") { checkAnswer( sql("SELECT * FROM src ORDER BY key"), sql("SELECT * from p ORDER BY key").collect().toSeq) @@ -66,7 +66,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton withTempPath { file => sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath) spark.read.parquet(file.getCanonicalPath).createOrReplaceTempView("p") - withTempTable("p") { + withTempView("p") { // let's do three overwrites for good measure sql("INSERT OVERWRITE TABLE p SELECT * FROM t") sql("INSERT OVERWRITE TABLE p SELECT * FROM t") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 12d250d4fb60..22f8c0f19c6a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -79,7 +79,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv |) """.stripMargin) - withTempTable("expectedJsonTable") { + withTempView("expectedJsonTable") { read.json(jsonFilePath).createOrReplaceTempView("expectedJsonTable") checkAnswer( sql("SELECT a, b, `c_!@(3)`, ``.`d!`, ``.`=` FROM jsonTable"), @@ -109,7 +109,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv assert(expectedSchema === table("jsonTable").schema) - withTempTable("expectedJsonTable") { + withTempView("expectedJsonTable") { read.json(jsonFilePath).createOrReplaceTempView("expectedJsonTable") checkAnswer( sql("SELECT b, ``.`=` FROM jsonTable"), @@ -247,7 +247,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv |) """.stripMargin) - withTempTable("expectedJsonTable") { + withTempView("expectedJsonTable") { read.json(jsonFilePath).createOrReplaceTempView("expectedJsonTable") checkAnswer( @@ -553,7 +553,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv test("scan a parquet table created through a CTAS statement") { withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "true") { - withTempTable("jt") { + withTempView("jt") { (1 to 10).map(i => i -> s"str$i").toDF("a", "b").createOrReplaceTempView("jt") withTable("test_parquet_ctas") { @@ -733,7 +733,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv outputFormat = None, serde = None, compressed = false, - serdeProperties = Map( + properties = Map( "path" -> sessionState.catalog.hiveDefaultTableFilePath(TableIdentifier(tableName))) ), properties = Map( @@ -1171,8 +1171,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv checkAnswer(table("t"), Seq(Row(1, 2, 3), Row(2, 3, 4))) val catalogTable = sharedState.externalCatalog.getTable("default", "t") // there should not be a lowercase key 'path' now - assert(catalogTable.storage.serdeProperties.get("path").isEmpty) - assert(catalogTable.storage.serdeProperties.get("PATH").isDefined) + assert(catalogTable.storage.properties.get("path").isEmpty) + assert(catalogTable.storage.properties.get("PATH").isDefined) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala index 83f1b192f7c9..7ba880e47613 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala @@ -29,7 +29,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingle val expectedPath = spark.sharedState.externalCatalog.getDatabase(dbName).locationUri + "/" + tableName - assert(metastoreTable.storage.serdeProperties("path") === expectedPath) + assert(metastoreTable.storage.properties("path") === expectedPath) } private def getTableNames(dbName: Option[String] = None): Array[String] = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala index ac89bbbf8e19..14266e68478d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.hive import java.sql.Timestamp -import org.apache.hadoop.hive.conf.HiveConf - import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.parquet.ParquetCompatibilityTest import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -54,7 +52,7 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi // Don't convert Hive metastore Parquet tables to let Hive write those Parquet files. withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false") { - withTempTable("data") { + withTempView("data") { val fields = hiveTypes.zipWithIndex.map { case (typ, index) => s" col_$index $typ" } val ddl = @@ -137,4 +135,10 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi Row(Row(1, Seq("foo", "bar", null))), "STRUCT>") } + + test("SPARK-16344: array of struct with a single field named 'array_element'") { + testParquetHiveCompatibility( + Row(Seq(Row(1))), + "ARRAY>") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index a972f61e2585..066c3ffabafc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -153,7 +153,7 @@ class VersionsSuite extends SparkFunSuite with Logging { outputFormat = Some(classOf[HiveIgnoreKeyTextOutputFormat[_, _]].getName), serde = Some(classOf[LazySimpleSerDe].getName()), compressed = false, - serdeProperties = Map.empty + properties = Map.empty )) } @@ -275,7 +275,7 @@ class VersionsSuite extends SparkFunSuite with Logging { outputFormat = None, serde = None, compressed = false, - serdeProperties = Map.empty) + properties = Map.empty) test(s"$version: sql create partitioned table") { client.runSqlHive("CREATE TABLE src_part (value INT) PARTITIONED BY (key1 INT, key2 INT)") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index a16fe3228b1f..2dcf13c02a46 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -923,7 +923,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te } test("udaf without specifying inputSchema") { - withTempTable("noInputSchemaUDAF") { + withTempView("noInputSchemaUDAF") { spark.udf.register("noInputSchema", new ScalaAggregateFunctionWithoutInputSchema) val data = diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index 741abcb7513c..5d510197c4d9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -139,7 +139,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("show tblproperties for spark temporary table - empty row") { - withTempTable("parquet_temp") { + withTempView("parquet_temp") { sql( """ |CREATE TEMPORARY TABLE parquet_temp (c1 INT, c2 STRING) @@ -397,7 +397,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("show partitions - empty row") { - withTempTable("parquet_temp") { + withTempView("parquet_temp") { sql( """ |CREATE TEMPORARY TABLE parquet_temp (c1 INT, c2 STRING) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 343d7bae98bf..fb5c9948a5d3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -356,7 +356,7 @@ class HiveDDLSuite expectedSerdeProps.map { case (k, v) => s"'$k'='$v'" }.mkString(", ") val oldPart = catalog.getPartition(TableIdentifier("boxes"), Map("width" -> "4")) assume(oldPart.storage.serde != Some(expectedSerde), "bad test: serde was already set") - assume(oldPart.storage.serdeProperties.filterKeys(expectedSerdeProps.contains) != + assume(oldPart.storage.properties.filterKeys(expectedSerdeProps.contains) != expectedSerdeProps, "bad test: serde properties were already set") sql(s"""ALTER TABLE boxes PARTITION (width=4) | SET SERDE '$expectedSerde' @@ -364,7 +364,7 @@ class HiveDDLSuite |""".stripMargin) val newPart = catalog.getPartition(TableIdentifier("boxes"), Map("width" -> "4")) assert(newPart.storage.serde == Some(expectedSerde)) - assume(newPart.storage.serdeProperties.filterKeys(expectedSerdeProps.contains) == + assume(newPart.storage.properties.filterKeys(expectedSerdeProps.contains) == expectedSerdeProps) } @@ -612,15 +612,17 @@ class HiveDDLSuite } test("desc table for data source table - no user-defined schema") { - withTable("t1") { - withTempPath { dir => - val path = dir.getCanonicalPath - spark.range(1).write.parquet(path) - sql(s"CREATE TABLE t1 USING parquet OPTIONS (PATH '$path')") + Seq("parquet", "json", "orc").foreach { fileFormat => + withTable("t1") { + withTempPath { dir => + val path = dir.getCanonicalPath + spark.range(1).write.format(fileFormat).save(path) + sql(s"CREATE TABLE t1 USING $fileFormat OPTIONS (PATH '$path')") - val desc = sql("DESC FORMATTED t1").collect().toSeq + val desc = sql("DESC FORMATTED t1").collect().toSeq - assert(desc.contains(Row("# Schema of this table is inferred at runtime", "", ""))) + assert(desc.contains(Row("id", "bigint", ""))) + } } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index a43eed9a2a4f..98afd99a203a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -78,7 +78,7 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("SPARK-6212: The EXPLAIN output of CTAS only shows the analyzed plan") { - withTempTable("jt") { + withTempView("jt") { val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""")) spark.read.json(rdd).createOrReplaceTempView("jt") val outputs = sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 76d3f3dbab01..5b464764f0a9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -102,7 +102,7 @@ class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestH test("Verify SQLConf HIVE_METASTORE_PARTITION_PRUNING") { val view = "src" - withTempTable(view) { + withTempView(view) { spark.range(1, 5).createOrReplaceTempView(view) val table = "table_with_partition" withTable(table) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index def4601cf615..f690035c845f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -358,7 +358,7 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { } test("Hive UDF in group by") { - withTempTable("tab1") { + withTempView("tab1") { Seq(Tuple1(1451400761)).toDF("test_date").createOrReplaceTempView("tab1") sql(s"CREATE TEMPORARY FUNCTION testUDFToDate AS '${classOf[GenericUDFToDate].getName}'") val count = sql("select testUDFToDate(cast(test_date as timestamp))" + diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 961d95c268b2..cba6aa53f17e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -19,6 +19,9 @@ package org.apache.spark.sql.hive.execution import java.sql.{Date, Timestamp} +import scala.sys.process.Process +import scala.util.Try + import org.apache.hadoop.fs.Path import org.apache.spark.sql._ @@ -63,6 +66,20 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import hiveContext._ import spark.implicits._ + test("script") { + if (testCommandAvailable("bash") && testCommandAvailable("echo | sed")) { + val df = Seq(("x1", "y1", "z1"), ("x2", "y2", "z2")).toDF("c1", "c2", "c3") + df.createOrReplaceTempView("script_table") + val query1 = sql( + """ + |SELECT col1 FROM (from(SELECT c1, c2, c3 FROM script_table) tempt_table + |REDUCE c1, c2, c3 USING 'bash src/test/resources/test_script.sh' AS + |(col1 STRING, col2 STRING)) script_test_table""".stripMargin) + checkAnswer(query1, Row("x1_y1") :: Row("x2_y2") :: Nil) + } + // else skip this test + } + test("UDTF") { withUserDefinedFunction("udtf_count2" -> true) { sql(s"ADD JAR ${hiveContext.getHiveFile("TestUDTF.jar").getCanonicalPath()}") @@ -111,7 +128,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-13651: generator outputs shouldn't be resolved from its child's output") { - withTempTable("src") { + withTempView("src") { Seq(("id1", "value1")).toDF("key", "value").createOrReplaceTempView("src") val query = sql("SELECT genoutput.* FROM src " + @@ -941,7 +958,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("Sorting columns are not in Generate") { - withTempTable("data") { + withTempView("data") { spark.range(1, 5) .select(array($"id", $"id" + 1).as("a"), $"id".as("b"), (lit(10) - $"id").as("c")) .createOrReplaceTempView("data") @@ -1218,7 +1235,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-10741: Sort on Aggregate using parquet") { withTable("test10741") { - withTempTable("src") { + withTempView("src") { Seq("a" -> 5, "a" -> 9, "b" -> 6).toDF("c1", "c2").createOrReplaceTempView("src") sql("CREATE TABLE test10741 STORED AS PARQUET AS SELECT * FROM src") } @@ -1472,7 +1489,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("multi-insert with lateral view") { - withTempTable("t1") { + withTempView("t1") { spark.range(10) .select(array($"id", $"id" + 1).as("arr"), $"id") .createOrReplaceTempView("source") @@ -1755,4 +1772,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + def testCommandAvailable(command: String): Boolean = { + Try(Process(command) !!).isSuccess + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala index c6b7eb63662c..0ff3511c87a4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala @@ -247,4 +247,16 @@ class WindowQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleto |from part """.stripMargin)) } + + test("SPARK-16646: LAST_VALUE(FALSE) OVER ()") { + checkAnswer(sql("SELECT LAST_VALUE(FALSE) OVER ()"), Row(false)) + checkAnswer(sql("SELECT LAST_VALUE(FALSE, FALSE) OVER ()"), Row(false)) + checkAnswer(sql("SELECT LAST_VALUE(TRUE, TRUE) OVER ()"), Row(true)) + } + + test("SPARK-16646: FIRST_VALUE(FALSE) OVER ()") { + checkAnswer(sql("SELECT FIRST_VALUE(FALSE) OVER ()"), Row(false)) + checkAnswer(sql("SELECT FIRST_VALUE(FALSE, FALSE) OVER ()"), Row(false)) + checkAnswer(sql("SELECT FIRST_VALUE(TRUE, TRUE) OVER ()"), Row(true)) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index b9e98fc85f78..49e963ee1294 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -93,7 +93,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { test("Creating case class RDD table") { val data = (1 to 100).map(i => (i, s"val_$i")) sparkContext.parallelize(data).toDF().createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer(sql("SELECT * FROM t"), data.toDF().collect()) } } @@ -310,7 +310,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { val path = dir.getCanonicalPath withTable("empty_orc") { - withTempTable("empty", "single") { + withTempView("empty", "single") { spark.sql( s"""CREATE TABLE empty_orc(key INT, value STRING) |STORED AS ORC @@ -402,7 +402,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } test("Verify the ORC conversion parameter: CONVERT_METASTORE_ORC") { - withTempTable("single") { + withTempView("single") { val singleRowDF = Seq((0, "foo")).toDF("key", "value") singleRowDF.createOrReplaceTempView("single") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala index 2a647115b7e0..7226ed521ef3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala @@ -62,7 +62,7 @@ private[sql] trait OrcTest extends SQLTestUtils with TestHiveSingleton { (f: => Unit): Unit = { withOrcDataFrame(data) { df => df.createOrReplaceTempView(tableName) - withTempTable(tableName)(f) + withTempView(tableName)(f) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index 96beb2d3427b..31b6197d56fc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -702,7 +702,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { } test("Verify the PARQUET conversion parameter: CONVERT_METASTORE_PARQUET") { - withTempTable("single") { + withTempView("single") { val singleRowDF = Seq((0, "foo")).toDF("key", "value") singleRowDF.createOrReplaceTempView("single") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index fc01ff3f5aa0..e46149031091 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -20,10 +20,11 @@ package org.apache.spark.sql.sources import java.io.File import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.DataSourceScanExec -import org.apache.spark.sql.execution.datasources.{BucketSpec, DataSourceStrategy} +import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.exchange.ShuffleExchange import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.functions._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala index 62998572eaf4..047b08c4ccf6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala @@ -92,7 +92,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes // Self-join df.createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { checkAnswer( sql( """SELECT l.a, r.b, l.p1, r.p2 @@ -339,7 +339,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes test("saveAsTable()/load() - non-partitioned table - ErrorIfExists") { Seq.empty[(Int, String)].toDF().createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { intercept[AnalysisException] { testDF.write.format(dataSourceName).mode(SaveMode.ErrorIfExists).saveAsTable("t") } @@ -349,7 +349,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes test("saveAsTable()/load() - non-partitioned table - Ignore") { Seq.empty[(Int, String)].toDF().createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { testDF.write.format(dataSourceName).mode(SaveMode.Ignore).saveAsTable("t") assert(spark.table("t").collect().isEmpty) } @@ -461,7 +461,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes test("saveAsTable()/load() - partitioned table - ErrorIfExists") { Seq.empty[(Int, String)].toDF().createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { intercept[AnalysisException] { partitionedTestDF.write .format(dataSourceName) @@ -476,7 +476,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes test("saveAsTable()/load() - partitioned table - Ignore") { Seq.empty[(Int, String)].toDF().createOrReplaceTempView("t") - withTempTable("t") { + withTempView("t") { partitionedTestDF.write .format(dataSourceName) .mode(SaveMode.Ignore) @@ -722,7 +722,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes 'p3.cast(FloatType).as('pf1), 'f) - withTempTable("t") { + withTempView("t") { input .write .format(dataSourceName) diff --git a/streaming/pom.xml b/streaming/pom.xml index 31a5fe6ab5e1..07a0dab0ee04 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -25,7 +25,6 @@ ../pom.xml - org.apache.spark spark-streaming_2.11 streaming diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 0b1102686319..398fa6500f09 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -18,8 +18,8 @@ package org.apache.spark.streaming import java.io._ -import java.util.concurrent.Executors -import java.util.concurrent.RejectedExecutionException +import java.util.concurrent.{ArrayBlockingQueue, RejectedExecutionException, + ThreadPoolExecutor, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} @@ -184,7 +184,14 @@ class CheckpointWriter( hadoopConf: Configuration ) extends Logging { val MAX_ATTEMPTS = 3 - val executor = Executors.newFixedThreadPool(1) + + // Single-thread executor which rejects executions when a large amount have queued up. + // This fails fast since this typically means the checkpoint store will never keep up, and + // will otherwise lead to filling memory with waiting payloads of byte[] to write. + val executor = new ThreadPoolExecutor( + 1, 1, + 0L, TimeUnit.MILLISECONDS, + new ArrayBlockingQueue[Runnable](1000)) val compressionCodec = CompressionCodec.createCodec(conf) private var stopped = false @volatile private[this] var fs: FileSystem = null diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 6046426fdf8c..4808d0fcbc6c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -592,6 +592,7 @@ class StreamingContext private[streaming] ( } StreamingContext.setActiveContext(this) } + logDebug("Adding shutdown hook") // force eager creation of logger shutdownHookRef = ShutdownHookManager.addShutdownHook( StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown) // Registering Streaming Metrics at the start of the StreamingContext diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 19c88f1ee011..10d64f98ac71 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -287,12 +287,14 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { markBatchFullyProcessed(time) } - /** Perform checkpoint for the give `time`. */ + /** Perform checkpoint for the given `time`. */ private def doCheckpoint(time: Time, clearCheckpointDataLater: Boolean) { if (shouldCheckpoint && (time - graph.zeroTime).isMultipleOf(ssc.checkpointDuration)) { logInfo("Checkpointing graph for time " + time) ssc.graph.updateCheckpointData(time) checkpointWriter.write(new Checkpoint(ssc, time), clearCheckpointDataLater) + } else if (clearCheckpointDataLater) { + markBatchFullyProcessed(time) } } diff --git a/tools/pom.xml b/tools/pom.xml index 86a3ba8086d9..b9be8db684a9 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -24,7 +24,6 @@ ../pom.xml - org.apache.spark spark-tools_2.11 tools diff --git a/yarn/pom.xml b/yarn/pom.xml index f4a85cd2f3d3..64ff845b5ae9 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -24,7 +24,6 @@ ../pom.xml - org.apache.spark spark-yarn_2.11 jar Spark Project YARN diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 874e3045b405..1ccd7e5993f5 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -97,7 +97,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { } test("run Spark in yarn-cluster mode with different configurations") { - testBasicYarnApp(true, + testBasicYarnApp(false, Map( "spark.driver.memory" -> "512m", "spark.driver.cores" -> "1", diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index 5458fb9d2e75..e123e7854104 100644 --- a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -16,13 +16,17 @@ */ package org.apache.spark.network.yarn -import java.io.{DataOutputStream, File, FileOutputStream} +import java.io.{DataOutputStream, File, FileOutputStream, IOException} +import java.nio.file.Files +import java.nio.file.attribute.PosixFilePermission._ +import java.util.EnumSet import scala.annotation.tailrec import scala.concurrent.duration._ import scala.language.postfixOps import org.apache.hadoop.fs.Path +import org.apache.hadoop.service.ServiceStateException import org.apache.hadoop.yarn.api.records.ApplicationId import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.server.api.{ApplicationInitializationContext, ApplicationTerminationContext} @@ -45,7 +49,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd classOf[YarnShuffleService].getCanonicalName) yarnConfig.setInt("spark.shuffle.service.port", 0) val localDir = Utils.createTempDir() - yarnConfig.set("yarn.nodemanager.local-dirs", localDir.getAbsolutePath) + yarnConfig.set(YarnConfiguration.NM_LOCAL_DIRS, localDir.getAbsolutePath) } var s1: YarnShuffleService = null @@ -316,4 +320,28 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd s2.stop() } - } + + test("service throws error if cannot start") { + // Create a different config with a read-only local dir. + val roConfig = new YarnConfiguration(yarnConfig) + val roDir = Utils.createTempDir() + Files.setPosixFilePermissions(roDir.toPath(), EnumSet.of(OWNER_READ, OWNER_EXECUTE)) + roConfig.set(YarnConfiguration.NM_LOCAL_DIRS, roDir.getAbsolutePath()) + roConfig.setBoolean(YarnShuffleService.STOP_ON_FAILURE_KEY, true) + + // Try to start the shuffle service, it should fail. + val service = new YarnShuffleService() + + try { + val error = intercept[ServiceStateException] { + service.init(roConfig) + } + assert(error.getCause().isInstanceOf[IOException]) + } finally { + service.stop() + Files.setPosixFilePermissions(roDir.toPath(), + EnumSet.of(OWNER_READ, OWNER_WRITE, OWNER_EXECUTE)) + } + } + +}