Commit 692ff63

Merge branch 'master' into SPARK-25867
2 parents 2b18e55 + 15c0384 commit 692ff63

72 files changed: +931 additions, -430 deletions

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 1 addition & 1 deletion
@@ -1674,7 +1674,7 @@ test_that("column functions", {
 
   # check for unparseable
   df <- as.DataFrame(list(list("a" = "")))
-  expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
+  expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]]$a, NA)
 
   # check if array type in string is correctly supported.
   jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
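
The test change above reflects the new `PERMISSIVE` parsing behaviour of `from_json`: unparseable input now yields a row of null fields rather than a null row. Below is a minimal Scala sketch of the same behaviour; the object name, local master, and column alias are illustrative assumptions and not part of this commit.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{IntegerType, StructType}

// Illustrative sketch only: names and the local SparkSession are assumptions.
object FromJsonPermissiveSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("from_json-sketch").getOrCreate()
    import spark.implicits._

    val schema = new StructType().add("a", IntegerType)
    val df = Seq("").toDF("json") // unparseable input, mirroring the R test above

    // Since Spark 3.0 (PERMISSIVE mode by default), the parsed column is a struct
    // whose fields are null, instead of a null struct as in earlier versions.
    df.select(from_json($"json", schema).as("parsed")).show()

    spark.stop()
  }
}
```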

assembly/README

Lines changed: 1 addition & 1 deletion
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command
 
 If you need to build an assembly for a different version of Hadoop the
 hadoop-version system property needs to be set as in this example:
-  -Dhadoop.version=2.7.3
+  -Dhadoop.version=2.7.4

bin/docker-image-tool.sh

Lines changed: 38 additions & 25 deletions
@@ -41,6 +41,18 @@ function image_ref {
   echo "$image"
 }
 
+function docker_push {
+  local image_name="$1"
+  if [ ! -z $(docker images -q "$(image_ref ${image_name})") ]; then
+    docker push "$(image_ref ${image_name})"
+    if [ $? -ne 0 ]; then
+      error "Failed to push $image_name Docker image."
+    fi
+  else
+    echo "$(image_ref ${image_name}) image not found. Skipping push for this image."
+  fi
+}
+
 function build {
   local BUILD_ARGS
   local IMG_PATH
@@ -92,8 +104,8 @@ function build {
     base_img=$(image_ref spark)
   )
   local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
-  local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"}
-  local RDOCKERFILE=${RDOCKERFILE:-"$IMG_PATH/spark/bindings/R/Dockerfile"}
+  local PYDOCKERFILE=${PYDOCKERFILE:-false}
+  local RDOCKERFILE=${RDOCKERFILE:-false}
 
   docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
     -t $(image_ref spark) \
@@ -102,33 +114,29 @@ function build {
     error "Failed to build Spark JVM Docker image, please refer to Docker build output for details."
   fi
 
-  docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
-    -t $(image_ref spark-py) \
-    -f "$PYDOCKERFILE" .
+  if [ "${PYDOCKERFILE}" != "false" ]; then
+    docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
+      -t $(image_ref spark-py) \
+      -f "$PYDOCKERFILE" .
+    if [ $? -ne 0 ]; then
+      error "Failed to build PySpark Docker image, please refer to Docker build output for details."
+    fi
+  fi
+
+  if [ "${RDOCKERFILE}" != "false" ]; then
+    docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
+      -t $(image_ref spark-r) \
+      -f "$RDOCKERFILE" .
     if [ $? -ne 0 ]; then
-      error "Failed to build PySpark Docker image, please refer to Docker build output for details."
+      error "Failed to build SparkR Docker image, please refer to Docker build output for details."
     fi
-  docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
-    -t $(image_ref spark-r) \
-    -f "$RDOCKERFILE" .
-  if [ $? -ne 0 ]; then
-    error "Failed to build SparkR Docker image, please refer to Docker build output for details."
   fi
 }
 
 function push {
-  docker push "$(image_ref spark)"
-  if [ $? -ne 0 ]; then
-    error "Failed to push Spark JVM Docker image."
-  fi
-  docker push "$(image_ref spark-py)"
-  if [ $? -ne 0 ]; then
-    error "Failed to push PySpark Docker image."
-  fi
-  docker push "$(image_ref spark-r)"
-  if [ $? -ne 0 ]; then
-    error "Failed to push SparkR Docker image."
-  fi
+  docker_push "spark"
+  docker_push "spark-py"
+  docker_push "spark-r"
 }
 
 function usage {
@@ -143,8 +151,10 @@ Commands:
 
 Options:
   -f file               Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark.
-  -p file               Dockerfile to build for PySpark Jobs. Builds Python dependencies and ships with Spark.
-  -R file               Dockerfile to build for SparkR Jobs. Builds R dependencies and ships with Spark.
+  -p file               (Optional) Dockerfile to build for PySpark Jobs. Builds Python dependencies and ships with Spark.
+                        Skips building PySpark docker image if not specified.
+  -R file               (Optional) Dockerfile to build for SparkR Jobs. Builds R dependencies and ships with Spark.
+                        Skips building SparkR docker image if not specified.
   -r repo               Repository address.
   -t tag                Tag to apply to the built image, or to identify the image to be pushed.
   -m                    Use minikube's Docker daemon.
@@ -164,6 +174,9 @@ Examples:
   - Build image in minikube with tag "testing"
     $0 -m -t testing build
 
+  - Build PySpark docker image
+    $0 -r docker.io/myrepo -t v2.3.0 -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile build
+
   - Build and push image with tag "v2.3.0" to docker.io/myrepo
     $0 -r docker.io/myrepo -t v2.3.0 build
    $0 -r docker.io/myrepo -t v2.3.0 push

core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala

Lines changed: 4 additions & 2 deletions
@@ -843,7 +843,7 @@ private[ui] class TaskPagedTable(
         </div>
       </td>
       <td>{UIUtils.formatDate(task.launchTime)}</td>
-      <td>{formatDuration(task.duration)}</td>
+      <td>{formatDuration(task.taskMetrics.map(_.executorRunTime))}</td>
       <td class={TaskDetailsClassNames.SCHEDULER_DELAY}>
         {UIUtils.formatDuration(AppStatusUtils.schedulerDelay(task))}
       </td>
@@ -996,7 +996,9 @@ private[ui] object ApiHelper {
     HEADER_EXECUTOR -> TaskIndexNames.EXECUTOR,
     HEADER_HOST -> TaskIndexNames.HOST,
     HEADER_LAUNCH_TIME -> TaskIndexNames.LAUNCH_TIME,
-    HEADER_DURATION -> TaskIndexNames.DURATION,
+    // SPARK-26109: Duration of task as executorRunTime to make it consistent with the
+    // aggregated tasks summary metrics table and the previous versions of Spark.
+    HEADER_DURATION -> TaskIndexNames.EXEC_RUN_TIME,
     HEADER_SCHEDULER_DELAY -> TaskIndexNames.SCHEDULER_DELAY,
     HEADER_DESER_TIME -> TaskIndexNames.DESER_TIME,
     HEADER_GC_TIME -> TaskIndexNames.GC_TIME,

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 0 additions & 48 deletions
@@ -31,7 +31,6 @@ import java.security.SecureRandom
 import java.util.{Locale, Properties, Random, UUID}
 import java.util.concurrent._
 import java.util.concurrent.TimeUnit.NANOSECONDS
-import java.util.concurrent.atomic.AtomicBoolean
 import java.util.zip.GZIPInputStream
 
 import scala.annotation.tailrec
@@ -93,53 +92,6 @@ private[spark] object Utils extends Logging {
   private val MAX_DIR_CREATION_ATTEMPTS: Int = 10
   @volatile private var localRootDirs: Array[String] = null
 
-  /**
-   * The performance overhead of creating and logging strings for wide schemas can be large. To
-   * limit the impact, we bound the number of fields to include by default. This can be overridden
-   * by setting the 'spark.debug.maxToStringFields' conf in SparkEnv.
-   */
-  val DEFAULT_MAX_TO_STRING_FIELDS = 25
-
-  private[spark] def maxNumToStringFields = {
-    if (SparkEnv.get != null) {
-      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
-    } else {
-      DEFAULT_MAX_TO_STRING_FIELDS
-    }
-  }
-
-  /** Whether we have warned about plan string truncation yet. */
-  private val truncationWarningPrinted = new AtomicBoolean(false)
-
-  /**
-   * Format a sequence with semantics similar to calling .mkString(). Any elements beyond
-   * maxNumToStringFields will be dropped and replaced by a "... N more fields" placeholder.
-   *
-   * @return the trimmed and formatted string.
-   */
-  def truncatedString[T](
-      seq: Seq[T],
-      start: String,
-      sep: String,
-      end: String,
-      maxNumFields: Int = maxNumToStringFields): String = {
-    if (seq.length > maxNumFields) {
-      if (truncationWarningPrinted.compareAndSet(false, true)) {
-        logWarning(
-          "Truncated the string representation of a plan since it was too large. This " +
-          "behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.")
-      }
-      val numFields = math.max(0, maxNumFields - 1)
-      seq.take(numFields).mkString(
-        start, sep, sep + "... " + (seq.length - numFields) + " more fields" + end)
-    } else {
-      seq.mkString(start, sep, end)
-    }
-  }
-
-  /** Shorthand for calling truncatedString() without start or end strings. */
-  def truncatedString[T](seq: Seq[T], sep: String): String = truncatedString(seq, "", sep, "")
-
   /** Serialize an object using Java serialization */
   def serialize[T](o: T): Array[Byte] = {
     val bos = new ByteArrayOutputStream()

core/src/test/scala/org/apache/spark/util/UtilsSuite.scala

Lines changed: 0 additions & 8 deletions
@@ -45,14 +45,6 @@ import org.apache.spark.scheduler.SparkListener
 
 class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
 
-  test("truncatedString") {
-    assert(Utils.truncatedString(Nil, "[", ", ", "]", 2) == "[]")
-    assert(Utils.truncatedString(Seq(1, 2), "[", ", ", "]", 2) == "[1, 2]")
-    assert(Utils.truncatedString(Seq(1, 2, 3), "[", ", ", "]", 2) == "[1, ... 2 more fields]")
-    assert(Utils.truncatedString(Seq(1, 2, 3), "[", ", ", "]", -5) == "[, ... 3 more fields]")
-    assert(Utils.truncatedString(Seq(1, 2, 3), ", ") == "1, 2, 3")
-  }
-
   test("timeConversion") {
     // Test -1
     assert(Utils.timeStringAsSeconds("-1") === -1)

dev/deps/spark-deps-hadoop-2.7

Lines changed: 16 additions & 15 deletions
@@ -64,21 +64,21 @@ gson-2.2.4.jar
 guava-14.0.1.jar
 guice-3.0.jar
 guice-servlet-3.0.jar
-hadoop-annotations-2.7.3.jar
-hadoop-auth-2.7.3.jar
-hadoop-client-2.7.3.jar
-hadoop-common-2.7.3.jar
-hadoop-hdfs-2.7.3.jar
-hadoop-mapreduce-client-app-2.7.3.jar
-hadoop-mapreduce-client-common-2.7.3.jar
-hadoop-mapreduce-client-core-2.7.3.jar
-hadoop-mapreduce-client-jobclient-2.7.3.jar
-hadoop-mapreduce-client-shuffle-2.7.3.jar
-hadoop-yarn-api-2.7.3.jar
-hadoop-yarn-client-2.7.3.jar
-hadoop-yarn-common-2.7.3.jar
-hadoop-yarn-server-common-2.7.3.jar
-hadoop-yarn-server-web-proxy-2.7.3.jar
+hadoop-annotations-2.7.4.jar
+hadoop-auth-2.7.4.jar
+hadoop-client-2.7.4.jar
+hadoop-common-2.7.4.jar
+hadoop-hdfs-2.7.4.jar
+hadoop-mapreduce-client-app-2.7.4.jar
+hadoop-mapreduce-client-common-2.7.4.jar
+hadoop-mapreduce-client-core-2.7.4.jar
+hadoop-mapreduce-client-jobclient-2.7.4.jar
+hadoop-mapreduce-client-shuffle-2.7.4.jar
+hadoop-yarn-api-2.7.4.jar
+hadoop-yarn-client-2.7.4.jar
+hadoop-yarn-common-2.7.4.jar
+hadoop-yarn-server-common-2.7.4.jar
+hadoop-yarn-server-web-proxy-2.7.4.jar
 hk2-api-2.4.0-b34.jar
 hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
@@ -117,6 +117,7 @@ jersey-guava-2.22.2.jar
 jersey-media-jaxb-2.22.2.jar
 jersey-server-2.22.2.jar
 jetty-6.1.26.jar
+jetty-sslengine-6.1.26.jar
 jetty-util-6.1.26.jar
 jline-2.14.6.jar
 joda-time-2.9.3.jar

docs/running-on-kubernetes.md

Lines changed: 12 additions & 0 deletions
@@ -88,6 +88,18 @@ $ ./bin/docker-image-tool.sh -r <repo> -t my-tag build
 $ ./bin/docker-image-tool.sh -r <repo> -t my-tag push
 ```
 
+By default `bin/docker-image-tool.sh` builds docker image for running JVM jobs. You need to opt-in to build additional
+language binding docker images.
+
+Example usage is
+```bash
+# To build additional PySpark docker image
+$ ./bin/docker-image-tool.sh -r <repo> -t my-tag -p ./kubernetes/dockerfiles/spark/bindings/python/Dockerfile build
+
+# To build additional SparkR docker image
+$ ./bin/docker-image-tool.sh -r <repo> -t my-tag -R ./kubernetes/dockerfiles/spark/bindings/R/Dockerfile build
+```
+
 ## Cluster Mode
 
 To launch Spark Pi in cluster mode,

docs/sql-migration-guide-upgrade.md

Lines changed: 4 additions & 0 deletions
@@ -15,9 +15,13 @@ displayTitle: Spark SQL Upgrading Guide
 
   - Since Spark 3.0, the `from_json` functions supports two modes - `PERMISSIVE` and `FAILFAST`. The modes can be set via the `mode` option. The default mode became `PERMISSIVE`. In previous versions, behavior of `from_json` did not conform to either `PERMISSIVE` nor `FAILFAST`, especially in processing of malformed JSON records. For example, the JSON string `{"a" 1}` with the schema `a INT` is converted to `null` by previous versions but Spark 3.0 converts it to `Row(null)`.
 
+  - In Spark version 2.4 and earlier, the `from_json` function produces `null`s for JSON strings and JSON datasource skips the same independetly of its mode if there is no valid root JSON token in its input (` ` for example). Since Spark 3.0, such input is treated as a bad record and handled according to specified mode. For example, in the `PERMISSIVE` mode the ` ` input is converted to `Row(null, null)` if specified schema is `key STRING, value INT`.
+
   - The `ADD JAR` command previously returned a result set with the single value 0. It now returns an empty result set.
 
   - In Spark version 2.4 and earlier, users can create map values with map type key via built-in function like `CreateMap`, `MapFromArrays`, etc. Since Spark 3.0, it's not allowed to create map values with map type key with these built-in functions. Users can still read map values with map type key from data source or Java/Scala collections, though they are not very useful.
+
+  - In Spark version 2.4 and earlier, `Dataset.groupByKey` results to a grouped dataset with key attribute wrongly named as "value", if the key is non-struct type, e.g. int, string, array, etc. This is counterintuitive and makes the schema of aggregation queries weird. For example, the schema of `ds.groupByKey(...).count()` is `(value, count)`. Since Spark 3.0, we name the grouping attribute to "key". The old behaviour is preserved under a newly added configuration `spark.sql.legacy.dataset.nameNonStructGroupingKeyAsValue` with a default value of `false`.
 
 ## Upgrading From Spark SQL 2.3 to 2.4
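
The `Dataset.groupByKey` note in the migration guide above is easiest to see with a tiny aggregation. The following Scala sketch is illustrative only; the object name, app name, and local master are assumptions, not part of this commit.

```scala
import org.apache.spark.sql.SparkSession

// Illustrative sketch only: names and the local SparkSession are assumptions.
object GroupByKeyNamingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("groupByKey-naming").getOrCreate()
    import spark.implicits._

    val ds = Seq("a", "b", "a").toDS()
    val counts = ds.groupByKey(x => x).count()

    // Spark 2.4 and earlier name the grouping column "value"; since Spark 3.0 it
    // is named "key". The legacy name can be restored by setting
    // spark.sql.legacy.dataset.nameNonStructGroupingKeyAsValue=true.
    counts.printSchema()

    spark.stop()
  }
}
```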

mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala

Lines changed: 9 additions & 9 deletions
@@ -55,27 +55,27 @@ class DecisionTreeClassifier @Since("1.4.0") (
 
   /** @group setParam */
   @Since("1.4.0")
-  override def setMaxDepth(value: Int): this.type = set(maxDepth, value)
+  def setMaxDepth(value: Int): this.type = set(maxDepth, value)
 
   /** @group setParam */
   @Since("1.4.0")
-  override def setMaxBins(value: Int): this.type = set(maxBins, value)
+  def setMaxBins(value: Int): this.type = set(maxBins, value)
 
   /** @group setParam */
   @Since("1.4.0")
-  override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value)
+  def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value)
 
   /** @group setParam */
   @Since("1.4.0")
-  override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value)
+  def setMinInfoGain(value: Double): this.type = set(minInfoGain, value)
 
   /** @group expertSetParam */
   @Since("1.4.0")
-  override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value)
+  def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value)
 
   /** @group expertSetParam */
   @Since("1.4.0")
-  override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value)
+  def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value)
 
   /**
    * Specifies how often to checkpoint the cached node IDs.
@@ -87,15 +87,15 @@ class DecisionTreeClassifier @Since("1.4.0") (
    * @group setParam
    */
   @Since("1.4.0")
-  override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)
+  def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)
 
   /** @group setParam */
   @Since("1.4.0")
-  override def setImpurity(value: String): this.type = set(impurity, value)
+  def setImpurity(value: String): this.type = set(impurity, value)
 
   /** @group setParam */
   @Since("1.6.0")
-  override def setSeed(value: Long): this.type = set(seed, value)
+  def setSeed(value: Long): this.type = set(seed, value)
 
   override protected def train(
       dataset: Dataset[_]): DecisionTreeClassificationModel = instrumented { instr =>
