Skip to content

Commit 673e088

Browse files
Merge remote-tracking branch 'origin/master' into SPARK-21901-StateOperatorProgress-toString
2 parents 0e2b9c6 + ca59445 commit 673e088

File tree

11 files changed

+541
-23
lines changed

11 files changed

+541
-23
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ exportMethods("arrange",
169169
"transform",
170170
"union",
171171
"unionAll",
172+
"unionByName",
172173
"unique",
173174
"unpersist",
174175
"where",

R/pkg/R/DataFrame.R

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2683,7 +2683,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
26832683
#' @rdname union
26842684
#' @name union
26852685
#' @aliases union,SparkDataFrame,SparkDataFrame-method
2686-
#' @seealso \link{rbind}
2686+
#' @seealso \link{rbind} \link{unionByName}
26872687
#' @export
26882688
#' @examples
26892689
#'\dontrun{
@@ -2714,6 +2714,40 @@ setMethod("unionAll",
27142714
union(x, y)
27152715
})
27162716

2717+
#' Return a new SparkDataFrame containing the union of rows, matched by column names
2718+
#'
2719+
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
2720+
#' and another SparkDataFrame. This is different from the \code{union} function, and both
2721+
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
2722+
#' into account. Input SparkDataFrames can have different data types in the schema.
2723+
#'
2724+
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
2725+
#' This function resolves columns by name (not by position).
2726+
#'
2727+
#' @param x A SparkDataFrame
2728+
#' @param y A SparkDataFrame
2729+
#' @return A SparkDataFrame containing the result of the union.
2730+
#' @family SparkDataFrame functions
2731+
#' @rdname unionByName
2732+
#' @name unionByName
2733+
#' @aliases unionByName,SparkDataFrame,SparkDataFrame-method
2734+
#' @seealso \link{rbind} \link{union}
2735+
#' @export
2736+
#' @examples
2737+
#'\dontrun{
2738+
#' sparkR.session()
2739+
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
2740+
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
2741+
#' head(unionByName(df1, df2))
2742+
#' }
2743+
#' @note unionByName since 2.3.0
2744+
setMethod("unionByName",
2745+
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
2746+
function(x, y) {
2747+
unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
2748+
dataFrame(unioned)
2749+
})
2750+
27172751
#' Union two or more SparkDataFrames
27182752
#'
27192753
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
@@ -2730,7 +2764,7 @@ setMethod("unionAll",
27302764
#' @aliases rbind,SparkDataFrame-method
27312765
#' @rdname rbind
27322766
#' @name rbind
2733-
#' @seealso \link{union}
2767+
#' @seealso \link{union} \link{unionByName}
27342768
#' @export
27352769
#' @examples
27362770
#'\dontrun{

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
769769
#' @export
770770
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
771771

772+
#' @rdname unionByName
773+
#' @export
774+
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })
775+
772776
#' @rdname unpersist
773777
#' @export
774778
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2255,7 +2255,7 @@ test_that("isLocal()", {
22552255
expect_false(isLocal(df))
22562256
})
22572257

2258-
test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
2258+
test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataFrame", {
22592259
df <- read.json(jsonPath)
22602260

22612261
lines <- c("{\"name\":\"Bob\", \"age\":24}",
@@ -2271,6 +2271,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
22712271
expect_equal(first(unioned)$name, "Michael")
22722272
expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
22732273

2274+
df1 <- select(df2, "age", "name")
2275+
unioned1 <- arrange(unionByName(df1, df), df1$age)
2276+
expect_is(unioned, "SparkDataFrame")
2277+
expect_equal(count(unioned), 6)
2278+
# Here, we test if 'Michael' in df is correctly mapped to the same name.
2279+
expect_equal(first(unioned)$name, "Michael")
2280+
22742281
unioned2 <- arrange(rbind(unioned, df, df2), df$age)
22752282
expect_is(unioned2, "SparkDataFrame")
22762283
expect_equal(count(unioned2), 12)

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2639,9 +2639,12 @@ private[spark] object Utils extends Logging {
26392639
* Redact the sensitive information in the given string.
26402640
*/
26412641
def redact(conf: SparkConf, text: String): String = {
2642-
if (text == null || text.isEmpty || !conf.contains(STRING_REDACTION_PATTERN)) return text
2643-
val regex = conf.get(STRING_REDACTION_PATTERN).get
2644-
regex.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
2642+
if (text == null || text.isEmpty || conf == null || !conf.contains(STRING_REDACTION_PATTERN)) {
2643+
text
2644+
} else {
2645+
val regex = conf.get(STRING_REDACTION_PATTERN).get
2646+
regex.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
2647+
}
26452648
}
26462649

26472650
private def redact(redactionPattern: Regex, kvs: Seq[(String, String)]): Seq[(String, String)] = {

python/pyspark/sql/dataframe.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1290,7 +1290,7 @@ def union(self, other):
12901290
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.
12911291
12921292
This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
1293-
(that does deduplication of elements), use this function followed by a distinct.
1293+
(that does deduplication of elements), use this function followed by :func:`distinct`.
12941294
12951295
Also as standard in SQL, this function resolves columns by position (not by name).
12961296
"""
@@ -1301,14 +1301,36 @@ def unionAll(self, other):
13011301
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.
13021302
13031303
This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
1304-
(that does deduplication of elements), use this function followed by a distinct.
1304+
(that does deduplication of elements), use this function followed by :func:`distinct`.
13051305
13061306
Also as standard in SQL, this function resolves columns by position (not by name).
13071307
1308-
.. note:: Deprecated in 2.0, use union instead.
1308+
.. note:: Deprecated in 2.0, use :func:`union` instead.
13091309
"""
13101310
return self.union(other)
13111311

1312+
@since(2.3)
1313+
def unionByName(self, other):
1314+
""" Returns a new :class:`DataFrame` containing union of rows in this and another frame.
1315+
1316+
This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
1317+
union (that does deduplication of elements), use this function followed by :func:`distinct`.
1318+
1319+
The difference between this function and :func:`union` is that this function
1320+
resolves columns by name (not by position):
1321+
1322+
>>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1323+
>>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
1324+
>>> df1.unionByName(df2).show()
1325+
+----+----+----+
1326+
|col0|col1|col2|
1327+
+----+----+----+
1328+
| 1| 2| 3|
1329+
| 6| 4| 5|
1330+
+----+----+----+
1331+
"""
1332+
return DataFrame(self._jdf.unionByName(other._jdf), self.sql_ctx)
1333+
13121334
@since(1.3)
13131335
def intersect(self, other):
13141336
""" Return a new :class:`DataFrame` containing rows only in

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala

Lines changed: 137 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,25 @@ case class Not(child: Expression)
133133
/**
134134
* Evaluates to `true` if `list` contains `value`.
135135
*/
136+
// scalastyle:off line.size.limit
136137
@ExpressionDescription(
137-
usage = "expr1 _FUNC_(expr2, expr3, ...) - Returns true if `expr` equals to any valN.")
138+
usage = "expr1 _FUNC_(expr2, expr3, ...) - Returns true if `expr` equals to any valN.",
139+
arguments = """
140+
Arguments:
141+
    * expr1, expr2, expr3, ... - the arguments must be the same type.
142+
""",
143+
examples = """
144+
Examples:
145+
> SELECT 1 _FUNC_(1, 2, 3);
146+
true
147+
> SELECT 1 _FUNC_(2, 3, 4);
148+
false
149+
> SELECT named_struct('a', 1, 'b', 2) _FUNC_(named_struct('a', 1, 'b', 1), named_struct('a', 1, 'b', 3));
150+
false
151+
> SELECT named_struct('a', 1, 'b', 2) _FUNC_(named_struct('a', 1, 'b', 2), named_struct('a', 1, 'b', 3));
152+
true
153+
""")
154+
// scalastyle:on line.size.limit
138155
case class In(value: Expression, list: Seq[Expression]) extends Predicate {
139156

140157
require(list != null, "list should not be null")
@@ -491,7 +508,24 @@ object Equality {
491508
// TODO: although map type is not orderable, technically map type should be able to be used
492509
// in equality comparison
493510
@ExpressionDescription(
494-
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` equals `expr2`, or false otherwise.")
511+
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` equals `expr2`, or false otherwise.",
512+
arguments = """
513+
Arguments:
514+
    * expr1, expr2 - the two expressions must be the same type or can be cast to a common type,
515+
and must be a type that can be used in equality comparison. Map type is not supported.
516+
      For complex types such as array/struct, the data types of fields must be orderable.
517+
""",
518+
examples = """
519+
Examples:
520+
> SELECT 2 _FUNC_ 2;
521+
true
522+
> SELECT 1 _FUNC_ '1';
523+
true
524+
> SELECT true _FUNC_ NULL;
525+
NULL
526+
> SELECT NULL _FUNC_ NULL;
527+
NULL
528+
""")
495529
case class EqualTo(left: Expression, right: Expression)
496530
extends BinaryComparison with NullIntolerant {
497531

@@ -510,6 +544,23 @@ case class EqualTo(left: Expression, right: Expression)
510544
usage = """
511545
expr1 _FUNC_ expr2 - Returns same result as the EQUAL(=) operator for non-null operands,
512546
but returns true if both are null, false if one of the them is null.
547+
""",
548+
arguments = """
549+
Arguments:
550+
    * expr1, expr2 - the two expressions must be the same type or can be cast to a common type,
551+
and must be a type that can be used in equality comparison. Map type is not supported.
552+
      For complex types such as array/struct, the data types of fields must be orderable.
553+
""",
554+
examples = """
555+
Examples:
556+
> SELECT 2 _FUNC_ 2;
557+
true
558+
> SELECT 1 _FUNC_ '1';
559+
true
560+
> SELECT true _FUNC_ NULL;
561+
false
562+
> SELECT NULL _FUNC_ NULL;
563+
true
513564
""")
514565
case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComparison {
515566

@@ -540,7 +591,27 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
540591
}
541592

542593
@ExpressionDescription(
543-
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is less than `expr2`.")
594+
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is less than `expr2`.",
595+
arguments = """
596+
Arguments:
597+
    * expr1, expr2 - the two expressions must be the same type or can be cast to a common type,
598+
and must be a type that can be ordered. For example, map type is not orderable, so it
599+
      is not supported. For complex types such as array/struct, the data types of fields must
600+
be orderable.
601+
""",
602+
examples = """
603+
Examples:
604+
> SELECT 1 _FUNC_ 2;
605+
true
606+
> SELECT 1.1 _FUNC_ '1';
607+
false
608+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-07-30 04:17:52');
609+
false
610+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-08-01 04:17:52');
611+
true
612+
> SELECT 1 _FUNC_ NULL;
613+
NULL
614+
""")
544615
case class LessThan(left: Expression, right: Expression)
545616
extends BinaryComparison with NullIntolerant {
546617

@@ -550,7 +621,27 @@ case class LessThan(left: Expression, right: Expression)
550621
}
551622

552623
@ExpressionDescription(
553-
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is less than or equal to `expr2`.")
624+
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is less than or equal to `expr2`.",
625+
arguments = """
626+
Arguments:
627+
    * expr1, expr2 - the two expressions must be the same type or can be cast to a common type,
628+
and must be a type that can be ordered. For example, map type is not orderable, so it
629+
      is not supported. For complex types such as array/struct, the data types of fields must
630+
be orderable.
631+
""",
632+
examples = """
633+
Examples:
634+
> SELECT 2 _FUNC_ 2;
635+
true
636+
> SELECT 1.0 _FUNC_ '1';
637+
true
638+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-07-30 04:17:52');
639+
true
640+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-08-01 04:17:52');
641+
true
642+
> SELECT 1 _FUNC_ NULL;
643+
NULL
644+
""")
554645
case class LessThanOrEqual(left: Expression, right: Expression)
555646
extends BinaryComparison with NullIntolerant {
556647

@@ -560,7 +651,27 @@ case class LessThanOrEqual(left: Expression, right: Expression)
560651
}
561652

562653
@ExpressionDescription(
563-
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is greater than `expr2`.")
654+
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is greater than `expr2`.",
655+
arguments = """
656+
Arguments:
657+
    * expr1, expr2 - the two expressions must be the same type or can be cast to a common type,
658+
and must be a type that can be ordered. For example, map type is not orderable, so it
659+
      is not supported. For complex types such as array/struct, the data types of fields must
660+
be orderable.
661+
""",
662+
examples = """
663+
Examples:
664+
> SELECT 2 _FUNC_ 1;
665+
true
666+
> SELECT 2 _FUNC_ '1.1';
667+
true
668+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-07-30 04:17:52');
669+
false
670+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-08-01 04:17:52');
671+
false
672+
> SELECT 1 _FUNC_ NULL;
673+
NULL
674+
""")
564675
case class GreaterThan(left: Expression, right: Expression)
565676
extends BinaryComparison with NullIntolerant {
566677

@@ -570,7 +681,27 @@ case class GreaterThan(left: Expression, right: Expression)
570681
}
571682

572683
@ExpressionDescription(
573-
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is greater than or equal to `expr2`.")
684+
usage = "expr1 _FUNC_ expr2 - Returns true if `expr1` is greater than or equal to `expr2`.",
685+
arguments = """
686+
Arguments:
687+
    * expr1, expr2 - the two expressions must be the same type or can be cast to a common type,
688+
and must be a type that can be ordered. For example, map type is not orderable, so it
689+
      is not supported. For complex types such as array/struct, the data types of fields must
690+
be orderable.
691+
""",
692+
examples = """
693+
Examples:
694+
> SELECT 2 _FUNC_ 1;
695+
true
696+
> SELECT 2.0 _FUNC_ '2.1';
697+
false
698+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-07-30 04:17:52');
699+
true
700+
> SELECT to_date('2009-07-30 04:17:52') _FUNC_ to_date('2009-08-01 04:17:52');
701+
false
702+
> SELECT 1 _FUNC_ NULL;
703+
NULL
704+
""")
574705
case class GreaterThanOrEqual(left: Expression, right: Expression)
575706
extends BinaryComparison with NullIntolerant {
576707

0 commit comments

Comments
 (0)