8 changes: 6 additions & 2 deletions docs/_data/menu-sql.yaml
@@ -15,6 +15,8 @@
url: sql-getting-started.html#creating-datasets
- text: Interoperating with RDDs
url: sql-getting-started.html#interoperating-with-rdds
- text: Scalar Functions
url: sql-getting-started.html#scalar-functions
- text: Aggregations
url: sql-getting-started.html#aggregations
- text: Data Sources
@@ -34,6 +36,8 @@
url: sql-data-sources-jdbc.html
- text: Avro Files
url: sql-data-sources-avro.html
- text: Whole Binary Files
url: sql-data-sources-binaryFile.html
- text: Troubleshooting
url: sql-data-sources-troubleshooting.html
- text: Performance Tuning
@@ -43,8 +47,8 @@
url: sql-performance-tuning.html#caching-data-in-memory
- text: Other Configuration Options
url: sql-performance-tuning.html#other-configuration-options
- text: Broadcast Hint for SQL Queries
url: sql-performance-tuning.html#broadcast-hint-for-sql-queries
- text: Join Strategy Hints for SQL Queries
url: sql-performance-tuning.html#join-strategy-hints-for-sql-queries
- text: Distributed SQL Engine
url: sql-distributed-sql-engine.html
subitems:
2 changes: 1 addition & 1 deletion docs/sql-migration-guide.md
@@ -519,7 +519,7 @@ license: |

Note that, for <b>DecimalType(38,0)*</b>, the table above intentionally does not cover all the other combinations of scales and precisions, because currently we only infer a decimal type for `BigInteger`/`BigInt`-like values. For example, 1.1 is inferred as double type.

- Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](sql-performance-tuning.html#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489).
- Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Join Strategy Hints for SQL Queries](sql-performance-tuning.html#join-strategy-hints-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489).

- Since Spark 2.3, when all inputs are binary, `functions.concat()` returns its output as binary; otherwise, it returns a string. Before Spark 2.3, it always returned a string regardless of the input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`.
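An illustrative sketch of this behavior and of the legacy flag (an editor's example, not part of the patch; it assumes an active `spark` session, and the column names are made up):

{% highlight scala %}
import org.apache.spark.sql.functions.{col, concat}
import spark.implicits._

// Since 2.3, concatenating two binary columns yields a binary column.
val df = Seq((Array[Byte](1, 2), Array[Byte](3))).toDF("a", "b")
df.select(concat(col("a"), col("b"))).printSchema()  // prints a binary column

// Restore the pre-2.3 behavior (always return a string):
spark.conf.set("spark.sql.function.concatBinaryAsString", "true")
{% endhighlight %}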

26 changes: 19 additions & 7 deletions docs/sql-performance-tuning.md
@@ -129,26 +129,23 @@ a specific strategy may not support all join types.
<div data-lang="scala" markdown="1">

{% highlight scala %}
import org.apache.spark.sql.functions.broadcast
broadcast(spark.table("src")).join(spark.table("records"), "key").show()
spark.table("src").join(spark.table("records").hint("broadcast"), "key").show()
{% endhighlight %}

</div>

<div data-lang="java" markdown="1">

{% highlight java %}
import static org.apache.spark.sql.functions.broadcast;
broadcast(spark.table("src")).join(spark.table("records"), "key").show();
spark.table("src").join(spark.table("records").hint("broadcast"), "key").show();
{% endhighlight %}

</div>

<div data-lang="python" markdown="1">

{% highlight python %}
from pyspark.sql.functions import broadcast
broadcast(spark.table("src")).join(spark.table("records"), "key").show()
spark.table("src").join(spark.table("records").hint("broadcast"), "key").show()
{% endhighlight %}

</div>
@@ -158,7 +155,7 @@ broadcast(spark.table("src")).join(spark.table("records"), "key").show()
{% highlight r %}
src <- sql("SELECT * FROM src")
records <- sql("SELECT * FROM records")
head(join(broadcast(src), records, src$key == records$key))
Contributor: did we remove the broadcast method?

Member (Author): No, the broadcast method is still available (spark/R/pkg/R/DataFrame.R, lines 4129 to 4134 in 18431c7):

setMethod("broadcast",
          signature(x = "SparkDataFrame"),
          function(x) {
            sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", x@sdf)
            dataFrame(sdf)
          })

Contributor: so what's the recommended API to add a hint, df.hint(...) or hint(df, ...)?

Member (Author): hint(df, ...) is for R; there's no df.hint(...) in R.

Contributor: ah, never mind; didn't realize it's R code.

head(join(src, hint(records, "broadcast"), src$key == records$key))
{% endhighlight %}

</div>
@@ -172,3 +169,18 @@ SELECT /*+ BROADCAST(r) */ * FROM records r JOIN src s ON r.key = s.key

</div>
</div>
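
As an illustrative follow-up (an editor's sketch, not part of the patch, reusing the `src` and `records` tables from the examples above), the physical plan can be inspected to confirm whether the broadcast hint was honored:

{% highlight scala %}
// When the hint is honored, the plan typically contains a BroadcastHashJoin
// (or broadcast nested loop join) node instead of a SortMergeJoin.
spark.table("src")
  .join(spark.table("records").hint("broadcast"), "key")
  .explain()
{% endhighlight %}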

## Coalesce Hints for SQL Queries

Coalesce hints allow Spark SQL users to control the number of output files just like
`coalesce`, `repartition` and `repartitionByRange` in the Dataset API; they can be used for
performance tuning and for reducing the number of output files. The "COALESCE" hint only has a
partition number as a parameter. The "REPARTITION" hint has a partition number, columns, or both
of them as parameters. The "REPARTITION_BY_RANGE" hint must have column names, and a partition
number is optional.

    SELECT /*+ COALESCE(3) */ * FROM t
    SELECT /*+ REPARTITION(3) */ * FROM t
    SELECT /*+ REPARTITION(c) */ * FROM t
    SELECT /*+ REPARTITION(3, c) */ * FROM t
    SELECT /*+ REPARTITION_BY_RANGE(c) */ * FROM t
    SELECT /*+ REPARTITION_BY_RANGE(3, c) */ * FROM t
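
For comparison, a rough sketch of the Dataset API counterparts of these hints (an editor's illustration, not part of the patch, assuming a table `t` with a column `c`):

{% highlight scala %}
import org.apache.spark.sql.functions.col

val df = spark.table("t")

// Each call returns a new Dataset with the requested partitioning.
df.coalesce(3)                      // ~ /*+ COALESCE(3) */
df.repartition(3)                   // ~ /*+ REPARTITION(3) */
df.repartition(col("c"))            // ~ /*+ REPARTITION(c) */
df.repartition(3, col("c"))         // ~ /*+ REPARTITION(3, c) */
df.repartitionByRange(col("c"))     // ~ /*+ REPARTITION_BY_RANGE(c) */
df.repartitionByRange(3, col("c"))  // ~ /*+ REPARTITION_BY_RANGE(3, c) */
{% endhighlight %}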