
Commit f1332eb

Merge branch 'master' into SPARK-33152_constraint_propagation
2 parents: 0c156f7 + 1c3bdab

2,383 files changed (+142114 / -83005 lines)


.asf.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# https://cwiki.apache.org/confluence/display/INFRA/.asf.yaml+features+for+git+repositories
+# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
 ---
 github:
   description: "Apache Spark - A unified analytics engine for large-scale data processing"

.github/workflows/build_and_test.yml

Lines changed: 69 additions & 29 deletions
@@ -93,12 +93,11 @@ jobs:
       if: ${{ github.event.inputs.target != '' }}
       run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -180,12 +179,11 @@ jobs:
       if: ${{ github.event.inputs.target != '' }}
       run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -248,12 +246,11 @@ jobs:
       if: ${{ github.event.inputs.target != '' }}
       run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -285,16 +282,17 @@ jobs:
   lint:
     name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-20.04
+    container:
+      image: dongjoon/apache-spark-github-action-image:20201025
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -315,10 +313,6 @@ jobs:
         key: docs-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           docs-maven-
-    - name: Install Java 8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 8
     - name: Install Python 3.6
       uses: actions/setup-python@v2
       with:
@@ -328,30 +322,26 @@ jobs:
       run: |
         # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
-        pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
-    - name: Install R 4.0
-      uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: 4.0
+        python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
     - name: Install R linter dependencies and SparkR
       run: |
-        sudo apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
-        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
+        apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
+        Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+        Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
         ./R/install-dev.sh
-    - name: Install Ruby 2.7 for documentation generation
-      uses: actions/setup-ruby@v1
-      with:
-        ruby-version: 2.7
     - name: Install dependencies for documentation generation
       run: |
         # pandoc is required to generate PySpark APIs as well in nbsphinx.
-        sudo apt-get install -y libcurl4-openssl-dev pandoc
+        apt-get install -y libcurl4-openssl-dev pandoc
         # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
-        pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
-        gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
+        apt-get update -y
+        apt-get install -y ruby ruby-dev
+        Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        gem install bundler
+        cd docs
+        bundle install
     - name: Scala linter
       run: ./dev/lint-scala
     - name: Java linter
@@ -367,7 +357,9 @@ jobs:
     - name: Run documentation build
       run: |
         cd docs
-        jekyll build
+        export LC_ALL=C.UTF-8
+        export LANG=C.UTF-8
+        bundle exec jekyll build
 
   java-11:
     name: Java 11 build with Maven
@@ -436,3 +428,51 @@ jobs:
     - name: Build with SBT
       run: |
         ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
+
+  tpcds-1g:
+    name: Run TPC-DS queries with SF=1
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache TPC-DS generated data
+      id: cache-tpcds-sf-1
+      uses: actions/cache@v2
+      with:
+        path: ./tpcds-sf-1
+        key: tpcds-${{ hashFiles('tpcds-sf-1/.spark-tpcds-sf-1.md5') }}
+        restore-keys: |
+          tpcds-
+    - name: Checkout TPC-DS (SF=1) generated data repository
+      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+      uses: actions/checkout@v2
+      with:
+        repository: maropu/spark-tpcds-sf-1
+        ref: 6b660a53091bd6d23cbe58b0f09aae08e71cc667
+        path: ./tpcds-sf-1
+    - name: Cache Coursier local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.cache/coursier
+        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          tpcds-coursier-
+    - name: Install Java 8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 8
+    - name: Run TPC-DS queries
+      run: |
+        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
+    - name: Upload test results to report
+      if: always()
+      uses: actions/upload-artifact@v2
+      with:
+        name: test-results-tpcds--8-hadoop3.2-hive2.3
+        path: "**/target/test-reports/*.xml"
+    - name: Upload unit tests log files
+      if: failure()
+      uses: actions/upload-artifact@v2
+      with:
+        name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
+        path: "**/target/unit-tests.log"

New workflow file (path not shown in this view)

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+name: Cancelling Duplicates
+on:
+  workflow_run:
+    workflows:
+      - 'Build and test'
+    types: ['requested']
+
+jobs:
+  cancel-duplicate-workflow-runs:
+    name: "Cancel duplicate workflow runs"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: potiuk/cancel-workflow-runs@953e057dc81d3458935a18d1184c386b0f6b5738 # @master
+        name: "Cancel duplicate workflow runs"
+        with:
+          cancelMode: allDuplicates
+          token: ${{ secrets.GITHUB_TOKEN }}
+          sourceRunId: ${{ github.event.workflow_run.id }}
+          skipEventTypes: '["push", "schedule"]'

.github/workflows/test_report.yml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ jobs:
         github_token: ${{ secrets.GITHUB_TOKEN }}
         workflow: ${{ github.event.workflow_run.workflow_id }}
         commit: ${{ github.event.workflow_run.head_commit.id }}
+        workflow_conclusion: completed
     - name: Publish test report
       uses: scacap/action-surefire-report@v1
       with:

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,6 @@ R/pkg/tests/fulltests/Rplots.pdf
 build/*.jar
 build/apache-maven*
 build/scala*
-build/zinc*
 cache
 checkpoint
 conf/*.cmd
@@ -48,6 +47,7 @@ dev/pr-deps/
 dist/
 docs/_site/
 docs/api
+docs/.local_ruby_bundle
 sql/docs
 sql/site
 lib_managed/

R/DOCUMENTATION.md

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ license: |
 # SparkR Documentation
 
 SparkR documentation is generated by using in-source comments and annotated by using
-[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages,
+[`roxygen2`](https://cran.r-project.org/package=roxygen2). After making changes to the documentation and generating man pages,
 you can run the following from an R console in the SparkR home directory
 ```R
 library(devtools)
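
The quoted R snippet is cut off at the hunk boundary. As a minimal sketch of the doc-generation step this file describes, assuming `devtools` and `roxygen2` are installed (the exact call in the full file may differ slightly):

```R
# Run from the SparkR home directory (R/) to regenerate the man pages.
library(devtools)
devtools::document(pkg = "./pkg", roclets = c("rd"))
```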

R/pkg/DESCRIPTION

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
             email = "felixcheung@apache.org"),
             person(family = "The Apache Software Foundation", role = c("aut", "cph")))
 License: Apache License (== 2.0)
-URL: https://www.apache.org/ https://spark.apache.org/
+URL: https://www.apache.org https://spark.apache.org
 BugReports: https://spark.apache.org/contributing.html
 SystemRequirements: Java (>= 8, < 12)
 Depends:
@@ -59,7 +59,7 @@ Collate:
     'types.R'
     'utils.R'
     'window.R'
-RoxygenNote: 5.0.1
+RoxygenNote: 7.1.1
 VignetteBuilder: knitr
 NeedsCompilation: no
 Encoding: UTF-8

R/pkg/NAMESPACE

Lines changed: 7 additions & 0 deletions
@@ -243,6 +243,7 @@ exportMethods("%<=>%",
               "base64",
               "between",
               "bin",
+              "bitwise_not",
               "bitwiseNOT",
               "bround",
               "cast",
@@ -259,6 +260,7 @@ exportMethods("%<=>%",
               "cos",
               "cosh",
               "count",
+              "count_distinct",
               "countDistinct",
               "crc32",
               "create_array",
@@ -369,6 +371,7 @@ exportMethods("%<=>%",
               "pmod",
               "posexplode",
               "posexplode_outer",
+              "product",
               "quarter",
               "radians",
               "raise_error",
@@ -391,8 +394,11 @@ exportMethods("%<=>%",
               "sha1",
               "sha2",
               "shiftLeft",
+              "shiftleft",
               "shiftRight",
+              "shiftright",
               "shiftRightUnsigned",
+              "shiftrightunsigned",
               "shuffle",
               "sd",
               "sign",
@@ -415,6 +421,7 @@ exportMethods("%<=>%",
               "substr",
               "substring_index",
               "sum",
+              "sum_distinct",
               "sumDistinct",
               "tan",
               "tanh",

R/pkg/R/DataFrame.R

Lines changed: 1 addition & 1 deletion
@@ -880,7 +880,7 @@ setMethod("toJSON",
 
 #' Save the contents of SparkDataFrame as a JSON file
 #'
-#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{
+#' Save the contents of a SparkDataFrame as a JSON file (\href{https://jsonlines.org/}{
 #' JSON Lines text format or newline-delimited JSON}). Files written out
 #' with this method can be read back in as a SparkDataFrame using read.json().
 #'

R/pkg/R/SQLContext.R

Lines changed: 1 addition & 1 deletion
@@ -374,7 +374,7 @@ setMethod("toDF", signature(x = "RDD"),
 #' Create a SparkDataFrame from a JSON file.
 #'
 #' Loads a JSON file, returning the result as a SparkDataFrame
-#' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
+#' By default, (\href{https://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
 #' ) is supported. For JSON (one record per file), set a named property \code{multiLine} to
 #' \code{TRUE}.
 #' It goes through the entire dataset once to determine the schema.
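
Both Rd comment updates above only switch the JSON Lines link to HTTPS; the documented behaviour is unchanged. A minimal round-trip sketch, assuming a running SparkR session and a writable output path (the "/tmp/people_json" path here is illustrative):

```R
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(name = c("Alice", "Bob"), age = c(30L, 25L)))

# write.json emits JSON Lines (one JSON object per line) ...
write.json(df, "/tmp/people_json")

# ... which read.json loads back by default; for one-record-per-file JSON,
# pass multiLine = TRUE instead.
people <- read.json("/tmp/people_json")
head(people)
```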
