
Commit f1332eb

Merge branch 'master' into SPARK-33152_constraint_propagation
2 parents: 0c156f7 + 1c3bdab

2,383 files changed (+142114 / -83005 lines)


.asf.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# https://cwiki.apache.org/confluence/display/INFRA/.asf.yaml+features+for+git+repositories
+# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
 ---
 github:
   description: "Apache Spark - A unified analytics engine for large-scale data processing"

.github/workflows/build_and_test.yml

Lines changed: 69 additions & 29 deletions
@@ -93,12 +93,11 @@ jobs:
       if: ${{ github.event.inputs.target != '' }}
       run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -180,12 +179,11 @@ jobs:
       if: ${{ github.event.inputs.target != '' }}
       run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -248,12 +246,11 @@ jobs:
       if: ${{ github.event.inputs.target != '' }}
       run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -285,16 +282,17 @@ jobs:
   lint:
     name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-20.04
+    container:
+      image: dongjoon/apache-spark-github-action-image:20201025
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
+    - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
         path: |
           build/apache-maven-*
-          build/zinc-*
           build/scala-*
           build/*.jar
           ~/.sbt
@@ -315,10 +313,6 @@ jobs:
         key: docs-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           docs-maven-
-    - name: Install Java 8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 8
     - name: Install Python 3.6
       uses: actions/setup-python@v2
       with:
@@ -328,30 +322,26 @@ jobs:
       run: |
         # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
-        pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
-    - name: Install R 4.0
-      uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: 4.0
+        python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
     - name: Install R linter dependencies and SparkR
       run: |
-        sudo apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
-        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
+        apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
+        Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+        Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
         ./R/install-dev.sh
-    - name: Install Ruby 2.7 for documentation generation
-      uses: actions/setup-ruby@v1
-      with:
-        ruby-version: 2.7
     - name: Install dependencies for documentation generation
       run: |
         # pandoc is required to generate PySpark APIs as well in nbsphinx.
-        sudo apt-get install -y libcurl4-openssl-dev pandoc
+        apt-get install -y libcurl4-openssl-dev pandoc
         # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
-        pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
-        gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
+        apt-get update -y
+        apt-get install -y ruby ruby-dev
+        Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        gem install bundler
+        cd docs
+        bundle install
     - name: Scala linter
       run: ./dev/lint-scala
     - name: Java linter
@@ -367,7 +357,9 @@ jobs:
     - name: Run documentation build
       run: |
         cd docs
-        jekyll build
+        export LC_ALL=C.UTF-8
+        export LANG=C.UTF-8
+        bundle exec jekyll build
 
   java-11:
     name: Java 11 build with Maven
@@ -436,3 +428,51 @@ jobs:
     - name: Build with SBT
       run: |
         ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
+
+  tpcds-1g:
+    name: Run TPC-DS queries with SF=1
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache TPC-DS generated data
+      id: cache-tpcds-sf-1
+      uses: actions/cache@v2
+      with:
+        path: ./tpcds-sf-1
+        key: tpcds-${{ hashFiles('tpcds-sf-1/.spark-tpcds-sf-1.md5') }}
+        restore-keys: |
+          tpcds-
+    - name: Checkout TPC-DS (SF=1) generated data repository
+      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+      uses: actions/checkout@v2
+      with:
+        repository: maropu/spark-tpcds-sf-1
+        ref: 6b660a53091bd6d23cbe58b0f09aae08e71cc667
+        path: ./tpcds-sf-1
+    - name: Cache Coursier local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.cache/coursier
+        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          tpcds-coursier-
+    - name: Install Java 8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 8
+    - name: Run TPC-DS queries
+      run: |
+        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
+    - name: Upload test results to report
+      if: always()
+      uses: actions/upload-artifact@v2
+      with:
+        name: test-results-tpcds--8-hadoop3.2-hive2.3
+        path: "**/target/test-reports/*.xml"
+    - name: Upload unit tests log files
+      if: failure()
+      uses: actions/upload-artifact@v2
+      with:
+        name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
+        path: "**/target/unit-tests.log"

New workflow file (path not shown in this view)

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+name: Cancelling Duplicates
+on:
+  workflow_run:
+    workflows:
+      - 'Build and test'
+    types: ['requested']
+
+jobs:
+  cancel-duplicate-workflow-runs:
+    name: "Cancel duplicate workflow runs"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: potiuk/cancel-workflow-runs@953e057dc81d3458935a18d1184c386b0f6b5738 # @master
+        name: "Cancel duplicate workflow runs"
+        with:
+          cancelMode: allDuplicates
+          token: ${{ secrets.GITHUB_TOKEN }}
+          sourceRunId: ${{ github.event.workflow_run.id }}
+          skipEventTypes: '["push", "schedule"]'

.github/workflows/test_report.yml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ jobs:
         github_token: ${{ secrets.GITHUB_TOKEN }}
         workflow: ${{ github.event.workflow_run.workflow_id }}
         commit: ${{ github.event.workflow_run.head_commit.id }}
+        workflow_conclusion: completed
     - name: Publish test report
       uses: scacap/action-surefire-report@v1
       with:

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,6 @@ R/pkg/tests/fulltests/Rplots.pdf
 build/*.jar
 build/apache-maven*
 build/scala*
-build/zinc*
 cache
 checkpoint
 conf/*.cmd
@@ -48,6 +47,7 @@ dev/pr-deps/
 dist/
 docs/_site/
 docs/api
+docs/.local_ruby_bundle
 sql/docs
 sql/site
 lib_managed/

R/DOCUMENTATION.md

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ license: |
 # SparkR Documentation
 
 SparkR documentation is generated by using in-source comments and annotated by using
-[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages,
+[`roxygen2`](https://cran.r-project.org/package=roxygen2). After making changes to the documentation and generating man pages,
 you can run the following from an R console in the SparkR home directory
 ```R
 library(devtools)
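
The quoted R snippet is cut off at the hunk boundary. As a minimal sketch of the doc-generation step this file describes, assuming `devtools` and `roxygen2` are installed (the exact call in the full file may differ slightly):

```R
# Run from the SparkR home directory (R/) to regenerate the man pages.
library(devtools)
devtools::document(pkg = "./pkg", roclets = c("rd"))
```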

R/pkg/DESCRIPTION

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
             email = "felixcheung@apache.org"),
             person(family = "The Apache Software Foundation", role = c("aut", "cph")))
 License: Apache License (== 2.0)
-URL: https://www.apache.org/ https://spark.apache.org/
+URL: https://www.apache.org https://spark.apache.org
 BugReports: https://spark.apache.org/contributing.html
 SystemRequirements: Java (>= 8, < 12)
 Depends:
@@ -59,7 +59,7 @@ Collate:
     'types.R'
     'utils.R'
     'window.R'
-RoxygenNote: 5.0.1
+RoxygenNote: 7.1.1
 VignetteBuilder: knitr
 NeedsCompilation: no
 Encoding: UTF-8

R/pkg/NAMESPACE

Lines changed: 7 additions & 0 deletions
@@ -243,6 +243,7 @@ exportMethods("%<=>%",
               "base64",
               "between",
               "bin",
+              "bitwise_not",
               "bitwiseNOT",
               "bround",
               "cast",
@@ -259,6 +260,7 @@ exportMethods("%<=>%",
               "cos",
               "cosh",
               "count",
+              "count_distinct",
               "countDistinct",
               "crc32",
               "create_array",
@@ -369,6 +371,7 @@ exportMethods("%<=>%",
               "pmod",
               "posexplode",
               "posexplode_outer",
+              "product",
               "quarter",
               "radians",
               "raise_error",
@@ -391,8 +394,11 @@ exportMethods("%<=>%",
               "sha1",
               "sha2",
               "shiftLeft",
+              "shiftleft",
               "shiftRight",
+              "shiftright",
               "shiftRightUnsigned",
+              "shiftrightunsigned",
               "shuffle",
               "sd",
               "sign",
@@ -415,6 +421,7 @@ exportMethods("%<=>%",
               "substr",
               "substring_index",
               "sum",
+              "sum_distinct",
               "sumDistinct",
               "tan",
               "tanh",

R/pkg/R/DataFrame.R

Lines changed: 1 addition & 1 deletion
@@ -880,7 +880,7 @@ setMethod("toJSON",
 
 #' Save the contents of SparkDataFrame as a JSON file
 #'
-#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{
+#' Save the contents of a SparkDataFrame as a JSON file (\href{https://jsonlines.org/}{
 #' JSON Lines text format or newline-delimited JSON}). Files written out
 #' with this method can be read back in as a SparkDataFrame using read.json().
 #'

R/pkg/R/SQLContext.R

Lines changed: 1 addition & 1 deletion
@@ -374,7 +374,7 @@ setMethod("toDF", signature(x = "RDD"),
 #' Create a SparkDataFrame from a JSON file.
 #'
 #' Loads a JSON file, returning the result as a SparkDataFrame
-#' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
+#' By default, (\href{https://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
 #' ) is supported. For JSON (one record per file), set a named property \code{multiLine} to
 #' \code{TRUE}.
 #' It goes through the entire dataset once to determine the schema.
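
Both Rd comment updates above only switch the JSON Lines link to HTTPS; the documented behaviour is unchanged. A minimal round-trip sketch, assuming a running SparkR session and a writable output path (the "/tmp/people_json" path here is illustrative):

```R
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(name = c("Alice", "Bob"), age = c(30L, 25L)))

# write.json emits JSON Lines (one JSON object per line) ...
write.json(df, "/tmp/people_json")

# ... which read.json loads back by default; for one-record-per-file JSON,
# pass multiLine = TRUE instead.
people <- read.json("/tmp/people_json")
head(people)
```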
