Merge pull request #1098 from romainx/spark_doc
Follow-on PR #911: Spark documentation rework
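This follow-on to the Spark documentation rework adds one minimal smoke-test notebook per supported Spark kernel (PySpark for Python, SparkR and sparklyr for R, spylon-kernel and Apache Toree for Scala), plus a pytest module that executes the notebooks with nbconvert inside the container. Every notebook starts a Spark session and runs the same check: summing the first 100 whole numbers to get 5050. Six of the seven changed files are shown below.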
Showing 7 changed files with 466 additions and 82 deletions.
@@ -0,0 +1,60 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "Error",
     "evalue": "Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
     "traceback": [
      "Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
      "at b.startServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:270430)",
      "at async b.createServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:269873)",
      "at async connect (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:397876)",
      "at async w.ensureConnectionAndNotebookImpl (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556625)",
      "at async w.ensureConnectionAndNotebook (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556303)",
      "at async w.clearResult (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:552346)",
      "at async w.reexecuteCell (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:540374)",
      "at async w.reexecuteCells (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:537541)"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Spark session & context\n",
    "spark = SparkSession.builder.master('local').getOrCreate()\n",
    "sc = spark.sparkContext\n",
    "\n",
    "# Sum of the first 100 whole numbers\n",
    "rdd = sc.parallelize(range(100 + 1))\n",
    "rdd.sum()\n",
    "# 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
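Note that this committed notebook still carries a recorded error output from the author's local VS Code session (the /Users/romain/.vscode paths above). It is harmless for the test below, because nbconvert --execute re-runs the cell and replaces its outputs, but it could be stripped before committing. A minimal cleanup sketch using the nbformat API; the filename is taken from the test parametrization at the end of this commit:

```python
import nbformat

# Drop recorded outputs (including the stale VS Code error) and reset
# execution counters, then write the notebook back in place.
nb = nbformat.read("local_pyspark.ipynb", as_version=4)
for cell in nb.cells:
    if cell.cell_type == "code":
        cell.outputs = []
        cell.execution_count = None
nbformat.write(nb, "local_pyspark.ipynb")
```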
@@ -0,0 +1,41 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "library(SparkR)\n",
    "\n",
    "# Spark session & context\n",
    "sc <- sparkR.session(\"local\")\n",
    "\n",
    "# Sum of the first 100 whole numbers\n",
    "sdf <- createDataFrame(list(1:100))\n",
    "dapplyCollect(sdf,\n",
    "              function(x) \n",
    "              { x <- sum(x)}\n",
    "             )\n",
    "# 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
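In SparkR, the first positional argument of sparkR.session() is the Spark master, so sparkR.session("local") starts a local session. dapplyCollect() applies the given R function to each partition of the SparkDataFrame and collects the results back to the driver; here the function sums every value it receives, giving 5050.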
@@ -0,0 +1,43 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "library(sparklyr)\n",
    "\n",
    "# get the default config\n",
    "conf <- spark_config()\n",
    "# Set the catalog implementation in-memory\n",
    "conf$spark.sql.catalogImplementation <- \"in-memory\"\n",
    "\n",
    "# Spark session & context\n",
    "sc <- spark_connect(master = \"local\", config = conf)\n",
    "\n",
    "# Sum of the first 100 whole numbers\n",
    "sdf_len(sc, 100, repartition = 1) %>% \n",
    "    spark_apply(function(e) sum(e))\n",
    "# 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
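The sparklyr notebook sets spark.sql.catalogImplementation to "in-memory" before connecting, so the session does not try to use a Hive catalog. sdf_len() then builds a single-partition dataframe holding 1 through 100, and spark_apply() runs the summing function on it. Note that this notebook is not yet executed by the test module at the end of this commit (see the TODO in its parametrize list).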
@@ -0,0 +1,63 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%init_spark\n",
    "# Spark session & context\n",
    "launcher.master = \"local\"\n",
    "launcher.conf.spark.executor.cores = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:28\n",
       "res4: Double = 5050.0\n"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Sum of the first 100 whole numbers\n",
    "val rdd = sc.parallelize(0 to 100)\n",
    "rdd.sum()\n",
    "// 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "spylon-kernel",
   "language": "scala",
   "name": "spylon-kernel"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "help_links": [
    {
     "text": "MetaKernel Magics",
     "url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
    }
   ],
   "mimetype": "text/x-scala",
   "name": "scala",
   "pygments_lexer": "scala",
   "version": "0.4.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
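spylon-kernel is configured through the %%init_spark cell magic before the Spark session starts: assignments to the launcher object (launcher.master, launcher.conf.spark.*) become the session configuration, and the Scala cells that follow find sc already defined.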
@@ -0,0 +1,89 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Waiting for a Spark session to start..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "spark://master:7077\n"
     ]
    }
   ],
   "source": [
    "// should print the value of --master in the kernel spec\n",
    "println(sc.master)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Waiting for a Spark session to start..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "rdd = ParallelCollectionRDD[0] at parallelize at <console>:28\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "5050.0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Sum of the first 100 whole numbers\n",
    "val rdd = sc.parallelize(0 to 100)\n",
    "rdd.sum()\n",
    "// 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Apache Toree - Scala",
   "language": "scala",
   "name": "apache_toree_scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "mimetype": "text/x-scala",
   "name": "scala",
   "pygments_lexer": "scala",
   "version": "2.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
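Apache Toree, by contrast, takes its Spark configuration from the kernelspec's command line rather than from a notebook cell, which is why the first cell simply prints sc.master to confirm the value passed via --master (here spark://master:7077, a standalone cluster rather than local mode).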
@@ -0,0 +1,35 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import logging
import os

import pytest

LOGGER = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "test_file",
    # TODO: add local_sparklyr
    ["local_pyspark", "local_spylon", "local_toree", "local_sparkR"],
)
def test_nbconvert(container, test_file):
    """Check that the Spark notebooks can be executed"""
    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    cont_data_dir = "/home/jovyan/data"
    output_dir = "/tmp"
    timeout = 600  # execution timeout in seconds, passed to nbconvert
    LOGGER.info(f"Test that {test_file} notebook can be executed ...")
    command = f"jupyter nbconvert --to markdown --ExecutePreprocessor.timeout={timeout} --output-dir {output_dir} --execute {cont_data_dir}/{test_file}.ipynb"
    c = container.run(
        volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
    rv = c.wait(timeout=timeout / 10 + 10)
    assert rv == 0 or rv["StatusCode"] == 0, f"Command {command} failed"
    logs = c.logs(stdout=True).decode("utf-8")
    LOGGER.debug(logs)
    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"
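One detail worth calling out: container.wait() returned a plain integer exit code in docker-py releases before 3.0 and returns a dict with a StatusCode key in later ones, which is why the assertion accepts both shapes. A small helper, sketched here and not part of the commit, makes that intent explicit:

```python
def exit_code(wait_result):
    """Normalize container.wait() across docker-py versions.

    docker-py < 3.0 returns a plain int exit code; 3.0 and later
    return a dict such as {"StatusCode": 0}.
    """
    if isinstance(wait_result, int):
        return wait_result
    return wait_result["StatusCode"]


# Usage in the test above would then read:
#   assert exit_code(c.wait(timeout=timeout / 10 + 10)) == 0, f"Command {command} failed"
```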