Merge pull request #1098 from romainx/spark_doc
Follow-on PR #911: Spark documentation rework
parente authored May 29, 2020
2 parents 2c0af4a + c83024c commit 8d4bff6
Showing 7 changed files with 466 additions and 82 deletions.
60 changes: 60 additions & 0 deletions all-spark-notebook/test/data/local_pyspark.ipynb
@@ -0,0 +1,60 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "Error",
"evalue": "Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"traceback": [
"Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"at b.startServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:270430)",
"at async b.createServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:269873)",
"at async connect (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:397876)",
"at async w.ensureConnectionAndNotebookImpl (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556625)",
"at async w.ensureConnectionAndNotebook (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556303)",
"at async w.clearResult (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:552346)",
"at async w.reexecuteCell (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:540374)",
"at async w.reexecuteCells (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:537541)"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Spark session & context\n",
"spark = SparkSession.builder.master('local').getOrCreate()\n",
"sc = spark.sparkContext\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"rdd = sc.parallelize(range(100 + 1))\n",
"rdd.sum()\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
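
Note: the cell above boils down to the following standalone script, shown here as a sketch for anyone who wants to reproduce the computation outside the notebook. It assumes pyspark is importable in the image (as it is in all-spark-notebook) and is not part of this diff.

# Standalone equivalent of the local_pyspark.ipynb cell (illustration only)
from pyspark.sql import SparkSession

# Spark session & context
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# Sum of the first 100 whole numbers
rdd = sc.parallelize(range(100 + 1))
print(rdd.sum())  # 5050

spark.stop()
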
41 changes: 41 additions & 0 deletions all-spark-notebook/test/data/local_sparkR.ipynb
@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(SparkR)\n",
"\n",
"# Spark session & context\n",
"sc <- sparkR.session(\"local\")\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf <- createDataFrame(list(1:100))\n",
"dapplyCollect(sdf,\n",
" function(x) \n",
" { x <- sum(x)}\n",
" )\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
43 changes: 43 additions & 0 deletions all-spark-notebook/test/data/local_sparklyr.ipynb
@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(sparklyr)\n",
"\n",
"# get the default config\n",
"conf <- spark_config()\n",
"# Set the catalog implementation in-memory\n",
"conf$spark.sql.catalogImplementation <- \"in-memory\"\n",
"\n",
"# Spark session & context\n",
"sc <- spark_connect(master = \"local\", config = conf)\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf_len(sc, 100, repartition = 1) %>% \n",
" spark_apply(function(e) sum(e))\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
63 changes: 63 additions & 0 deletions all-spark-notebook/test/data/local_spylon.ipynb
@@ -0,0 +1,63 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%%init_spark\n",
"# Spark session & context\n",
"launcher.master = \"local\"\n",
"launcher.conf.spark.executor.cores = 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:28\n",
"res4: Double = 5050.0\n"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
89 changes: 89 additions & 0 deletions all-spark-notebook/test/data/local_toree.ipynb
@@ -0,0 +1,89 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"spark://master:7077\n"
]
}
],
"source": [
"// should print the value of --master in the kernel spec\n",
"println(sc.master)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"rdd = ParallelCollectionRDD[0] at parallelize at <console>:28\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5050.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
35 changes: 35 additions & 0 deletions all-spark-notebook/test/test_spark_notebooks.py
@@ -0,0 +1,35 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import logging
import os

import pytest

LOGGER = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "test_file",
    # TODO: add local_sparklyr
    ["local_pyspark", "local_spylon", "local_toree", "local_sparkR"],
)
def test_nbconvert(container, test_file):
    """Check that the Spark notebooks can be executed via nbconvert"""
    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    cont_data_dir = "/home/jovyan/data"
    output_dir = "/tmp"
    # nbconvert's ExecutePreprocessor timeout is expressed in seconds
    timeout_s = 600
    LOGGER.info(f"Test that {test_file} notebook can be executed ...")
    command = f"jupyter nbconvert --to markdown --ExecutePreprocessor.timeout={timeout_s} --output-dir {output_dir} --execute {cont_data_dir}/{test_file}.ipynb"
    c = container.run(
        volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
    # docker-py returns a dict with a StatusCode key; some fixtures return a bare int
    rv = c.wait(timeout=timeout_s / 10 + 10)
    assert rv == 0 or rv["StatusCode"] == 0, f"Command {command} failed"
    logs = c.logs(stdout=True).decode("utf-8")
    LOGGER.debug(logs)
    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"
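
Note: the container fixture used by test_nbconvert is defined in the repository's shared conftest.py, which is not part of this diff. Below is a minimal sketch of what such a fixture could look like, assuming the docker Python SDK (docker-py); the class and image names are hypothetical and the real fixture differs.

# conftest.py -- hypothetical sketch of the container fixture (not the real one)
import docker
import pytest


class TrackedContainer:
    """Runs containers from a fixed image and remembers them for cleanup."""

    def __init__(self, client, image):
        self.client = client
        self.image = image
        self.containers = []

    def run(self, **kwargs):
        # detach=True so the test can call wait() and logs() on the handle
        container = self.client.containers.run(self.image, detach=True, **kwargs)
        self.containers.append(container)
        return container


@pytest.fixture(scope="function")
def container():
    client = docker.from_env()
    tracked = TrackedContainer(client, "jupyter/all-spark-notebook:latest")
    yield tracked
    for c in tracked.containers:
        c.remove(force=True)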