From f77c8b887e0418f37d2ab0286a8d5b320a26debc Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Thu, 9 Feb 2023 15:49:09 -0800 Subject: [PATCH 01/19] Python Druid API for use in notebooks Revises existing notebooks and readme to reference the new API. Notebook to explain the new API. Split README into a console version and a notebook version to work around lack of a nice display for md files. --- .gitignore | 1 + docs/tutorials/tutorial-jupyter-index.md | 80 +- .../jupyter-notebooks/-START HERE-.ipynb | 180 +++ .../Python_API_Tutorial.ipynb | 1022 +++++++++++++++++ .../quickstart/jupyter-notebooks/README.md | 68 +- .../jupyter-notebooks/api-tutorial.ipynb | 4 +- .../jupyter-notebooks/druidapi/__init__.py | 33 + .../jupyter-notebooks/druidapi/base_table.py | 113 ++ .../jupyter-notebooks/druidapi/catalog.py | 60 + .../jupyter-notebooks/druidapi/consts.py | 56 + .../jupyter-notebooks/druidapi/datasource.py | 80 ++ .../jupyter-notebooks/druidapi/display.py | 84 ++ .../jupyter-notebooks/druidapi/druid.py | 67 ++ .../jupyter-notebooks/druidapi/error.py | 32 + .../jupyter-notebooks/druidapi/html_table.py | 121 ++ .../jupyter-notebooks/druidapi/rest.py | 178 +++ .../jupyter-notebooks/druidapi/sql.py | 693 +++++++++++ .../jupyter-notebooks/druidapi/status.py | 99 ++ .../jupyter-notebooks/druidapi/tasks.py | 178 +++ .../jupyter-notebooks/druidapi/text_table.py | 161 +++ .../jupyter-notebooks/druidapi/util.py | 30 + 21 files changed, 3282 insertions(+), 58 deletions(-) create mode 100644 examples/quickstart/jupyter-notebooks/-START HERE-.ipynb create mode 100644 examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/__init__.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/base_table.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/catalog.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/consts.py create mode 100644 
examples/quickstart/jupyter-notebooks/druidapi/datasource.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/display.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/druid.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/error.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/html_table.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/rest.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/sql.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/status.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/tasks.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/text_table.py create mode 100644 examples/quickstart/jupyter-notebooks/druidapi/util.py diff --git a/.gitignore b/.gitignore index d6ecf2b79524..14e9778c3d75 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ integration-tests/gen-scripts/ /bin/ *.hprof **/.ipynb_checkpoints/ +*.pyc diff --git a/docs/tutorials/tutorial-jupyter-index.md b/docs/tutorials/tutorial-jupyter-index.md index 012bb7e16cd3..ce479c490495 100644 --- a/docs/tutorials/tutorial-jupyter-index.md +++ b/docs/tutorials/tutorial-jupyter-index.md @@ -22,50 +22,84 @@ title: "Jupyter Notebook tutorials" ~ under the License. --> - + -You can try out the Druid APIs using the Jupyter Notebook-based tutorials. These tutorials provide snippets of Python code that you can use to run calls against the Druid API to complete the tutorial. - -## Prerequisites +You can try out the Druid APIs using the Jupyter Notebook-based tutorials. These +tutorials provide snippets of Python code that you can use to run calls against +the Druid API to complete the tutorial. Make sure you meet the following requirements before starting the Jupyter-based tutorials: -- Python 3 +- Python 3 + +- The `requests` package for Python. 
For example, you can install it with the following command: -- The `requests` package for Python. For example, you can install it with the following command: - ```bash pip3 install requests - ``` + ``` -- JupyterLab (recommended) or Jupyter Notebook running on a non-default port. By default, Druid and Jupyter both try to use port `8888,` so start Jupyter on a different port. +- JupyterLab (recommended) or Jupyter Notebook running on a non-default port. By default, Druid + and Jupyter both try to use port `8888`, so start Jupyter on a different port. - Install JupyterLab or Notebook: - + ```bash - # Install JupyterLab - pip3 install jupyterlab - # Install Jupyter Notebook - pip3 install notebook + # Install JupyterLab + pip3 install jupyterlab + # Install Jupyter Notebook + pip3 install notebook ``` - - Start Jupyter - - JupyterLab + - Start Jupyter: + - JupyterLab ```bash # Start JupyterLab on port 3001 jupyter lab --port 3001 ``` - Jupyter Notebook - ```bash - # Start Jupyter Notebook on port 3001 - jupyter notebook --port 3001 - ``` + ```bash + # Start Jupyter Notebook on port 3001 + jupyter notebook --port 3001 + ``` + +- An available Druid instance. You can use the `micro-quickstart` configuration + described in [Quickstart](./index.md). The tutorials + assume that you are using the quickstart, so no authentication or authorization + is expected unless explicitly mentioned. + + Druid developers can use a cluster launched for an integration test: + + ```bash + cd $DRUID_DEV + ./it.sh build + ./it.sh image + ./it.sh up <category> + ``` + + Where `DRUID_DEV` points to your Druid source code repo, and `<category>` is one + of the available integration test categories. See the integration test `README.md` + for details. + +## Simple Druid API -- An available Druid instance. You can use the [Quickstart (local)](./index.md) instance. The tutorials assume that you are using the quickstart, so no authentication or authorization is expected unless explicitly mentioned.
+One of the notebooks shows how to use the Druid REST API. The others focus on other +topics and use a simple set of Python wrappers around the underlying REST API. The +wrappers reside in the `druidapi` package within the notebooks directory. While the package +can be used in any Python program, the key purpose, at present, is to support these +notebooks. See the [Introduction to the Druid Python API]( +https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb) +for an overview of the Python API. ## Tutorials -The notebooks are located in the [apache/druid repo](https://github.com/apache/druid/tree/master/examples/quickstart/jupyter-notebooks/). You can either clone the repo or download the notebooks you want individually. +The notebooks are located in the [apache/druid repo](https://github.com/apache/druid/tree/master/examples/quickstart/jupyter-notebooks/). You can either clone the repo or download the notebooks you want individually. The links that follow are the raw GitHub URLs, so you can use them to download the notebook directly, such as with `wget`, or manually through your web browser. Note that if you save the file from your web browser, make sure to remove the `.txt` extension. -- [Introduction to the Druid API](https://raw.githubusercontent.com/apache/druid/master/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb) walks you through some of the basics related to the Druid API and several endpoints. \ No newline at end of file +- [Introduction to the Druid REST API]( + https://raw.githubusercontent.com/apache/druid/master/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb) + walks you through some of the basics related to the Druid REST API and several endpoints.
+- [Introduction to the Druid Python API]( + https://raw.githubusercontent.com/apache/druid/master/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb) + walks you through some of the basics related to the Druid API using the Python wrapper API. diff --git a/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb b/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb new file mode 100644 index 000000000000..d19f6f77b777 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e415d732", + "metadata": {}, + "source": [ + "# Jupyter Notebook tutorials for Druid\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "You can try out the Druid APIs using the Jupyter Notebook-based tutorials. These\n", + "tutorials provide snippets of Python code that you can use to run calls against\n", + "the Druid API to complete the tutorial." + ] + }, + { + "cell_type": "markdown", + "id": "c11a86a4", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "To get this far, you've installed Python 3 and Jupyter Notebook. Make sure you meet the following requirements before starting the Jupyter-based tutorials:\n", + "\n", + "- The `requests` package for Python. For example, you can install it with the following command:\n", + "\n", + " ```bash\n", + " pip3 install requests\n", + " ```\n", + "\n", + "- JupyterLab (recommended) or Jupyter Notebook running on a non-default port. By default, Druid\n", + " and Jupyter both try to use port `8888`, so start Jupyter on a different port.\n", + "\n", + "- An available Druid instance.
You can use the `micro-quickstart` configuration\n", + " described in [Quickstart](https://druid.apache.org/docs/latest/tutorials/index.html).\n", + " The tutorials assume that you are using the quickstart, so no authentication or authorization\n", + " is expected unless explicitly mentioned.\n", + "\n", + " Druid developers can use a cluster launched for an integration test:\n", + "\n", + " ```bash\n", + " cd $DRUID_DEV\n", + " ./it.sh build\n", + " ./it.sh image\n", + " ./it.sh up <category>\n", + " ```\n", + "\n", + " Where `DRUID_DEV` points to your Druid source code repo, and `<category>` is one\n", + " of the available integration test categories. See the integration test `README.md`\n", + " for details." + ] + }, + { + "cell_type": "markdown", + "id": "60015702", + "metadata": {}, + "source": [ + "## Simple Druid API\n", + "\n", + "One of the notebooks shows how to use the Druid REST API. The others focus on other\n", + "topics and use a simple set of Python wrappers around the underlying REST API. The\n", + "wrappers reside in the `druidapi` package within this directory. While the package\n", + "can be used in any Python program, the key purpose, at present, is to support these\n", + "notebooks. See the [Introduction to the Druid Python API](Python_API_Tutorial.ipynb)\n", + "for an overview of the Python API." + ] + }, + { + "cell_type": "markdown", + "id": "d9e18342", + "metadata": {}, + "source": [ + "## Tutorials\n", + "\n", + "The notebooks are located in the [apache/druid repo](\n", + "https://github.com/apache/druid/tree/master/examples/quickstart/jupyter-notebooks/).\n", + "You can either clone the repo or download the notebooks you want individually.\n", + "\n", + "The links that follow are the raw GitHub URLs, so you can use them to download the\n", + "notebook directly, such as with `wget`, or manually through your web browser.
Note\n", + "that if you save the file from your web browser, make sure to remove the `.txt` extension.\n", + "\n", + "- [Introduction to the Druid REST API](api-tutorial.ipynb) walks you through some of the\n", + " basics related to the Druid REST API and several endpoints.\n", + "- [Introduction to the Druid Python API](Python_API_Tutorial.ipynb) walks you through some of the\n", + " basics related to the Druid API using the Python wrapper API." + ] + }, + { + "cell_type": "markdown", + "id": "1a4b986a", + "metadata": {}, + "source": [ + "## Contributing\n", + "\n", + "If you build a Jupyter tutorial, you need to do a few things to add it to the docs\n", + "in addition to saving the notebook in this directory. The process requires two PRs to the repo.\n", + "\n", + "For the first PR, do the following:\n", + "\n", + "1. Depending on the goal of the notebook, you may want to clear the outputs from your notebook\n", + " before you make the PR. You can use the following command:\n", + "\n", + " ```bash\n", + " jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace ./path/to/notebook/notebookName.ipynb\n", + " ```\n", + "\n", + "2. Create the PR as you normally would. Make sure to note that this PR is the one that\n", + " contains only the Jupyter notebook and that there will be a subsequent PR that updates\n", + " related pages.\n", + "\n", + "3. After this first PR is merged, grab the \"raw\" URL for the file from GitHub. For example,\n", + " navigate to the file in the GitHub web UI and select **Raw**. Use the URL for this in the\n", + " second PR as the download link.\n", + "\n", + "For the second PR, do the following:\n", + "\n", + "1. Update the list of [Tutorials](#tutorials) on this page and in the\n", + " [Jupyter tutorial index page](../../../docs/tutorials/tutorial-jupyter-index.md#tutorials)\n", + " in the `docs/tutorials` directory.\n", + "\n", + "2.
Update `tutorial-jupyter-index.md` and provide the URL to the raw version of the file\n", + " that becomes available after the first PR is merged.\n", + "\n", + "Note that you can skip the second PR, if you just copy the prefix link from one of the\n", + "existing notebook links when doing your first PR." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e6f2a0e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb new file mode 100644 index 000000000000..32cfe2811f75 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -0,0 +1,1022 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ce2efaaa", + "metadata": {}, + "source": [ + "# Tutorial: Learn the Druid Python API\n", + "\n", + "This notebook provides a quick introduction to the Python wrapper around the [Druid REST API](api-tutorial.ipynb). This notebook assumes you are familiar with the basics of the REST API, and the [set of operations which Druid provides](https://druid.apache.org/docs/latest/operations/api-reference.html). Here we focus on using Python to access those APIs rather than explaining the APIs themselves. The APIs themselves are covered in other notebooks that use the Python API.\n", + "\n", + "The Druid Python API is primarily intended to help with these notebook tutorials. 
It can also be used in a regular Python program, as long as the IPython dependencies are available.\n", + "\n", + "The Druid Python API is a work in progress. We add API wrappers as needed for the notebook tutorials. If you find you need additional wrappers, please feel free to add them, and post a PR to Apache Druid with your additions.\n", + "\n", + "The API provides two levels of functions. Most are simple wrappers around Druid's REST APIs. Others add additional code to make the API easier to use. The SQL query interface is a prime example: extra code translates a simple SQL query into Druid's `SQLQuery` object and interprets the results into a form that can be displayed in a notebook.\n", + "\n", + "We start by importing the `druidapi` package from the same folder as this notebook. The `styles()` call adds some CSS styles needed to display results." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6d90ca5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import druidapi\n", + "druidapi.styles()" + ] + }, + { + "cell_type": "markdown", + "id": "fb68a838", + "metadata": {}, + "source": [ + "Next we connect to our cluster by providing the router endpoint. Here we assume the cluster is on your local machine, using the default port. Go ahead and change this if your setup is different.\n", + "\n", + "The API uses the router to forward messages to each of Druid's services so that we don't have to keep track of the host and port for each service."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ae601081", + "metadata": {}, + "outputs": [], + "source": [ + "druid = druidapi.client(\"http://localhost:8888\")" + ] + }, + { + "cell_type": "markdown", + "id": "8b4e774b", + "metadata": {}, + "source": [ + "## Status Client\n", + "\n", + "The SDK groups Druid REST API calls into categories, with a client for each. Let's start with the status client." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ff16fc3b", + "metadata": {}, + "outputs": [], + "source": [ + "status_client = druid.status()" + ] + }, + { + "cell_type": "markdown", + "id": "be992774", + "metadata": {}, + "source": [ + "Use the Python help() function to learn what methods are available." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "03f26417", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on StatusClient in module druidapi.status object:\n", + "\n", + "class StatusClient(builtins.object)\n", + " | StatusClient(rest_client)\n", + " | \n", + " | Client for status APIs. These APIs are available on all nodes.\n", + " | If used with the router, they report the status of just the router.\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(self, rest_client)\n", + " | Initialize self.
See help(type(self)) for accurate signature.\n", + " | \n", + " | brokers(self)\n", + " | \n", + " | in_cluster(self)\n", + " | Returns `True` if the node is visible wihtin the cluster, `False` if not.\n", + " | (That is, returns the value of the `{\"selfDiscovered\": true/false}`\n", + " | field in the response.\n", + " | \n", + " | GET `/status/selfDiscovered/status`\n", + " | \n", + " | See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information\n", + " | \n", + " | is_healthy(self) -> bool\n", + " | Returns `True` if the node is healthy, an exception otherwise.\n", + " | Useful for automated health checks.\n", + " | \n", + " | GET `/status/health`\n", + " | \n", + " | See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information\n", + " | \n", + " | properties(self) -> map\n", + " | Returns the effective set of Java properties used by the service, including\n", + " | system properties and properties from the `common_runtime.propeties` and\n", + " | `runtime.properties` files.\n", + " | \n", + " | GET `/status/properties`\n", + " | \n", + " | See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information\n", + " | \n", + " | status(self)\n", + " | Returns the Druid version, loaded extensions, memory used, total memory \n", + " | and other useful information about the process.\n", + " | \n", + " | GET `/status`\n", + " | \n", + " | See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information\n", + " | \n", + " | version(self)\n", + " | \n", + " | wait_until_ready(self)\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + "\n" + ] + } + ], + "source": [ + "help(status_client)" + ] + }, 
+ { + "cell_type": "markdown", + "id": "70f3d578", + "metadata": {}, + "source": [ + "Druid servers return unexpected results if we make REST calls while Druid starts up. Let's wait until things are ready. The following will run until the server is ready. If you forgot to start your server, or the URL above is wrong, this will hang forever. Use the Kernel → Interrupt command to break out of the function. (Or, start your server. If your server refuses to start, then this Jupyter Notebook may be running on port 8888. See the [README](README.md) for how to start on a different port.)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "114ed0d1", + "metadata": {}, + "outputs": [], + "source": [ + "status_client.wait_until_ready()" + ] + }, + { + "cell_type": "markdown", + "id": "e803c9fe", + "metadata": {}, + "source": [ + "Check the version of your cluster. Some of these notebooks illustrate newer features available only on specific versions of Druid." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2faa0d81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'26.0.0-SNAPSHOT'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "status_client.version()" + ] + }, + { + "cell_type": "markdown", + "id": "d78a6c35", + "metadata": {}, + "source": [ + "You can also check which extensions are loaded in your cluster. Some notebooks require specific extensions to be available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1001f412", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[\"druid-hdfs-storage\", \"druid-kafka-indexing-service\", \"druid-datasketches\", \"druid-multi-stage-query\", \"druid-lookups-cached-global\", \"druid-catalog\"]'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "status_client.properties()['druid.extensions.loadList']" + ] + }, + { + "cell_type": "markdown", + "id": "e618366f", + "metadata": {}, + "source": [ + "## SQL Client\n", + "\n", + "Running SQL queries in a notebook is easy. Our goal here is to run a query and display results. The [pydruid](https://pythonhosted.org/pydruid/) library provides a robust way to run native queries, to run SQL queries, and to convert the results to various formats. Here our goal is just to interact with Druid." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3a9ba661", + "metadata": {}, + "outputs": [], + "source": [ + "sql_client = druid.sql()" + ] + }, + { + "cell_type": "markdown", + "id": "dfcc1d63", + "metadata": {}, + "source": [ + "We can start with getting a list of schemas." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "271b3a67", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
SchemaName
INFORMATION_SCHEMA
druid
ext
lookup
sys
view
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.show_schemas()" + ] + }, + { + "cell_type": "markdown", + "id": "d2adb6fe", + "metadata": {}, + "source": [ + "We can also see the tables (or datasources) within any schema." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "74ad71be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
TableName
COLUMNS
PARAMETERS
SCHEMATA
TABLES
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.show_tables('INFORMATION_SCHEMA')" + ] + }, + { + "cell_type": "markdown", + "id": "915c5630", + "metadata": {}, + "source": [ + "We see the list of datasources by default. You'll get an empty result if you have no datasources yet." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "610d3444", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "
TableName
myWiki
myWiki3
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.show_tables()" + ] + }, + { + "cell_type": "markdown", + "id": "2b6df996", + "metadata": {}, + "source": [ + "We can easily run a query and show the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e11bceba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
TABLE_NAME
COLUMNS
PARAMETERS
SCHEMATA
TABLES
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql = '''\n", + "SELECT TABLE_NAME\n", + "FROM INFORMATION_SCHEMA.TABLES\n", + "WHERE TABLE_SCHEMA = 'INFORMATION_SCHEMA'\n", + "'''\n", + "sql_client.show(sql)" + ] + }, + { + "cell_type": "markdown", + "id": "c8aaffa9", + "metadata": {}, + "source": [ + "The query above showed the same results as `show_tables()`. That is not surprising: `show_tables()` just runs this query for us." + ] + }, + { + "cell_type": "markdown", + "id": "be4c481a", + "metadata": {}, + "source": [ + "The API also allows passing context parameters and query parameters using a request object. Druid will work out the query parameter type based on the Python type. Pass context values as a Python `dict`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bb06e99d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
TABLE_NAME
COLUMNS
PARAMETERS
SCHEMATA
TABLES
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql = '''\n", + "SELECT TABLE_NAME\n", + "FROM INFORMATION_SCHEMA.TABLES\n", + "WHERE TABLE_SCHEMA = ?\n", + "'''\n", + "req = sql_client.sql_request(sql)\n", + "req.add_parameter('INFORMATION_SCHEMA')\n", + "req.with_context({\"someParameter\": \"someValue\"})\n", + "sql_client.show(req)" + ] + }, + { + "cell_type": "markdown", + "id": "543945f3", + "metadata": {}, + "source": [ + "The request has other features for advanced use cases: see the code for details. The query API actually returns a SQL response object. Use this if you want to get the values directly, work with the schema, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b1497972", + "metadata": {}, + "outputs": [], + "source": [ + "sql = '''\n", + "SELECT TABLE_NAME\n", + "FROM INFORMATION_SCHEMA.TABLES\n", + "WHERE TABLE_SCHEMA = 'INFORMATION_SCHEMA'\n", + "'''\n", + "resp = sql_client.sql_query(sql)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "50f5384d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TABLE_NAME VARCHAR string\n" + ] + } + ], + "source": [ + "col1 = resp.schema()[0]\n", + "print(col1.name, col1.sql_type, col1.druid_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7ba92de5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'TABLE_NAME': 'COLUMNS'},\n", + " {'TABLE_NAME': 'PARAMETERS'},\n", + " {'TABLE_NAME': 'SCHEMATA'},\n", + " {'TABLE_NAME': 'TABLES'}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "resp.rows()" + ] + }, + { + "cell_type": "markdown", + "id": "2ecba1c2", + "metadata": {}, + "source": [ + "The `show()` method uses this information to format an HTML table to present the results."
+ ] + }, + { + "cell_type": "markdown", + "id": "8d071790", + "metadata": {}, + "source": [ + "## MSQ Ingestion\n", + "\n", + "The SQL client also performs MSQ-based ingestion using `INSERT` or `REPLACE` statements. Use the extension check above to ensure that `druid-multi-stage-query` is loaded in Druid 26. (Later versions may have MSQ built in.)\n", + "\n", + "An MSQ query is run using a different API: `task()`. This API returns a response object that describes the Overlord task which runs the MSQ query. For tutorials, our data is usually small enough that we just want to wait for the ingestion to complete. We do that with the `run_task()` call which handles the waiting for us. To illustrate, let's use a query that ingests a subset of columns, and includes a few data clean-up steps:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "f1e3b9ac", + "metadata": {}, + "outputs": [], + "source": [ + "sql = '''\n", + "REPLACE INTO \"myWiki1\" OVERWRITE ALL\n", + "SELECT\n", + " TIME_PARSE(\"timestamp\") AS \"__time\",\n", + " namespace,\n", + " page,\n", + " channel,\n", + " \"user\",\n", + " countryName,\n", + " CASE WHEN isRobot = 'true' THEN 1 ELSE 0 END AS isRobot,\n", + " \"added\",\n", + " \"delta\",\n", + " CASE WHEN isNew = 'true' THEN 1 ELSE 0 END AS isNew,\n", + " CAST(\"deltaBucket\" AS DOUBLE) AS deltaBucket,\n", + " \"deleted\"\n", + "FROM TABLE(\n", + " EXTERN(\n", + " '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n", + " '{\"type\":\"json\"}',\n", + "
'[{\"name\":\"isRobot\",\"type\":\"string\"},{\"name\":\"channel\",\"type\":\"string\"},{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"flags\",\"type\":\"string\"},{\"name\":\"isUnpatrolled\",\"type\":\"string\"},{\"name\":\"page\",\"type\":\"string\"},{\"name\":\"diffUrl\",\"type\":\"string\"},{\"name\":\"added\",\"type\":\"long\"},{\"name\":\"comment\",\"type\":\"string\"},{\"name\":\"commentLength\",\"type\":\"long\"},{\"name\":\"isNew\",\"type\":\"string\"},{\"name\":\"isMinor\",\"type\":\"string\"},{\"name\":\"delta\",\"type\":\"long\"},{\"name\":\"isAnonymous\",\"type\":\"string\"},{\"name\":\"user\",\"type\":\"string\"},{\"name\":\"deltaBucket\",\"type\":\"long\"},{\"name\":\"deleted\",\"type\":\"long\"},{\"name\":\"namespace\",\"type\":\"string\"},{\"name\":\"cityName\",\"type\":\"string\"},{\"name\":\"countryName\",\"type\":\"string\"},{\"name\":\"regionIsoCode\",\"type\":\"string\"},{\"name\":\"metroCode\",\"type\":\"long\"},{\"name\":\"countryIsoCode\",\"type\":\"string\"},{\"name\":\"regionName\",\"type\":\"string\"}]'\n", + " )\n", + ")\n", + "PARTITIONED BY DAY\n", + "CLUSTERED BY namespace, page\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "2d925dc6", + "metadata": {}, + "outputs": [], + "source": [ + "sql_client.run_task(sql)" + ] + }, + { + "cell_type": "markdown", + "id": "e5fcbf1b", + "metadata": {}, + "source": [ + "MSQ reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments. Let's wait for the table to become ready." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "e08d7c30", + "metadata": {}, + "outputs": [], + "source": [ + "sql_client.wait_until_ready('myWiki1')" + ] + }, + { + "cell_type": "markdown", + "id": "11c21741", + "metadata": {}, + "source": [ + "`describe_table()` tells us about the columns in a table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "47e9701e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
PositionNameType
1__timeTIMESTAMP
2namespaceVARCHAR
3pageVARCHAR
4channelVARCHAR
5userVARCHAR
6countryNameVARCHAR
7isRobotBIGINT
8addedBIGINT
9deltaBIGINT
10isNewBIGINT
11deltaBucketDOUBLE
12deletedBIGINT
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.describe_table('myWiki1')" + ] + }, + { + "cell_type": "markdown", + "id": "59127223", + "metadata": {}, + "source": [ + "We can also sample a few rows of data." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "087ad2bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
__timenamespacepagechannelusercountryNameisRobotaddeddeltaisNewdeltaBucketdeleted
2016-06-27T00:00:11.080ZMainSalo Toraut#sv.wikipediaLsjbot1313110.00
2016-06-27T00:00:17.457Z利用者利用者:ワーナー成増/放送ウーマン賞#ja.wikipediaワーナー成増01251250100.00
2016-06-27T00:00:34.959ZMainBailando 2015#en.wikipedia181.230.118.178Argentina02200.00
2016-06-27T00:00:36.027ZMainRichie Rich's Christmas Wish#en.wikipediaJasonAQuest00-20-100.02
2016-06-27T00:00:46.874ZMainEl Olivo, Ascensión#sh.wikipediaKolega235710-10-100.01
2016-06-27T00:00:56.913ZMainBlowback (intelligence)#en.wikipediaBrokenshardz0767600.00
2016-06-27T00:00:58.599ZKategoriaKategoria:Dyskusje nad usunięciem artykułu zakończone bez konsensusu − lipiec 2016#pl.wikipediaBeau.bot12702701200.00
2016-06-27T00:01:01.364ZMainEl Paraíso, Bachíniva#sh.wikipediaKolega235710-10-100.01
2016-06-27T00:01:03.685ZMainEl Terco, Bachíniva#sh.wikipediaKolega235710-10-100.01
2016-06-27T00:01:07.347ZMainNeqerssuaq#ceb.wikipediaLsjbot14150415014100.00
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.show('SELECT * FROM myWiki1 LIMIT 10')" + ] + }, + { + "cell_type": "markdown", + "id": "c823e2ee", + "metadata": {}, + "source": [ + "## Datasource Client\n", + "\n", + "The Datasource client lets us perform operations on datasource objects. While the SQL layer lets us get metadata and do queries, the datasource client lets us work with the underlying segments. Explaining the full functionality is the topic of another notebook. For now, let's just use the datasource client to clean up the datasource created above. The `True` argument asks for \"if exists\" semantics so we don't get an error if the datasource was already deleted." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "ced100ae", + "metadata": {}, + "outputs": [], + "source": [ + "ds_client = druid.datasources()\n", + "ds_client.drop('myWiki', True)" + ] + }, + { + "cell_type": "markdown", + "id": "f9ba40eb", + "metadata": {}, + "source": [ + "## Tasks Client\n", + "\n", + "Use the tasks client to work with Overlord tasks. The `run_task()` call above actually uses the task client internally to poll Overlord."
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "0a8123e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'query-24066a63-7e20-41bb-b212-80f193e6f2c8-worker0_0',\n", + " 'groupId': 'query-24066a63-7e20-41bb-b212-80f193e6f2c8',\n", + " 'type': 'query_worker',\n", + " 'createdTime': '2023-02-09T22:49:01.761Z',\n", + " 'queueInsertionTime': '1970-01-01T00:00:00.000Z',\n", + " 'statusCode': 'SUCCESS',\n", + " 'status': 'SUCCESS',\n", + " 'runnerStatusCode': 'NONE',\n", + " 'duration': 57895,\n", + " 'location': {'host': 'localhost', 'port': 8101, 'tlsPort': -1},\n", + " 'dataSource': 'myWiki1',\n", + " 'errorMsg': None},\n", + " {'id': 'query-24066a63-7e20-41bb-b212-80f193e6f2c8',\n", + " 'groupId': 'query-24066a63-7e20-41bb-b212-80f193e6f2c8',\n", + " 'type': 'query_controller',\n", + " 'createdTime': '2023-02-09T22:48:30.512Z',\n", + " 'queueInsertionTime': '1970-01-01T00:00:00.000Z',\n", + " 'statusCode': 'SUCCESS',\n", + " 'status': 'SUCCESS',\n", + " 'runnerStatusCode': 'NONE',\n", + " 'duration': 92476,\n", + " 'location': {'host': 'localhost', 'port': 8100, 'tlsPort': -1},\n", + " 'dataSource': 'myWiki1',\n", + " 'errorMsg': None}]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task_client = druid.tasks()\n", + "task_client.tasks()" + ] + }, + { + "cell_type": "markdown", + "id": "b7156347", + "metadata": {}, + "source": [ + "## Constants\n", + "\n", + "Druid has a large number of special constants: type names, options, etc. 
The `consts` module provides definitions for many of these:" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "415a1ec1", + "metadata": {}, + "outputs": [], + "source": [ + "from druidapi import consts" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "2e183999", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on module druidapi.consts in druidapi:\n", + "\n", + "NAME\n", + " druidapi.consts\n", + "\n", + "DESCRIPTION\n", + " # Licensed to the Apache Software Foundation (ASF) under one or more\n", + " # contributor license agreements. See the NOTICE file distributed with\n", + " # this work for additional information regarding copyright ownership.\n", + " # The ASF licenses this file to You under the Apache License, Version 2.0\n", + " # (the \"License\"); you may not use this file except in compliance with\n", + " # the License. You may obtain a copy of the License at\n", + " #\n", + " # http://www.apache.org/licenses/LICENSE-2.0\n", + " #\n", + " # Unless required by applicable law or agreed to in writing, software\n", + " # distributed under the License is distributed on an \"AS IS\" BASIS,\n", + " # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + " # See the License for the specific language governing permissions and\n", + " # limitations under the License.\n", + "\n", + "DATA\n", + " COLUMNS_TABLE = 'INFORMATION_SCHEMA.COLUMNS'\n", + " COORD_BASE = '/druid/coordinator/v1'\n", + " DRUID_DOUBLE_TYPE = 'double'\n", + " DRUID_FLOAT_TYPE = 'float'\n", + " DRUID_LONG_TYPE = 'long'\n", + " DRUID_SCHEMA = 'druid'\n", + " DRUID_STRING_TYPE = 'string'\n", + " DRUID_TIMESTAMP_TYPE = 'timestamp'\n", + " EXT_SCHEMA = 'ext'\n", + " FAILED_STATE = 'FAILED'\n", + " OVERLORD_BASE = '/druid/indexer/v1'\n", + " ROUTER_BASE = '/druid/v2'\n", + " RUNNING_STATE = 'RUNNING'\n", + " SCHEMAS_TABLE = 'INFORMATION_SCHEMA.SCHEMATA'\n", + " SCHEMA_SCHEMA =
'INFORMATION_SCHEMA'\n", + " SQL_ARRAY = 'array'\n", + " SQL_ARRAY_TYPE = 'ARRAY'\n", + " SQL_ARRAY_WITH_TRAILER = 'arrayWithTrailer'\n", + " SQL_BIGINT_TYPE = 'BIGINT'\n", + " SQL_CSV = 'csv'\n", + " SQL_DOUBLE_TYPE = 'DOUBLE'\n", + " SQL_FLOAT_TYPE = 'FLOAT'\n", + " SQL_OBJECT = 'object'\n", + " SQL_TIMESTAMP_TYPE = 'TIMESTAMP'\n", + " SQL_VARCHAR_TYPE = 'VARCHAR'\n", + " SUCCESS_STATE = 'SUCCESS'\n", + " SYS_SCHEMA = 'sys'\n", + " TABLES_TABLE = 'INFORMATION_SCHEMA.TABLES'\n", + "\n", + "FILE\n", + " /Users/paul/git/druid/examples/quickstart/jupyter-notebooks/druidapi/consts.py\n", + "\n", + "\n" + ] + } + ], + "source": [ + "help(consts)" + ] + }, + { + "cell_type": "markdown", + "id": "877a0e63", + "metadata": {}, + "source": [ + "Using the constants avoids typos:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "0e5a555b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
TableName
segments
server_segments
servers
supervisors
tasks
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.show_tables(consts.SYS_SCHEMA)" + ] + }, + { + "cell_type": "markdown", + "id": "7b28893e", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "This notebook gave you a whirlwind tour of the Python Druid API: just enough to check your cluster, ingest some data with MSQ and query that data. Druid has many more APIs. As noted earlier, the Python API is a work in progress: the team adds new wrappers as needed for tutorials. Your [contributions](https://github.com/apache/druid/pulls) and [feedback](https://github.com/apache/druid/issues) are welcome." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056cbf27", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/quickstart/jupyter-notebooks/README.md b/examples/quickstart/jupyter-notebooks/README.md index 7e5fa2becaee..ffc9128fedd8 100644 --- a/examples/quickstart/jupyter-notebooks/README.md +++ b/examples/quickstart/jupyter-notebooks/README.md @@ -1,6 +1,11 @@ # Jupyter Notebook tutorials for Druid - +If you are reading this in Jupyter, switch over to the [- START HERE -](-START%20HERE-.ipynb) +notebook instead. + + -You can try out the Druid APIs using the Jupyter Notebook-based tutorials. These tutorials provide snippets of Python code that you can use to run calls against the Druid API to complete the tutorial. +You can try out the Druid APIs using the Jupyter Notebook-based tutorials. These
These +tutorials provide snippets of Python code that you can use to run calls against +the Druid API to complete the tutorial. ## Prerequisites Make sure you meet the following requirements before starting the Jupyter-based tutorials: -- Python 3 +- Python 3 + +- The `requests` package for Python. For example, you can install it with the following command: -- The `requests` package for Python. For example, you can install it with the following command: - ```bash pip3 install requests ```` -- JupyterLab (recommended) or Jupyter Notebook running on a non-default port. By default, Druid and Jupyter both try to use port `8888,` so start Jupyter on a different port. +- JupyterLab (recommended) or Jupyter Notebook running on a non-default port. By default, Druid + and Jupyter both try to use port `8888,` so start Jupyter on a different port. - Install JupyterLab or Notebook: - + ```bash # Install JupyterLab - pip3 install jupyterlab + pip3 install jupyterlab # Install Jupyter Notebook pip3 install notebook ``` - Start Jupyter: - - JupyterLab + - JupyterLab ```bash # Start JupyterLab on port 3001 jupyter lab --port 3001 @@ -57,33 +65,25 @@ Make sure you meet the following requirements before starting the Jupyter-based jupyter notebook --port 3001 ``` -- An available Druid instance. You can use the `micro-quickstart` configuration described in [Quickstart (local)](../../../docs/tutorials/index.md). The tutorials assume that you are using the quickstart, so no authentication or authorization is expected unless explicitly mentioned. - -## Tutorials - -The notebooks are located in the [apache/druid repo](https://github.com/apache/druid/tree/master/examples/quickstart/jupyter-notebooks/). You can either clone the repo or download the notebooks you want individually. - -The links that follow are the raw GitHub URLs, so you can use them to download the notebook directly, such as with `wget`, or manually through your web browser. 
Note that if you save the file from your web browser, make sure to remove the `.txt` extension. +- An available Druid instance. You can use the `micro-quickstart` configuration + described in [Quickstart](https://druid.apache.org/docs/latest/tutorials/index.html). + The tutorials assume that you are using the quickstart, so no authentication or authorization + is expected unless explicitly mentioned. -- [Introduction to the Druid API](api-tutorial.ipynb) walks you through some of the basics related to the Druid API and several endpoints. - -## Contributing - -If you build a Jupyter tutorial, you need to do a few things to add it to the docs in addition to saving the notebook in this directory. The process requires two PRs to the repo. - -For the first PR, do the following: - -1. Clear the outputs from your notebook before you make the PR. You can use the following command: - - ```bash - jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace ./path/to/notebook/notebookName.ipynb - ``` + Druid developers can use a cluster launched for an integration test: -2. Create the PR as you normally would. Make sure to note that this PR is the one that contains only the Jupyter notebook and that there will be a subsequent PR that updates related pages. + ```bash + cd $DRUID_DEV + ./it.sh build + ./it.sh image + ./it.sh up <category> + ``` -3. After this first PR is merged, grab the "raw" URL for the file from GitHub. For example, navigate to the file in the GitHub web UI and select **Raw**. Use the URL for this in the second PR as the download link. + Where `DRUID_DEV` points to your Druid source code repo, and `<category>` is one + of the available integration test categories. See the integration test `README.md` + for details. -For the second PR, do the following: +## Continue in Jupyter -1. Update the list of [Tutorials](#tutorials) on this page and in the [ Jupyter tutorial index page](../../../docs/tutorials/tutorial-jupyter-index.md#tutorials) in the `docs/tutorials` directory.
-2. Update `tutorial-jupyter-index.md` and provide the URL to the raw version of the file that becomes available after the first PR is merged. +Fire up Jupyter (see above) and navigate to the "- START HERE -" page for more +information. diff --git a/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb b/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb index b795babaefef..3dec934b8110 100644 --- a/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb @@ -38,6 +38,8 @@ "\n", "For more information, see the [API reference](https://druid.apache.org/docs/latest/operations/api-reference.html), which is organized by server type.\n", "\n", + "For work within other notebooks, prefer to use the [Python API](Python_API_Tutorial.ipynb) which is a notebook-friendly wrapper around the low-level API calls shown here.\n", + "\n", "## Table of contents\n", "\n", "- [Prerequisites](#Prerequisites)\n", @@ -481,7 +483,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.6" }, "vscode": { "interpreter": { diff --git a/examples/quickstart/jupyter-notebooks/druidapi/__init__.py b/examples/quickstart/jupyter-notebooks/druidapi/__init__.py new file mode 100644 index 000000000000..55c9769fa281 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/__init__.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .druid import DruidClient +from . import display, html_table + +def client(endpoint): + return DruidClient(endpoint) + +def styles(): + show_as_html() + html_table.styles() + +def show_as_text(): + display.display.text() + +def show_as_html(): + display.display.html() + +def _display(): + return display.display diff --git a/examples/quickstart/jupyter-notebooks/druidapi/base_table.py b/examples/quickstart/jupyter-notebooks/druidapi/base_table.py new file mode 100644 index 000000000000..a3d18a3bbb35 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/base_table.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ALIGN_LEFT = 0 +ALIGN_CENTER = 1 +ALIGN_RIGHT = 2 + +def padded(array, width, fill): + if array is not None and len(array) >= width: + return array + if array is None: + result = [] + else: + result = array.copy() + return pad(result, width, fill) + +def pad(array, width, fill): + for _ in range(len(array), width): + array.append(fill) + return array + +class BaseTable: + + def __init__(self): + self._headers = None + self._align = None + self._col_fmt = None + self.sample_size = 10 + + def headers(self, headers): + self._headers = headers + + def alignments(self, align): + self._align = align + + def col_format(self, col_fmt): + self._col_fmt = col_fmt + + def row_width(self, rows): + max_width = 0 + min_width = None + if self._headers is not None: + max_width = len(self._headers) + min_width = max_width + for row in rows: + max_width = max(max_width, len(row)) + min_width = max_width if min_width is None else min(min_width, max_width) + min_width = max_width if min_width is None else min_width + return (min_width, max_width) + + def find_alignments(self, rows, width): + align = padded(self._align, width, None) + unknown_count = 0 + for v in align: + if v is None: + unknown_count += 1 + if unknown_count == 0: + return align + for row in rows: + for i in range(len(row)): + if align[i] is not None: + continue + v = row[i] + if v is None: + continue + if type(v) is str: + align[i] = ALIGN_LEFT + else: + align[i] = ALIGN_RIGHT + unknown_count -= 1 + if unknown_count == 0: + return align + for i in range(width): + if align[i] is None: + align[i] = ALIGN_LEFT + return align + + def pad_rows(self, rows, width): + new_rows = [] + for row in rows: + new_rows.append(padded(row, width, None)) + return new_rows + + def pad_headers(self, width): + if self._headers is None: + return None + if len(self._headers) == 0: + return None + has_none = False + for i in range(len(self._headers)): + if self._headers[i] is None: + has_none = True + break + if len(self._headers) >= width 
and not has_none: + return self._headers + headers = self._headers.copy() + if has_none: + for i in range(len(headers)): + if headers[i] is None: + headers[i] = '' + return pad(headers, width, '') diff --git a/examples/quickstart/jupyter-notebooks/druidapi/catalog.py b/examples/quickstart/jupyter-notebooks/druidapi/catalog.py new file mode 100644 index 000000000000..a3f3fee918f9 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/catalog.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import requests +from .consts import COORD_BASE +from .rest import check_error + +# Catalog (new feature in Druid 26) +CATALOG_BASE = COORD_BASE + '/catalog' +REQ_CAT_SCHEMAS = CATALOG_BASE + '/schemas' +REQ_CAT_SCHEMA = REQ_CAT_SCHEMAS + '/{}' +REQ_CAT_SCHEMA_TABLES = REQ_CAT_SCHEMA + '/tables' +REQ_CAT_SCHEMA_TABLE = REQ_CAT_SCHEMA_TABLES + '/{}' +REQ_CAT_SCHEMA_TABLE_EDIT = REQ_CAT_SCHEMA_TABLE + '/edit' + +class CatalogClient: + + def __init__(self, rest_client): + self.client = rest_client + + def post_table(self, schema, table_name, table_spec, version=None, overwrite=None): + params = {} + if version is not None: + params['version'] = version + if overwrite is not None: + params['overwrite'] = overwrite + return self.client.post_json(REQ_CAT_SCHEMA_TABLE, table_spec, args=[schema, table_name], params=params) + + def create(self, schema, table_name, table_spec): + self.post_table(schema, table_name, table_spec) + + def table(self, schema, table_name): + return self.client.get_json(REQ_CAT_SCHEMA_TABLE, args=[schema, table_name]) + + def drop_table(self, schema, table_name, ifExists=False): + r = self.client.delete(REQ_CAT_SCHEMA_TABLE, args=[schema, table_name]) + if ifExists and r.status_code == requests.codes.not_found: + return + check_error(r) + + def edit_table(self, schema, table_name, action): + return self.client.post_json(REQ_CAT_SCHEMA_TABLE_EDIT, action, args=[schema, table_name]) + + def schema_names(self): + return self.client.get_json(REQ_CAT_SCHEMAS) + + def tables_in_schema(self, schema, list_format='name'): + return self.client.get_json(REQ_CAT_SCHEMA_TABLES, args=[schema], params={'format': list_format}) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/consts.py b/examples/quickstart/jupyter-notebooks/druidapi/consts.py new file mode 100644 index 000000000000..e452a399b197 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/consts.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one 
or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +COORD_BASE = '/druid/coordinator/v1' +ROUTER_BASE = '/druid/v2' +OVERLORD_BASE = '/druid/indexer/v1' + +# System schemas and table names. Note: case must match in Druid, though +# SQL itself is supposed to be case-insensitive. +SYS_SCHEMA = 'sys' +SCHEMA_SCHEMA = 'INFORMATION_SCHEMA' +DRUID_SCHEMA = 'druid' +EXT_SCHEMA = 'ext' + +# Information Schema tables +SCHEMAS_TABLE = SCHEMA_SCHEMA + '.SCHEMATA' +TABLES_TABLE = SCHEMA_SCHEMA + '.TABLES' +COLUMNS_TABLE = SCHEMA_SCHEMA + '.COLUMNS' + +# SQL request formats +SQL_OBJECT = 'object' +SQL_ARRAY = 'array' +SQL_ARRAY_WITH_TRAILER = 'arrayWithTrailer' +SQL_CSV = 'csv' + +# Type names as known to Druid and mentioned in documentation. 
+DRUID_STRING_TYPE = 'string' +DRUID_LONG_TYPE = 'long' +DRUID_FLOAT_TYPE = 'float' +DRUID_DOUBLE_TYPE = 'double' +DRUID_TIMESTAMP_TYPE = 'timestamp' + +# SQL type names as returned from the INFORMATION_SCHEMA +SQL_VARCHAR_TYPE = 'VARCHAR' +SQL_BIGINT_TYPE = 'BIGINT' +SQL_FLOAT_TYPE = 'FLOAT' +SQL_DOUBLE_TYPE = 'DOUBLE' +SQL_TIMESTAMP_TYPE = 'TIMESTAMP' +SQL_ARRAY_TYPE = 'ARRAY' + +# Task status code +RUNNING_STATE = 'RUNNING' +SUCCESS_STATE = 'SUCCESS' +FAILED_STATE = 'FAILED' diff --git a/examples/quickstart/jupyter-notebooks/druidapi/datasource.py b/examples/quickstart/jupyter-notebooks/druidapi/datasource.py new file mode 100644 index 000000000000..af1113d3d45a --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/datasource.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests, time +from .consts import COORD_BASE +from .rest import check_error +from .util import dict_get + +REQ_DATASOURCES = COORD_BASE + '/datasources' +REQ_DATASOURCE = REQ_DATASOURCES + '/{}' + +# Segment load status +REQ_DATASOURCES = COORD_BASE + '/datasources' +REQ_DS_LOAD_STATUS = REQ_DATASOURCES + '/{}/loadstatus' + +class DatasourceClient: + ''' + Client for the Coordinator datasource APIs: drop a datasource
+ and check, or wait on, the load status of its segments. + ''' + + def __init__(self, rest_client): + self.client = rest_client + + def drop(self, ds_name, ifExists=False): + """ + Drops a data source. + + Marks as unused all segments belonging to a datasource. + + Marking all segments as unused is equivalent to dropping the table. + + Parameters + ---------- + ds_name: str + name of the datasource to drop + + Returns + ------- + Returns a map of the form + {"numChangedSegments": <number>} with the number of segments in the database whose + state has been changed (that is, the segments were marked as unused) as the result + of this API call. + + Reference + --------- + `DELETE /druid/coordinator/v1/datasources/{dataSourceName}` + """ + r = self.client.delete(REQ_DATASOURCE, args=[ds_name]) + if ifExists and r.status_code == requests.codes.not_found: + return + check_error(r) + + def load_status_req(self, ds_name, params=None): + return self.client.get_json(REQ_DS_LOAD_STATUS, args=[ds_name], params=params) + + def load_status(self, ds_name): + return self.load_status_req(ds_name, { + 'forceMetadataRefresh': 'true', + 'interval': '1970-01-01/2999-01-01'}) + + def wait_until_ready(self, ds_name): + while True: + resp = self.load_status(ds_name) + if dict_get(resp, ds_name) == 100.0: + return + time.sleep(0.5) + \ No newline at end of file diff --git a/examples/quickstart/jupyter-notebooks/druidapi/display.py b/examples/quickstart/jupyter-notebooks/druidapi/display.py new file mode 100644 index 000000000000..8712b418188e --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/display.py @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TEXT_TABLE = 0 +HTML_TABLE = 1 + +class Display: + + def __init__(self): + self.format = TEXT_TABLE + self.html_initialized = False + + def text(self): + self.format = TEXT_TABLE + + def html(self): + self.format = HTML_TABLE + if not self.html_initialized: + from .html_table import styles + styles() + self.html_initialized = True + + def table(self): + if self.format == HTML_TABLE: + from .html_table import HtmlTable + return HtmlTable() + else: + from .text_table import TextTable + return TextTable() + + def show_object_list(self, objects, cols): + list_to_table(self.table(), objects, cols) + + def show_object(self, obj, labels): + object_to_table(self.table(), obj, labels) + + def show_error(self, msg): + from .html_table import html_error + html_error("ERROR: " + msg + "" + msg + " StatusClient: + ''' + Returns the status client for the router by default, else the status + endpoint for the specified endpoint. 
+ ''' + if endpoint is None: + if self.status_client is None: + self.status_client = StatusClient(self.rest_client) + return self.status_client + else: + endpoint_client = DruidRestClient(endpoint) + return StatusClient(endpoint_client) + + def catalog(self): + if self.catalog_client is None: + self.catalog_client = CatalogClient(self.rest_client) + return self.catalog_client + + def sql(self): + if self.sql_client is None: + self.sql_client = QueryClient(self) + return self.sql_client + + def tasks(self): + if self.tasks_client is None: + self.tasks_client = TaskClient(self.rest_client) + return self.tasks_client + + def datasources(self): + if self.datasource_client is None: + self.datasource_client = DatasourceClient(self.rest_client) + return self.datasource_client diff --git a/examples/quickstart/jupyter-notebooks/druidapi/error.py b/examples/quickstart/jupyter-notebooks/druidapi/error.py new file mode 100644 index 000000000000..bd5d4f1f33b5 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/error.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class ClientError(Exception): + """ + Indicates an error with usage of the API. 
+ """ + + def __init__(self, msg): + self.message = msg + +class DruidError(Exception): + """ + Indicates that something went wrong on Druid: often as a result of a + request that this client sent. + """ + + def __init__(self, msg): + self.message = msg + diff --git a/examples/quickstart/jupyter-notebooks/druidapi/html_table.py b/examples/quickstart/jupyter-notebooks/druidapi/html_table.py new file mode 100644 index 000000000000..a2e163401ae0 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/html_table.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from IPython.display import display, HTML +from .base_table import BaseTable +from html import escape + +STYLES = ''' + +''' + +def escape_for_html(s): + # Anoying: IPython treats $ as the start of Latex, which is cool, + # but not wanted here. + return s.replace('$', '\\$') + +def html(s): + s = '
' + escape_for_html(s) + '
' + display(HTML(s)) + +def html_error(s): + s = '
' + escape_for_html(s.replace('\n', '
')) + '
' + display(HTML(s)) + +def styles(): + display(HTML(STYLES)) + +alignments = ['druid-left', 'druid-center', 'druid-right'] + +def start_tag(tag, align): + s = '<' + tag + if align is not None: + s += ' class="{}"'.format(alignments[align]) + return s + '>' + +class HtmlTable(BaseTable): + + def __init__(self): + self._headers = None + self._align = None + self._col_fmt = None + + def widths(self, widths): + self._widths = widths + + def format(self, rows): + _, width = self.row_width(rows) + headers = self.pad_headers(width) + rows = self.pad_rows(rows, width) + s = '\n' + s += self.gen_header(headers) + s += self.gen_rows(rows) + return s + '\n
' + + def show(self, rows): + html(self.format(rows)) + + def gen_header(self, headers): + if headers is None or len(headers) == 0: + return '' + s = '' + for i in range(len(headers)): + s += start_tag('th', self.col_align(i)) + escape(headers[i]) + '' + return s + '\n' + + def gen_rows(self, rows): + html_rows = [] + for row in rows: + r = "" + for i in range(len(row)): + r += start_tag('td', self.col_align(i)) + cell = row[i] + value = '' if cell is None else escape(str(cell)) + r += value + '' + html_rows.append(r + "") + return "\n".join(html_rows) + + def col_align(self, col): + if self._align is None: + return None + if col >= len(self._align): + return None + return self._align[col] \ No newline at end of file diff --git a/examples/quickstart/jupyter-notebooks/druidapi/rest.py b/examples/quickstart/jupyter-notebooks/druidapi/rest.py new file mode 100644 index 000000000000..eb7a3e96637e --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/rest.py @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Names expected from the module's import prologue: requests, quote
# (urllib.parse), dict_get and is_blank (.util), and ClientError (.error).


def check_error(response):
    """
    Raise an exception unless the response status is OK (200) or Accepted (202).

    If the response carries a JSON payload with a non-blank `errorMessage`
    or `error` field, that text is raised as a `ClientError`. Otherwise the
    exception produced by `response.raise_for_status()` is re-raised with the
    decoded JSON payload (or `None`) attached as its `json` attribute.

    Status codes that `raise_for_status()` does not treat as failures fall
    through and return normally.
    """
    code = response.status_code
    if code == requests.codes.ok or code == requests.codes.accepted:
        return
    json = None
    try:
        json = response.json()
    except Exception:
        # Best effort: an error response is not required to contain JSON.
        pass
    msg = dict_get(json, 'errorMessage')
    if msg is None:
        msg = dict_get(json, 'error')
    if not is_blank(msg):
        raise ClientError(msg)
    if code == requests.codes.not_found:
        # Give 404s a readable reason for the message built by requests.
        response.reason = "Not found"
    try:
        response.raise_for_status()
    except Exception as e:
        e.json = json
        # Bare raise keeps the original traceback intact.
        raise


class DruidRestClient:
    """
    Wrapper around the basic Druid REST API operations using the
    requests Python package. Handles the grunt work of building up
    URLs, working with JSON, etc.

    All requests share one `requests.Session` created in the constructor.
    """

    def __init__(self, endpoint):
        """
        Parameters
        ----------
        endpoint : str
            Base URL which is prepended to every relative request path.
        """
        self.endpoint = endpoint
        self.trace = False
        self.session = requests.Session()

    def enable_trace(self, flag=True):
        """Turn printing of each request URL (and POST body) on or off."""
        self.trace = flag

    def build_url(self, req, args=None) -> str:
        """
        Returns the full URL for a REST call given the relative request API and
        optional parameters to fill placeholders within the request URL.

        Parameters
        ----------
        req : str
            Relative URL, with optional {} placeholders.

        args : list
            Optional list of values to match {} placeholders in the URL.
            Each value is converted to a string and URL-quoted before it is
            substituted. When `args` is None the URL is not formatted at all.
        """
        url = self.endpoint + req
        if args is not None:
            # str() lets callers pass numeric IDs; quote() itself rejects them.
            quoted = [quote(str(arg)) for arg in args]
            url = url.format(*quoted)
        return url

    def get(self, req, args=None, params=None, require_ok=True) -> 'requests.Response':
        """
        Generic GET request to this service.

        Parameters
        ----------
        req: str
            The request URL without host, port or query string.
            Example: `/status`

        args: [str], default = None
            Optional parameters to fill in to the URL.
            Example: `/customer/{}`

        params: dict, default = None
            Optional map of query variables to send in
            the URL. Query parameters are the name/values pairs
            that appear after the `?` marker.

        require_ok: bool, default = True
            Whether to require an OK (200) or Accepted (202) response. If
            `True`, the response is passed to `check_error()`, which raises
            for other status codes.

        Returns
        -------
        The `requests` response object.
        """
        url = self.build_url(req, args)
        if self.trace:
            print("GET:", url)
        r = self.session.get(url, params=params)
        if require_ok:
            check_error(r)
        return r

    def get_json(self, url_tail, args=None, params=None):
        """
        Generic GET request which expects a JSON response.

        Returns the decoded JSON payload. Errors are reported as for `get()`
        with `require_ok=True`.
        """
        r = self.get(url_tail, args, params)
        return r.json()

    def post(self, req, body, args=None, headers=None, require_ok=True, params=None) -> 'requests.Response':
        """
        Issues a POST request for the given URL on this
        node, with the given payload and optional URL query
        parameters.

        The payload is sent as-is (the `data` argument of requests). When
        `require_ok` is true the response is passed to `check_error()`.
        """
        url = self.build_url(req, args)
        if self.trace:
            print("POST:", url)
            print("body:", body)
        r = self.session.post(url, data=body, headers=headers, params=params)
        if require_ok:
            check_error(r)
        return r

    def post_json(self, req, body, args=None, headers=None, params=None):
        """
        Issues a POST request for the given URL on this
        node, with the given payload and optional URL query
        parameters. The payload is serialized to JSON.

        Returns the decoded JSON response; failures are raised by
        `check_error()`.
        """
        r = self.post_only_json(req, body, args, headers, params)
        check_error(r)
        return r.json()

    def post_only_json(self, req, body, args=None, headers=None, params=None) -> 'requests.Response':
        """
        Issues a POST request for the given URL on this
        node, with the given payload and optional URL query
        parameters. The payload is serialized to JSON.

        Does not parse error messages: that is up to the caller.
        """
        url = self.build_url(req, args)
        if self.trace:
            print("POST:", url)
            print("body:", body)
        return self.session.post(url, json=body, headers=headers, params=params)

    def delete(self, req, args=None, params=None, headers=None):
        """
        Issues a DELETE request and returns the response.

        The status code is not checked: that is up to the caller.
        """
        url = self.build_url(req, args)
        if self.trace:
            print("DELETE:", url)
        return self.session.delete(url, params=params, headers=headers)

    def delete_json(self, req, args=None, params=None, headers=None):
        """
        Issues a DELETE request and returns the decoded JSON response body.

        As with `delete()`, the status code is not checked.
        """
        return self.delete(req, args=args, params=params, headers=headers).json()


# ---- next file in the patch: druidapi/sql.py --------------------------------
# Names expected from the module's import prologue: time, requests, consts,
# display, is_blank, dict_get, DruidError, ClientError, and the endpoint
# constants REQ_ROUTER_SQL and REQ_ROUTER_SQL_TASK.


class SqlRequest:
    """
    Builder for a Druid SQL request.

    The `with_*` methods, `add_parameter()`, `response_header()` and
    `request_headers()` each return the request itself so calls can be chained.
    `to_request()` produces the JSON-serializable payload.
    """

    def __init__(self, query_client, sql):
        self.query_client = query_client
        self.sql = sql
        self.context = None
        self.params = None
        # Whether to ask Druid for a header row in the results.
        self.header = False
        self.format = consts.SQL_OBJECT
        # HTTP headers to send with the request (not the result header flag).
        self.headers = None
        self.types = None
        self.sqlTypes = None

    def with_format(self, result_format):
        """Set the result format sent as `resultFormat`."""
        self.format = result_format
        return self

    def with_headers(self, sqlTypes=False, druidTypes=False):
        """
        Request a header row in the results, optionally with SQL and/or
        Druid type rows.
        """
        # Sets the payload flag; `self.headers` is reserved for HTTP headers.
        self.header = True
        self.types = druidTypes
        self.sqlTypes = sqlTypes
        return self

    def with_context(self, context):
        """Merge the given dictionary into the query context."""
        if self.context is None:
            # Copy so later merges never modify the caller's dictionary.
            self.context = dict(context)
        else:
            self.context.update(context)
        return self

    def with_parameters(self, params):
        """
        Add an array of parameters. Parameters must each be a map of 'type'/'value' pairs:
        {'type': the_type, 'value': the_value}. The type must be a valid SQL type
        (in upper case). See the consts module for a list.
        """
        if self.params is None:
            # Copy so later additions never modify the caller's list.
            self.params = list(params)
        else:
            self.params.extend(params)
        return self

    def add_parameter(self, value):
        """
        Add one parameter value. Infers the type of the parameter from the Python type.

        Raises `ClientError` for `None` and for any type other than
        str, int, float or list.
        """
        if value is None:
            raise ClientError("Druid does not support null parameter values")
        value_type = type(value)
        if value_type is str:
            data_type = consts.SQL_VARCHAR_TYPE
        elif value_type is int:
            data_type = consts.SQL_BIGINT_TYPE
        elif value_type is float:
            data_type = consts.SQL_DOUBLE_TYPE
        elif value_type is list:
            data_type = consts.SQL_ARRAY_TYPE
        else:
            raise ClientError("Unsupported value type")
        if self.params is None:
            self.params = []
        self.params.append({'type': data_type, 'value': value})
        return self

    def response_header(self):
        """Request a header row in the results."""
        self.header = True
        return self

    def request_headers(self, headers):
        """Set the HTTP headers to send with the request."""
        self.headers = headers
        return self

    def to_request(self):
        """Return the request payload as a dictionary; unset options are omitted."""
        query_obj = {"query": self.sql}
        if self.context is not None and len(self.context) > 0:
            query_obj['context'] = self.context
        if self.params is not None and len(self.params) > 0:
            query_obj['parameters'] = self.params
        if self.header:
            query_obj['header'] = True
        if self.format is not None:
            query_obj['resultFormat'] = self.format
        if self.sqlTypes:
            query_obj['sqlTypesHeader'] = self.sqlTypes
        if self.types:
            query_obj['typesHeader'] = self.types
        return query_obj

    def result_format(self):
        """Return the result format in lower case."""
        return self.format.lower()

    def run(self):
        """Submit this request through the owning query client."""
        return self.query_client.sql_query(self)


def parse_rows(fmt, context, results):
    """
    Extract the data rows from decoded query results.

    For the two array formats, header rows are dropped when `context`
    reports them; any other format is returned unchanged.
    """
    if fmt == consts.SQL_ARRAY_WITH_TRAILER:
        rows = results['results']
    elif fmt == consts.SQL_ARRAY:
        rows = results
    else:
        return results
    # NOTE(review): the header flags are stored on SqlRequest, not in its
    # context -- confirm that callers populate these keys.
    if context is None or not context.get('headers', False):
        return rows
    header_size = 1
    if context.get('sqlTypesHeader', False):
        header_size += 1
    if context.get('typesHeader', False):
        header_size += 1
    return rows[header_size:]


def _is_null_like(value):
    """Return True for None, an empty string, or a numeric zero (booleans excluded)."""
    if value is None:
        return True
    if isinstance(value, bool):
        return False
    if isinstance(value, (str, int, float)):
        return not value
    return False


def label_non_null_cols(results):
    """
    Map each column name to True when every value in that column is
    null-like, else False. Returns an empty list for empty input.
    """
    if results is None or len(results) == 0:
        return []
    is_null = {key: True for key in results[0].keys()}
    for row in results:
        for key, value in row.items():
            if not _is_null_like(value):
                is_null[key] = False
    return is_null


def filter_null_cols(results):
    """
    Filter columns from a Druid result set by removing all null-like
    columns. A column is considered null if all values for that column
    are null. A value is null if it is either a JSON null, an empty
    string, or a numeric 0. All rows are preserved, as is the order
    of the remaining columns.
    """
    if results is None or len(results) == 0:
        return results
    is_null = label_non_null_cols(results)
    # A key that only ever held null-like values has no entry; treat it as null.
    return [
        {key: value for key, value in row.items() if not is_null.get(key, True)}
        for row in results
    ]


def _drop_null_array_cols(headers, rows):
    """
    Array-row counterpart of filter_null_cols(): drop every column whose
    values are all null-like, along with the matching header.
    Returns the pair (headers, rows).
    """
    if not rows or not isinstance(rows[0], (list, tuple)):
        return headers, rows
    width = max(len(row) for row in rows)
    keep = [
        i for i in range(width)
        if any(i < len(row) and not _is_null_like(row[i]) for row in rows)
    ]
    kept_headers = [headers[i] for i in keep if i < len(headers)]
    kept_rows = [[row[i] if i < len(row) else None for i in keep] for row in rows]
    return kept_headers, kept_rows


def parse_object_schema(results):
    """
    Infer a schema from the first row of object-format results.

    Strings map to the string/VARCHAR types. Column types that cannot be
    inferred are reported as None.
    """
    schema = []
    if not results:
        return schema
    for k, v in results[0].items():
        druid_type = None
        sql_type = None
        if type(v) is str:
            druid_type = consts.DRUID_STRING_TYPE
            sql_type = consts.SQL_VARCHAR_TYPE
        elif type(v) is int or type(v) is float:
            # NOTE(review): floats are reported as long/BIGINT here -- confirm
            # whether a double type was intended.
            druid_type = consts.DRUID_LONG_TYPE
            sql_type = consts.SQL_BIGINT_TYPE
        schema.append(ColumnSchema(k, sql_type, druid_type))
    return schema


def parse_array_schema(context, results):
    """
    Build a schema from the header rows of array-format results.

    Returns an empty schema when there are no results or when `context`
    does not report a header row.
    """
    schema = []
    if not results or context is None:
        return schema
    if not context.get(consts.HEADERS_KEY, False):
        return schema
    has_sql_types = context.get(consts.SQL_TYPES_HEADERS_KEY, False)
    has_druid_types = context.get(consts.DRUID_TYPE_HEADERS_KEY, False)
    names = results[0]
    # The type rows follow the name row: Druid types first, then SQL types.
    # Each is present only if requested, so the SQL row index is not fixed.
    next_row = 1
    druid_types = None
    if has_druid_types:
        druid_types = results[next_row]
        next_row += 1
    sql_types = None
    if has_sql_types:
        sql_types = results[next_row]
    for i, name in enumerate(names):
        druid_type = druid_types[i] if druid_types is not None else None
        sql_type = sql_types[i] if sql_types is not None else None
        schema.append(ColumnSchema(name, sql_type, druid_type))
    return schema


def parse_schema(fmt, context, results):
    """Return the schema for the given result format, or [] if none can be derived."""
    if results is None:
        return []
    if fmt == consts.SQL_OBJECT:
        return parse_object_schema(results)
    elif fmt == consts.SQL_ARRAY:
        return parse_array_schema(context, results)
    elif fmt == consts.SQL_ARRAY_WITH_TRAILER:
        # Same unwrapping as parse_rows(): the rows sit under 'results'.
        return parse_array_schema(context, results['results'])
    else:
        return []


def is_response_ok(http_response):
    """Return True if the HTTP status is OK (200) or Accepted (202)."""
    code = http_response.status_code
    return code == requests.codes.ok or code == requests.codes.accepted


def _sql_string(value):
    """Return `value` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


class ColumnSchema:
    """Name, SQL type and Druid type of one result column."""

    def __init__(self, name, sql_type, druid_type):
        self.name = name
        self.sql_type = sql_type
        self.druid_type = druid_type

    def __str__(self):
        return "{{name={}, SQL type={}, Druid type={}}}".format(self.name, self.sql_type, self.druid_type)

    __repr__ = __str__


class SqlQueryResult:
    """
    Defines the core protocol for Druid SQL queries.
    """

    def __init__(self, request, response):
        self.http_response = response
        self._json = None
        self._rows = None
        self._schema = None
        self.request = request
        self._error = None
        if not is_response_ok(response):
            try:
                self._error = response.json()
            except Exception:
                self._error = response.text
            if self._error is None or len(self._error) == 0:
                self._error = "Failed with HTTP status {}".format(response.status_code)
        # A missing ID leaves id() as None; it must not replace the error
        # reported by Druid.
        self._id = response.headers.get('X-Druid-SQL-Query-Id')

    def result_format(self):
        """Return the lower-cased result format of the request."""
        return self.request.result_format()

    def ok(self):
        """
        Reports if the query succeeded.

        The query rows and schema are available only if ok() returns True.
        """
        return is_response_ok(self.http_response)

    def error(self):
        """
        If the query fails, returns the error, if any provided by Druid.
        Returns None if the query succeeded.
        """
        if self.ok():
            return None
        if self._error is not None:
            return self._error
        return {"error": "HTTP {}".format(self.http_response.status_code)}

    def error_msg(self):
        """Return the error as a single string, or "unknown" if there is none."""
        err = self.error()
        if err is None:
            return "unknown"
        if type(err) is str:
            return err
        if not isinstance(err, dict):
            return str(err)
        msg = err.get("error")
        text = err.get("errorMessage")
        if msg is None and text is None:
            return "unknown"
        if msg is None:
            return text
        if text is None:
            return msg
        return msg + ": " + text

    def id(self):
        """
        Returns the unique identifier for the query.
        """
        return self._id

    def non_null(self):
        """
        Return the rows with all-null columns removed. Returns None unless
        the query succeeded and used the object format.
        """
        if not self.ok():
            return None
        if self.result_format() != consts.SQL_OBJECT:
            return None
        return filter_null_cols(self.rows())

    def as_array(self):
        """
        Return the rows as arrays of values; object-format rows are
        converted. Returns None if the query failed.
        """
        if not self.ok():
            return None
        if self.result_format() == consts.SQL_OBJECT:
            return [list(obj.values()) for obj in self.rows()]
        return self.rows()

    def json(self):
        """Return the decoded response body, or None if the query failed."""
        if not self.ok():
            return None
        if self._json is None:
            self._json = self.http_response.json()
        return self._json

    def rows(self):
        """
        Returns the rows of data for the query.

        Druid supports many data formats. The method makes its best
        attempt to map the format into an array of rows of some sort.
        If there is no decoded body, the raw response text is returned.
        """
        if self._rows is None:
            json = self.json()
            if json is None:
                return self.http_response.text
            self._rows = parse_rows(self.result_format(), self.request.context, json)
        return self._rows

    def schema(self):
        """
        Returns the data schema as a list of ColumnSchema objects.

        Druid supports many data formats, not all of them provide
        schema information. This method makes its best attempt to
        extract the schema from the query results.
        """
        if self._schema is None:
            self._schema = parse_schema(self.result_format(), self.request.context, self.json())
        return self._schema

    def show(self, non_null=False):
        """
        Display the results as a table. With `non_null`, object-format
        results are shown without their all-null columns.
        """
        data = self.as_array()
        if data is None or len(data) == 0:
            display.display.show_message("Query returned no results")
            return
        headers = [c.name for c in self.schema()]
        if non_null and self.result_format() == consts.SQL_OBJECT:
            # Filter the array rows so headers and cells stay aligned; the
            # table formatter indexes each row by position.
            headers, data = _drop_null_array_cols(headers, data)
        disp = display.display.table()
        disp.headers(headers)
        disp.show(data)

    def show_schema(self):
        """Display the schema as a table of name, SQL type and Druid type."""
        disp = display.display.table()
        disp.headers(['Name', 'SQL Type', 'Druid Type'])
        data = [[c.name, c.sql_type, c.druid_type] for c in self.schema()]
        disp.show(data)


class QueryTaskResult:
    """
    Tracks a SQL query submitted as a task: its ID, state, error and,
    once the task completes, its reports, schema and rows.
    """

    def __init__(self, request, response):
        self._request = request
        self.http_response = response
        self._status = None
        self._results = None
        self._details = None
        self._schema = None
        self._rows = None
        self._reports = None
        self._error = None
        self._id = None
        if not is_response_ok(response):
            self._state = consts.FAILED_STATE
            try:
                self._error = response.json()
            except Exception:
                self._error = response.text
            if self._error is None or len(self._error) == 0:
                self._error = "Failed with HTTP status {}".format(response.status_code)
            return

        # Typical response:
        # {'taskId': '6f7b514a446d4edc9d26a24d4bd03ade_fd8e242b-7d93-431d-b65b-2a512116924c_bjdlojgj',
        # 'state': 'RUNNING'}
        self.response_obj = response.json()
        self._id = self.response_obj['taskId']
        self._state = self.response_obj['state']

    def ok(self):
        """
        Reports if the query succeeded.

        The query rows and schema are available only if ok() returns True.
        """
        return self._error is None

    def id(self):
        """Return the task ID, or None if the submission failed."""
        return self._id

    def _tasks(self):
        return self._request.query_client.druid_client.tasks()

    def status(self):
        """
        Polls Druid for an update on the query run status.
        """
        self.check_valid()
        # Example:
        # {'task': 'talaria-sql-w000-b373b68d-2675-4035-b4d2-7a9228edead6',
        # 'status': {
        #   'id': 'talaria-sql-w000-b373b68d-2675-4035-b4d2-7a9228edead6',
        #   'groupId': 'talaria-sql-w000-b373b68d-2675-4035-b4d2-7a9228edead6',
        #   'type': 'talaria0', 'createdTime': '2022-04-28T23:19:50.331Z',
        #   'queueInsertionTime': '1970-01-01T00:00:00.000Z',
        #   'statusCode': 'RUNNING', 'status': 'RUNNING', 'runnerStatusCode': 'PENDING',
        #   'duration': -1, 'location': {'host': None, 'port': -1, 'tlsPort': -1},
        #   'dataSource': 'w000', 'errorMsg': None}}
        self._status = self._tasks().task_status(self._id)
        self._state = self._status['status']['status']
        if self._state == consts.FAILED_STATE:
            self._error = self._status['status']['errorMsg']
        return self._status

    def done(self):
        """
        Reports if the query is done: succeeded or failed.
        """
        return self._state == consts.FAILED_STATE or self._state == consts.SUCCESS_STATE

    def succeeded(self):
        """
        Reports if the query succeeded.
        """
        return self._state == consts.SUCCESS_STATE

    def state(self):
        """
        Reports the engine-specific query state.

        Updated after each call to status().
        """
        return self._state

    def error(self):
        """Return the error recorded for the task, or None."""
        return self._error

    def error_msg(self):
        """Return the error as a single string, or "unknown" if there is none."""
        err = self.error()
        if err is None:
            return "unknown"
        if type(err) is str:
            return err
        msg = dict_get(err, "error")
        text = dict_get(err, "errorMessage")
        if msg is None and text is None:
            return "unknown"
        if text is not None:
            text = text.replace('\\n', '\n')
        if msg is None:
            return text
        if text is None:
            return msg
        return msg + ": " + text

    def join(self):
        """
        Poll the task status every half second until the task is done.
        Returns True if it succeeded.
        """
        if not self.done():
            self.status()
            while not self.done():
                time.sleep(0.5)
                self.status()
        return self.succeeded()

    def check_valid(self):
        """Raise `ClientError` if the submission failed and there is no task ID."""
        if self._id is None:
            raise ClientError("Operation is invalid on a failed query")

    def wait_done(self):
        """Wait for the task to finish; raise `DruidError` if it failed."""
        if not self.join():
            raise DruidError("Query failed: " + self.error_msg())

    def wait(self):
        """Wait for the task to finish and return its rows."""
        self.wait_done()
        return self.rows()

    def reports(self) -> dict:
        """Wait for the task to finish, then return (and cache) its reports."""
        self.check_valid()
        if self._reports is None:
            self.join()
            self._reports = self._tasks().task_reports(self._id)
        return self._reports

    def results(self):
        """Return the `results` object from the task report payload."""
        if self._results is None:
            rpts = self.reports()
            self._results = rpts['multiStageQuery']['payload']['results']
        return self._results

    def schema(self):
        """Return the result schema as a list of ColumnSchema objects."""
        if self._schema is None:
            results = self.results()
            sig = results['signature']
            sql_types = results['sqlTypeNames']
            self._schema = [
                ColumnSchema(sig[i]['name'], sql_types[i], sig[i]['type'])
                for i in range(len(sig))
            ]
        return self._schema

    def rows(self):
        """Return the result rows from the task report."""
        if self._rows is None:
            results = self.results()
            self._rows = results['results']
        return self._rows

    def show(self, non_null=False):
        """Display the results as a table, optionally without all-null columns."""
        data = self.rows()
        headers = [c.name for c in self.schema()]
        if non_null:
            headers, data = _drop_null_array_cols(headers, data)
        disp = display.display.table()
        disp.headers(headers)
        disp.show(data)


class QueryClient:
    """
    Client for Druid SQL: interactive queries, task-based queries and
    convenience helpers for inspecting schemas, tables and functions.
    """

    def __init__(self, druid, rest_client=None):
        self.druid_client = druid
        self._rest_client = druid.rest_client if rest_client is None else rest_client

    def rest_client(self):
        """Return the REST client used to send requests."""
        return self._rest_client

    def _prepare_query(self, request):
        """
        Normalize a SQL string or SqlRequest into a (request, payload) pair.
        Raises `ClientError` if no SQL text was provided.
        """
        if request is None:
            raise ClientError("No query provided.")
        if isinstance(request, str):
            request = self.sql_request(request)
        if is_blank(request.sql):
            raise ClientError("No query provided.")
        if self.rest_client().trace:
            print(request.sql)
        query_obj = request.to_request()
        return (request, query_obj)

    def sql_query(self, request) -> SqlQueryResult:
        """
        Submit a SQL query with control over the context, parameters and other
        options. Returns a response with either a detailed error message, or
        the rows and query ID.
        """
        request, query_obj = self._prepare_query(request)
        r = self.rest_client().post_only_json(REQ_ROUTER_SQL, query_obj, headers=request.headers)
        return SqlQueryResult(request, r)

    def sql(self, sql, *args):
        """
        Run a query and return its rows.

        If extra arguments are given, they are substituted into the SQL text
        with `str.format()`. Raises `ClientError` if the query fails.
        """
        if len(args) > 0:
            sql = sql.format(*args)
        resp = self.sql_query(sql)
        if resp.ok():
            return resp.rows()
        raise ClientError(resp.error_msg())

    def explain_sql(self, query):
        """
        Run an EXPLAIN PLAN FOR query for the given query.

        Returns
        -------
        The first row of the EXPLAIN PLAN FOR result.
        """
        if is_blank(query):
            raise ClientError("No query provided.")
        results = self.sql('EXPLAIN PLAN FOR ' + query)
        return results[0]

    def sql_request(self, sql):
        """Create a SqlRequest for the given SQL text, bound to this client."""
        return SqlRequest(self, sql)

    def show(self, query):
        """Run a query and display either its results or its error."""
        result = self.sql_query(query)
        if result.ok():
            result.show()
        else:
            display.display.show_error(result.error_msg())

    def task(self, request):
        """Submit a query to the SQL task endpoint and return a QueryTaskResult."""
        request, query_obj = self._prepare_query(request)
        r = self.rest_client().post_only_json(REQ_ROUTER_SQL_TASK, query_obj, headers=request.headers)
        return QueryTaskResult(request, r)

    def run_task(self, request):
        """
        Submit a query as a task and wait for it to finish.

        Raises `ClientError` if the submission fails and `DruidError` if
        the task fails.
        """
        resp = self.task(request)
        if not resp.ok():
            raise ClientError(resp.error_msg())
        resp.wait_done()

    def _tables_query(self, schema):
        return self.sql_query('''
            SELECT TABLE_NAME AS TableName
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = {}
            ORDER BY TABLE_NAME
            '''.format(_sql_string(schema)))

    def tables(self, schema=None):
        """Return the table names in a schema (default `consts.DRUID_SCHEMA`)."""
        if schema is None:
            schema = consts.DRUID_SCHEMA
        return self._tables_query(schema).rows()

    def show_tables(self, schema=None):
        """Display the table names in a schema (default `consts.DRUID_SCHEMA`)."""
        if schema is None:
            schema = consts.DRUID_SCHEMA
        self._tables_query(schema).show()

    def _schemas_query(self):
        return self.sql_query('''
            SELECT SCHEMA_NAME AS SchemaName
            FROM INFORMATION_SCHEMA.SCHEMATA
            ORDER BY SCHEMA_NAME
            ''')

    def show_schemas(self):
        """Display the names of all schemas."""
        self._schemas_query().show()

    def describe_table(self, part1, part2=None):
        """
        Display the columns of a table.

        Call with just a table name to use `consts.DRUID_SCHEMA`, or with
        a schema name followed by a table name.
        """
        if part2 is None:
            schema = consts.DRUID_SCHEMA
            table = part1
        else:
            schema = part1
            table = part2
        self.show('''
            SELECT
              ORDINAL_POSITION AS "Position",
              COLUMN_NAME AS "Name",
              DATA_TYPE AS "Type"
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA = {}
              AND TABLE_NAME = {}
            ORDER BY ORDINAL_POSITION
            '''.format(_sql_string(schema), _sql_string(table)))

    def describe_function(self, part1, part2=None):
        """
        Display the parameters of a function.

        Call with just a function name to use `consts.EXT_SCHEMA`, or with
        a schema name followed by a function name.
        """
        if part2 is None:
            schema = consts.EXT_SCHEMA
            table = part1
        else:
            schema = part1
            table = part2
        self.show('''
            SELECT
              ORDINAL_POSITION AS "Position",
              PARAMETER_NAME AS "Parameter",
              DATA_TYPE AS "Type",
              IS_OPTIONAL AS "Optional"
            FROM INFORMATION_SCHEMA.PARAMETERS
            WHERE SCHEMA_NAME = {}
              AND FUNCTION_NAME = {}
            ORDER BY ORDINAL_POSITION
            '''.format(_sql_string(schema), _sql_string(table)))

    def wait_until_ready(self, ds_name):
        """
        Wait for a datasource to be loaded in the cluster, and to become available in SQL.

        Retries a probe query every half second until it succeeds.
        """
        self.druid_client.datasources().wait_until_ready(ds_name)
        # Double any embedded double quote so the name stays one identifier.
        probe = 'SELECT 1 FROM "{}" LIMIT 1'.format(str(ds_name).replace('"', '""'))
        while True:
            try:
                self.sql(probe)
                return
            except Exception:
                # Any failure is read as "not ready yet"; try again shortly.
                time.sleep(0.5)


# ---- next file in the patch: druidapi/status.py ------------------------------
import time

STATUS_BASE = "/status"
REQ_STATUS = STATUS_BASE
REQ_HEALTH = STATUS_BASE + "/health"
REQ_PROPERTIES = STATUS_BASE + "/properties"
REQ_IN_CLUSTER = STATUS_BASE + "/selfDiscovered/status"

ROUTER_BASE = '/druid/router/v1'
REQ_BROKERS = ROUTER_BASE + '/brokers'


class StatusClient:
    """
    Client for status APIs. These APIs are available on all nodes.
    If used with the router, they report the status of just the router.
    """

    def __init__(self, rest_client):
        self.client = rest_client

    #-------- Common --------

    def status(self):
        """
        Returns the Druid version, loaded extensions, memory used, total memory
        and other useful information about the process.

        GET `/status`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information
        """
        return self.client.get_json(REQ_STATUS)

    def is_healthy(self) -> bool:
        """
        Returns `True` if the node reports itself healthy, `False` if it does
        not or if the request fails for any reason.
        Useful for automated health checks.

        GET `/status/health`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information
        """
        try:
            return bool(self.client.get_json(REQ_HEALTH))
        except Exception:
            # Any failure to get an answer counts as "not healthy".
            return False

    def wait_until_ready(self):
        """Poll `is_healthy()` every half second until it returns `True`."""
        while not self.is_healthy():
            time.sleep(0.5)

    def properties(self) -> dict:
        """
        Returns the effective set of Java properties used by the service, including
        system properties and properties from the `common_runtime.properties` and
        `runtime.properties` files.

        GET `/status/properties`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information
        """
        return self.client.get_json(REQ_PROPERTIES)

    def in_cluster(self):
        """
        Returns `True` if the node is visible within the cluster, `False` if not.
        (That is, returns the value of the `{"selfDiscovered": true/false}`
        field in the response.) Also returns `False` if the request fails
        with an I/O error.

        GET `/status/selfDiscovered/status`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#process-information
        """
        try:
            result = self.client.get_json(REQ_IN_CLUSTER)
            return result.get('selfDiscovered', False)
        except OSError:
            # The exceptions raised by requests derive from IOError (OSError),
            # not from the builtin ConnectionError, so catch the common base.
            return False

    def version(self):
        """Returns the `version` field of the `/status` response."""
        return self.status().get('version')

    def brokers(self):
        """
        Returns the decoded response of the router's brokers API.

        GET `/druid/router/v1/brokers`
        """
        return self.client.get_json(REQ_BROKERS)


# ---- next file in the patch: druidapi/tasks.py -------------------------------
# Names expected from the module's import prologue: OVERLORD_BASE (.consts) and
# the request templates built from it: REQ_TASKS, REQ_POST_TASK, REQ_GET_TASK,
# REQ_TASK_STATUS and REQ_TASK_REPORTS.


class TaskClient:
    """
    Client for task-related APIs. The APIs connect through the Router to
    the Overlord.
    """

    def __init__(self, rest_client):
        self.client = rest_client

    def tasks(self, state=None, table=None, type=None, max=None, created_time_interval=None):
        """
        Retrieve list of tasks.

        Parameters
        ----------
        state : str, default = None
            Filter list of tasks by task state. Valid options are "running",
            "complete", "waiting", and "pending". Constants are defined for
            each of these in the `consts` file.
        table : str, default = None
            Return tasks filtered by Druid table (datasource).
        created_time_interval : str, Default = None
            Return tasks created within the specified interval.
        max : int, default = None
            Maximum number of "complete" tasks to return. Only applies when state is set to "complete".
        type : str, default = None
            filter tasks by task type.

        Reference
        ---------
        `GET /druid/indexer/v1/tasks`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#get-15
        """
        filters = {
            'state': state,
            'datasource': table,
            'type': type,
            'max': max,
            'createdTimeInterval': created_time_interval,
        }
        # Only filters that were actually supplied are sent.
        params = {key: value for key, value in filters.items() if value is not None}
        return self.client.get_json(REQ_TASKS, params=params)

    def task(self, task_id):
        """
        Retrieve the "payload" of a task.

        Parameters
        ----------
        task_id : str
            The id of the task to retrieve

        Reference
        ---------
        `GET /druid/indexer/v1/task/{taskId}`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#get-15
        """
        return self.client.get_json(REQ_GET_TASK, args=[task_id])

    def task_status(self, task_id):
        """
        Retrieve the status of a task.

        Parameters
        ----------
        task_id : str
            The id of the task to retrieve

        Reference
        ---------
        `GET /druid/indexer/v1/task/{taskId}/status`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#get-15
        """
        return self.client.get_json(REQ_TASK_STATUS, args=[task_id])

    def task_reports(self, task_id):
        """
        Retrieve a task completion report for a task.
        Only works for completed tasks.

        Parameters
        ----------
        task_id : str
            The id of the task to retrieve

        Reference
        ---------
        `GET /druid/indexer/v1/task/{taskId}/reports`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#get-15
        """
        return self.client.get_json(REQ_TASK_REPORTS, args=[task_id])

    def submit_task(self, payload):
        """
        Submit a task or supervisor specs to the Overlord.

        Returns the decoded JSON response.

        Parameters
        ----------
        payload : object
            The task object. Serialized to JSON.

        Reference
        ---------
        `POST /druid/indexer/v1/task`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#post-5
        """
        return self.client.post_json(REQ_POST_TASK, payload)

    def shut_down_task(self, task_id):
        """
        Shuts down a task.

        Parameters
        ----------
        task_id : str
            The id of the task to shut down

        Reference
        ---------
        `POST /druid/indexer/v1/task/{taskId}/shutdown`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#post-5
        """
        # The request has no payload, but post_json() requires a body argument.
        return self.client.post_json(REQ_GET_TASK + '/shutdown', None, args=[task_id])

    def shut_down_tasks_for(self, table):
        """
        Shuts down all tasks for a table (data source).

        Parameters
        ----------
        table : str
            The name of the table (data source).

        Reference
        ---------
        `POST /druid/indexer/v1/datasources/{dataSource}/shutdownAllTasks`

        See https://druid.apache.org/docs/latest/operations/api-reference.html#post-5
        """
        # Built from OVERLORD_BASE so the path matches the documented endpoint.
        url = OVERLORD_BASE + '/datasources/{}/shutdownAllTasks'
        return self.client.post_json(url, None, args=[table])


# ---- next file in the patch: druidapi/text_table.py --------------------------
+ +from .base_table import pad +from .base_table import BaseTable + +alignments = ['', '^', '>'] + +def simple_table(table_def): + table = [] + if table_def.headers is not None: + table.append(' '.join(table_def.format_row(table_def.headers))) + for row in table_def.rows: + table.append(' '.join(table_def.format_row(row))) + return table + +def border_table(table_def): + fmt = ' | '.join(table_def.formats) + table = [] + if table_def.headers is not None: + table.append(fmt.format(*table_def.headers)) + bar = '' + for i in range(table_def.width): + width = table_def.widths[i] + if i > 0: + bar += '+' + if table_def.width == 1: + pass + elif i == 0: + width += 1 + elif i == table_def.width - 1: + width += 1 + else: + width += 2 + bar += '-' * width + table.append(bar) + for row in table_def.rows: + table.append(fmt.format(*row)) + return table + +class TableDef: + + def __init__(self): + self.width = None + self.headers = None + self.align = None + self.formats = None + self.rows = None + self.widths = None + + def find_widths(self): + self.widths = [0 for i in range(self.width)] + if self.headers is not None: + for i in range(len(self.headers)): + self.widths[i] = len(self.headers[i]) + for row in self.rows: + for i in range(len(row)): + if row[i] is not None: + self.widths[i] = max(self.widths[i], len(row[i])) + + def apply_widths(self, widths): + if widths is None: + return + for i in range(min(len(self.widths), len(widths))): + if widths[i] is not None: + self.widths[i] = widths[i] + + def define_row_formats(self): + self.formats = [] + for i in range(self.width): + f = '{{:{}{}.{}}}'.format( + alignments[self.align[i]], + self.widths[i], self.widths[i]) + self.formats.append(f) + + def format_header(self): + if self.headers is None: + return None + return self.format_row(self.headers) + + def format_row(self, data_row): + row = [] + for i in range(self.width): + value = data_row[i] + if value is None: + row.append(' ' * self.widths[i]) + else: + 
row.append(self.formats[i].format(value)) + return row + +class TextTable(BaseTable): + + def __init__(self): + BaseTable.__init__(self) + self.formatter = simple_table + self._widths = None + + def widths(self, widths): + self._widths = widths + + def compute_def(self, rows): + table_def = TableDef() + min_width, max_width = self.row_width(rows) + table_def.width = max_width + table_def.headers = self.pad_headers(max_width) + table_def.rows = self.format_rows(rows, min_width, max_width) + table_def.find_widths() + table_def.apply_widths(self._widths) + table_def.align = self.find_alignments(rows, max_width) + table_def.define_row_formats() + return table_def + + def format(self, rows): + if rows is None: + rows = [] + table_rows = self.formatter(self.compute_def(rows)) + return '\n'.join(table_rows) + + def show(self, rows): + print(self.format(rows)) + + def format_rows(self, rows, min_width, max_width): + if self._col_fmt is None: + return self.default_row_format(rows, min_width, max_width) + else: + return self.apply_row_formats(rows, max_width) + + def default_row_format(self, rows, min_width, max_width): + new_rows = [] + if min_width <= max_width: + rows = self.pad_rows(rows, max_width) + for row in rows: + new_row = ['' if v is None else str(v) for v in row] + new_rows.append(pad(new_row, max_width, None)) + return new_rows + + def apply_row_formats(self, rows, max_width): + new_rows = [] + fmts = self._col_fmt + if len(fmts) < max_width: + fmts = fmts.copy() + for i in range(len(fmts), max_width): + fmts.append(lambda v: v) + for row in rows: + new_row = [] + for i in range(len(row)): + new_row.append(fmts[i](row[i])) + new_rows.append(pad(new_row, max_width, None)) + return new_rows diff --git a/examples/quickstart/jupyter-notebooks/druidapi/util.py b/examples/quickstart/jupyter-notebooks/druidapi/util.py new file mode 100644 index 000000000000..fdfc7c268fe8 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/util.py @@ -0,0 +1,30 @@ +# 
Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def is_blank(s): + """ + Returns True if the given string is None or blank (after stripping spaces), + False otherwise. + """ + return s is None or len(s.strip()) == 0 + +def dict_get(dict, key, default=None): + """ + Returns the value of key in the given dict, or the default value if + the key is not found. 
+ """ + if dict is None: + return default + return dict.get(key, default) From fc0d6118082366a3f1e4346e8f594f67d08225c7 Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Thu, 9 Feb 2023 16:55:14 -0800 Subject: [PATCH 02/19] Fixes --- .../Python_API_Tutorial.ipynb | 146 +++++++++++++----- .../quickstart/jupyter-notebooks/README.md | 2 +- 2 files changed, 109 insertions(+), 39 deletions(-) diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index 32cfe2811f75..35e1ab3055e8 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -313,7 +313,7 @@ }, { "cell_type": "markdown", - "id": "e618366f", + "id": "d352f89b", "metadata": {}, "source": [ "## SQL Client\n", @@ -324,7 +324,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "3a9ba661", + "id": "6ca60d3b", "metadata": {}, "outputs": [], "source": [ @@ -333,7 +333,7 @@ }, { "cell_type": "markdown", - "id": "dfcc1d63", + "id": "d6c187aa", "metadata": {}, "source": [ "We can start with getting a list of schemas." @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "271b3a67", + "id": "f963d859", "metadata": {}, "outputs": [ { @@ -372,7 +372,7 @@ }, { "cell_type": "markdown", - "id": "d2adb6fe", + "id": "43a7709b", "metadata": {}, "source": [ "We can also see the tables (or datasources) within any schema." @@ -381,7 +381,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "74ad71be", + "id": "364491e1", "metadata": {}, "outputs": [ { @@ -409,7 +409,7 @@ }, { "cell_type": "markdown", - "id": "915c5630", + "id": "5f92feb7", "metadata": {}, "source": [ "We see the list of datasources by default. You'll get an empty result if you have no datasources yet." 
@@ -418,7 +418,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "610d3444", + "id": "9fa20ad6", "metadata": {}, "outputs": [ { @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "2b6df996", + "id": "b9cf0467", "metadata": {}, "source": [ "We can easily run a query and show the results:" @@ -453,7 +453,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "e11bceba", + "id": "56d7f460", "metadata": {}, "outputs": [ { @@ -486,7 +486,7 @@ }, { "cell_type": "markdown", - "id": "c8aaffa9", + "id": "4b783904", "metadata": {}, "source": [ "The query above showed the same results as `show_tables()`. That is not surprising: `show_tables()` just runs this query for us." @@ -494,7 +494,7 @@ }, { "cell_type": "markdown", - "id": "be4c481a", + "id": "50edb8b4", "metadata": {}, "source": [ "The API also allows passing context parameters and query parameters using a request object. Druid will work out the query parameter type based on the Python type. Pass context values as a Python `dict`." @@ -503,7 +503,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "bb06e99d", + "id": "734235a5", "metadata": {}, "outputs": [ { @@ -539,7 +539,7 @@ }, { "cell_type": "markdown", - "id": "543945f3", + "id": "988f8ab6", "metadata": {}, "source": [ "The request has other features for advanced use cases: see the code for details. The query API actually returns a sql response object. Use this if you want to get the values directly, work with the schema, etc." 
@@ -548,7 +548,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "b1497972", + "id": "b6fa9f40", "metadata": {}, "outputs": [], "source": [ @@ -563,7 +563,7 @@ { "cell_type": "code", "execution_count": 29, - "id": "50f5384d", + "id": "19af8eda", "metadata": {}, "outputs": [ { @@ -582,7 +582,7 @@ { "cell_type": "code", "execution_count": 24, - "id": "7ba92de5", + "id": "fc60c0da", "metadata": {}, "outputs": [ { @@ -605,7 +605,7 @@ }, { "cell_type": "markdown", - "id": "2ecba1c2", + "id": "eee53cd4", "metadata": {}, "source": [ "The `show()` method uses this information for format an HTML table to present the results." @@ -613,7 +613,7 @@ }, { "cell_type": "markdown", - "id": "8d071790", + "id": "fd114d9e", "metadata": {}, "source": [ "## MSQ Ingestion\n", @@ -626,7 +626,7 @@ { "cell_type": "code", "execution_count": 31, - "id": "f1e3b9ac", + "id": "804a08ec", "metadata": {}, "outputs": [], "source": [ @@ -660,7 +660,7 @@ { "cell_type": "code", "execution_count": 32, - "id": "2d925dc6", + "id": "124f5d3f", "metadata": {}, "outputs": [], "source": [ @@ -669,7 +669,7 @@ }, { "cell_type": "markdown", - "id": "e5fcbf1b", + "id": "c5e5c604", "metadata": {}, "source": [ "MSQ reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments. Let's wait for the table to become ready." @@ -678,7 +678,7 @@ { "cell_type": "code", "execution_count": 34, - "id": "e08d7c30", + "id": "54d055a1", "metadata": {}, "outputs": [], "source": [ @@ -687,7 +687,7 @@ }, { "cell_type": "markdown", - "id": "11c21741", + "id": "8d1094f9", "metadata": {}, "source": [ "`describe_table()` tells us about the columns in a table." @@ -696,7 +696,7 @@ { "cell_type": "code", "execution_count": 35, - "id": "47e9701e", + "id": "d671a56e", "metadata": {}, "outputs": [ { @@ -732,7 +732,7 @@ }, { "cell_type": "markdown", - "id": "59127223", + "id": "553fdd96", "metadata": {}, "source": [ "We can also sample a few rows of data." 
@@ -741,7 +741,7 @@ { "cell_type": "code", "execution_count": 36, - "id": "087ad2bd", + "id": "f4e5e73c", "metadata": {}, "outputs": [ { @@ -775,7 +775,7 @@ }, { "cell_type": "markdown", - "id": "c823e2ee", + "id": "c1fd579f", "metadata": {}, "source": [ "## Datasource Client\n", @@ -786,7 +786,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "ced100ae", + "id": "deab98d1", "metadata": {}, "outputs": [], "source": [ @@ -796,7 +796,7 @@ }, { "cell_type": "markdown", - "id": "f9ba40eb", + "id": "e3d49d26", "metadata": {}, "source": [ "## Tasks Client\n", @@ -807,7 +807,7 @@ { "cell_type": "code", "execution_count": 40, - "id": "0a8123e0", + "id": "649c1eb6", "metadata": {}, "outputs": [ { @@ -851,7 +851,7 @@ }, { "cell_type": "markdown", - "id": "b7156347", + "id": "f0c58a11", "metadata": {}, "source": [ "## Constants\n", @@ -862,7 +862,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "415a1ec1", + "id": "701e909c", "metadata": {}, "outputs": [], "source": [ @@ -872,7 +872,7 @@ { "cell_type": "code", "execution_count": 39, - "id": "2e183999", + "id": "d6412c31", "metadata": {}, "outputs": [ { @@ -943,7 +943,7 @@ }, { "cell_type": "markdown", - "id": "877a0e63", + "id": "d09b3ca2", "metadata": {}, "source": [ "Using the constants avoids typos:" @@ -952,7 +952,7 @@ { "cell_type": "code", "execution_count": 41, - "id": "0e5a555b", + "id": "2589e362", "metadata": {}, "outputs": [ { @@ -981,7 +981,77 @@ }, { "cell_type": "markdown", - "id": "7b28893e", + "id": "98febf58", + "metadata": {}, + "source": [ + "## Tracing\n", + "\n", + "It is often handy to see what the Druid API is doing: what messages it sends to Druid. You may need to debug some function that isn't working as expected. Or, perhaps you want to see what is sent to Druid so you can replicate it in your own code. 
Either way, just turn on tracing:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d963e545", + "metadata": {}, + "outputs": [], + "source": [ + "druid.trace(True)" + ] + }, + { + "cell_type": "markdown", + "id": "ff1276b7", + "metadata": {}, + "source": [ + "Then, each call to Druid prints what it sends:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "80e1867b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " SELECT TABLE_NAME AS TableName\n", + " FROM INFORMATION_SCHEMA.TABLES\n", + " WHERE TABLE_SCHEMA = 'druid'\n", + " ORDER BY TABLE_NAME\n", + " \n", + "POST: http://localhost:8888/druid/v2/sql\n", + "body: {'query': \"\\n SELECT TABLE_NAME AS TableName\\n FROM INFORMATION_SCHEMA.TABLES\\n WHERE TABLE_SCHEMA = 'druid'\\n ORDER BY TABLE_NAME\\n \", 'resultFormat': 'object'}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "
TableName
myWiki
myWiki1
myWiki3
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_client.show_tables()" + ] + }, + { + "cell_type": "markdown", + "id": "587a1ef2", "metadata": {}, "source": [ "## Conclusion\n", @@ -992,7 +1062,7 @@ { "cell_type": "code", "execution_count": null, - "id": "056cbf27", + "id": "e8d6b4f2", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/quickstart/jupyter-notebooks/README.md b/examples/quickstart/jupyter-notebooks/README.md index ffc9128fedd8..f8874c466bba 100644 --- a/examples/quickstart/jupyter-notebooks/README.md +++ b/examples/quickstart/jupyter-notebooks/README.md @@ -1,7 +1,7 @@ # Jupyter Notebook tutorials for Druid If you are reading this in Jupyter, switch over to the [- START HERE -](- START HERE -.ipynb] -notebok instead. +notebook instead. \n", "\n", - "The Druid Python API is primarily intended to help with these notebook tutorials. It can also be used in a regular Python program, as long as the IPython dependencies are available.\n", + "This notebook provides a quick introduction to the Python wrapper around the [Druid REST API](api-tutorial.ipynb). This notebook assumes you are familiar with the basics of the REST API, and the [set of operations which Druid provides](https://druid.apache.org/docs/latest/operations/api-reference.html). This tutorial focuses on using Python to access those APIs rather than explaining the APIs themselves. The APIs themselves are covered in other notebooks that use the Python API.\n", "\n", - "The Druid Python API is a work in progress. We add API wrappers as needed for the notebook tutorials. If you find you need additional wrappers, please feel free to add them, and post a PR to Apache Druid with your additions.\n", + "The Druid Python API is primarily intended to help with these notebook tutorials. 
It can also be used in your own ad-hoc notebooks, or in a regular Python program.\n", + "\n", + "The Druid Python API is a work in progress. The Druid team adds API wrappers as needed for the notebook tutorials. If you find you need additional wrappers, please feel free to add them, and post a PR to Apache Druid with your additions.\n", "\n", "The API provides two levels of functions. Most are simple wrappers around Druid's REST APIs. Others add additional code to make the API easier to use. The SQL query interface is a prime example: extra code translates a simple SQL query into Druid's `SQLQuery` object and interprets the results into a form that can be displayed in a notebook.\n", "\n", - "We start by importing the `druidapi` package from the same folder as this notebook. The `styles()` calls adds some CSS styles needed to display results." + "Start by importing the `druidapi` package from the same folder as this notebook. The `styles()` calls adds some CSS styles needed to display results." ] }, { @@ -113,9 +132,9 @@ "id": "fb68a838", "metadata": {}, "source": [ - "Next we connect to our cluster by providing the router endpoint. Here we assume the cluster is on your local machine, using the default port. Go ahead and change this if your setup is different.\n", + "Next, connect to your cluster by providing the router endpoint. The code assumes the cluster is on your local machine, using the default port. Go ahead and change this if your setup is different.\n", "\n", - "The API uses the router to forward messages to each of Druid's services so that we don't have to keep track of the host and port for each service." + "The API uses the router to forward messages to each of Druid's services so that you don't have to keep track of the host and port for each service." ] }, { @@ -135,7 +154,7 @@ "source": [ "## Status Client\n", "\n", - "The SDK groups Druid REST API calls into categories, with a client for each. Let's start with the status client." 
+ "The SDK groups Druid REST API calls into categories, with a client for each. Start with the status client." ] }, { @@ -153,7 +172,7 @@ "id": "be992774", "metadata": {}, "source": [ - "Use the Python help() function to learn what methods are avaialble." + "Use the Python `help()` function to learn what methods are available." ] }, { @@ -240,7 +259,7 @@ "id": "70f3d578", "metadata": {}, "source": [ - "Druid servers return unexpected results if we make REST calls while Druid starts up. Let's wait until things are ready. The following will run until the server is ready. If you forgot to start your server, or the URL above is wrong, this will hang forever. Use the Kernel → Interrupt command to break out of the function. (Or, start your server. If your server refuses to start, then this Jupyter Notebook may be running on port 8888. See the [README](README.md) for how to start on a different port.)" + "Druid servers return unexpected results if you make REST calls while Druid starts up. The following will run until the server is ready. If you forgot to start your server, or the URL above is wrong, this will hang forever. Use the Kernel → Interrupt command to break out of the function. (Or, start your server. If your server refuses to start, then this Jupyter Notebook may be running on port 8888. See the [README](README.md) for how to start on a different port.)" ] }, { @@ -318,7 +337,7 @@ "source": [ "## SQL Client\n", "\n", - "Running SQL queries in a notebook is easy. Our goal here is to run a query and display results. The [pydruid](https://pythonhosted.org/pydruid/) library provides a robust way to run native queries, to run SQL queries, and to convert the results to various formats. Here our goal is just to interact with Druid." + "Running SQL queries in a notebook is easy. Here is an example of how to run a query and display results.
The [pydruid](https://pythonhosted.org/pydruid/) library provides a robust way to run native queries, to run SQL queries, and to convert the results to various formats. Here the goal is just to interact with Druid." ] }, { @@ -336,7 +355,7 @@ "id": "d051bc5e", "metadata": {}, "source": [ - "We can start with getting a list of schemas." + "Start by getting a list of schemas." ] }, { @@ -375,7 +394,7 @@ "id": "b8261ab0", "metadata": {}, "source": [ - "We can also see the tables (or datasources) within any schema." + "Then, retrieve the tables (or datasources) within any schema." ] }, { @@ -412,7 +431,7 @@ "id": "ff311595", "metadata": {}, "source": [ - "We see the list of datasources by default. You'll get an empty result if you have no datasources yet." + "The above shows the list of datasources by default. You'll get an empty result if you have no datasources yet." ] }, { @@ -447,7 +466,7 @@ "id": "7392e484", "metadata": {}, "source": [ - "We can easily run a query and show the results:" + "You can easily run a query and show the results:" ] }, { @@ -489,7 +508,7 @@ "id": "c6c4e1d4", "metadata": {}, "source": [ - "The query above showed the same results as `show_tables()`. That is not surprising: `show_tables()` just runs this query for us." + "The query above showed the same results as `show_tables()`. That is not surprising: `show_tables()` just runs this query for you." ] }, { @@ -620,7 +639,7 @@ "\n", "The SQL client also performs MSQ-based ingestion using `INSERT` or `REPLACE` statements. Use the extension check above to ensure that `druid-multi-stage-query` is loaded in Druid 26. (Later versions may have MSQ built in.)\n", "\n", - "An MSQ query is run using a different API: `task()`. This API returns a response object that describes the Overlord task which runs the MSQ query. For tutorials, our data is usually small enough we just want to wait for the ingestion to complete. We do that with the `run_task()` call which handles the waiting for us.
To illustrate, lets use a query that ingests a subset of columns, and includes a few data clean-up steps:" + "An MSQ query is run using a different API: `task()`. This API returns a response object that describes the Overlord task which runs the MSQ query. For tutorials, data is usually small enough you can wait for the ingestion to complete. Do that with the `run_task()` call which handles the waiting. To illustrate, here is a query that ingests a subset of columns, and includes a few data clean-up steps:" ] }, { @@ -672,7 +691,7 @@ "id": "ef4512f8", "metadata": {}, "source": [ - "MSQ reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments. Let's wait for the table to become ready." + "MSQ reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments. Wait for the table to become ready." ] }, { @@ -690,7 +709,7 @@ "id": "11d9c95a", "metadata": {}, "source": [ - "`describe_table()` tells us about the columns in a table." + "`describe_table()` lists the columns in a table." ] }, { @@ -735,7 +754,7 @@ "id": "936f57fb", "metadata": {}, "source": [ - "We can also sample a few rows of data." + "You can sample a few rows of data." ] }, { @@ -780,7 +799,7 @@ "source": [ "## Datasource Client\n", "\n", - "The Datasource client lets us perform operations on datasource objects. While the SQL layer lets us get metadata and do queries. the datasource client let's us work with the underlying segments. Explaining the full functionality is the topic of another notebook. For now, let's just use the datasource client to clean up the datasource created above. The `True` argument asks for \"if exists\" semantics so we don't get an error if the datasource was alredy deleted." + "The Datasource client lets you perform operations on datasource objects. The SQL layer allows you to get metadata and do queries. The datasource client works with the underlying segments. 
Explaining the full functionality is the topic of another notebook. For now, you can use the datasource client to clean up the datasource created above. The `True` argument asks for \"if exists\" semantics so you don't get an error if the datasource was already deleted." ] }, { @@ -893,7 +912,7 @@ "id": "2654e72c", "metadata": {}, "source": [ - "Use the REST client if you need to make calls that are not yet wrapped by the Python API, or if you want to do something special. To illustrate the client, we'll make some of the same calls as in the [Druid REST API notebook](api_tutorial.ipynb). For contrast, we also show the Python API equivalent.\n", + "Use the REST client if you need to make calls that are not yet wrapped by the Python API, or if you want to do something special. To illustrate the client, you can make some of the same calls as in the [Druid REST API notebook](api-tutorial.ipynb).\n", "\n", "The REST API maintains the Druid host: you just provide the specifc URL tail. There are methods to get or post JSON results. For example, to get status information:" ] @@ -1026,7 +1045,7 @@ "source": [ "## Constants\n", "\n", - "Druid has a large number of special constants: type names, options, etc. The `consts` module provides definitions for may of these:" + "Druid has a large number of special constants: type names, options, etc.
The `consts` module provides definitions for many of these:" ] }, { diff --git a/examples/quickstart/jupyter-notebooks/README.md b/examples/quickstart/jupyter-notebooks/README.md index e3de5f1b416c..823d2136d7fe 100644 --- a/examples/quickstart/jupyter-notebooks/README.md +++ b/examples/quickstart/jupyter-notebooks/README.md @@ -53,24 +53,25 @@ Make sure you meet the following requirements before starting the Jupyter-based # Install Jupyter Notebook pip3 install notebook ``` - - Start Jupyter: - - JupyterLab - ```bash - # Start JupyterLab on port 3001 - jupyter lab --port 3001 - ``` - - Jupyter Notebook - ```bash - # Start Jupyter Notebook on port 3001 - jupyter notebook --port 3001 - ``` + - Start Jupyter using either JupyterLab + ```bash + # Start JupyterLab on port 3001 + jupyter lab --port 3001 + ``` + + Or using Jupyter Notebook + ```bash + # Start Jupyter Notebook on port 3001 + jupyter notebook --port 3001 + ``` - An available Druid instance. You can use the `micro-quickstart` configuration described in [Quickstart](https://druid.apache.org/docs/latest/tutorials/index.html). The tutorials assume that you are using the quickstart, so no authentication or authorization is expected unless explicitly mentioned. - Druid developers can use a cluster launched for an integration test: + If you contribute to Druid, and work with Druid integration tests, can use a test cluster. + Assume you have an environment variable, `DRUID_DEV`, which identifies your Druid source repo. ```bash cd $DRUID_DEV @@ -79,11 +80,9 @@ Make sure you meet the following requirements before starting the Jupyter-based ./it.sh up ``` - Where `DRUID_DEV` points to your Druid source code repo, and `` is one - of the available integration test categories. See the integration test `README.md` - for details. + Replace `` with one of the available integration test categories. See the integration + test `README.md` for details. 
## Continue in Jupyter -Fire up Jupyter (see above) and navigate to the "- START HERE -" page for more -information. +Start Jupyter (see above) and navigate to the "- START HERE -" page for more information. diff --git a/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb b/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb index cf894e50a34f..f20e38ba3251 100644 --- a/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb @@ -7,7 +7,7 @@ "tags": [] }, "source": [ - "# Tutorial: Learn the basics of the Druid API\n", + "# Learn the basics of the Druid API\n", "\n", " + +# Python API for Druid + + +`druidapi` is a Python library to interact with all aspects of your +[Apache Druid](https://druid.apache.org/) cluster. +`druidapi` picks up where the venerable [pydruid](https://github.com/druid-io/pydruid) library +left off to include full SQL support and support for many of the Druid APIs. `druidapi` is usable +in any Python environment, but is optimized for use in Jupyter, providing a complete interactive +environment which complements the UI-based Druid console. The primary use of `druidapi` at present +is to support the set of tutorial notebooks provided in the parent directory. + +## Install + +At present, the best way to use `druidapi` is to clone the Druid repo itself: + +```bash +git clone git@github.com:apache/druid.git +``` + +`druidapi` is located in `examples/quickstart/jupyter-notebooks/druidapi/` + +Eventually we would like to create a Python package that can be installed with `pip`. Contributions +in that area are welcome. + +Dependencies are listed in `requirements.txt`. + +`druidapi` works against any version of Druid. Operations that exploit newer features obviously work +only against versions of Druid that support those features. + +## Getting Started + +To use `druidapi`, you must first import the library, then connect to your cluster by providing the URL to your Router instance.
The way that is done differs a bit between consumers. + +### From a Tutorial Jupyter Notebook + +The tutorial Jupyter notebooks in `examples/quickstart/jupyter-notebooks` reside in the same directory tree +as this library. We start the library using the Jupyter-oriented API which is able to render tables in +HTML. First, identify your router endpoint. For a local installation: + +```python +router_endpoint = 'http://localhost:8888' +``` + +Then, import the library, declare the `druidapi` CSS styles, and create a client to your cluster: + +```python +import druidapi +druid = druidapi.jupyter_client(router_endpoint) +``` + +The `jupyter_client` call defines a number of CSS styles to aid in displaying tabular results. It also +provides a "display" client that renders information as HTML tables. + +### From Any Other Jupyter Notebook + +If you create a Jupyter notebook outside of the `jupyter-notebooks` directory then you must tell Python where +to find the `druidapi` library. (This step is temporary until `druidapi` is properly packaged.) + +First, set a variable to point to the location where you cloned the Druid git repo: + +```python +druid_dev = "/path/to/Druid-repo" +``` + +Then, add the notebooks directory to Python's module search path: + +```python +import sys +sys.path.append(druid_dev + '/examples/quickstart/jupyter-notebooks/') +``` + +Now you can import `druidapi` and create a client as shown in the previous section. + +### From a Python Script + +`druidapi` works in any Python script. When run outside of a Jupyter notebook, the various "display" +commands revert to displaying a text (not HTML) format.
The steps are similar to those above: + +```python +druid_dev = "/path/to/Druid-repo" +import sys +sys.path.append(druid_dev + '/examples/quickstart/jupyter-notebooks/') +import druidapi +druid = druidapi.client(router_endpoint) +``` + +## Library Organization + +`druidapi` organizes Druid REST operations into various "clients," each of which provides operations +for one of Druid's functional areas. Obtain a client from the `druid` client created above. For +status operations: + +```python +status_client = druid.status +``` + +The set of clients is still under construction. The set at present includes the following. The +set of operations within each client is also partial, and includes only those operations used +within one of the tutorial notebooks. Contributions welcome to expand the scope. Clients are +available as properties on the `druid` object created above. + +* `status` - Status operations such as service health, property values, and so on. This client + is special: it works only with the Router. The Router does not proxy these calls to other nodes. + See the note below about how to get status for other nodes. +* `datasources` - Operations on datasources such as dropping a datasource. +* `tasks` - Work with Overlord tasks: status, reports, and more. +* `sql` - SQL query operations for both the interactive query engine and MSQ. +* `display` - A set of convenience operations to display results as lightly formatted tables + in either HTML (for Jupyter notebooks) or text (for other Python scripts). + +## Assumed Cluster Architecture + +`druidapi` assumes that you run a standard Druid cluster with a Router in front of the other nodes. +This design works well for most Druid clusters: + +* Run locally, such as the various quickstart clusters. +* Remote cluster on the same network. +* Druid cluster running under Docker Compose such as that explained in the Druid documentation. +* Druid integration test clusters launched via the Druid development `it.sh` command.
+* Druid clusters running under Kubernetes + +In all the Docker, Docker Compose and Kubernetes scenarios, the Router's port 8888 must be visible +to the machine running `druidapi`, perhaps via port mapping or a proxy. + +The Router is then responsible for routing Druid REST requests to the various other Druid nodes, +including those not visible outside of a private Docker or Kubernetes network. + +The one exception to this rule is if you want to perform a health check (i.e. the `/status` endpoint) +on a service other than the Router. These checks are _not_ proxied by the Router: you must connect to +the target service directly. + +## Status Operations + +When working with tutorials, a local Druid cluster, or a Druid integration test cluster, it is common +to start your cluster then immediately start performing `druidapi` operations. However, because Druid +is a distributed system, it can take some time for all the services to become ready. This seems to be +particularly true when starting a cluster with Docker Compose or Kubernetes on the local system. + +Therefore, the first operation is to wait for the cluster to become ready: + +```python +status_client = druid.status +status_client.wait_until_ready() +``` + +Without this step, your operations may mysteriously fail, and you'll wonder if you did something wrong. +Some clients retry operations multiple times in case a service is not yet ready. For typical scripts +against a stable cluster, the above line should be sufficient instead. This step is built into the +`jupyter_client()` method to ensure notebooks provide a good experience. + +If your notebook or script uses newer features, you should start by ensuring that the target Druid cluster +is of the correct version: + +```python +status_client.version +``` + +This check will prevent frustration if the notebook is used against previous releases.
+ +Similarly, if the notebook or script uses features defined in an extension, check that the required +extension is loaded: + +```python +status_client.properties['druid.extensions.loadList'] +``` + +## Display Client + +When run in a Jupyter notebook, it is often handy to format results for display. A special display +client performs operations _and_ formats them for display as HTML tables within the notebook. + +```python +display = druid.display +``` + +The most common methods are: + +* `sql(sql)` - Run a query and display the results as an HTML table. +* `schemas()` - Display the schemas defined in Druid. +* `tables(schema)` - Display the tables (datasources) in the given schema, `druid` by default. +* `table(name)` - Display the schema (list of columns) for the given table. The name can + be one part (`foo`) or two parts (`INFORMATION_SCHEMA.TABLES`). +* `function(name)` - Display the arguments for a table function defined in the catalog. + +The display client also has other methods to format data as a table, to display various kinds +of messages and so on. + +## Interactive Queries + +The original [`pydruid`](https://pythonhosted.org/pydruid/) library revolves around Druid +"native" queries. Most new applications now use SQL. `druidapi` provides two ways to run +queries, depending on whether you want to display the results (typical in a notebook), or +use the results in Python code.
You can run SQL queries using the SQL client: + +```python +sql_client = druid.sql +``` + +To obtain the results of a SQL query against the example Wikipedia table (datasource) in a "raw" form: + + +```python +sql = ''' +SELECT + channel, + COUNT(*) AS "count" +FROM wikipedia +GROUP BY channel +ORDER BY COUNT(*) DESC +LIMIT 5 +''' +sql_client.sql(sql) +``` + +Gives: + +```text +[{'channel': '#en.wikipedia', 'count': 6650}, + {'channel': '#sh.wikipedia', 'count': 3969}, + {'channel': '#sv.wikipedia', 'count': 1867}, + {'channel': '#ceb.wikipedia', 'count': 1808}, + {'channel': '#de.wikipedia', 'count': 1357}] +``` + +The raw results are handy when Python code consumes the results, or for a quick check. The raw results +can also be forwarded to advanced visualization tools such as Pandas. + +For simple visualization in notebooks (or as text in Python scripts), you can use the "display" client: + +```python +display = druid.display +display.sql(sql) +``` + +When run without HTML visualization, the above gives: + +```text +channel count +#en.wikipedia 6650 +#sh.wikipedia 3969 +#sv.wikipedia 1867 +#ceb.wikipedia 1808 +#de.wikipedia 1357 +``` + +Within Jupyter, the results are formatted as an HTML table. + +### Advanced Queries + +In addition to the SQL text, Druid also lets you specify: + +* A query context +* Query parameters +* Result format options + +The Druid `SqlQuery` object specifies these options. You can build up a Python equivalent: + +```python +sql = ''' +SELECT * +FROM INFORMATION_SCHEMA.SCHEMATA +WHERE SCHEMA_NAME = ?
+''' + +sql_query = { + 'query': sql, + 'parameters': [ + {'type': consts.SQL_VARCHAR_TYPE, 'value': 'druid'} + ], + 'resultFormat': consts.SQL_OBJECT +} +``` + +However, the easier approach is to let `druidapi` handle the work for you using a SQL request: + +```python +req = sql_client.sql_request(sql) +req.add_parameter('druid') +``` + +Either way, when you submit the query in this form, you get a SQL response back: + +```python +resp = sql_client.sql_query(req) +``` + +The SQL response wraps the REST response. First, we ensure that the request worked: + +```python +resp.ok +``` + +If the request failed, we can obtain the error message: + +```python +resp.error_message +``` + +If the request succeeded, we can obtain the results in a variety of ways. The easiest is to obtain +the data as a list of Java objects. This is the form shown in the "raw" example above. This works +only if you use the default ('objects') result format. + +```python +resp.rows +``` + +You can also obtain the schema of the result: + +```python +resp.schema +``` + +The result is a list of `ColumnSchema` objects. Get column information from the `name`, `sql_type` +and `druid_type` fields in each object. + +For other formats, you can obtain the REST payload directly: + +```python +resp.results +``` + +Use the `results()` method if you requested other formats, such as CSV. The `rows` and `schema` properties +are not available for these other result formats. + +The result can also format the results as a text or HTML table, depending on how you created the client: + +```python +resp.show() +``` + +In fact, the display client `sql()` method uses the `resp.show()` method internally, which in turn uses the +`rows` and `schema` properties. + +### Run a Query and Return Results + +The above forms are handy for interactive use in a notebook.
If you just need to run a query to use the results +in code, just do the following: + +```python +rows = sql_client.sql(sql) +``` + +This form also takes a set of arguments so that you can use Python to parameterize the query: + +```python +sql = 'SELECT * FROM {}' +rows = sql_client.sql(sql, ['myTable']) +``` + +## MSQ Queries + +The SQL client can also run an MSQ query. See the `sql-tutorial.ipynb` notebook for examples. First define the +query: + +```python +sql = ''' +INSERT INTO myTable ... +``` + +Then launch an ingestion task: + +```python +task = sql_client.task(sql) +``` + +To learn the Overlord task ID: + +```python +task.id +``` + +You can use the tasks client to track the status, or let the task object do it for you: + +```python +task.wait_done() +``` + +A quirk of Druid is that MSQ reports task completion as soon as ingestion is done. However, it takes a +while for Druid to load the resulting segments. Wait for the table to become ready. Use the following +to wait for the table to become queryable: + +```python +sql_client.wait_until_ready('myTable') +``` + +## Datasource Operations + +To get information about a datasource, prefer to query the `INFORMATION_SCHEMA` tables, or use the methods +in the display client. Use the datasource client for other operations. + +```python +datasources = druid.datasources +``` + +To delete a datasource: + +```python +datasources.drop('myWiki', True) +``` + +The True argument asks for "if exists" semantics so you don't get an error if the datasource does not exist. + +## REST Client + +The `druidapi` is based on a simple REST client which is itself based on the Requests library. If you +need to use Druid REST APIs not yet wrapped by this library, you can use the REST client directly. +(If you find such APIs, we encourage you to add methods to the library and contribute them to Druid.) + +The REST client implements the common patterns seen in the Druid REST API. 
You can create a client directly: + +```python +from druidapi.rest import DruidRestClient +rest_client = DruidRestClient("http://localhost:8888") +``` + +Or, if you have already created the Druid client, you can reuse the existing REST client. This is how +the various other clients work internally. + +```python +rest_client = druid.rest +``` + +The REST API maintains the Druid host: you just provide the specific URL tail. There are methods to get or +post JSON results. For example, to get status information: + +```python +rest_client.get_json('/status') +``` + +A quick comparison of the three approaches (Requests, REST client, Python client): + +Status: + +* Requests: `session.get(druid_host + '/status').json()` +* REST client: `rest_client.get_json('/status')` +* Status client: `status_client.status()` + +Health: + +* Requests: `session.get(druid_host + '/status/health').json()` +* REST client: `rest_client.get_json('/status/health')` +* Status client: `status_client.is_healthy()` + +Ingest data: + +* Requests: See the [REST tutorial](api_tutorial.ipynb) +* REST client: as the REST tutorial, but use `rest_client.post_json('/druid/v2/sql/task', sql_request)` and + `rest_client.get_json(f"/druid/indexer/v1/task/{ingestion_taskId}/status")` +* SQL client: `sql_client.run_task(sql)`, also a form for a full SQL request. + +List datasources: + +* Requests: `session.get(druid_host + '/druid/coordinator/v1/datasources').json()` +* REST client: `rest_client.get_json('/druid/coordinator/v1/datasources')` +* Datasources client: `ds_client.names()` + +Query data, where `sql_request` is a properly-formatted `SqlRequest` dictionary: + +* Requests: `session.post(druid_host + '/druid/v2/sql', json=sql_request).json()` +* REST client: `rest_client.post_json('/druid/v2/sql', sql_request)` +* SQL Client: `sql_client.show(sql)`, where `sql` is the query text + +In general, you have to provide all the details for the Requests library.
The REST client handles the low-level repetitious bits. The Python clients provide methods that encapsulate the specifics of the URLS and return formats. + +## Constants + +Druid has a large number of special constants: type names, options, etc. The consts module provides definitions for many of these: + +```python +from druidapi import consts +help(consts) +``` diff --git a/examples/quickstart/jupyter-notebooks/druidapi/__init__.py b/examples/quickstart/jupyter-notebooks/druidapi/__init__.py index 55c9769fa281..2734544ea48e 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/__init__.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/__init__.py @@ -14,20 +14,22 @@ # limitations under the License. from .druid import DruidClient -from . import display, html_table -def client(endpoint): - return DruidClient(endpoint) - -def styles(): - show_as_html() - html_table.styles() - -def show_as_text(): - display.display.text() +def jupyter_client(endpoint) -> DruidClient: + ''' + Create a Druid client configured to display results as HTML withing a Jupyter notebook. + Waits for the cluster to become ready to avoid intermitent problems when using Druid. + ''' + from .html import HtmlDisplayClient + druid = DruidClient(endpoint, HtmlDisplayClient()) + druid.status.wait_until_ready() + return druid -def show_as_html(): - display.display.html() - -def _display(): - return display.display +def client(endpoint) -> DruidClient: + ''' + Create a Druid client for use in Python scripts that uses a text-based format for + displaying results. Does not wait for the cluster to be ready: clients should call + `status().wait_until_ready()` before making other Druid calls if there is a chance + that the cluster has not yet fully started. 
+ ''' + return DruidClient(endpoint) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/display.py b/examples/quickstart/jupyter-notebooks/druidapi/display.py index e211e2178817..e4a139c1f6d8 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/display.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/display.py @@ -13,53 +13,134 @@ # See the License for the specific language governing permissions and # limitations under the License. -TEXT_TABLE = 0 -HTML_TABLE = 1 +from . import consts -class Display: +class DisplayClient: + ''' + Abstract base class to display various kinds of results. + ''' - def __init__(self): - self.format = TEXT_TABLE - self.html_initialized = False + def __init__(self, druid=None): + # If the client is None, it must be backfilled by the caller. + # This case occurs only when creating the DruidClient to avoid + # a circular depencency. + self._druid = druid - def text(self): - self.format = TEXT_TABLE + # Basic display operations - def html(self): - self.format = HTML_TABLE - if not self.html_initialized: - from .html_table import styles - styles() - self.html_initialized = True - - def table(self): - if self.format == HTML_TABLE: - from .html_table import HtmlTable - return HtmlTable() - else: - from .text_table import TextTable - return TextTable() - - def show_object_list(self, objects, cols): - list_to_table(self.table(), objects, cols) + def text(self, msg): + raise NotImplementedError() + + def alert(self, msg): + raise NotImplementedError() - def show_object(self, obj, labels): - object_to_table(self.table(), obj, labels) + def error(self, msg): + raise NotImplementedError() - def show_error(self, msg): - from .html_table import html_error - html_error('ERROR: ' + msg + '') + # Tabular formatting + + def new_table(self): + raise NotImplementedError() - def show_message(self, msg): - from .html_table import html - html('' + msg + '') + def show_table(self, table): + raise NotImplementedError() + + def 
data_table(self, rows, cols=None): + ''' + Display a table of data with the optional column headings. + + Parameters + ---------- + objects: list[list] + The data to display as a list of lists, where each inner list represents one + row of data. Rows should be of the same width: ragged rows will display blank + cells. Data can be of any scalar type and is formatted correctly for that type. + + cols: list[str] + Optional list of column headings. + ''' + table = self.new_table() + table.rows(rows) + table.headers(cols) + self.show_table(table) + + def object_list(self, objects, cols=None): + ''' + Display a list of objects represented as dictionaries with optional headings. + + Parameters + ---------- + objects: list[dict] + List of dictionaries: one dictionary for each row. + + cols: dict, Default = None + A list of column headings in the form `{'key': 'label'}` + ''' + table = self.new_table() + table.from_object_list(objects, cols) + self.show_table(table) + + def object(self, obj, labels=None): + ''' + Display a single object represented as a dictionary with optional headings. + The object is displayed in two columns: keys and values. + + Parameters + ---------- + objects: list[dict] + List of dictionaries: one dictionary for each row. + + labels: list, Default = None + A list of column headings in the form `['key', 'value']`. Default headings + are used if the lables are not provided. + ''' + table = self.new_table() + table.from_object(obj, labels) + self.show_table(table) + + # SQL formatting + + def sql(self, sql): + ''' + Run a query and display the result as a table. + + Parameters + ---------- + query + The query as either a string or a SqlRequest object. + ''' + self._druid.sql.sql_query(sql).show(display=self) + + def table(self, table_name): + ''' + Describe a table by returning the list of columns in the table. + + Parameters + ---------- + table_name str + The name of the table as either "table" or "schema.table". 
+ If the form is "table", then the 'druid' schema is assumed. + ''' + self._druid.sql._schema_query(table_name).show(display=self) + + def function(self, table_name): + ''' + Retrieve the list of parameters for a partial external table defined in + the Druid catalog. -def list_to_table(table, objects, cols = None): - table.from_object_list(objects, cols) - return table.show() + Parameters + ---------- + table_name str + The name of the table as either "table" or "schema.table". + If the form is "table", then the 'ext' schema is assumed. + ''' + return self._druid.sql._function_args_query(table_name).show(display=self) -def object_to_table(table, obj, labels): - table.from_object(obj, labels) - table.show() + def schemas(self): + ''' + Display the list of schemas available in Druid. + ''' + self._druid.sql._schemas_query().show() -display = Display() + def tables(self, schema=consts.DRUID_SCHEMA): + self._druid.sql._tables_query(schema).show(display=self) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druid.py b/examples/quickstart/jupyter-notebooks/druidapi/druid.py index 9f8db3ddc6ee..3e0d154068b4 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/druid.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/druid.py @@ -26,14 +26,21 @@ class DruidClient: specialized "clients" that group many of Druid's REST API calls. ''' - def __init__(self, router_endpoint): + def __init__(self, router_endpoint, display_client=None): self.rest_client = DruidRestClient(router_endpoint) self.status_client = None self.catalog_client = None self.sql_client = None self.tasks_client = None self.datasource_client = None + if display_client: + self.display_client = display_client + else: + from .text import TextDisplayClient + self.display_client = TextDisplayClient() + self.display_client._druid = self + @property def rest(self): ''' Returns the low-level REST client. 
Useful for debugging and to access REST API @@ -52,24 +59,27 @@ def trace(self, enable=True): ''' self.rest_client.enable_trace(enable) - def status(self, endpoint=None) -> StatusClient: + @property + def status(self) -> StatusClient: + ''' + Returns the status client for the Router service. + ''' + if not self.status_client: + self.status_client = StatusClient(self.rest_client) + return self.status_client + + def status_for(self, endpoint) -> StatusClient: ''' Returns the status client for a Druid service. Parameters ---------- endpoint: str - The URL for a Druid service. If None, then returns the status client - for the Router. + The URL for a Druid service. ''' - if not endpoint: - if not self.status_client: - self.status_client = StatusClient(self.rest_client) - return self.status_client - else: - endpoint_client = DruidRestClient(endpoint) - return StatusClient(endpoint_client, True) + return StatusClient(DruidRestClient(endpoint), True) + @property def catalog(self) -> CatalogClient: ''' Returns the catalog client to interact with the Druid catalog. @@ -78,6 +88,7 @@ def catalog(self) -> CatalogClient: self.catalog_client = CatalogClient(self.rest_client) return self.catalog_client + @property def sql(self) -> QueryClient: ''' Returns the SQL query client to submit interactive or MSQ queries. @@ -86,6 +97,7 @@ def sql(self) -> QueryClient: self.sql_client = QueryClient(self) return self.sql_client + @property def tasks(self) -> TaskClient: ''' Returns the Overlord tasks client to submit and track tasks. @@ -94,6 +106,7 @@ def tasks(self) -> TaskClient: self.tasks_client = TaskClient(self.rest_client) return self.tasks_client + @property def datasources(self) -> DatasourceClient: ''' Returns the Coordinator datasources client to manipulate datasources. 
@@ -104,6 +117,10 @@ def datasources(self) -> DatasourceClient: self.datasource_client = DatasourceClient(self.rest_client) return self.datasource_client + @property + def display(self): + return self.display_client + def close(self): self.rest_client.close() self.rest_client = None diff --git a/examples/quickstart/jupyter-notebooks/druidapi/html_table.py b/examples/quickstart/jupyter-notebooks/druidapi/html.py similarity index 78% rename from examples/quickstart/jupyter-notebooks/druidapi/html_table.py rename to examples/quickstart/jupyter-notebooks/druidapi/html.py index b3451cd8f439..e871c2b78550 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/html_table.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/html.py @@ -14,8 +14,9 @@ # limitations under the License. from IPython.display import display, HTML -from .base_table import BaseTable from html import escape +from .display import DisplayClient +from .base_table import BaseTable STYLES = ''' @@ -53,15 +58,9 @@ def escape_for_html(s): return s.replace('$', '\\$') def html(s): - s = '
' + escape_for_html(s) + '
' - display(HTML(s)) - -def html_error(s): - s = '
' + escape_for_html(s.replace('\n', '
')) + '
' display(HTML(s)) -def styles(): - display(HTML(STYLES)) +initialized = False alignments = ['druid-left', 'druid-center', 'druid-right'] @@ -90,10 +89,6 @@ def format(self) -> str: s += self.gen_rows(rows) return s + '\n' - def show(self, rows): - self._rows = rows - html(self.format()) - def gen_header(self, headers): if not headers: return '' @@ -120,3 +115,27 @@ def col_align(self, col): if col >= len(self._align): return None return self._align[col] + +class HtmlDisplayClient(DisplayClient): + + def __init__(self): + DisplayClient.__init__(self) + global initialized + if not initialized: + display(HTML(STYLES)) + initialized = True + + def text(self, msg): + html('
' + escape_for_html(msg) + '
') + + def alert(self, msg): + html('
' + escape_for_html(msg.replace('\n', '
')) + '
') + + def error(self, msg): + html('
ERROR: ' + escape_for_html(msg.replace('\n', '
')) + '
') + + def new_table(self): + return HtmlTable() + + def show_table(self, table): + self.text(table.format()) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/requirements.txt b/examples/quickstart/jupyter-notebooks/druidapi/requirements.txt new file mode 100644 index 000000000000..b67ab75d9f68 --- /dev/null +++ b/examples/quickstart/jupyter-notebooks/druidapi/requirements.txt @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------ + +# Requirements for the druidapi library. +# See: https://pip.pypa.io/en/stable/reference/requirements-file-format/ +# +# Requirements are both few and simple at present.
+ +requests diff --git a/examples/quickstart/jupyter-notebooks/druidapi/rest.py b/examples/quickstart/jupyter-notebooks/druidapi/rest.py index 72b7e685d067..435b99ef16f7 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/rest.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/rest.py @@ -37,7 +37,6 @@ def check_error(response): code = response.status_code if code == requests.codes.ok or code == requests.codes.accepted: return - error = None json = None try: json = response.json() diff --git a/examples/quickstart/jupyter-notebooks/druidapi/sql.py b/examples/quickstart/jupyter-notebooks/druidapi/sql.py index b72be879ebbd..3a9faf038d45 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/sql.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/sql.py @@ -14,14 +14,12 @@ # limitations under the License. import time, requests -from . import consts, display -from .consts import ROUTER_BASE +from . import consts from .util import dict_get, split_table_name from .error import DruidError, ClientError -REQ_ROUTER_QUERY = ROUTER_BASE -REQ_ROUTER_SQL = ROUTER_BASE + '/sql' -REQ_ROUTER_SQL_TASK = REQ_ROUTER_SQL + '/task' +REQ_SQL = consts.ROUTER_BASE + '/sql' +REQ_SQL_TASK = REQ_SQL + '/task' class SqlRequest: @@ -272,22 +270,29 @@ def __init__(self, request, response): except KeyError: self._error = 'Query returned no query ID' + @property + def _druid(self): + return self.request.query_client.druid_client + + @property def result_format(self): return self.request.result_format() + @property def ok(self): ''' Reports if the query succeeded. - The query rows and schema are available only if ok() returns True. + The query rows and schema are available only if ok is True. ''' return is_response_ok(self.http_response) + @property def error(self): ''' If the query fails, returns the error, if any provided by Druid. 
''' - if self.ok(): + if self.ok: return None if self._error: return self._error @@ -297,10 +302,11 @@ def error(self): return None return {'error': 'HTTP {}'.format(self.http_response.status_code)} - def error_msg(self): - if self.ok(): + @property + def error_message(self): + if self.ok: return None - err = self.error() + err = self.error if not err: return 'unknown' if type(err) is str: @@ -315,35 +321,40 @@ def error_msg(self): return msg return msg + ': ' + text + @property def id(self): ''' Returns the unique identifier for the query. ''' return self._id + @property def non_null(self): - if not self.ok(): + if not self.ok: return None - if self.result_format() != consts.SQL_OBJECT: + if self.result_format != consts.SQL_OBJECT: return None - return filter_null_cols(self.rows()) + return filter_null_cols(self.rows) + @property def as_array(self): - if self.result_format() == consts.SQL_OBJECT: + if self.result_format == consts.SQL_OBJECT: rows = [] - for obj in self.rows(): + for obj in self.rows: rows.append([v for v in obj.values()]) return rows else: - return self.rows() + return self.rows + @property def json(self): - if not self.ok(): + if not self.ok: return None if not self._json: self._json = self.http_response.json() return self._json + @property def rows(self): ''' Returns the rows of data for the query. @@ -352,12 +363,13 @@ def rows(self): attempt to map the format into an array of rows of some sort. ''' if not self._rows: - json = self.json() + json = self.json if not json: return self.http_response.text - self._rows = parse_rows(self.result_format(), self.request.context, json) + self._rows = parse_rows(self.result_format, self.request.context, json) return self._rows + @property def schema(self): ''' Returns the data schema as a list of ColumnSchema objects. @@ -367,29 +379,39 @@ def schema(self): extract the schema from the query results. 
''' if not self._schema: - self._schema = parse_schema(self.result_format(), self.request.context, self.json()) + self._schema = parse_schema(self.result_format, self.request.context, self.json) return self._schema - def show(self, non_null=False): + def _display(self, display): + return self._druid.display if not display else display + + def show(self, non_null=False, display=None): + display = self._display(display) + if not self.ok: + display.error(self.error_message) + return data = None if non_null: - data = self.non_null() + data = self.non_null if not data: - data = self.as_array() + data = self.as_array if not data: - display.display.show_message('Query returned no results') + display.alert('Query returned no results') return - disp = display.display.table() - disp.headers([c.name for c in self.schema()]) - disp.show(data) + display.data_table(data, [c.name for c in self.schema]) - def show_schema(self): - disp = display.display.table() - disp.headers(['Name', 'SQL Type', 'Druid Type']) + def show_schema(self, display=None): + display = self._display(display) + if not self.ok: + display.error(self.error_message) + return data = [] - for c in self.schema(): + for c in self.schema: data.append([c.name, c.sql_type, c.druid_type]) - disp.show(data) + if not data: + display.alert('Query returned no schema') + return + display.data_table(data, ['Name', 'SQL Type', 'Druid Type']) class QueryTaskResult: ''' @@ -433,6 +455,7 @@ def __init__(self, request, response): self._id = self.response_obj['taskId'] self._state = self.response_obj['state'] + @property def ok(self): ''' Reports if the query completed successfully or is still running. 
@@ -440,12 +463,17 @@ def ok(self): ''' return not self._error + @property def id(self): return self._id + def _druid(self): + return self._request.query_client.druid_client + def _tasks(self): - return self._request.query_client.druid_client.tasks() + return self._druid().tasks + @property def status(self): ''' Polls Druid for an update on the query run status. @@ -467,6 +495,7 @@ def status(self): self._error = self._status['status']['errorMsg'] return self._status + @property def done(self): ''' Reports whether the query is done. The query is done when the Overlord task @@ -475,12 +504,14 @@ def done(self): ''' return self._state == consts.FAILED_STATE or self._state == consts.SUCCESS_STATE + @property def succeeded(self): ''' Reports if the query succeeded. ''' return self._state == consts.SUCCESS_STATE + @property def state(self): ''' Reports the task state from the Overlord task. @@ -489,10 +520,12 @@ def state(self): ''' return self._state + @property def error(self): return self._error - def error_msg(self): + @property + def error_message(self): err = self.error() if not err: return 'unknown' @@ -517,12 +550,12 @@ def join(self): Returns True for success, False for failure. ''' - if not self.done(): - self.status() - while not self.done(): + if not self.done: + self.status + while not self.done: time.sleep(0.5) - self.status() - return self.succeeded() + self.status + return self.succeeded def check_valid(self): if not self._id: @@ -535,15 +568,16 @@ def wait_done(self): once this method returns without raising an error. ''' if not self.join(): - raise DruidError('Query failed: ' + self.error_msg()) + raise DruidError('Query failed: ' + self.error_message()) def wait(self): ''' Wait for a SELECT query to finish running, then return the rows from the query. 
''' self.wait_done() - return self.rows() + return self.rows + @property def reports(self) -> dict: self.check_valid() if not self._reports: @@ -551,15 +585,17 @@ def reports(self) -> dict: self._reports = self._tasks().task_reports(self._id) return self._reports + @property def results(self): if not self._results: rpts = self.reports() self._results = rpts['multiStageQuery']['payload']['results'] return self._results + @property def schema(self): if not self._schema: - results = self.results() + results = self.results sig = results['signature'] sql_types = results['sqlTypeNames'] size = len(sig) @@ -568,26 +604,39 @@ def schema(self): self._schema.append(ColumnSchema(sig[i]['name'], sql_types[i], sig[i]['type'])) return self._schema + @property def rows(self): if not self._rows: - results = self.results() + results = self.results self._rows = results['results'] return self._rows - def show(self, non_null=False): - data = self.rows() + def _display(self, display): + return self._druid().display if not display else display + + def show(self, non_null=False, display=None): + display = self._display(display) + if not self.done: + display.alert('Task has not finished running') + return + if not self.succeeded: + display.error(self.error_message) + return + data = self.rows if non_null: data = filter_null_cols(data) - disp = display.display.table() - disp.headers([c.name for c in self.schema()]) - disp.show(data) - + if not data: + display.alert('Query returned no {}rows'.format("visible " if non_null else '')) + return + display.data_table(data, [c.name for c in self.schema]) + class QueryClient: def __init__(self, druid, rest_client=None): self.druid_client = druid self._rest_client = druid.rest_client if not rest_client else rest_client + @property def rest_client(self): return self._rest_client @@ -603,7 +652,7 @@ def _prepare_query(self, request): request = self.sql_request(request) if not request.sql: raise ClientError('No query provided.') - if 
self.rest_client().trace: + if self.rest_client.trace: print(request.sql) if not query_obj: query_obj = request.to_request() @@ -639,7 +688,7 @@ def sql_query(self, request) -> SqlQueryResult: options to return data in the required format. ''' request, query_obj = self._prepare_query(request) - r = self.rest_client().post_only_json(REQ_ROUTER_SQL, query_obj, headers=request.headers) + r = self.rest_client.post_only_json(REQ_SQL, query_obj, headers=request.headers) return SqlQueryResult(request, r) def sql(self, sql, *args) -> list: @@ -658,9 +707,9 @@ def sql(self, sql, *args) -> list: if len(args) > 0: sql = sql.format(*args) resp = self.sql_query(sql) - if resp.ok(): - return resp.rows() - raise ClientError(resp.error_msg()) + if resp.ok: + return resp.rows + raise ClientError(resp.error_message) def explain_sql(self, query): ''' @@ -684,21 +733,6 @@ def sql_request(self, sql) -> SqlRequest: ''' return SqlRequest(self, sql) - def show(self, query): - ''' - Run a query and display the result as a table. - - Parameters - ---------- - query - The query as either a string or a SqlRequest object. - ''' - result = self.sql_query(query) - if result.ok(): - result.show() - else: - display.display.show_error(result.error_msg()) - def task(self, query) -> QueryTaskResult: ''' Submit an MSQ query. Returns a QueryTaskResult to track the task. @@ -709,7 +743,7 @@ def task(self, query) -> QueryTaskResult: The query as either a string or a SqlRequest object. ''' request, query_obj = self._prepare_query(query) - r = self.rest_client().post_only_json(REQ_ROUTER_SQL_TASK, query_obj, headers=request.headers) + r = self.rest_client.post_only_json(REQ_SQL_TASK, query_obj, headers=request.headers) return QueryTaskResult(request, r) def run_task(self, query): @@ -722,8 +756,8 @@ def run_task(self, query): The query as either a string or a SqlRequest object. 
''' resp = self.task(query) - if not resp.ok(): - raise ClientError(resp.error_msg()) + if not resp.ok: + raise ClientError(resp.error_message) resp.wait_done() def _tables_query(self, schema): @@ -743,10 +777,7 @@ def tables(self, schema=consts.DRUID_SCHEMA): schema The schema to query, `druid` by default. ''' - return self._tables_query(schema).rows() - - def show_tables(self, schema=consts.DRUID_SCHEMA): - self._tables_query(schema).show() + return self._tables_query(schema).rows def _schemas_query(self): return self.sql_query(''' @@ -756,13 +787,7 @@ def _schemas_query(self): ''') def schemas(self): - return self._schemas_query().rows() - - def show_schemas(self): - ''' - Display the list of schemas available in Druid. - ''' - self._schemas_query().show() + return self._schemas_query().rows def _schema_query(self, table_name): parts = split_table_name(table_name, consts.DRUID_SCHEMA) @@ -788,20 +813,8 @@ def table_schema(self, table_name): The name of the table as either "table" or "schema.table". If the form is "table", then the 'druid' schema is assumed. ''' - return self._schema_query(table_name).rows() + return self._schema_query(table_name).rows - def describe_table(self, table_name): - ''' - Describe a table by returning the list of columns in the table. - - Parameters - ---------- - table_name str - The name of the table as either "table" or "schema.table". - If the form is "table", then the 'druid' schema is assumed. - ''' - self._schema_query(table_name).show() - def _function_args_query(self, table_name): parts = split_table_name(table_name, consts.EXT_SCHEMA) return self.sql_query(''' @@ -829,20 +842,7 @@ def function_parameters(self, table_name): The name of the table as either "table" or "schema.table". If the form is "table", then the 'ext' schema is assumed. 
''' - return self._function_args_query(table_name).rows() - - def describe_function(self, table_name): - ''' - Retrieve the list of parameters for a partial external table defined in - the Druid catalog. - - Parameters - ---------- - table_name str - The name of the table as either "table" or "schema.table". - If the form is "table", then the 'ext' schema is assumed. - ''' - return self._function_args_query(table_name).show() + return self._function_args_query(table_name).rows def wait_until_ready(self, table_name): ''' @@ -853,11 +853,10 @@ def wait_until_ready(self, table_name): table_name str The name of a datasource in the 'druid' schema. ''' - self.druid_client.datasources().wait_until_ready(table_name) + self.druid_client.datasources.wait_until_ready(table_name) while True: try: self.sql('SELECT 1 FROM "{}" LIMIT 1'.format(table_name)); return except Exception: time.sleep(0.5) - diff --git a/examples/quickstart/jupyter-notebooks/druidapi/status.py b/examples/quickstart/jupyter-notebooks/druidapi/status.py index 5b29edb07e97..00e7c8f711f1 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/status.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/status.py @@ -39,16 +39,17 @@ class StatusClient: ''' def __init__(self, rest_client, owns_client=False): - self.client = rest_client + self.rest_client = rest_client self.owns_client = owns_client def close(self): if self.owns_client: - self.client.close() - self.client = None + self.rest_client.close() + self.rest_client = None #-------- Common -------- + @property def status(self): ''' Returns the Druid version, loaded extensions, memory used, total memory @@ -56,8 +57,9 @@ def status(self): GET `/status` ''' - return self.client.get_json(REQ_STATUS) + return self.rest_client.get_json(REQ_STATUS) + @property def is_healthy(self) -> bool: ''' Returns `True` if the node is healthy, `False`` otherwise. 
Check service health @@ -68,7 +70,7 @@ def is_healthy(self) -> bool: GET `/status/health` ''' try: - return self.client.get_json(REQ_HEALTH) + return self.rest_client.get_json(REQ_HEALTH) except Exception: return False @@ -77,9 +79,10 @@ def wait_until_ready(self): Sleeps until the node reports itself as healthy. Will run forever if the node is down or never becomes healthy. ''' - while not self.is_healthy(): + while not self.is_healthy: time.sleep(0.5) + @property def properties(self) -> map: ''' Returns the effective set of Java properties used by the service, including @@ -88,8 +91,9 @@ def properties(self) -> map: GET `/status/properties` ''' - return self.client.get_json(REQ_PROPERTIES) + return self.rest_client.get_json(REQ_PROPERTIES) + @property def in_cluster(self): ''' Returns `True` if the node is visible within the cluster, `False` if not. @@ -99,20 +103,22 @@ def in_cluster(self): GET `/status/selfDiscovered/status` ''' try: - result = self.client.get_json(REQ_IN_CLUSTER) + result = self.rest_client.get_json(REQ_IN_CLUSTER) return result.get('selfDiscovered', False) except ConnectionError: return False + @property def version(self): ''' Returns the version of the Druid server. If the server is running in an IDE, the version will be empty. ''' - return self.status().get('version') + return self.status.get('version') + @property def brokers(self): ''' Retrieve the list of broker nodes known to this node. Must be called on the Router. 
''' - return self.client.get_json(REQ_BROKERS) + return self.rest_client.get_json(REQ_BROKERS) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/text_table.py b/examples/quickstart/jupyter-notebooks/druidapi/text.py similarity index 92% rename from examples/quickstart/jupyter-notebooks/druidapi/text_table.py rename to examples/quickstart/jupyter-notebooks/druidapi/text.py index d2b3e09f7921..c8f1f4d907ca 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/text_table.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/text.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .base_table import pad -from .base_table import BaseTable +from .display import DisplayClient +from .base_table import pad, BaseTable alignments = ['', '^', '>'] @@ -131,10 +131,6 @@ def format(self): table_rows = self.formatter(self.compute_def(self._rows)) return '\n'.join(table_rows) - def show(self, rows): - self._rows = rows - print(self.format()) - def format_rows(self, rows, min_width, max_width): if not self._col_fmt: return self.default_row_format(rows, min_width, max_width) @@ -163,3 +159,23 @@ def apply_row_formats(self, rows, max_width): new_row.append(fmts[i](row[i])) new_rows.append(pad(new_row, max_width, None)) return new_rows + +class TextDisplayClient(DisplayClient): + + def __init__(self): + DisplayClient.__init__(self) + + def text(self, msg): + print(msg) + + def alert(self, msg): + print("Alert:", msg) + + def error(self, msg): + print("ERROR:", msg) + + def new_table(self): + return TextTable() + + def show_table(self, table): + print(table.format()) diff --git a/examples/quickstart/jupyter-notebooks/sql-tutorial.ipynb b/examples/quickstart/jupyter-notebooks/sql-tutorial.ipynb index 96b184cf1385..558afe0f1cb0 100644 --- a/examples/quickstart/jupyter-notebooks/sql-tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/sql-tutorial.ipynb @@ -61,94 +61,14 @@ }, { 
"cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "b7f08a52", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import druidapi\n", - "druidapi.styles()\n", "\n", "# druid_host is the hostname and port for your Druid deployment. \n", "# In a distributed environment, you can point to other Druid services.\n", @@ -156,26 +76,9 @@ "druid_host = \"http://localhost:8888\"\n", "dataSourceName = \"wikipedia-sql-tutorial\"\n", "\n", - "druid = druidapi.client(druid_host)\n", - "sql_client = druid.sql()" - ] - }, - { - "cell_type": "markdown", - "id": "cb815aa4", - "metadata": {}, - "source": [ - "In case you just started your server, wait until the server is ready. The left margin will show an asterisk `[*]` while the notebook waits for your server to become ready." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "c09f1b52", - "metadata": {}, - "outputs": [], - "source": [ - "druid.status().wait_until_ready()" + "druid = druidapi.jupyter_client(druid_host)\n", + "display = druid.display\n", + "sql_client = druid.sql" ] }, { @@ -211,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "045f782c-74d8-4447-9487-529071812b51", "metadata": {}, "outputs": [], @@ -239,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "cca15307", "metadata": {}, "outputs": [], @@ -264,37 +167,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "6e5d8de0", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
channel
#sv.wikipedia
#ja.wikipedia
#en.wikipedia
#en.wikipedia
#sh.wikipedia
#en.wikipedia
#pl.wikipedia
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "sql = '''\n", "SELECT \"channel\" FROM \"wikipedia-sql-tutorial\" LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -323,32 +204,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "c7a86e2e", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
COLUMN_NAMEDATA_TYPE
__timeTIMESTAMP
addedBIGINT
channelVARCHAR
cityNameVARCHAR
commentVARCHAR
commentLengthBIGINT
countryIsoCodeVARCHAR
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "sql = '''\n", "SELECT COLUMN_NAME, DATA_TYPE \n", @@ -356,7 +215,7 @@ "WHERE \"TABLE_SCHEMA\" = 'druid' AND \"TABLE_NAME\" = 'wikipedia-sql-tutorial' \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -369,12 +228,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "1ac6c410", "metadata": {}, "outputs": [], "source": [ - "sql_client.describe_table(dataSourceName)" + "display.table(dataSourceName)" ] }, { @@ -409,7 +268,7 @@ "GROUP BY channel, page\n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -487,7 +346,7 @@ " AND namespace = 'Main' \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -524,7 +383,7 @@ "WHERE \"cityName\" <> '' AND \"countryIsoCode\" = 'US' \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -565,7 +424,7 @@ "GROUP BY channel \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -595,7 +454,7 @@ "GROUP BY cityName, countryName \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -646,7 +505,7 @@ "WHERE cityName = 'Mexico City' \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -681,7 +540,7 @@ "ORDER BY \"Number of events\" ASC \n", "LIMIT 5\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -720,7 +579,7 @@ "GROUP BY countryName , FLOOR(__time TO HOUR) \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { @@ -753,7 +612,7 @@ "GROUP BY countryName, countryIsoCode \n", "LIMIT 7\n", "'''\n", - "sql_client.show(sql)" + "display.sql(sql)" ] }, { From 0333d798a3a043e66c9852386ec8cff2be7aea26 Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Sun, 26 Feb 2023 17:31:04 -0800 Subject: [PATCH 12/19] Minor edits --- 
.../Python_API_Tutorial.ipynb | 14 ++++---- .../jupyter-notebooks/druidapi/README.md | 34 +++++++++++-------- .../jupyter-notebooks/druidapi/sql.py | 8 ++--- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index f6dea1ca4365..1ccf4420175f 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "769c363b", + "id": "012b2e61", "metadata": {}, "source": [ "## Display Client\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ccb6e119", + "id": "f867f1f0", "metadata": {}, "outputs": [], "source": [ @@ -254,7 +254,7 @@ }, { "cell_type": "markdown", - "id": "59bee6e9", + "id": "f414d145", "metadata": {}, "source": [ "## SQL Client\n", @@ -265,7 +265,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25310f40", + "id": "9951e976", "metadata": {}, "outputs": [], "source": [ @@ -353,7 +353,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1dbf12a8", + "id": "8dba807b", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ }, { "cell_type": "markdown", - "id": "c7c109ae", + "id": "99f8db7b", "metadata": {}, "source": [ "The display and SQL clients are intened for exploratory queries. The [pydruid](https://pythonhosted.org/pydruid/) library provides a robust way to run native queries, to run SQL queries, and to convert the results to various formats." @@ -619,7 +619,7 @@ "* REST client: `rest_client.post_json('/druid/v2/sql', sql_request)`\n", "* SQL Client: `sql_client.show(sql)`, where `sql` is the query text\n", "\n", - "In general, you have to provide the all the details for the Requests library. The REST client handles the low-level repetitious bits. 
The Python clients provide methods that encapsulate the specifics of the URLS and return formats." + "In general, you have to provide the all the details for the Requests library. The REST client handles the low-level repetitious bits. The Python clients provide methods that encapsulate the specifics of the URLs and return formats." ] }, { diff --git a/examples/quickstart/jupyter-notebooks/druidapi/README.md b/examples/quickstart/jupyter-notebooks/druidapi/README.md index fdc3f558c5a1..94bb1ddfbb26 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/README.md +++ b/examples/quickstart/jupyter-notebooks/druidapi/README.md @@ -78,7 +78,7 @@ to find the `druidapi` library. (This step is temporary until `druidapi` is prop First, set a variable to point to the location where you cloned the Druid git repo: ```python -druid_dev = "/path/to/Druid-repo` +druid_dev = '/path/to/Druid-repo' ``` Then, add the notebooks directory to Python's module search path: @@ -96,7 +96,7 @@ Now you can import `druidapi` and create a client as shown in the previous secti commands revert to displaying a text (not HTML) format. The steps are similar to those above: ```python -druid_dev = "/path/to/Druid-repo` +druid_dev = '/path/to/Druid-repo' import sys sys.path.append(drudi_dev + '/examples/quickstart/jupyter-notebooks/') import druidapi @@ -115,12 +115,12 @@ status_client = druid.status The set of clients is still under construction. The set at present includes the following. The set of operations within each client is also partial, and includes only those operations used -within one of the tutorial notebooks. Contributions welcome to expand the scope. Clients are +within one of the tutorial notebooks. Contributions are welcome to expand the scope. Clients are available as properties on the `druid` object created above. * `status` - Status operations such as service health, property values, and so on. This client is special: it works only with the Router. 
The Router does not proxy these calls to other nodes. - See the note above about how to get status for other nodes. + Use the `status_for()` method to get status for other nodes. * `datasources` - Operations on datasources such as dropping a datasource. * `tasks` - Work with Overlord tasks: status, reports, and more. * `sql` - SQL query operations for both the interactive query engine and MSQ. @@ -138,7 +138,7 @@ This design works well for most Druid clusters: * Druid integration test clusters launched via the Druid development `it.sh` command. * Druid clusters running under Kubernetes -In all the Docker, Docker Compose and Kubernetes scenaris, the Router's port 8888 must be visible +In all the Docker, Docker Compose and Kubernetes scenarios, the Router's port (typically 8888) must be visible to the machine running `druidapi`, perhaps via port mapping or a proxy. The Router is then responsible for routing Druid REST requests to the various other Druid nodes, @@ -164,7 +164,7 @@ status_client.wait_until_ready() Without this step, your operations may mysteriously fail, and you'll wonder if you did something wrong. Some clients retry operations multiple times in case a service is not yet ready. For typical scripts -against a stable cluster, the above line should be sufficient instead. This step is build into the +against a stable cluster, the above line should be sufficient instead. This step is built into the `jupyter_client()` method to ensure notebooks provide a good exerience. If your notebook or script uses newer features, you should start by ensuring that the target Druid cluster @@ -344,11 +344,11 @@ are not available for these other result formats. The result can also format the results as a text or HTML table, depending on how you created the client: ```python -resp.show +resp.show() ``` In fact, the display client `sql()` method uses the `resp.show()` method internally, which in turn uses the -`rows()` and `schema()` methods. +`rows` and `schema` properties.
### Run a Query and Return Results @@ -359,7 +359,7 @@ in code, just do the following: rows = sql_client.sql(sql) ``` -This form also takes a set of arguments so that you can use Python to parameterize the query: +This form takes a set of arguments so that you can use Python to parameterize the query: ```python sql = 'SELECT * FROM {}' @@ -374,6 +374,7 @@ query: ```python sql = ''' INSERT INTO myTable ... +''' ``` Then launch an ingestion task: @@ -391,12 +392,17 @@ task.id You can use the tasks client to track the status, or let the task object do it for you: ```python -task.wait_done() +task.wait_until_done() +``` + +You can combine the run-and-wait operations into a single call: + +```python +task = sql_client.run_task(sql) ``` A quirk of Druid is that MSQ reports task completion as soon as ingestion is done. However, it takes a -while for Druid to load the resulting segments. Wait for the table to become ready. Use the following -to wait for the table to become queryable: +while for Druid to load the resulting segments, so you must wait for the table to become queryable: ```python sql_client.wait_until_ready('myTable') @@ -462,7 +468,7 @@ Health: Ingest data: -* Requests: See the [REST tutorial](api_tutorial.ipynb) +* Requests: See the REST tutorial. * REST client: as the REST tutorial, but use `rest_client.post_json('/druid/v2/sql/task', sql_request)` and `rest_client.get_json(f"/druid/indexer/v1/task/{ingestion_taskId}/status")` * SQL client: `sql_client.run_task(sql)`, also a form for a full SQL request. @@ -479,7 +485,7 @@ Query data, where `sql_request` is a properly-formatted `SqlResquest` dictionary * REST client: `rest_client.post_json('/druid/v2/sql', sql_request)` * SQL Client: `sql_client.show(sql)`, where `sql` is the query text -In general, you have to provide the all the details for the Requests library. The REST client handles the low-level repetitious bits. 
The Python clients provide methods that encapsulate the specifics of the URLS and return formats. +In general, you have to provide the all the details for the Requests library. The REST client handles the low-level repetitious bits. The Python clients provide methods that encapsulate the specifics of the URLs and return formats. ## Constants diff --git a/examples/quickstart/jupyter-notebooks/druidapi/sql.py b/examples/quickstart/jupyter-notebooks/druidapi/sql.py index 3a9faf038d45..91fb70529790 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/sql.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/sql.py @@ -561,7 +561,7 @@ def check_valid(self): if not self._id: raise ClientError('Operation is invalid on a failed query') - def wait_done(self): + def wait_until_done(self): ''' Wait for the task to complete. Raises an error if the task fails. A caller can proceed to do something with the successful result @@ -574,7 +574,7 @@ def wait(self): ''' Wait for a SELECT query to finish running, then return the rows from the query. ''' - self.wait_done() + self.wait_until_done() return self.rows @property @@ -754,11 +754,11 @@ def run_task(self, query): ---------- query The query as either a string or a SqlRequest object. 
- ''' + ''' resp = self.task(query) if not resp.ok: raise ClientError(resp.error_message) - resp.wait_done() + resp.wait_until_done() def _tables_query(self, schema): return self.sql_query(''' From e4bc97d6cd2b1853a1846fc1cda69d3c88ef1d7e Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Tue, 28 Feb 2023 15:16:07 -0800 Subject: [PATCH 13/19] Address review comments --- .../jupyter-notebooks/druidapi/README.md | 8 +++--- .../jupyter-notebooks/druidapi/datasource.py | 2 +- .../jupyter-notebooks/druidapi/rest.py | 8 +++--- .../jupyter-notebooks/druidapi/sql.py | 24 +++++++++--------- .../jupyter-notebooks/druidapi/status.py | 4 +-- .../jupyter-notebooks/druidapi/tasks.py | 25 ++++++++++--------- 6 files changed, 36 insertions(+), 35 deletions(-) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/README.md b/examples/quickstart/jupyter-notebooks/druidapi/README.md index 94bb1ddfbb26..519be144a660 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/README.md +++ b/examples/quickstart/jupyter-notebooks/druidapi/README.md @@ -22,7 +22,7 @@ `druidapi` is a Python library to interact with all aspects of your [Apache Druid](https://druid.apache.org/) cluster. -`druidapi` picks up where the venerable [pydruid](https://github.com/druid-io/pydruid) lbrary +`druidapi` picks up where the venerable [pydruid](https://github.com/druid-io/pydruid) library left off to include full SQL support and support for many of of Druid APIs. `druidapi` is usable in any Python environment, but is optimized for use in Jupyter, providing a complete interactive environment which complements the UI-based Druid console. The primary use of `druidapi` at present @@ -43,18 +43,18 @@ in that area are welcome. Dependencies are listed in `requirements.txt`. -`druidapi` works against any version of Druid. Opeations that exploit newer features obviously work +`druidapi` works against any version of Druid. 
Operations that exploit newer features obviously work only against versions of Druid that support those features. ## Getting Started -To use `druidapi`, you must first import the library, then connect to your cluster by providing the URL to your Router instance. The way that is done differs a bit between consumers. +To use `druidapi`, first import the library, then connect to your cluster by providing the URL to your Router instance. The way that is done differs a bit between consumers. ### From a Tutorial Jupyter Notebook The tutorial Jupyter notebooks in `examples/quickstart/jupyter-notebooks` reside in the same directory tree as this library. We start the library using the Jupyter-oriented API which is able to render tables in -HTML. First, identify your router endpoint. For a local installation: +HTML. First, identify your Router endpoint. Use the following for a local installation: ```python router_endpoint = 'http://localhost:8888' diff --git a/examples/quickstart/jupyter-notebooks/druidapi/datasource.py b/examples/quickstart/jupyter-notebooks/druidapi/datasource.py index 837a6997fa89..7a12630d1042 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/datasource.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/datasource.py @@ -27,7 +27,7 @@ class DatasourceClient: ''' - Client for datasource APIs. Prefer to use the SQL to query the + Client for datasource APIs. Prefer to use SQL to query the INFORMATION_SCHEMA to obtain information. See https://druid.apache.org/docs/latest/operations/api-reference.html#datasources diff --git a/examples/quickstart/jupyter-notebooks/druidapi/rest.py b/examples/quickstart/jupyter-notebooks/druidapi/rest.py index 435b99ef16f7..b9d62083afca 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/rest.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/rest.py @@ -23,7 +23,7 @@ def check_error(response): Raises an HttpError from the requests library if the response code is neither OK (200) nor Accepted (202). 
- Druid's REST API is inconsistent with how it resports errors. Some APIs return + Druid's REST API is inconsistent with how it reports errors. Some APIs return an error as a JSON object. Others return a text message. Still others return nothing at all. With the JSON format, sometimes the error returns an 'errorMessage' field, other times only a generic 'error' field. @@ -41,7 +41,7 @@ def check_error(response): try: json = response.json() except Exception: - # If we can't get the JSON, raise a Requets error + # If we can't get the JSON, raise a Requests error response.raise_for_status() # Druid JSON payload. Try to make sense of the error @@ -134,7 +134,7 @@ def get(self, req, args=None, params=None, require_ok=True) -> requests.Request: params: dict, default = None Optional map of query variables to send in - the URL. Query parameters are the name/values pairs + the URL. Query parameters are the name/value pairs that appear after the `?` marker. require_ok: bool, default = True @@ -178,7 +178,7 @@ def post(self, req, body, args=None, headers=None, require_ok=True) -> requests. def post_json(self, req, body, args=None, headers=None, params=None) -> requests.Response: ''' - Issues a POST request for the given URL on this node, with a JSON request, returning + Issues a POST request for the given URL on this node, with a JSON request. Returns the JSON response. Parameters diff --git a/examples/quickstart/jupyter-notebooks/druidapi/sql.py b/examples/quickstart/jupyter-notebooks/druidapi/sql.py index 91fb70529790..778f4bfd8337 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/sql.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/sql.py @@ -416,7 +416,7 @@ def show_schema(self, display=None): class QueryTaskResult: ''' Response from an asynchronous MSQ query, which may be an ingestion or a retrieval - query. Can monitor task progreess and wait for the task to complete. For a SELECT query, + query. 
Can monitor task progress and wait for the task to complete. For a SELECT query, obtains the rows from the task reports. There are no results for an ingestion query, just a success/failure status. @@ -459,7 +459,7 @@ def __init__(self, request, response): def ok(self): ''' Reports if the query completed successfully or is still running. - Use suceeded() to check if the task is done and successful. + Use succeeded() to check if the task is done and successful. ''' return not self._error @@ -499,7 +499,7 @@ def status(self): def done(self): ''' Reports whether the query is done. The query is done when the Overlord task - which runs the query completes. A completed task is one with a status of either + that runs the query completes. A completed task is one with a status of either SUCCESS or FAILED. ''' return self._state == consts.FAILED_STATE or self._state == consts.SUCCESS_STATE @@ -572,7 +572,7 @@ def wait_until_done(self): def wait(self): ''' - Wait for a SELECT query to finish running, then return the rows from the query. + Waits for a SELECT query to finish running, then returns the rows from the query. ''' self.wait_until_done() return self.rows @@ -693,7 +693,7 @@ def sql_query(self, request) -> SqlQueryResult: def sql(self, sql, *args) -> list: ''' - Run a SQL query and return the results. Typically used to retieve data as part + Run a SQL query and return the results. Typically used to retrieve data as part of another operation, rathre than to display results to the user. Parameters @@ -729,13 +729,13 @@ def explain_sql(self, query): def sql_request(self, sql) -> SqlRequest: ''' - Create a SqlRequest object for the given SQL query text. + Creates a SqlRequest object for the given SQL query text. ''' return SqlRequest(self, sql) def task(self, query) -> QueryTaskResult: ''' - Submit an MSQ query. Returns a QueryTaskResult to track the task. + Submits an MSQ query. Returns a QueryTaskResult to track the task.
Parameters ---------- @@ -748,7 +748,7 @@ def task(self, query) -> QueryTaskResult: def run_task(self, query): ''' - Submit an MSQ query and wait for completion. Returns a QueryTaskResult to track the task. + Submits an MSQ query and waits for completion. Returns a QueryTaskResult to track the task. Parameters ---------- @@ -770,7 +770,7 @@ def _tables_query(self, schema): def tables(self, schema=consts.DRUID_SCHEMA): ''' - Return a list of tables in the given schema. + Returns a list of tables in the given schema. Parameters ---------- @@ -804,7 +804,7 @@ def _schema_query(self, table_name): def table_schema(self, table_name): ''' - Return the schema of a table as an array of dictionaries of the + Returns the schema of a table as an array of dictionaries of the form {"Position": "", "Name": "", "Type": ""} Parameters @@ -831,7 +831,7 @@ def _function_args_query(self, table_name): def function_parameters(self, table_name): ''' - Retrieve the list of parameters for a partial external table defined in + Returns the list of parameters for a partial external table defined in the Druid catalog. Returns the parameters as an array of objects in the form {"Position": , "Parameter": "", "Type": "", "Optional": True|False} @@ -846,7 +846,7 @@ def function_parameters(self, table_name): def wait_until_ready(self, table_name): ''' - Wait for a datasource to be loaded in the cluster, and to become available to SQL. + Waits for a datasource to be loaded in the cluster, and to become available to SQL. Parameters ---------- diff --git a/examples/quickstart/jupyter-notebooks/druidapi/status.py b/examples/quickstart/jupyter-notebooks/druidapi/status.py index 00e7c8f711f1..bf26db7e2091 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/status.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/status.py @@ -62,7 +62,7 @@ def status(self): @property def is_healthy(self) -> bool: ''' - Returns `True` if the node is healthy, `False`` otherwise.
Check service health + Returns `True` if the node is healthy, `False` otherwise. Check service health before using other Druid API methods to ensure the server is ready. See also `wait_until_ready()`. @@ -119,6 +119,6 @@ def version(self): @property def brokers(self): ''' - Retrieve the list of broker nodes known to this node. Must be called on the Router. + Returns the list of broker nodes known to this node. Must be called on the Router. ''' return self.rest_client.get_json(REQ_BROKERS) diff --git a/examples/quickstart/jupyter-notebooks/druidapi/tasks.py b/examples/quickstart/jupyter-notebooks/druidapi/tasks.py index bf5c1b235a92..9f5945b884c6 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/tasks.py +++ b/examples/quickstart/jupyter-notebooks/druidapi/tasks.py @@ -43,12 +43,16 @@ def tasks(self, state=None, table=None, task_type=None, max=None, created_time_i Filter list of tasks by task state. Valid options are "running", "complete", "waiting", and "pending". Constants are defined for each of these in the `consts` file. - table str, default = None - Return tasks for the this one Druid table (datasource). + + table: str, default = None + Return tasks for only one Druid table (datasource). + created_time_interval: str, Default = None Return tasks created within the specified interval. + max: int, default = None Maximum number of "complete" tasks to return. Only applies when state is set to "complete". + task_type: str, default = None Filter tasks by task type. @@ -59,7 +63,7 @@ def tasks(self, state=None, table=None, task_type=None, max=None, created_time_i params = {} if state: params['state'] = state - if table : + if table: params['datasource'] = table if task_type: params['type'] = task_type @@ -80,7 +84,7 @@ def task(self, task_id) -> dict: Returns ------- - The task payload as a Python dictionary. + The task payload as a Python dictionary.
Reference --------- @@ -99,8 +103,8 @@ def task_status(self, task_id) -> dict: Returns ------- - The task status as a Python dictionary. See the `consts` module for a list - of status codes. + The task status as a Python dictionary. See the `consts` module for a list + of status codes. Reference --------- @@ -119,7 +123,7 @@ def task_reports(self, task_id) -> dict: Returns ------- - The task reports as a Python dictionary. + The task reports as a Python dictionary. Reference --------- @@ -140,11 +144,8 @@ def submit_task(self, payload): Returns ------- - The REST response. + The REST response. - Returns - ------- - The REST response. Reference --------- `POST /druid/indexer/v1/task` @@ -181,7 +182,7 @@ def shut_down_tasks_for(self, table): Returns ------- - The REST response. + The REST response. Reference --------- From 06829d50d6a1f64e67fcd3e67e283dee045d36c3 Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Tue, 28 Feb 2023 15:54:39 -0800 Subject: [PATCH 14/19] Fix typo --- docs/tutorials/tutorial-jupyter-index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/tutorial-jupyter-index.md b/docs/tutorials/tutorial-jupyter-index.md index 66d919be4f5d..da5625997810 100644 --- a/docs/tutorials/tutorial-jupyter-index.md +++ b/docs/tutorials/tutorial-jupyter-index.md @@ -79,7 +79,7 @@ Make sure you meet the following requirements before starting the Jupyter-based ./it.sh up ``` - Replace `` with one of the available integration test categories. See the integration + Replace `` with one of the available integration test categories. See the integration test `README.md` for details. 
## Simple Druid API From a93c4815e94ed5d104cce7594cd8976e41f8cb5b Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Thu, 2 Mar 2023 13:57:40 -0800 Subject: [PATCH 15/19] Address review comments --- examples/quickstart/jupyter-notebooks/-START HERE-.ipynb | 8 -------- .../jupyter-notebooks/Python_API_Tutorial.ipynb | 8 -------- examples/quickstart/jupyter-notebooks/api-tutorial.ipynb | 8 -------- examples/quickstart/jupyter-notebooks/druidapi/README.md | 8 ++++---- 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb b/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb index bfd865eb213b..9c88edc896f5 100644 --- a/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb +++ b/examples/quickstart/jupyter-notebooks/-START HERE-.ipynb @@ -130,14 +130,6 @@ "Note that you can skip the second PR, if you just copy the prefix link from one of the\n", "existing notebook links when doing your first PR." ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e6f2a0e", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index 1ccf4420175f..33ac4d0be611 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -717,14 +717,6 @@ "\n", "This notebook have you a whirlwind tour of the Python Druid API: just enough to check your cluster, ingest some data with MSQ and query that data. Druid has many more APIs. As noted earlier, the Python API is a work in progress: the team adds new wrappers as needed for tutorials. Your [contributions](https://github.com/apache/druid/pulls) and [feedback](https://github.com/apache/druid/issues) are welcome." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c9a9e4c", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb b/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb index 6e07ed6c0013..7bb722466201 100644 --- a/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/api-tutorial.ipynb @@ -667,14 +667,6 @@ "\n", "You can also try out the [druid-client](https://github.com/paul-rogers/druid-client), a Python library for Druid created by Paul Rogers, a Druid contributor. A simplified version of that library is included with these tutorials. See [the Python API Tutorial](Python_API_Tutorial.ipynb) for an overview. That tutorial shows how to do the same tasks as this one, but in a simpler form: focusing on the Druid actions and not the mechanics of the REST API." ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "386a05e5", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/quickstart/jupyter-notebooks/druidapi/README.md b/examples/quickstart/jupyter-notebooks/druidapi/README.md index 519be144a660..0c9218a90408 100644 --- a/examples/quickstart/jupyter-notebooks/druidapi/README.md +++ b/examples/quickstart/jupyter-notebooks/druidapi/README.md @@ -85,7 +85,7 @@ Then, add the notebooks directory to Python's module search path: ```python import sys -sys.path.append(drudi_dev + '/examples/quickstart/jupyter-notebooks/') +sys.path.append(druid_dev + '/examples/quickstart/jupyter-notebooks/') ``` Now you can import `druidapi` and create a client as shown in the previous section. @@ -98,7 +98,7 @@ commands revert to displaying a text (not HTML) format. 
The steps are similar to ```python druid_dev = '/path/to/Druid-repo' import sys -sys.path.append(drudi_dev + '/examples/quickstart/jupyter-notebooks/') +sys.path.append(druid_dev + '/examples/quickstart/jupyter-notebooks/') import druidapi druid = druidapi.client(router_endpoint) ``` @@ -185,7 +185,7 @@ status_client.properties['druid.extensions.loadList'] ## Display Client -When run in a Jypter notebook, it is often handy to format results for display. A special display +When run in a Jupyter notebook, it is often handy to format results for display. A special display client performs operations _and_ formats them for display as HTML tables within the notebook. ```python @@ -479,7 +479,7 @@ List datasources: * REST client: `rest_client.get_json('/druid/coordinator/v1/datasources')` * Datasources client: `ds_client.names()` -Query data, where `sql_request` is a properly-formatted `SqlResquest` dictionary: +Query data, where `sql_request` is a properly-formatted `SqlRequest` dictionary: * Requests: `session.post(druid_host + '/druid/v2/sql', json=sql_request).json()` * REST client: `rest_client.post_json('/druid/v2/sql', sql_request)` From bc8c8c69ab426f2bab457091c2c23cf39451c4ce Mon Sep 17 00:00:00 2001 From: Charles Smith Date: Thu, 2 Mar 2023 15:16:51 -0800 Subject: [PATCH 16/19] Apply suggestions from code review commit suggestions Co-authored-by: Victoria Lim --- .../quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index 33ac4d0be611..643b1e0919f5 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -34,7 +34,7 @@ "\n", "The API provides two levels of functions. Most are simple wrappers around Druid's REST APIs. 
Others add additional code to make the API easier to use. The SQL query interface is a prime example: extra code translates a simple SQL query into Druid's `SQLQuery` object and interprets the results into a form that can be displayed in a notebook.\n", "\n", - "This notebook contains sample output to allow it to work a bit like a reference. To run it yourself, start by using the `Kernel` → `Restart & Clear Output` menu command to clear the sample output.\n", + "This notebook contains sample output to allow it to function as a reference. To run it yourself, start by using the `Kernel` → `Restart & Clear Output` menu command to clear the sample output.\n", "\n", "Start by importing the `druidapi` package from the same folder as this notebook." ] @@ -58,7 +58,7 @@ "\n", "The API uses the router to forward messages to each of Druid's services so that you don't have to keep track of the host and port for each service.\n", "\n", - "The `jupyter_client()` method waits for the cluster to be ready, and sets up the client to display tables and messages as HTML. To use this code without waiting and without HTML formatting, use the `client()` method instead." + "The `jupyter_client()` method waits for the cluster to be ready and sets up the client to display tables and messages as HTML. To use this code without waiting and without HTML formatting, use the `client()` method instead." 
] }, { From a3a0526bc71e33c08278e4fbbbd2075f3bd5ddf9 Mon Sep 17 00:00:00 2001 From: Charles Smith Date: Thu, 2 Mar 2023 15:20:07 -0800 Subject: [PATCH 17/19] Update examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb Co-authored-by: Victoria Lim --- examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index 643b1e0919f5..51239edb4337 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -613,7 +613,7 @@ "* REST client: `rest_client.get_json('/druid/coordinator/v1/datasources')`\n", "* Datasources client: `ds_client.names()`\n", "\n", - "Query data, where `sql_request` is a properly-formatted `SqlResquest` dictionary:\n", + "Query data, where `sql_request` is a properly formatted `SqlRequest` dictionary:\n", "\n", "* Requests: `session.post(druid_host + '/druid/v2/sql', json=sql_request).json()`\n", "* REST client: `rest_client.post_json('/druid/v2/sql', sql_request)`\n", From c47b19f78b5c97bcf64a1f9ce329611c2bbcfa81 Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Fri, 3 Mar 2023 16:05:05 -0800 Subject: [PATCH 18/19] Fix a few more typos --- examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index 51239edb4337..3d1e50d96631 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -613,7 +613,7 @@ "* REST client: `rest_client.get_json('/druid/coordinator/v1/datasources')`\n", "* Datasources client: `ds_client.names()`\n", "\n", - "Query data, where 
`sql_request` is a properly formatted `SqlRequest` dictionary:\n", + "Query data, where `sql_request` is a properly-formatted `SqlRequest` dictionary:\n", "\n", "* Requests: `session.post(druid_host + '/druid/v2/sql', json=sql_request).json()`\n", "* REST client: `rest_client.post_json('/druid/v2/sql', sql_request)`\n", From be972ea25172c86365fe279cc1ff97005280f418 Mon Sep 17 00:00:00 2001 From: Paul Rogers Date: Fri, 3 Mar 2023 16:20:54 -0800 Subject: [PATCH 19/19] Fix typos --- examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb index 3d1e50d96631..51239edb4337 100644 --- a/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb +++ b/examples/quickstart/jupyter-notebooks/Python_API_Tutorial.ipynb @@ -613,7 +613,7 @@ "* REST client: `rest_client.get_json('/druid/coordinator/v1/datasources')`\n", "* Datasources client: `ds_client.names()`\n", "\n", - "Query data, where `sql_request` is a properly-formatted `SqlRequest` dictionary:\n", + "Query data, where `sql_request` is a properly formatted `SqlRequest` dictionary:\n", "\n", "* Requests: `session.post(druid_host + '/druid/v2/sql', json=sql_request).json()`\n", "* REST client: `rest_client.post_json('/druid/v2/sql', sql_request)`\n",