diff --git a/.gitignore b/.gitignore index d7632abb49f0..b6a1e63c4192 100644 --- a/.gitignore +++ b/.gitignore @@ -78,7 +78,7 @@ python/deps python/docs/_site/ python/docs/source/development/errors.rst python/docs/source/reference/**/api/ -python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst +python/docs/source/tutorial/pandas_on_spark/supported_pandas_api.rst python/test_coverage/coverage_data python/test_coverage/htmlcov python/pyspark/python diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index 0200d094185d..e30584002fcb 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -21,7 +21,7 @@ ./bin/spark-submit examples/src/main/python/sql/arrow.py """ -# NOTE that this file is imported in user guide in PySpark documentation. +# NOTE that this file is imported in tutorials in PySpark documentation. # The codes are referred via line numbers. See also `literalinclude` directive in Sphinx. import pandas as pd from typing import Iterable diff --git a/examples/src/main/python/sql/udtf.py b/examples/src/main/python/sql/udtf.py index b83b38daf597..bff5182f8e16 100644 --- a/examples/src/main/python/sql/udtf.py +++ b/examples/src/main/python/sql/udtf.py @@ -21,7 +21,7 @@ ./bin/spark-submit examples/src/main/python/sql/udtf.py """ -# NOTE that this file is imported in the User Guides in PySpark documentation. +# NOTE that this file is imported in the tutorials in PySpark documentation. # The codes are referred via line numbers. See also `literalinclude` directive in Sphinx. from pyspark.sql import SparkSession from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index 20c13cd768de..3059ac1c2d22 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -23,11 +23,11 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath('.')) -# generate user_guide/pandas_on_spark/supported_pandas_api.rst +# generate tutorial/pandas_on_spark/supported_pandas_api.rst from pyspark.pandas.supported_api_gen import generate_supported_api output_rst_file_path = ( - "%s/user_guide/pandas_on_spark/supported_pandas_api.rst" + "%s/tutorial/pandas_on_spark/supported_pandas_api.rst" % os.path.dirname(os.path.abspath(__file__)) ) generate_supported_api(output_rst_file_path) diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index 2e102c8de71e..c8d3fe62bf3b 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -185,6 +185,7 @@ should use for your streaming applications and pipelines. Overview getting_started/index + tutorial/index user_guide/index reference/index development/index diff --git a/python/docs/source/user_guide/arrow_pandas.rst b/python/docs/source/tutorial/arrow_pandas.rst similarity index 100% rename from python/docs/source/user_guide/arrow_pandas.rst rename to python/docs/source/tutorial/arrow_pandas.rst diff --git a/python/docs/source/tutorial/index.rst b/python/docs/source/tutorial/index.rst new file mode 100644 index 000000000000..e642c221490c --- /dev/null +++ b/python/docs/source/tutorial/index.rst @@ -0,0 +1,37 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +========= +Tutorials +========= + +PySpark-specific tutorials are available here: + +.. toctree:: + :maxdepth: 2 + + python_packaging + sql/index + pandas_on_spark/index + +There are also basic programming guides covering multiple languages available in +`the Spark documentation <https://spark.apache.org/docs/latest/index.html>`_, including these: + +- `Spark SQL, DataFrames and Datasets Guide <https://spark.apache.org/docs/latest/sql-programming-guide.html>`_ +- `Structured Streaming Programming Guide <https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html>`_ +- `Machine Learning Library (MLlib) Guide <https://spark.apache.org/docs/latest/ml-guide.html>`_ diff --git a/python/docs/source/user_guide/pandas_on_spark/best_practices.rst b/python/docs/source/tutorial/pandas_on_spark/best_practices.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/best_practices.rst rename to python/docs/source/tutorial/pandas_on_spark/best_practices.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/faq.rst b/python/docs/source/tutorial/pandas_on_spark/faq.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/faq.rst rename to python/docs/source/tutorial/pandas_on_spark/faq.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/from_to_dbms.rst b/python/docs/source/tutorial/pandas_on_spark/from_to_dbms.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/from_to_dbms.rst rename to python/docs/source/tutorial/pandas_on_spark/from_to_dbms.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/index.rst b/python/docs/source/tutorial/pandas_on_spark/index.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/index.rst rename to python/docs/source/tutorial/pandas_on_spark/index.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/options.rst b/python/docs/source/tutorial/pandas_on_spark/options.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/options.rst rename to python/docs/source/tutorial/pandas_on_spark/options.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/pandas_pyspark.rst b/python/docs/source/tutorial/pandas_on_spark/pandas_pyspark.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/pandas_pyspark.rst rename to python/docs/source/tutorial/pandas_on_spark/pandas_pyspark.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/transform_apply.rst b/python/docs/source/tutorial/pandas_on_spark/transform_apply.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/transform_apply.rst rename to python/docs/source/tutorial/pandas_on_spark/transform_apply.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/typehints.rst b/python/docs/source/tutorial/pandas_on_spark/typehints.rst similarity index 100% rename from python/docs/source/user_guide/pandas_on_spark/typehints.rst rename to python/docs/source/tutorial/pandas_on_spark/typehints.rst diff --git a/python/docs/source/user_guide/pandas_on_spark/types.rst b/python/docs/source/tutorial/pandas_on_spark/types.rst similarity index 100%
rename from python/docs/source/user_guide/pandas_on_spark/types.rst rename to python/docs/source/tutorial/pandas_on_spark/types.rst diff --git a/python/docs/source/user_guide/python_packaging.rst b/python/docs/source/tutorial/python_packaging.rst similarity index 100% rename from python/docs/source/user_guide/python_packaging.rst rename to python/docs/source/tutorial/python_packaging.rst diff --git a/python/docs/source/user_guide/sql/arrow_pandas.rst b/python/docs/source/tutorial/sql/arrow_pandas.rst similarity index 100% rename from python/docs/source/user_guide/sql/arrow_pandas.rst rename to python/docs/source/tutorial/sql/arrow_pandas.rst diff --git a/python/docs/source/user_guide/sql/index.rst b/python/docs/source/tutorial/sql/index.rst similarity index 100% rename from python/docs/source/user_guide/sql/index.rst rename to python/docs/source/tutorial/sql/index.rst diff --git a/python/docs/source/user_guide/sql/python_data_source.rst b/python/docs/source/tutorial/sql/python_data_source.rst similarity index 100% rename from python/docs/source/user_guide/sql/python_data_source.rst rename to python/docs/source/tutorial/sql/python_data_source.rst diff --git a/python/docs/source/user_guide/sql/python_udtf.rst b/python/docs/source/tutorial/sql/python_udtf.rst similarity index 100% rename from python/docs/source/user_guide/sql/python_udtf.rst rename to python/docs/source/tutorial/sql/python_udtf.rst diff --git a/python/docs/source/user_guide/sql/type_conversions.rst b/python/docs/source/tutorial/sql/type_conversions.rst similarity index 100% rename from python/docs/source/user_guide/sql/type_conversions.rst rename to python/docs/source/tutorial/sql/type_conversions.rst diff --git a/python/docs/source/user_guide/assets/pyspark-udf-profile.png b/python/docs/source/user_guide/assets/pyspark-udf-profile.png new file mode 100644 index 000000000000..5b8ab3f3bd8b Binary files /dev/null and b/python/docs/source/user_guide/assets/pyspark-udf-profile.png differ diff --git a/python/docs/source/user_guide/assets/pyspark-ui-print.png b/python/docs/source/user_guide/assets/pyspark-ui-print.png new file mode 100644 index 000000000000..3ad6f28e34f5 Binary files /dev/null and b/python/docs/source/user_guide/assets/pyspark-ui-print.png differ diff --git a/python/docs/source/user_guide/assets/pyspark-ui-sql-broadcast.png b/python/docs/source/user_guide/assets/pyspark-ui-sql-broadcast.png new file mode 100644 index 000000000000..442b289bb1ee Binary files /dev/null and b/python/docs/source/user_guide/assets/pyspark-ui-sql-broadcast.png differ diff --git a/python/docs/source/user_guide/assets/pyspark-ui-sql.png b/python/docs/source/user_guide/assets/pyspark-ui-sql.png new file mode 100644 index 000000000000..1cf0f3cca7e8 Binary files /dev/null and b/python/docs/source/user_guide/assets/pyspark-ui-sql.png differ diff --git a/python/docs/source/user_guide/bugbusting.ipynb b/python/docs/source/user_guide/bugbusting.ipynb new file mode 100644 index 000000000000..0f024757cefa --- /dev/null +++ b/python/docs/source/user_guide/bugbusting.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1619d229-6f5c-4a31-9992-81bce15f7ef1", + "metadata": {}, + "source": [ + "# Chapter 4: Bug Busting - Debugging PySpark\n", + "\n", + "PySpark executes applications in a distributed environment, making it challenging to\n", + "monitor and debug these applications. It can be difficult to track which nodes are\n", + "executing specific code. 
However, there are multiple methods available within PySpark\n", + "to help with debugging. This section will outline how to effectively debug PySpark\n", + "applications.\n", + "\n", + "PySpark operates using Spark as its underlying engine, utilizing Spark Connect server\n", + "or Py4J (Spark Classic) to submit and compute jobs in Spark.\n", + "\n", + "On the driver side, PySpark interacts with the Spark Driver on JVM through Spark\n", + "Connect server or Py4J (Spark Classic). When `pyspark.sql.SparkSession` is created and\n", + "initialized, PySpark starts to communicate with the Spark Driver.\n", + "\n", + "On the executor side, Python workers are responsible for executing and managing Python\n", + "native functions or data. These workers are only launched if the PySpark application\n", + "requires interaction between Python and JVMs such as Python UDF execution. They are\n", + "initiated on-demand, for instance, when running pandas UDFs or PySpark RDD APIs." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "56890562-0151-45ac-903e-45b4f1d40d33", + "metadata": {}, + "source": [ + "## Spark UI\n", + "\n", + "### Python UDF Execution\n", + "\n", + "Debugging a Python UDF in PySpark can be done by simply adding print statements, though\n", + "the output won't be visible in the client/driver side since the functions are executed\n", + "on the executors - they can be seen in Spark UI. For example, if you have a working\n", + "Python UDF:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a9219c08-df6c-40d7-a73d-a5950ee7df0b", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "\n", + "@udf(\"integer\")\n", + "def my_udf(x):\n", + " # Do something with x\n", + " return x" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4875da48-03ee-4155-9257-b5514270d591", + "metadata": {}, + "source": [ + "You can add print statements for debugging as shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2b9102be-e7df-4b80-a70e-87ef7e76a913", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(my_udf(id)=0)]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@udf(\"integer\")\n", + "def my_udf(x):\n", + " # Do something with x\n", + " print(\"What's going on?\")\n", + " return x\n", + "\n", + "spark.range(1).select(my_udf(\"id\")).collect()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "899b5d67-078a-4a12-8da8-7a841432ace2", + "metadata": {}, + "source": [ + "The output can be viewed in the Spark UI under `stdout`/`stderr` at `Executors` tab.\n", + "\n", + "![Spark UI print](./assets/pyspark-ui-print.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "154262f2-6f22-483b-9676-75901b325c66", + "metadata": {}, + "source": [ + "### Non-Python UDF\n", + "\n", + "When running non-Python UDF code, debugging is typically done via the Spark UI or\n", + "by using `DataFrame.explain(True)`.\n", + "\n", + "For instance, the code below performs a join between a large DataFrame (`df1`) and a\n", + "smaller one (`df2`):" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "55070bab-5659-4bb1-98ff-1a3eb6231218", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "AdaptiveSparkPlan isFinalPlan=false\n", + "+- Project [_1#6L]\n", + " +- SortMergeJoin [_1#6L], [_1#8L], 
Inner\n", + " :- Sort [_1#6L ASC NULLS FIRST], false, 0\n", + " : +- Exchange hashpartitioning(_1#6L, 200), ENSURE_REQUIREMENTS, [plan_id=41]\n", + " : +- Filter isnotnull(_1#6L)\n", + " : +- Scan ExistingRDD[_1#6L]\n", + " +- Sort [_1#8L ASC NULLS FIRST], false, 0\n", + " +- Exchange hashpartitioning(_1#8L, 200), ENSURE_REQUIREMENTS, [plan_id=42]\n", + " +- Filter isnotnull(_1#8L)\n", + " +- Scan ExistingRDD[_1#8L]\n", + "\n", + "\n" + ] + } + ], + "source": [ + "df1 = spark.createDataFrame([(x,) for x in range(100)])\n", + "df2 = spark.createDataFrame([(x,) for x in range(2)])\n", + "df1.join(df2, \"_1\").explain()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b8b7a595-006c-4417-9683-bc002d85b789", + "metadata": {}, + "source": [ + "Using `DataFrame.explain` displays the physical plans, showing how the join will\n", + "be executed. Those physical plans represent individual steps for the whole execution.\n", + "Here, it exchanges, a.k.a. shuffles, the data and performs a sort-merge-join." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "97fc5dcb-8531-4618-a66f-d84ca5095fdd", + "metadata": {}, + "source": [ + "\n", + "After checking how the plans are generated via this method, users can optimize their queries.\n", + "For example, because `df2` is very small, it can be broadcasted to executors\n", + "and remove the shuffle\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a1a985e-d260-49ce-a054-e70e3ed7e9e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "AdaptiveSparkPlan isFinalPlan=false\n", + "+- Project [_1#6L]\n", + " +- BroadcastHashJoin [_1#6L], [_1#8L], Inner, BuildRight, false\n", + " :- Filter isnotnull(_1#6L)\n", + " : +- Scan ExistingRDD[_1#6L]\n", + " +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=71]\n", + " +- Filter isnotnull(_1#8L)\n", + " +- Scan ExistingRDD[_1#8L]\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import broadcast\n", + "\n", + "df1.join(broadcast(df2), \"_1\").explain()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "510949c3-91d1-475f-948f-34eed8617a41", + "metadata": {}, + "source": [ + "As can be seen the shuffle is removed, and it performs broadcast-hash-join:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "028aab52-adea-4b5d-806a-de79e9c54e71", + "metadata": {}, + "source": [ + "\n", + "These optimizations can also be visualized in the Spark UI under the SQL / DataFrame\n", + "tab after execution.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5cc67309-cd0b-49dc-b8d9-8d2c3a5aa944", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(_1=0), Row(_1=1)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.join(df2, \"_1\").collect()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "97a09efc-8fcd-400c-a052-8aef3bf7ce15", + "metadata": {}, + "source": [ + "\n", + "![PySpark UI SQL](./assets/pyspark-ui-sql.png)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "25b0f45f-26b6-485d-88df-a1fb323fd3f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(_1=0), Row(_1=1)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.join(broadcast(df2), 
\"_1\").collect()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "657d75db-1a64-4a97-8019-4ad1dd974997", + "metadata": {}, + "source": [ + "![PySpark UI SQL broadcast](./assets/pyspark-ui-sql-broadcast.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cf1ab4f6-fe61-409a-9da5-afabdd7c987e", + "metadata": {}, + "source": [ + "## Monitor with `top` and `ps`\n", + "\n", + "On the driver side, you can obtain the process ID from your PySpark shell to\n", + "monitor resources:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f37891f2-1bfc-4995-ba8e-9ac351935bc8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "23976" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os; os.getpid()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cef297f1-3772-40d9-85d9-bdd0c9761c38", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " UID PID PPID C STIME TTY TIME CMD\n", + " 502 23976 21512 0 12:06PM ?? 0:02.30 /opt/miniconda3/envs/python3.11/bin/python -m ipykernel_launcher -f /Users/hyukjin.kwon/Library/Jupyter/runtime/kernel-c8eb73ef-2b21-418e-b770-92b946454606.json\n" + ] + } + ], + "source": [ + "%%bash\n", + "ps -fe 23976" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "296a6fbb-7ca8-448b-82f4-e7ee6d4359e2", + "metadata": {}, + "source": [ + "On the executor side, you can use `grep` to find the process IDs and resources for\n", + "Python workers, as these are forked from `pyspark.daemon`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "15b6127c-67ef-4b6f-b4be-7c05af5d12bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 502 23989 23981 0 12:06PM ?? 0:00.59 python3 -m pyspark.daemon pyspark.worker\n", + " 502 23990 23989 0 12:06PM ?? 0:00.19 python3 -m pyspark.daemon pyspark.worker\n", + " 502 23991 23989 0 12:06PM ?? 0:00.19 python3 -m pyspark.daemon pyspark.worker\n", + " 502 23992 23989 0 12:06PM ?? 0:00.19 python3 -m pyspark.daemon pyspark.worker\n", + " 502 23993 23989 0 12:06PM ?? 0:00.19 python3 -m pyspark.daemon pyspark.worker\n" + ] + } + ], + "source": [ + "%%bash\n", + "ps -fe | grep pyspark.daemon | head -n 5" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d4088643-1903-4d24-b4b6-cce097a92124", + "metadata": {}, + "source": [ + "Typically, users leverage top and the identified PIDs to monitor the memory usage\n", + "of Python processes in PySpark." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2949bb68-0570-44b9-af19-d0b3c260dc49", + "metadata": {}, + "source": [ + "## Use PySpark Profilers\n", + "\n", + "### Memory Profiler\n", + "\n", + "In order to debug the driver side, users typically can use most of the existing\n", + "Python tools such as [memory_profiler](https://github.com/pythonprofilers/memory_profiler)\n", + "that allow you to check the memory usage line by line. If your driver program\n", + "is not running on another machine (e.g., YARN cluster mode), you can use a memory\n", + "profiler to debug memory usage on the driver side. 
For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cee1ae3c-0abe-4e38-b904-7f0c803441a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filename: profile_memory.py\n", + "\n", + "Line # Mem usage Increment Occurrences Line Contents\n", + "=============================================================\n", + " 4 80.6 MiB 80.6 MiB 1 @profile\n", + " 5 #=====================================================\n", + " 6 def my_func():\n", + " 7 79.0 MiB -1.7 MiB 1 session = SparkSession.builder.getOrCreate()\n", + " 8 80.1 MiB 1.1 MiB 1 df = session.range(10000)\n", + " 9 84.1 MiB 4.0 MiB 1 return df.collect()\n", + "\n", + "\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "echo \"from pyspark.sql import SparkSession\n", + "#===Your function should be decorated with @profile===\n", + "from memory_profiler import profile\n", + "@profile\n", + "#=====================================================\n", + "def my_func():\n", + " session = SparkSession.builder.getOrCreate()\n", + " df = session.range(10000)\n", + " return df.collect()\n", + "if __name__ == '__main__':\n", + " my_func()\" > profile_memory.py\n", + "\n", + "python -m memory_profiler profile_memory.py 2> /dev/null" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "98340ba5-abe0-4f92-ae48-26b63c6f5811", + "metadata": {}, + "source": [ + "It shows which line consumes how much memory properly." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7f01c836", + "metadata": {}, + "source": [ + "#### Python and Pandas UDF" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cc3e1ccc", + "metadata": {}, + "source": [ + "
\n", + "Note: This section applies to Spark 4.0\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0b2926cf-df02-42c8-98fa-5822849e901f", + "metadata": {}, + "source": [ + "PySpark provides remote [memory_profiler](https://github.com/pythonprofilers/memory_profiler)\n", + "for Python/Pandas UDFs. That can be used on editors with line numbers such as\n", + "Jupyter notebooks. SparkSession-based memory profiler can be enabled by setting\n", + "the runtime SQL configuration `spark.sql.pyspark.udf.profiler` to `memory`:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "553d9780-b30b-4e96-a134-ca1c06341c89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================================================\n", + "Profile of UDF\n", + "============================================================\n", + "Filename: /var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/885006762.py\n", + "\n", + "Line # Mem usage Increment Occurrences Line Contents\n", + "=============================================================\n", + " 5 1472.6 MiB 1472.6 MiB 10 @pandas_udf(\"long\")\n", + " 6 def add1(x):\n", + " 7 1473.9 MiB 1.3 MiB 10 return x + 1\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import pandas_udf\n", + "\n", + "df = spark.range(10)\n", + "\n", + "@pandas_udf(\"long\")\n", + "def add1(x):\n", + " return x + 1\n", + "\n", + "spark.conf.set(\"spark.sql.pyspark.udf.profiler\", \"memory\")\n", + "\n", + "added = df.select(add1(\"id\"))\n", + "spark.profile.clear()\n", + "added.collect()\n", + "spark.profile.show(type=\"memory\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "abaf0439-43de-4482-8a80-258be3d98366", + "metadata": {}, + "source": [ + "The UDF IDs can be seen in the query plan, for example, `add1(...)#16L` in\n", + "`ArrowEvalPython` as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "607f9f0c-6288-4bd2-99b8-2cc7e62098ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "*(2) Project [pythonUDF0#19L AS add1(id)#17L]\n", + "+- ArrowEvalPython [add1(id#14L)#16L], [pythonUDF0#19L], 200\n", + " +- *(1) Range (0, 10, step=1, splits=16)\n", + "\n", + "\n" + ] + } + ], + "source": [ + "added.explain()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "06e40cdc", + "metadata": {}, + "source": [ + "### Performance Profiler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7909aba0", + "metadata": {}, + "source": [ + "
\n", + "Note: This section applies to Spark 4.0\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e66c991c-1b07-45d0-aae2-c79c362e2210", + "metadata": {}, + "source": [ + "[Python Profilers](https://docs.python.org/3/library/profile.html) are useful built-in\n", + "features in Python itself. To use this on driver side, you can use it as you would\n", + "do for regular Python programs because PySpark on driver side is a regular Python\n", + "process unless you are running your driver program in another machine\n", + "(e.g., YARN cluster mode)." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8d9ada24-81da-4c31-aaa2-b3578953b07b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 549275 function calls (536745 primitive calls) in 3.447 seconds\n", + "\n", + " Ordered by: cumulative time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2 0.000 0.000 3.448 1.724 app.py:1()\n", + " 792/1 0.005 0.000 3.447 3.447 {built-in method builtins.exec}\n", + " 128 0.000 0.000 2.104 0.016 socket.py:692(readinto)\n", + " 128 2.104 0.016 2.104 0.016 {method 'recv_into' of '_socket.socket' objects}\n", + " 124 0.000 0.000 2.100 0.017 java_gateway.py:1015(send_command)\n", + " 125 0.001 0.000 2.099 0.017 clientserver.py:499(send_command)\n", + " 138 0.000 0.000 2.097 0.015 {method 'readline' of '_io.BufferedReader' objects}\n", + " 55 0.000 0.000 1.622 0.029 java_gateway.py:1313(__call__)\n", + " 95 0.001 0.000 1.360 0.014 __init__.py:1()\n", + " 1 0.000 0.000 1.359 1.359 session.py:438(getOrCreate)\n", + " 1 0.000 0.000 1.311 1.311 context.py:491(getOrCreate)\n", + " 1 0.000 0.000 1.311 1.311 context.py:169(__init__)\n", + " 1 0.000 0.000 0.861 0.861 context.py:424(_ensure_initialized)\n", + " 1 0.001 0.001 0.861 0.861 java_gateway.py:39(launch_gateway)\n", + " 8 0.840 0.105 0.840 0.105 {built-in method time.sleep}\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "echo \"from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.getOrCreate()\n", + "spark.range(10).collect()\" > app.py\n", + "\n", + "python -m cProfile -s cumulative app.py 2> /dev/null | head -n 20" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3e5ae42d", + "metadata": {}, + "source": [ + "#### Python/Pandas UDF" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "eb137611", + "metadata": {}, + "source": [ + "
\n", + "Note: This section applies to Spark 4.0\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c9b9a26c-56fa-4a14-adfd-2697a87c479e", + "metadata": {}, + "source": [ + "PySpark provides remote Python Profilers for Python/Pandas UDFs. UDFs with\n", + "iterators as inputs/outputs are not supported. SparkSession-based performance\n", + "profiler can be enabled by setting the runtime SQL configuration\n", + "`spark.sql.pyspark.udf.profiler` to `perf`. An example is as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fcba873d-7a4f-42f9-b796-3c492c7e8077", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================================================\n", + "Profile of UDF\n", + "============================================================\n", + " 2130 function calls (2080 primitive calls) in 0.003 seconds\n", + "\n", + " Ordered by: internal time, cumulative time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 10 0.001 0.000 0.003 0.000 common.py:62(new_method)\n", + " 10 0.000 0.000 0.000 0.000 {built-in method _operator.add}\n", + " 10 0.000 0.000 0.002 0.000 base.py:1371(_arith_method)\n", + " 10 0.000 0.000 0.001 0.000 series.py:389(__init__)\n", + " 20 0.000 0.000 0.000 0.000 _ufunc_config.py:33(seterr)\n", + " 10 0.000 0.000 0.001 0.000 series.py:6201(_construct_result)\n", + " 10 0.000 0.000 0.000 0.000 cast.py:1605(maybe_cast_to_integer_array)\n", + " 10 0.000 0.000 0.000 0.000 construction.py:517(sanitize_array)\n", + " 10 0.000 0.000 0.002 0.000 series.py:6133(_arith_method)\n", + " 10 0.000 0.000 0.000 0.000 managers.py:1863(from_array)\n", + " 10 0.000 0.000 0.000 0.000 array_ops.py:240(arithmetic_op)\n", + " 510 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n" + ] + } + ], + "source": [ + "import io\n", + "from contextlib import redirect_stdout\n", + "\n", + "from pyspark.sql.functions import pandas_udf\n", + "\n", + "df = spark.range(10)\n", + "@pandas_udf(\"long\")\n", + "def add1(x):\n", + " return x + 1\n", + "\n", + "added = df.select(add1(\"id\"))\n", + "\n", + "spark.conf.set(\"spark.sql.pyspark.udf.profiler\", \"perf\")\n", + "spark.profile.clear()\n", + "added.collect()\n", + "\n", + "# Only show top 10 lines\n", + "output = io.StringIO()\n", + "with redirect_stdout(output):\n", + " spark.profile.show(type=\"perf\")\n", + "\n", + "print(\"\\n\".join(output.getvalue().split(\"\\n\")[0:20]))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b35fb6c4-c074-4866-abd6-b1b435721b67", + "metadata": {}, + "source": [ + "The UDF IDs can be seen in the query plan, for example, `add1(...)#22L` in\n", + "`ArrowEvalPython` below." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "de015526-577b-45ea-a6e5-598cf215ef8b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "*(2) Project [pythonUDF0#25L AS add1(id)#23L]\n", + "+- ArrowEvalPython [add1(id#20L)#22L], [pythonUDF0#25L], 200\n", + " +- *(1) Range (0, 10, step=1, splits=16)\n", + "\n", + "\n" + ] + } + ], + "source": [ + "added.explain()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fe046e5e-73bb-466f-a373-9d5a445b0fa1", + "metadata": {}, + "source": [ + "We can render the result with a preregistered renderer as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e2507dc0-ab64-4afe-ae8a-19c52533e57c", + "metadata": {}, + "outputs": [], + "source": [ + "spark.profile.render(id=2, type=\"perf\") # renderer=\"flameprof\" by default" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ba892df0-4058-46ad-a952-791559da5259", + "metadata": {}, + "source": [ + "![PySpark UDF profiling](./assets/pyspark-udf-profile.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "09b420ba", + "metadata": {}, + "source": [ + "## Display Stacktraces" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9756e41b", + "metadata": {}, + "source": [ + "
\n", + "Note: This section applies to Spark 4.0\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c7bf0b21-f9a8-4cc0-8288-46f7ef4f4f52", + "metadata": {}, + "source": [ + "By default, JVM stacktraces and Python internal tracebacks are hidden especially\n", + "in Python UDF executions. For example," + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bafce04e-5d7c-40e0-9342-0ffb94c858c7", + "metadata": {}, + "outputs": [ + { + "ename": "PythonException", + "evalue": "\n An exception was thrown from the Python worker. Please see the stack trace below.\nTraceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3806637820.py\", line 3, in \nZeroDivisionError: division by zero\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31mPythonException\u001b[0m: \n An exception was thrown from the Python worker. Please see the stack trace below.\nTraceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3806637820.py\", line 3, in \nZeroDivisionError: division by zero\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import udf\n", + "\n", + "spark.range(1).select(udf(lambda x: x / 0)(\"id\")).collect()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "78100070-b9db-4efd-ad94-a8e6a00ee68a", + "metadata": {}, + "source": [ + "\n", + "To show the whole internal stacktraces, users can enable\n", + "`spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled` and `spark.sql.pyspark.jvmStacktrace.enabled`\n", + "respectively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "425b24b8-acd0-4897-bc8c-75af6316f430", + "metadata": {}, + "outputs": [ + { + "ename": "PythonException", + "evalue": "\n An exception was thrown from the Python worker. Please see the stack trace below.\nTraceback (most recent call last):\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1898, in main\n process()\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1890, in process\n serializer.dump_stream(out_iter, outfile)\n File \"/.../python/lib/pyspark.zip/pyspark/serializers.py\", line 224, in dump_stream\n self.serializer.dump_stream(self._batched(iterator), stream)\n File \"/.../python/lib/pyspark.zip/pyspark/serializers.py\", line 145, in dump_stream\n for obj in iterator:\n File \"/.../python/lib/pyspark.zip/pyspark/serializers.py\", line 213, in _batched\n for item in iterator:\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1798, in mapper\n result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1798, in \n result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 114, in \n return args_kwargs_offsets, lambda *a: func(*a)\n ^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/util.py\", line 145, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 739, in profiling_func\n ret = f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3570641234.py\", line 3, in \nZeroDivisionError: division by zero\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31mPythonException\u001b[0m: \n An exception was thrown from the Python worker. 
Please see the stack trace below.\nTraceback (most recent call last):\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1898, in main\n process()\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1890, in process\n serializer.dump_stream(out_iter, outfile)\n File \"/.../python/lib/pyspark.zip/pyspark/serializers.py\", line 224, in dump_stream\n self.serializer.dump_stream(self._batched(iterator), stream)\n File \"/.../python/lib/pyspark.zip/pyspark/serializers.py\", line 145, in dump_stream\n for obj in iterator:\n File \"/.../python/lib/pyspark.zip/pyspark/serializers.py\", line 213, in _batched\n for item in iterator:\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1798, in mapper\n result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 1798, in \n result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 114, in \n return args_kwargs_offsets, lambda *a: func(*a)\n ^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/util.py\", line 145, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/.../python/lib/pyspark.zip/pyspark/worker.py\", line 739, in profiling_func\n ret = f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3570641234.py\", line 3, in \nZeroDivisionError: division by zero\n" + ] + } + ], + "source": [ + "spark.conf.set(\"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled\", False)\n", + "spark.conf.set(\"spark.sql.pyspark.jvmStacktrace.enabled\", False)\n", + "spark.range(1).select(udf(lambda x: x / 0)(\"id\")).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5ec62d86-3631-4f48-a10c-4bcd727b1eb6", + "metadata": {}, + "outputs": [ + { + "ename": "PythonException", + "evalue": "\n An exception was thrown from the Python worker. 
Please see the stack trace below.\nTraceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3514597595.py\", line 3, in \nZeroDivisionError: division by zero\n\n\nJVM stacktrace:\norg.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 13.0 failed 1 times, most recent failure: Lost task 15.0 in stage 13.0 (TID 161) (ip-192-168-45-94.ap-northeast-2.compute.internal executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3514597595.py\", line 3, in \nZeroDivisionError: division by zero\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:531)\n\tat org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:103)\n\tat org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:485)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601)\n\tat scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)\n\tat scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:901)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:901)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:338)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)\n\tat org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:146)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:644)\n\tat org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)\n\tat org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:647)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)\n\tat java.base/java.lang.Thread.run(Thread.java:840)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2887)\n\tat scala.Option.getOrElse(Option.scala:201)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2887)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2879)\n\tat scala.collection.immutable.List.foreach(List.scala:334)\n\tat 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2879)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1283)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1283)\n\tat scala.Option.foreach(Option.scala:437)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1283)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3158)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3092)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3081)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1009)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2479)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2498)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2523)\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1057)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:417)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1056)\n\tat org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)\n\tat org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4265)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4439)\n\tat org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:608)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4437)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:155)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:267)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:118)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:923)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:74)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:222)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:4437)\n\tat org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4262)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:568)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:840)\nCaused by: 
org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3514597595.py\", line 3, in \nZeroDivisionError: division by zero\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:531)\n\tat org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:103)\n\tat org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:485)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601)\n\tat scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)\n\tat scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:901)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:901)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:338)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)\n\tat org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:146)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:644)\n\tat org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)\n\tat org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:647)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)\n\t... 1 more\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31mPythonException\u001b[0m: \n An exception was thrown from the Python worker. 
Please see the stack trace below.\nTraceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3514597595.py\", line 3, in \nZeroDivisionError: division by zero\n\n\nJVM stacktrace:\norg.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 13.0 failed 1 times, most recent failure: Lost task 15.0 in stage 13.0 (TID 161) (ip-192-168-45-94.ap-northeast-2.compute.internal executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/var/folders/qm/mlwmy16n5xx66ldgzmptzlc40000gp/T/ipykernel_23976/3514597595.py\", line 3, in \nZeroDivisionError: division by zero\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:531)\n\tat org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:103)\n\tat org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:485)\n\t...\n" + ] + } + ], + "source": [ + "spark.conf.set(\"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled\", True)\n", + "spark.conf.set(\"spark.sql.pyspark.jvmStacktrace.enabled\", True)\n", + "spark.range(1).select(udf(lambda x: x / 0)(\"id\")).collect()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "59bab886-7a57-4736-89b1-e3776b3b991e", + "metadata": {}, + "source": [ + "See also [Stack Traces](https://spark.apache.org/docs/latest/api/python/development/debugging.html#stack-traces) for more details." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0611287d-cb34-457e-9bc3-f5629ddea484", + "metadata": {}, + "source": [ + "\n", + "## IDE Debugging" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "80f080c4", + "metadata": {}, + "source": [ + "On the driver side, no additional steps are needed to use IDE for debugging your PySpark application. Refer to the guide below:\n", + "\n", + "- [Setting up IDEs](https://spark.apache.org/docs/latest/api/python/development/setting_ide.html)\n", + "\n", + "On the executor side, it requires several steps to set up the remote debugger. Refer to the guide below:\n", + "\n", + "- [Remote Debugging (PyCharm Professional)](https://spark.apache.org/docs/latest/api/python/development/debugging.html#remote-debugging-pycharm-professional)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/docs/source/user_guide/dataframes.ipynb b/python/docs/source/user_guide/dataframes.ipynb new file mode 100644 index 000000000000..2316e002e02b --- /dev/null +++ b/python/docs/source/user_guide/dataframes.ipynb @@ -0,0 +1,778 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "0b37fb0a", + "metadata": {}, + "source": [ + "# Chapter 1: DataFrames - A view into your structured data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "edfe9cd6", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pyspark in /Users/amanda.liu/anaconda3/lib/python3.10/site-packages (3.5.0)\n", + "Requirement already satisfied: py4j==0.10.9.7 in /Users/amanda.liu/anaconda3/lib/python3.10/site-packages (from pyspark) (0.10.9.7)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install pyspark" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43b0e61f", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"Python Spark SQL basic example\") \\\n", + " .config(\"spark.some.config.option\", \"some-value\") \\\n", + " .getOrCreate()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ae367125", + "metadata": {}, + "source": [ + "This section introduces the most fundamental data structure in PySpark: the DataFrame.\n", + "\n", + "A DataFrame is a two-dimensional labeled data structure with columns \n", + "of potentially different types. You can think of a DataFrame like a spreadsheet, a SQL table, or a dictionary of series objects. \n", + "Apache Spark DataFrames support a rich set of APIs (select columns, filter, join, aggregate, etc.) \n", + "that allow you to solve common data analysis problems efficiently.\n", + "\n", + "Compared to traditional relational databases, Spark DataFrames offer several key advantages for big data processing and analytics:\n", + "\n", + "- **Distributed computing**: Spark distributes data across multiple nodes in a cluster, allowing for parallel processing of big data\n", + "- **In-memory processing**: Spark performs computations in memory, which can be significantly faster than disk-based processing\n", + "- **Schema flexibility**: Unlike traditional databases, PySpark DataFrames support schema evolution and dynamic typing.\n", + "- **Fault tolerance**: PySpark DataFrames are built on top of Resilient Distributed Dataset (RDDs), which are inherently fault-tolerant. 
\n", + "Spark automatically handles node failures and data replication, ensuring data reliability and integrity.\n", + "\n", + "A note on RDDs: \n", + "Direct use of RDDs are no longer supported on Spark Connect as of Spark 4.0.\n", + "Interacting directly with Spark DataFrames uses a unified planning and optimization engine, \n", + "allowing us to get nearly identical performance across all supported languages on Databricks (Python, SQL, Scala, and R)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "443ebbb1", + "metadata": {}, + "source": [ + "## Create a DataFrame\n", + "\n", + "There are several ways to create a DataFrame in PySpark." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "33ebd507", + "metadata": {}, + "source": [ + "### From a list of dictionaries\n", + "\n", + "The simplest way is to use the createDataFrame() method like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b26403e5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+--------+\n", + "|age| name|\n", + "+---+--------+\n", + "| 30| John D.|\n", + "| 25|Alice G.|\n", + "| 35| Bob T.|\n", + "| 28| Eve A.|\n", + "+---+--------+\n", + "\n" + ] + } + ], + "source": [ + "employees = [{\"name\": \"John D.\", \"age\": 30},\n", + " {\"name\": \"Alice G.\", \"age\": 25},\n", + " {\"name\": \"Bob T.\", \"age\": 35},\n", + " {\"name\": \"Eve A.\", \"age\": 28}]\n", + "\n", + "# Create a DataFrame containing the employees data\n", + "df = spark.createDataFrame(employees)\n", + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fe7cd086", + "metadata": {}, + "source": [ + "### From a local file\n", + "\n", + "We can also create a DataFrame from a local CSV file:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b421b87d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|Employee ID| Role| Location|\n", + "+-----------+-----------------+-----------------+\n", + "| 19238| Data Analyst| Seattle, WA|\n", + "| 19239|Software Engineer| Seattle, WA|\n", + "| 19240| IT Specialist| Seattle, WA|\n", + "| 19241| Data Analyst| New York, NY|\n", + "| 19242| Recruiter|San Francisco, CA|\n", + "| 19243| Product Manager| New York, NY|\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read.csv(\"../data/employees.csv\", header=True, inferSchema=True)\n", + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9e8a5246", + "metadata": {}, + "source": [ + "Or from a local JSON file:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4a2d7fe9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|Employee ID| Location| Role|\n", + "+-----------+-----------------+-----------------+\n", + "| 19238| Seattle, WA| Data Analyst|\n", + "| 19239| Seattle, WA|Software Engineer|\n", + "| 19240| Seattle, WA| IT Specialist|\n", + "| 19241| New York, NY| Data Analyst|\n", + "| 19242|San Francisco, CA| Recruiter|\n", + "| 19243| New York, NY| Product Manager|\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "df = 
spark.read.option(\"multiline\",\"true\").json(\"../data/employees.json\")\n", + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "da022789", + "metadata": {}, + "source": [ + "### From an existing DataFrame\n", + "\n", + "We can even create a DataFrame from another existing DataFrame, by selecting certain columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d494632d", + "metadata": {}, + "outputs": [], + "source": [ + "employees = [\n", + " {\"name\": \"John D.\", \"age\": 30, \"department\": \"HR\"},\n", + " {\"name\": \"Alice G.\", \"age\": 25, \"department\": \"Finance\"},\n", + " {\"name\": \"Bob T.\", \"age\": 35, \"department\": \"IT\"},\n", + " {\"name\": \"Eve A.\", \"age\": 28, \"department\": \"Marketing\"}\n", + "]\n", + "df = spark.createDataFrame(employees)\n", + "\n", + "# Select only the name and age columns\n", + "new_df = df.select(\"name\", \"age\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a330bfa9", + "metadata": {}, + "source": [ + "### From a table\n", + "\n", + "If you have an existing table `table_name` in your Spark environment, you can create a DataFrame like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "14ad6034", + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [ + ], + "source": [ + "df = spark.read.table(\"table_name\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "892871a6", + "metadata": {}, + "source": [ + "### From a database\n", + "\n", + "If your table is in a database, you can use JDBC to read the table into a DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a40a5d54", + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [ + ], + "source": [ + "url = \"jdbc:mysql://localhost:3306/mydatabase\"\n", + "table = \"employees\"\n", + "properties = {\n", + " \"user\": \"username\",\n", + " \"password\": \"password\"\n", + "}\n", + "\n", + "# Read table into DataFrame\n", + "df = spark.read.jdbc(url=url, table=table, properties=properties)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "91c58617", + "metadata": {}, + "source": [ + "## View the DataFrame\n", + "\n", + "We can use PySpark to view and interact with our DataFrame." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "25faacd7", + "metadata": {}, + "source": [ + "### Display the DataFrame\n", + "\n", + "`df.show()` displays a basic visualization of the DataFrame's contents. 
From our above `createDataFrame()` example:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6a91ef12", + "metadata": {}, + "outputs": [], + "source": [ + "employees = [{\"name\": \"John D.\", \"age\": 30},\n", + " {\"name\": \"Alice G.\", \"age\": 25},\n", + " {\"name\": \"Bob T.\", \"age\": 35},\n", + " {\"name\": \"Eve A.\", \"age\": 28}]\n", + "\n", + "# Create a DataFrame containing the employees data\n", + "df = spark.createDataFrame(employees)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c2ce1c82", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+--------+\n", + "|age| name|\n", + "+---+--------+\n", + "| 30| John D.|\n", + "| 25|Alice G.|\n", + "| 35| Bob T.|\n", + "| 28| Eve A.|\n", + "+---+--------+\n", + "\n" + ] + } + ], + "source": [ + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c6ee7c70", + "metadata": {}, + "source": [ + "`df.show()` has 3 optional arguments: `n`, `truncate`, and `vertical`.\n", + "\n", + "By default, `df.show()` displays up to the first 20 rows of the DataFrame. \n", + "We can control the number of rows displayed by passing an argument to the show() method:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "01417e41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+--------+\n", + "|age| name|\n", + "+---+--------+\n", + "| 30| John D.|\n", + "| 25|Alice G.|\n", + "+---+--------+\n", + "only showing top 2 rows\n", + "\n" + ] + } + ], + "source": [ + "df.show(n=2)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "605cd26c", + "metadata": {}, + "source": [ + "The truncate argument controls the length of displayed column values (default value is 20):" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b01d5223", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+----+\n", + "|age|name|\n", + "+---+----+\n", + "| 30| Joh|\n", + "| 25| Ali|\n", + "| 35| Bob|\n", + "| 28| Eve|\n", + "+---+----+\n", + "\n" + ] + } + ], + "source": [ + "df.show(truncate=3)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e9bedaa6", + "metadata": {}, + "source": [ + "If we set `vertical` to True, the DataFrame will be displayed vertically with one line per value:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "267facfc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-RECORD 0--------\n", + " age | 30 \n", + " name | John D. \n", + "-RECORD 1--------\n", + " age | 25 \n", + " name | Alice G. \n", + "-RECORD 2--------\n", + " age | 35 \n", + " name | Bob T. \n", + "-RECORD 3--------\n", + " age | 28 \n", + " name | Eve A. 
\n", + "\n" + ] + } + ], + "source": [ + "df.show(vertical=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4de10f68", + "metadata": {}, + "source": [ + "### Print the DataFrame schema\n", + "\n", + "We can view information about the DataFrame schema using the `printSchema()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "27481fa9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- age: long (nullable = true)\n", + " |-- name: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "df.printSchema()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8e90d081", + "metadata": {}, + "source": [ + "## DataFrame Manipulation\n", + "\n", + "Let's look at some ways we can transform our DataFrames.\n", + "\n", + "For more detailed information, please see the section about data manipulation, [Chapter 6: Function Junction - Data manipulation with PySpark](https://databricks-eng.github.io/pyspark-cookbook/07-dataprep.html).\n", + "\n", + "### Rename columns\n", + "\n", + "We can rename DataFrame columns using the `withColumnRenamed()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "65d6dfcb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+--------+\n", + "|age| name|\n", + "+---+--------+\n", + "| 30| John D.|\n", + "| 25|Alice G.|\n", + "| 35| Bob T.|\n", + "| 28| Eve A.|\n", + "+---+--------+\n", + "\n", + "+---+---------+\n", + "|age|full_name|\n", + "+---+---------+\n", + "| 30| John D.|\n", + "| 25| Alice G.|\n", + "| 35| Bob T.|\n", + "| 28| Eve A.|\n", + "+---+---------+\n", + "\n" + ] + } + ], + "source": [ + "df.show()\n", + "df2 = df.withColumnRenamed(\"name\", \"full_name\")\n", + "df2.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "10f3c03f", + "metadata": {}, + "source": [ + "### Filter rows\n", + "\n", + "We can filter for employees within a certain age range.\n", + "The following `df.filter` will create a new DataFrame with rows that match our age condition:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "af133309", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+\n", + "|age| name|\n", + "+---+-------+\n", + "| 30|John D.|\n", + "| 28| Eve A.|\n", + "+---+-------+\n", + "\n" + ] + } + ], + "source": [ + "filtered_df = df.filter((df[\"age\"] > 26) & (df[\"age\"] < 32))\n", + "filtered_df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c49ea696", + "metadata": {}, + "source": [ + "We can also use `df.where` to get the same result:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a29a0719", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+\n", + "|age| name|\n", + "+---+-------+\n", + "| 30|John D.|\n", + "| 28| Eve A.|\n", + "+---+-------+\n", + "\n" + ] + } + ], + "source": [ + "where_df = df.where((df[\"age\"] > 26) & (df[\"age\"] < 32))\n", + "where_df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b6d1026a", + "metadata": {}, + "source": [ + "## DataFrames vs. 
Tables\n", + "A DataFrame is an immutable distributed collection of data, only available in the current Spark session.\n", + "\n", + "A table is a persistent data structure that can be accessed across multiple Spark sessions.\n", + "\n", + "If you wish to promote a DataFrame to a table, you can use the `createOrReplaceTempView()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "778ad9c5", + "metadata": {}, + "outputs": [], + "source": [ + "df.createOrReplaceTempView(\"employees\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0a448326", + "metadata": {}, + "source": [ + "Note that the lifetime of this temporary table is tied to the SparkSession that was used to create this DataFrame.\n", + "To persist the table beyond this Spark session, you will need to save it to persistent storage." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bd253f06", + "metadata": {}, + "source": [ + "## Save DataFrame to Persistent Storage\n", + "\n", + "There are several ways to save a DataFrame to persistent storage in PySpark.\n", + "For more detailed information about saving data to your local environment,\n", + "please see the section about Data Loading (TODO: add link)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b72663f5", + "metadata": {}, + "source": [ + "### Save to file-based data source\n", + "\n", + "For file-based data source (text, parquet, json, etc.), you can specify a custom table path like so: " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "714d52bd", + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "df.write.option(\"path\", \"../dataout\").saveAsTable(\"dataframes_savetable_example\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fcc3fc81", + "metadata": {}, + "source": [ + "Even if the table is dropped, the custom table path and table data will still be there. \n", + "\n", + "If no custom table path is specified, Spark will write data to a default table path under the warehouse directory. \n", + "When the table is dropped, the default table path will be removed too." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c98e0afb", + "metadata": {}, + "source": [ + "### Save to Hive metastore\n", + "To save to Hive metastore, you can use the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "00c35126", + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [ + ], + "source": [ + "df.write().mode(\"overwrite\").saveAsTable(\"schemaName.tableName\")" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "", + "language": "python", + "name": "" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/docs/source/user_guide/dataprep.ipynb b/python/docs/source/user_guide/dataprep.ipynb new file mode 100644 index 000000000000..5f858da18bc0 --- /dev/null +++ b/python/docs/source/user_guide/dataprep.ipynb @@ -0,0 +1,1420 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "92a57b81-9b11-47e9-905f-d8b0f5210b36", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Chapter 3: Function Junction - Data manipulation with PySpark\n", + "\n", + "\n", + "## Clean data\n", + "In data science, `garbage in, garbage out` (GIGO) is the concept that flawed, biased or poor quality information or input produces a result or output of similar quality.\n", + "To improve the analysis quality, we need data cleaning, the process to turn garbage into gold, it is composed of identifying, correcting, or removing errors and inconsistencies in data to improve its quality and usability. 
\n", + "\n", + "\n", + "\n", + "Let's start with a Dataframe containing bad values:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + ], + "source": [ + "!pip install pyspark==4.0.0.dev2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"Data Loading and Storage Example\") \\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "010edc0f-b8ff-4ca7-890b-312cfd86aee0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+------+-----+\n", + "| age|height| NAME|\n", + "+----+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 10| 80.0|Alice|\n", + "| 5| NaN| BOB|\n", + "|NULL| NULL| Tom|\n", + "|NULL| NaN| NULL|\n", + "| 9| 78.9| josh|\n", + "| 18|1802.3| bush|\n", + "| 7| 75.3|jerry|\n", + "+----+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import Row\n", + "\n", + "df = spark.createDataFrame([\n", + " Row(age=10, height=80.0, NAME=\"Alice\"),\n", + " Row(age=10, height=80.0, NAME=\"Alice\"),\n", + " Row(age=5, height=float(\"nan\"), NAME=\"BOB\"),\n", + " Row(age=None, height=None, NAME=\"Tom\"),\n", + " Row(age=None, height=float(\"nan\"), NAME=None),\n", + " Row(age=9, height=78.9, NAME=\"josh\"),\n", + " Row(age=18, height=1802.3, NAME=\"bush\"),\n", + " Row(age=7, height=75.3, NAME=\"jerry\"),\n", + "])\n", + "\n", + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d69ab969-8377-449b-a8a5-3c2e900298eb", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "### Rename columns\n", + "At first glance, we find that column `NAME` is upper case.\n", + "For consistency, we can use `DataFrame.withColumnRenamed` to rename columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d682268d-dc62-47a2-be2e-b26af4b6bf0d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+------+-----+\n", + "| age|height| name|\n", + "+----+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 10| 80.0|Alice|\n", + "| 5| NaN| BOB|\n", + "|NULL| NULL| Tom|\n", + "|NULL| NaN| NULL|\n", + "| 9| 78.9| josh|\n", + "| 18|1802.3| bush|\n", + "| 7| 75.3|jerry|\n", + "+----+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "df2 = df.withColumnRenamed(\"NAME\", \"name\")\n", + "\n", + "df2.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "34599ede-9280-48bb-b968-1bdda9d22d8e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Drop null values\n", + "\n", + "Then we can notice that there are two kinds of missing data:\n", + "\n", + "- the `NULL` values in all three columns;\n", + "- the `NaN` values which means `Not a Number` for a numeric column;\n", + "\n", + "The records without a valid `name` are likely useless, so let's drop them first. There are a group of functions in `DataFrameNaFunctions` for missing value handling, we can use `DataFrame.na.drop` or `DataFrame.dropna` to omit rows with `NULL` or `NaN` values.\n", + "\n", + "After the step `df2.na.drop(subset=\"name\")`, invalid record `(age=None, height=NaN, name=None)` is discarded." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bc46e7b4-c8ec-47cb-8934-9d7fde49e426", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+------+-----+\n", + "| age|height| name|\n", + "+----+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 10| 80.0|Alice|\n", + "| 5| NaN| BOB|\n", + "|NULL| NULL| Tom|\n", + "| 9| 78.9| josh|\n", + "| 18|1802.3| bush|\n", + "| 7| 75.3|jerry|\n", + "+----+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "df3 = df2.na.drop(subset=\"name\")\n", + "\n", + "df3.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5c2b010e-f591-4ccd-a23b-8f68cc54e395", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Fill values\n", + "\n", + "For the remaining missing values, we can use `DataFrame.na.fill` or `DataFrame.fillna` to fill them.\n", + "\n", + "With a `Dict` input `{'age': 10, 'height': 80.1}`, we can specify the values for columns `age` and `height` together." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9aac3291-4e70-435c-a665-59beed8ef3b1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------+-----+\n", + "|age|height| name|\n", + "+---+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 10| 80.0|Alice|\n", + "| 5| 80.1| BOB|\n", + "| 10| 80.1| Tom|\n", + "| 9| 78.9| josh|\n", + "| 18|1802.3| bush|\n", + "| 7| 75.3|jerry|\n", + "+---+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "df4 = df3.na.fill({'age': 10, 'height': 80.1})\n", + "\n", + "df4.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7c7ef34d-3403-4d5f-96a2-56f823e30277", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Remove outliers\n", + "\n", + "After above steps, all missing values are dropped or filled.\n", + "However, we can find that `height=1802.3` seems unreasonable, to remove this kind of outliers, we can filter the DataFrame with a valid range like `(65, 85)`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "09308454-3cea-4dd2-b0eb-01d93abd0488", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------+-----+\n", + "|age|height| name|\n", + "+---+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 10| 80.0|Alice|\n", + "| 5| 80.1| BOB|\n", + "| 10| 80.1| Tom|\n", + "| 9| 78.9| josh|\n", + "| 7| 75.3|jerry|\n", + "+---+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "df5 = df4.where(df4.height.between(65, 85))\n", + "\n", + "df5.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "509b8ebe-52ba-4b8f-9473-018c2a8b1273", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Remove duplicates\n", + "\n", + "Now, all invalid records have been handled. But we notice that record `(age=10, height=80.0, name=Alice)` has been duplicated. To remove such duplicates, we can simply apply `DataFrame.distinct`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c5cb2e96-f194-46ab-9489-efe2fe14190d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------+-----+\n", + "|age|height| name|\n", + "+---+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 5| 80.1| BOB|\n", + "| 10| 80.1| Tom|\n", + "| 9| 78.9| josh|\n", + "| 7| 75.3|jerry|\n", + "+---+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "df6 = df5.distinct()\n", + "\n", + "df6.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "287a5e17-bf51-423b-9cb1-1f0eb0663658", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### String manipulation\n", + "\n", + "Column `name` contains both lower case and upper case letters. We can apply `lower()` function to convert all letters to lower case. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "80a9cb72-1d37-407a-9161-85ea78ee4b73", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------+-----+\n", + "|age|height| name|\n", + "+---+------+-----+\n", + "| 10| 80.0|alice|\n", + "| 5| 80.1| bob|\n", + "| 10| 80.1| tom|\n", + "| 9| 78.9| josh|\n", + "| 7| 75.3|jerry|\n", + "+---+------+-----+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import functions as sf\n", + "\n", + "df7 = df6.withColumn(\"name\", sf.lower(\"name\"))\n", + "df7.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3b28e946-91be-4dd3-a4c3-720e2579a272", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "For more complicated string manipulations, we can also use `udf` to utilize Python's power functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ff338521-4770-4b49-b064-0ed3cff12570", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------+-----+\n", + "|age|height| name|\n", + "+---+------+-----+\n", + "| 10| 80.0|Alice|\n", + "| 5| 80.1| Bob|\n", + "| 10| 80.1| Tom|\n", + "| 9| 78.9| Josh|\n", + "| 7| 75.3|Jerry|\n", + "+---+------+-----+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "from pyspark.sql import functions as sf\n", + "\n", + "capitalize = sf.udf(lambda s: s.capitalize())\n", + "\n", + "df8 = df6.withColumn(\"name\", capitalize(\"name\"))\n", + "df8.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5279bf5b-6c37-48e7-bbd9-b3207820bb95", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Reorder columns\n", + "\n", + "After above process, the data is clean and we want to reorder the columns before saving the DataFrame to some storage. You can refer to previous chapter `Load and Behold: Data loading, storage, file formats` for more details.\n", + "\n", + "Normally, we use `DataFrame.select` for this purpose." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f316c501-6f97-4772-82a8-568fd59f04ff", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+---+------+\n", + "| name|age|height|\n", + "+-----+---+------+\n", + "|alice| 10| 80.0|\n", + "| bob| 5| 80.1|\n", + "| tom| 10| 80.1|\n", + "| josh| 9| 78.9|\n", + "|jerry| 7| 75.3|\n", + "+-----+---+------+\n", + "\n" + ] + } + ], + "source": [ + "df9 = df7.select(\"name\", \"age\", \"height\")\n", + "\n", + "df9.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "55a8d1de-f53a-4a73-a7c0-8dd8376f2dd5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Transform data\n", + "\n", + "The main part of a data engineering project is transformation. We create new dataframes from old ones." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b8dc6227-05c9-4c0a-90da-e3f377c9468b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Select columns with select()\n", + "\n", + "The input table may contains hundreds of columns, but for a specific project we likly are interested only in a small subset of them.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "402e442a-b04e-492b-a1b7-376185ea9f50", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+\n", + "| id|col_0|col_1|col_2|col_3|col_4|col_5|col_6|col_7|col_8|col_9|col_10|col_11|col_12|col_13|col_14|col_15|col_16|col_17|col_18|col_19|\n", + "+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+\n", + "| 0| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 1| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 2| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 3| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 4| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 5| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 6| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 7| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 8| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "| 9| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|\n", + "+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import functions as sf\n", + "df = spark.range(10)\n", + "\n", + "for i in range(20):\n", + " df = df.withColumn(f\"col_{i}\", sf.lit(i))\n", + "\n", + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "585efeeb-f935-4bc9-9d72-83d4e2cbe946", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "We create a DataFrame with 21 columns via a `for` loop, then we only select 4 columns by `select`. Columns `id`, `col_2` and `col_3` are directly selected from previous DataFrame, while column `sqrt_col_4_plus_5` is generated by the math functions.\n", + "\n", + "We have hundreds of functions for column manipulation in `pyspark.sql.function` and `pyspark.sql.Column`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dde46ecc-a43f-4c83-823c-4ba010291e2c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+-----+-----------------+\n", + "| id|col_2|col_3|sqrt_col_4_plus_5|\n", + "+---+-----+-----+-----------------+\n", + "| 0| 2| 3| 3.0|\n", + "| 1| 2| 3| 3.0|\n", + "| 2| 2| 3| 3.0|\n", + "| 3| 2| 3| 3.0|\n", + "| 4| 2| 3| 3.0|\n", + "| 5| 2| 3| 3.0|\n", + "| 6| 2| 3| 3.0|\n", + "| 7| 2| 3| 3.0|\n", + "| 8| 2| 3| 3.0|\n", + "| 9| 2| 3| 3.0|\n", + "+---+-----+-----+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "\n", + "df2 = df.select(\"id\", \"col_2\", \"col_3\", sf.sqrt(sf.col(\"col_4\") + sf.col(\"col_5\")).alias(\"sqrt_col_4_plus_5\"))\n", + "\n", + "df2.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5dea6b73-186e-4bdc-a4f1-918b4e74ef75", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Filter rows with where()\n", + "\n", + "The input table may be super huge and contains billions of rows, and we may also be interested in only a small subset.\n", + "\n", + "We can use `where` or `filter` with sepcified conditions to filter the rows.\n", + "\n", + "For example, we can select rows with odd `id` values." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2bf40a39-5a42-49af-8a3b-afbb766bbdc9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+-----+-----------------+\n", + "| id|col_2|col_3|sqrt_col_4_plus_5|\n", + "+---+-----+-----+-----------------+\n", + "| 1| 2| 3| 3.0|\n", + "| 3| 2| 3| 3.0|\n", + "| 5| 2| 3| 3.0|\n", + "| 7| 2| 3| 3.0|\n", + "| 9| 2| 3| 3.0|\n", + "+---+-----+-----+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "df3 = df2.where(sf.col(\"id\") % 2 == 1)\n", + "\n", + "df3.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6e34ecd8-8c7a-44b1-9f7e-d4b1cc40a2b7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Summarizing data\n", + "\n", + "In data analysis, we normally end up with summarizing data to a chart or table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1fbde87c-c5f7-4102-a41d-9c81c63d750b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----+\n", + "| incomes| NAME|\n", + "+--------------------+-----+\n", + "|[123.0, 456.0, 78...|Alice|\n", + "| [234.0, 567.0]| BOB|\n", + "|[100.0, 200.0, 10...| Tom|\n", + "| [79.0, 128.0]| josh|\n", + "|[123.0, 145.0, 17...| bush|\n", + "|[111.0, 187.0, 45...|jerry|\n", + "+--------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import Row\n", + "\n", + "df = spark.createDataFrame([\n", + " Row(incomes=[123.0, 456.0, 789.0], NAME=\"Alice\"),\n", + " Row(incomes=[234.0, 567.0], NAME=\"BOB\"),\n", + " Row(incomes=[100.0, 200.0, 100.0], NAME=\"Tom\"),\n", + " Row(incomes=[79.0, 128.0], NAME=\"josh\"),\n", + " Row(incomes=[123.0, 145.0, 178.0], NAME=\"bush\"),\n", + " Row(incomes=[111.0, 187.0, 451.0, 188.0, 199.0], NAME=\"jerry\"),\n", + "])\n", + "\n", + "df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "aa593364-0f87-48d1-969e-7620c8c3ff85", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "For example, given the income per month, we want to find the average income for each name." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "107e5b59-0d5c-4539-b481-c7895115bb5d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+-----------------------------------+\n", + "|name |incomes |\n", + "+-----+-----------------------------------+\n", + "|alice|[123.0, 456.0, 789.0] |\n", + "|bob |[234.0, 567.0] |\n", + "|tom |[100.0, 200.0, 100.0] |\n", + "|josh |[79.0, 128.0] |\n", + "|bush |[123.0, 145.0, 178.0] |\n", + "|jerry|[111.0, 187.0, 451.0, 188.0, 199.0]|\n", + "+-----+-----------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import functions as sf\n", + "\n", + "df2 = df.select(sf.lower(\"NAME\").alias(\"name\"), \"incomes\")\n", + "\n", + "df2.show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3ee9f0d6-2d77-43b5-8632-2f60004b8bb4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Reshape data using explode()\n", + "\n", + "To make the data easier for aggregation, we can use `explode()` function to reshape the data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1c0fb5ea-10aa-4f7a-be15-6881a04f3485", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------+\n", + "| 
name|income|\n", + "+-----+------+\n", + "|alice| 123.0|\n", + "|alice| 456.0|\n", + "|alice| 789.0|\n", + "| bob| 234.0|\n", + "| bob| 567.0|\n", + "| tom| 100.0|\n", + "| tom| 200.0|\n", + "| tom| 100.0|\n", + "| josh| 79.0|\n", + "| josh| 128.0|\n", + "| bush| 123.0|\n", + "| bush| 145.0|\n", + "| bush| 178.0|\n", + "|jerry| 111.0|\n", + "|jerry| 187.0|\n", + "|jerry| 451.0|\n", + "|jerry| 188.0|\n", + "|jerry| 199.0|\n", + "+-----+------+\n", + "\n" + ] + } + ], + "source": [ + "df3 = df2.select(\"name\", sf.explode(\"incomes\").alias(\"income\"))\n", + "\n", + "df3.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "21590f9b-7e64-406d-9d55-7f0ac47b594c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Summarizing via groupBy() and agg()\n", + "\n", + "Then we normally use `DataFrame.groupBy(...).agg(...)` to aggreate the data. To compute the average income, we can apply aggration function `avg`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "31dece1d-4d2a-4f85-bb4e-76d3f82d8ca0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------------------+\n", + "| name| avg_income|\n", + "+-----+------------------+\n", + "|alice| 456.0|\n", + "| bob| 400.5|\n", + "| tom|133.33333333333334|\n", + "| josh| 103.5|\n", + "| bush|148.66666666666666|\n", + "|jerry| 227.2|\n", + "+-----+------------------+\n", + "\n" + ] + } + ], + "source": [ + "df4 = df3.groupBy(\"name\").agg(sf.avg(\"income\").alias(\"avg_income\"))\n", + "\n", + "df4.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0938c629-4494-4614-8eac-3fabb0eb1547", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Orderby\n", + "\n", + "For final analysis, we normally want to order the data. In this case, we can order the data by `name`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7e54b022-cb52-4fe2-a79d-259be277d705", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------------------+\n", + "| name| avg_income|\n", + "+-----+------------------+\n", + "|alice| 456.0|\n", + "| bob| 400.5|\n", + "| bush|148.66666666666666|\n", + "|jerry| 227.2|\n", + "| josh| 103.5|\n", + "| tom|133.33333333333334|\n", + "+-----+------------------+\n", + "\n" + ] + } + ], + "source": [ + "df5 = df4.orderBy(\"name\")\n", + "\n", + "df5.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dc6e8724-1363-4d55-a98c-87ad11efb787", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## When DataFrames Collide: The Art of Joining\n", + "\n", + "When dealing with multiple dataframe, we likely need to combine them together in some way. The most frequently used approach is joining.\n", + "\n", + "For example, given the `incomes` data and `height` data, we can use `DataFrame.join` to join them together by `name`.\n", + "\n", + "We can see that only `alice`, `josh` and `bush` are in the final results, because they appear in both DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2a5e52e6-fc57-4315-b649-f79828269449", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql import Row\n", + "\n", + "df1 = spark.createDataFrame([\n", + " Row(age=10, height=80.0, name=\"alice\"),\n", + " Row(age=9, height=78.9, name=\"josh\"),\n", + " Row(age=18, height=82.3, name=\"bush\"),\n", + " Row(age=7, height=75.3, name=\"tom\"),\n", + "])\n", + "\n", + "df2 = spark.createDataFrame([\n", + " Row(incomes=[123.0, 456.0, 789.0], name=\"alice\"),\n", + " Row(incomes=[234.0, 567.0], name=\"bob\"),\n", + " Row(incomes=[79.0, 128.0], name=\"josh\"),\n", + " Row(incomes=[123.0, 145.0, 178.0], name=\"bush\"),\n", + " Row(incomes=[111.0, 187.0, 451.0, 188.0, 199.0], name=\"jerry\"),\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bb635cb3-74b4-40f3-a8bd-b74fca020f95", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+---+------+---------------------+\n", + "|name |age|height|incomes |\n", + "+-----+---+------+---------------------+\n", + "|alice|10 |80.0 |[123.0, 456.0, 789.0]|\n", + "|bush |18 |82.3 |[123.0, 145.0, 178.0]|\n", + "|josh |9 |78.9 |[79.0, 128.0] |\n", + "+-----+---+------+---------------------+\n", + "\n" + ] + } + ], + "source": [ + "df3 = df1.join(df2, on=\"name\")\n", + "\n", + "df3.show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": 
{}, + "nuid": "98cefd42-a358-4d12-87f2-41cc41aa98b7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "There are seven join methods:\n", + "- `INNER`\n", + "- `LEFT`\n", + "- `RIGHT`\n", + "- `FULL`\n", + "- `CROSS`\n", + "- `LEFTSEMI`\n", + "- `LEFTANTI`\n", + "\n", + "And the default one is `INNER`.\n", + "\n", + "Let's take `LEFT` join as another example. A left join includes all of the records from the first (left) of two tables, even if there are no matching values for records in the second (right) table.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cdc282ab-ed1d-4964-9ebd-a5770de93cc3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+---+------+---------------------+\n", + "|name |age|height|incomes |\n", + "+-----+---+------+---------------------+\n", + "|alice|10 |80.0 |[123.0, 456.0, 789.0]|\n", + "|josh |9 |78.9 |[79.0, 128.0] |\n", + "|bush |18 |82.3 |[123.0, 145.0, 178.0]|\n", + "|tom |7 |75.3 |NULL |\n", + "+-----+---+------+---------------------+\n", + "\n" + ] + } + ], + "source": [ + "df4 = df1.join(df2, on=\"name\", how=\"left\")\n", + "\n", + "df4.show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "70257613-f432-48e6-bc58-b521fde9b77a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "And a `RIGHT` join keeps all of the records from the right table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "846429dc-ea7e-484e-ad3d-82e625348f69", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+----+------+-----------------------------------+\n", + "|name |age |height|incomes |\n", + "+-----+----+------+-----------------------------------+\n", + "|alice|10 |80.0 |[123.0, 456.0, 789.0] |\n", + "|bob |NULL|NULL |[234.0, 567.0] |\n", + "|josh |9 |78.9 |[79.0, 128.0] |\n", + "|bush |18 |82.3 |[123.0, 145.0, 178.0] |\n", + "|jerry|NULL|NULL |[111.0, 187.0, 451.0, 188.0, 199.0]|\n", + "+-----+----+------+-----------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df5 = df1.join(df2, on=\"name\", how=\"right\")\n", + "\n", + "df5.show(truncate=False)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "PythonCookbook", + "widgets": {} + }, + "kernelspec": { + "display_name": "", + "language": "python", + "name": "" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/docs/source/user_guide/index.rst b/python/docs/source/user_guide/index.rst index 67f8c8d4d0fe..1da4dd2e6eb1 100644 --- a/python/docs/source/user_guide/index.rst +++ b/python/docs/source/user_guide/index.rst @@ -16,22 +16,22 @@ under the License. -=========== -User Guides -=========== +================ +User Guide +================ + +Welcome to the PySpark user guide! +Each of the below sections contains code-driven examples to help you get familiar with PySpark. -PySpark specific user guides are available here: - .. 
toctree:: :maxdepth: 2 - python_packaging - sql/index - pandas_on_spark/index + dataframes + touroftypes + dataprep + bugbusting + udfandudtf + sql + loadandbehold -There are also basic programming guides covering multiple languages available in -`the Spark documentation `_, including these: -- `Spark SQL, DataFrames and Datasets Guide `_ -- `Structured Streaming Programming Guide `_ -- `Machine Learning Library (MLlib) Guide `_ diff --git a/python/docs/source/user_guide/loadandbehold.ipynb b/python/docs/source/user_guide/loadandbehold.ipynb new file mode 100644 index 000000000000..3d4639efc82f --- /dev/null +++ b/python/docs/source/user_guide/loadandbehold.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "7387c507-b420-4441-822c-ed9ba805d95e", + "metadata": {}, + "source": [ + "# Chapter 7: Load and Behold - Data loading, storage, file formats" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c7cd870-ac96-4f64-be0c-af900c800ccd", + "metadata": { + "tags": [ + "remove_cell" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pyspark==4.0.0.dev2 in /Users/amanda.liu/anaconda3/envs/llm-spark/lib/python3.11/site-packages (4.0.0.dev2)\n", + "Requirement already satisfied: py4j==0.10.9.7 in /Users/amanda.liu/anaconda3/envs/llm-spark/lib/python3.11/site-packages (from pyspark==4.0.0.dev2) (0.10.9.7)\n" + ] + } + ], + "source": [ + "!pip install pyspark==4.0.0.dev2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43230845-2ff0-432b-8c7d-edf695b3de31", + "metadata": { + "tags": [ + "remove_cell" + ] + }, + "outputs": [ + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"Data Loading and Storage Example\") \\\n", + " .getOrCreate()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d4d0fad4-cd2f-4458-a3ed-5cd10ca21abb", + "metadata": {}, + "source": [ + "This section covers how to read and write data in various formats using PySpark. You'll learn how to load data from common file types (e.g., CSV, JSON, Parquet, ORC) and store data efficiently." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b99bd39b-c36d-470f-bf4a-922115e323f8", + "metadata": {}, + "source": [ + "## Reading Data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1b7bf82a-7727-46fa-bc95-fd5590c64ccd", + "metadata": {}, + "source": [ + "### 1.1 Reading CSV Files\n", + "\n", + "CSV is one of the most common formats for data exchange. 
Here's how to load a CSV file into a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8b4a6a34-011d-481b-8575-6b1c1846e7c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|Employee ID| Role| Location|\n", + "+-----------+-----------------+-----------------+\n", + "| 19238| Data Analyst| Seattle, WA|\n", + "| 19239|Software Engineer| Seattle, WA|\n", + "| 19240| IT Specialist| Seattle, WA|\n", + "| 19241| Data Analyst| New York, NY|\n", + "| 19242| Recruiter|San Francisco, CA|\n", + "| 19243| Product Manager| New York, NY|\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "csv_df = spark.read.csv(\"../data/employees.csv\", header=True, inferSchema=True)\n", + "csv_df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d8a98e55-b9d1-4cd7-a28c-b9738d55cae7", + "metadata": {}, + "source": [ + "**Explanation:**\n", + "- `header=True`: Treats the first line as column names.\n", + "- `inferSchema=True`: Automatically infers data types of columns." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "302b8d74-33ea-4a54-95c3-760ce6b0f816", + "metadata": {}, + "source": [ + "### 1.2 Reading JSON Files\n", + "\n", + "Loading JSON files is simple and allows you to handle both single-line and multi-line JSON structures:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "39e3fc27-a9bf-493a-8f89-6c087ed4e89b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|Employee ID| Location| Role|\n", + "+-----------+-----------------+-----------------+\n", + "| 19238| Seattle, WA| Data Analyst|\n", + "| 19239| Seattle, WA|Software Engineer|\n", + "| 19240| Seattle, WA| IT Specialist|\n", + "| 19241| New York, NY| Data Analyst|\n", + "| 19242|San Francisco, CA| Recruiter|\n", + "| 19243| New York, NY| Product Manager|\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "json_df = spark.read.option(\"multiline\", \"true\").json(\"../data/employees.json\")\n", + "json_df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d4a46228-9f55-41a5-9e44-8a9efdac4985", + "metadata": {}, + "source": [ + "**Explanation:**\n", + "- `multiline=\"true\"`: Allows reading multi-line JSON structures." 
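Schema inference costs an extra pass over the data, so for large or stable inputs you may prefer to declare the schema yourself. A minimal sketch; the column names mirror the sample employees file and are assumptions:

```python
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("Employee ID", IntegerType()),
    StructField("Location", StringType()),
    StructField("Role", StringType()),
])

# .schema(...) skips inference and enforces the declared column types.
json_df = spark.read.schema(schema).option("multiline", "true").json("../data/employees.json")
```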
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d0578c9a-90d5-462d-b35a-f05aceaf70d9", + "metadata": {}, + "source": [ + "### 1.3 Reading Parquet Files\n", + "\n", + "Parquet is a columnar format that supports efficient data compression and encoding:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d04aa812-cd22-439b-aea5-0a148ccc5d1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|Employee ID| Location| Role|\n", + "+-----------+-----------------+-----------------+\n", + "| 19239| Seattle, WA|Software Engineer|\n", + "| 19243| New York, NY| Product Manager|\n", + "| 19242|San Francisco, CA| Recruiter|\n", + "| 19241| New York, NY| Data Analyst|\n", + "| 19240| Seattle, WA| IT Specialist|\n", + "| 19238| Seattle, WA| Data Analyst|\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "parquet_df = spark.read.parquet(\"../data/employees.parquet\")\n", + "parquet_df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0c1a93b4-ed28-437f-a889-afb9fa128739", + "metadata": {}, + "source": [ + "**Tip:** Parquet files are highly efficient for storing data due to columnar storage and compression." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6442a9e1-f8f5-4b41-a5d3-f7983458aa30", + "metadata": {}, + "source": [ + "### 1.4 Reading ORC Files\n", + "\n", + "ORC is another columnar file format, often used in Hadoop environments:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3af892a9-1a1b-400b-84ec-e0f787689d2c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|Employee ID| Location| Role|\n", + "+-----------+-----------------+-----------------+\n", + "| 19242|San Francisco, CA| Recruiter|\n", + "| 19239| Seattle, WA|Software Engineer|\n", + "| 19240| Seattle, WA| IT Specialist|\n", + "| 19243| New York, NY| Product Manager|\n", + "| 19238| Seattle, WA| Data Analyst|\n", + "| 19241| New York, NY| Data Analyst|\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "orc_df = spark.read.orc(\"../data/employees.orc\")\n", + "orc_df.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c19666a4-a31f-4f76-b328-15b8d476f7e3", + "metadata": {}, + "source": [ + "## Writing Data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "20f3344d-0617-458e-86b5-ced402b20d89", + "metadata": {}, + "source": [ + "### 2.1 Writing Data as CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9961491-2b46-4252-9aae-014aadd98d4c", + "metadata": {}, + "outputs": [], + "source": [ + "csv_df.write.csv(\"../data/employees_out.csv\", mode=\"overwrite\", header=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6227bb33-01b9-4f4b-a17a-4c9678371192", + "metadata": {}, + "source": [ + "**Explanation:**\n", + "- `mode=\"overwrite\"`: If the directory exists, it will be replaced.\n", + "- `header=True`: Writes the column names as the first line." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f3151eab-0db9-4087-8df7-c36995c14243", + "metadata": {}, + "source": [ + "### 2.2 Writing Data as Parquet\n", + "\n", + "Parquet format is recommended for large datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ec2b8563-f681-4cbc-93e8-1da134a51008", + "metadata": {}, + "outputs": [], + "source": [ + "parquet_df.write.parquet(\"../data/employees_out.parquet\", mode=\"overwrite\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "988990e8-6e62-4410-945a-d284699871d6", + "metadata": {}, + "source": [ + "### 2.3 Writing Data as ORC" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e4827d27-1b41-4661-9dd6-efff6bea2c16", + "metadata": {}, + "outputs": [], + "source": [ + "json_df.write.orc(\"../data/employees_out.orc\", mode=\"overwrite\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ec497f26-95b4-4a4d-be3a-5b51576c2e98", + "metadata": {}, + "source": [ + "**Tip:** Parquet and ORC formats are best for efficient storage and quick reads." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4e23c3bf-a365-41c7-84cd-3d70589eb051", + "metadata": {}, + "source": [ + "## Additional Options and Configurations\n", + "\n", + "You can customize how data is read and written by using additional options. Here are a few examples:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "71080e8a-2be3-4726-bb15-0a24d08df46e", + "metadata": {}, + "source": [ + "### Custom Delimiter in CSV:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4a2ea041-4564-422a-81f4-94a745f99848", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------+\n", + "|_c0 |\n", + "+-------------------------------------+\n", + "|Employee ID,Role,Location |\n", + "|19238,Data Analyst,\"Seattle, WA\" |\n", + "|19239,Software Engineer,\"Seattle, WA\"|\n", + "|19240,IT Specialist,\"Seattle, WA\" |\n", + "|19241,Data Analyst,\"New York, NY\" |\n", + "|19242,Recruiter,\"San Francisco, CA\" |\n", + "|19243,Product Manager,\"New York, NY\" |\n", + "+-------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "spark.read.option(\"delimiter\", \";\").csv(\"../data/employees.csv\").show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a93d8450-f675-4288-9cdf-7aa00f806136", + "metadata": {}, + "source": [ + "### Handling Null Values:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "50069b43-c926-4453-8e37-6bc605d9bff2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------------+-----------------+\n", + "|_c0 |_c1 |_c2 |\n", + "+-----------+-----------------+-----------------+\n", + "|Employee ID|Role |Location |\n", + "|19238 |Data Analyst |Seattle, WA |\n", + "|19239 |Software Engineer|Seattle, WA |\n", + "|19240 |IT Specialist |Seattle, WA |\n", + "|19241 |Data Analyst |New York, NY |\n", + "|19242 |Recruiter |San Francisco, CA|\n", + "|19243 |Product Manager |New York, NY |\n", + "+-----------+-----------------+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "spark.read.option(\"nullValue\", \"NULL\").csv(\"../data/employees.csv\").show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0379e733-08b6-4851-9750-117ce4bfca09", + "metadata": {}, + "source": [ + 
"### Compression Options:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d0a2d239-6e87-4ea0-93b5-7272e77cedf8", + "metadata": {}, + "outputs": [], + "source": [ + "parquet_df.write.option(\"compression\", \"gzip\").parquet(\"../data/employees_out.parquet\", mode=\"overwrite\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "202f4ee8-0c24-40c3-9e54-ee7eef52799e", + "metadata": {}, + "source": [ + "See the [PySpark API reference](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/io.html) for Input/Output to check all supported functions and options." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "", + "language": "python", + "name": "" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/docs/source/user_guide/sql.ipynb b/python/docs/source/user_guide/sql.ipynb new file mode 100644 index 000000000000..b6a65645f63b --- /dev/null +++ b/python/docs/source/user_guide/sql.ipynb @@ -0,0 +1,1192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "97d93ac1-1ba3-40c1-8ea7-901b691ac2be", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "spark.sparkContext.setLogLevel(\"error\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cf68f672-caae-483b-aea8-a44ec22a7bdf", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Chapter 6: Old SQL, New Tricks - Running SQL on PySpark" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ecba6109-c9d9-4922-a13d-6256a3f74c3a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Introduction" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ca18176e-9457-447a-aa16-365204b7214e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This section explains how to use the Spark SQL API in PySpark and compare it with the DataFrame API. It also covers how to switch between the two APIs seamlessly, along with some practical tips and tricks." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "60b59fe5-4d05-408b-8807-20f863efc6e4", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Running SQL with PySpark\n", + "PySpark offers two main ways to perform SQL operations:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fd138d2d-480f-4a50-bff8-9f583f39d9a1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Using `spark.sql()`\n", + "The `spark.sql()` function allows you to execute SQL queries directly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2e2a56b2-226a-4626-84b7-f0ff93e008c6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a table via spark.sql()\n", + "spark.sql(\"DROP TABLE IF EXISTS people\")\n", + "spark.sql(\"\"\"\n", + "CREATE TABLE people USING PARQUET\n", + "AS SELECT * FROM VALUES (1, 'Alice', 10), (2, 'Bob', 20), (3, 'Charlie', 30) t(id, name, age)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "192620d0-10cb-4b25-8d5b-7a59b1ac6c56", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+---+\n", + "| name|age|\n", + "+-------+---+\n", + "|Charlie| 30|\n", + "+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "# Use spark.sql() to select data from a table\n", + "spark.sql(\"SELECT name, age FROM people WHERE age > 21\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d13f853c-fb81-43f8-b496-57ff80886a3a", + "metadata": {}, + "source": [ + "### Using the PySpark DataFrame API\n", + "The PySpark DataFrame API provides equivalent functionality to SQL but with a Pythonic approach." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b905d379-af51-4376-9837-fa32a913951c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+---+\n", + "| name|age|\n", + "+-------+---+\n", + "|Charlie| 30|\n", + "+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "# Read a table using the DataFrame API\n", + "people_df = spark.read.table(\"people\")\n", + "\n", + "# Use DataFrame API to select data\n", + "people_df.select(\"name\", \"age\").filter(\"age > 21\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1df50cb1-3178-4f7d-99d7-6137c9ce9c12", + "metadata": {}, + "source": [ + "## SQL vs. DataFrame API in PySpark\n", + "When to use which API depends on your background and the specific task:\n", + "\n", + "**SQL API:**\n", + " - Ideal for users with SQL backgrounds who are more comfortable writing SQL queries.\n", + "\n", + "**DataFrame API:**\n", + " - Preferred by Python developers as it aligns with Python syntax and idioms.\n", + " - Provides greater flexibility for complex transformations, especially with user-defined functions (UDFs)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "aded4623-8909-4b82-ae1a-d8bd13245f9e", + "metadata": {}, + "source": [ + "### Code Examples: SQL vs. DataFrame API\n", + "\n", + "Here are some examples comparing how common tasks are performed using the SQL API and PySpark's DataFrame API to give you an idea of their differences and when one might be more suitable than the other." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b9f201db-c440-42c3-81bc-b395a01a5e13", + "metadata": {}, + "source": [ + "#### Example: SELECT and FILTER Operation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c77022e8-a979-43ce-9b62-ad6adeff6ee4", + "metadata": {}, + "source": [ + "**SQL API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "afccb0b5-f533-45c4-a8e0-c4fced910dd3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+\n", + "| name|\n", + "+-------+\n", + "|Charlie|\n", + "+-------+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"SELECT name FROM people WHERE age > 21\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "94f2b460-155a-4aa8-a98e-d0ef01693e8e", + "metadata": {}, + "source": [ + "**DataFrame API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ddd28264-b8ba-4f95-a62c-e3855c27c759", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+\n", + "| name|\n", + "+-------+\n", + "|Charlie|\n", + "+-------+\n", + "\n" + ] + } + ], + "source": [ + "spark.read.table(\"people\").select(\"name\").filter(\"age > 21\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "86ffd501-a658-4819-abc1-721b1e2633a7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Example: JOIN Operation" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "63fd3534-9d20-4b96-8ef7-17adc9fc8e4f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS orders\")\n", + "spark.sql(\"\"\"\n", + "CREATE TABLE orders USING PARQUET \n", + "AS SELECT * FROM VALUES (101, 1, 200), (102, 2, 150), (103,3, 300) t(order_id, customer_id, amount)\n", + "\"\"\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "75e88196-ef2b-42ed-aa83-4c384ea1f5ef", + "metadata": {}, + "source": [ + "**SQL API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "56cbb734-5f33-496d-8053-079abf19bff8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------+\n", + "| name|order_id|\n", + "+-------+--------+\n", + "|Charlie| 103|\n", + "| Alice| 101|\n", + "| Bob| 102|\n", + "+-------+--------+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"\"\"\n", + "SELECT p.name, o.order_id\n", + "FROM people p\n", + "JOIN orders o ON p.id = o.customer_id\n", + "\"\"\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "42d501f1-2906-47e1-b6e6-a2c4798ef7ed", + "metadata": {}, + "source": [ + "**DataFrame API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ee9b69ae-f4a6-48f2-ac63-869e22a7089e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------+\n", + "| name|order_id|\n", + "+-------+--------+\n", + "|Charlie| 103|\n", + "| Alice| 101|\n", + "| Bob| 102|\n", + "+-------+--------+\n", + "\n" + ] + } + ], + "source": [ + "people_df = spark.read.table(\"people\")\n", + "orders_df = spark.read.table(\"orders\")\n", + "(\n", + " people_df\n", + 
" .join(orders_df, people_df.id == orders_df.customer_id)\n", + " .select(people_df.name, orders_df.order_id)\n", + " .show()\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "723cbbcd-a8f2-432a-9138-e4080fe5c391", + "metadata": {}, + "source": [ + "#### Example: GROUP BY and Aggregate Operation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4634009a-42cc-4452-bda0-56b8300925ac", + "metadata": {}, + "source": [ + "**SQL API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6540c85c-e0e9-4ea8-bad5-e7a39b17ef46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------------+\n", + "| name|total_amount|\n", + "+-------+------------+\n", + "|Charlie| 300|\n", + "| Alice| 200|\n", + "| Bob| 150|\n", + "+-------+------------+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"\"\"\n", + "SELECT p.name, SUM(o.amount) AS total_amount\n", + "FROM people p\n", + "JOIN orders o ON p.id = o.customer_id\n", + "GROUP BY p.name\n", + "\"\"\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8c7f2d9b-0c96-4849-a53e-82573d540f7f", + "metadata": {}, + "source": [ + "**DataFrame API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "096446a7-b2d5-4e6d-a145-95c26f043aaa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------------+\n", + "| name|total_amount|\n", + "+-------+------------+\n", + "|Charlie| 300|\n", + "| Alice| 200|\n", + "| Bob| 150|\n", + "+-------+------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import sum\n", + "\n", + "(\n", + " people_df\n", + " .join(orders_df, people_df.id == orders_df.customer_id)\n", + " .groupBy(\"name\")\n", + " .agg(sum(\"amount\").alias(\"total_amount\"))\n", + " .show()\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f4c6aad1-11fd-436a-8cb5-b1b6cde2e41e", + "metadata": {}, + "source": [ + "#### Example: Window Operations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "486e5443-e58e-41ea-8ec3-562817ede628", + "metadata": {}, + "source": [ + "**SQL API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a2745d1b-635b-4176-a142-84695b1dde32", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------+----+\n", + "| name|amount|rank|\n", + "+-------+------+----+\n", + "| Alice| 200| 1|\n", + "| Bob| 150| 1|\n", + "|Charlie| 300| 1|\n", + "+-------+------+----+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"\"\"\n", + "SELECT\n", + " p.name,\n", + " o.amount, \n", + " RANK() OVER (PARTITION BY p.name ORDER BY o.amount DESC) AS rank\n", + "FROM people p\n", + "JOIN orders o ON p.id = o.customer_id\n", + "\"\"\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7a0e8085-3c45-43df-946b-9db6aa78dc53", + "metadata": {}, + "source": [ + "**DataFrame API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "05d94aa3-4338-444e-a98a-b9ba388a2070", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------+----+\n", + "| name|amount|rank|\n", + "+-------+------+----+\n", + "| Alice| 200| 1|\n", + "| Bob| 150| 1|\n", + "|Charlie| 300| 1|\n", + "+-------+------+----+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.window import Window\n", + "from 
pyspark.sql.functions import rank\n", + "\n", + "# Define the window specification\n", + "window_spec = Window.partitionBy(\"name\").orderBy(orders_df.amount.desc())\n", + "\n", + "# Window operation with RANK\n", + "(\n", + " people_df\n", + " .join(orders_df, people_df.id == orders_df.customer_id)\n", + " .withColumn(\"rank\", rank().over(window_spec))\n", + " .select(\"name\", \"amount\", \"rank\")\n", + " .show()\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "054b56db-6f35-4820-bbfe-7d9095ea8f7c", + "metadata": {}, + "source": [ + "#### Example: UNION Operation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1c13feb7-98f8-4772-a4a9-31f2289370e3", + "metadata": {}, + "source": [ + "**SQL API:**\n", + "- The `UNION` operator combines rows from two queries and removes duplicates by default." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "227897ff-fe7c-4999-b8a8-a1acb193241e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql(\"CREATE OR REPLACE TEMP VIEW people2 AS SELECT * FROM VALUES (1, 'Alice', 10), (4, 'David', 35) t(id, name, age)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8e72401b-021d-4169-97f3-36007c537e85", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+---+\n", + "| id| name|age|\n", + "+---+-------+---+\n", + "| 3|Charlie| 30|\n", + "| 1| Alice| 10|\n", + "| 2| Bob| 20|\n", + "| 4| David| 35|\n", + "+---+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"\"\"\n", + "SELECT * FROM people\n", + "UNION\n", + "SELECT * FROM people2\n", + "\"\"\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "705262ad-8963-4e14-b2c2-bffd909ec06a", + "metadata": {}, + "source": [ + "**DataFrame API:**\n", + "- The `union()` method is used to combine two DataFrames, but it does not remove duplicates by default.\n", + "- To match the behavior of SQL's UNION, we use the .dropDuplicates() method to eliminate duplicates after the union operation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3a67f3ee-1179-4d6c-870a-c7982753f707", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+---+\n", + "| id| name|age|\n", + "+---+-------+---+\n", + "| 3|Charlie| 30|\n", + "| 1| Alice| 10|\n", + "| 2| Bob| 20|\n", + "| 1| Alice| 10|\n", + "| 4| David| 35|\n", + "+---+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "people_df = spark.read.table(\"people\")\n", + "people2_df = spark.read.table(\"people2\")\n", + "# This will have duplicate values.\n", + "people_df.union(people2_df).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "96e621eb-ca83-4935-85d0-757e7d49fdd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+---+\n", + "| id| name|age|\n", + "+---+-------+---+\n", + "| 3|Charlie| 30|\n", + "| 1| Alice| 10|\n", + "| 2| Bob| 20|\n", + "| 4| David| 35|\n", + "+---+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "# Remove duplicate values\n", + "people_df.union(people2_df).dropDuplicates().show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fbd87a8c-f5b5-41da-bc3e-db78fbfe1785", + "metadata": {}, + "source": [ + "#### Example: SET Configurations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3e7e46ac-b5b3-4823-8ecb-957c5244482b", + "metadata": {}, + "source": [ + "**SQL API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "59cb6621-f5be-48e1-9e57-817f7db6be72", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[key: string, value: string]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql(\"SET spark.sql.shuffle.partitions=8\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "567f7905-5296-4651-b062-aa4c70f734ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------+-----+\n", + "|key |value|\n", + "+----------------------------+-----+\n", + "|spark.sql.shuffle.partitions|8 |\n", + "+----------------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"SET spark.sql.shuffle.partitions\").show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "142ee012-4d76-4969-bd48-1744b58042f5", + "metadata": {}, + "source": [ + "**DataFrame API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "af1e497b-af1e-427a-86c6-8ce1381fc4de", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.set(\"spark.sql.shuffle.partitions\", 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "bf30b990-ff9d-4aaa-a77f-20b4a868ef25", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'10'" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.conf.get(\"spark.sql.shuffle.partitions\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6f13f062-1701-4c76-b546-ebcee1704ff5", + "metadata": {}, + "source": [ + "#### Example: Listing Tables and Views" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6a538410-4106-4fc1-a2db-5bf646edfe3e", + "metadata": {}, + "source": [ + "**SQL API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e25f34ef-5e2e-4379-b600-24fff2f11613", + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+---------+-----------+\n", + "|namespace|tableName|isTemporary|\n", + "+---------+---------+-----------+\n", + "| default| orders| false|\n", + "| default| people| false|\n", + "| | people2| true|\n", + "+---------+---------+-----------+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"SHOW TABLES\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2e3e1465-383e-4b00-b40c-794e21ed6228", + "metadata": {}, + "source": [ + "**DataFrame API:**" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "81ba9737-3d29-4ae7-8672-403b2e553b94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: orders, isTemporary: False\n", + "Name: people, isTemporary: False\n", + "Name: people2, isTemporary: True\n" + ] + } + ], + "source": [ + "tables = spark.catalog.listTables()\n", + "for table in tables:\n", + " print(f\"Name: {table.name}, isTemporary: {table.isTemporary}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "219912da-50fe-435e-b880-b4d78df4c066", + "metadata": {}, + "source": [ + "### DataFrame API Exclusive Functions\n", + "Certain operations are exclusive to the DataFrame API and are not supported in SQL, such as:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "35892dd2-aae6-4789-bd98-ca0dcb914b3b", + "metadata": {}, + "source": [ + "**withColumn**: Adds or modifies columns in a DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "7729d4eb-81f4-430d-8d1e-f6a398f3bc55", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+---+-------+\n", + "| id| name|age|new_col|\n", + "+---+-------+---+-------+\n", + "| 3|Charlie| 30| 40|\n", + "| 1| Alice| 10| 20|\n", + "| 2| Bob| 20| 30|\n", + "+---+-------+---+-------+\n", + "\n" + ] + } + ], + "source": [ + "people_df.withColumn(\"new_col\", people_df[\"age\"] + 10).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "00d09dc1-d788-43cd-b2d1-a9520ecb704b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+---+\n", + "| id| name|age|\n", + "+---+-------+---+\n", + "| 3|Charlie| 40|\n", + "| 1| Alice| 20|\n", + "| 2| Bob| 30|\n", + "+---+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "people_df.withColumn(\"age\", people_df[\"age\"] + 10).show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "04812924-2295-4b09-9a7d-ae26240a2b68", + "metadata": {}, + "source": [ + "## Using SQL and DataFrame API Interchangeably\n", + "PySpark supports switching between SQL and DataFrame API, making it easy to mix and match." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7b4e77bc-47a9-4051-a7eb-c28974465733", + "metadata": {}, + "source": [ + "### Chaining DataFrame Operations on SQL Outputs\n", + "PySpark’s DataFrame API allows you to chain multiple operations together to create efficient and readable transformations. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a145c7ba-6567-42ba-8089-cbf4f80e0ecd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+---+\n", + "| name|age|\n", + "+-------+---+\n", + "|Charlie| 30|\n", + "+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "# Chaining DataFrame operations on SQL results\n", + "spark.sql(\"SELECT name, age FROM people\").filter(\"age > 21\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ff055c98-76d6-45b2-a3cc-a998b29156be", + "metadata": {}, + "source": [ + "### Using `selectExpr()`\n", + "The `selectExpr()` method allows you to run SQL expressions within the DataFrame API." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "8784ac95-7cb2-427f-bb6f-58de919fcaf1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------------+\n", + "| name|age_plus_one|\n", + "+-------+------------+\n", + "|Charlie| 31|\n", + "| Alice| 11|\n", + "| Bob| 21|\n", + "+-------+------------+\n", + "\n" + ] + } + ], + "source": [ + "people_df.selectExpr(\"name\", \"age + 1 AS age_plus_one\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fc513ce0-e732-4dcc-ac99-12dc58606fa6", + "metadata": {}, + "source": [ + "### Querying a DataFrame in SQL\n", + "You can create a temporary view from a DataFrame and run SQL queries on it." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "64b253c1-3f19-40fb-a434-f02b563d624a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+---+\n", + "| id| name|age|\n", + "+---+-------+---+\n", + "| 3|Charlie| 30|\n", + "+---+-------+---+\n", + "\n" + ] + } + ], + "source": [ + "# First create a temp view on top of the DataFrame.\n", + "people_df.createOrReplaceTempView(\"people_view\")\n", + "\n", + "# Then it can be referenced in SQL.\n", + "spark.sql(\"SELECT * FROM people_view WHERE age > 21\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e8e0b0be-8ef7-4228-b2af-97bc5c10b1d1", + "metadata": {}, + "source": [ + "### Use Python User-Defined Functions in SQL\n", + "You can register Python user-defined functions (UDFs) for use within SQL queries, enabling custom transformations within SQL syntax." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "d430a9e2-be51-42dc-ad95-dca0cf1d5d4c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------+\n", + "| name|uppercase_name(name)|\n", + "+-------+--------------------+\n", + "|Charlie| CHARLIE|\n", + "+-------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "\n", + "# Define the UDF\n", + "@udf(\"string\")\n", + "def uppercase_name(name):\n", + " return name.upper()\n", + "\n", + "# Register the UDF\n", + "spark.udf.register(\"uppercase_name\", uppercase_name)\n", + "\n", + "# Use it in SQL\n", + "spark.sql(\"SELECT name, uppercase_name(name) FROM people_view WHERE age > 21\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72fc87df-0137-4491-b313-da5c7c172e81", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "", + "language": "python", + "name": "" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/docs/source/user_guide/touroftypes.ipynb b/python/docs/source/user_guide/touroftypes.ipynb new file mode 100644 index 000000000000..dffc9b11bf79 --- /dev/null +++ b/python/docs/source/user_guide/touroftypes.ipynb @@ -0,0 +1,1190 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "467e2e79-8418-44af-93fd-d66de6bfab02", + "metadata": {}, + "source": [ + "# Chapter 2: A Tour of PySpark Data Types" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e4f1e431-795b-4da9-87bd-0e0f0370cf69", + "metadata": {}, + "source": [ + "## Basic Data Types in PySpark\n", + "Understanding the basic data types in PySpark is crucial for defining DataFrame schemas and performing efficient data processing. Below is a detailed overview of each type, with descriptions, Python equivalents, and examples:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "19f6805c-d3f9-47ff-8f27-7d3a8314e8bc", + "metadata": {}, + "source": [ + "### Numerical Types\n", + "\n", + "ByteType\n", + "Used to store byte-length integers ranging from `-128` to `127`. Ideal for storing small data efficiently.\n", + "- Python Equivalent: `int` (`-128` to `127`)\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7e05a5cb-d814-4a43-9524-61a1b186871b", + "metadata": {}, + "outputs": [], + "source": [ + "byte_example = 127 # Maximum value for a signed byte" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c1afd5eb-ad40-4179-87bb-bd50b7b81882", + "metadata": {}, + "source": [ + "ShortType\n", + "Represents a short integer, storing values between `-32768` and `32767`. 
More efficient than using IntegerType for data with smaller numerical ranges.\n", + "- Python Equivalent: `int` (`-32768` to `32767`)\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2055833e-e92e-4ffc-9ed9-87be78ffe9f0", + "metadata": {}, + "outputs": [], + "source": [ + "short_example = 32767 # Maximum value for a signed short" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "92e7e44a", + "metadata": {}, + "source": [ + "IntegerType\n", + "Used to store integer values. Ideal for counts, indices, and any discrete quantity.\n", + "- Python Equivalent: `int` (`-2147483648` to `2147483647`)\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2d05fce6", + "metadata": {}, + "outputs": [], + "source": [ + "integer_example = 123" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ac0b7692", + "metadata": {}, + "source": [ + "LongType\n", + "Suitable for storing large integer values, often used for identifiers or large counts.\n", + "- Python Equivalent: `int` (`-9223372036854775808` to `9223372036854775807`)\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "28a7a514", + "metadata": {}, + "outputs": [], + "source": [ + "long_integer_example = 1234567890123456789" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0aad9f8c-f655-4803-9c01-fd8a4c520121", + "metadata": {}, + "source": [ + "DoubleType\n", + "Provides double precision floating-point numbers for accurate and precise calculations.\n", + "- Python Equivalent: `float` (double precision)\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cc7f7e87-3046-49a2-8621-a238015deb20", + "metadata": {}, + "outputs": [], + "source": [ + "double_example = 12345.6789" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e5a314cb-2206-40a2-9565-3fba0763faf7", + "metadata": {}, + "source": [ + "FloatType\n", + "Used for floating-point numbers where less precision is acceptable in exchange for performance.\n", + "- Python Equivalent: `float` (single precision)\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5d9a0d47-ed4b-4cf6-b98d-629a4d2cfe10", + "metadata": {}, + "outputs": [], + "source": [ + "float_example = 123.456" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4fc2ca5a-f47f-484c-afb5-334555afb901", + "metadata": {}, + "source": [ + "DecimalType\n", + "Allows fixed precision and scale, used in scenarios requiring exact decimal representation, such as financial computations.\n", + "- Python Equivalent: `decimal.Decimal`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f2f3fb9e-97fa-416f-80c5-87e62ae78ab3", + "metadata": {}, + "outputs": [], + "source": [ + "from decimal import Decimal\n", + "decimal_example = Decimal('12345.6789')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e8b97b6e-94bb-4006-b3f6-daa8688e9f6a", + "metadata": {}, + "source": [ + "### StringType\n", + "Used for text data; supports Unicode and is capable of storing any string data.\n", + "- Python Equivalent: `str`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a30aba2f-b686-4382-accd-dd3d473181b7", + "metadata": {}, + "outputs": [], + "source": [ + "string_example = \"Hello, World!\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": 
"2e858a57-0e32-4806-b913-3ed74c9d40df", + "metadata": {}, + "source": [ + "### BinaryType\n", + "Used for raw byte data, such as file contents or images, stored as binary streams.\n", + "- Python Equivalent: `bytes`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "40d8f6e6-47a6-4030-bb61-030e2c439f9b", + "metadata": {}, + "outputs": [], + "source": [ + "binary_example = b'Hello, binary world!'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6ed47ac0-dfed-41f7-88aa-0947033fd451", + "metadata": {}, + "source": [ + "### BooleanType\n", + "Represents Boolean values, used extensively in conditional operations and filters.\n", + "- Python Equivalent: `bool`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1c0fc79e-cad2-4e0f-9c16-63805c0dd309", + "metadata": {}, + "outputs": [], + "source": [ + "boolean_example = True" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6de4ab5b-fb95-4bac-a72b-773d9808dd71", + "metadata": {}, + "source": [ + "### Datetime Types\n", + "\n", + "DateType\n", + "Used for dates without time, suitable for storing calendar dates like birthdays or specific days.\n", + "- Python Equivalent: `datetime.date`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "aa99ee3b-ec22-484d-98d1-de6478090cf3", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import date\n", + "date_example = date(2020, 1, 1)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "064f3f44-1e2e-46ab-953f-d8d971b1292e", + "metadata": {}, + "source": [ + "TimestampType\n", + "Stores both date and time, essential for recording precise moments in time, such as log timestamps.\n", + "- Python Equivalent: `datetime.datetime`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "935536c7-edc2-402c-a157-650f4e98613b", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "timestamp_example = datetime(2020, 1, 1, 12, 0)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c54c3210-7022-4adf-b364-8967a06703f9", + "metadata": {}, + "source": [ + "### Creating a DataFrame from Python Objects in PySpark\n", + "Here's how to define a schema and create a DataFrame in PySpark using the Python objects corresponding to each basic data type:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "42fcac44-3fb8-4e9e-82fa-1557275f04e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------+--------------------+------------+-----------+-------------+---------------+--------------------+-------------+----------+-------------------+\n", + "|integer_field| long_field|double_field|float_field|decimal_field| string_field| binary_field|boolean_field|date_field| timestamp_field|\n", + "+-------------+--------------------+------------+-----------+-------------+---------------+--------------------+-------------+----------+-------------------+\n", + "| 123| 1234567890123456789| 12345.6789| 123.456| 12345.67| Hello, World!|[48 65 6C 6C 6F 2...| true|2020-01-01|2020-01-01 12:00:00|\n", + "| 456| 9223372036854775807| 98765.4321| 987.654| 98765.43|Goodbye, World!|[47 6F 6F 64 62 7...| false|2025-12-31|2025-12-31 23:59:00|\n", + "| -1|-1234567890123456789| -12345.6789| -123.456| -12345.67|Negative Values|[4E 65 67 61 74 6...| false|1990-01-01|1990-01-01 00:00:00|\n", + "| 0| 0| 
0.0| 0.0| 0.00| | []| true|2000-01-01|2000-01-01 00:00:00|\n", + "+-------------+--------------------+------------+-----------+-------------+---------------+--------------------+-------------+----------+-------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.types import StructType, StructField, IntegerType, LongType, DoubleType, FloatType\n", + "from pyspark.sql.types import DecimalType, StringType, BinaryType, BooleanType, DateType, TimestampType\n", + "from decimal import Decimal\n", + "from datetime import date, datetime\n", + "\n", + "# Define the schema of the DataFrame\n", + "schema = StructType([\n", + " StructField(\"integer_field\", IntegerType(), nullable=False),\n", + " StructField(\"long_field\", LongType(), nullable=False),\n", + " StructField(\"double_field\", DoubleType(), nullable=False),\n", + " StructField(\"float_field\", FloatType(), nullable=False),\n", + " StructField(\"decimal_field\", DecimalType(10, 2), nullable=False),\n", + " StructField(\"string_field\", StringType(), nullable=False),\n", + " StructField(\"binary_field\", BinaryType(), nullable=False),\n", + " StructField(\"boolean_field\", BooleanType(), nullable=False),\n", + " StructField(\"date_field\", DateType(), nullable=False),\n", + " StructField(\"timestamp_field\", TimestampType(), nullable=False)\n", + "])\n", + "\n", + "# Sample data using the Python objects corresponding to each PySpark type\n", + "data = [\n", + " (123, 1234567890123456789, 12345.6789, 123.456, Decimal('12345.67'), \"Hello, World!\",\n", + " b'Hello, binary world!', True, date(2020, 1, 1), datetime(2020, 1, 1, 12, 0)),\n", + " (456, 9223372036854775807, 98765.4321, 987.654, Decimal('98765.43'), \"Goodbye, World!\",\n", + " b'Goodbye, binary world!', False, date(2025, 12, 31), datetime(2025, 12, 31, 23, 59)),\n", + " (-1, -1234567890123456789, -12345.6789, -123.456, Decimal('-12345.67'), \"Negative Values\",\n", + " b'Negative binary!', False, date(1990, 1, 1), datetime(1990, 1, 1, 0, 0)),\n", + " (0, 0, 0.0, 0.0, Decimal('0.00'), \"\", b'', True, date(2000, 1, 1), datetime(2000, 1, 1, 0, 0))\n", + "]\n", + "\n", + "# Create DataFrame\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "# Show the DataFrame\n", + "df.show()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2495a43f-58a8-4027-9987-457487656ee7", + "metadata": {}, + "source": [ + "## Precision for Doubles, Floats, and Decimals\n", + "Understanding precision in numerical data types is critical for data integrity, especially in fields requiring high accuracy such as financial analysis, scientific computation, and engineering. PySpark offers different data types to cater to these needs." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "18e8f2db-782a-484f-ac0c-da7aaf19c79b", + "metadata": {}, + "source": [ + "FloatType\n", + "`FloatType` in PySpark represents a single precision 32-bit IEEE 754 floating-point number. It's less precise but requires less storage and can be processed faster than DoubleType. This makes it suitable for applications where a large volume of numerical data needs to be processed quickly and extreme precision is not critical.\n", + "Usage Scenario\n", + "Useful in machine learning algorithms for faster computation when processing large datasets." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3d1c9c29-4de9-478d-b336-3e141ef81dcc", + "metadata": {}, + "source": [ + "DoubleType\n", + "`DoubleType` corresponds to a double precision 64-bit IEEE 754 floating-point number. It provides a good balance between precision and performance and is suitable for most numerical calculations where precision is important.\n", + "Usage Scenario\n", + "Ideal for financial calculations where precision is more crucial than computational speed." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0ef870f4-575a-4e1f-912c-d4c84180beaf", + "metadata": {}, + "source": [ + "DecimalType\n", + "`DecimalType` is used when dealing with high-precision fixed-scale decimal numbers. The precision and scale can be defined by the user, which makes it invaluable for applications such as financial reporting, where precise decimal representation helps avoid rounding errors.\n", + "Usage Scenario\n", + "Critical in accounting applications where calculations need to be accurate to the cent." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4dc12018-9e48-47da-a898-c05ae37fb4ea", + "metadata": {}, + "source": [ + "### Example: Calculating Financial Statistics\n", + "This example demonstrates how to use different numerical data types in PySpark for financial calculations, such as aggregating revenues and calculating averages with appropriate precision." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "144459bf-8e8e-4c38-b820-f259f225709f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------+---------------------+--------------------+----------------------+---------------------+-----------------------+\n", + "|Total_Revenue_Float|Average_Revenue_Float|Total_Revenue_Double|Average_Revenue_Double|Total_Revenue_Decimal|Average_Revenue_Decimal|\n", + "+-------------------+---------------------+--------------------+----------------------+---------------------+-----------------------+\n", + "| 165,432.20| 55,144.07| 165,432.21| 55,144.07| 165,432.21| 55,144.07|\n", + "+-------------------+---------------------+--------------------+----------------------+---------------------+-----------------------+\n", + "\n" + ] + } + ], + "source": [ + "from decimal import Decimal\n", + "\n", + "from pyspark.sql.types import StructType, StructField, FloatType, DoubleType, DecimalType\n", + "from pyspark.sql.functions import sum, avg, col, format_number\n", + "\n", + "# Define the schema of the DataFrame\n", + "schema = StructType([\n", + " StructField(\"revenue_float\", FloatType(), nullable=False),\n", + " StructField(\"revenue_double\", DoubleType(), nullable=False),\n", + " StructField(\"revenue_decimal\", DecimalType(10, 2), nullable=False)\n", + "])\n", + "\n", + "# Sample data\n", + "data = [\n", + " (12345.67, 12345.6789, Decimal('12345.68')),\n", + " (98765.43, 98765.4321, Decimal('98765.43')),\n", + " (54321.10, 54321.0987, Decimal('54321.10'))\n", + "]\n", + "\n", + "# Create DataFrame\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "# Calculations\n", + "result = df.select(\n", + " format_number(sum(col(\"revenue_float\")), 2).alias(\"Total_Revenue_Float\"),\n", + " format_number(avg(col(\"revenue_float\")), 2).alias(\"Average_Revenue_Float\"),\n", + " format_number(sum(col(\"revenue_double\")), 2).alias(\"Total_Revenue_Double\"),\n", + " format_number(avg(col(\"revenue_double\")), 
2).alias(\"Average_Revenue_Double\"),\n", + " format_number(sum(col(\"revenue_decimal\")), 2).alias(\"Total_Revenue_Decimal\"),\n", + " format_number(avg(col(\"revenue_decimal\")), 2).alias(\"Average_Revenue_Decimal\")\n", + ")\n", + "\n", + "result.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "22e45ca5-d694-4bc1-83ac-ad1fc13fa745", + "metadata": {}, + "source": [ + "## Complex Data Types in PySpark\n", + "Complex data types in PySpark facilitate the handling of nested and structured data, which is essential for working with modern data formats like JSON, XML, and others commonly found in big data ecosystems. This section explores the primary complex data types available in PySpark: `ArrayType`, `StructType`, `MapType`, and their use cases." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "993de0fb-c568-42a2-85f0-5ab1bb80027d", + "metadata": {}, + "source": [ + "ArrayType\n", + "Allows storage of multiple values of the same type in a single column. Ideal for data that naturally forms a list, such as tags, categories, or historical data points.\n", + "- Python Equivalent: `list`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "547ba1eb-e115-4c07-b554-b3f69e321042", + "metadata": {}, + "outputs": [], + "source": [ + "array_example = ['apple', 'banana', 'cherry']" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "66549a85-4e94-4bb7-adf3-547bc8f7e7e7", + "metadata": {}, + "source": [ + "Usage Scenario\n", + "Managing lists of items associated with each record, such as multiple phone numbers or email addresses for a single contact." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c7f28820-4db9-4bb2-9290-c472585b1e6f", + "metadata": {}, + "source": [ + "StructType\n", + "Enables nesting of DataFrame columns, allowing complex and hierarchical data structures within a single DataFrame cell. Each field in a `StructType` can itself be a complex type. It's similar to a row in a DataFrame, typically used to encapsulate records with a structured schema.\n", + "- Python Equivalent: `pyspark.sql.Row`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "51e2337c-4d81-4d12-8ebf-ed3bdbab9ce7", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import Row\n", + "struct_example = Row(name=\"John Doe\", age=30, address=Row(street=\"123 Elm St\", city=\"Somewhere\"))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7bc34c76-0037-45e0-a640-488f393a9536", + "metadata": {}, + "source": [ + "Usage Scenario\n", + "Often used to represent a JSON object, enabling the manipulation of each JSON field as if it were a column in the DataFrame." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a5049490-8a4d-4f2d-99ed-1554cd8a4ef9", + "metadata": {}, + "source": [ + "MapType\n", + "Represents a key-value pair in a DataFrame column, where each key and value can be of any data type. 
Useful for dynamically structured data.\n", + "- Python Equivalent: `dict`\n", + "Python Example" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6bb04ee8-b9f0-427f-88a7-be0cdc049192", + "metadata": {}, + "outputs": [], + "source": [ + "map_example = {'food': 'pizza', 'color': 'blue', 'car': 'Tesla'}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7506bea9-6487-4202-bde4-3b0764ce92b2", + "metadata": {}, + "source": [ + "Usage Scenario\n", + "Storing and processing collections of key-value pairs within a single DataFrame column, like attributes of a product where keys are attribute names and values are attribute values." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "915a020f-b439-4da9-9090-7cdb54f32aef", + "metadata": {}, + "source": [ + "### Example: Handling Complex Nested Data\n", + "To illustrate the use of these complex data types, let's consider a practical example involving nested data structures such as a customer record that includes multiple addresses and preferences in various categories." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1bee3e29-0cc3-4345-aa05-c5e85d4a9b6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+---------------------------------------------------------------+---------------------------------------------+\n", + "|name |addresses |preferences |\n", + "+----------+---------------------------------------------------------------+---------------------------------------------+\n", + "|John Doe |[{123 Elm St, Somewhere, 12345}, {456 Oak St, Anywhere, 67890}]|{color -> blue, car -> Tesla, food -> pizza} |\n", + "|Jane Smith|[{789 Pine St, Everywhere, 10112}] |{color -> green, car -> Honda, food -> sushi}|\n", + "+----------+---------------------------------------------------------------+---------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType\n", + "from pyspark.sql import Row\n", + "\n", + "# Define the schema of the DataFrame\n", + "schema = StructType([\n", + " StructField(\"name\", StringType(), nullable=False),\n", + " StructField(\"addresses\", ArrayType(\n", + " StructType([\n", + " StructField(\"street\", StringType(), nullable=False),\n", + " StructField(\"city\", StringType(), nullable=False),\n", + " StructField(\"zip\", StringType(), nullable=False)\n", + " ])\n", + " ), nullable=True),\n", + " StructField(\"preferences\", MapType(StringType(), StringType()), nullable=True)\n", + "])\n", + "\n", + "# Sample data using Row objects for StructType\n", + "data = [\n", + " Row(name=\"John Doe\",\n", + " addresses=[Row(street=\"123 Elm St\", city=\"Somewhere\", zip=\"12345\"),\n", + " Row(street=\"456 Oak St\", city=\"Anywhere\", zip=\"67890\")],\n", + " preferences={\"food\": \"pizza\", \"color\": \"blue\", \"car\": \"Tesla\"}),\n", + " Row(name=\"Jane Smith\",\n", + " addresses=[Row(street=\"789 Pine St\", city=\"Everywhere\", zip=\"10112\")],\n", + " preferences={\"food\": \"sushi\", \"color\": \"green\", \"car\": \"Honda\"})\n", + "]\n", + "\n", + "# Create DataFrame\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "# Show the DataFrame\n", + "df.show(truncate=False)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1fdd3857-2e87-46c7-bd34-f652b39dcf1c", + "metadata": {}, + "source": [ + "In this example:\n", + "- `ArrayType` is used to store 
multiple addresses for each customer.\n", + "- `StructType` is nested within `ArrayType` to represent each address as a structured record.\n", + "- `MapType` stores preferences as key-value pairs, allowing for dynamic data storage." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9d374572-a06b-48dd-87e1-c21a89b6f6e7", + "metadata": {}, + "source": [ + "## Casting Columns in PySpark\n", + "Casting columns is a fundamental operation in data processing where the data type of a column in a DataFrame is converted from one type to another. PySpark provides straightforward methods that enable you to align input data types with the requirements of data processing operations or applications." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "13a44a7d-48cb-4548-a020-489b8753dc55", + "metadata": {}, + "source": [ + "### How to Cast Columns\n", + "To cast columns in PySpark, the `cast()` or `astype()` method can be used on a column. Here’s a complete example demonstrating how to perform basic casting operations:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0bc63e7b-0819-4dff-bb42-db63f4b17928", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original DataFrame:\n", + "+------------+-------------+\n", + "|float_column|string_column|\n", + "+------------+-------------+\n", + "| 123.456| 123|\n", + "| 789.012| 456|\n", + "| NULL| 789|\n", + "+------------+-------------+\n", + "\n", + "DataFrame after Casting:\n", + "+------------+-------------+-----------------+-------------------+\n", + "|float_column|string_column|string_from_float|integer_from_string|\n", + "+------------+-------------+-----------------+-------------------+\n", + "| 123.456| 123| 123.456| 123|\n", + "| 789.012| 456| 789.012| 456|\n", + "| NULL| 789| NULL| 789|\n", + "+------------+-------------+-----------------+-------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import StructType, StructField, StringType, FloatType\n", + "\n", + "# Define the schema of the DataFrame\n", + "schema = StructType([\n", + " StructField(\"float_column\", FloatType(), nullable=True),\n", + " StructField(\"string_column\", StringType(), nullable=True)\n", + "])\n", + "\n", + "# Sample data\n", + "data = [\n", + " (123.456, \"123\"),\n", + " (789.012, \"456\"),\n", + " (None, \"789\")\n", + "]\n", + "\n", + "# Create DataFrame\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "# Display original DataFrame\n", + "print(\"Original DataFrame:\")\n", + "df.show()\n", + "\n", + "# Example of casting a float column to string\n", + "df = df.withColumn('string_from_float', col('float_column').cast('string'))\n", + "\n", + "# Example of casting a string column to integer\n", + "df = df.withColumn('integer_from_string', col('string_column').cast('integer'))\n", + "\n", + "# Display DataFrame after casting\n", + "print(\"DataFrame after Casting:\")\n", + "df.show()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "433312a0-9860-4879-8859-e94d711447b3", + "metadata": {}, + "source": [ + "### Cast with Caution: Potential Data Loss\n", + "When casting columns, it's important to be aware of how PySpark handles incompatible or invalid casting operations:\n", + "\n", + "Silent Conversion to Null\n", + "- If ANSI mode is disabled, PySpark does not throw an error if a value cannot be converted to the desired type during casting. 
Instead, it overflows or converts the value to `null`. This behavior can lead to data loss in your dataset, which might not be immediately obvious.\n", + "- If ANSI mode is enabled, PySpark throws an error in that case. If it is acceptable, use `try_cast` instead.\n", + "\n", + "Example: Checking for Data Loss\n", + "- It's a good practice to check for unexpected nulls that result from casting operations, especially when converting from string to numeric types where formatting issues may cause failures." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ccbd0999-89b1-47c4-80d8-b5b07dca627f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original DataFrame:\n", + "+---------------+\n", + "|original_column|\n", + "+---------------+\n", + "| 123|\n", + "| abc|\n", + "| NULL|\n", + "+---------------+\n", + "\n", + "DataFrame Showing Potential Data Loss:\n", + "+---------------+-------------+\n", + "|original_column|casted_column|\n", + "+---------------+-------------+\n", + "| abc| NULL|\n", + "+---------------+-------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import StructType, StructField, StringType\n", + "\n", + "# Disable ANSI mode\n", + "spark.conf.set(\"spark.sql.ansi.enabled\", False)\n", + "\n", + "# Define the schema of the DataFrame\n", + "schema = StructType([\n", + " StructField(\"original_column\", StringType(), nullable=True)\n", + "])\n", + "\n", + "# Sample data\n", + "data = [\n", + " (\"123\",), # Valid integer in string form\n", + " (\"abc\",), # Invalid, will result in null when cast to integer\n", + " (None,) # Original null, remains null\n", + "]\n", + "\n", + "# Create DataFrame\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "# Display original DataFrame\n", + "print(\"Original DataFrame:\")\n", + "df.show()\n", + "\n", + "# Add a new column with casted values\n", + "df = df.withColumn('casted_column', col('original_column').cast('integer'))\n", + "\n", + "# Show rows where casting resulted in nulls but the original column had data\n", + "print(\"DataFrame Showing Potential Data Loss:\")\n", + "df.filter(col('original_column').isNotNull() & col('casted_column').isNull()).show()\n", + "\n", + "spark.conf.unset(\"spark.sql.ansi.enabled\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "07832bae-7065-4f27-a5c4-ea67a609d31d", + "metadata": {}, + "source": [ + "### Best Practices for Casting\n", + "\n", + "Validate Data First\n", + "- Before casting columns, especially when converting strings to numerical types, validate and clean your data to ensure it conforms to expected formats.\n", + "\n", + "Example: Checking if numeric strings are properly formatted before casting to integers" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a0ed0018-fb20-47b7-a652-a4c6c9c3100b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+\n", + "|data|\n", + "+----+\n", + "| 100|\n", + "| 300|\n", + "+----+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import col, regexp_extract\n", + "\n", + "# Sample DataFrame with a string column\n", + "df = spark.createDataFrame([(\"100\",), (\"20x\",), (\"300\",)], [\"data\"])\n", + "\n", + "# Checking and filtering rows where data can be safely cast to an integer\n", + "valid_df = df.filter(regexp_extract(col(\"data\"), '^[0-9]+$', 0) != \"\")\n", + "valid_df.show()" + 
] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f6608cab-c774-4d89-9f7e-6b739855988d", + "metadata": {}, + "source": [ + "Use Explicit Schemas\n", + "- When reading data, use explicit schemas to avoid incorrect data type inference, which can minimize the need for casting.\n", + "\n", + "Example: Specifying a schema when reading data to ensure correct data types are applied from the start" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e217a874-cbba-4041-a153-163433f2c9ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- Employee ID: integer (nullable = true)\n", + " |-- Role: string (nullable = true)\n", + " |-- Location: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n", + "\n", + "# Define a schema\n", + "schema = StructType([\n", + " StructField(\"Employee ID\", IntegerType(), True),\n", + " StructField(\"Role\", StringType(), True),\n", + " StructField(\"Location\", StringType(), True)\n", + "])\n", + "\n", + "# Read data with an explicit schema\n", + "df = spark.read.csv(\"../data/employees.csv\", schema=schema)\n", + "df.printSchema()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "81d081eb-be08-41bd-9f9b-cc240214c662", + "metadata": {}, + "source": [ + "## Semi-Structured Data Processing in PySpark\n", + "This section explores PySpark’s capabilities for handling semi-structured data formats, particularly focusing on JSON and XML, and addresses approaches for managing VARIANT-like data, which is commonly used in some SQL databases." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7b00a46d-52f4-47c4-8719-70a45f766634", + "metadata": {}, + "source": [ + "### JSON Processing\n", + "JSON is a widely used format in web services and data interchange. 
PySpark simplifies parsing JSON data into structured DataFrames, making it easy to manipulate and analyze.\n", + "\n", + "Key Functions\n", + "- `from_json()`: Converts JSON strings into a DataFrame column with a structured data type.\n", + "- `to_json()`: Converts columns of a DataFrame into JSON strings.\n", + "\n", + "Example: Parsing JSON Strings" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9b06236c-a1f1-479a-8c65-3f252a9a8474", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+\n", + "|parsed_json|\n", + "+-----------+\n", + "| {John, 30}|\n", + "| {Jane, 25}|\n", + "+-----------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import from_json, col\n", + "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", + "\n", + "json_schema = StructType([\n", + " StructField(\"name\", StringType()),\n", + " StructField(\"age\", IntegerType())\n", + "])\n", + "\n", + "df = spark.createDataFrame([(\"{\\\"name\\\":\\\"John\\\", \\\"age\\\":30}\",), (\"{\\\"name\\\":\\\"Jane\\\", \\\"age\\\":25}\",)], [\"json_str\"])\n", + "df.select(from_json(col(\"json_str\"), json_schema).alias(\"parsed_json\")).show()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "43152087-b2a8-4bc6-a709-41b27bc2a96a", + "metadata": {}, + "source": [ + "Example: Reading and Processing JSON Data" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "115a64a3-0ff2-41c6-bc54-2321c9d24203", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------+--------------------+---------+\n", + "| author| title| genre|\n", + "+-------------+--------------------+---------+\n", + "|George Orwell| 1984|Dystopian|\n", + "| Jane Austen| Pride and Prejudice| Romance|\n", + "| Mark Twain|Adventures of Huc...| Fiction|\n", + "+-------------+--------------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read.json('../data/books.json')\n", + "df.select(\"author\", \"title\", \"genre\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "165ed955-b461-4524-8837-6485ebe33afe", + "metadata": {}, + "source": [ + "### XML Processing\n", + "\n", + "
\n", + "Note: This section applies to Spark 4.0\n", + "
\n", + "\n", + "XML is another common format for semi-structured data, used extensively in various enterprise applications.\n", + "Example: Reading and Processing XML Data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e4651cd3-eba4-4bf2-83ca-e7185130f470", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------+--------------------+---------+\n", + "| author| title| genre|\n", + "+-------------+--------------------+---------+\n", + "|George Orwell| 1984|Dystopian|\n", + "| Jane Austen| Pride and Prejudice| Romance|\n", + "| Mark Twain|Adventures of Huc...| Fiction|\n", + "+-------------+--------------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read \\\n", + " .format('xml') \\\n", + " .option('rowTag', 'book') \\\n", + " .load('../data/books.xml')\n", + "df.select(\"author\", \"title\", \"genre\").show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "69f4daf5-fe9a-4399-93d2-abba41152962", + "metadata": {}, + "source": [ + "### Handling VARIANT Data Types in PySpark\n", + "\n", + "
\n", + "Note: This section applies to Spark 4.0\n", + "
\n", + "\n", + "With the introduction of the VARIANT data type, handling semi-structured data has become more streamlined. VARIANT types are designed to store data that doesn't conform to a fixed schema, such as JSON or XML, directly within a DataFrame column.\n", + "\n", + "Features of VARIANT in PySpark\n", + "- **Flexibility**: VARIANT types can store data structures like JSON or XML without predefined schema constraints, offering high flexibility for data ingestion and manipulation.\n", + "- **Integration**: Provides better integration with systems that use semi-structured data, allowing for more direct data exchanges and queries.\n", + "\n", + "Considerations When Using VARIANT\n", + "- **Performance**: While VARIANT provides flexibility, it might impact performance due to its dynamic nature. It's important to test and optimize data operations involving VARIANT types.\n", + "- **Compatibility**: Ensure that all parts of your data pipeline support VARIANT if you're leveraging this data type, especially when exporting data to external systems.\n", + "\n", + "Practical Example: Handling JSON Data with VARIANT\n", + "This example demonstrates how VARIANT can be used to handle JSON data effectively in PySpark:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f50668e1-ea8a-41bf-9bbf-850dff43f34c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- variant_data: variant (nullable = true)\n", + "\n", + "+-------------------------------------------------------+\n", + "|variant_data |\n", + "+-------------------------------------------------------+\n", + "|1234567890123456789 |\n", + "|12345.6789 |\n", + "|\"Hello, World!\" |\n", + "|true |\n", + "|{\"attributes\":{\"key1\":\"value1\",\"key2\":\"value2\"},\"id\":1}|\n", + "|{\"attributes\":{\"key1\":\"value3\",\"key2\":\"value4\"},\"id\":2}|\n", + "+-------------------------------------------------------+\n", + "\n", + "+-------------------+----+------+------+\n", + "| long_value| id| key1| key2|\n", + "+-------------------+----+------+------+\n", + "|1234567890123456789|NULL| NULL| NULL|\n", + "| 12345|NULL| NULL| NULL|\n", + "| NULL|NULL| NULL| NULL|\n", + "| 1|NULL| NULL| NULL|\n", + "| NULL| 1|value1|value2|\n", + "| NULL| 2|value3|value4|\n", + "+-------------------+----+------+------+\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[1234567890123456789,\n", + " Decimal('12345.6789'),\n", + " 'Hello, World!',\n", + " True,\n", + " {'attributes': {'key1': 'value1', 'key2': 'value2'}, 'id': 1},\n", + " {'attributes': {'key1': 'value3', 'key2': 'value4'}, 'id': 2}]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import date, datetime\n", + "from decimal import Decimal\n", + "\n", + "from pyspark.sql.functions import try_parse_json, try_variant_get, col\n", + "\n", + "# Sample JSON data\n", + "data = [\n", + " '1234567890123456789',\n", + " '12345.6789',\n", + " '\"Hello, World!\"',\n", + " 'true',\n", + " '{\"id\": 1, \"attributes\": {\"key1\": \"value1\", \"key2\": \"value2\"}}',\n", + " '{\"id\": 2, \"attributes\": {\"key1\": \"value3\", \"key2\": \"value4\"}}',\n", + "]\n", + "\n", + "# Load data into DataFrame with VARIANT\n", + "df = spark.createDataFrame(data, StringType()).select(try_parse_json(col(\"value\")).alias(\"variant_data\"))\n", + "df.printSchema()\n", + "df.show(truncate=False)\n", + "\n", + "# Accessing elements inside the VARIANT\n", + "df.select(\n", 
+ " try_variant_get(col(\"variant_data\"), \"$\", \"long\").alias(\"long_value\"),\n", + " try_variant_get(col(\"variant_data\"), \"$.id\", \"int\").alias(\"id\"),\n", + " try_variant_get(col(\"variant_data\"), \"$.attributes.key1\", \"string\").alias(\"key1\"),\n", + " try_variant_get(col(\"variant_data\"), \"$.attributes.key2\", \"string\").alias(\"key2\"),\n", + ").show()\n", + "\n", + "# Collect data and convert to Python objects\n", + "[row[\"variant_data\"].toPython() for row in df.collect()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/docs/source/user_guide/udfandudtf.ipynb b/python/docs/source/user_guide/udfandudtf.ipynb new file mode 100644 index 000000000000..08f52ae461f6 --- /dev/null +++ b/python/docs/source/user_guide/udfandudtf.ipynb @@ -0,0 +1,997 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "42a74618-9a43-4f03-9c3c-a63380f23e75", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Chapter 5: Unleashing UDFs & UDTFs\n", + "\n", + "In large-scale data processing, customization is often necessary to extend the native capabilities of Spark. *Python User-Defined Functions (UDFs)* and *User-Defined Table Functions (UDTFs)* offer a way to perform complex transformations and computations using Python, seamlessly integrating them into Spark’s distributed environment.\n", + "\n", + "In this section, we’ll explore how to write and use UDFs and UDTFs in Python, leveraging PySpark to perform complex data transformations that go beyond Spark’s built-in functions." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "acc11405-3fdd-4309-90c0-08af0b2015a0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Python UDFs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2c3818be-4c83-4a80-9c6f-ca3a7197a651", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Categories of Python UDFs\n", + "\n", + "There are two main categories of UDFs supported in PySpark: Scalar Python UDFs and Pandas UDFs.\n", + "\n", + "- *Scalar Python UDFs* are user-defined scalar functions that take or return Python objects serialized/deserialized by [pickle](https://python.readthedocs.io/en/latest/library/pickle.html) or [Arrow](https://arrow.readthedocs.io/en/latest/) and operate one row at a time\n", + "- *Pandas UDFs* (a.k.a. Vectorized UDFs) are UDFs that take/return pandas Series or DataFrame serialized/deserialized by Apache Arrow and operate block by block. 
Pandas UDFs have some variations categorized by usage, with specific input and output types: Series to Series, Series to Scalar, and Iterator to Iterator.\n", + "\n", + "Based on Pandas UDFs implementation, there are also *Pandas Function APIs*: Map (i.e., `mapInPandas`) and (Co)Grouped Map (i.e., `applyInPandas`), as well as an Arrow Function API - `mapInArrow`." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "192eeb2f-8f3e-41fd-922c-9b024705233e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### To create a Scalar Python UDF" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "27aec808-13e8-4127-adb4-37dae1db057e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "In the code below, we've created a simple scalar Python UDF." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c41a09f4-5762-4e44-acb1-706aae7158c8", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "\n", + "@udf(returnType='int')\n", + "def slen(s: str):\n", + " return len(s)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bf375ec-4340-4cdc-8eb9-eed8fd0b83dc", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### Arrow optimization\n", + "\n", + "Scalar Python UDFs rely on [cloudpickle](https://pypi.org/project/cloudpickle/) for serialization and deserialization, and encounter performance bottlenecks, particularly when dealing with large data inputs and outputs. We introduce Arrow-optimized Python UDFs to significantly improve performance. \n", + "\n", + "At the core of this optimization lies Apache Arrow, a standardized cross-language columnar in-memory data representation. By harnessing Arrow, these UDFs bypass the traditional, slower methods of data (de)serialization, leading to swift data exchange between JVM and Python processes. With Apache Arrow's rich type system, these optimized UDFs offer a more consistent and standardized way to handle type coercion.\n", + "\n", + "We can control whether or not to enable Arrow optimization for individual UDFs by using the `useArrow` boolean parameter of `functions.udf`. 
An example is as shown below:\n", + "\n", + "```py\n", + "from pyspark.sql.functions import udf\n", + "\n", + "@udf(returnType='int', useArrow=True) # An Arrow Python UDF\n", + "def arrow_slen(s: str):\n", + " ...\n", + "```\n", + "\n", + "In addition, we can enable Arrow optimization for all UDFs of an entire SparkSession via a Spark configuration: `spark.sql.execution.pythonUDF.arrow.enabled`, as shown below:\n", + "\n", + "```py\n", + "spark.conf.set(\"spark.sql.execution.pythonUDF.arrow.enabled\", True)\n", + "\n", + "@udf(returnType='int') # An Arrow Python UDF\n", + "def arrow_slen(s: str):\n", + " ...\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0f131ffd-29da-458d-9281-17225e028ba5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### To use a Scalar Python UDF\n", + "\n", + "In Python, we can invoke a UDF directly on column(s), just like a built-in Spark function, as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "61da08a0-89f0-419e-8cfc-46916c518a31", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+-----------+\n", + "| name|name_length|\n", + "+-------+-----------+\n", + "| Alice| 5|\n", + "| Bob| 3|\n", + "|Charlie| 7|\n", + "+-------+-----------+\n", + "\n" + ] + } + ], + "source": [ + "data = [(\"Alice\",), (\"Bob\",), (\"Charlie\",)]\n", + "df = spark.createDataFrame(data, [\"name\"])\n", + "df.withColumn(\"name_length\", slen(df[\"name\"])).show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9789ada1-28cb-411d-bb0a-abe28e1c5c6b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### To create a Pandas UDF\n", + "\n", + "In the code below, we've created a Pandas UDF which takes one `pandas.Series` and outputs one `pandas.Series`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fe3d9287-6d79-4b3a-ac3c-0827a9cd3ac3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------+\n", + "|to_upper(name)|\n", + "+--------------+\n", + "| JOHN DOE|\n", + "+--------------+\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from pyspark.sql.functions import pandas_udf\n", + "\n", + "@pandas_udf(\"string\")\n", + "def to_upper(s: pd.Series) -> pd.Series:\n", + " return s.str.upper()\n", + "\n", + "df = spark.createDataFrame([(\"John Doe\",)], (\"name\",))\n", + "df.select(to_upper(\"name\")).show()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + 
"rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c2930070-349c-4510-bba1-e58c9c0c5db5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### To use a Pandas UDF\n", + "\n", + "Similar to a Scalar Python UDF, we can also invoke a pandas UDF directly on column(s):" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b770a716-7c7d-4e84-826e-34ebfab896bb", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+-----------+\n", + "| name|name_length|\n", + "+-------+-----------+\n", + "| Alice| ALICE|\n", + "| Bob| BOB|\n", + "|Charlie| CHARLIE|\n", + "+-------+-----------+\n", + "\n" + ] + } + ], + "source": [ + "data = [(\"Alice\",), (\"Bob\",), (\"Charlie\",)]\n", + "df = spark.createDataFrame(data, [\"name\"])\n", + "df.withColumn(\"name_length\", to_upper(df[\"name\"])).show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "903d4963-7d2b-4e6b-aecb-c1b012a66a34", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### More Examples\n", + "\n", + "#### Example 1: Python UDF to Process DataFrame with String and List Columns" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f041c492-eb98-4233-9b6e-ea6f580036ce", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------+-----------+--------------------------------+\n", + "|text_column |list_column|process_row |\n", + "+--------------+-----------+--------------------------------+\n", + "|Hello World |[1, 2, 3] |Vowels: 3, Doubled: [2, 4, 6] |\n", + "|PySpark is Fun|[4, 5, 6] |Vowels: 3, Doubled: [8, 10, 12] |\n", + "|PySpark Rocks |[7, 8, 9] |Vowels: 2, Doubled: [14, 16, 18]|\n", + "+--------------+-----------+--------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.types import ArrayType, IntegerType, StringType\n", + "from pyspark.sql.functions import udf\n", + "\n", + "data = [\n", + " (\"Hello World\", [1, 2, 3]),\n", + " (\"PySpark is Fun\", [4, 5, 6]),\n", + " (\"PySpark Rocks\", [7, 8, 9])\n", + "]\n", + "df = spark.createDataFrame(data, [\"text_column\", \"list_column\"])\n", + "\n", + "@udf(returnType=\"string\")\n", + "def process_row(text: str, numbers):\n", + " vowels_count = sum(1 for char in text if char in \"aeiouAEIOU\")\n", + " doubled = [x * 2 for x in numbers]\n", + " return f\"Vowels: {vowels_count}, Doubled: {doubled}\"\n", + "\n", + "df.withColumn(\"process_row\", process_row(df[\"text_column\"], df[\"list_column\"])).show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1a45ed0d-934d-451f-a259-e41696107e98", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, 
+ "source": [ + "#### Example 2: Pandas UDF for Statistical Computations and Complex Transformation" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ad7d9ccc-4b08-48b0-854b-78529e7789b3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------+----------------+------------------------------+\n", + "|numeric_column|text_column |result |\n", + "+--------------+----------------+------------------------------+\n", + "|10.0 |Spark |{10.0, 10.0, SPARK} |\n", + "|20.0 |Big Data |{20.0, 20.0, ataD giB} |\n", + "|30.0 |AI |{30.0, 30.0, AI} |\n", + "|40.0 |Machine Learning|{40.0, 40.0, gninraeL enihcaM}|\n", + "|50.0 |Deep Learning |{50.0, 50.0, gninraeL peeD} |\n", + "+--------------+----------------+------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import pandas_udf\n", + "from pyspark.sql.types import StructType, StructField, DoubleType, StringType\n", + "import pandas as pd\n", + "\n", + "data = [\n", + " (10.0, \"Spark\"),\n", + " (20.0, \"Big Data\"),\n", + " (30.0, \"AI\"),\n", + " (40.0, \"Machine Learning\"),\n", + " (50.0, \"Deep Learning\")\n", + "]\n", + "df = spark.createDataFrame(data, [\"numeric_column\", \"text_column\"])\n", + "\n", + "# Schema for the result\n", + "schema = StructType([\n", + " StructField(\"mean_value\", DoubleType(), True),\n", + " StructField(\"sum_value\", DoubleType(), True),\n", + " StructField(\"processed_text\", StringType(), True)\n", + "])\n", + "\n", + "@pandas_udf(schema)\n", + "def compute_stats_and_transform_string(numeric_col: pd.Series, text_col: pd.Series) -> pd.DataFrame:\n", + " mean_value = numeric_col.mean()\n", + " sum_value = numeric_col.sum()\n", + "\n", + " # Reverse the string if its length is greater than 5, otherwise capitalize it\n", + " processed_text = text_col.apply(lambda x: x[::-1] if len(x) > 5 else x.upper())\n", + "\n", + " result_df = pd.DataFrame({\n", + " \"mean_value\": [mean_value] * len(text_col),\n", + " \"sum_value\": [sum_value] * len(text_col),\n", + " \"processed_text\": processed_text\n", + " })\n", + " \n", + " return result_df\n", + "\n", + "df.withColumn(\"result\", compute_stats_and_transform_string(df[\"numeric_column\"], df[\"text_column\"])).show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a536afc7-d343-4ebf-b44a-ee94a5dd1cb0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Python UDTFs\n", + "\n", + "A Python user-defined table function (UDTF) is a new kind of function that returns a table as output instead of a single scalar result value. 
Once registered, a UDTF can appear in the FROM clause of a SQL query.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ddd5a76e-0339-4a07-b957-cfef51d2e5e1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### When to use Python UDTFs\n", + "\n", + "In short, if you want a function that generates multiple rows and columns, and want to leverage the rich Python ecosystem, Python UDTFs are for you.\n", + "\n", + "- **Python UDTFs vs Python UDFs**: While Python UDFs in Spark are designed to accept zero or more scalar values as input and return a single value as output, UDTFs offer more flexibility. They can return multiple rows and columns, extending the capabilities of UDFs. Here are a few scenarios where UDTFs are particularly helpful:\n", + "\n", + " - Exploding nested data types like arrays or structs, transforming them into multiple rows\n", + " - Dealing with string data that needs to be split into multiple parts, each represented as a separate row or multiple columns\n", + " - Generating rows based on input ranges, such as creating sequences of numbers, timestamps, or records for different dates\n", + "\n", + "- **Python UDTFs vs SQL UDTFs**: SQL UDTFs are efficient and versatile, but Python offers a richer set of libraries and tools. Compared to SQL, Python provides tools to enable advanced transformations or computations (e.g., statistical functions or machine learning inference)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2c1f1457-8bb5-4d50-b896-3d6717036b4a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### To create a Python UDTF\n", + "\n", + "In the code below, we've created a simple UDTF that takes two integers as inputs and produces two columns as output: the original number and its square.\n", + "\n", + "Note the use of the `yield` statement: a Python UDTF must yield each output row as a tuple or a `Row` object so that the results can be processed properly.\n", + "\n", + "Also note that the return type of the UDTF must be either a `StructType` or a DDL string representing a struct type; it defines the schema of the output table.\n",
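+ "\n", + "For illustration, a `StructType`-based declaration equivalent to the DDL string used in the next cell might look like the following sketch (the class name here is just a placeholder):\n", + "\n", + "```py\n", + "from pyspark.sql.functions import udtf\n", + "from pyspark.sql.types import StructType, StructField, IntegerType\n", + "\n", + "# Equivalent to returnType=\"num: int, squared: int\"\n", + "@udtf(returnType=StructType([\n", + "    StructField(\"num\", IntegerType()),\n", + "    StructField(\"squared\", IntegerType())\n", + "]))\n", + "class SquareNumbersStruct:\n", + "    def eval(self, start: int, end: int):\n", + "        for num in range(start, end + 1):\n", + "            yield (num, num * num)\n", + "```"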
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d1a18cf6-3680-4c88-900c-98e232071467", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udtf\n", + "\n", + "@udtf(returnType=\"num: int, squared: int\")\n", + "class SquareNumbers:\n", + " def eval(self, start: int, end: int):\n", + " for num in range(start, end + 1):\n", + " yield (num, num * num)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "689c7037-c9a1-404f-8452-ab51cfe207d9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### Arrow optimization\n", + "\n", + "Apache Arrow is an in-memory columnar data format that allows for efficient data transfers between Java and Python processes. It can significantly boost performance when the UDTF outputs many rows. Arrow-optimization can be enabled using `useArrow=True`, for example, \n", + "\n", + "```py\n", + "from pyspark.sql.functions import udtf\n", + "\n", + "@udtf(returnType=\"num: int, squared: int\", useArrow=True)\n", + "class SquareNumbers:\n", + " ...\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b8df8c4c-e020-461c-8a24-db03ef692ee1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### To use a Python UDTF\n", + "\n", + "In Python, we can invoke a UDTF directly using the class name, as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f0c91bd8-81e1-431c-bd71-34bcfe09a4e0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-------+\n", + "|num|squared|\n", + "+---+-------+\n", + "| 1| 1|\n", + "| 2| 4|\n", + "| 3| 9|\n", + "+---+-------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit\n", + "\n", + "SquareNumbers(lit(1), lit(3)).show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "073f65fb-8ece-40e5-a739-753c58eed95a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "In SQL, we can register the Python UDTF and then use it in SQL as a table-valued function in the FROM clause of a query.\n", + "```\n", + "spark.sql(\"SELECT * FROM square_numbers(1, 3)\").show()\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bec8df7c-0918-4359-8f4e-50a520af0238", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### More Examples\n", + "\n", + "#### Example 1: Generating Numbers, Their Squares, Cubes, and Factorials for a Range" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e8f7a6da-715e-4289-be94-14093b5b7cbf", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------+----+---------+\n", + "|num|square|cube|factorial|\n", + "+---+------+----+---------+\n", + "| 1| 1| 1| 1|\n", + "| 2| 4| 8| 2|\n", + "| 3| 9| 27| 6|\n", + "| 4| 16| 64| 24|\n", + "| 5| 25| 125| 120|\n", + "+---+------+----+---------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit, udtf\n", + "import math\n", + "\n", + "@udtf(returnType=\"num: int, square: int, cube: int, factorial: int\")\n", + "class GenerateComplexNumbers:\n", + " def eval(self, start: int, end: int):\n", + " for num in range(start, end + 1):\n", + " yield (num, num ** 2, num ** 3, math.factorial(num))\n", + "\n", + "GenerateComplexNumbers(lit(1), lit(5)).show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bdebf07-03c3-43b3-b228-ebf435dfee96", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### Example 2: Splitting a Sentence into Words and Performing Multiple Operations" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1843700-2434-47a0-88bf-dce3a6de9ce6", + "showTitle": false, + 
"tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------+-------------+\n", + "| word|length|is_palindrome|\n", + "+-----+------+-------------+\n", + "|hello| 5| false|\n", + "|world| 5| false|\n", + "+-----+------+-------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit, udtf\n", + "\n", + "@udtf(returnType=\"word: string, length: int, is_palindrome: boolean\")\n", + "class ProcessWords:\n", + " def eval(self, sentence: str):\n", + " words = sentence.split() # Split sentence into words\n", + " for word in words:\n", + " is_palindrome = word == word[::-1] # Check if the word is a palindrome\n", + " yield (word, len(word), is_palindrome)\n", + "\n", + "ProcessWords(lit(\"hello world\")).show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f8ac11f5-10d0-4ae2-b40d-dc422aa9270e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### Example 3: Parsing JSON String into Key-Value Pairs with Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "51b6a7a5-0956-4da4-a099-010ac01cd3f1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+-----+----------+\n", + "| key|value|value_type|\n", + "+----------+-----+----------+\n", + "| name|Alice| str|\n", + "| age| 25| int|\n", + "|is_student|False| bool|\n", + "+----------+-----+----------+\n", + "\n" + ] + } + ], + "source": [ + "import json\n", + "from pyspark.sql.functions import lit, udtf\n", + "\n", + "@udtf(returnType=\"key: string, value: string, value_type: string\")\n", + "class ParseJSON:\n", + " def eval(self, json_str: str):\n", + " try:\n", + " json_data = json.loads(json_str)\n", + " for key, value in json_data.items():\n", + " value_type = type(value).__name__\n", + " yield (key, str(value), value_type)\n", + " except json.JSONDecodeError:\n", + " yield (\"Invalid JSON\", \"\", \"\")\n", + "\n", + "ParseJSON(lit('{\"name\": \"Alice\", \"age\": 25, \"is_student\": false}')).show()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "10-pythonudfsandudtfs", + "widgets": {} + }, + "kernelspec": { + "display_name": "", + "language": "python", + "name": "" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py index 77757e4b6087..a3af94e0e808 100644 --- a/python/pyspark/pandas/accessors.py +++ b/python/pyspark/pandas/accessors.py @@ -62,7 +62,7 @@ def attach_id_column(self, id_type: str, column: Name) -> "DataFrame": Attach a column to be used as an identifier of rows similar to the default index. 
See also `Default Index type - `_. + `_. Parameters ---------- @@ -203,7 +203,7 @@ def apply_batch( DataFrame given to the function is of a batch used internally. See also `Transform and apply a function - `_. + `_. .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each @@ -437,7 +437,7 @@ def transform_batch( each input and output should be the same. See also `Transform and apply a function - `_. + `_. .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each @@ -776,7 +776,7 @@ def transform_batch( The pandas Series given to the function is of a batch used internally. See also `Transform and apply a function - `_. + `_. .. note:: the `func` is unable to access the whole input series. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 86820573344e..6d6ac917b002 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -2976,7 +2976,7 @@ def apply( (``axis=1``). See also `Transform and apply a function - `_. + `_. .. note:: when `axis` is 0 or 'index', the `func` is unable to access to the whole input series. pandas-on-Spark internally splits the input series into @@ -3302,7 +3302,7 @@ def transform( and that has the same length as its input. See also `Transform and apply a function - `_. + `_. .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 8de03918a4cd..2c24c7a7cec2 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -2976,7 +2976,7 @@ def transform(self, func: Callable[..., pd.Series], *args: Any, **kwargs: Any) - When the given function has the return type annotated, the original index of the GroupBy object will be lost, and a default index will be attached to the result. Please be careful about configuring the default index. See also `Default Index Type - `_. + `_. .. note:: the series within ``func`` is actually a pandas series. Therefore, any pandas API within this function is allowed.