From 810c48ab5846745df8ca2b18104fb38445dfda30 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Sep 2019 08:52:09 +0200 Subject: [PATCH] Add 6_calculate_statistics.ipynb from master --- notebooks/6_calculate_statistics.ipynb | 873 +++++++++++++++++++++++++ 1 file changed, 873 insertions(+) create mode 100644 notebooks/6_calculate_statistics.ipynb diff --git a/notebooks/6_calculate_statistics.ipynb b/notebooks/6_calculate_statistics.ipynb new file mode 100644 index 0000000..018cb12 --- /dev/null +++ b/notebooks/6_calculate_statistics.ipynb @@ -0,0 +1,873 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Objectives\n", + "\n", + "- Calculate summary statistics for column(s) of interest\n", + "- Calculate summary statistics for column(s) of interest grouped by given column\n", + "- Introduce the split-apply-combine approach\n", + "- Get the value counts for a given column\n", + "\n", + "Content to cover\n", + "\n", + "- min/mean/max/corr\n", + "- groupby() + min/max/mean…\n", + "- value_counts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic = pd.read_csv(\"../data/titanic.csv\")\n", + "titanic.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calculate statistics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregating statistics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/06_aggregate.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> What is the average age of the titanic passengers?" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "29.69911764705882" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic[\"Age\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Different statistics are available and can be applied to columns with numerical data. Operations in general exclude missing data and operate across rows by default."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/06_reduction.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> What is the median age and ticket fare price of the titanic passengers?" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Age 28.0000\n", + "Fare 14.4542\n", + "dtype: float64" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic[[\"Age\", \"Fare\"]].median()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The aggregating statistic can be calculated for multiple columns at the same time. Remember the `describe` function from the [first tutorial](1_table_oriented.ipynb)?" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeFare
count714.000000891.000000
mean29.69911832.204208
std14.52649749.693429
min0.4200000.000000
25%20.1250007.910400
50%28.00000014.454200
75%38.00000031.000000
max80.000000512.329200
\n", + "
" + ], + "text/plain": [ + " Age Fare\n", + "count 714.000000 891.000000\n", + "mean 29.699118 32.204208\n", + "std 14.526497 49.693429\n", + "min 0.420000 0.000000\n", + "25% 20.125000 7.910400\n", + "50% 28.000000 14.454200\n", + "75% 38.000000 31.000000\n", + "max 80.000000 512.329200" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic[[\"Age\", \"Fare\"]].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instead of the predefined statistics, specific combinations of aggregating statistics for given columns can be defined using the `agg` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeFare
max80.000000512.329200
meanNaN32.204208
median28.00000014.454200
min0.4200000.000000
skew0.389108NaN
\n", + "
" + ], + "text/plain": [ + " Age Fare\n", + "max 80.000000 512.329200\n", + "mean NaN 32.204208\n", + "median 28.000000 14.454200\n", + "min 0.420000 0.000000\n", + "skew 0.389108 NaN" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.agg({'Age' : ['min', 'max', 'median', 'skew'], \n", + " 'Fare' : ['min', 'max', 'median', 'mean']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ Further details about descriptive statistics are provided in :ref:`basics.stats`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregating statistics grouped by category" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To introduce the concept, let's focus on the Age and Sex columns of the titanic data set:" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSex
022.0male
138.0female
226.0female
335.0female
435.0male
\n", + "
" + ], + "text/plain": [ + " Age Sex\n", + "0 22.0 male\n", + "1 38.0 female\n", + "2 26.0 female\n", + "3 35.0 female\n", + "4 35.0 male" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_subset = titanic[[\"Age\", \"Sex\"]]\n", + "titanic_subset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/06_groupby.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> What is the average age for male versus female titanic passengers?" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Age
Sex
female27.915709
male30.726645
\n", + "
" + ], + "text/plain": [ + " Age\n", + "Sex \n", + "female 27.915709\n", + "male 30.726645" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_subset.groupby(\"Sex\").mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculating a given statistic (e.g. `mean` age) _for each category in a column_ (e.g. male/female in the `Sex` column) is a common pattern. The `groupby` method is used to support this type of operation. More generally, this fits the `split-apply-combine` pattern:\n", + "\n", + "* __Split__ the data into groups\n", + "* __Apply__ a function to each group independently\n", + "* __Combine__ the results into a data structure\n", + "\n", + "The apply and combine steps are typically done together in pandas." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous example, a subset of the data was used. To apply the pattern on the entire DataFrame, the selection of columns is supported on the grouped data as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sex\n", + "female 27.915709\n", + "male 30.726645\n", + "Name: Age, dtype: float64" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.groupby(\"Sex\")[\"Age\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/06_groupby_select_detail.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> What is the mean ticket fare price for each of the sex and cabin class combinations?"
+ ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sex Pclass\n", + "female 1 106.125798\n", + " 2 21.970121\n", + " 3 16.118810\n", + "male 1 67.226127\n", + " 2 19.741782\n", + " 3 12.661633\n", + "Name: Fare, dtype: float64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.groupby([\"Sex\", \"Pclass\"])[\"Fare\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Grouping can be done by multiple columns at the same time. Provide the column names as a list to the groupby method." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ More information on groupby and the split-apply-combine approach is provided in :ref:`api.groupby`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count number of records by category" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/06_valuecounts.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> What is the number of passengers in each of the cabin classes?" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3 491\n", + "1 216\n", + "2 184\n", + "Name: Pclass, dtype: int64" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic[\"Pclass\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `value_counts` function counts the number of records for each category in a column. 
The function is a shortcut, as it is actually a `groupby` operation combined with counting the number of records within each group: " + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pclass\n", + "1 216\n", + "2 184\n", + "3 491\n", + "Name: Pclass, dtype: int64" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.groupby(\"Pclass\")[\"Pclass\"].count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " \n", + "__Note__: Both `size` and `count` can be used in combination with `groupby`. Whereas `size` includes `NaN` values and just provides the number of rows (size of the table), `count` excludes the missing values. In the `value_counts` method, use the `dropna` argument to include or exclude the `NaN` values.\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ For more information about `value_counts`, see :ref:`basics.discretization`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## REMEMBER\n", + "\n", + "- Aggregation statistics can be calculated on entire columns or rows\n", + "- `groupby` provides the power of the _split-apply-combine_ pattern\n", + "- `value_counts` is a convenient shortcut to count the number of entries in each category of a variable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ More information on groupby and the split-apply-combine approach is provided in :ref:`api.groupby`." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}