% index 2.tex

% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
  letterpaper,
  DIV=11,
  numbers=noendperiod]{scrreprt}

\usepackage{amsmath,amssymb}
\usepackage{lmodern}
\usepackage{iftex}
\ifPDFTeX
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
  \usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
  \usepackage{unicode-math}
  \defaultfontfeatures{Scale=MatchLowercase}
  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
  \usepackage[]{microtype}
  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph layout: drop first-line indentation and separate paragraphs
% with vertical space. \makeatletter is needed because the class test
% below is written with the internal macro \@ifundefined.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
  \IfFileExists{parskip.sty}{%
    % Prefer the parskip package when it is installed.
    \usepackage{parskip}
  }{% else
    % Fallback: set the two lengths by hand.
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
  % \KOMAClassName is defined, so use the class's own parskip option.
  \KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{5}
% Make \paragraph and \subparagraph free-standing: every heading is
% followed by an empty \mbox{} so that whatever comes next does not run
% into the heading.
%
% Both the plain and the starred form are wrapped. A one-argument wrapper
% would grab the `*' of \paragraph*{...} as the heading text, so the star
% is detected with \@ifstar and forwarded to the saved original command.
% The \ifx guards skip the redefinition for classes that do not define
% the command at all.
\makeatletter
\ifx\paragraph\undefined\else
  \let\oldparagraph\paragraph
  \renewcommand{\paragraph}{%
    \@ifstar\xxxParagraphStar\xxxParagraphNoStar}
  \newcommand{\xxxParagraphStar}[1]{\oldparagraph*{#1}\mbox{}}
  \newcommand{\xxxParagraphNoStar}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
  \let\oldsubparagraph\subparagraph
  \renewcommand{\subparagraph}{%
    \@ifstar\xxxSubParagraphStar\xxxSubParagraphNoStar}
  \newcommand{\xxxSubParagraphStar}[1]{\oldsubparagraph*{#1}\mbox{}}
  \newcommand{\xxxSubParagraphNoStar}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
\makeatother

\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
\definecolor{shadecolor}{RGB}{241,243,245}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.40,0.45,0.13}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.28,0.35,0.67}{#1}}
\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.46,0.62}{#1}}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.07,0.07,0.07}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}

% \tightlist is issued at the start of the compact lists in this document;
% it removes the extra space between items and between paragraphs inside
% the list. \providecommand leaves an existing definition untouched.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}%
  \setlength{\parskip}{0pt}%
}
% Table packages used by the document body.
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
% Patch \longtable (etoolbox \patchcmd): the \par in its definition is
% replaced by a version that first emits an empty \mbox{} when
% \if@noskipsec is true. The two trailing empty groups are the
% success/failure hooks; both are empty, so a failed patch goes unreported.
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
% \maxwidth: the natural width of the image being included, capped at the
% current \linewidth.
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
% \maxheight: the natural height of the image being included, capped at
% \textheight.
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother

\KOMAoption{captions}{tableheading}
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[many]{tcolorbox}}
\@ifpackageloaded{fontawesome5}{}{\usepackage{fontawesome5}}
\definecolor{quarto-callout-color}{HTML}{909090}
\definecolor{quarto-callout-note-color}{HTML}{0758E5}
\definecolor{quarto-callout-important-color}{HTML}{CC1914}
\definecolor{quarto-callout-warning-color}{HTML}{EB9113}
\definecolor{quarto-callout-tip-color}{HTML}{00A047}
\definecolor{quarto-callout-caution-color}{HTML}{FC5300}
\definecolor{quarto-callout-color-frame}{HTML}{acacac}
\definecolor{quarto-callout-note-color-frame}{HTML}{4582ec}
\definecolor{quarto-callout-important-color-frame}{HTML}{d9534f}
\definecolor{quarto-callout-warning-color-frame}{HTML}{f0ad4e}
\definecolor{quarto-callout-tip-color-frame}{HTML}{02b875}
\definecolor{quarto-callout-caution-color-frame}{HTML}{fd7e14}
\makeatother
\makeatletter
\makeatother
\makeatletter
\@ifpackageloaded{bookmark}{}{\usepackage{bookmark}}
\makeatother
\makeatletter
% Load caption only if it has not been loaded already.
\@ifpackageloaded{caption}{}{\usepackage{caption}}
% Fix the English names used for the contents and list headings and for
% figure/table captions. The assignments are deferred to \begin{document};
% each name is redefined when it already exists and created otherwise.
\AtBeginDocument{%
\ifdefined\contentsname
  \renewcommand*\contentsname{Table of contents}
\else
  \newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
  \renewcommand*\listfigurename{List of Figures}
\else
  \newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
  \renewcommand*\listtablename{List of Tables}
\else
  \newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
  \renewcommand*\figurename{Figure}
\else
  \newcommand\figurename{Figure}
\fi
\ifdefined\tablename
  \renewcommand*\tablename{Table}
\else
  \newcommand\tablename{Table}
\fi
}
% Code listings: a ruled float type named codelisting (float package).
\@ifpackageloaded{float}{}{\usepackage{float}}
\floatstyle{ruled}
% Placement is h and the list file extension is lop. When the class has a
% chapter counter, the float is declared with the [chapter] option.
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
% Caption label for the new float type.
\floatname{codelisting}{Listing}
% \listoflistings prints the list of codelisting floats under the heading
% "List of Listings".
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[many]{tcolorbox}}
\makeatother
\makeatletter
% Fallback definition of the colour used to shade code blocks.
% \@ifundefined takes three arguments: the name, the code to run when it
% is undefined, and the code to run when it is defined. The third argument
% was missing, so TeX took the next token in the file (the closing
% \makeatother) as that argument and dropped it whenever the "undefined"
% branch ran. The explicit empty group keeps the test self-contained.
% NOTE(review): this tests the control sequence \shadecolor, not the
% colour named shadecolor, so it may still override a colour defined
% earlier in the preamble -- confirm whether that is intended.
\@ifundefined{shadecolor}{\definecolor{shadecolor}{rgb}{.97, .97, .97}}{}
\makeatother
\makeatletter
\makeatother
\ifLuaTeX
  \usepackage{selnolig}  % disable illegal ligatures
\fi
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
\hypersetup{
  pdftitle={Principles and Techniques of Data Science},
  pdfauthor={Kanu Grover; Bella Crouch},
  colorlinks=true,
  linkcolor={blue},
  filecolor={Maroon},
  citecolor={Blue},
  urlcolor={Blue},
  pdfcreator={LaTeX via pandoc}}

\title{Principles and Techniques of Data Science}
\usepackage{etoolbox}
\makeatletter
% Fallback \subtitle: \providecommand only takes effect when \subtitle is
% not already defined. The fallback appends the subtitle, set in \large
% as its own paragraph, to \@title (etoolbox \apptocmd); the two empty
% trailing groups are the success/failure hooks.
\providecommand{\subtitle}[1]{% add subtitle to \maketitle
  \apptocmd{\@title}{\par {\large #1 \par}}{}{}
}
\makeatother
\subtitle{Data 100}
\author{Kanu Grover \and Bella Crouch}
\date{}

\begin{document}
\maketitle
\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[breakable, frame hidden, borderline west={3pt}{0pt}{shadecolor}, boxrule=0pt, enhanced, sharp corners, interior hidden]}{\end{tcolorbox}}\fi

\renewcommand*\contentsname{Table of contents}
{
\hypersetup{linkcolor=}
\setcounter{tocdepth}{2}
\tableofcontents
}
\bookmarksetup{startatroot}

\hypertarget{welcome}{%
\chapter*{Welcome}\label{welcome}}
\addcontentsline{toc}{chapter}{Welcome}

\markboth{Welcome}{Welcome}

\hypertarget{about-the-course-notes}{%
\section*{About the Course Notes}\label{about-the-course-notes}}
\addcontentsline{toc}{section}{About the Course Notes}

\markright{About the Course Notes}

This text was developed for the Spring 2023 Edition of the UC Berkeley
course Data 100: Principles and Techniques of Data Science.

As this project is in development during the Spring 2023 semester, the
course notes may be in flux. We appreciate your understanding. If you
spot any errors or would like to suggest any changes, please email us.
\textbf{Email}: data100.instructors@berkeley.edu

\bookmarksetup{startatroot}

\hypertarget{introduction}{%
\chapter{Introduction}\label{introduction}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Understand the stages of the data science lifecycle.
\end{itemize}

\end{tcolorbox}

Data science is an interdisciplinary field with a variety of
applications. The field is rapidly evolving; many of the key technical
underpinnings in modern-day data science have been popularized during
the early 21\textsuperscript{st} century.

A true mastery of data science requires a deep theoretical understanding
and strong grasp of domain expertise. This course will help you build on
the former -- specifically, the foundation of your technical knowledge.
To do so, we've organized concepts in Data 100 around the \textbf{data
science lifecycle}: an iterative process that encompasses the various
statistical and computational building blocks of data science.

\hypertarget{data-science-lifecycle}{%
\section{Data Science Lifecycle}\label{data-science-lifecycle}}

The data science lifecycle is a high-level overview of the data science
workflow. It's a cycle of stages that a data scientist should explore as
they conduct a thorough analysis of a data-driven problem.

There are many variations of the key ideas present in the data science
lifecycle. In Data 100, we visualize the stages of the lifecycle using a
flow diagram. Notice how there are two entry points.

\hypertarget{ask-a-question}{%
\subsection{Ask a Question}\label{ask-a-question}}

Whether by curiosity or necessity, data scientists will constantly ask
questions. For example, in the business world, data scientists may be
interested in predicting the profit generated by a certain investment.
In the field of medicine, they may ask whether some patients are more
likely than others to benefit from a treatment.

Posing questions is one of the primary ways the data science lifecycle
begins. It helps to fully define the question. Here are some things you
should ask yourself before framing a question.

\begin{itemize}
\tightlist
\item
  What do we want to know?

  \begin{itemize}
  \tightlist
  \item
    A question that is too ambiguous may lead to confusion.
  \end{itemize}
\item
  What problems are we trying to solve?

  \begin{itemize}
  \tightlist
  \item
    The goal of asking a question should be clear in order to justify
    your efforts to stakeholders.
  \end{itemize}
\item
  What are the hypotheses we want to test?

  \begin{itemize}
  \tightlist
  \item
    This gives a clear perspective from which to analyze final results.
  \end{itemize}
\item
  What are the metrics for our success?

  \begin{itemize}
  \tightlist
  \item
    This gives a clear point to know when to finish the project.
  \end{itemize}
\end{itemize}

\hypertarget{obtain-data}{%
\subsection{Obtain Data}\label{obtain-data}}

The second entry point to the lifecycle is by obtaining data. A careful
analysis of any problem requires the use of data. Data may be readily
available to us, or we may have to embark on a process to collect it.
When doing so, it's crucial to ask the following:

\begin{itemize}
\tightlist
\item
  What data do we have and what data do we need?

  \begin{itemize}
  \tightlist
  \item
    Define the units of the data (people, cities, points in time, etc.)
    and what features to measure.
  \end{itemize}
\item
  How will we sample more data?

  \begin{itemize}
  \tightlist
  \item
    Scrape the web, collect manually, etc.
  \end{itemize}
\item
  Is our data representative of the population we want to study?

  \begin{itemize}
  \tightlist
  \item
    If our data is not representative of our population of interest,
    then we can come to incorrect conclusions.
  \end{itemize}
\end{itemize}

Key procedures: \emph{data acquisition}, \emph{data cleaning}.

\hypertarget{understand-the-data}{%
\subsection{Understand the Data}\label{understand-the-data}}

Raw data itself is not inherently useful. It's impossible to discern all
the patterns and relationships between variables without carefully
investigating them. Therefore, translating pure data to actionable
insights is a key job of a data scientist. For example, we may choose to
ask:

\begin{itemize}
\tightlist
\item
  How is our data organized and what does it contain?

  \begin{itemize}
  \tightlist
  \item
    Knowing what the data says about the world helps us better
    understand the world.
  \end{itemize}
\item
  Do we have relevant data?

  \begin{itemize}
  \tightlist
  \item
    If the data we have collected is not useful to the question at hand,
    then we must collect more data.
  \end{itemize}
\item
  What are the biases, anomalies, or other issues with the data?

  \begin{itemize}
  \tightlist
  \item
    These can lead to many false conclusions if ignored, so data
    scientists must always be aware of these issues.
  \end{itemize}
\item
  How do we transform the data to enable effective analysis?

  \begin{itemize}
  \tightlist
  \item
    Data is not always easy to interpret at first glance, so a data
    scientist should reveal these hidden insights.
  \end{itemize}
\end{itemize}

Key procedures: \emph{exploratory data analysis}, \emph{data
visualization}.

\hypertarget{understand-the-world}{%
\subsection{Understand the World}\label{understand-the-world}}

After observing the patterns in our data, we can begin answering our
question. This may require that we predict a quantity (machine
learning), or measure the effect of some treatment (inference).

From here, we may choose to report our results, or possibly conduct more
analysis. We may not be satisfied by our findings, or our initial
exploration may have brought up new questions that require new data.

\begin{itemize}
\tightlist
\item
  What does the data say about the world?

  \begin{itemize}
  \tightlist
  \item
    Given our models, the data will lead us to certain conclusions about
    the real world.
  \end{itemize}
\item
  Does it answer our questions or accurately solve the problem?

  \begin{itemize}
  \tightlist
  \item
    If our model and data cannot accomplish our goals, then we must
    reform our question, model, or both.
  \end{itemize}
\item
  How robust are our conclusions and can we trust the predictions?

  \begin{itemize}
  \tightlist
  \item
    Inaccurate models can lead to untrue conclusions.
  \end{itemize}
\end{itemize}

Key procedures: \emph{model creation}, \emph{prediction},
\emph{inference}.

\hypertarget{conclusion}{%
\section{Conclusion}\label{conclusion}}

The data science lifecycle is meant to be a set of general guidelines
rather than a hard list of requirements. In our journey exploring the
lifecycle, we'll cover both the underlying theory and technologies used
in data science, and we hope you'll build an appreciation for the field.

With that, let's begin by introducing one of the most important tools in
exploratory data analysis: \texttt{pandas}.

\bookmarksetup{startatroot}

\hypertarget{pandas-i}{%
\chapter{Pandas I}\label{pandas-i}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Build familiarity with basic \texttt{pandas} syntax.
\item
  Learn the methods of selecting and filtering data from a DataFrame.
\item
  Understand the differences between DataFrames and Series.
\end{itemize}

\end{tcolorbox}

Data scientists work with data stored in a variety of formats. The
primary focus of this class is on understanding tabular data -- one of
the most widely used formats in data science. This note introduces
DataFrames, which are among the most popular representations of tabular
data. We'll also introduce \texttt{pandas}, the standard Python package
for manipulating data in DataFrames.

\hypertarget{introduction-to-exploratory-data-analysis}{%
\section{Introduction to Exploratory Data
Analysis}\label{introduction-to-exploratory-data-analysis}}

Imagine you collected, or have been given, a box of data. What do you do
next?

The first step is to clean your data. \textbf{Data cleaning} often
corrects issues in the structure and formatting of data, including
missing values and unit conversions.

Data scientists have coined the term \textbf{exploratory data analysis
(EDA)} to describe the process of transforming raw data to insightful
observations. EDA is an \emph{open-ended} analysis of transforming,
visualizing, and summarizing patterns in data. In order to conduct EDA,
we first need to familiarize ourselves with \texttt{pandas} -- an
important programming tool.

\hypertarget{introduction-to-pandas}{%
\section{Introduction to Pandas}\label{introduction-to-pandas}}

\texttt{pandas} is a data analysis library to make data cleaning and
analysis fast and convenient in Python.

The \texttt{pandas} library adopts many coding idioms from
\texttt{NumPy}. The biggest difference is that \texttt{pandas} is
designed for working with tabular data, one of the most common data
formats (and the focus of Data 100).

Before writing any code, we must import \texttt{pandas} into our Python
environment.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# \textasciigrave{}pd\textasciigrave{} is the conventional alias for Pandas, as \textasciigrave{}np\textasciigrave{} is for NumPy}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\end{Highlighting}
\end{Shaded}

\hypertarget{series-dataframes-and-indices}{%
\section{Series, DataFrames, and
Indices}\label{series-dataframes-and-indices}}

There are three fundamental data structures in \texttt{pandas}:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \textbf{Series}: 1D labeled array data; best thought of as columnar
  data
\item
  \textbf{DataFrame}: 2D tabular data with rows and columns
\item
  \textbf{Index}: A sequence of row/column labels
\end{enumerate}

DataFrames, Series, and Indices can be represented visually in the
following diagram.

\includegraphics{pandas_1/images/df_series_index.png}

Notice how the \textbf{DataFrame} is a two-dimensional object -- it
contains both rows and columns. The \textbf{Series} above is a singular
column of this DataFrame, namely the \texttt{Candidate} column. Both
contain an \textbf{Index}, or a shared list of row labels (the integers
from 0 to 5, inclusive).

\hypertarget{series}{%
\subsection{Series}\label{series}}

A Series represents a column of a DataFrame; more generally, it can be
any 1-dimensional array-like object containing values of the same type
with associated data labels, called its index.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}

\NormalTok{s }\OperatorTok{=}\NormalTok{ pd.Series([}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{2}\NormalTok{])}
\BuiltInTok{print}\NormalTok{(s)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0    -1
1    10
2     2
dtype: int64
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{s.array }\CommentTok{\# Data contained within the Series}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<PandasArray>
[-1, 10, 2]
Length: 3, dtype: int64
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{s.index }\CommentTok{\# The Index of the Series}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
RangeIndex(start=0, stop=3, step=1)
\end{verbatim}

By default, row indices in \texttt{pandas} are a sequential list of
integers beginning from 0. Optionally, a list of desired indices can be
passed to the \texttt{index} argument.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{s }\OperatorTok{=}\NormalTok{ pd.Series([}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{2}\NormalTok{], index }\OperatorTok{=}\NormalTok{ [}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{])}
\BuiltInTok{print}\NormalTok{(s)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
a    -1
b    10
c     2
dtype: int64
\end{verbatim}

Indices can also be changed after initialization.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{s.index }\OperatorTok{=}\NormalTok{ [}\StringTok{"first"}\NormalTok{, }\StringTok{"second"}\NormalTok{, }\StringTok{"third"}\NormalTok{]}
\BuiltInTok{print}\NormalTok{(s)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
first     -1
second    10
third      2
dtype: int64
\end{verbatim}

\hypertarget{selection-in-series}{%
\subsubsection{Selection in Series}\label{selection-in-series}}

Similar to an array, we can select a single value or a set of values
from a Series. There are 3 primary methods of selecting data.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  A single index label
\item
  A list of index labels
\item
  A filtering condition
\end{enumerate}

Let's define the following Series \texttt{ser}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ser }\OperatorTok{=}\NormalTok{ pd.Series([}\DecValTok{4}\NormalTok{, }\OperatorTok{{-}}\DecValTok{2}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{6}\NormalTok{], index }\OperatorTok{=}\NormalTok{ [}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"d"}\NormalTok{])}
\BuiltInTok{print}\NormalTok{(ser)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
a    4
b   -2
c    0
d    6
dtype: int64
\end{verbatim}

\hypertarget{a-single-index-label}{%
\paragraph{A Single Index Label}\label{a-single-index-label}}

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(ser[}\StringTok{"a"}\NormalTok{]) }\CommentTok{\# Notice how the return value is a single array element}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
4
\end{verbatim}

\hypertarget{a-list-of-index-labels}{%
\paragraph{A List of Index Labels}\label{a-list-of-index-labels}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ser[[}\StringTok{"a"}\NormalTok{, }\StringTok{"c"}\NormalTok{]] }\CommentTok{\# Notice how the return value is another Series}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lr}
\toprule
{} &  0 \\
\midrule
a &  4 \\
c &  0 \\
\bottomrule
\end{tabular}

\hypertarget{a-filtering-condition}{%
\paragraph{A Filtering Condition}\label{a-filtering-condition}}

Perhaps the most interesting (and useful) method of selecting data from
a Series is with a filtering condition.

We first must apply a vectorized boolean operation to our Series that
encodes the filter condition.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ser }\OperatorTok{\textgreater{}} \DecValTok{0} \CommentTok{\# Filter condition: select all elements greater than 0}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &      0 \\
\midrule
a &   True \\
b &  False \\
c &  False \\
d &   True \\
\bottomrule
\end{tabular}

Upon ``indexing'' in our Series with this condition, \texttt{pandas}
selects only the rows with \texttt{True} values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ser[ser }\OperatorTok{\textgreater{}} \DecValTok{0}\NormalTok{] }
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lr}
\toprule
{} &  0 \\
\midrule
a &  4 \\
d &  6 \\
\bottomrule
\end{tabular}

\hypertarget{dataframes}{%
\subsection{DataFrames}\label{dataframes}}

In Data 8, you encountered the \texttt{Table} class of the
\texttt{datascience} library, which represented tabular data. In Data
100, we'll be using the \texttt{DataFrame} class of the \texttt{pandas}
library.

Here is an example of a DataFrame that contains election data.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}

\NormalTok{elections }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/elections.csv"}\NormalTok{)}
\NormalTok{elections}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &               Candidate &                  Party &  Popular vote & Result &          \% \\
\midrule
0   &  1824 &          Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1   &  1824 &       John Quincy Adams &  Democratic-Republican &        113142 &    win &  42.789878 \\
2   &  1828 &          Andrew Jackson &             Democratic &        642806 &    win &  56.203927 \\
3   &  1828 &       John Quincy Adams &    National Republican &        500897 &   loss &  43.796073 \\
4   &  1832 &          Andrew Jackson &             Democratic &        702735 &    win &  54.574789 \\
5   &  1832 &              Henry Clay &    National Republican &        484205 &   loss &  37.603628 \\
6   &  1832 &            William Wirt &           Anti-Masonic &        100715 &   loss &   7.821583 \\
7   &  1836 &       Hugh Lawson White &                   Whig &        146109 &   loss &  10.005985 \\
8   &  1836 &        Martin Van Buren &             Democratic &        763291 &    win &  52.272472 \\
9   &  1836 &  William Henry Harrison &                   Whig &        550816 &   loss &  37.721543 \\
10  &  1840 &        Martin Van Buren &             Democratic &       1128854 &   loss &  46.948787 \\
11  &  1840 &  William Henry Harrison &                   Whig &       1275583 &    win &  53.051213 \\
12  &  1844 &              Henry Clay &                   Whig &       1300004 &   loss &  49.250523 \\
13  &  1844 &              James Polk &             Democratic &       1339570 &    win &  50.749477 \\
14  &  1848 &              Lewis Cass &             Democratic &       1223460 &   loss &  42.552229 \\
15  &  1848 &        Martin Van Buren &              Free Soil &        291501 &   loss &  10.138474 \\
16  &  1848 &          Zachary Taylor &                   Whig &       1360235 &    win &  47.309296 \\
17  &  1852 &         Franklin Pierce &             Democratic &       1605943 &    win &  51.013168 \\
18  &  1852 &            John P. Hale &              Free Soil &        155210 &   loss &   4.930283 \\
19  &  1852 &          Winfield Scott &                   Whig &       1386942 &   loss &  44.056548 \\
20  &  1856 &          James Buchanan &             Democratic &       1835140 &    win &  45.306080 \\
21  &  1856 &         John C. Frémont &             Republican &       1342345 &   loss &  33.139919 \\
22  &  1856 &        Millard Fillmore &               American &        873053 &   loss &  21.554001 \\
23  &  1860 &         Abraham Lincoln &             Republican &       1855993 &    win &  39.699408 \\
24  &  1860 &               John Bell &   Constitutional Union &        590901 &   loss &  12.639283 \\
25  &  1860 &    John C. Breckinridge &    Southern Democratic &        848019 &   loss &  18.138998 \\
26  &  1860 &      Stephen A. Douglas &    Northern Democratic &       1380202 &   loss &  29.522311 \\
27  &  1864 &         Abraham Lincoln &         National Union &       2211317 &    win &  54.951512 \\
28  &  1864 &     George B. McClellan &             Democratic &       1812807 &   loss &  45.048488 \\
29  &  1868 &         Horatio Seymour &             Democratic &       2708744 &   loss &  47.334695 \\
30  &  1868 &           Ulysses Grant &             Republican &       3013790 &    win &  52.665305 \\
31  &  1872 &          Horace Greeley &     Liberal Republican &       2834761 &   loss &  44.071406 \\
32  &  1872 &           Ulysses Grant &             Republican &       3597439 &    win &  55.928594 \\
33  &  1876 &        Rutherford Hayes &             Republican &       4034142 &    win &  48.471624 \\
34  &  1876 &        Samuel J. Tilden &             Democratic &       4288546 &   loss &  51.528376 \\
35  &  1880 &         James B. Weaver &              Greenback &        308649 &   loss &   3.352344 \\
36  &  1880 &          James Garfield &             Republican &       4453337 &    win &  48.369234 \\
37  &  1880 &  Winfield Scott Hancock &             Democratic &       4444976 &   loss &  48.278422 \\
38  &  1884 &         Benjamin Butler &          Anti-Monopoly &        134294 &   loss &   1.335838 \\
39  &  1884 &        Grover Cleveland &             Democratic &       4914482 &    win &  48.884933 \\
40  &  1884 &         James G. Blaine &             Republican &       4856905 &   loss &  48.312208 \\
41  &  1884 &           John St. John &            Prohibition &        147482 &   loss &   1.467021 \\
42  &  1888 &          Alson Streeter &            Union Labor &        146602 &   loss &   1.288861 \\
43  &  1888 &       Benjamin Harrison &             Republican &       5443633 &    win &  47.858041 \\
44  &  1888 &         Clinton B. Fisk &            Prohibition &        249819 &   loss &   2.196299 \\
45  &  1888 &        Grover Cleveland &             Democratic &       5534488 &   loss &  48.656799 \\
46  &  1892 &       Benjamin Harrison &             Republican &       5176108 &   loss &  42.984101 \\
47  &  1892 &        Grover Cleveland &             Democratic &       5553898 &    win &  46.121393 \\
48  &  1892 &         James B. Weaver &               Populist &       1041028 &   loss &   8.645038 \\
49  &  1892 &            John Bidwell &            Prohibition &        270879 &   loss &   2.249468 \\
50  &  1896 &          John M. Palmer &    National Democratic &        134645 &   loss &   0.969566 \\
51  &  1896 &         Joshua Levering &            Prohibition &        131312 &   loss &   0.945565 \\
52  &  1896 &  William Jennings Bryan &             Democratic &       6509052 &   loss &  46.871053 \\
53  &  1896 &        William McKinley &             Republican &       7112138 &    win &  51.213817 \\
54  &  1900 &         John G. Woolley &            Prohibition &        210864 &   loss &   1.526821 \\
55  &  1900 &  William Jennings Bryan &             Democratic &       6370932 &   loss &  46.130540 \\
56  &  1900 &        William McKinley &             Republican &       7228864 &    win &  52.342640 \\
57  &  1904 &         Alton B. Parker &             Democratic &       5083880 &   loss &  37.685116 \\
58  &  1904 &          Eugene V. Debs &              Socialist &        402810 &   loss &   2.985897 \\
59  &  1904 &        Silas C. Swallow &            Prohibition &        259102 &   loss &   1.920637 \\
60  &  1904 &      Theodore Roosevelt &             Republican &       7630557 &    win &  56.562787 \\
61  &  1904 &        Thomas E. Watson &               Populist &        114070 &   loss &   0.845563 \\
62  &  1908 &          Eugene V. Debs &              Socialist &        420852 &   loss &   2.850866 \\
63  &  1908 &        Eugene W. Chafin &            Prohibition &        254087 &   loss &   1.721194 \\
64  &  1908 &  William Jennings Bryan &             Democratic &       6408979 &   loss &  43.414640 \\
65  &  1908 &            William Taft &             Republican &       7678335 &    win &  52.013300 \\
66  &  1912 &          Eugene V. Debs &              Socialist &        901551 &   loss &   6.004354 \\
67  &  1912 &        Eugene W. Chafin &            Prohibition &        208156 &   loss &   1.386325 \\
68  &  1912 &      Theodore Roosevelt &            Progressive &       4122721 &   loss &  27.457433 \\
69  &  1912 &            William Taft &             Republican &       3486242 &   loss &  23.218466 \\
70  &  1912 &          Woodrow Wilson &             Democratic &       6296284 &    win &  41.933422 \\
71  &  1916 &         Allan L. Benson &              Socialist &        590524 &   loss &   3.194193 \\
72  &  1916 &    Charles Evans Hughes &             Republican &       8548728 &   loss &  46.240779 \\
73  &  1916 &             Frank Hanly &            Prohibition &        221302 &   loss &   1.197041 \\
74  &  1916 &          Woodrow Wilson &             Democratic &       9126868 &    win &  49.367987 \\
75  &  1920 &        Aaron S. Watkins &            Prohibition &        188787 &   loss &   0.708351 \\
76  &  1920 &          Eugene V. Debs &              Socialist &        913693 &   loss &   3.428282 \\
77  &  1920 &            James M. Cox &             Democratic &       9139661 &   loss &  34.293063 \\
78  &  1920 &   Parley P. Christensen &           Farmer–Labor &        265398 &   loss &   0.995804 \\
79  &  1920 &          Warren Harding &             Republican &      16144093 &    win &  60.574501 \\
80  &  1924 &         Calvin Coolidge &             Republican &      15723789 &    win &  54.329113 \\
81  &  1924 &           John W. Davis &             Democratic &       8386242 &   loss &  28.976291 \\
82  &  1924 &      Robert La Follette &            Progressive &       4831706 &   loss &  16.694596 \\
83  &  1928 &                Al Smith &             Democratic &      15015464 &   loss &  40.902853 \\
84  &  1928 &          Herbert Hoover &             Republican &      21427123 &    win &  58.368524 \\
85  &  1928 &           Norman Thomas &              Socialist &        267478 &   loss &   0.728623 \\
86  &  1932 &      Franklin Roosevelt &             Democratic &      22821277 &    win &  57.672125 \\
87  &  1932 &          Herbert Hoover &             Republican &      15761254 &   loss &  39.830594 \\
88  &  1932 &           Norman Thomas &              Socialist &        884885 &   loss &   2.236211 \\
89  &  1932 &       William Z. Foster &              Communist &        103307 &   loss &   0.261069 \\
90  &  1936 &              Alf Landon &             Republican &      16679543 &   loss &  36.648285 \\
91  &  1936 &      Franklin Roosevelt &             Democratic &      27752648 &    win &  60.978107 \\
92  &  1936 &           Norman Thomas &              Socialist &        187910 &   loss &   0.412876 \\
93  &  1936 &           William Lemke &                  Union &        892378 &   loss &   1.960733 \\
94  &  1940 &      Franklin Roosevelt &             Democratic &      27313945 &    win &  54.871202 \\
95  &  1940 &           Norman Thomas &              Socialist &        116599 &   loss &   0.234237 \\
96  &  1940 &         Wendell Willkie &             Republican &      22347744 &   loss &  44.894561 \\
97  &  1944 &      Franklin Roosevelt &             Democratic &      25612916 &    win &  53.773801 \\
98  &  1944 &         Thomas E. Dewey &             Republican &      22017929 &   loss &  46.226199 \\
99  &  1948 &        Claude A. Watson &            Prohibition &        103708 &   loss &   0.212747 \\
100 &  1948 &            Harry Truman &             Democratic &      24179347 &    win &  49.601536 \\
101 &  1948 &        Henry A. Wallace &            Progressive &       1157328 &   loss &   2.374144 \\
102 &  1948 &           Norman Thomas &              Socialist &        139569 &   loss &   0.286312 \\
103 &  1948 &          Strom Thurmond &              Dixiecrat &       1175930 &   loss &   2.412304 \\
104 &  1948 &         Thomas E. Dewey &             Republican &      21991292 &   loss &  45.112958 \\
105 &  1952 &         Adlai Stevenson &             Democratic &      27375090 &   loss &  44.446312 \\
106 &  1952 &       Dwight Eisenhower &             Republican &      34075529 &    win &  55.325173 \\
107 &  1952 &        Vincent Hallinan &            Progressive &        140746 &   loss &   0.228516 \\
108 &  1956 &         Adlai Stevenson &             Democratic &      26028028 &   loss &  42.174464 \\
109 &  1956 &       Dwight Eisenhower &             Republican &      35579180 &    win &  57.650654 \\
110 &  1956 &      T. Coleman Andrews &         States' Rights &        107929 &   loss &   0.174883 \\
111 &  1960 &            John Kennedy &             Democratic &      34220984 &    win &  50.082561 \\
112 &  1960 &           Richard Nixon &             Republican &      34108157 &   loss &  49.917439 \\
113 &  1964 &         Barry Goldwater &             Republican &      27175754 &   loss &  38.655297 \\
114 &  1964 &          Lyndon Johnson &             Democratic &      43127041 &    win &  61.344703 \\
115 &  1968 &          George Wallace &   American Independent &       9901118 &   loss &  13.571218 \\
116 &  1968 &         Hubert Humphrey &             Democratic &      31271839 &   loss &  42.863537 \\
117 &  1968 &           Richard Nixon &             Republican &      31783783 &    win &  43.565246 \\
118 &  1972 &         George McGovern &             Democratic &      29173222 &   loss &  37.670670 \\
119 &  1972 &         John G. Schmitz &   American Independent &       1100868 &   loss &   1.421524 \\
120 &  1972 &           Richard Nixon &             Republican &      47168710 &    win &  60.907806 \\
121 &  1976 &         Eugene McCarthy &            Independent &        740460 &   loss &   0.911649 \\
122 &  1976 &             Gerald Ford &             Republican &      39148634 &   loss &  48.199499 \\
123 &  1976 &            Jimmy Carter &             Democratic &      40831881 &    win &  50.271900 \\
124 &  1976 &           Lester Maddox &   American Independent &        170274 &   loss &   0.209640 \\
125 &  1976 &          Roger MacBride &            Libertarian &        172557 &   loss &   0.212451 \\
126 &  1976 &      Thomas J. Anderson &               American &        158271 &   loss &   0.194862 \\
127 &  1980 &          Barry Commoner &               Citizens &        233052 &   loss &   0.270182 \\
128 &  1980 &                Ed Clark &            Libertarian &        921128 &   loss &   1.067883 \\
129 &  1980 &            Jimmy Carter &             Democratic &      35480115 &   loss &  41.132848 \\
130 &  1980 &        John B. Anderson &            Independent &       5719850 &   loss &   6.631143 \\
131 &  1980 &           Ronald Reagan &             Republican &      43903230 &    win &  50.897944 \\
132 &  1984 &          David Bergland &            Libertarian &        228111 &   loss &   0.247245 \\
133 &  1984 &           Ronald Reagan &             Republican &      54455472 &    win &  59.023326 \\
134 &  1984 &          Walter Mondale &             Democratic &      37577352 &   loss &  40.729429 \\
135 &  1988 &       George H. W. Bush &             Republican &      48886597 &    win &  53.518845 \\
136 &  1988 &           Lenora Fulani &           New Alliance &        217221 &   loss &   0.237804 \\
137 &  1988 &         Michael Dukakis &             Democratic &      41809074 &   loss &  45.770691 \\
138 &  1988 &                Ron Paul &            Libertarian &        431750 &   loss &   0.472660 \\
139 &  1992 &            Andre Marrou &            Libertarian &        290087 &   loss &   0.278516 \\
140 &  1992 &            Bill Clinton &             Democratic &      44909806 &    win &  43.118485 \\
141 &  1992 &                Bo Gritz &               Populist &        106152 &   loss &   0.101918 \\
142 &  1992 &       George H. W. Bush &             Republican &      39104550 &   loss &  37.544784 \\
143 &  1992 &              Ross Perot &            Independent &      19743821 &   loss &  18.956298 \\
144 &  1996 &            Bill Clinton &             Democratic &      47400125 &    win &  49.296938 \\
145 &  1996 &                Bob Dole &             Republican &      39197469 &   loss &  40.766036 \\
146 &  1996 &            Harry Browne &            Libertarian &        485759 &   loss &   0.505198 \\
147 &  1996 &         Howard Phillips &              Taxpayers &        184656 &   loss &   0.192045 \\
148 &  1996 &            John Hagelin &            Natural Law &        113670 &   loss &   0.118219 \\
149 &  1996 &             Ralph Nader &                  Green &        685297 &   loss &   0.712721 \\
150 &  1996 &              Ross Perot &                 Reform &       8085294 &   loss &   8.408844 \\
151 &  2000 &                 Al Gore &             Democratic &      50999897 &   loss &  48.491813 \\
152 &  2000 &          George W. Bush &             Republican &      50456002 &    win &  47.974666 \\
153 &  2000 &            Harry Browne &            Libertarian &        384431 &   loss &   0.365525 \\
154 &  2000 &            Pat Buchanan &                 Reform &        448895 &   loss &   0.426819 \\
155 &  2000 &             Ralph Nader &                  Green &       2882955 &   loss &   2.741176 \\
156 &  2004 &              David Cobb &                  Green &        119859 &   loss &   0.098088 \\
157 &  2004 &          George W. Bush &             Republican &      62040610 &    win &  50.771824 \\
158 &  2004 &              John Kerry &             Democratic &      59028444 &   loss &  48.306775 \\
159 &  2004 &        Michael Badnarik &            Libertarian &        397265 &   loss &   0.325108 \\
160 &  2004 &        Michael Peroutka &           Constitution &        143630 &   loss &   0.117542 \\
161 &  2004 &             Ralph Nader &            Independent &        465151 &   loss &   0.380663 \\
162 &  2008 &            Barack Obama &             Democratic &      69498516 &    win &  53.023510 \\
163 &  2008 &                Bob Barr &            Libertarian &        523715 &   loss &   0.399565 \\
164 &  2008 &           Chuck Baldwin &           Constitution &        199750 &   loss &   0.152398 \\
165 &  2008 &        Cynthia McKinney &                  Green &        161797 &   loss &   0.123442 \\
166 &  2008 &             John McCain &             Republican &      59948323 &   loss &  45.737243 \\
167 &  2008 &             Ralph Nader &            Independent &        739034 &   loss &   0.563842 \\
168 &  2012 &            Barack Obama &             Democratic &      65915795 &    win &  51.258484 \\
169 &  2012 &            Gary Johnson &            Libertarian &       1275971 &   loss &   0.992241 \\
170 &  2012 &              Jill Stein &                  Green &        469627 &   loss &   0.365199 \\
171 &  2012 &             Mitt Romney &             Republican &      60933504 &   loss &  47.384076 \\
172 &  2016 &          Darrell Castle &           Constitution &        203091 &   loss &   0.149640 \\
173 &  2016 &            Donald Trump &             Republican &      62984828 &    win &  46.407862 \\
174 &  2016 &           Evan McMullin &            Independent &        732273 &   loss &   0.539546 \\
175 &  2016 &            Gary Johnson &            Libertarian &       4489235 &   loss &   3.307714 \\
176 &  2016 &         Hillary Clinton &             Democratic &      65853514 &   loss &  48.521539 \\
177 &  2016 &              Jill Stein &                  Green &       1457226 &   loss &   1.073699 \\
178 &  2020 &            Joseph Biden &             Democratic &      81268924 &    win &  51.311515 \\
179 &  2020 &            Donald Trump &             Republican &      74216154 &   loss &  46.858542 \\
180 &  2020 &            Jo Jorgensen &            Libertarian &       1865724 &   loss &   1.177979 \\
181 &  2020 &          Howard Hawkins &                  Green &        405035 &   loss &   0.255731 \\
\bottomrule
\end{tabular}

Let's dissect the code above.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  We first import the \texttt{pandas} library into our Python
  environment, using the alias \texttt{pd}.
   \texttt{import\ pandas\ as\ pd}
\item
  There are a number of ways to read data into a DataFrame. In Data 100,
  our data are typically stored in a CSV (comma-separated values) file
  format. We can import a CSV file into a DataFrame by passing the data
  path as an argument to the following \texttt{pandas} function.
   \texttt{pd.read\_csv("elections.csv")}
\end{enumerate}

This code stores our DataFrame object in the \texttt{elections}
variable. Upon inspection, our \texttt{elections} DataFrame has 182 rows
and 6 columns (\texttt{Year}, \texttt{Candidate}, \texttt{Party},
\texttt{Popular\ vote}, \texttt{Result}, \texttt{\%}). Each row
represents a single record -- in our example, a presidential candidate
from some particular year. Each column represents a single attribute, or
feature of the record.

In the example above, we constructed a DataFrame object using data from
a CSV file. As we'll explore in the next section, we can create a
DataFrame with data of our own.

\hypertarget{creating-a-dataframe}{%
\subsubsection{Creating a DataFrame}\label{creating-a-dataframe}}

There are many ways to create a DataFrame. Here, we will cover the most
popular approaches.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Using a list and column names
\item
  From a dictionary
\item
  From a Series
\end{enumerate}

\hypertarget{using-a-list-and-column-names}{%
\paragraph{Using a List and Column
Names}\label{using-a-list-and-column-names}}

Consider the following examples.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_list }\OperatorTok{=}\NormalTok{ pd.DataFrame([}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{], columns}\OperatorTok{=}\NormalTok{[}\StringTok{"Numbers"}\NormalTok{])}
\NormalTok{df\_list}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lr}
\toprule
{} &  Numbers \\
\midrule
0 &        1 \\
1 &        2 \\
2 &        3 \\
\bottomrule
\end{tabular}

The first code cell creates a DataFrame with a single column
\texttt{Numbers}, while the second creates a DataFrame with an
additional column \texttt{Description}. Notice how a 2D list of values
is required to initialize the second DataFrame -- each nested list
represents a single row of data.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_list }\OperatorTok{=}\NormalTok{ pd.DataFrame([[}\DecValTok{1}\NormalTok{, }\StringTok{"one"}\NormalTok{], [}\DecValTok{2}\NormalTok{, }\StringTok{"two"}\NormalTok{]], columns }\OperatorTok{=}\NormalTok{ [}\StringTok{"Number"}\NormalTok{, }\StringTok{"Description"}\NormalTok{])}
\NormalTok{df\_list}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrl}
\toprule
{} &  Number & Description \\
\midrule
0 &       1 &         one \\
1 &       2 &         two \\
\bottomrule
\end{tabular}

\hypertarget{from-a-dictionary}{%
\paragraph{From a Dictionary}\label{from-a-dictionary}}

A second (and more common) way to create a DataFrame is with a
dictionary. The dictionary keys represent the column names, and the
dictionary values represent the column values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_dict }\OperatorTok{=}\NormalTok{ pd.DataFrame(\{}\StringTok{"Fruit"}\NormalTok{: [}\StringTok{"Strawberry"}\NormalTok{, }\StringTok{"Orange"}\NormalTok{], }\StringTok{"Price"}\NormalTok{: [}\FloatTok{5.49}\NormalTok{, }\FloatTok{3.99}\NormalTok{]\})}
\NormalTok{df\_dict}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llr}
\toprule
{} &       Fruit &  Price \\
\midrule
0 &  Strawberry &   5.49 \\
1 &      Orange &   3.99 \\
\bottomrule
\end{tabular}

\hypertarget{from-a-series}{%
\paragraph{From a Series}\label{from-a-series}}

Earlier, we explained how a Series was synonymous with a column in a
DataFrame. It follows then, that a DataFrame is equivalent to a
collection of Series, which all share the same index.

In fact, we can initialize a DataFrame by merging two or more Series.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Notice how our indices, or row labels, are the same}

\NormalTok{s\_a }\OperatorTok{=}\NormalTok{ pd.Series([}\StringTok{"a1"}\NormalTok{, }\StringTok{"a2"}\NormalTok{, }\StringTok{"a3"}\NormalTok{], index }\OperatorTok{=}\NormalTok{ [}\StringTok{"r1"}\NormalTok{, }\StringTok{"r2"}\NormalTok{, }\StringTok{"r3"}\NormalTok{])}
\NormalTok{s\_b }\OperatorTok{=}\NormalTok{ pd.Series([}\StringTok{"b1"}\NormalTok{, }\StringTok{"b2"}\NormalTok{, }\StringTok{"b3"}\NormalTok{], index }\OperatorTok{=}\NormalTok{ [}\StringTok{"r1"}\NormalTok{, }\StringTok{"r2"}\NormalTok{, }\StringTok{"r3"}\NormalTok{])}

\NormalTok{pd.DataFrame(\{}\StringTok{"A{-}column"}\NormalTok{: s\_a, }\StringTok{"B{-}column"}\NormalTok{: s\_b\})}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lll}
\toprule
{} & A-column & B-column \\
\midrule
r1 &       a1 &       b1 \\
r2 &       a2 &       b2 \\
r3 &       a3 &       b3 \\
\bottomrule
\end{tabular}

\hypertarget{indices}{%
\subsection{Indices}\label{indices}}

The major takeaway: we can think of a \textbf{DataFrame} as a collection
of \textbf{Series} that all share the same \textbf{Index}.

On a more technical note, an Index doesn't have to be an integer, nor
does it have to be unique. For example, we can set the index of the
\texttt{elections} DataFrame to be the name of presidential candidates.
Selecting a new Series from this modified DataFrame yields the
following.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This sets the index to the "Candidate" column}
\NormalTok{elections.set\_index(}\StringTok{"Candidate"}\NormalTok{, inplace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{pandas_1/images/index_comparison_2.png}

To retrieve the indices of a DataFrame, simply use the \texttt{.index}
attribute of the DataFrame class.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.index}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Index(['Andrew Jackson', 'John Quincy Adams', 'Andrew Jackson',
       'John Quincy Adams', 'Andrew Jackson', 'Henry Clay', 'William Wirt',
       'Hugh Lawson White', 'Martin Van Buren', 'William Henry Harrison',
       ...
       'Darrell Castle', 'Donald Trump', 'Evan McMullin', 'Gary Johnson',
       'Hillary Clinton', 'Jill Stein', 'Joseph Biden', 'Donald Trump',
       'Jo Jorgensen', 'Howard Hawkins'],
      dtype='object', name='Candidate', length=182)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This resets the index to be the default list of integers}
\NormalTok{elections.reset\_index(inplace}\OperatorTok{=}\VariableTok{True}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\hypertarget{slicing-in-dataframes}{%
\section{Slicing in DataFrames}\label{slicing-in-dataframes}}

Now that we've learned how to create DataFrames, let's dive deeper into
their capabilities.

The API (application programming interface) for the DataFrame class is
enormous. In this section, we'll discuss several methods of the
DataFrame API that allow us to extract subsets of data.

The simplest way to manipulate a DataFrame is to extract a subset of
rows and columns, known as \textbf{slicing}. We will do so with three
primary methods of the DataFrame class:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{.loc}
\item
  \texttt{.iloc}
\item
  \texttt{{[}{]}}
\end{enumerate}

\hypertarget{indexing-with-.loc}{%
\subsection{Indexing with .loc}\label{indexing-with-.loc}}

The \texttt{.loc} operator selects rows and columns in a DataFrame by
their row and column label(s), respectively. The \textbf{row labels}
(commonly referred to as the \textbf{indices}) are the bold text on the
far \emph{left} of a DataFrame, while the \textbf{column labels} are the
column names found at the \emph{top} of a DataFrame.

To grab data with \texttt{.loc}, we must specify the row and column
label(s) where the data exists. The row labels are the first argument to
the \texttt{.loc} function; the column labels are the second. For
example, we can select the row labeled \texttt{0} and the column
labeled \texttt{Candidate} from the \texttt{elections} DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.loc[}\DecValTok{0}\NormalTok{, }\StringTok{\textquotesingle{}Candidate\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'Andrew Jackson'
\end{verbatim}

To select \emph{multiple} rows and columns, we can use Python slice
notation. Here, we select the first four rows and every column from
\texttt{Year} through \texttt{Popular\ vote}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.loc[}\DecValTok{0}\NormalTok{:}\DecValTok{3}\NormalTok{, }\StringTok{\textquotesingle{}Year\textquotesingle{}}\NormalTok{:}\StringTok{\textquotesingle{}Popular vote\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrlr}
\toprule
{} &  Year &                  Party &  Popular vote \\
\midrule
0 &  1824 &  Democratic-Republican &        151271 \\
1 &  1824 &  Democratic-Republican &        113142 \\
2 &  1828 &             Democratic &        642806 \\
3 &  1828 &    National Republican &        500897 \\
\bottomrule
\end{tabular}

Suppose that instead, we wanted \emph{every} column value for the first
four rows in the \texttt{elections} DataFrame. The shorthand \texttt{:}
is useful for this.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.loc[}\DecValTok{0}\NormalTok{:}\DecValTok{3}\NormalTok{, :]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrlrlr}
\toprule
{} &          Candidate &  Year &                  Party &  Popular vote & Result &          \% \\
\midrule
0 &     Andrew Jackson &  1824 &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1 &  John Quincy Adams &  1824 &  Democratic-Republican &        113142 &    win &  42.789878 \\
2 &     Andrew Jackson &  1828 &             Democratic &        642806 &    win &  56.203927 \\
3 &  John Quincy Adams &  1828 &    National Republican &        500897 &   loss &  43.796073 \\
\bottomrule
\end{tabular}

There are a couple of things we should note. Unlike conventional Python,
Pandas allows us to slice string values (in our example, the column
labels). Secondly, slicing with \texttt{.loc} is \emph{inclusive}.
Notice how our resulting DataFrame includes every row and column between
and including the slice labels we specified.

Equivalently, we can use a list to obtain multiple rows and columns in
our \texttt{elections} DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.loc[[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{], [}\StringTok{\textquotesingle{}Year\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Candidate\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Party\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Popular vote\textquotesingle{}}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllr}
\toprule
{} &  Year &          Candidate &                  Party &  Popular vote \\
\midrule
0 &  1824 &     Andrew Jackson &  Democratic-Republican &        151271 \\
1 &  1824 &  John Quincy Adams &  Democratic-Republican &        113142 \\
2 &  1828 &     Andrew Jackson &             Democratic &        642806 \\
3 &  1828 &  John Quincy Adams &    National Republican &        500897 \\
\bottomrule
\end{tabular}

Lastly, we can interchange list and slicing notation.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.loc[[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{], :]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrlrlr}
\toprule
{} &          Candidate &  Year &                  Party &  Popular vote & Result &          \% \\
\midrule
0 &     Andrew Jackson &  1824 &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1 &  John Quincy Adams &  1824 &  Democratic-Republican &        113142 &    win &  42.789878 \\
2 &     Andrew Jackson &  1828 &             Democratic &        642806 &    win &  56.203927 \\
3 &  John Quincy Adams &  1828 &    National Republican &        500897 &   loss &  43.796073 \\
\bottomrule
\end{tabular}

\hypertarget{indexing-with-.iloc}{%
\subsection{Indexing with .iloc}\label{indexing-with-.iloc}}

Slicing with \texttt{.iloc} works similarly to \texttt{.loc}, although
\texttt{.iloc} uses the integer positions of rows and columns rather than
the labels. The arguments to the \texttt{.iloc} function also behave
similarly -- single values, lists, indices, and any combination of these
are permitted.

Let's begin reproducing our results from above. We'll begin by selecting
for the first presidential candidate in our \texttt{elections}
DataFrame:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# elections.loc[0, "Candidate"] {-} Previous approach}
\NormalTok{elections.iloc[}\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'Andrew Jackson'
\end{verbatim}

Notice how the first argument to both \texttt{.loc} and \texttt{.iloc}
is the same. This is because the row with a label of 0 is conveniently
in the 0\textsuperscript{th} (or first) position of the
\texttt{elections} DataFrame. Generally, this is true of any DataFrame
where the row labels are incremented in ascending order from 0.

However, when we select the first four rows and columns using
\texttt{.iloc}, we notice something.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# elections.loc[0:3, \textquotesingle{}Year\textquotesingle{}:\textquotesingle{}Popular vote\textquotesingle{}] {-} Previous approach}
\NormalTok{elections.iloc[}\DecValTok{0}\NormalTok{:}\DecValTok{4}\NormalTok{, }\DecValTok{0}\NormalTok{:}\DecValTok{4}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrlr}
\toprule
{} &          Candidate &  Year &                  Party &  Popular vote \\
\midrule
0 &     Andrew Jackson &  1824 &  Democratic-Republican &        151271 \\
1 &  John Quincy Adams &  1824 &  Democratic-Republican &        113142 \\
2 &     Andrew Jackson &  1828 &             Democratic &        642806 \\
3 &  John Quincy Adams &  1828 &    National Republican &        500897 \\
\bottomrule
\end{tabular}

Slicing is no longer inclusive in \texttt{.iloc} -- it's
\emph{exclusive}. This is one of Pandas' syntactical subtleties; you'll
get used to it with practice.

List behavior works just as expected.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#elections.loc[[0, 1, 2, 3], [\textquotesingle{}Year\textquotesingle{}, \textquotesingle{}Candidate\textquotesingle{}, \textquotesingle{}Party\textquotesingle{}, \textquotesingle{}Popular vote\textquotesingle{}]] {-} Previous Approach}
\NormalTok{elections.iloc[[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{], [}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrlr}
\toprule
{} &          Candidate &  Year &                  Party &  Popular vote \\
\midrule
0 &     Andrew Jackson &  1824 &  Democratic-Republican &        151271 \\
1 &  John Quincy Adams &  1824 &  Democratic-Republican &        113142 \\
2 &     Andrew Jackson &  1828 &             Democratic &        642806 \\
3 &  John Quincy Adams &  1828 &    National Republican &        500897 \\
\bottomrule
\end{tabular}

This discussion begs the question: when should we use \texttt{.loc} vs
\texttt{.iloc}? In most cases, \texttt{.loc} is generally safer to use.
You can imagine \texttt{.iloc} may return incorrect values when applied
to a dataset where the ordering of data can change.

\hypertarget{indexing-with}{%
\subsection{Indexing with {[}{]}}\label{indexing-with}}

The \texttt{{[}{]}} selection operator is the most baffling of all, yet
the most commonly used. It only takes a single argument, which may be one of
the following:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  A slice of row numbers
\item
  A list of column labels
\item
  A single column label
\end{enumerate}

That is, \texttt{{[}{]}} is \emph{context dependent}. Let's see some
examples.

\hypertarget{a-slice-of-row-numbers}{%
\subsubsection{A slice of row numbers}\label{a-slice-of-row-numbers}}

Say we wanted the first four rows of our \texttt{elections} DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections[}\DecValTok{0}\NormalTok{:}\DecValTok{4}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrlrlr}
\toprule
{} &          Candidate &  Year &                  Party &  Popular vote & Result &          \% \\
\midrule
0 &     Andrew Jackson &  1824 &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1 &  John Quincy Adams &  1824 &  Democratic-Republican &        113142 &    win &  42.789878 \\
2 &     Andrew Jackson &  1828 &             Democratic &        642806 &    win &  56.203927 \\
3 &  John Quincy Adams &  1828 &    National Republican &        500897 &   loss &  43.796073 \\
\bottomrule
\end{tabular}

\hypertarget{a-list-of-column-labels}{%
\subsubsection{A list of column labels}\label{a-list-of-column-labels}}

Suppose we now want the first four columns.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections[[}\StringTok{"Year"}\NormalTok{, }\StringTok{"Candidate"}\NormalTok{, }\StringTok{"Party"}\NormalTok{, }\StringTok{"Popular vote"}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllr}
\toprule
{} &  Year &               Candidate &                  Party &  Popular vote \\
\midrule
0   &  1824 &          Andrew Jackson &  Democratic-Republican &        151271 \\
1   &  1824 &       John Quincy Adams &  Democratic-Republican &        113142 \\
2   &  1828 &          Andrew Jackson &             Democratic &        642806 \\
3   &  1828 &       John Quincy Adams &    National Republican &        500897 \\
4   &  1832 &          Andrew Jackson &             Democratic &        702735 \\
5   &  1832 &              Henry Clay &    National Republican &        484205 \\
6   &  1832 &            William Wirt &           Anti-Masonic &        100715 \\
7   &  1836 &       Hugh Lawson White &                   Whig &        146109 \\
8   &  1836 &        Martin Van Buren &             Democratic &        763291 \\
9   &  1836 &  William Henry Harrison &                   Whig &        550816 \\
10  &  1840 &        Martin Van Buren &             Democratic &       1128854 \\
11  &  1840 &  William Henry Harrison &                   Whig &       1275583 \\
12  &  1844 &              Henry Clay &                   Whig &       1300004 \\
13  &  1844 &              James Polk &             Democratic &       1339570 \\
14  &  1848 &              Lewis Cass &             Democratic &       1223460 \\
15  &  1848 &        Martin Van Buren &              Free Soil &        291501 \\
16  &  1848 &          Zachary Taylor &                   Whig &       1360235 \\
17  &  1852 &         Franklin Pierce &             Democratic &       1605943 \\
18  &  1852 &            John P. Hale &              Free Soil &        155210 \\
19  &  1852 &          Winfield Scott &                   Whig &       1386942 \\
20  &  1856 &          James Buchanan &             Democratic &       1835140 \\
21  &  1856 &         John C. Frémont &             Republican &       1342345 \\
22  &  1856 &        Millard Fillmore &               American &        873053 \\
23  &  1860 &         Abraham Lincoln &             Republican &       1855993 \\
24  &  1860 &               John Bell &   Constitutional Union &        590901 \\
25  &  1860 &    John C. Breckinridge &    Southern Democratic &        848019 \\
26  &  1860 &      Stephen A. Douglas &    Northern Democratic &       1380202 \\
27  &  1864 &         Abraham Lincoln &         National Union &       2211317 \\
28  &  1864 &     George B. McClellan &             Democratic &       1812807 \\
29  &  1868 &         Horatio Seymour &             Democratic &       2708744 \\
30  &  1868 &           Ulysses Grant &             Republican &       3013790 \\
31  &  1872 &          Horace Greeley &     Liberal Republican &       2834761 \\
32  &  1872 &           Ulysses Grant &             Republican &       3597439 \\
33  &  1876 &        Rutherford Hayes &             Republican &       4034142 \\
34  &  1876 &        Samuel J. Tilden &             Democratic &       4288546 \\
35  &  1880 &         James B. Weaver &              Greenback &        308649 \\
36  &  1880 &          James Garfield &             Republican &       4453337 \\
37  &  1880 &  Winfield Scott Hancock &             Democratic &       4444976 \\
38  &  1884 &         Benjamin Butler &          Anti-Monopoly &        134294 \\
39  &  1884 &        Grover Cleveland &             Democratic &       4914482 \\
40  &  1884 &         James G. Blaine &             Republican &       4856905 \\
41  &  1884 &           John St. John &            Prohibition &        147482 \\
42  &  1888 &          Alson Streeter &            Union Labor &        146602 \\
43  &  1888 &       Benjamin Harrison &             Republican &       5443633 \\
44  &  1888 &         Clinton B. Fisk &            Prohibition &        249819 \\
45  &  1888 &        Grover Cleveland &             Democratic &       5534488 \\
46  &  1892 &       Benjamin Harrison &             Republican &       5176108 \\
47  &  1892 &        Grover Cleveland &             Democratic &       5553898 \\
48  &  1892 &         James B. Weaver &               Populist &       1041028 \\
49  &  1892 &            John Bidwell &            Prohibition &        270879 \\
50  &  1896 &          John M. Palmer &    National Democratic &        134645 \\
51  &  1896 &         Joshua Levering &            Prohibition &        131312 \\
52  &  1896 &  William Jennings Bryan &             Democratic &       6509052 \\
53  &  1896 &        William McKinley &             Republican &       7112138 \\
54  &  1900 &         John G. Woolley &            Prohibition &        210864 \\
55  &  1900 &  William Jennings Bryan &             Democratic &       6370932 \\
56  &  1900 &        William McKinley &             Republican &       7228864 \\
57  &  1904 &         Alton B. Parker &             Democratic &       5083880 \\
58  &  1904 &          Eugene V. Debs &              Socialist &        402810 \\
59  &  1904 &        Silas C. Swallow &            Prohibition &        259102 \\
60  &  1904 &      Theodore Roosevelt &             Republican &       7630557 \\
61  &  1904 &        Thomas E. Watson &               Populist &        114070 \\
62  &  1908 &          Eugene V. Debs &              Socialist &        420852 \\
63  &  1908 &        Eugene W. Chafin &            Prohibition &        254087 \\
64  &  1908 &  William Jennings Bryan &             Democratic &       6408979 \\
65  &  1908 &            William Taft &             Republican &       7678335 \\
66  &  1912 &          Eugene V. Debs &              Socialist &        901551 \\
67  &  1912 &        Eugene W. Chafin &            Prohibition &        208156 \\
68  &  1912 &      Theodore Roosevelt &            Progressive &       4122721 \\
69  &  1912 &            William Taft &             Republican &       3486242 \\
70  &  1912 &          Woodrow Wilson &             Democratic &       6296284 \\
71  &  1916 &         Allan L. Benson &              Socialist &        590524 \\
72  &  1916 &    Charles Evans Hughes &             Republican &       8548728 \\
73  &  1916 &             Frank Hanly &            Prohibition &        221302 \\
74  &  1916 &          Woodrow Wilson &             Democratic &       9126868 \\
75  &  1920 &        Aaron S. Watkins &            Prohibition &        188787 \\
76  &  1920 &          Eugene V. Debs &              Socialist &        913693 \\
77  &  1920 &            James M. Cox &             Democratic &       9139661 \\
78  &  1920 &   Parley P. Christensen &           Farmer–Labor &        265398 \\
79  &  1920 &          Warren Harding &             Republican &      16144093 \\
80  &  1924 &         Calvin Coolidge &             Republican &      15723789 \\
81  &  1924 &           John W. Davis &             Democratic &       8386242 \\
82  &  1924 &      Robert La Follette &            Progressive &       4831706 \\
83  &  1928 &                Al Smith &             Democratic &      15015464 \\
84  &  1928 &          Herbert Hoover &             Republican &      21427123 \\
85  &  1928 &           Norman Thomas &              Socialist &        267478 \\
86  &  1932 &      Franklin Roosevelt &             Democratic &      22821277 \\
87  &  1932 &          Herbert Hoover &             Republican &      15761254 \\
88  &  1932 &           Norman Thomas &              Socialist &        884885 \\
89  &  1932 &       William Z. Foster &              Communist &        103307 \\
90  &  1936 &              Alf Landon &             Republican &      16679543 \\
91  &  1936 &      Franklin Roosevelt &             Democratic &      27752648 \\
92  &  1936 &           Norman Thomas &              Socialist &        187910 \\
93  &  1936 &           William Lemke &                  Union &        892378 \\
94  &  1940 &      Franklin Roosevelt &             Democratic &      27313945 \\
95  &  1940 &           Norman Thomas &              Socialist &        116599 \\
96  &  1940 &         Wendell Willkie &             Republican &      22347744 \\
97  &  1944 &      Franklin Roosevelt &             Democratic &      25612916 \\
98  &  1944 &         Thomas E. Dewey &             Republican &      22017929 \\
99  &  1948 &        Claude A. Watson &            Prohibition &        103708 \\
100 &  1948 &            Harry Truman &             Democratic &      24179347 \\
101 &  1948 &        Henry A. Wallace &            Progressive &       1157328 \\
102 &  1948 &           Norman Thomas &              Socialist &        139569 \\
103 &  1948 &          Strom Thurmond &              Dixiecrat &       1175930 \\
104 &  1948 &         Thomas E. Dewey &             Republican &      21991292 \\
105 &  1952 &         Adlai Stevenson &             Democratic &      27375090 \\
106 &  1952 &       Dwight Eisenhower &             Republican &      34075529 \\
107 &  1952 &        Vincent Hallinan &            Progressive &        140746 \\
108 &  1956 &         Adlai Stevenson &             Democratic &      26028028 \\
109 &  1956 &       Dwight Eisenhower &             Republican &      35579180 \\
110 &  1956 &      T. Coleman Andrews &         States' Rights &        107929 \\
111 &  1960 &            John Kennedy &             Democratic &      34220984 \\
112 &  1960 &           Richard Nixon &             Republican &      34108157 \\
113 &  1964 &         Barry Goldwater &             Republican &      27175754 \\
114 &  1964 &          Lyndon Johnson &             Democratic &      43127041 \\
115 &  1968 &          George Wallace &   American Independent &       9901118 \\
116 &  1968 &         Hubert Humphrey &             Democratic &      31271839 \\
117 &  1968 &           Richard Nixon &             Republican &      31783783 \\
118 &  1972 &         George McGovern &             Democratic &      29173222 \\
119 &  1972 &         John G. Schmitz &   American Independent &       1100868 \\
120 &  1972 &           Richard Nixon &             Republican &      47168710 \\
121 &  1976 &         Eugene McCarthy &            Independent &        740460 \\
122 &  1976 &             Gerald Ford &             Republican &      39148634 \\
123 &  1976 &            Jimmy Carter &             Democratic &      40831881 \\
124 &  1976 &           Lester Maddox &   American Independent &        170274 \\
125 &  1976 &          Roger MacBride &            Libertarian &        172557 \\
126 &  1976 &      Thomas J. Anderson &               American &        158271 \\
127 &  1980 &          Barry Commoner &               Citizens &        233052 \\
128 &  1980 &                Ed Clark &            Libertarian &        921128 \\
129 &  1980 &            Jimmy Carter &             Democratic &      35480115 \\
130 &  1980 &        John B. Anderson &            Independent &       5719850 \\
131 &  1980 &           Ronald Reagan &             Republican &      43903230 \\
132 &  1984 &          David Bergland &            Libertarian &        228111 \\
133 &  1984 &           Ronald Reagan &             Republican &      54455472 \\
134 &  1984 &          Walter Mondale &             Democratic &      37577352 \\
135 &  1988 &       George H. W. Bush &             Republican &      48886597 \\
136 &  1988 &           Lenora Fulani &           New Alliance &        217221 \\
137 &  1988 &         Michael Dukakis &             Democratic &      41809074 \\
138 &  1988 &                Ron Paul &            Libertarian &        431750 \\
139 &  1992 &            Andre Marrou &            Libertarian &        290087 \\
140 &  1992 &            Bill Clinton &             Democratic &      44909806 \\
141 &  1992 &                Bo Gritz &               Populist &        106152 \\
142 &  1992 &       George H. W. Bush &             Republican &      39104550 \\
143 &  1992 &              Ross Perot &            Independent &      19743821 \\
144 &  1996 &            Bill Clinton &             Democratic &      47400125 \\
145 &  1996 &                Bob Dole &             Republican &      39197469 \\
146 &  1996 &            Harry Browne &            Libertarian &        485759 \\
147 &  1996 &         Howard Phillips &              Taxpayers &        184656 \\
148 &  1996 &            John Hagelin &            Natural Law &        113670 \\
149 &  1996 &             Ralph Nader &                  Green &        685297 \\
150 &  1996 &              Ross Perot &                 Reform &       8085294 \\
151 &  2000 &                 Al Gore &             Democratic &      50999897 \\
152 &  2000 &          George W. Bush &             Republican &      50456002 \\
153 &  2000 &            Harry Browne &            Libertarian &        384431 \\
154 &  2000 &            Pat Buchanan &                 Reform &        448895 \\
155 &  2000 &             Ralph Nader &                  Green &       2882955 \\
156 &  2004 &              David Cobb &                  Green &        119859 \\
157 &  2004 &          George W. Bush &             Republican &      62040610 \\
158 &  2004 &              John Kerry &             Democratic &      59028444 \\
159 &  2004 &        Michael Badnarik &            Libertarian &        397265 \\
160 &  2004 &        Michael Peroutka &           Constitution &        143630 \\
161 &  2004 &             Ralph Nader &            Independent &        465151 \\
162 &  2008 &            Barack Obama &             Democratic &      69498516 \\
163 &  2008 &                Bob Barr &            Libertarian &        523715 \\
164 &  2008 &           Chuck Baldwin &           Constitution &        199750 \\
165 &  2008 &        Cynthia McKinney &                  Green &        161797 \\
166 &  2008 &             John McCain &             Republican &      59948323 \\
167 &  2008 &             Ralph Nader &            Independent &        739034 \\
168 &  2012 &            Barack Obama &             Democratic &      65915795 \\
169 &  2012 &            Gary Johnson &            Libertarian &       1275971 \\
170 &  2012 &              Jill Stein &                  Green &        469627 \\
171 &  2012 &             Mitt Romney &             Republican &      60933504 \\
172 &  2016 &          Darrell Castle &           Constitution &        203091 \\
173 &  2016 &            Donald Trump &             Republican &      62984828 \\
174 &  2016 &           Evan McMullin &            Independent &        732273 \\
175 &  2016 &            Gary Johnson &            Libertarian &       4489235 \\
176 &  2016 &         Hillary Clinton &             Democratic &      65853514 \\
177 &  2016 &              Jill Stein &                  Green &       1457226 \\
178 &  2020 &            Joseph Biden &             Democratic &      81268924 \\
179 &  2020 &            Donald Trump &             Republican &      74216154 \\
180 &  2020 &            Jo Jorgensen &            Libertarian &       1865724 \\
181 &  2020 &          Howard Hawkins &                  Green &        405035 \\
\bottomrule
\end{tabular}

\hypertarget{a-single-column-label}{%
\subsubsection{A single column label}\label{a-single-column-label}}

Lastly, suppose we only want the \texttt{Candidate} column.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections[}\StringTok{"Candidate"}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &               Candidate \\
\midrule
0   &          Andrew Jackson \\
1   &       John Quincy Adams \\
2   &          Andrew Jackson \\
3   &       John Quincy Adams \\
4   &          Andrew Jackson \\
5   &              Henry Clay \\
6   &            William Wirt \\
7   &       Hugh Lawson White \\
8   &        Martin Van Buren \\
9   &  William Henry Harrison \\
10  &        Martin Van Buren \\
11  &  William Henry Harrison \\
12  &              Henry Clay \\
13  &              James Polk \\
14  &              Lewis Cass \\
15  &        Martin Van Buren \\
16  &          Zachary Taylor \\
17  &         Franklin Pierce \\
18  &            John P. Hale \\
19  &          Winfield Scott \\
20  &          James Buchanan \\
21  &         John C. Frémont \\
22  &        Millard Fillmore \\
23  &         Abraham Lincoln \\
24  &               John Bell \\
25  &    John C. Breckinridge \\
26  &      Stephen A. Douglas \\
27  &         Abraham Lincoln \\
28  &     George B. McClellan \\
29  &         Horatio Seymour \\
30  &           Ulysses Grant \\
31  &          Horace Greeley \\
32  &           Ulysses Grant \\
33  &        Rutherford Hayes \\
34  &        Samuel J. Tilden \\
35  &         James B. Weaver \\
36  &          James Garfield \\
37  &  Winfield Scott Hancock \\
38  &         Benjamin Butler \\
39  &        Grover Cleveland \\
40  &         James G. Blaine \\
41  &           John St. John \\
42  &          Alson Streeter \\
43  &       Benjamin Harrison \\
44  &         Clinton B. Fisk \\
45  &        Grover Cleveland \\
46  &       Benjamin Harrison \\
47  &        Grover Cleveland \\
48  &         James B. Weaver \\
49  &            John Bidwell \\
50  &          John M. Palmer \\
51  &         Joshua Levering \\
52  &  William Jennings Bryan \\
53  &        William McKinley \\
54  &         John G. Woolley \\
55  &  William Jennings Bryan \\
56  &        William McKinley \\
57  &         Alton B. Parker \\
58  &          Eugene V. Debs \\
59  &        Silas C. Swallow \\
60  &      Theodore Roosevelt \\
61  &        Thomas E. Watson \\
62  &          Eugene V. Debs \\
63  &        Eugene W. Chafin \\
64  &  William Jennings Bryan \\
65  &            William Taft \\
66  &          Eugene V. Debs \\
67  &        Eugene W. Chafin \\
68  &      Theodore Roosevelt \\
69  &            William Taft \\
70  &          Woodrow Wilson \\
71  &         Allan L. Benson \\
72  &    Charles Evans Hughes \\
73  &             Frank Hanly \\
74  &          Woodrow Wilson \\
75  &        Aaron S. Watkins \\
76  &          Eugene V. Debs \\
77  &            James M. Cox \\
78  &   Parley P. Christensen \\
79  &          Warren Harding \\
80  &         Calvin Coolidge \\
81  &           John W. Davis \\
82  &      Robert La Follette \\
83  &                Al Smith \\
84  &          Herbert Hoover \\
85  &           Norman Thomas \\
86  &      Franklin Roosevelt \\
87  &          Herbert Hoover \\
88  &           Norman Thomas \\
89  &       William Z. Foster \\
90  &              Alf Landon \\
91  &      Franklin Roosevelt \\
92  &           Norman Thomas \\
93  &           William Lemke \\
94  &      Franklin Roosevelt \\
95  &           Norman Thomas \\
96  &         Wendell Willkie \\
97  &      Franklin Roosevelt \\
98  &         Thomas E. Dewey \\
99  &        Claude A. Watson \\
100 &            Harry Truman \\
101 &        Henry A. Wallace \\
102 &           Norman Thomas \\
103 &          Strom Thurmond \\
104 &         Thomas E. Dewey \\
105 &         Adlai Stevenson \\
106 &       Dwight Eisenhower \\
107 &        Vincent Hallinan \\
108 &         Adlai Stevenson \\
109 &       Dwight Eisenhower \\
110 &      T. Coleman Andrews \\
111 &            John Kennedy \\
112 &           Richard Nixon \\
113 &         Barry Goldwater \\
114 &          Lyndon Johnson \\
115 &          George Wallace \\
116 &         Hubert Humphrey \\
117 &           Richard Nixon \\
118 &         George McGovern \\
119 &         John G. Schmitz \\
120 &           Richard Nixon \\
121 &         Eugene McCarthy \\
122 &             Gerald Ford \\
123 &            Jimmy Carter \\
124 &           Lester Maddox \\
125 &          Roger MacBride \\
126 &      Thomas J. Anderson \\
127 &          Barry Commoner \\
128 &                Ed Clark \\
129 &            Jimmy Carter \\
130 &        John B. Anderson \\
131 &           Ronald Reagan \\
132 &          David Bergland \\
133 &           Ronald Reagan \\
134 &          Walter Mondale \\
135 &       George H. W. Bush \\
136 &           Lenora Fulani \\
137 &         Michael Dukakis \\
138 &                Ron Paul \\
139 &            Andre Marrou \\
140 &            Bill Clinton \\
141 &                Bo Gritz \\
142 &       George H. W. Bush \\
143 &              Ross Perot \\
144 &            Bill Clinton \\
145 &                Bob Dole \\
146 &            Harry Browne \\
147 &         Howard Phillips \\
148 &            John Hagelin \\
149 &             Ralph Nader \\
150 &              Ross Perot \\
151 &                 Al Gore \\
152 &          George W. Bush \\
153 &            Harry Browne \\
154 &            Pat Buchanan \\
155 &             Ralph Nader \\
156 &              David Cobb \\
157 &          George W. Bush \\
158 &              John Kerry \\
159 &        Michael Badnarik \\
160 &        Michael Peroutka \\
161 &             Ralph Nader \\
162 &            Barack Obama \\
163 &                Bob Barr \\
164 &           Chuck Baldwin \\
165 &        Cynthia McKinney \\
166 &             John McCain \\
167 &             Ralph Nader \\
168 &            Barack Obama \\
169 &            Gary Johnson \\
170 &              Jill Stein \\
171 &             Mitt Romney \\
172 &          Darrell Castle \\
173 &            Donald Trump \\
174 &           Evan McMullin \\
175 &            Gary Johnson \\
176 &         Hillary Clinton \\
177 &              Jill Stein \\
178 &            Joseph Biden \\
179 &            Donald Trump \\
180 &            Jo Jorgensen \\
181 &          Howard Hawkins \\
\bottomrule
\end{tabular}

The output looks like a Series! In this course, we'll become very
comfortable with \texttt{{[}{]}}, especially for selecting columns. In
practice, \texttt{{[}{]}} is much more common than \texttt{.loc}.

\hypertarget{parting-note}{%
\section{Parting Note}\label{parting-note}}

The \texttt{pandas} library is enormous and contains many useful
functions. Here is a link to
\href{https://pandas.pydata.org/docs/}{documentation}.

The introductory \texttt{pandas} lectures will cover important data
structures and methods you should be fluent in. However, we want you to
get familiar with the real-world programming practice of
\ldots Googling! Answers to your questions can be found in
documentation, Stack Overflow, etc.

With that, let's move on to Pandas II.

\bookmarksetup{startatroot}

\hypertarget{pandas-ii}{%
\chapter{Pandas II}\label{pandas-ii}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Build familiarity with advanced \texttt{pandas} syntax
\item
  Extract data from a DataFrame using conditional selection
\item
  Recognize situations where aggregation is useful and identify the
  correct technique for performing an aggregation
\end{itemize}

\end{tcolorbox}

Last time, we introduced the \texttt{pandas} library as a toolkit for
processing data. We learned the DataFrame and Series data structures,
familiarized ourselves with the basic syntax for manipulating tabular
data, and began writing our first lines of \texttt{pandas} code.

In this lecture, we'll start to dive into some advanced \texttt{pandas}
syntax. You may find it helpful to follow along with a notebook of your
own as we walk through these new pieces of code.

We'll start by loading the \texttt{babynames} dataset.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ urllib.request}
\ImportTok{import}\NormalTok{ os.path}
\ImportTok{import}\NormalTok{ zipfile}

\NormalTok{data\_url }\OperatorTok{=} \StringTok{"https://www.ssa.gov/oact/babynames/state/namesbystate.zip"}
\NormalTok{local\_filename }\OperatorTok{=} \StringTok{"babynamesbystate.zip"}
\ControlFlowTok{if} \KeywordTok{not}\NormalTok{ os.path.exists(local\_filename): }\CommentTok{\# if the data exists don\textquotesingle{}t download again}
    \ControlFlowTok{with}\NormalTok{ urllib.request.urlopen(data\_url) }\ImportTok{as}\NormalTok{ resp, }\BuiltInTok{open}\NormalTok{(local\_filename, }\StringTok{\textquotesingle{}wb\textquotesingle{}}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{        f.write(resp.read())}

\NormalTok{zf }\OperatorTok{=}\NormalTok{ zipfile.ZipFile(local\_filename, }\StringTok{\textquotesingle{}r\textquotesingle{}}\NormalTok{)}

\NormalTok{ca\_name }\OperatorTok{=} \StringTok{\textquotesingle{}CA.TXT\textquotesingle{}}
\NormalTok{field\_names }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}State\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Sex\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Year\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Count\textquotesingle{}}\NormalTok{]}
\ControlFlowTok{with}\NormalTok{ zf.}\BuiltInTok{open}\NormalTok{(ca\_name) }\ImportTok{as}\NormalTok{ fh:}
\NormalTok{    babynames }\OperatorTok{=}\NormalTok{ pd.read\_csv(fh, header}\OperatorTok{=}\VariableTok{None}\NormalTok{, names}\OperatorTok{=}\NormalTok{field\_names)}

\NormalTok{babynames.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &      Name &  Count \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 \\
1 &    CA &   F &  1910 &     Helen &    239 \\
2 &    CA &   F &  1910 &   Dorothy &    220 \\
3 &    CA &   F &  1910 &  Margaret &    163 \\
4 &    CA &   F &  1910 &   Frances &    134 \\
\bottomrule
\end{tabular}

\hypertarget{conditional-selection}{%
\section{Conditional Selection}\label{conditional-selection}}

Conditional selection allows us to select a subset of rows in a
DataFrame if they follow some specified condition.

To understand how to use conditional selection, we must look at another
possible input of the \texttt{.loc} and \texttt{{[}{]}} methods---a
boolean array, which is simply an array where each element is either
\texttt{True} or \texttt{False}. This boolean array must have a length
equal to the number of rows in the DataFrame. The selection returns every
row whose position corresponds to a \texttt{True} value in the array.

To see this in action, let's select all even-indexed rows in the first
10 rows of our DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Ask yourself: why is :9 the correct slice to select the first 10 rows?}
\NormalTok{babynames\_first\_10\_rows }\OperatorTok{=}\NormalTok{ babynames.loc[:}\DecValTok{9}\NormalTok{, :]}

\CommentTok{\# Notice how we have exactly 10 elements in our boolean array argument}
\NormalTok{babynames\_first\_10\_rows[[}\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &      Name &  Count \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 \\
2 &    CA &   F &  1910 &   Dorothy &    220 \\
4 &    CA &   F &  1910 &   Frances &    134 \\
6 &    CA &   F &  1910 &    Evelyn &    126 \\
8 &    CA &   F &  1910 &  Virginia &    101 \\
\bottomrule
\end{tabular}

We can perform a similar operation using \texttt{.loc}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames\_first\_10\_rows.loc[[}\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{, }\VariableTok{True}\NormalTok{, }\VariableTok{False}\NormalTok{], :]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &      Name &  Count \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 \\
2 &    CA &   F &  1910 &   Dorothy &    220 \\
4 &    CA &   F &  1910 &   Frances &    134 \\
6 &    CA &   F &  1910 &    Evelyn &    126 \\
8 &    CA &   F &  1910 &  Virginia &    101 \\
\bottomrule
\end{tabular}

These techniques worked well in this example, but you can imagine how
tedious it might be to list out \texttt{True}s and \texttt{False}s for
every row in a larger DataFrame. To make things easier, we can instead
provide a logical condition as an input to \texttt{.loc} or
\texttt{{[}{]}} that returns a boolean array with the necessary length.

For example, to return all names associated with \texttt{F} sex:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# First, use a logical condition to generate a boolean array}
\NormalTok{logical\_operator }\OperatorTok{=}\NormalTok{ (babynames[}\StringTok{"Sex"}\NormalTok{] }\OperatorTok{==} \StringTok{"F"}\NormalTok{)}

\CommentTok{\# Then, use this boolean array to filter the DataFrame}
\NormalTok{babynames[logical\_operator].head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &      Name &  Count \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 \\
1 &    CA &   F &  1910 &     Helen &    239 \\
2 &    CA &   F &  1910 &   Dorothy &    220 \\
3 &    CA &   F &  1910 &  Margaret &    163 \\
4 &    CA &   F &  1910 &   Frances &    134 \\
\bottomrule
\end{tabular}

Here, \texttt{logical\_operator} evaluates to a Series of boolean values
with length 400762.

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(}\StringTok{"There are a total of }\SpecialCharTok{\{\}}\StringTok{ values in \textquotesingle{}logical\_operator\textquotesingle{}"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(}\BuiltInTok{len}\NormalTok{(logical\_operator)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
There are a total of 400762 values in 'logical_operator'
\end{verbatim}

Rows starting at row 0 and ending at row 235790 evaluate to
\texttt{True} and are thus returned in the DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(}\StringTok{"The 0th item in this \textquotesingle{}logical\_operator\textquotesingle{} is: }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(logical\_operator.iloc[}\DecValTok{0}\NormalTok{]))}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The 235790th item in this \textquotesingle{}logical\_operator\textquotesingle{} is: }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(logical\_operator.iloc[}\DecValTok{235790}\NormalTok{]))}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The 235791th item in this \textquotesingle{}logical\_operator\textquotesingle{} is: }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(logical\_operator.iloc[}\DecValTok{235791}\NormalTok{]))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
The 0th item in this 'logical_operator' is: True
The 235790th item in this 'logical_operator' is: True
The 235791th item in this 'logical_operator' is: False
\end{verbatim}

Passing a Series as an argument to \texttt{babynames{[}{]}} has the same
effect as using a boolean array. In fact, the \texttt{{[}{]}} selection
operator can take a boolean Series, array, or list as its argument. These
three are used interchangeably throughout the course.

We can also use \texttt{.loc} to achieve similar results.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.loc[babynames[}\StringTok{"Sex"}\NormalTok{] }\OperatorTok{==} \StringTok{"F"}\NormalTok{].head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &      Name &  Count \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 \\
1 &    CA &   F &  1910 &     Helen &    239 \\
2 &    CA &   F &  1910 &   Dorothy &    220 \\
3 &    CA &   F &  1910 &  Margaret &    163 \\
4 &    CA &   F &  1910 &   Frances &    134 \\
\bottomrule
\end{tabular}

Boolean conditions can be combined using various operators that allow us
to filter results by multiple conditions. Some examples include the
\texttt{\&} (and) operator and the \texttt{\textbar{}} (or) operator.

\textbf{Note:} When combining multiple conditions with logical
operators, be sure to surround each condition with a set of parentheses
\texttt{()}. If you forget, your code will throw an error.

For example, if we want to return data on all females born before the
21st century, we can write:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[(babynames[}\StringTok{"Sex"}\NormalTok{] }\OperatorTok{==} \StringTok{"F"}\NormalTok{) }\OperatorTok{\&}\NormalTok{ (babynames[}\StringTok{"Year"}\NormalTok{] }\OperatorTok{\textless{}} \DecValTok{2000}\NormalTok{)].head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &      Name &  Count \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 \\
1 &    CA &   F &  1910 &     Helen &    239 \\
2 &    CA &   F &  1910 &   Dorothy &    220 \\
3 &    CA &   F &  1910 &  Margaret &    163 \\
4 &    CA &   F &  1910 &   Frances &    134 \\
\bottomrule
\end{tabular}

Boolean array selection is a useful tool, but can lead to overly verbose
code for complex conditions like the one below. \texttt{pandas} provides
many alternatives, shown after this example:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(}
\NormalTok{    babynames[(babynames[}\StringTok{"Name"}\NormalTok{] }\OperatorTok{==} \StringTok{"Bella"}\NormalTok{) }\OperatorTok{|} 
\NormalTok{              (babynames[}\StringTok{"Name"}\NormalTok{] }\OperatorTok{==} \StringTok{"Alex"}\NormalTok{) }\OperatorTok{|}
\NormalTok{              (babynames[}\StringTok{"Name"}\NormalTok{] }\OperatorTok{==} \StringTok{"Ani"}\NormalTok{) }\OperatorTok{|}
\NormalTok{              (babynames[}\StringTok{"Name"}\NormalTok{] }\OperatorTok{==} \StringTok{"Lisa"}\NormalTok{)]}
\NormalTok{).head()}
\CommentTok{\# Note: The parentheses surrounding the code make it possible to break the code on to multiple lines for readability}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &   Name &  Count \\
\midrule
6289  &    CA &   F &  1923 &  Bella &      5 \\
7512  &    CA &   F &  1925 &  Bella &      8 \\
12368 &    CA &   F &  1932 &   Lisa &      5 \\
14741 &    CA &   F &  1936 &   Lisa &      8 \\
17084 &    CA &   F &  1939 &   Lisa &      5 \\
\bottomrule
\end{tabular}

The \texttt{.isin} method can be used to filter DataFrames. It selects
rows whose value in a particular column matches any one of a given
collection of values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{names }\OperatorTok{=}\NormalTok{ [}\StringTok{"Bella"}\NormalTok{, }\StringTok{"Alex"}\NormalTok{, }\StringTok{"Ani"}\NormalTok{, }\StringTok{"Lisa"}\NormalTok{]}
\NormalTok{babynames[babynames[}\StringTok{"Name"}\NormalTok{].isin(names)].head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &   Name &  Count \\
\midrule
6289  &    CA &   F &  1923 &  Bella &      5 \\
7512  &    CA &   F &  1925 &  Bella &      8 \\
12368 &    CA &   F &  1932 &   Lisa &      5 \\
14741 &    CA &   F &  1936 &   Lisa &      8 \\
17084 &    CA &   F &  1939 &   Lisa &      5 \\
\bottomrule
\end{tabular}

The function \texttt{str.startswith} can be used to define a filter
based on string values in a \texttt{Series} object.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[babynames[}\StringTok{"Name"}\NormalTok{].}\BuiltInTok{str}\NormalTok{.startswith(}\StringTok{"N"}\NormalTok{)].head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &    Name &  Count \\
\midrule
76  &    CA &   F &  1910 &   Norma &     23 \\
83  &    CA &   F &  1910 &  Nellie &     20 \\
127 &    CA &   F &  1910 &    Nina &     11 \\
198 &    CA &   F &  1910 &    Nora &      6 \\
310 &    CA &   F &  1911 &  Nellie &     23 \\
\bottomrule
\end{tabular}

\hypertarget{handy-utility-functions}{%
\section{Handy Utility Functions}\label{handy-utility-functions}}

\texttt{pandas} contains an extensive library of functions that can help
shorten the process of setting and getting information from its data
structures. In the following section, we will give overviews of each of
the main utility functions that will help us in Data 100.

\begin{itemize}
\tightlist
\item
  \texttt{Numpy} and built-in function support
\item
  \texttt{.shape}
\item
  \texttt{.size}
\item
  \texttt{.describe()}
\item
  \texttt{.sample()}
\item
  \texttt{.value\_counts()}
\item
  \texttt{.unique()}
\item
  \texttt{.sort\_values()}
\end{itemize}

\hypertarget{numpy}{%
\subsection{\texorpdfstring{\texttt{Numpy}}{Numpy}}\label{numpy}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{bella\_counts }\OperatorTok{=}\NormalTok{ babynames[babynames[}\StringTok{"Name"}\NormalTok{] }\OperatorTok{==} \StringTok{"Bella"}\NormalTok{][}\StringTok{"Count"}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Average number of babies named Bella each year}
\NormalTok{np.mean(bella\_counts)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
270.1860465116279
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Max number of babies named Bella born on a given year}
\BuiltInTok{max}\NormalTok{(bella\_counts)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
902
\end{verbatim}

\hypertarget{shape-and-.size}{%
\subsection{\texorpdfstring{\texttt{.shape} and
\texttt{.size}}{.shape and .size}}\label{shape-and-.size}}

\texttt{.shape} and \texttt{.size} are attributes of Series and
DataFrames that measure the ``amount'' of data stored in the structure.
Calling \texttt{.shape} returns a tuple containing the number of rows
and columns present in the DataFrame or Series. \texttt{.size} is used
to find the total number of elements in a structure, equivalent to the
number of rows times the number of columns.

Many functions strictly require the dimensions of the arguments along
certain axes to match. Checking these dimension-finding attributes is much
faster than counting all of the items by hand.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.shape}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
(400762, 5)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.size}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
2003810
\end{verbatim}

\hypertarget{describe}{%
\subsection{\texorpdfstring{\texttt{.describe()}}{.describe()}}\label{describe}}

If many statistics are required from a DataFrame (minimum value, maximum
value, mean value, etc.), then
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html}{\texttt{.describe()}}
can be used to compute all of them at once.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.describe()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrr}
\toprule
{} &           Year &          Count \\
\midrule
count &  400762.000000 &  400762.000000 \\
mean  &    1985.131287 &      79.953781 \\
std   &      26.821004 &     295.414618 \\
min   &    1910.000000 &       5.000000 \\
25\%   &    1968.000000 &       7.000000 \\
50\%   &    1991.000000 &      13.000000 \\
75\%   &    2007.000000 &      38.000000 \\
max   &    2021.000000 &    8262.000000 \\
\bottomrule
\end{tabular}

A different set of statistics will be reported if \texttt{.describe()}
is called on a Series.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[}\StringTok{"Sex"}\NormalTok{].describe()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &     Sex \\
\midrule
count  &  400762 \\
unique &       2 \\
top    &       F \\
freq   &  235791 \\
\bottomrule
\end{tabular}

\hypertarget{sample}{%
\subsection{\texorpdfstring{\texttt{.sample()}}{.sample()}}\label{sample}}

As we will see later in the semester, random processes are at the heart
of many data science techniques (for example, train-test splits,
bootstrapping, and cross-validation).
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html}{\texttt{.sample()}}
lets us quickly select random entries (a row if called from a DataFrame,
or a value if called from a Series).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.sample()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year & Name &  Count \\
\midrule
312047 &    CA &   M &  1989 &  Ron &     21 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.sample(}\DecValTok{5}\NormalTok{).iloc[:, }\DecValTok{2}\NormalTok{:]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrlr}
\toprule
{} &  Year &    Name &  Count \\
\midrule
28157  &  1950 &   Trina &     14 \\
282627 &  1972 &  Dorian &     25 \\
49401  &  1964 &    Lori &   1564 \\
25927  &  1948 &    Reva &      8 \\
384478 &  2016 &  Pranav &     26 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[babynames[}\StringTok{"Year"}\NormalTok{] }\OperatorTok{==} \DecValTok{2000}\NormalTok{].sample(}\DecValTok{4}\NormalTok{, replace }\OperatorTok{=} \VariableTok{True}\NormalTok{).iloc[:, }\DecValTok{2}\NormalTok{:]}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrlr}
\toprule
{} &  Year &      Name &  Count \\
\midrule
340948 &  2000 &    Derian &      5 \\
339824 &  2000 &     Uziel &     14 \\
152696 &  2000 &     Reiko &      5 \\
150884 &  2000 &  Madilynn &     12 \\
\bottomrule
\end{tabular}

\hypertarget{value_counts}{%
\subsection{\texorpdfstring{\texttt{.value\_counts()}}{.value\_counts()}}\label{value_counts}}

When we want to know the distribution of the items in a Series (for
example, what items are most/least common), we use
\href{https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html}{\texttt{.value\_counts()}}
to get a breakdown of the unique \emph{values} and their \emph{counts}.
In the example below, we can determine the name with the most years in
which at least one person has taken that name by counting the number of
times each name appears in the \texttt{"Name"} column of
\texttt{babynames}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[}\StringTok{"Name"}\NormalTok{].value\_counts().head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lr}
\toprule
{} &  Name \\
\midrule
Jean      &   221 \\
Francis   &   219 \\
Guadalupe &   216 \\
Jessie    &   215 \\
Marion    &   213 \\
\bottomrule
\end{tabular}

\hypertarget{unique}{%
\subsection{\texorpdfstring{\texttt{.unique()}}{.unique()}}\label{unique}}

If we have a Series with many repeated values, then
\href{https://pandas.pydata.org/docs/reference/api/pandas.unique.html}{\texttt{.unique()}}
can be used to identify only the \emph{unique} values. Here we can get a
list of all the names in \texttt{babynames}.

\textbf{Exercise:} what function can we call on the Series below to get
the number of unique names?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[}\StringTok{"Name"}\NormalTok{].unique()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
array(['Mary', 'Helen', 'Dorothy', ..., 'Zyire', 'Zylo', 'Zyrus'],
      dtype=object)
\end{verbatim}

\hypertarget{sort_values}{%
\subsection{\texorpdfstring{\texttt{.sort\_values()}}{.sort\_values()}}\label{sort_values}}

Ordering a DataFrame can be useful for isolating extreme values. For
example, the first 5 entries of a column sorted in descending order (that
is, from highest to lowest) are the largest 5 values.
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html}{\texttt{.sort\_values}}
allows us to order a DataFrame or Series by a specified rule. For
DataFrames, we must specify the column by which we want to compare the
rows, and the function will return the rows sorted by that column. We can
choose to receive the rows in ascending order (the default) or in
descending order by setting the \texttt{ascending} parameter.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.sort\_values(by }\OperatorTok{=} \StringTok{"Count"}\NormalTok{, ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{).head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &     Name &  Count \\
\midrule
263272 &    CA &   M &  1956 &  Michael &   8262 \\
264297 &    CA &   M &  1957 &  Michael &   8250 \\
313644 &    CA &   M &  1990 &  Michael &   8247 \\
278109 &    CA &   M &  1969 &  Michael &   8244 \\
279405 &    CA &   M &  1970 &  Michael &   8197 \\
\bottomrule
\end{tabular}

We do not need to explicitly specify the column used for sorting when
calling \texttt{.sort\_values()} on a Series. We can still specify the
ordering paradigm -- that is, whether values are sorted in ascending or
descending order.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames[}\StringTok{"Name"}\NormalTok{].sort\_values(ascending}\OperatorTok{=}\VariableTok{True}\NormalTok{).head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &     Name \\
\midrule
380256 &    Aadan \\
362255 &    Aadan \\
365374 &    Aadan \\
394460 &  Aadarsh \\
366561 &    Aaden \\
\bottomrule
\end{tabular}

\hypertarget{sorting-with-a-custom-key}{%
\subsubsection{Sorting With a Custom
Key}\label{sorting-with-a-custom-key}}

Using \texttt{.sort\_values} can be useful in many situations, but it
may not cover all use cases. This is because \texttt{pandas}
automatically sorts values in order according to numeric value (for
number data) or alphabetical order (for string data). The following code
finds the top 5 most popular names in California in 2021.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Sort names by count in year 2021}
\CommentTok{\# Recall that \textasciigrave{}.head(5)\textasciigrave{} displays the first five rows in the DataFrame}
\NormalTok{babynames[babynames[}\StringTok{"Year"}\NormalTok{] }\OperatorTok{==} \DecValTok{2021}\NormalTok{].sort\_values(}\StringTok{"Count"}\NormalTok{, ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{).head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &    Name &  Count \\
\midrule
397909 &    CA &   M &  2021 &    Noah &   2591 \\
397910 &    CA &   M &  2021 &    Liam &   2469 \\
232145 &    CA &   F &  2021 &  Olivia &   2395 \\
232146 &    CA &   F &  2021 &    Emma &   2171 \\
397911 &    CA &   M &  2021 &   Mateo &   2108 \\
\bottomrule
\end{tabular}

This offers us a lot of functionality, but what if we need to sort by
some other metric? For example, what if we wanted to find the longest
names in the DataFrame?

We can do this by specifying the \texttt{key} parameter of
\texttt{.sort\_values}. The \texttt{key} parameter is assigned to a
function of our choice. This function is then applied to each value in
the specified column. \texttt{pandas} will, finally, sort the DataFrame
by the values outputted by the function.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Here, a lambda function is applied to find the length of each value, \textasciigrave{}x\textasciigrave{}, in the "Name" column}
\NormalTok{babynames.sort\_values(}\StringTok{"Name"}\NormalTok{, key}\OperatorTok{=}\KeywordTok{lambda}\NormalTok{ x: x.}\BuiltInTok{str}\NormalTok{.}\BuiltInTok{len}\NormalTok{(), ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{).head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &             Name &  Count \\
\midrule
313143 &    CA &   M &  1989 &  Franciscojavier &      6 \\
333732 &    CA &   M &  1997 &  Ryanchristopher &      5 \\
330421 &    CA &   M &  1996 &  Franciscojavier &      8 \\
323615 &    CA &   M &  1993 &  Johnchristopher &      5 \\
310235 &    CA &   M &  1988 &  Franciscojavier &     10 \\
\bottomrule
\end{tabular}

\hypertarget{adding-and-removing-columns}{%
\section{Adding and Removing
Columns}\label{adding-and-removing-columns}}

To add a new column to a DataFrame, we use a syntax similar to that used
when accessing an existing column. Specify the name of the new column by
writing \texttt{dataframe{[}"new\_column"{]}}, then assign this to a
Series or Array containing the values that will populate this column.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add a column named "name\_lengths" that includes the length of each name}
\NormalTok{babynames[}\StringTok{"name\_lengths"}\NormalTok{] }\OperatorTok{=}\NormalTok{ babynames[}\StringTok{"Name"}\NormalTok{].}\BuiltInTok{str}\NormalTok{.}\BuiltInTok{len}\NormalTok{()}
\NormalTok{babynames.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlrr}
\toprule
{} & State & Sex &  Year &      Name &  Count &  name\_lengths \\
\midrule
0 &    CA &   F &  1910 &      Mary &    295 &             4 \\
1 &    CA &   F &  1910 &     Helen &    239 &             5 \\
2 &    CA &   F &  1910 &   Dorothy &    220 &             7 \\
3 &    CA &   F &  1910 &  Margaret &    163 &             8 \\
4 &    CA &   F &  1910 &   Frances &    134 &             7 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Sort by the temporary column}
\NormalTok{babynames }\OperatorTok{=}\NormalTok{ babynames.sort\_values(by }\OperatorTok{=} \StringTok{"name\_lengths"}\NormalTok{, ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
\NormalTok{babynames.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlrr}
\toprule
{} & State & Sex &  Year &             Name &  Count &  name\_lengths \\
\midrule
313143 &    CA &   M &  1989 &  Franciscojavier &      6 &            15 \\
333732 &    CA &   M &  1997 &  Ryanchristopher &      5 &            15 \\
330421 &    CA &   M &  1996 &  Franciscojavier &      8 &            15 \\
323615 &    CA &   M &  1993 &  Johnchristopher &      5 &            15 \\
310235 &    CA &   M &  1988 &  Franciscojavier &     10 &            15 \\
\bottomrule
\end{tabular}

In the example above, we made use of an in-built function given to us by
the \texttt{str} accessor for getting the length of names. Then we used
the \texttt{name\_lengths} column to sort the DataFrame. What if we had
wanted to generate the values in our new column using a function of our
own making?

We can do this using the Series
\href{https://pandas.pydata.org/docs/reference/api/pandas.Series.map.html}{\texttt{.map}}
method. \texttt{.map} takes in a function as input, and will apply this
function to each value of a Series.

For example, say we wanted to find the number of occurrences of the
sequence ``dr'' or ``ea'' in each name.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# First, define a function to count the number of times "dr" or "ea" appear in each name}
\KeywordTok{def}\NormalTok{ dr\_ea\_count(string):}
    \ControlFlowTok{return}\NormalTok{ string.count(}\StringTok{"dr"}\NormalTok{) }\OperatorTok{+}\NormalTok{ string.count(}\StringTok{"ea"}\NormalTok{)}

\CommentTok{\# Then, use \textasciigrave{}map\textasciigrave{} to apply \textasciigrave{}dr\_ea\_count\textasciigrave{} to each name in the "Name" column}
\NormalTok{babynames[}\StringTok{"dr\_ea\_count"}\NormalTok{] }\OperatorTok{=}\NormalTok{ babynames[}\StringTok{"Name"}\NormalTok{].}\BuiltInTok{map}\NormalTok{(dr\_ea\_count)}

\CommentTok{\# Sort the DataFrame by the new "dr\_ea\_count" column so we can see our handiwork}
\NormalTok{babynames.sort\_values(by }\OperatorTok{=} \StringTok{"dr\_ea\_count"}\NormalTok{, ascending }\OperatorTok{=} \VariableTok{False}\NormalTok{).head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlrrr}
\toprule
{} & State & Sex &  Year &      Name &  Count &  name\_lengths &  dr\_ea\_count \\
\midrule
101969 &    CA &   F &  1986 &  Deandrea &      6 &             8 &            3 \\
304390 &    CA &   M &  1985 &  Deandrea &      6 &             8 &            3 \\
131022 &    CA &   F &  1994 &  Leandrea &      5 &             8 &            3 \\
115950 &    CA &   F &  1990 &  Deandrea &      5 &             8 &            3 \\
108723 &    CA &   F &  1988 &  Deandrea &      5 &             8 &            3 \\
\bottomrule
\end{tabular}

If we want to remove a column or row of a DataFrame, we can call the
\href{https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html}{\texttt{.drop}}
method. Use the \texttt{axis} parameter to specify whether a column or
row should be dropped. Unless otherwise specified, \texttt{pandas} will
assume that we are dropping a row.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Drop our "dr\_ea\_count" and "name\_lengths" columns from the DataFrame}
\NormalTok{babynames }\OperatorTok{=}\NormalTok{ babynames.drop([}\StringTok{"dr\_ea\_count"}\NormalTok{, }\StringTok{"name\_lengths"}\NormalTok{], axis}\OperatorTok{=}\StringTok{"columns"}\NormalTok{)}
\NormalTok{babynames.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &             Name &  Count \\
\midrule
313143 &    CA &   M &  1989 &  Franciscojavier &      6 \\
333732 &    CA &   M &  1997 &  Ryanchristopher &      5 \\
330421 &    CA &   M &  1996 &  Franciscojavier &      8 \\
323615 &    CA &   M &  1993 &  Johnchristopher &      5 \\
310235 &    CA &   M &  1988 &  Franciscojavier &     10 \\
\bottomrule
\end{tabular}

Notice that we reassigned \texttt{babynames} to the result of
\texttt{babynames.drop(...)}. This is a subtle, but important point:
\texttt{pandas} table operations \textbf{do not occur in-place}. Calling
\texttt{dataframe.drop(...)} will output a \emph{copy} of
\texttt{dataframe} with the row/column of interest removed, without
modifying the original \texttt{dataframe} table.

In other words, if we simply call:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This creates a copy of \textasciigrave{}babynames\textasciigrave{} and removes the row with label 3...}
\NormalTok{babynames.drop(}\DecValTok{3}\NormalTok{, axis}\OperatorTok{=}\StringTok{"rows"}\NormalTok{)}

\CommentTok{\# ...but the original \textasciigrave{}babynames\textasciigrave{} is unchanged! }
\CommentTok{\# Notice that the row with label 3 is still present}
\NormalTok{babynames.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrlr}
\toprule
{} & State & Sex &  Year &             Name &  Count \\
\midrule
313143 &    CA &   M &  1989 &  Franciscojavier &      6 \\
333732 &    CA &   M &  1997 &  Ryanchristopher &      5 \\
330421 &    CA &   M &  1996 &  Franciscojavier &      8 \\
323615 &    CA &   M &  1993 &  Johnchristopher &      5 \\
310235 &    CA &   M &  1988 &  Franciscojavier &     10 \\
\bottomrule
\end{tabular}

\hypertarget{aggregating-data-with-groupby}{%
\section{Aggregating Data with
GroupBy}\label{aggregating-data-with-groupby}}

Up until this point, we have been working with individual rows of
DataFrames. As data scientists, we often wish to investigate trends
across a larger \emph{subset} of our data. For example, we may want to
compute some summary statistic (the mean, median, sum, etc.) for a group
of rows in our DataFrame. To do this, we'll use \texttt{pandas}
\texttt{GroupBy} objects.

Let's say we wanted to aggregate all rows in \texttt{babynames} for a
given year.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.groupby(}\StringTok{"Year"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc293694be0>
\end{verbatim}

What does this strange output mean? Calling
\href{https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html}{\texttt{.groupby}}
has generated a \texttt{GroupBy} object. You can imagine this as a set
of ``mini'' sub-DataFrames, where each subframe contains all of the rows
from \texttt{babynames} that correspond to a particular year.

The diagram below shows a simplified view of \texttt{babynames} to help
illustrate this idea.

\begin{figure}

{\centering \includegraphics{pandas_2/images/gb.png}

}

\caption{Creating a GroupBy object}

\end{figure}

We can't work with a \texttt{GroupBy} object directly -- that is why you
saw that strange output earlier, rather than a standard view of a
DataFrame. To actually manipulate values within these ``mini''
DataFrames, we'll need to call an \emph{aggregation method}. This is a
method that tells \texttt{pandas} how to aggregate the values within the
\texttt{GroupBy} object. Once the aggregation is applied,
\texttt{pandas} will return a normal (now grouped) DataFrame.

The first aggregation method we'll consider is \texttt{.agg}. The
\texttt{.agg} method takes in a function as its argument; this function
is then applied to each column of a ``mini'' grouped DataFrame. We end
up with a new DataFrame with one aggregated row per subframe. Let's see
this in action by finding the \texttt{sum} of all counts for each year
in \texttt{babynames} -- this is equivalent to finding the number of
babies born in each year.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.groupby(}\StringTok{"Year"}\NormalTok{).agg(}\BuiltInTok{sum}\NormalTok{).head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lr}
\toprule
{} &  Count \\
Year &        \\
\midrule
1910 &   9163 \\
1911 &   9983 \\
1912 &  17946 \\
1913 &  22094 \\
1914 &  26926 \\
\bottomrule
\end{tabular}

We can relate this back to the diagram we used above. Remember that the
diagram uses a simplified version of \texttt{babynames}, which is why we
see smaller values for the summed counts.

\begin{figure}

{\centering \includegraphics{pandas_2/images/agg.png}

}

\caption{Performing an aggregation}

\end{figure}

Calling \texttt{.agg} has condensed each subframe back into a single
row. This gives us our final output: a DataFrame that is now indexed by
\texttt{"Year"}, with a single row for each unique year in the original
\texttt{babynames} DataFrame.

You may be wondering: where did the \texttt{"State"}, \texttt{"Sex"},
and \texttt{"Name"} columns go? Logically, it doesn't make sense to
\texttt{sum} the string data in these columns (how would we add ``Mary''
+ ``Ann''?). Because of this, \texttt{pandas} will simply omit these
columns when it performs the aggregation on the DataFrame. Since this
happens implicitly, without the user specifying that these columns
should be ignored, it's easy to run into troubling situations where
columns are removed without the programmer noticing. It is better coding
practice to select \emph{only} the columns we care about before
performing the aggregation.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Same result, but now we explicitly tell Pandas to only consider the "Count" column when summing}
\NormalTok{babynames.groupby(}\StringTok{"Year"}\NormalTok{)[[}\StringTok{"Count"}\NormalTok{]].agg(}\BuiltInTok{sum}\NormalTok{).head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lr}
\toprule
{} &  Count \\
Year &        \\
\midrule
1910 &   9163 \\
1911 &   9983 \\
1912 &  17946 \\
1913 &  22094 \\
1914 &  26926 \\
\bottomrule
\end{tabular}

\hypertarget{parting-note-1}{%
\subsection{Parting note}\label{parting-note-1}}

Manipulating \texttt{DataFrames} is a skill that is not mastered in just
one day. Due to the flexibility of \texttt{pandas}, there are many
different ways to get from a point A to a point B. We recommend trying
multiple different ways to solve the same problem to gain even more
practice and reach that point of mastery sooner.

Next, we will start digging deeper into the mechanics behind grouping
data.

\bookmarksetup{startatroot}

\hypertarget{pandas-iii}{%
\chapter{Pandas III}\label{pandas-iii}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Perform advanced aggregation using \texttt{.groupby()}
\item
  Use the \texttt{pd.pivot\_table} method to construct a pivot table
\item
  Perform simple merges between DataFrames using \texttt{pd.merge()}
\end{itemize}

\end{tcolorbox}

\hypertarget{more-on-agg-function}{%
\section{\texorpdfstring{More on \texttt{agg()}
Function}{More on agg() Function}}\label{more-on-agg-function}}

Last time, we introduced the concept of aggregating data -- we
familiarized ourselves with \texttt{GroupBy} objects and used them as
tools to consolidate and summarize a DataFrame. In this lecture, we will
explore some advanced \texttt{.groupby} methods to show just how
powerful of a resource they can be for understanding our data. We will
also introduce other techniques for data aggregation to provide
flexibility in how we manipulate our tables.

\hypertarget{groupby-continued}{%
\section{\texorpdfstring{\texttt{GroupBy()},
Continued}{GroupBy(), Continued}}\label{groupby-continued}}

As we learned last lecture, a \texttt{groupby} operation involves some
combination of \textbf{splitting a DataFrame into grouped subframes},
\textbf{applying a function}, and \textbf{combining the results}.

For some arbitrary DataFrame \texttt{df} below, the code
\texttt{df.groupby("year").agg(sum)} does the following:

\begin{itemize}
\tightlist
\item
  Organizes all rows with the same year into a subframe for that year.
\item
  Creates a new DataFrame with one row representing each subframe year.
\item
  Combines all integer rows in each subframe using the \texttt{sum}
  function.
\end{itemize}

\hypertarget{aggregation-with-lambda-functions}{%
\subsection{\texorpdfstring{Aggregation with \texttt{lambda}
Functions}{Aggregation with lambda Functions}}\label{aggregation-with-lambda-functions}}

Throughout this note, we'll work with the \texttt{elections} DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}

\NormalTok{elections }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/elections.csv"}\NormalTok{)}
\NormalTok{elections.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &          Candidate &                  Party &  Popular vote & Result &          \% \\
\midrule
0 &  1824 &     Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1 &  1824 &  John Quincy Adams &  Democratic-Republican &        113142 &    win &  42.789878 \\
2 &  1828 &     Andrew Jackson &             Democratic &        642806 &    win &  56.203927 \\
3 &  1828 &  John Quincy Adams &    National Republican &        500897 &   loss &  43.796073 \\
4 &  1832 &     Andrew Jackson &             Democratic &        702735 &    win &  54.574789 \\
\bottomrule
\end{tabular}

What if we wish to aggregate our DataFrame using a non-standard function
-- for example, a function of our own design? We can do so by combining
\texttt{.agg} with \texttt{lambda} expressions.

Let's first consider a puzzle to jog our memory. We will attempt to find
the \texttt{Candidate} from each \texttt{Party} with the highest
\texttt{\%} of votes.

A naive approach may be to group by the \texttt{Party} column and
aggregate by the maximum.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.groupby(}\StringTok{"Party"}\NormalTok{).agg(}\BuiltInTok{max}\NormalTok{).head(}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrlrlr}
\toprule
{} &  Year &           Candidate &  Popular vote & Result &          \% \\
Party                 &       &                     &               &        &            \\
\midrule
American              &  1976 &  Thomas J. Anderson &        873053 &   loss &  21.554001 \\
American Independent  &  1976 &       Lester Maddox &       9901118 &   loss &  13.571218 \\
Anti-Masonic          &  1832 &        William Wirt &        100715 &   loss &   7.821583 \\
Anti-Monopoly         &  1884 &     Benjamin Butler &        134294 &   loss &   1.335838 \\
Citizens              &  1980 &      Barry Commoner &        233052 &   loss &   0.270182 \\
Communist             &  1932 &   William Z. Foster &        103307 &   loss &   0.261069 \\
Constitution          &  2016 &    Michael Peroutka &        203091 &   loss &   0.152398 \\
Constitutional Union  &  1860 &           John Bell &        590901 &   loss &  12.639283 \\
Democratic            &  2020 &      Woodrow Wilson &      81268924 &    win &  61.344703 \\
Democratic-Republican &  1824 &   John Quincy Adams &        151271 &    win &  57.210122 \\
\bottomrule
\end{tabular}

This approach is clearly wrong -- the DataFrame claims that Woodrow
Wilson won the presidency in 2020.

Why is this happening? Here, the \texttt{max} aggregation function is
taken over every column \emph{independently}. Among Democrats,
\texttt{max} is computing:

\begin{itemize}
\tightlist
\item
  The most recent \texttt{Year} a Democratic candidate ran for president
  (2020)
\item
  The \texttt{Candidate} with the alphabetically ``largest'' name
  (``Woodrow Wilson'')
\item
  The \texttt{Result} with the alphabetically ``largest'' outcome
  (``win'')
\end{itemize}

Instead, let's try a different approach. We will:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Sort the DataFrame so that rows are in descending order of \texttt{\%}
\item
  Group by \texttt{Party} and select the first row of each groupby
  object
\end{enumerate}

While it may seem unintuitive, sorting \texttt{elections} by descending
order of \texttt{\%} is extremely helpful. If we then group by
\texttt{Party}, the first row of each groupby object will contain
information about the \texttt{Candidate} with the highest voter
\texttt{\%}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections\_sorted\_by\_percent }\OperatorTok{=}\NormalTok{ elections.sort\_values(}\StringTok{"\%"}\NormalTok{, ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
\NormalTok{elections\_sorted\_by\_percent.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &           Candidate &       Party &  Popular vote & Result &          \% \\
\midrule
114 &  1964 &      Lyndon Johnson &  Democratic &      43127041 &    win &  61.344703 \\
91  &  1936 &  Franklin Roosevelt &  Democratic &      27752648 &    win &  60.978107 \\
120 &  1972 &       Richard Nixon &  Republican &      47168710 &    win &  60.907806 \\
79  &  1920 &      Warren Harding &  Republican &      16144093 &    win &  60.574501 \\
133 &  1984 &       Ronald Reagan &  Republican &      54455472 &    win &  59.023326 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections\_sorted\_by\_percent.groupby(}\StringTok{"Party"}\NormalTok{).agg(}\KeywordTok{lambda}\NormalTok{ x : x.iloc[}\DecValTok{0}\NormalTok{]).head(}\DecValTok{10}\NormalTok{)}

\CommentTok{\# Equivalent to the below code}
\CommentTok{\# elections\_sorted\_by\_percent.groupby("Party").agg(\textquotesingle{}first\textquotesingle{}).head(10)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrlrlr}
\toprule
{} &  Year &          Candidate &  Popular vote & Result &          \% \\
Party                 &       &                    &               &        &            \\
\midrule
American              &  1856 &   Millard Fillmore &        873053 &   loss &  21.554001 \\
American Independent  &  1968 &     George Wallace &       9901118 &   loss &  13.571218 \\
Anti-Masonic          &  1832 &       William Wirt &        100715 &   loss &   7.821583 \\
Anti-Monopoly         &  1884 &    Benjamin Butler &        134294 &   loss &   1.335838 \\
Citizens              &  1980 &     Barry Commoner &        233052 &   loss &   0.270182 \\
Communist             &  1932 &  William Z. Foster &        103307 &   loss &   0.261069 \\
Constitution          &  2008 &      Chuck Baldwin &        199750 &   loss &   0.152398 \\
Constitutional Union  &  1860 &          John Bell &        590901 &   loss &  12.639283 \\
Democratic            &  1964 &     Lyndon Johnson &      43127041 &    win &  61.344703 \\
Democratic-Republican &  1824 &     Andrew Jackson &        151271 &   loss &  57.210122 \\
\bottomrule
\end{tabular}

Notice how our code correctly determines that Lyndon Johnson from the
Democratic Party has the highest voter \texttt{\%}.

More generally, \texttt{lambda} functions are used to design custom
aggregation functions that aren't pre-defined by \texttt{pandas}. When
passed to \texttt{.agg}, the \texttt{lambda} function is called once per
column of each group, so its input parameter \texttt{x} is a
\texttt{Series} holding that group's values for one column. Therefore,
it should make sense why \texttt{lambda\ x\ :\ x.iloc{[}0{]}} selects the
first row in each group.

In fact, there are a few different ways to approach this problem. Each
approach has different tradeoffs in terms of readability, performance,
memory consumption, complexity, etc. We've given a few examples below.

\textbf{Note}: Understanding these alternative solutions is not
required. They are given to demonstrate the vast number of
problem-solving approaches in \texttt{pandas}.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Using the idxmax function}
\NormalTok{best\_per\_party }\OperatorTok{=}\NormalTok{ elections.loc[elections.groupby(}\StringTok{\textquotesingle{}Party\textquotesingle{}}\NormalTok{)[}\StringTok{\textquotesingle{}\%\textquotesingle{}}\NormalTok{].idxmax()]}
\NormalTok{best\_per\_party.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &         Candidate &                 Party &  Popular vote & Result &          \% \\
\midrule
22  &  1856 &  Millard Fillmore &              American &        873053 &   loss &  21.554001 \\
115 &  1968 &    George Wallace &  American Independent &       9901118 &   loss &  13.571218 \\
6   &  1832 &      William Wirt &          Anti-Masonic &        100715 &   loss &   7.821583 \\
38  &  1884 &   Benjamin Butler &         Anti-Monopoly &        134294 &   loss &   1.335838 \\
127 &  1980 &    Barry Commoner &              Citizens &        233052 &   loss &   0.270182 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Using the .drop\_duplicates function}
\NormalTok{best\_per\_party2 }\OperatorTok{=}\NormalTok{ elections.sort\_values(}\StringTok{\textquotesingle{}\%\textquotesingle{}}\NormalTok{).drop\_duplicates([}\StringTok{\textquotesingle{}Party\textquotesingle{}}\NormalTok{], keep}\OperatorTok{=}\StringTok{\textquotesingle{}last\textquotesingle{}}\NormalTok{)}
\NormalTok{best\_per\_party2.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &           Candidate &           Party &  Popular vote & Result &         \% \\
\midrule
148 &  1996 &        John Hagelin &     Natural Law &        113670 &   loss &  0.118219 \\
164 &  2008 &       Chuck Baldwin &    Constitution &        199750 &   loss &  0.152398 \\
110 &  1956 &  T. Coleman Andrews &  States' Rights &        107929 &   loss &  0.174883 \\
147 &  1996 &     Howard Phillips &       Taxpayers &        184656 &   loss &  0.192045 \\
136 &  1988 &       Lenora Fulani &    New Alliance &        217221 &   loss &  0.237804 \\
\bottomrule
\end{tabular}

\hypertarget{other-groupby-features}{%
\subsection{\texorpdfstring{Other \texttt{GroupBy}
Features}{Other GroupBy Features}}\label{other-groupby-features}}

There are many aggregation methods we can use with \texttt{.agg}. Some
useful options are:

\begin{itemize}
\tightlist
\item
  \href{https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.GroupBy.max.html}{\texttt{.max}}:
  creates a new DataFrame with the maximum value of each group
\item
  \href{https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.GroupBy.mean.html}{\texttt{.mean}}:
  creates a new DataFrame with the mean value of each group
\item
  \href{https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.GroupBy.size.html}{\texttt{.size}}:
  creates a new Series with the number of entries in each group
\end{itemize}

In fact, these (and other) aggregation functions are so common that
\texttt{pandas} allows for writing shorthand. Instead of explicitly
stating the use of \texttt{.agg}, we can call the function directly on
the \texttt{GroupBy} object.

For example, the following are equivalent:

\begin{itemize}
\tightlist
\item
  \texttt{elections.groupby("Candidate").agg("mean")}
\item
  \texttt{elections.groupby("Candidate").mean()}
\end{itemize}

\hypertarget{the-groupby.filter-function}{%
\subsection{\texorpdfstring{The \texttt{groupby.filter()}
function}{The groupby.filter() function}}\label{the-groupby.filter-function}}

Another common use for \texttt{GroupBy} objects is to filter data by
group.

\texttt{groupby.filter} takes an argument \(\text{f}\), where
\(\text{f}\) is a function that:

\begin{itemize}
\tightlist
\item
  Takes a \texttt{DataFrame} (the subframe for a single group) as input
\item
  Returns a single \texttt{True} or \texttt{False} for the entire
  subframe
\end{itemize}

Subframes for which the function returns \texttt{True} are kept
in the final result, whereas those with a \texttt{False} value are not.
Importantly, \texttt{groupby.filter} is different from
\texttt{groupby.agg} in that the \emph{entire} subframe is returned in
the final DataFrame, not just a single row.

To illustrate how this happens, consider the following \texttt{.filter}
function applied on some arbitrary data. Say we want to identify
``tight'' election years -- that is, we want to find all rows that
correspond to election years where all candidates in that year won a
similar portion of the total vote. Specifically, let's find all rows
corresponding to a year where no candidate won more than 45\% of the
total vote.

An equivalent way of framing this goal is to say:

\begin{itemize}
\tightlist
\item
  Find the years where the maximum \texttt{\%} in that year is less than
  45\%
\item
  Return all DataFrame rows that correspond to these years
\end{itemize}

For each year, we need to find the maximum \texttt{\%} among \emph{all}
rows for that year. If this maximum \texttt{\%} is lower than 45\%, we
will tell \texttt{pandas} to keep all rows corresponding to that year.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.groupby(}\StringTok{"Year"}\NormalTok{).}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ sf: sf[}\StringTok{"\%"}\NormalTok{].}\BuiltInTok{max}\NormalTok{() }\OperatorTok{\textless{}} \DecValTok{45}\NormalTok{).head(}\DecValTok{9}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &             Candidate &                 Party &  Popular vote & Result &          \% \\
\midrule
23 &  1860 &       Abraham Lincoln &            Republican &       1855993 &    win &  39.699408 \\
24 &  1860 &             John Bell &  Constitutional Union &        590901 &   loss &  12.639283 \\
25 &  1860 &  John C. Breckinridge &   Southern Democratic &        848019 &   loss &  18.138998 \\
26 &  1860 &    Stephen A. Douglas &   Northern Democratic &       1380202 &   loss &  29.522311 \\
66 &  1912 &        Eugene V. Debs &             Socialist &        901551 &   loss &   6.004354 \\
67 &  1912 &      Eugene W. Chafin &           Prohibition &        208156 &   loss &   1.386325 \\
68 &  1912 &    Theodore Roosevelt &           Progressive &       4122721 &   loss &  27.457433 \\
69 &  1912 &          William Taft &            Republican &       3486242 &   loss &  23.218466 \\
70 &  1912 &        Woodrow Wilson &            Democratic &       6296284 &    win &  41.933422 \\
\bottomrule
\end{tabular}

What's going on here? In this example, we've defined our filtering
function, \(\text{f}\), to be
\texttt{lambda\ sf:\ sf{[}"\%"{]}.max()\ \textless{}\ 45}. This
filtering function will find the maximum \texttt{"\%"} value among all
entries in the grouped subframe, which we call \texttt{sf}. If the
maximum value is less than 45, then the filter function will return
\texttt{True} and all rows in that grouped subframe will appear in the
final output DataFrame.

Examine the DataFrame above. Notice how, in this preview of the first 9
rows, all entries from the years 1860 and 1912 appear. This means that
in 1860 and 1912, no candidate in that year won more than 45\% of the
total vote.

You may ask: how is the \texttt{groupby.filter} procedure different from
the boolean filtering we've seen previously? Boolean filtering considers
\emph{individual} rows when applying a boolean condition. For example,
the code \texttt{elections{[}elections{[}"\%"{]}\ \textless{}\ 45{]}}
will check the \texttt{"\%"} value of every single row in
\texttt{elections}; if it is less than 45, then that row will be kept in
the output. \texttt{groupby.filter}, in contrast, applies a boolean
condition \emph{across} all rows in a group. If not all rows in that
group satisfy the condition specified by the filter, the entire group
will be discarded in the output.

\hypertarget{aggregating-data-with-pivot-tables}{%
\section{Aggregating Data with Pivot
Tables}\label{aggregating-data-with-pivot-tables}}

We know now that \texttt{.groupby} gives us the ability to group and
aggregate data across our DataFrame. The examples above formed groups
using just one column in the DataFrame. It's possible to group by
multiple columns at once by passing in a list of column names to
\texttt{.groupby}.

Let's consider the \texttt{babynames} dataset from last lecture. In this
problem, we will find the total number of baby names associated with
each sex for each year. To do this, we'll group by \emph{both} the
\texttt{"Year"} and \texttt{"Sex"} columns.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ urllib.request}
\ImportTok{import}\NormalTok{ os.path}

\CommentTok{\# Download data from the web directly}
\NormalTok{data\_url }\OperatorTok{=} \StringTok{"https://www.ssa.gov/oact/babynames/names.zip"}
\NormalTok{local\_filename }\OperatorTok{=} \StringTok{"data/babynames.zip"}
\ControlFlowTok{if} \KeywordTok{not}\NormalTok{ os.path.exists(local\_filename): }\CommentTok{\# if the data exists don\textquotesingle{}t download again}
    \ControlFlowTok{with}\NormalTok{ urllib.request.urlopen(data\_url) }\ImportTok{as}\NormalTok{ resp, }\BuiltInTok{open}\NormalTok{(local\_filename, }\StringTok{\textquotesingle{}wb\textquotesingle{}}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{        f.write(resp.read())}

        
\CommentTok{\# Load data without unzipping the file}
\ImportTok{import}\NormalTok{ zipfile}
\NormalTok{babynames }\OperatorTok{=}\NormalTok{ [] }
\ControlFlowTok{with}\NormalTok{ zipfile.ZipFile(local\_filename, }\StringTok{"r"}\NormalTok{) }\ImportTok{as}\NormalTok{ zf:}
\NormalTok{    data\_files }\OperatorTok{=}\NormalTok{ [f }\ControlFlowTok{for}\NormalTok{ f }\KeywordTok{in}\NormalTok{ zf.filelist }\ControlFlowTok{if}\NormalTok{ f.filename[}\OperatorTok{{-}}\DecValTok{3}\NormalTok{:] }\OperatorTok{==} \StringTok{"txt"}\NormalTok{]}
    \KeywordTok{def}\NormalTok{ extract\_year\_from\_filename(fn):}
        \ControlFlowTok{return} \BuiltInTok{int}\NormalTok{(fn[}\DecValTok{3}\NormalTok{:}\DecValTok{7}\NormalTok{])}
    \ControlFlowTok{for}\NormalTok{ f }\KeywordTok{in}\NormalTok{ data\_files:}
\NormalTok{        year }\OperatorTok{=}\NormalTok{ extract\_year\_from\_filename(f.filename)}
        \ControlFlowTok{with}\NormalTok{ zf.}\BuiltInTok{open}\NormalTok{(f) }\ImportTok{as}\NormalTok{ fp:}
\NormalTok{            df }\OperatorTok{=}\NormalTok{ pd.read\_csv(fp, names}\OperatorTok{=}\NormalTok{[}\StringTok{"Name"}\NormalTok{, }\StringTok{"Sex"}\NormalTok{, }\StringTok{"Count"}\NormalTok{])}
\NormalTok{            df[}\StringTok{"Year"}\NormalTok{] }\OperatorTok{=}\NormalTok{ year}
\NormalTok{            babynames.append(df)}
\NormalTok{babynames }\OperatorTok{=}\NormalTok{ pd.concat(babynames)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrr}
\toprule
{} &       Name & Sex &  Count &  Year \\
\midrule
0 &       Mary &   F &   7065 &  1880 \\
1 &       Anna &   F &   2604 &  1880 \\
2 &       Emma &   F &   2003 &  1880 \\
3 &  Elizabeth &   F &   1939 &  1880 \\
4 &     Minnie &   F &   1746 &  1880 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Find the total number of baby names associated with each sex for each year in the data}
\NormalTok{babynames.groupby([}\StringTok{"Year"}\NormalTok{, }\StringTok{"Sex"}\NormalTok{])[[}\StringTok{"Count"}\NormalTok{]].agg(}\BuiltInTok{sum}\NormalTok{).head(}\DecValTok{6}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llr}
\toprule
     &   &   Count \\
Year & Sex &         \\
\midrule
1880 & F &   90994 \\
     & M &  110490 \\
1881 & F &   91953 \\
     & M &  100737 \\
1882 & F &  107847 \\
     & M &  113686 \\
\bottomrule
\end{tabular}

Notice that both \texttt{"Year"} and \texttt{"Sex"} serve as the index
of the DataFrame (they are both rendered in bold). We've created a
\emph{multi-index} where two different index values, the year and sex, are
used to uniquely identify each row.

This isn't the most intuitive way of representing this data -- and,
because multi-indexes have multiple dimensions in their index, they can
often be difficult to use.

Another strategy to aggregate across two columns is to create a pivot
table. You saw these back in
\href{https://inferentialthinking.com/chapters/08/3/Cross-Classifying_by_More_than_One_Variable.html\#pivot-tables-rearranging-the-output-of-group}{Data
8}. One set of values is used to create the index of the table; another
set is used to define the column names. The values contained in each
cell of the table correspond to the aggregated data for each
index-column pair.

The best way to understand pivot tables is to see one in action. Let's
return to our original goal of summing the total number of names
associated with each combination of year and sex. We'll call the
\texttt{pandas}
\href{https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html}{\texttt{.pivot\_table}}
method to create a new table.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The \textasciigrave{}pivot\_table\textasciigrave{} method is used to generate a Pandas pivot table}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\NormalTok{babynames.pivot\_table(index }\OperatorTok{=} \StringTok{"Year"}\NormalTok{, columns }\OperatorTok{=} \StringTok{"Sex"}\NormalTok{, values }\OperatorTok{=} \StringTok{"Count"}\NormalTok{, aggfunc }\OperatorTok{=}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{).head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrr}
\toprule
Sex &       F &       M \\
Year &         &         \\
\midrule
1880 &   90994 &  110490 \\
1881 &   91953 &  100737 \\
1882 &  107847 &  113686 \\
1883 &  112319 &  104625 \\
1884 &  129019 &  114442 \\
\bottomrule
\end{tabular}

Looks a lot better! Now, our DataFrame is structured with clear
index-column combinations. Each entry in the pivot table represents the
summed count of names for a given combination of \texttt{"Year"} and
\texttt{"Sex"}.

Let's take a closer look at the code implemented above.

\begin{itemize}
\tightlist
\item
  \texttt{index\ =\ "Year"} specifies the column name in the original
  DataFrame that should be used as the index of the pivot table
\item
  \texttt{columns\ =\ "Sex"} specifies the column name in the original
  DataFrame that should be used to generate the columns of the pivot
  table
\item
  \texttt{values\ =\ "Count"} indicates what values from the original
  DataFrame should be used to populate the entry for each index-column
  combination
\item
  \texttt{aggfunc\ =\ np.sum} tells \texttt{pandas} what function to use
  when aggregating the data specified by \texttt{values}. Here, we are
  \texttt{sum}ming the name counts for each pair of \texttt{"Year"} and
  \texttt{"Sex"}
\end{itemize}

We can even include multiple values in the index or columns of our pivot
tables.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{babynames\_pivot }\OperatorTok{=}\NormalTok{ babynames.pivot\_table(}
\NormalTok{    index}\OperatorTok{=}\StringTok{"Year"}\NormalTok{,     }\CommentTok{\# the rows (turned into index)}
\NormalTok{    columns}\OperatorTok{=}\StringTok{"Sex"}\NormalTok{,    }\CommentTok{\# the column values}
\NormalTok{    values}\OperatorTok{=}\NormalTok{[}\StringTok{"Count"}\NormalTok{, }\StringTok{"Name"}\NormalTok{], }
\NormalTok{    aggfunc}\OperatorTok{=}\BuiltInTok{max}\NormalTok{,   }\CommentTok{\# group operation}
\NormalTok{)}
\NormalTok{babynames\_pivot.head(}\DecValTok{6}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrll}
\toprule
{} & \multicolumn{2}{l}{Count} & \multicolumn{2}{l}{Name} \\
Sex &     F &     M &     F &       M \\
Year &       &       &       &         \\
\midrule
1880 &  7065 &  9655 &  Zula &    Zeke \\
1881 &  6919 &  8769 &  Zula &     Zeb \\
1882 &  8148 &  9557 &  Zula &     Zed \\
1883 &  8012 &  8894 &  Zula &    Zeno \\
1884 &  9217 &  9388 &  Zula &  Zollie \\
1885 &  9128 &  8756 &  Zula &  Zollie \\
\bottomrule
\end{tabular}

\hypertarget{joining-tables}{%
\section{Joining Tables}\label{joining-tables}}

When working on data science projects, we're unlikely to have absolutely
all the data we want contained in a single DataFrame -- a real-world
data scientist needs to grapple with data coming from multiple sources.
If we have access to multiple datasets with related information, we can
join two or more tables into a single DataFrame.

To put this into practice, we'll revisit the \texttt{elections} dataset.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{elections.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &          Candidate &                  Party &  Popular vote & Result &          \% \\
\midrule
0 &  1824 &     Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1 &  1824 &  John Quincy Adams &  Democratic-Republican &        113142 &    win &  42.789878 \\
2 &  1828 &     Andrew Jackson &             Democratic &        642806 &    win &  56.203927 \\
3 &  1828 &  John Quincy Adams &    National Republican &        500897 &   loss &  43.796073 \\
4 &  1832 &     Andrew Jackson &             Democratic &        702735 &    win &  54.574789 \\
\bottomrule
\end{tabular}

Say we want to understand the 2020 popularity of the names of each
presidential candidate. To do this, we'll need the combined data of
\texttt{babynames} \emph{and} \texttt{elections}.

We'll start by creating a new column containing the first name of each
presidential candidate. This will help us join each name in
\texttt{elections} to the corresponding name data in \texttt{babynames}.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This \textasciigrave{}str\textasciigrave{} operation splits each candidate\textquotesingle{}s full name at each }
\CommentTok{\# blank space, then takes just the candidate\textquotesingle{}s first name}
\NormalTok{elections[}\StringTok{"First Name"}\NormalTok{] }\OperatorTok{=}\NormalTok{ elections[}\StringTok{"Candidate"}\NormalTok{].}\BuiltInTok{str}\NormalTok{.split().}\BuiltInTok{str}\NormalTok{[}\DecValTok{0}\NormalTok{]}
\NormalTok{elections.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlrl}
\toprule
{} &  Year &          Candidate &                  Party &  Popular vote & Result &          \% & First Name \\
\midrule
0 &  1824 &     Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 &     Andrew \\
1 &  1824 &  John Quincy Adams &  Democratic-Republican &        113142 &    win &  42.789878 &       John \\
2 &  1828 &     Andrew Jackson &             Democratic &        642806 &    win &  56.203927 &     Andrew \\
3 &  1828 &  John Quincy Adams &    National Republican &        500897 &   loss &  43.796073 &       John \\
4 &  1832 &     Andrew Jackson &             Democratic &        702735 &    win &  54.574789 &     Andrew \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Here, we\textquotesingle{}ll only consider \textasciigrave{}babynames\textasciigrave{} data from 2020}
\NormalTok{babynames\_2020 }\OperatorTok{=}\NormalTok{ babynames[babynames[}\StringTok{"Year"}\NormalTok{]}\OperatorTok{==}\DecValTok{2020}\NormalTok{]}
\NormalTok{babynames\_2020.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllrr}
\toprule
{} &       Name & Sex &  Count &  Year \\
\midrule
0 &     Olivia &   F &  17641 &  2020 \\
1 &       Emma &   F &  15656 &  2020 \\
2 &        Ava &   F &  13160 &  2020 \\
3 &  Charlotte &   F &  13065 &  2020 \\
4 &     Sophia &   F &  13036 &  2020 \\
\bottomrule
\end{tabular}

Now, we're ready to join the two tables.
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html}{\texttt{pd.merge}}
is the \texttt{pandas} method used to join DataFrames together. The
\texttt{left} and \texttt{right} parameters are used to specify the
DataFrames to be joined. The \texttt{left\_on} and \texttt{right\_on}
parameters are assigned to the string names of the columns to be used
when performing the join. These two \texttt{on} parameters tell
\texttt{pandas} what values should act as pairing keys to determine
which rows to merge across the DataFrames. We'll talk more about this
idea of a pairing key next lecture.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{merged }\OperatorTok{=}\NormalTok{ pd.merge(left }\OperatorTok{=}\NormalTok{ elections, right }\OperatorTok{=}\NormalTok{ babynames\_2020, }\OperatorTok{\textbackslash{}}
\NormalTok{                  left\_on }\OperatorTok{=} \StringTok{"First Name"}\NormalTok{, right\_on }\OperatorTok{=} \StringTok{"Name"}\NormalTok{)}
\NormalTok{merged.head()}
\CommentTok{\# Notice that pandas automatically specifies \textasciigrave{}Year\_x\textasciigrave{} and \textasciigrave{}Year\_y\textasciigrave{} }
\CommentTok{\# when both merged DataFrames have the same column name to avoid confusion}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlrlllrr}
\toprule
{} &  Year\_x &       Candidate &                  Party &  Popular vote & Result &          \% & First Name &    Name & Sex &  Count &  Year\_y \\
\midrule
0 &    1824 &  Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 &     Andrew &  Andrew &   F &     12 &    2020 \\
1 &    1824 &  Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 &     Andrew &  Andrew &   M &   6036 &    2020 \\
2 &    1828 &  Andrew Jackson &             Democratic &        642806 &    win &  56.203927 &     Andrew &  Andrew &   F &     12 &    2020 \\
3 &    1828 &  Andrew Jackson &             Democratic &        642806 &    win &  56.203927 &     Andrew &  Andrew &   M &   6036 &    2020 \\
4 &    1832 &  Andrew Jackson &             Democratic &        702735 &    win &  54.574789 &     Andrew &  Andrew &   F &     12 &    2020 \\
\bottomrule
\end{tabular}

\bookmarksetup{startatroot}

\hypertarget{data-cleaning-and-eda}{%
\chapter{Data Cleaning and EDA}\label{data-cleaning-and-eda}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Recognize common file formats
\item
  Categorize data by its variable type
\item
  Build awareness of issues with data faithfulness and develop targeted
  solutions
\end{itemize}

\end{tcolorbox}

In the past few lectures, we've learned that \texttt{pandas} is a
toolkit to restructure, modify, and explore a dataset. What we haven't
yet touched on is \emph{how} to make these data transformation
decisions. When we receive a new set of data from the ``real world,''
how do we know what processing we should do to convert this data into a
usable form?

\textbf{Data cleaning}, also called \textbf{data wrangling}, is the
process of transforming raw data to facilitate subsequent analysis. It
is often used to address issues like:

\begin{itemize}
\tightlist
\item
  Unclear structure or formatting
\item
  Missing or corrupted values
\item
  Unit conversions
\item
  \ldots and so on
\end{itemize}

\textbf{Exploratory Data Analysis (EDA)} is the process of understanding
a new dataset. It is an open-ended, informal analysis that involves
familiarizing ourselves with the variables present in the data,
discovering potential hypotheses, and identifying potential issues with
the data. This last point can often motivate further data cleaning to
address any problems with the dataset's format; because of this, EDA and
data cleaning are often thought of as an ``infinite loop,'' with each
process driving the other.

In this lecture, we will examine the key properties of data to consider
when performing data cleaning and EDA. In doing so, we'll develop a
``checklist'' of sorts for you to consider when approaching a new
dataset. Throughout this process, we'll build a deeper understanding of
this early (but very important!) stage of the data science lifecycle.

\hypertarget{structure}{%
\section{Structure}\label{structure}}

\hypertarget{file-format}{%
\subsection{File Format}\label{file-format}}

In the past two \texttt{pandas} lectures, we briefly touched on the idea
of file format: the way data is encoded in a file for storage.
Specifically, our \texttt{elections} and \texttt{babynames} datasets
were stored and loaded as CSVs:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\NormalTok{pd.read\_csv(}\StringTok{"data/elections.csv"}\NormalTok{).head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllrlr}
\toprule
{} &  Year &          Candidate &                  Party &  Popular vote & Result &          \% \\
\midrule
0 &  1824 &     Andrew Jackson &  Democratic-Republican &        151271 &   loss &  57.210122 \\
1 &  1824 &  John Quincy Adams &  Democratic-Republican &        113142 &    win &  42.789878 \\
2 &  1828 &     Andrew Jackson &             Democratic &        642806 &    win &  56.203927 \\
3 &  1828 &  John Quincy Adams &    National Republican &        500897 &   loss &  43.796073 \\
4 &  1832 &     Andrew Jackson &             Democratic &        702735 &    win &  54.574789 \\
\bottomrule
\end{tabular}

CSVs, which stand for \textbf{Comma-Separated Values}, are a common
tabular data format. To better understand the properties of a CSV, let's
take a look at the first few rows of the raw data file to see what it
looks like before being loaded into a DataFrame.

\begin{verbatim}
Year,Candidate,Party,Popular vote,Result,%

1824,Andrew Jackson,Democratic-Republican,151271,loss,57.21012204

1824,John Quincy Adams,Democratic-Republican,113142,win,42.78987796

1828,Andrew Jackson,Democratic,642806,win,56.20392707
\end{verbatim}

Each row, or \textbf{record}, in the data is delimited by a newline.
Each column, or \textbf{field}, in the data is delimited by a comma
(hence, comma-separated!).

Another common file type is the \textbf{TSV (Tab-Separated Values)}. In
a TSV, records are still delimited by a newline, while fields are
delimited by a \texttt{\textbackslash{}t} tab character. A TSV can be
loaded into \texttt{pandas} using \texttt{pd.read\_csv()} with the
\texttt{delimiter} parameter:
\texttt{pd.read\_csv("file\_name.tsv",\ delimiter="\textbackslash{}t")}.
A raw TSV file is shown below.

\begin{verbatim}
Year   Candidate   Party   Popular vote    Result  %

1824    Andrew Jackson  Democratic-Republican   151271  loss    57.21012204

1824    John Quincy Adams   Democratic-Republican   113142  win 42.78987796

1828    Andrew Jackson  Democratic  642806  win 56.20392707
\end{verbatim}

\textbf{JSON (JavaScript Object Notation)} files behave similarly to
Python dictionaries. They can be loaded into \texttt{pandas} using
\texttt{pd.read\_json}. A raw JSON is shown below.

\begin{verbatim}
[

 {

   "Year": 1824,

   "Candidate": "Andrew Jackson",

   "Party": "Democratic-Republican",

   "Popular vote": 151271,

   "Result": "loss",

   "%": 57.21012204

 },
\end{verbatim}

\hypertarget{variable-types}{%
\subsection{Variable Types}\label{variable-types}}

After loading data from a file, it's a good idea to take the time to
understand what pieces of information are encoded in the dataset. In
particular, we want to identify what variable types are present in our
data. Broadly speaking, we can categorize variables into one of two
overarching types.

\textbf{Quantitative variables} describe some numeric quantity or
amount. We can sub-divide quantitative data into:

\begin{itemize}
\tightlist
\item
  \textbf{Continuous quantitative variables}: numeric data that can be
  measured on a continuous scale to arbitrary precision. Continuous
  variables do not have a strict set of possible values -- they can be
  recorded to any number of decimal places. For example, weights, GPA,
  or CO2 concentrations.
\item
  \textbf{Discrete quantitative variables}: numeric data that can only
  take on a finite set of possible values. For example, someone's age or
  number of siblings.
\end{itemize}

\textbf{Qualitative variables}, also known as \textbf{categorical
variables}, describe data that isn't measuring some quantity or amount.
The sub-categories of categorical data are:

\begin{itemize}
\tightlist
\item
  \textbf{Ordinal qualitative variables}: categories with ordered
  levels. Specifically, ordinal variables are those where the difference
  between levels has no consistent, quantifiable meaning. For example, a
  Yelp rating or set of income brackets.
\item
  \textbf{Nominal qualitative variables}: categories with no specific
  order. For example, someone's political affiliation or Cal ID number.
\end{itemize}

\begin{figure}

{\centering \includegraphics{eda/images/variable.png}

}

\caption{Classification of variable types}

\end{figure}

\hypertarget{primary-and-foreign-keys}{%
\subsection{Primary and Foreign Keys}\label{primary-and-foreign-keys}}

Last time, we introduced \texttt{.merge} as the \texttt{pandas} method
for joining multiple DataFrames together. In our discussion of joins, we
touched on the idea of using a ``key'' to determine what rows should be
merged from each table. Let's take a moment to examine this idea more
closely.

The \textbf{primary key} is the column or set of columns in a table that
determine the values of the remaining columns. It can be thought of as
the unique identifier for each individual row in the table. For example,
a table of Data 100 students might use each student's Cal ID as the
primary key.

\begin{tabular}{lrll}
\toprule
{} &      Cal ID &   Name &             Major \\
\midrule
0 &  3034619471 &   Oski &      Data Science \\
1 &  3035619472 &  Ollie &  Computer Science \\
2 &  3025619473 &  Orrie &      Data Science \\
3 &  3046789372 &  Ollie &         Economics \\
\bottomrule
\end{tabular}

The \textbf{foreign key} is the column or set of columns in a table that
reference primary keys in other tables. Knowing a dataset's foreign keys
can be useful when assigning the \texttt{left\_on} and
\texttt{right\_on} parameters of \texttt{.merge}. In the table of office
hour tickets below, \texttt{"Cal\ ID"} is a foreign key referencing the
previous table.

\begin{tabular}{lrrl}
\toprule
{} &  OH Request &      Cal ID &  Question \\
\midrule
0 &           1 &  3034619471 &   HW 2 Q1 \\
1 &           2 &  3035619472 &   HW 2 Q3 \\
2 &           3 &  3025619473 &  Lab 3 Q4 \\
3 &           4 &  3035619472 &   HW 2 Q7 \\
\bottomrule
\end{tabular}

\hypertarget{granularity-scope-and-temporality}{%
\section{Granularity, Scope, and
Temporality}\label{granularity-scope-and-temporality}}

After understanding the structure of the dataset, the next task is to
determine what exactly the data represents. We'll do so by considering
the data's granularity, scope, and temporality.

The \textbf{granularity} of a dataset is the level of detail included in
the data. To determine the data's granularity, ask: what does each row
in the dataset represent? Fine-grained data contains a high level of
detail, with a single row representing a small individual unit. For
example, each record may represent one person. Coarse-grained data is
encoded such that a single row represents a large individual unit -- for
example, each record may represent a group of people.

The \textbf{scope} of a dataset is the subset of the population covered
by the data. If we were investigating student performance in Data
Science courses, a dataset with narrow scope might encompass all
students enrolled in Data 100; a dataset with expansive scope might
encompass all students in California.

The \textbf{temporality} of a dataset describes the time period over
which the data was collected. To fully understand the temporality of the
data, it may be necessary to standardize timezones or inspect recurring
time-based trends in the data (Do patterns recur in 24-hour cycles?
Over the course of a month? Seasonally?).

\hypertarget{faithfulness}{%
\section{Faithfulness}\label{faithfulness}}

At this stage in our data cleaning and EDA workflow, we've achieved
quite a lot: we've identified how our data is structured, come to terms
with what information it encodes, and gained insight as to how it was
generated. Throughout this process, we should always recall the original
intent of our work in Data Science -- to use data to better understand
and model the real world. To achieve this goal, we need to ensure that
the data we use is faithful to reality; that is, that our data
accurately captures the ``real world.''

Data used in research or industry is often ``messy'' -- there may be
errors or inaccuracies that impact the faithfulness of the dataset.
Signs that data may not be faithful include:

\begin{itemize}
\tightlist
\item
  Unrealistic or ``incorrect'' values, such as negative counts,
  locations that don't exist, or dates set in the future
\item
  Violations of obvious dependencies, like an age that does not match a
  birthday
\item
  Clear signs that data was entered by hand, which can lead to spelling
  errors or fields that are incorrectly shifted
\item
  Signs of data falsification, such as fake email addresses or repeated
  use of the same names
\item
  Duplicated records or fields containing the same information
\end{itemize}

A common issue encountered with real-world datasets is that of missing
data. One strategy to resolve this is to simply drop any records with
missing values from the dataset. This does, however, introduce the risk
of inducing biases -- it is possible that the missing or corrupt records
may be systematically related to some feature of interest in the data.

Another method to address missing data is to perform
\textbf{imputation}: infer the missing values using other data available
in the dataset. There is a wide variety of imputation techniques that
can be implemented; some of the most common are listed below.

\begin{itemize}
\tightlist
\item
  Average imputation: replace missing values with the average value for
  that field
\item
  Hot deck imputation: replace missing values with some random value
\item
  Regression imputation: develop a model to predict missing values
\item
  Multiple imputation: replace missing values with multiple random
  values
\end{itemize}

Regardless of the strategy used to deal with missing data, we should
think carefully about \emph{why} particular records or fields may be
missing -- this can help inform whether or not the absence of these
values is significant in some meaningful way.

\bookmarksetup{startatroot}

\hypertarget{eda-demo-tuberculosis-in-the-united-states}{%
\chapter{EDA Demo: Tuberculosis in the United
States}\label{eda-demo-tuberculosis-in-the-united-states}}

Now, let's follow this data-cleaning and EDA workflow to see what we can
say about the presence of Tuberculosis in the United States!

We will examine the data included in the
\href{https://www.cdc.gov/mmwr/volumes/71/wr/mm7112a1.htm?s_cid=mm7112a1_w\#T1_down}{original
CDC article} published in 2022.

\hypertarget{csvs-and-field-names}{%
\section{CSVs and Field Names}\label{csvs-and-field-names}}

Suppose Table 1 was saved as a CSV file located in
\texttt{data/cdc\_tuberculosis.csv}.

We can then explore the CSV (which is a text file, and does not contain
binary-encoded data) in many ways:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Using a text editor like emacs, vim, VSCode, etc.
\item
  Opening the CSV directly in DataHub (read-only), Excel, Google Sheets,
  etc.
\item
  The Python file object
\item
  pandas, using \texttt{pd.read\_csv()}
\end{enumerate}

1, 2. Let's start with the first two so we really solidify the idea of a
CSV as \textbf{rectangular data (i.e., tabular data) stored as
comma-separated values}.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Next, let's try using the Python file object. Let's check out the
  first four lines:
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{"data/cdc\_tuberculosis.csv"}\NormalTok{, }\StringTok{"r"}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    i }\OperatorTok{=} \DecValTok{0}
    \ControlFlowTok{for}\NormalTok{ row }\KeywordTok{in}\NormalTok{ f:}
        \BuiltInTok{print}\NormalTok{(row)}
\NormalTok{        i }\OperatorTok{+=} \DecValTok{1}
        \ControlFlowTok{if}\NormalTok{ i }\OperatorTok{\textgreater{}} \DecValTok{3}\NormalTok{:}
            \ControlFlowTok{break}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
,No. of TB cases,,,TB incidence,,

U.S. jurisdiction,2019,2020,2021,2019,2020,2021

Total,"8,900","7,173","7,860",2.71,2.16,2.37

Alabama,87,72,92,1.77,1.43,1.83
\end{verbatim}

Whoa, why are there blank lines interspersed between the lines of the
CSV?

You may recall that all line breaks in text files are encoded as the
special newline character \texttt{\textbackslash{}n}. Python's
\texttt{print()} prints each string (including the newline), and an
additional newline on top of that.

If you're curious, we can use the \texttt{repr()} function to return the
raw string with all special characters:

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{"data/cdc\_tuberculosis.csv"}\NormalTok{, }\StringTok{"r"}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    i }\OperatorTok{=} \DecValTok{0}
    \ControlFlowTok{for}\NormalTok{ row }\KeywordTok{in}\NormalTok{ f:}
        \BuiltInTok{print}\NormalTok{(}\BuiltInTok{repr}\NormalTok{(row)) }\CommentTok{\# print raw strings}
\NormalTok{        i }\OperatorTok{+=} \DecValTok{1}
        \ControlFlowTok{if}\NormalTok{ i }\OperatorTok{\textgreater{}} \DecValTok{3}\NormalTok{:}
            \ControlFlowTok{break}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
',No. of TB cases,,,TB incidence,,\n'
'U.S. jurisdiction,2019,2020,2021,2019,2020,2021\n'
'Total,"8,900","7,173","7,860",2.71,2.16,2.37\n'
'Alabama,87,72,92,1.77,1.43,1.83\n'
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Finally, let's see the tried-and-true Data 100 approach: pandas.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/cdc\_tuberculosis.csv"}\NormalTok{)}
\NormalTok{tb\_df.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllllrrr}
\toprule
{} &         Unnamed: 0 & No. of TB cases & Unnamed: 2 & Unnamed: 3 &  TB incidence &  Unnamed: 5 &  Unnamed: 6 \\
\midrule
0 &  U.S. jurisdiction &            2019 &       2020 &       2021 &       2019.00 &     2020.00 &     2021.00 \\
1 &              Total &           8,900 &      7,173 &      7,860 &          2.71 &        2.16 &        2.37 \\
2 &            Alabama &              87 &         72 &         92 &          1.77 &        1.43 &        1.83 \\
3 &             Alaska &              58 &         58 &         58 &          7.91 &        7.92 &        7.92 \\
4 &            Arizona &             183 &        136 &        129 &          2.51 &        1.89 &        1.77 \\
\bottomrule
\end{tabular}

Wait, what's up with the ``Unnamed'' column names? And the first row,
for that matter?

Congratulations -- you're ready to wrangle your data. Because of how
things are stored, we'll need to clean the data a bit to name our
columns better.

A reasonable first step is to identify the row with the right header.
The \texttt{pd.read\_csv()} function
(\href{https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html}{documentation})
has the convenient \texttt{header} parameter:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/cdc\_tuberculosis.csv"}\NormalTok{, header}\OperatorTok{=}\DecValTok{1}\NormalTok{) }\CommentTok{\# row index}
\NormalTok{tb\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllllrrr}
\toprule
{} & U.S. jurisdiction &   2019 &   2020 &   2021 &  2019.1 &  2020.1 &  2021.1 \\
\midrule
0 &             Total &  8,900 &  7,173 &  7,860 &    2.71 &    2.16 &    2.37 \\
1 &           Alabama &     87 &     72 &     92 &    1.77 &    1.43 &    1.83 \\
2 &            Alaska &     58 &     58 &     58 &    7.91 &    7.92 &    7.92 \\
3 &           Arizona &    183 &    136 &    129 &    2.51 &    1.89 &    1.77 \\
4 &          Arkansas &     64 &     59 &     69 &    2.12 &    1.96 &    2.28 \\
\bottomrule
\end{tabular}

Wait\ldots but now we can't differentiate between the ``Number of TB
cases'' and ``TB incidence'' year columns. pandas has tried to make our
lives easier by automatically adding ``.1'' to the latter columns, but
this doesn't help us as humans understand the data.

We can rename the columns manually with \texttt{df.rename()}
(\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html?highlight=rename\#pandas.DataFrame.rename}{documentation}):

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rename\_dict }\OperatorTok{=}\NormalTok{ \{}\StringTok{\textquotesingle{}2019\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}TB cases 2019\textquotesingle{}}\NormalTok{,}
               \StringTok{\textquotesingle{}2020\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}TB cases 2020\textquotesingle{}}\NormalTok{,}
               \StringTok{\textquotesingle{}2021\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}TB cases 2021\textquotesingle{}}\NormalTok{,}
               \StringTok{\textquotesingle{}2019.1\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}TB incidence 2019\textquotesingle{}}\NormalTok{,}
               \StringTok{\textquotesingle{}2020.1\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}TB incidence 2020\textquotesingle{}}\NormalTok{,}
               \StringTok{\textquotesingle{}2021.1\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}TB incidence 2021\textquotesingle{}}\NormalTok{\}}
\NormalTok{tb\_df }\OperatorTok{=}\NormalTok{ tb\_df.rename(columns}\OperatorTok{=}\NormalTok{rename\_dict)}
\NormalTok{tb\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lllllrrr}
\toprule
{} & U.S. jurisdiction & TB cases 2019 & TB cases 2020 & TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 \\
\midrule
0 &             Total &         8,900 &         7,173 &         7,860 &               2.71 &               2.16 &               2.37 \\
1 &           Alabama &            87 &            72 &            92 &               1.77 &               1.43 &               1.83 \\
2 &            Alaska &            58 &            58 &            58 &               7.91 &               7.92 &               7.92 \\
3 &           Arizona &           183 &           136 &           129 &               2.51 &               1.89 &               1.77 \\
4 &          Arkansas &            64 &            59 &            69 &               2.12 &               1.96 &               2.28 \\
\bottomrule
\end{tabular}

\hypertarget{record-granularity}{%
\section{Record Granularity}\label{record-granularity}}

You might already be wondering: What's up with that first record?

Row 0 is what we call a \textbf{rollup record}, or summary record. It's
often useful when displaying tables to humans. The \textbf{granularity}
of record 0 (Totals) vs the rest of the records (States) is different.

Okay, EDA step two. How was the rollup record aggregated?

Let's check if Total TB cases is the sum of all state TB cases. If we
sum over all rows, we should get \textbf{2x} the total cases in each of
our TB cases by year (why?).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df.}\BuiltInTok{sum}\NormalTok{(axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &                                                  0 \\
\midrule
U.S. jurisdiction &  TotalAlabamaAlaskaArizonaArkansasCaliforniaCol... \\
TB cases 2019     &  8,9008758183642,111666718245583029973261085237... \\
TB cases 2020     &  7,1737258136591,706525417194122219282169239376... \\
TB cases 2021     &  7,8609258129691,750585443194992281064255127494... \\
TB incidence 2019 &                                             109.94 \\
TB incidence 2020 &                                              93.09 \\
TB incidence 2021 &                                             102.94 \\
\bottomrule
\end{tabular}

Whoa, what's going on? Check out the column types:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df.dtypes}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &        0 \\
\midrule
U.S. jurisdiction &   object \\
TB cases 2019     &   object \\
TB cases 2020     &   object \\
TB cases 2021     &   object \\
TB incidence 2019 &  float64 \\
TB incidence 2020 &  float64 \\
TB incidence 2021 &  float64 \\
\bottomrule
\end{tabular}

Looks like those commas are causing all TB cases to be read as the
\texttt{object} datatype, or \textbf{storage type} (close to the Python
string datatype), so pandas is concatenating strings instead of adding
integers.

Fortunately, \texttt{read\_csv} also has a \texttt{thousands} parameter
(\href{https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html}{documentation}):

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# improve readability: chaining method calls with outer parentheses/line breaks}
\NormalTok{tb\_df }\OperatorTok{=}\NormalTok{ (}
\NormalTok{    pd.read\_csv(}\StringTok{"data/cdc\_tuberculosis.csv"}\NormalTok{, header}\OperatorTok{=}\DecValTok{1}\NormalTok{, thousands}\OperatorTok{=}\StringTok{\textquotesingle{},\textquotesingle{}}\NormalTok{)}
\NormalTok{    .rename(columns}\OperatorTok{=}\NormalTok{rename\_dict)}
\NormalTok{)}
\NormalTok{tb\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrr}
\toprule
{} & U.S. jurisdiction &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 \\
\midrule
0 &             Total &           8900 &           7173 &           7860 &               2.71 &               2.16 &               2.37 \\
1 &           Alabama &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 \\
2 &            Alaska &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 \\
3 &           Arizona &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 \\
4 &          Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df.}\BuiltInTok{sum}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &                                                  0 \\
\midrule
U.S. jurisdiction &  TotalAlabamaAlaskaArizonaArkansasCaliforniaCol... \\
TB cases 2019     &                                              17800 \\
TB cases 2020     &                                              14346 \\
TB cases 2021     &                                              15720 \\
TB incidence 2019 &                                             109.94 \\
TB incidence 2020 &                                              93.09 \\
TB incidence 2021 &                                             102.94 \\
\bottomrule
\end{tabular}

The Total TB cases look right. Phew!

Let's just look at the records with \textbf{state-level granularity}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{state\_tb\_df }\OperatorTok{=}\NormalTok{ tb\_df[}\DecValTok{1}\NormalTok{:]}
\NormalTok{state\_tb\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrr}
\toprule
{} & U.S. jurisdiction &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 \\
\midrule
1 &           Alabama &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 \\
2 &            Alaska &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 \\
3 &           Arizona &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 \\
4 &          Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 \\
5 &        California &           2111 &           1706 &           1750 &               5.35 &               4.32 &               4.46 \\
\bottomrule
\end{tabular}

\hypertarget{gather-more-data-census}{%
\section{Gather More Data: Census}\label{gather-more-data-census}}

U.S. Census population estimates
\href{https://www.census.gov/data/tables/time-series/demo/popest/2010s-state-total.html}{source}
(2019),
\href{https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-total.html}{source}
(2020-2021).

Running the below cells cleans the data. There are a few new methods
here:

\begin{itemize}
\tightlist
\item
  \texttt{df.convert\_dtypes()}
  (\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.convert_dtypes.html}{documentation})
  conveniently converts all float dtypes into ints and is out of scope
  for the class.
\item
  \texttt{df.dropna()}
  (\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html}{documentation})
  will be explained in more detail next time.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 2010s census data}
\NormalTok{census\_2010s\_df }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/nst{-}est2019{-}01.csv"}\NormalTok{, header}\OperatorTok{=}\DecValTok{3}\NormalTok{, thousands}\OperatorTok{=}\StringTok{","}\NormalTok{)}
\NormalTok{census\_2010s\_df }\OperatorTok{=}\NormalTok{ (}
\NormalTok{    census\_2010s\_df}
\NormalTok{    .reset\_index()}
\NormalTok{    .drop(columns}\OperatorTok{=}\NormalTok{[}\StringTok{"index"}\NormalTok{, }\StringTok{"Census"}\NormalTok{, }\StringTok{"Estimates Base"}\NormalTok{])}
\NormalTok{    .rename(columns}\OperatorTok{=}\NormalTok{\{}\StringTok{"Unnamed: 0"}\NormalTok{: }\StringTok{"Geographic Area"}\NormalTok{\})}
\NormalTok{    .convert\_dtypes()                 }\CommentTok{\# "smart" converting of columns, use at your own risk}
\NormalTok{    .dropna()                         }\CommentTok{\# we\textquotesingle{}ll introduce this next time}
\NormalTok{)}
\NormalTok{census\_2010s\_df[}\StringTok{\textquotesingle{}Geographic Area\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ census\_2010s\_df[}\StringTok{\textquotesingle{}Geographic Area\textquotesingle{}}\NormalTok{].}\BuiltInTok{str}\NormalTok{.strip(}\StringTok{\textquotesingle{}.\textquotesingle{}}\NormalTok{)}

\CommentTok{\# with pd.option\_context(\textquotesingle{}display.min\_rows\textquotesingle{}, 30): \# shows more rows}
\CommentTok{\#     display(census\_2010s\_df)}
    
\NormalTok{census\_2010s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrrrrrr}
\toprule
{} & Geographic Area &       2010 &       2011 &       2012 &       2013 &       2014 &       2015 &       2016 &       2017 &       2018 &       2019 \\
\midrule
0 &   United States &  309321666 &  311556874 &  313830990 &  315993715 &  318301008 &  320635163 &  322941311 &  324985539 &  326687501 &  328239523 \\
1 &       Northeast &   55380134 &   55604223 &   55775216 &   55901806 &   56006011 &   56034684 &   56042330 &   56059240 &   56046620 &   55982803 \\
2 &         Midwest &   66974416 &   67157800 &   67336743 &   67560379 &   67745167 &   67860583 &   67987540 &   68126781 &   68236628 &   68329004 \\
3 &           South &  114866680 &  116006522 &  117241208 &  118364400 &  119624037 &  120997341 &  122351760 &  123542189 &  124569433 &  125580448 \\
4 &            West &   72100436 &   72788329 &   73477823 &   74167130 &   74925793 &   75742555 &   76559681 &   77257329 &   77834820 &   78347268 \\
\bottomrule
\end{tabular}

Occasionally, you will want to modify code that you have imported. To
reimport those modifications you can either use the Python
\texttt{importlib} library:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ importlib }\ImportTok{import} \BuiltInTok{reload}
\BuiltInTok{reload}\NormalTok{(utils)}
\end{Highlighting}
\end{Shaded}

or use IPython magic, which will intelligently import code when files
change:

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\%}\NormalTok{load\_ext autoreload}
\OperatorTok{\%}\NormalTok{autoreload }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# census 2020s data}
\NormalTok{census\_2020s\_df }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/NST{-}EST2022{-}POP.csv"}\NormalTok{, header}\OperatorTok{=}\DecValTok{3}\NormalTok{, thousands}\OperatorTok{=}\StringTok{","}\NormalTok{)}
\NormalTok{census\_2020s\_df }\OperatorTok{=}\NormalTok{ (}
\NormalTok{    census\_2020s\_df}
\NormalTok{    .reset\_index()}
\NormalTok{    .drop(columns}\OperatorTok{=}\NormalTok{[}\StringTok{"index"}\NormalTok{, }\StringTok{"Unnamed: 1"}\NormalTok{])}
\NormalTok{    .rename(columns}\OperatorTok{=}\NormalTok{\{}\StringTok{"Unnamed: 0"}\NormalTok{: }\StringTok{"Geographic Area"}\NormalTok{\})}
\NormalTok{    .convert\_dtypes()                 }\CommentTok{\# "smart" converting of columns, use at your own risk}
\NormalTok{    .dropna()                         }\CommentTok{\# we\textquotesingle{}ll introduce this next time}
\NormalTok{)}
\NormalTok{census\_2020s\_df[}\StringTok{\textquotesingle{}Geographic Area\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ census\_2020s\_df[}\StringTok{\textquotesingle{}Geographic Area\textquotesingle{}}\NormalTok{].}\BuiltInTok{str}\NormalTok{.strip(}\StringTok{\textquotesingle{}.\textquotesingle{}}\NormalTok{)}

\NormalTok{census\_2020s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrr}
\toprule
{} & Geographic Area &       2020 &       2021 &       2022 \\
\midrule
0 &   United States &  331511512 &  332031554 &  333287557 \\
1 &       Northeast &   57448898 &   57259257 &   57040406 \\
2 &         Midwest &   68961043 &   68836505 &   68787595 \\
3 &           South &  126450613 &  127346029 &  128716192 \\
4 &            West &   78650958 &   78589763 &   78743364 \\
\bottomrule
\end{tabular}

\hypertarget{joining-data-on-primary-keys}{%
\section{Joining Data on Primary
Keys}\label{joining-data-on-primary-keys}}

Time to \texttt{merge}! Here we use the DataFrame method
\texttt{df1.merge(right=df2,\ ...)} on DataFrame \texttt{df1}
(\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html}{documentation}).
Contrast this with the function
\texttt{pd.merge(left=df1,\ right=df2,\ ...)}
(\href{https://pandas.pydata.org/docs/reference/api/pandas.merge.html?highlight=pandas\%20merge\#pandas.merge}{documentation}).
Feel free to use either.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# merge TB dataframe with two US census dataframes}
\NormalTok{tb\_census\_df }\OperatorTok{=}\NormalTok{ (}
\NormalTok{    tb\_df}
\NormalTok{    .merge(right}\OperatorTok{=}\NormalTok{census\_2010s\_df,}
\NormalTok{           left\_on}\OperatorTok{=}\StringTok{"U.S. jurisdiction"}\NormalTok{, right\_on}\OperatorTok{=}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{    .merge(right}\OperatorTok{=}\NormalTok{census\_2020s\_df,}
\NormalTok{           left\_on}\OperatorTok{=}\StringTok{"U.S. jurisdiction"}\NormalTok{, right\_on}\OperatorTok{=}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{)}
\NormalTok{tb\_census\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrrlrrrrrrrrrrlrrr}
\toprule
{} & U.S. jurisdiction &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 & Geographic Area\_x &      2010 &      2011 &      2012 &      2013 &      2014 &      2015 &      2016 &      2017 &      2018 &      2019 & Geographic Area\_y &      2020 &      2021 &      2022 \\
\midrule
0 &           Alabama &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 &           Alabama &   4785437 &   4799069 &   4815588 &   4830081 &   4841799 &   4852347 &   4863525 &   4874486 &   4887681 &   4903185 &           Alabama &   5031362 &   5049846 &   5074296 \\
1 &            Alaska &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 &            Alaska &    713910 &    722128 &    730443 &    737068 &    736283 &    737498 &    741456 &    739700 &    735139 &    731545 &            Alaska &    732923 &    734182 &    733583 \\
2 &           Arizona &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 &           Arizona &   6407172 &   6472643 &   6554978 &   6632764 &   6730413 &   6829676 &   6941072 &   7044008 &   7158024 &   7278717 &           Arizona &   7179943 &   7264877 &   7359197 \\
3 &          Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 &          Arkansas &   2921964 &   2940667 &   2952164 &   2959400 &   2967392 &   2978048 &   2989918 &   3001345 &   3009733 &   3017804 &          Arkansas &   3014195 &   3028122 &   3045637 \\
4 &        California &           2111 &           1706 &           1750 &               5.35 &               4.32 &               4.46 &        California &  37319502 &  37638369 &  37948800 &  38260787 &  38596972 &  38918045 &  39167117 &  39358497 &  39461588 &  39512223 &        California &  39501653 &  39142991 &  39029342 \\
\bottomrule
\end{tabular}

This is a little unwieldy. We could either drop the unneeded columns
now, or just merge on smaller census DataFrames. Let's do the latter.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# try merging again, but cleaner this time}
\NormalTok{tb\_census\_df }\OperatorTok{=}\NormalTok{ (}
\NormalTok{    tb\_df}
\NormalTok{    .merge(right}\OperatorTok{=}\NormalTok{census\_2010s\_df[[}\StringTok{"Geographic Area"}\NormalTok{, }\StringTok{"2019"}\NormalTok{]],}
\NormalTok{           left\_on}\OperatorTok{=}\StringTok{"U.S. jurisdiction"}\NormalTok{, right\_on}\OperatorTok{=}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{    .drop(columns}\OperatorTok{=}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{    .merge(right}\OperatorTok{=}\NormalTok{census\_2020s\_df[[}\StringTok{"Geographic Area"}\NormalTok{, }\StringTok{"2020"}\NormalTok{, }\StringTok{"2021"}\NormalTok{]],}
\NormalTok{           left\_on}\OperatorTok{=}\StringTok{"U.S. jurisdiction"}\NormalTok{, right\_on}\OperatorTok{=}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{    .drop(columns}\OperatorTok{=}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{)}
\NormalTok{tb\_census\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrrrrr}
\toprule
{} & U.S. jurisdiction &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 &      2019 &      2020 &      2021 \\
\midrule
0 &           Alabama &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 &   4903185 &   5031362 &   5049846 \\
1 &            Alaska &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 &    731545 &    732923 &    734182 \\
2 &           Arizona &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 &   7278717 &   7179943 &   7264877 \\
3 &          Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 &   3017804 &   3014195 &   3028122 \\
4 &        California &           2111 &           1706 &           1750 &               5.35 &               4.32 &               4.46 &  39512223 &  39501653 &  39142991 \\
\bottomrule
\end{tabular}

\hypertarget{reproducing-data-compute-incidence}{%
\section{Reproducing Data: Compute
Incidence}\label{reproducing-data-compute-incidence}}

Let's recompute incidence to make sure we know where the original CDC
numbers came from.

From the
\href{https://www.cdc.gov/mmwr/volumes/71/wr/mm7112a1.htm?s_cid=mm7112a1_w\#T1_down}{CDC
report}: TB incidence is computed as ``Cases per 100,000 persons using
mid-year population estimates from the U.S. Census Bureau.''

If we define a group as 100,000 people, then we can compute the TB
incidence for a given state population as

\[\begin{aligned}
\text{TB incidence} &= \frac{\text{TB cases in population}}{\text{groups in population}} = \frac{\text{TB cases in population}}{\text{population}/100000} \\
&= \frac{\text{TB cases in population}}{\text{population}} \times 100000
\end{aligned}\]

Let's try this for 2019:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_census\_df[}\StringTok{"recompute incidence 2019"}\NormalTok{] }\OperatorTok{=}\NormalTok{ tb\_census\_df[}\StringTok{"TB cases 2019"}\NormalTok{]}\OperatorTok{/}\NormalTok{tb\_census\_df[}\StringTok{"2019"}\NormalTok{]}\OperatorTok{*}\DecValTok{100000}
\NormalTok{tb\_census\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrrrrrr}
\toprule
{} & U.S. jurisdiction &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 &      2019 &      2020 &      2021 &  recompute incidence 2019 \\
\midrule
0 &           Alabama &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 &   4903185 &   5031362 &   5049846 &                  1.774357 \\
1 &            Alaska &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 &    731545 &    732923 &    734182 &                  7.928425 \\
2 &           Arizona &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 &   7278717 &   7179943 &   7264877 &                  2.514179 \\
3 &          Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 &   3017804 &   3014195 &   3028122 &                  2.120747 \\
4 &        California &           2111 &           1706 &           1750 &               5.35 &               4.32 &               4.46 &  39512223 &  39501653 &  39142991 &                  5.342651 \\
\bottomrule
\end{tabular}

Awesome!!!

Let's use a for-loop and Python format strings to compute TB incidence
for all years. Python f-strings are just used for the purposes of this
demo, but they're handy to know when you explore data beyond this course
(\href{https://docs.python.org/3/tutorial/inputoutput.html}{Python
documentation}).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# recompute incidence for all years}
\ControlFlowTok{for}\NormalTok{ year }\KeywordTok{in}\NormalTok{ [}\DecValTok{2019}\NormalTok{, }\DecValTok{2020}\NormalTok{, }\DecValTok{2021}\NormalTok{]:}
\NormalTok{    tb\_census\_df[}\SpecialStringTok{f"recompute incidence }\SpecialCharTok{\{}\NormalTok{year}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{] }\OperatorTok{=}\NormalTok{ tb\_census\_df[}\SpecialStringTok{f"TB cases }\SpecialCharTok{\{}\NormalTok{year}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{]}\OperatorTok{/}\NormalTok{tb\_census\_df[}\SpecialStringTok{f"}\SpecialCharTok{\{}\NormalTok{year}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{]}\OperatorTok{*}\DecValTok{100000}
\NormalTok{tb\_census\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
{} & U.S. jurisdiction &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 &      2019 &      2020 &      2021 &  recompute incidence 2019 &  recompute incidence 2020 &  recompute incidence 2021 \\
\midrule
0 &           Alabama &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 &   4903185 &   5031362 &   5049846 &                  1.774357 &                  1.431024 &                  1.821838 \\
1 &            Alaska &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 &    731545 &    732923 &    734182 &                  7.928425 &                  7.913519 &                  7.899949 \\
2 &           Arizona &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 &   7278717 &   7179943 &   7264877 &                  2.514179 &                  1.894165 &                  1.775667 \\
3 &          Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 &   3017804 &   3014195 &   3028122 &                  2.120747 &                  1.957405 &                  2.278640 \\
4 &        California &           2111 &           1706 &           1750 &               5.35 &               4.32 &               4.46 &  39512223 &  39501653 &  39142991 &                  5.342651 &                  4.318807 &                  4.470788 \\
\bottomrule
\end{tabular}

These numbers look pretty close! There are a few differences in the
hundredths place, particularly in 2021. It may be useful to further
explore the reasons behind these discrepancies.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_census\_df.describe()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 &          2019 &          2020 &          2021 &  recompute incidence 2019 &  recompute incidence 2020 &  recompute incidence 2021 \\
\midrule
count &      51.000000 &      51.000000 &      51.000000 &          51.000000 &          51.000000 &          51.000000 &  5.100000e+01 &  5.100000e+01 &  5.100000e+01 &                 51.000000 &                 51.000000 &                 51.000000 \\
mean  &     174.509804 &     140.647059 &     154.117647 &           2.102549 &           1.782941 &           1.971961 &  6.436069e+06 &  6.500226e+06 &  6.510423e+06 &                  2.104969 &                  1.784655 &                  1.969928 \\
std   &     341.738752 &     271.055775 &     286.781007 &           1.498745 &           1.337414 &           1.478468 &  7.360660e+06 &  7.408168e+06 &  7.394300e+06 &                  1.500236 &                  1.338263 &                  1.474929 \\
min   &       1.000000 &       0.000000 &       2.000000 &           0.170000 &           0.000000 &           0.210000 &  5.787590e+05 &  5.776050e+05 &  5.794830e+05 &                  0.172783 &                  0.000000 &                  0.210049 \\
25\%   &      25.500000 &      29.000000 &      23.000000 &           1.295000 &           1.210000 &           1.235000 &  1.789606e+06 &  1.820311e+06 &  1.844920e+06 &                  1.297485 &                  1.211433 &                  1.233905 \\
50\%   &      70.000000 &      67.000000 &      69.000000 &           1.800000 &           1.520000 &           1.700000 &  4.467673e+06 &  4.507445e+06 &  4.506589e+06 &                  1.808606 &                  1.521612 &                  1.694502 \\
75\%   &     180.500000 &     139.000000 &     150.000000 &           2.575000 &           1.990000 &           2.220000 &  7.446805e+06 &  7.451987e+06 &  7.502811e+06 &                  2.577577 &                  1.993607 &                  2.219482 \\
max   &    2111.000000 &    1706.000000 &    1750.000000 &           7.910000 &           7.920000 &           7.920000 &  3.951222e+07 &  3.950165e+07 &  3.914299e+07 &                  7.928425 &                  7.913519 &                  7.899949 \\
\bottomrule
\end{tabular}

\hypertarget{bonus-eda-reproducing-the-reported-statistic}{%
\section{Bonus EDA: Reproducing the reported
statistic}\label{bonus-eda-reproducing-the-reported-statistic}}

\textbf{How do we reproduce that reported statistic in the original
\href{https://www.cdc.gov/mmwr/volumes/71/wr/mm7112a1.htm?s_cid=mm7112a1_w}{CDC
report}?}

\begin{quote}
Reported TB incidence (cases per 100,000 persons) increased
\textbf{9.4\%}, from \textbf{2.2} during 2020 to \textbf{2.4} during
2021 but was lower than incidence during 2019 (2.7). Increases occurred
among both U.S.-born and non--U.S.-born persons.
\end{quote}

This is TB incidence computed across the entire U.S. population! How do
we reproduce this?

\begin{itemize}
\tightlist
\item
  We need to reproduce the ``Total'' TB incidences in our rolled record.
\item
  But our current \texttt{tb\_census\_df} only has 51 entries (50 states
  plus Washington, D.C.). There is no rolled record.
\item
  What happened\ldots?
\end{itemize}

Let's get exploring!

Before we keep exploring, we'll set all indexes to more meaningful
values, instead of just numbers that pertained to some row at some
point. This will make our cleaning slightly easier.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df }\OperatorTok{=}\NormalTok{ tb\_df.set\_index(}\StringTok{"U.S. jurisdiction"}\NormalTok{)}
\NormalTok{tb\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrr}
\toprule
{} &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 \\
U.S. jurisdiction &                &                &                &                    &                    &                    \\
\midrule
Total             &           8900 &           7173 &           7860 &               2.71 &               2.16 &               2.37 \\
Alabama           &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 \\
Alaska            &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 \\
Arizona           &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 \\
Arkansas          &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{census\_2010s\_df }\OperatorTok{=}\NormalTok{ census\_2010s\_df.set\_index(}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{census\_2010s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &       2010 &       2011 &       2012 &       2013 &       2014 &       2015 &       2016 &       2017 &       2018 &       2019 \\
Geographic Area &            &            &            &            &            &            &            &            &            &            \\
\midrule
United States   &  309321666 &  311556874 &  313830990 &  315993715 &  318301008 &  320635163 &  322941311 &  324985539 &  326687501 &  328239523 \\
Northeast       &   55380134 &   55604223 &   55775216 &   55901806 &   56006011 &   56034684 &   56042330 &   56059240 &   56046620 &   55982803 \\
Midwest         &   66974416 &   67157800 &   67336743 &   67560379 &   67745167 &   67860583 &   67987540 &   68126781 &   68236628 &   68329004 \\
South           &  114866680 &  116006522 &  117241208 &  118364400 &  119624037 &  120997341 &  122351760 &  123542189 &  124569433 &  125580448 \\
West            &   72100436 &   72788329 &   73477823 &   74167130 &   74925793 &   75742555 &   76559681 &   77257329 &   77834820 &   78347268 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{census\_2020s\_df }\OperatorTok{=}\NormalTok{ census\_2020s\_df.set\_index(}\StringTok{"Geographic Area"}\NormalTok{)}
\NormalTok{census\_2020s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrr}
\toprule
{} &       2020 &       2021 &       2022 \\
Geographic Area &            &            &            \\
\midrule
United States   &  331511512 &  332031554 &  333287557 \\
Northeast       &   57448898 &   57259257 &   57040406 \\
Midwest         &   68961043 &   68836505 &   68787595 \\
South           &  126450613 &  127346029 &  128716192 \\
West            &   78650958 &   78589763 &   78743364 \\
\bottomrule
\end{tabular}

It turns out that our merge above only kept state records, even though
our original \texttt{tb\_df} had the ``Total'' rolled record:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_df.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrr}
\toprule
{} &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 \\
U.S. jurisdiction &                &                &                &                    &                    &                    \\
\midrule
Total             &           8900 &           7173 &           7860 &               2.71 &               2.16 &               2.37 \\
Alabama           &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 \\
Alaska            &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 \\
Arizona           &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 \\
Arkansas          &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 \\
\bottomrule
\end{tabular}

Recall that \texttt{merge} does an \textbf{inner} merge by default,
meaning that it only preserves keys that are present in \textbf{both}
DataFrames.

The rolled records in our census dataframes have different
\texttt{Geographic\ Area} fields, which was the key we merged on:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{census\_2010s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &       2010 &       2011 &       2012 &       2013 &       2014 &       2015 &       2016 &       2017 &       2018 &       2019 \\
Geographic Area &            &            &            &            &            &            &            &            &            &            \\
\midrule
United States   &  309321666 &  311556874 &  313830990 &  315993715 &  318301008 &  320635163 &  322941311 &  324985539 &  326687501 &  328239523 \\
Northeast       &   55380134 &   55604223 &   55775216 &   55901806 &   56006011 &   56034684 &   56042330 &   56059240 &   56046620 &   55982803 \\
Midwest         &   66974416 &   67157800 &   67336743 &   67560379 &   67745167 &   67860583 &   67987540 &   68126781 &   68236628 &   68329004 \\
South           &  114866680 &  116006522 &  117241208 &  118364400 &  119624037 &  120997341 &  122351760 &  123542189 &  124569433 &  125580448 \\
West            &   72100436 &   72788329 &   73477823 &   74167130 &   74925793 &   75742555 &   76559681 &   77257329 &   77834820 &   78347268 \\
\bottomrule
\end{tabular}

The Census DataFrame has several rolled records. The aggregate record we
are looking for actually has the Geographic Area named ``United
States''.

One straightforward way to get the right merge is to rename the value
itself. Because we now have the Geographic Area index, we'll use
\texttt{df.rename()}
(\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html}{documentation}):

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# rename rolled record for 2010s}
\NormalTok{census\_2010s\_df.rename(index}\OperatorTok{=}\NormalTok{\{}\StringTok{\textquotesingle{}United States\textquotesingle{}}\NormalTok{:}\StringTok{\textquotesingle{}Total\textquotesingle{}}\NormalTok{\}, inplace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\NormalTok{census\_2010s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &       2010 &       2011 &       2012 &       2013 &       2014 &       2015 &       2016 &       2017 &       2018 &       2019 \\
Geographic Area &            &            &            &            &            &            &            &            &            &            \\
\midrule
Total           &  309321666 &  311556874 &  313830990 &  315993715 &  318301008 &  320635163 &  322941311 &  324985539 &  326687501 &  328239523 \\
Northeast       &   55380134 &   55604223 &   55775216 &   55901806 &   56006011 &   56034684 &   56042330 &   56059240 &   56046620 &   55982803 \\
Midwest         &   66974416 &   67157800 &   67336743 &   67560379 &   67745167 &   67860583 &   67987540 &   68126781 &   68236628 &   68329004 \\
South           &  114866680 &  116006522 &  117241208 &  118364400 &  119624037 &  120997341 &  122351760 &  123542189 &  124569433 &  125580448 \\
West            &   72100436 &   72788329 &   73477823 &   74167130 &   74925793 &   75742555 &   76559681 &   77257329 &   77834820 &   78347268 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# same, but for 2020s rename rolled record}
\NormalTok{census\_2020s\_df.rename(index}\OperatorTok{=}\NormalTok{\{}\StringTok{\textquotesingle{}United States\textquotesingle{}}\NormalTok{:}\StringTok{\textquotesingle{}Total\textquotesingle{}}\NormalTok{\}, inplace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\NormalTok{census\_2020s\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrr}
\toprule
{} &       2020 &       2021 &       2022 \\
Geographic Area &            &            &            \\
\midrule
Total           &  331511512 &  332031554 &  333287557 \\
Northeast       &   57448898 &   57259257 &   57040406 \\
Midwest         &   68961043 &   68836505 &   68787595 \\
South           &  126450613 &  127346029 &  128716192 \\
West            &   78650958 &   78589763 &   78743364 \\
\bottomrule
\end{tabular}

Next let's rerun our merge. Note the different chaining, because we are
now merging on indexes (\texttt{df.merge()}
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html}{documentation}).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tb\_census\_df }\OperatorTok{=}\NormalTok{ (}
\NormalTok{    tb\_df}
\NormalTok{    .merge(right}\OperatorTok{=}\NormalTok{census\_2010s\_df[[}\StringTok{"2019"}\NormalTok{]],}
\NormalTok{           left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\NormalTok{    .merge(right}\OperatorTok{=}\NormalTok{census\_2020s\_df[[}\StringTok{"2020"}\NormalTok{, }\StringTok{"2021"}\NormalTok{]],}
\NormalTok{           left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\NormalTok{)}
\NormalTok{tb\_census\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrrrrr}
\toprule
{} &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 &       2019 &       2020 &       2021 \\
\midrule
Total    &           8900 &           7173 &           7860 &               2.71 &               2.16 &               2.37 &  328239523 &  331511512 &  332031554 \\
Alabama  &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 &    4903185 &    5031362 &    5049846 \\
Alaska   &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 &     731545 &     732923 &     734182 \\
Arizona  &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 &    7278717 &    7179943 &    7264877 \\
Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 &    3017804 &    3014195 &    3028122 \\
\bottomrule
\end{tabular}

Finally, let's recompute our incidences:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# recompute incidence for all years}
\ControlFlowTok{for}\NormalTok{ year }\KeywordTok{in}\NormalTok{ [}\DecValTok{2019}\NormalTok{, }\DecValTok{2020}\NormalTok{, }\DecValTok{2021}\NormalTok{]:}
\NormalTok{    tb\_census\_df[}\SpecialStringTok{f"recompute incidence }\SpecialCharTok{\{}\NormalTok{year}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{] }\OperatorTok{=}\NormalTok{ tb\_census\_df[}\SpecialStringTok{f"TB cases }\SpecialCharTok{\{}\NormalTok{year}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{]}\OperatorTok{/}\NormalTok{tb\_census\_df[}\SpecialStringTok{f"}\SpecialCharTok{\{}\NormalTok{year}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{]}\OperatorTok{*}\DecValTok{100000}
\NormalTok{tb\_census\_df.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} &  TB cases 2019 &  TB cases 2020 &  TB cases 2021 &  TB incidence 2019 &  TB incidence 2020 &  TB incidence 2021 &       2019 &       2020 &       2021 &  recompute incidence 2019 &  recompute incidence 2020 &  recompute incidence 2021 \\
\midrule
Total    &           8900 &           7173 &           7860 &               2.71 &               2.16 &               2.37 &  328239523 &  331511512 &  332031554 &                  2.711435 &                  2.163726 &                  2.367245 \\
Alabama  &             87 &             72 &             92 &               1.77 &               1.43 &               1.83 &    4903185 &    5031362 &    5049846 &                  1.774357 &                  1.431024 &                  1.821838 \\
Alaska   &             58 &             58 &             58 &               7.91 &               7.92 &               7.92 &     731545 &     732923 &     734182 &                  7.928425 &                  7.913519 &                  7.899949 \\
Arizona  &            183 &            136 &            129 &               2.51 &               1.89 &               1.77 &    7278717 &    7179943 &    7264877 &                  2.514179 &                  1.894165 &                  1.775667 \\
Arkansas &             64 &             59 &             69 &               2.12 &               1.96 &               2.28 &    3017804 &    3014195 &    3028122 &                  2.120747 &                  1.957405 &                  2.278640 \\
\bottomrule
\end{tabular}

We reproduced the total U.S. incidences correctly!

We're almost there. Let's revisit the quote:

\begin{quote}
Reported TB incidence (cases per 100,000 persons) increased
\textbf{9.4\%}, from \textbf{2.2} during 2020 to \textbf{2.4} during
2021 but was lower than incidence during 2019 (2.7). Increases occurred
among both U.S.-born and non--U.S.-born persons.
\end{quote}

Recall that percent change from \(A\) to \(B\) is computed as
\(\text{percent change} = \frac{B - A}{A} \times 100\).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{incidence\_2020 }\OperatorTok{=}\NormalTok{ tb\_census\_df.loc[}\StringTok{\textquotesingle{}Total\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}recompute incidence 2020\textquotesingle{}}\NormalTok{]}
\NormalTok{incidence\_2020}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
2.1637257652759883
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{incidence\_2021 }\OperatorTok{=}\NormalTok{ tb\_census\_df.loc[}\StringTok{\textquotesingle{}Total\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}recompute incidence 2021\textquotesingle{}}\NormalTok{]}
\NormalTok{incidence\_2021}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
2.3672448914298068
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{difference }\OperatorTok{=}\NormalTok{ (incidence\_2021 }\OperatorTok{{-}}\NormalTok{ incidence\_2020)}\OperatorTok{/}\NormalTok{incidence\_2020 }\OperatorTok{*} \DecValTok{100}
\NormalTok{difference}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
9.405957511804143
\end{verbatim}

\bookmarksetup{startatroot}

\hypertarget{regular-expressions}{%
\chapter{Regular Expressions}\label{regular-expressions}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Understand Python string manipulation, Pandas Series methods
\item
  Parse and create regex, with a reference table
\item
  Use vocabulary (closure, metacharacter, groups, etc.) to describe regex
  metacharacters
\end{itemize}

\end{tcolorbox}

\hypertarget{why-work-with-text}{%
\section{Why Work with Text?}\label{why-work-with-text}}

Last lecture, we learned about the difference between quantitative and
qualitative variable types. The latter includes string data---the
primary focus of today's lecture. In this note, we'll discuss the
necessary tools to manipulate text: Python string manipulation and
regular expressions.

There are two main reasons for working with text.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Canonicalization: Convert data that has multiple formats into a
  standard form.

  \begin{itemize}
  \tightlist
  \item
    By manipulating text, we can join tables with mismatched string
    labels
  \end{itemize}
\item
  Extract information into a new feature.

  \begin{itemize}
  \tightlist
  \item
    For example, we can extract date and time features from text
  \end{itemize}
\end{enumerate}

\hypertarget{python-string-methods}{%
\section{Python String Methods}\label{python-string-methods}}

First, we'll introduce a few methods useful for string manipulation. The
following table includes a number of string operations supported by
Python and \texttt{pandas}. The Python functions operate on a single
string, while their equivalents in \texttt{pandas} are
\textbf{vectorized}---they operate on a Series of string data.

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3333}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2500}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3889}}@{}}
\toprule()
\begin{minipage}[b]{\linewidth}\raggedright
Operation
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Python
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Pandas (Series)
\end{minipage} \\
\midrule()
\endhead
Transformation & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{s.lower(\_)}
\item
  \texttt{s.upper(\_)}
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{ser.str.lower(\_)}
\item
  \texttt{ser.str.upper(\_)}
\end{itemize}
\end{minipage} \\
Replacement + Deletion & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{s.replace(\_)}
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{ser.str.replace(\_)}
\end{itemize}
\end{minipage} \\
Split & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{s.split(\_)}
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{ser.str.split(\_)}
\end{itemize}
\end{minipage} \\
Substring & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{s{[}1:4{]}}
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{ser.str{[}1:4{]}}
\end{itemize}
\end{minipage} \\
Membership & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{\textquotesingle{}\_\textquotesingle{}\ in\ s}
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{ser.str.contains(\_)}
\end{itemize}
\end{minipage} \\
Length & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{len(s)}
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
\begin{itemize}
\tightlist
\item
  \texttt{ser.str.len()}
\end{itemize}
\end{minipage} \\
\bottomrule()
\end{longtable}

We'll discuss the differences between Python string functions and
\texttt{pandas} Series methods in the following section on
canonicalization.

\hypertarget{canonicalization}{%
\subsection{Canonicalization}\label{canonicalization}}

Assume we want to merge the given tables.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}

\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{\textquotesingle{}data/county\_and\_state.csv\textquotesingle{}}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    county\_and\_state }\OperatorTok{=}\NormalTok{ pd.read\_csv(f)}
    
\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{\textquotesingle{}data/county\_and\_population.csv\textquotesingle{}}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    county\_and\_pop }\OperatorTok{=}\NormalTok{ pd.read\_csv(f)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{display(county\_and\_state), display(county\_and\_pop)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lll}
\toprule
{} &                      County & State \\
\midrule
0 &              De Witt County &    IL \\
1 &        Lac qui Parle County &    MN \\
2 &      Lewis and Clark County &    MT \\
3 &  St John the Baptist Parish &    LS \\
\bottomrule
\end{tabular}

\begin{tabular}{llr}
\toprule
{} &                County &  Population \\
\midrule
0 &                DeWitt &       16798 \\
1 &         Lac Qui Parle &        8067 \\
2 &         Lewis \& Clark &       55716 \\
3 &  St. John the Baptist &       43044 \\
\bottomrule
\end{tabular}

Last time, we used a \textbf{primary key} and \textbf{foreign key} to
join two tables. While neither of these keys exists in our DataFrames,
the \texttt{County} columns look similar enough. Can we convert these
columns into one standard, canonical form to merge the two tables?

\hypertarget{canonicalization-with-python-string-manipulation}{%
\subsubsection{Canonicalization with Python String
Manipulation}\label{canonicalization-with-python-string-manipulation}}

The following function uses Python string manipulation to convert a
single county name into canonical form. It does so by eliminating
whitespace, punctuation, and unnecessary text.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ canonicalize\_county(county\_name):}
    \ControlFlowTok{return}\NormalTok{ (}
\NormalTok{        county\_name}
\NormalTok{            .lower()}
\NormalTok{            .replace(}\StringTok{\textquotesingle{} \textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{            .replace(}\StringTok{\textquotesingle{}\&\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}and\textquotesingle{}}\NormalTok{)}
\NormalTok{            .replace(}\StringTok{\textquotesingle{}.\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{            .replace(}\StringTok{\textquotesingle{}county\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{            .replace(}\StringTok{\textquotesingle{}parish\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{    )}

\NormalTok{canonicalize\_county(}\StringTok{"St. John the Baptist"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'stjohnthebaptist'
\end{verbatim}

We will use the \texttt{pandas} \texttt{map} function to apply the
\texttt{canonicalize\_county} function to every row in both DataFrames.
In doing so, we'll create a new column in each called
\texttt{clean\_county\_python} with the canonical form.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{county\_and\_pop[}\StringTok{\textquotesingle{}clean\_county\_python\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ county\_and\_pop[}\StringTok{\textquotesingle{}County\textquotesingle{}}\NormalTok{].}\BuiltInTok{map}\NormalTok{(canonicalize\_county)}
\NormalTok{county\_and\_state[}\StringTok{\textquotesingle{}clean\_county\_python\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ county\_and\_state[}\StringTok{\textquotesingle{}County\textquotesingle{}}\NormalTok{].}\BuiltInTok{map}\NormalTok{(canonicalize\_county)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{display(county\_and\_state), display(county\_and\_pop)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llll}
\toprule
{} &                      County & State & clean\_county\_python \\
\midrule
0 &              De Witt County &    IL &              dewitt \\
1 &        Lac qui Parle County &    MN &         lacquiparle \\
2 &      Lewis and Clark County &    MT &       lewisandclark \\
3 &  St John the Baptist Parish &    LS &    stjohnthebaptist \\
\bottomrule
\end{tabular}

\begin{tabular}{llrl}
\toprule
{} &                County &  Population & clean\_county\_python \\
\midrule
0 &                DeWitt &       16798 &              dewitt \\
1 &         Lac Qui Parle &        8067 &         lacquiparle \\
2 &         Lewis \& Clark &       55716 &       lewisandclark \\
3 &  St. John the Baptist &       43044 &    stjohnthebaptist \\
\bottomrule
\end{tabular}

\hypertarget{canonicalization-with-pandas-series-methods}{%
\subsubsection{Canonicalization with Pandas Series
Methods}\label{canonicalization-with-pandas-series-methods}}

Alternatively, we can use \texttt{pandas} Series methods to create this
standardized column. To do so, we must call the \texttt{.str} attribute
of our Series object prior to calling any methods, like \texttt{.lower}
and \texttt{.replace}. Notice how these method names match their
equivalent built-in Python string functions.

Chaining multiple Series methods in this manner eliminates the need to
use the \texttt{map} function (as this code is vectorized).

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ canonicalize\_county\_series(county\_series):}
    \ControlFlowTok{return}\NormalTok{ (}
\NormalTok{        county\_series}
\NormalTok{            .}\BuiltInTok{str}\NormalTok{.lower()}
\NormalTok{            .}\BuiltInTok{str}\NormalTok{.replace(}\StringTok{\textquotesingle{} \textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{            .}\BuiltInTok{str}\NormalTok{.replace(}\StringTok{\textquotesingle{}\&\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}and\textquotesingle{}}\NormalTok{)}
\NormalTok{            .}\BuiltInTok{str}\NormalTok{.replace(}\StringTok{\textquotesingle{}.\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{            .}\BuiltInTok{str}\NormalTok{.replace(}\StringTok{\textquotesingle{}county\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{            .}\BuiltInTok{str}\NormalTok{.replace(}\StringTok{\textquotesingle{}parish\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{    )}

\NormalTok{county\_and\_pop[}\StringTok{\textquotesingle{}clean\_county\_pandas\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ canonicalize\_county\_series(county\_and\_pop[}\StringTok{\textquotesingle{}County\textquotesingle{}}\NormalTok{])}
\NormalTok{county\_and\_state[}\StringTok{\textquotesingle{}clean\_county\_pandas\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ canonicalize\_county\_series(county\_and\_state[}\StringTok{\textquotesingle{}County\textquotesingle{}}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{display(county\_and\_pop), display(county\_and\_state)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{llrll}
\toprule
{} &                County &  Population & clean\_county\_python & clean\_county\_pandas \\
\midrule
0 &                DeWitt &       16798 &              dewitt &              dewitt \\
1 &         Lac Qui Parle &        8067 &         lacquiparle &         lacquiparle \\
2 &         Lewis \& Clark &       55716 &       lewisandclark &       lewisandclark \\
3 &  St. John the Baptist &       43044 &    stjohnthebaptist &    stjohnthebaptist \\
\bottomrule
\end{tabular}

\begin{tabular}{lllll}
\toprule
{} &                      County & State & clean\_county\_python & clean\_county\_pandas \\
\midrule
0 &              De Witt County &    IL &              dewitt &              dewitt \\
1 &        Lac qui Parle County &    MN &         lacquiparle &         lacquiparle \\
2 &      Lewis and Clark County &    MT &       lewisandclark &       lewisandclark \\
3 &  St John the Baptist Parish &    LS &    stjohnthebaptist &    stjohnthebaptist \\
\bottomrule
\end{tabular}

\hypertarget{extraction}{%
\subsection{Extraction}\label{extraction}}

Extraction explores the idea of obtaining useful information from text
data. This will be particularly important in model building, which
we'll study in a few weeks.

Say we want to read some data from a \texttt{.txt} file.

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{\textquotesingle{}data/log.txt\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}r\textquotesingle{}}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    log\_lines }\OperatorTok{=}\NormalTok{ f.readlines()}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{log\_lines}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n',
 '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n',
 '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']
\end{verbatim}

Suppose we want to extract the day, month, year, hour, minutes, seconds,
and timezone. Unfortunately, these items are not in a fixed position
from the beginning of the string, so slicing by some fixed offset won't
work.

Instead, we can use some clever thinking. Notice how the relevant
information is contained within a set of brackets, further separated by
\texttt{/} and \texttt{:}. We can home in on this region of text, and
split the data on these characters. Python's built-in \texttt{.split}
function makes this easy.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{first }\OperatorTok{=}\NormalTok{ log\_lines[}\DecValTok{0}\NormalTok{] }\CommentTok{\# Only considering the first row of data}

\NormalTok{pertinent }\OperatorTok{=}\NormalTok{ first.split(}\StringTok{"["}\NormalTok{)[}\DecValTok{1}\NormalTok{].split(}\StringTok{\textquotesingle{}]\textquotesingle{}}\NormalTok{)[}\DecValTok{0}\NormalTok{]}
\NormalTok{day, month, rest }\OperatorTok{=}\NormalTok{ pertinent.split(}\StringTok{\textquotesingle{}/\textquotesingle{}}\NormalTok{)}
\NormalTok{year, hour, minute, rest }\OperatorTok{=}\NormalTok{ rest.split(}\StringTok{\textquotesingle{}:\textquotesingle{}}\NormalTok{)}
\NormalTok{seconds, time\_zone }\OperatorTok{=}\NormalTok{ rest.split(}\StringTok{\textquotesingle{} \textquotesingle{}}\NormalTok{)}
\NormalTok{day, month, year, hour, minute, seconds, time\_zone}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
('26', 'Jan', '2014', '10', '47', '58', '-0800')
\end{verbatim}

There are two problems with this code:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Python's built-in functions limit us to extracting data one record at
  a time

  \begin{itemize}
  \tightlist
  \item
    This can be resolved using a map function or Pandas Series methods.
  \end{itemize}
\item
  The code is quite verbose

  \begin{itemize}
  \tightlist
  \item
    This is a larger issue that is trickier to solve
  \end{itemize}
\end{enumerate}

In the next section, we'll introduce regular expressions - a tool that
solves problem 2.

\hypertarget{regex-basics}{%
\section{Regex Basics}\label{regex-basics}}

A \textbf{regular expression (``regex'')} is a sequence of characters
that specifies a search pattern. They are written to extract specific
information from text. Regular expressions are essentially part of a
smaller programming language embedded in Python, made available through
the \texttt{re} module. As such, they have a stand-alone syntax and
methods for various capabilities.

Regular expressions are useful in many applications beyond data science.
For example, Social Security Numbers (SSNs) are often validated with
regular expressions.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{r"[0{-}9]\{3\}{-}[0{-}9]\{2\}{-}[0{-}9]\{4\}"} \CommentTok{\# Regular Expression Syntax}

\CommentTok{\# 3 of any digit, then a dash,}
\CommentTok{\# then 2 of any digit, then a dash,}
\CommentTok{\# then 4 of any digit}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'[0-9]{3}-[0-9]{2}-[0-9]{4}'
\end{verbatim}

There are a ton of resources to learn and experiment with regular
expressions. A few are provided below:

\begin{itemize}
\tightlist
\item
  \href{https://docs.python.org/3/howto/regex.html}{Official Regex
  Guide}
\item
  \href{https://ds100.org/sp22/resources/assets/hw/regex_reference.pdf}{Data
  100 Reference Sheet}
\item
  \href{https://regex101.com/}{Regex101.com}

  \begin{itemize}
  \tightlist
  \item
    Be sure to check \texttt{Python} under the category on the left.
  \end{itemize}
\end{itemize}

\hypertarget{basics-regex-syntax}{%
\subsection{Basic Regex Syntax}\label{basics-regex-syntax}}

There are four basic operations with regular expressions.

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2500}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1875}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1771}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1458}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2083}}@{}}
\toprule()
\begin{minipage}[b]{\linewidth}\raggedright
Operation
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Order
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Syntax Example
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Matches
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Doesn't Match
\end{minipage} \\
\midrule()
\endhead
\texttt{Or}: \texttt{\textbar{}} & 4 & AA\textbar BAAB & AA BAAB & every
other string \\
\texttt{Concatenation} & 3 & AABAAB & AABAAB & every other string \\
\texttt{Closure}: \texttt{*} (zero or more) & 2 & AB*A & AA ABBBBBBA &
AB ABABA \\
\texttt{Group}: \texttt{()} (parenthesis) & 1 & A(A\textbar B)AAB (AB)*A
& AAAAB ABAAB A ABABABABA & every other string AA ABBA \\
\bottomrule()
\end{longtable}

Notice how these metacharacter operations are ordered. Rather than being
literal characters, these \textbf{metacharacters} manipulate adjacent
characters. \texttt{()} takes precedence, followed by \texttt{*}, and
finally \texttt{\textbar{}}. This allows us to differentiate between
very different regex commands like \texttt{AB*} and \texttt{(AB)*}. The
former reads ``\texttt{A} then zero or more copies of \texttt{B}'',
while the latter specifies ``zero or more copies of \texttt{AB}''.

\hypertarget{examples}{%
\subsubsection{Examples}\label{examples}}

\textbf{Question 1}: Give a regular expression that matches
\texttt{moon}, \texttt{moooon}, etc. Your expression should match any
even number of \texttt{o}s except zero (i.e.~don't match \texttt{mn}).

\textbf{Answer 1}: \texttt{moo(oo)*n}

\begin{itemize}
\tightlist
\item
  Hardcoding \texttt{oo} before the capture group ensures that
  \texttt{mn} is not matched.
\item
  A capture group of \texttt{(oo)*} ensures the number of \texttt{o}'s
  is even.
\end{itemize}

\textbf{Question 2}: Using only basic operations, formulate a regex that
matches \texttt{muun}, \texttt{muuuun}, \texttt{moon}, \texttt{moooon},
etc. Your expression should match any even number of \texttt{u}s or
\texttt{o}s except zero (i.e.~don't match \texttt{mn}).

\textbf{Answer 2}: \texttt{m(uu(uu)*\textbar{}oo(oo)*)n}

\begin{itemize}
\tightlist
\item
  The leading \texttt{m} and trailing \texttt{n} ensure that only
  strings beginning with \texttt{m} and ending with \texttt{n} are
  matched.
\item
  Notice how the outer capture group surrounds the \texttt{\textbar{}}.

  \begin{itemize}
  \tightlist
  \item
    Consider the regex \texttt{m(uu(uu)*)\textbar{}(oo(oo)*)n}. This
    incorrectly matches \texttt{muu} and \texttt{oooon}.

    \begin{itemize}
    \tightlist
    \item
      Each OR clause is everything to the left and right of
      \texttt{\textbar{}}. The incorrect solution matches only half of
      the string, and ignores either the beginning \texttt{m} or
      trailing \texttt{n}.
    \item
      A set of parentheses must surround \texttt{\textbar{}}. That way,
      each OR clause is everything to the left and right of
      \texttt{\textbar{}} \textbf{within} the group. This ensures both
      the beginning \texttt{m} \emph{and} trailing \texttt{n} are
      matched.
    \end{itemize}
  \end{itemize}
\end{itemize}

\hypertarget{regex-expanded}{%
\section{Regex Expanded}\label{regex-expanded}}

Provided below are more complex regular expression functions.

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.4667}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1714}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1619}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1810}}@{}}
\toprule()
\begin{minipage}[b]{\linewidth}\raggedright
Operation
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Syntax Example
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Matches
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Doesn't Match
\end{minipage} \\
\midrule()
\endhead
\texttt{Any\ Character}: \texttt{.} (except newline) & .U.U.U. & CUMULUS
JUGULUM & SUCCUBUS TUMULTUOUS \\
\texttt{Character\ Class}: \texttt{{[}{]}} (match one character in
\texttt{{[}{]}}) & {[}A-Za-z{]}{[}a-z{]}* & word Capitalized & camelCase
4illegal \\
\texttt{Repeated\ "a"\ Times}: \texttt{\{a\}} & j{[}aeiou{]}\{3\}hn &
jaoehn jooohn & jhn jaeiouhn \\
\texttt{Repeated\ "from\ a\ to\ b"\ Times}: \texttt{\{a,\ b\}} &
j{[}ou{]}\{1,2\}hn & john juohn & jhn jooohn \\
\texttt{At\ Least\ One}: \texttt{+} & jo+hn & john joooooohn & jhn
jjohn \\
\texttt{Zero\ or\ One}: \texttt{?} & joh?n & jon john & any other
string \\
\bottomrule()
\end{longtable}

A character class matches a single character in its class. These
characters can be hardcoded -- in the case of \texttt{{[}aeiou{]}} -- or
shorthand can be specified to mean a range of characters. Examples
include:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{{[}A-Z{]}}: Any capitalized letter
\item
  \texttt{{[}a-z{]}}: Any lowercase letter
\item
  \texttt{{[}0-9{]}}: Any single digit
\item
  \texttt{{[}A-Za-z{]}}: Any capitalized or lowercase letter
\item
  \texttt{{[}A-Za-z0-9{]}}: Any capitalized or lowercase letter or
  single digit
\end{enumerate}

\hypertarget{examples-1}{%
\subsubsection{Examples}\label{examples-1}}

Let's analyze a few examples of complex regular expressions.

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.4722}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.4722}}@{}}
\toprule()
\begin{minipage}[b]{\linewidth}\raggedright
Matches
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Does Not Match
\end{minipage} \\
\midrule()
\endhead
\begin{minipage}[t]{\linewidth}\raggedright
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{.*SPB.*}
\end{enumerate}
\end{minipage} & \\
RASPBERRY SPBOO & SUBSPACE SUBSPECIES \\
\begin{minipage}[t]{\linewidth}\raggedright
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  \texttt{{[}0-9{]}\{3\}-{[}0-9{]}\{2\}-{[}0-9{]}\{4\}}
\end{enumerate}
\end{minipage} & \\
231-41-5121 573-57-1821 & 231415121 57-3571821 \\
\begin{minipage}[t]{\linewidth}\raggedright
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  \texttt{{[}a-z{]}+@({[}a-z{]}+\textbackslash{}.)+(edu\textbar{}com)}
\end{enumerate}
\end{minipage} & \\
horse@pizza.com horse@pizza.food.com & frank\_99@yahoo.com hug@cs \\
\bottomrule()
\end{longtable}

\textbf{Explanations}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{.*SPB.*} only matches strings that contain the substring
  \texttt{SPB}.

  \begin{itemize}
  \tightlist
  \item
    The \texttt{.*} pattern matches zero or more of any character
    (newlines excluded).
  \end{itemize}
\item
  This regular expression matches 3 of any digit, then a dash, then 2 of
  any digit, then a dash, then 4 of any digit

  \begin{itemize}
  \tightlist
  \item
    You'll recognize this as the familiar Social Security Number regular
    expression
  \end{itemize}
\item
  Matches any email with a \texttt{com} or \texttt{edu} domain, where
  all characters of the email are letters.

  \begin{itemize}
  \tightlist
  \item
    At least one \texttt{.} must precede the domain name. Including a
    backslash \texttt{\textbackslash{}} before any metacharacter (in
    this case, the \texttt{.}) tells regex to match that character
    exactly.
  \end{itemize}
\end{enumerate}

\hypertarget{convenient-regex}{%
\section{Convenient Regex}\label{convenient-regex}}

Here are a few more convenient regular expressions.

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.4667}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1714}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1619}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1810}}@{}}
\toprule()
\begin{minipage}[b]{\linewidth}\raggedright
Operation
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Syntax Example
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Matches
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Doesn't Match
\end{minipage} \\
\midrule()
\endhead
\texttt{built\ in\ character\ class} & \texttt{\textbackslash{}w+}
\texttt{\textbackslash{}d+} \texttt{\textbackslash{}s+} & Fawef\_03
231123 \texttt{whitespace} & this person 423 people
\texttt{non-whitespace} \\
\texttt{character\ class\ negation}: \texttt{{[}\^{}{]}} (everything
except the given characters) & {[}\^{}a-z{]}+. & PEPPERS3982 17211!↑å &
porch CLAmS \\
\texttt{escape\ character}: \texttt{\textbackslash{}} (match the literal
next character) & cow\textbackslash.com & cow.com & cowscom \\
\texttt{beginning\ of\ line}: \texttt{\^{}} & \^{}ark & ark two ark o
ark & dark \\
\texttt{end\ of\ line}: \texttt{\$} & ark\$ & dark ark o ark & ark
two \\
\texttt{lazy\ version\ of\ zero\ or\ more} : \texttt{*?} & 5.*?5 & 5005
55 & 5005005 \\
\bottomrule()
\end{longtable}

\hypertarget{examples-2}{%
\subsubsection{Examples}\label{examples-2}}

Let's revisit our earlier problem of extracting date/time data from the
given \texttt{.txt} files. Here is how the data looked.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{log\_lines[}\DecValTok{0}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
\end{verbatim}

\textbf{Question}: Give a regular expression that matches everything
contained within and including the brackets - the day, month, year,
hour, minutes, seconds, and timezone.

\textbf{Answer}: \texttt{\textbackslash{}{[}.*\textbackslash{}{]}}

\begin{itemize}
\tightlist
\item
  Notice how matching the literal \texttt{{[}} and \texttt{{]}} is
  necessary. Therefore, an escape character \texttt{\textbackslash{}} is
  required before both \texttt{{[}} and \texttt{{]}} - otherwise these
  metacharacters will match character classes.
\item
  We need to match a particular format between \texttt{{[}} and
  \texttt{{]}}. For this example, \texttt{.*} will suffice.
\end{itemize}

\textbf{Alternative Solution}:
\texttt{\textbackslash{}{[}\textbackslash{}w+/\textbackslash{}w+/\textbackslash{}w+:\textbackslash{}w+:\textbackslash{}w+:\textbackslash{}w+\textbackslash{}s-\textbackslash{}w+\textbackslash{}{]}}

\begin{itemize}
\tightlist
\item
  This solution is much safer.

  \begin{itemize}
  \tightlist
  \item
    Imagine the data between \texttt{{[}} and \texttt{{]}} was garbage -
    \texttt{.*} will still match that.
  \item
    The alternate solution will only match data that follows the correct
    format.
  \end{itemize}
\end{itemize}

\hypertarget{regex-in-python-and-pandas-regex-groups}{%
\section{Regex in Python and Pandas (Regex
Groups)}\label{regex-in-python-and-pandas-regex-groups}}

\hypertarget{canonicalization-1}{%
\subsection{Canonicalization}\label{canonicalization-1}}

\hypertarget{canonicalization-with-regex}{%
\subsubsection{Canonicalization with
Regex}\label{canonicalization-with-regex}}

Earlier in this note, we examined the process of canonicalization using
Python string manipulation and \texttt{pandas} Series methods. However,
we mentioned this approach had a major flaw: our code was unnecessarily
verbose. Equipped with our knowledge of regular expressions, let's fix
this.

To do so, we need to understand a few functions in the \texttt{re}
module. The first of these is the substitute function:
\texttt{re.sub(pattern,\ repl,\ text)}. It behaves similarly to
Python's built-in \texttt{.replace} function, and returns text with all
instances of \texttt{pattern} replaced by \texttt{repl}.

The regular expression here removes text surrounded by
\texttt{\textless{}\textgreater{}} (also known as HTML tags).

In order, the pattern matches:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  a single \texttt{\textless{}}
\item
  one or more characters that are not a \texttt{\textgreater{}}: div, td
  valign\ldots, /td, /div
\item
  a single \texttt{\textgreater{}}
\end{enumerate}

Any substring in \texttt{text} that fulfills all three conditions will
be replaced by \texttt{\textquotesingle{}\textquotesingle{}}.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ re}

\NormalTok{text }\OperatorTok{=} \StringTok{"\textless{}div\textgreater{}\textless{}td valign=\textquotesingle{}top\textquotesingle{}\textgreater{}Moo\textless{}/td\textgreater{}\textless{}/div\textgreater{}"}
\NormalTok{pattern }\OperatorTok{=} \VerbatimStringTok{r"\textless{}[\^{}\textgreater{}]+\textgreater{}"}
\NormalTok{re.sub(pattern, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{, text) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'Moo'
\end{verbatim}

Notice the \texttt{r} preceding the regular expression pattern; this
specifies the regular expression is a raw string. Raw strings do not
recognize escape sequences (such as the Python newline metacharacter
\texttt{\textbackslash{}n}). This makes them useful for regular
expressions, which often contain literal \texttt{\textbackslash{}}
characters.

In other words, don't forget to tag your regex with an \texttt{r}.

\hypertarget{canonicalization-with-pandas}{%
\subsubsection{Canonicalization with
Pandas}\label{canonicalization-with-pandas}}

We can also use regular expressions with Pandas Series methods. This
gives us the benefit of operating on an entire column of data as opposed
to a single value. The code is simple:
\texttt{ser.str.replace(pattern,\ repl,\ regex=True)}.

Consider the following DataFrame \texttt{html\_data} with a single
column.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OperatorTok{=}\NormalTok{ \{}\StringTok{"HTML"}\NormalTok{: [}\StringTok{"\textless{}div\textgreater{}\textless{}td valign=\textquotesingle{}top\textquotesingle{}\textgreater{}Moo\textless{}/td\textgreater{}\textless{}/div\textgreater{}"}\NormalTok{, }\OperatorTok{\textbackslash{}}
                 \StringTok{"\textless{}a href=\textquotesingle{}http://ds100.org\textquotesingle{}\textgreater{}Link\textless{}/a\textgreater{}"}\NormalTok{, }\OperatorTok{\textbackslash{}}
                 \StringTok{"\textless{}b\textgreater{}Bold text\textless{}/b\textgreater{}"}\NormalTok{]\}}
\NormalTok{html\_data }\OperatorTok{=}\NormalTok{ pd.DataFrame(data)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{html\_data}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &                                  HTML \\
\midrule
0 &  <div><td valign='top'>Moo</td></div> \\
1 &   <a href='http://ds100.org'>Link</a> \\
2 &                      <b>Bold text</b> \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pattern }\OperatorTok{=} \VerbatimStringTok{r"\textless{}[\^{}\textgreater{}]+\textgreater{}"}
\NormalTok{html\_data[}\StringTok{\textquotesingle{}HTML\textquotesingle{}}\NormalTok{].}\BuiltInTok{str}\NormalTok{.replace(pattern, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{, regex}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &       HTML \\
\midrule
0 &        Moo \\
1 &       Link \\
2 &  Bold text \\
\bottomrule
\end{tabular}

\hypertarget{extraction-1}{%
\subsection{Extraction}\label{extraction-1}}

\hypertarget{extraction-with-regex}{%
\subsubsection{Extraction with Regex}\label{extraction-with-regex}}

Just like with canonicalization, the \texttt{re} module provides
capability to extract relevant text from a string:
\texttt{re.findall(pattern,\ text)}. This function returns a list of all
matches to \texttt{pattern}.

Using the familiar regular expression for Social Security Numbers:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{text }\OperatorTok{=} \StringTok{"My social security number is 123{-}45{-}6789 bro, or maybe it’s 321{-}45{-}6789."}
\NormalTok{pattern }\OperatorTok{=} \VerbatimStringTok{r"[0{-}9]}\SpecialCharTok{\{3\}}\VerbatimStringTok{{-}[0{-}9]}\SpecialCharTok{\{2\}}\VerbatimStringTok{{-}[0{-}9]}\SpecialCharTok{\{4\}}\VerbatimStringTok{"}
\NormalTok{re.findall(pattern, text)  }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
['123-45-6789', '321-45-6789']
\end{verbatim}

\hypertarget{extraction-with-pandas}{%
\subsubsection{Extraction with Pandas}\label{extraction-with-pandas}}

Pandas similarly provides extraction functionality on a Series of data:
\texttt{ser.str.findall(pattern)}.

Consider the following DataFrame \texttt{ssn\_data}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OperatorTok{=}\NormalTok{ \{}\StringTok{"SSN"}\NormalTok{: [}\StringTok{"987{-}65{-}4321"}\NormalTok{, }\StringTok{"forty"}\NormalTok{, }\OperatorTok{\textbackslash{}}
                \StringTok{"123{-}45{-}6789 bro or 321{-}45{-}6789"}\NormalTok{,}
               \StringTok{"999{-}99{-}9999"}\NormalTok{]\}}
\NormalTok{ssn\_data }\OperatorTok{=}\NormalTok{ pd.DataFrame(data)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ssn\_data}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &                             SSN \\
\midrule
0 &                     987-65-4321 \\
1 &                           forty \\
2 &  123-45-6789 bro or 321-45-6789 \\
3 &                     999-99-9999 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ssn\_data[}\StringTok{"SSN"}\NormalTok{].}\BuiltInTok{str}\NormalTok{.findall(pattern)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{ll}
\toprule
{} &                         SSN \\
\midrule
0 &               [987-65-4321] \\
1 &                          [] \\
2 &  [123-45-6789, 321-45-6789] \\
3 &               [999-99-9999] \\
\bottomrule
\end{tabular}

This function returns a list for every row containing the pattern
matches in a given string.

\hypertarget{regular-expression-capture-groups}{%
\subsection{Regular Expression Capture
Groups}\label{regular-expression-capture-groups}}

Earlier we used parentheses \texttt{(} \texttt{)} to specify the highest
order of operation in regular expressions. However, they have another
meaning; parentheses are often used to represent \textbf{capture
groups}. Capture groups are essentially a set of smaller regular
expressions that match multiple substrings in text data.

Let's take a look at an example.

\hypertarget{example-1}{%
\subsubsection{Example 1}\label{example-1}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{text }\OperatorTok{=} \StringTok{"Observations: 03:04:53 {-} Horse awakens. }\CharTok{\textbackslash{}}
\StringTok{        03:05:14 {-} Horse goes back to sleep."}
\end{Highlighting}
\end{Shaded}

Say we want to capture all occurrences of time data (hour, minute, and
second) as \emph{separate entities}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pattern\_1 }\OperatorTok{=} \VerbatimStringTok{r"(\textbackslash{}d\textbackslash{}d):(\textbackslash{}d\textbackslash{}d):(\textbackslash{}d\textbackslash{}d)"}
\NormalTok{re.findall(pattern\_1, text)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
[('03', '04', '53'), ('03', '05', '14')]
\end{verbatim}

Notice how the given pattern has 3 capture groups, each specified by the
regular expression \texttt{(\textbackslash{}d\textbackslash{}d)}. We
then use \texttt{re.findall} to return these capture groups, each as
tuples containing 3 matches.

These regular expression capture groups can be different. We can use the
\texttt{(\textbackslash{}d\{2\})} shorthand to extract the same data.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pattern\_2 }\OperatorTok{=} \VerbatimStringTok{r"(\textbackslash{}d\textbackslash{}d):(\textbackslash{}d\textbackslash{}d):(\textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{)"}
\NormalTok{re.findall(pattern\_2, text)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
[('03', '04', '53'), ('03', '05', '14')]
\end{verbatim}

\hypertarget{example-2}{%
\subsubsection{Example 2}\label{example-2}}

With the notion of capture groups, convince yourself how the following
regular expression works.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{first }\OperatorTok{=}\NormalTok{ log\_lines[}\DecValTok{0}\NormalTok{]}
\NormalTok{first}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pattern }\OperatorTok{=} \VerbatimStringTok{r\textquotesingle{}\textbackslash{}[(\textbackslash{}d+)\textbackslash{}/(\textbackslash{}w+)\textbackslash{}/(\textbackslash{}d+):(\textbackslash{}d+):(\textbackslash{}d+):(\textbackslash{}d+) (.+)\textbackslash{}]\textquotesingle{}}
\NormalTok{day, month, year, hour, minute, second, time\_zone }\OperatorTok{=}\NormalTok{ re.findall(pattern, first)[}\DecValTok{0}\NormalTok{]}
\BuiltInTok{print}\NormalTok{(day, month, year, hour, minute, second, time\_zone)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
26 Jan 2014 10 47 58 -0800
\end{verbatim}

\hypertarget{limitations-of-regular-expressions}{%
\section{Limitations of Regular
Expressions}\label{limitations-of-regular-expressions}}

Today, we explored the capabilities of regular expressions in data
wrangling with text data. However, there are a few things to be wary of.

Writing regular expressions is like writing a program.

\begin{itemize}
\tightlist
\item
  Need to know the syntax well.
\item
  Can be easier to write than to read.
\item
  Can be difficult to debug.
\end{itemize}

Regular expressions are terrible at certain types of problems:

\begin{itemize}
\tightlist
\item
  For parsing a hierarchical structure, such as JSON, use the
  \texttt{json.load()} parser, not regex!
\item
  Complex features (e.g.~valid email address).
\item
  Counting (same number of instances of a and b). (impossible)
\item
  Complex properties (palindromes, balanced parentheses). (impossible)
\end{itemize}

\bookmarksetup{startatroot}

\hypertarget{visualization-i}{%
\chapter{Visualization I}\label{visualization-i}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Use \texttt{matplotlib}, \texttt{seaborn}, and \texttt{plotly} to
  create data visualizations.
\item
  Analyze histograms and identify outliers, mode, and skewness.
\item
  Use \texttt{boxplot} and \texttt{violinplot} to compare two
  distributions.
\end{itemize}

\end{tcolorbox}

In our journey of the data science lifecycle, we have begun to explore
the vast world of exploratory data analysis. More recently, we learned
how to pre-process data using various data manipulation techniques. As
we work towards understanding our data, there is one key component
missing in our arsenal - the ability to visualize and discern
relationships in existing data.

These next two lectures will introduce you to various examples of data
visualizations and their underlying theory. In doing so, we'll motivate
their importance in real-world examples with the use of plotting
libraries.

\hypertarget{visualizations-in-data-8-and-data-100-so-far}{%
\section{Visualizations in Data 8 and Data 100 (so
far)}\label{visualizations-in-data-8-and-data-100-so-far}}

You've likely encountered several forms of data visualizations in your
studies. You may remember two such examples from Data 8: line charts and
histograms. Each of these served a unique purpose. For example, line
charts displayed how numerical quantities changed over time, while
histograms were useful in understanding a variable's distribution.

\textbf{Line Chart}

\textbf{Histogram}

\hypertarget{goals-of-visualization}{%
\section{Goals of Visualization}\label{goals-of-visualization}}

Visualizations are useful for a number of reasons. In Data 100, we
consider two areas in particular:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  To broaden your understanding of the data

  \begin{itemize}
  \tightlist
  \item
    Key part in exploratory data analysis.
  \item
    Useful in investigating relationships between variables.
  \end{itemize}
\item
  To communicate results/conclusions to others

  \begin{itemize}
  \tightlist
  \item
    Visualization theory is especially important here.
  \end{itemize}
\end{enumerate}

One of the most common applications of visualizations is in
understanding a distribution of data.

\hypertarget{an-overview-of-distributions}{%
\section{An Overview of
Distributions}\label{an-overview-of-distributions}}

A distribution describes the frequency of unique values in a variable.
Distributions must satisfy two properties:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Each data point must belong to only one category.
\item
  The total frequency of all categories must sum to 100\%. In other
  words, their total count should equal the number of values in
  consideration.
\end{enumerate}

\textbf{Not a Valid Distribution}

\textbf{Valid Distribution}

Left Diagram: This is not a valid distribution since individuals can be
associated with more than one category, and the bar values show minutes
rather than probabilities.

Right Diagram: This example satisfies the two properties of
distributions, so it is a valid distribution.

\hypertarget{bar-plots}{%
\section{Bar Plots}\label{bar-plots}}

As we saw above, a \textbf{bar plot} is one of the most common ways of
displaying the distribution of a \textbf{qualitative} (categorical)
variable. The length of a bar encodes the frequency of a category;
the width encodes no useful information.

Let's contextualize this in an example. We will use the familiar
\texttt{births} dataset from Data 8 in our analysis.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}

\NormalTok{births }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/baby.csv"}\NormalTok{)}
\NormalTok{births.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrl}
\toprule
{} &  Birth Weight &  Gestational Days &  Maternal Age &  Maternal Height &  Maternal Pregnancy Weight &  Maternal Smoker \\
\midrule
0 &           120 &               284 &            27 &               62 &                        100 &            False \\
1 &           113 &               282 &            33 &               64 &                        135 &            False \\
2 &           128 &               279 &            28 &               64 &                        115 &             True \\
3 &           108 &               282 &            23 &               67 &                        125 &             True \\
4 &           136 &               286 &            25 &               62 &                         93 &            False \\
\bottomrule
\end{tabular}

We can visualize the distribution of the \texttt{Maternal\ Smoker}
column using a bar plot. There are a few ways to do this.

\hypertarget{plotting-in-pandas}{%
\subsection{Plotting in Pandas}\label{plotting-in-pandas}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{].value\_counts().plot(kind }\OperatorTok{=} \StringTok{\textquotesingle{}bar\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-3-output-1.png}

}

\end{figure}

Recall that \texttt{.value\_counts()} returns a \texttt{Series} with the
total count of each unique value. We call
\texttt{.plot(kind\ =\ \textquotesingle{}bar\textquotesingle{})} on this
result to visualize these counts as a bar plot.

Plotting methods in \texttt{pandas} are the least preferred and not
supported in Data 100, as their functionality is limited. Instead,
future examples will focus on other libraries built specifically for
visualizing data. The most well-known library here is
\texttt{matplotlib}.

\hypertarget{plotting-in-matplotlib}{%
\subsection{Plotting in Matplotlib}\label{plotting-in-matplotlib}}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}

\NormalTok{ms }\OperatorTok{=}\NormalTok{ births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{].value\_counts()}
\NormalTok{plt.bar(ms.index.astype(}\StringTok{\textquotesingle{}string\textquotesingle{}}\NormalTok{), ms)}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Count\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-4-output-1.png}

}

\end{figure}

While more code is required to achieve the same result,
\texttt{matplotlib} is often used over \texttt{pandas} for its ability
to plot more complex visualizations, some of which are discussed
shortly.

However, notice how we need to explicitly specify the type of the value
for the x-axis to \texttt{string}. In absence of conversion, the x-axis
will be a range of integers rather than the two categories,
\texttt{True} and \texttt{False}. This is because \texttt{matplotlib}
coerces \texttt{True} to a value of 1 and \texttt{False} to 0. Also,
note how we needed to label the axes with \texttt{plt.xlabel} and
\texttt{plt.ylabel} - \texttt{matplotlib} does not support automatic
axis labeling. To get around these inconveniences, we can use a more
efficient plotting library, \texttt{seaborn}.

\hypertarget{plotting-in-seaborn}{%
\subsection{Plotting in Seaborn}\label{plotting-in-seaborn}}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}
\NormalTok{sns.countplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-5-output-1.png}

}

\end{figure}

\texttt{seaborn.countplot} both counts and visualizes the number of
unique values in a given column. This column is specified by the
\texttt{x} argument to \texttt{sns.countplot}, while the
\texttt{DataFrame} is specified by the \texttt{data} argument.

For the vast majority of visualizations, \texttt{seaborn} is far more
concise and aesthetically pleasing than \texttt{matplotlib}. However,
the color scheme of this particular bar plot is arbitrary - it encodes
no additional information about the categories themselves. This is not
always true; color may signify meaningful detail in other
visualizations. We'll explore this more in-depth during the next
lecture.

\hypertarget{plotting-in-plotly}{%
\subsection{Plotting in Plotly}\label{plotting-in-plotly}}

\texttt{plotly} is one of the most versatile plotting libraries and is
widely used in industry. However, \texttt{plotly} has various
dependencies that make it difficult to support in Data 100. Therefore, we
have intentionally excluded the code to generate the plot above.

By now, you'll have noticed that each of these plotting libraries has a
very different syntax. As with \texttt{pandas}, we'll teach you the
important methods in \texttt{matplotlib} and \texttt{seaborn}, but
you'll learn more through documentation.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \href{https://matplotlib.org/stable/index.html}{Matplotlib
  Documentation}
\item
  \href{https://seaborn.pydata.org/}{Seaborn Documentation}
\end{enumerate}

Example Questions:

\begin{itemize}
\tightlist
\item
  What colors should we use?
\item
  How wide should the bars be?
\item
  Should the legend exist?
\item
  Should the bars and axes have dark borders?
\end{itemize}

To accomplish goal 2, here are some ways we can improve the plot:

\begin{itemize}
\tightlist
\item
  Introducing different colors for each bar
\item
  Including a legend
\item
  Including a title
\item
  Labeling the y-axis
\item
  Using color-blind friendly palettes
\item
  Re-orienting the labels
\item
  Increase the font size
\end{itemize}

\hypertarget{histograms}{%
\section{Histograms}\label{histograms}}

\textbf{Histograms} are a natural extension to bar plots; they visualize
the distribution of \textbf{quantitative} (numerical) data.

Revisiting our example with the \texttt{births} DataFrame, let's plot
the distribution of the \texttt{Maternal\ Pregnancy\ Weight} column.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births.head(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrrrrrl}
\toprule
{} &  Birth Weight &  Gestational Days &  Maternal Age &  Maternal Height &  Maternal Pregnancy Weight &  Maternal Smoker \\
\midrule
0 &           120 &               284 &            27 &               62 &                        100 &            False \\
1 &           113 &               282 &            33 &               64 &                        135 &            False \\
2 &           128 &               279 &            28 &               64 &                        115 &             True \\
3 &           108 &               282 &            23 &               67 &                        125 &             True \\
4 &           136 &               286 &            25 &               62 &                         93 &            False \\
\bottomrule
\end{tabular}

How should we define our categories for this variable? In the previous
example, these were the unique values of the \texttt{Maternal\ Smoker}
column: \texttt{True} and \texttt{False}. If we use similar logic here,
our categories are the different numerical weights contained in the
\texttt{Maternal\ Pregnancy\ Weight} column.

Under this assumption, let's plot this distribution using the
\texttt{seaborn.countplot} function.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.countplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-7-output-1.png}

}

\end{figure}

This plot clearly suffers from \textbf{overplotting}. This is
somewhat expected for \texttt{Maternal\ Pregnancy\ Weight} - it is a
quantitative variable that takes on a wide range of values.

To combat this problem, statisticians use bins to categorize numerical
data. Luckily, \texttt{seaborn} provides a helpful plotting function
that automatically bins our data.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-8-output-1.png}

}

\end{figure}

This diagram is known as a histogram. While it looks more reasonable,
notice how we lose fine-grain information on the distribution of data
contained within each bin. We can introduce rug plots to minimize this
information loss. An overlaid ``rug plot'' displays the within-bin
distribution of our data, as denoted by the thickness of the colored
line on the x-axis.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\NormalTok{sns.rugplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{, color }\OperatorTok{=} \StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-9-output-1.png}

}

\end{figure}

You may have seen histograms drawn differently - perhaps with an
overlaid \textbf{density curve} and normalized y-axis. We can display
both with a few tweaks to our code.

To visualize a density curve, we can set the \texttt{kde\ =\ True}
argument of \texttt{sns.histplot}. Setting the argument
\texttt{stat\ =\ \textquotesingle{}density\textquotesingle{}} normalizes
our histogram and displays densities, instead of counts, on the y-axis.
You'll notice that the area under the density curve is 1.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{, kde }\OperatorTok{=} \VariableTok{True}\NormalTok{, }
\NormalTok{             stat }\OperatorTok{=} \StringTok{\textquotesingle{}density\textquotesingle{}}\NormalTok{)}
\NormalTok{sns.rugplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{, color }\OperatorTok{=} \StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-10-output-1.png}

}

\end{figure}

\hypertarget{evaluating-histograms}{%
\section{Evaluating Histograms}\label{evaluating-histograms}}

Histograms allow us to assess a distribution by its shape. There are a
few properties of histograms we can analyze:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Skewness and Tails

  \begin{itemize}
  \tightlist
  \item
    Skewed left vs skewed right
  \item
    Left tail vs right tail
  \end{itemize}
\item
  Outliers

  \begin{itemize}
  \tightlist
  \item
    Defined arbitrarily for now
  \end{itemize}
\item
  Modes

  \begin{itemize}
  \tightlist
  \item
    Most commonly occurring data
  \end{itemize}
\end{enumerate}

\hypertarget{skewness-and-tails}{%
\subsection{Skewness and Tails}\label{skewness-and-tails}}

If a distribution has a long right tail (such as
\texttt{Maternal\ Pregnancy\ Weight}), it is \textbf{skewed right}. In a
right-skewed distribution, the few large outliers ``pull'' the mean to
the \textbf{right} of the median.

If a distribution has a long left tail, it is \textbf{skewed left}. In a
left-skewed distribution, the few small outliers ``pull'' the mean to
the \textbf{left} of the median.

In the case where a distribution has equal-sized right and left tails,
it is \textbf{symmetric}. The mean is approximately \textbf{equal} to
the median. Think of the mean as the balancing point of the distribution.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}

\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\NormalTok{df\_mean }\OperatorTok{=}\NormalTok{ np.mean(births[}\StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{])}
\NormalTok{df\_median }\OperatorTok{=}\NormalTok{ np.median(births[}\StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{])}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The mean is: }\SpecialCharTok{\{\}}\StringTok{ and the median is }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(df\_mean,df\_median))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
The mean is: 128.4787052810903 and the median is 125.0
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-11-output-2.png}

}

\end{figure}

\hypertarget{outliers}{%
\subsection{Outliers}\label{outliers}}

Loosely speaking, an \textbf{outlier} is defined as a data point that
lies an abnormally large distance away from other values. We'll define
the statistical measure for this shortly.

Outliers disproportionately influence the mean because their magnitude is
directly involved in computing the average. However, the median is
largely unaffected - the magnitude of an outlier is irrelevant; we only
care that it is some non-zero distance away from the midpoint of the
data.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\CommentTok{\#\# Where do we draw the line of outlier? }
\NormalTok{plt.axvline(df\_mean}\OperatorTok{*}\FloatTok{1.75}\NormalTok{, color }\OperatorTok{=} \StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-12-output-1.png}

}

\end{figure}

\hypertarget{modes}{%
\subsection{Modes}\label{modes}}

A \textbf{mode} of a distribution is a local or global maximum. A
distribution with a single clear maximum is \textbf{unimodal},
distributions with two modes are \textbf{bimodal}, and those with 3 or
more are \textbf{multimodal}. You need to distinguish between
\textbf{modes} and \emph{random noise}.

For example, the distribution of pregnancy weights for maternal smokers
is (weakly) multimodal.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births\_maternal\_smoker }\OperatorTok{=}\NormalTok{ births[births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \VariableTok{True}\NormalTok{]}
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_maternal\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
\NormalTok{            .}\BuiltInTok{set}\NormalTok{(title }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker histogram\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-13-output-1.png}

}

\end{figure}

On the other hand, the distribution of pregnancy weights for maternal
non-smokers is weakly bi-modal.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births\_maternal\_non\_smoker }\OperatorTok{=}\NormalTok{ births[births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \VariableTok{False}\NormalTok{]}
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_maternal\_non\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
\NormalTok{            .}\BuiltInTok{set}\NormalTok{(title }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Non{-}Smoker histogram\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-14-output-1.png}

}

\end{figure}

However, changing the bins reveals that the data is not bi-modal.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_maternal\_non\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{,}\OperatorTok{\textbackslash{}}
\NormalTok{             bins }\OperatorTok{=} \DecValTok{20}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-15-output-1.png}

}

\end{figure}

\hypertarget{density-curves}{%
\section{Density Curves}\label{density-curves}}

Instead of a discrete histogram, we can visualize what a continuous
distribution corresponding to that same data could look like using a
curve. The smooth curve drawn on top of the histogram here is called a
density curve.

In lecture 8, we will study how exactly to compute these density curves
(using a technique called Kernel Density Estimation).

If we plot \texttt{birth\ weights} of babies of \emph{smoking mothers},
we get a histogram that appears bimodal.

\begin{itemize}
\tightlist
\item
  Density curve reinforces belief in this bimodality.
\end{itemize}

However, if we plot \texttt{birth\ weights} of babies of
\emph{non-smoking mothers}, we get a histogram that appears unimodal.

From a goal 1 perspective, this is EDA which tells us there may be
something interesting here worth pursuing.

\begin{itemize}
\tightlist
\item
  Deeper analysis necessary!
\item
  If we found something truly interesting, we'd have to cautiously write
  up an argument and create goal 2 level visualizations.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births\_non\_maternal\_smoker }\OperatorTok{=}\NormalTok{ births[births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \VariableTok{False}\NormalTok{]}
\NormalTok{births\_maternal\_smoker }\OperatorTok{=}\NormalTok{ births[births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \VariableTok{True}\NormalTok{]}
 
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_maternal\_smoker , x }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{,}\OperatorTok{\textbackslash{}}
\NormalTok{             kde }\OperatorTok{=} \VariableTok{True}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-16-output-1.png}

}

\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_non\_maternal\_smoker , x }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{,}\OperatorTok{\textbackslash{}}
\NormalTok{             kde }\OperatorTok{=} \VariableTok{True}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-17-output-1.png}

}

\end{figure}

\hypertarget{histograms-and-density}{%
\subsection{Histograms and Density}\label{histograms-and-density}}

Rather than labeling by counts, we can instead plot the density, as
shown below. Density gives us a measure that is invariant to the total
number of observed units. The numerical values on the Y-axis for a
sample of 100 units would be the same for when we observe a sample of
10000 units instead. We can still calculate the absolute number of
observed units using density.

Example: There are 1174 observations total.

\begin{itemize}
\tightlist
\item
  Total area of this bin should be: 120/1174 = \textasciitilde10\%
\item
  Density of this bin is therefore: 10\% / (115 - 110) = 0.02
\end{itemize}

\hypertarget{box-plots-and-violin-plots}{%
\section{Box Plots and Violin Plots}\label{box-plots-and-violin-plots}}

\hypertarget{boxplots}{%
\subsection{Boxplots}\label{boxplots}}

\textbf{Boxplots} are an alternative to histograms that visualize
numerical distributions. They are especially useful in graphically
summarizing several characteristics of a distribution. These include:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Lower Quartile (\(1^{st}\) Quartile)
\item
  Median (\(2^{nd}\) Quartile)
\item
  Upper Quartile (\(3^{rd}\) Quartile)
\item
  Interquartile Range (IQR)
\item
  Whiskers
\item
  Outliers
\end{enumerate}

The \textbf{lower quartile}, \textbf{median}, and \textbf{upper quartile}
are the \(25^{th}\), \(50^{th}\), and \(75^{th}\) percentiles of data,
respectively. The \textbf{interquartile range} measures the spread of
the middle \(50\)\% of the distribution, calculated as the (\(3^{rd}\)
Quartile \(-\) \(1^{st}\) Quartile).

The \textbf{whiskers} of a box-plot are the two points that lie at the
{[}\(1^{st}\) Quartile \(-\) (\(1.5\times\) IQR){]}, and the
{[}\(3^{rd}\) Quartile \(+\) (\(1.5\times\) IQR){]}. They are the lower
and upper ranges of ``normal'' data (the points excluding outliers).
Subsequently, the \textbf{outliers} are the data points that fall beyond
the whiskers, or further than (\(1.5 \times\) IQR) from the extreme
quartiles.

Let's visualize a box-plot of the \texttt{Birth\ Weight} column.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.boxplot(data }\OperatorTok{=}\NormalTok{ births, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}

\NormalTok{bweights }\OperatorTok{=}\NormalTok{ births[}\StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{]}
\NormalTok{q1 }\OperatorTok{=}\NormalTok{ np.percentile(bweights, }\DecValTok{25}\NormalTok{)}
\NormalTok{q2 }\OperatorTok{=}\NormalTok{ np.percentile(bweights, }\DecValTok{50}\NormalTok{)}
\NormalTok{q3 }\OperatorTok{=}\NormalTok{ np.percentile(bweights, }\DecValTok{75}\NormalTok{)}
\NormalTok{iqr }\OperatorTok{=}\NormalTok{ q3 }\OperatorTok{{-}}\NormalTok{ q1}
\NormalTok{whisk1 }\OperatorTok{=}\NormalTok{ q1 }\OperatorTok{{-}}\NormalTok{ (}\FloatTok{1.5} \OperatorTok{*}\NormalTok{ iqr)}
\NormalTok{whisk2 }\OperatorTok{=}\NormalTok{ q3 }\OperatorTok{+}\NormalTok{ (}\FloatTok{1.5} \OperatorTok{*}\NormalTok{ iqr)}

\BuiltInTok{print}\NormalTok{(}\StringTok{"The first quartile is }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(q1))}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The second quartile is }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(q2))}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The third quartile is }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(q3))}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The interquartile range is }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(iqr))}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The whiskers are }\SpecialCharTok{\{\}}\StringTok{ and }\SpecialCharTok{\{\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(whisk1, whisk2))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
The first quartile is 108.0
The second quartile is 120.0
The third quartile is 131.0
The interquartile range is 23.0
The whiskers are 73.5 and 165.5
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-18-output-2.png}

}

\end{figure}

Here is a helpful visual that summarizes our discussion above.

\hypertarget{violin-plots}{%
\subsection{Violin Plots}\label{violin-plots}}

Another diagram that is useful in visualizing a variable's distribution
is the violin plot. A \textbf{violin plot} supplements a box-plot with a
smoothed density curve on either side of the plot. These density curves
highlight the relative frequency of a variable's possible values. If you
look closely, you'll be able to discern the quartiles, whiskers, and
other hallmark features of the box-plot.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.violinplot(data }\OperatorTok{=}\NormalTok{ births, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-19-output-1.png}

}

\end{figure}

\hypertarget{comparing-quantitative-distributions}{%
\section{Comparing Quantitative
Distributions}\label{comparing-quantitative-distributions}}

Earlier in our discussion of the mode, we visualized two histograms that
described the distribution of birth weights for maternal smokers and
non-smokers. However, comparing these histograms was difficult because
they were displayed on separate plots. Can we overlay the two to tell a
more compelling story?

In \texttt{seaborn}, multiple calls to a plotting function in the same
code cell will overlay the plots. For example:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births\_maternal\_smoker }\OperatorTok{=}\NormalTok{ births[births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \VariableTok{False}\NormalTok{]}
\NormalTok{births\_non\_maternal\_smoker }\OperatorTok{=}\NormalTok{ births[births[}\StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \VariableTok{True}\NormalTok{]}

\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_maternal\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{,}
\NormalTok{             color }\OperatorTok{=} \StringTok{\textquotesingle{}orange\textquotesingle{}}\NormalTok{, label }\OperatorTok{=} \StringTok{\textquotesingle{}smoker\textquotesingle{}}\NormalTok{)}
\NormalTok{sns.histplot(data }\OperatorTok{=}\NormalTok{ births\_non\_maternal\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{,}
\NormalTok{             color }\OperatorTok{=} \StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, label }\OperatorTok{=} \StringTok{\textquotesingle{}nonsmoker\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.legend()}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-20-output-1.png}

}

\end{figure}

However, notice how this diagram suffers from overplotting. We can fix
this with a call to \texttt{sns.kdeplot}. This will remove the bins and
overlay the histogram with a density curve that better summarizes the
distribution.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.kdeplot(data }\OperatorTok{=}\NormalTok{ births\_maternal\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{, color }\OperatorTok{=} \StringTok{\textquotesingle{}orange\textquotesingle{}}\NormalTok{, label }\OperatorTok{=} \StringTok{\textquotesingle{}smoker\textquotesingle{}}\NormalTok{)}
\NormalTok{sns.kdeplot(data }\OperatorTok{=}\NormalTok{ births\_non\_maternal\_smoker, x }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{, color }\OperatorTok{=} \StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, label }\OperatorTok{=} \StringTok{\textquotesingle{}nonsmoker\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.legend()}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-21-output-1.png}

}

\end{figure}

Unfortunately, we lose critical information in our distribution by
removing small details. Therefore, we typically prefer to use box-plots
and violin plots when comparing distributions. These are more concise
and allow us to compare summary statistics across many distributions.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.violinplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-22-output-1.png}

}

\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.boxplot(data}\OperatorTok{=}\NormalTok{births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_1/visualization_1_files/figure-pdf/cell-23-output-1.png}

}

\end{figure}

\hypertarget{ridge-plots}{%
\section{Ridge Plots}\label{ridge-plots}}

Ridge plots show many density curves offset from one another with
minimal overlap. They are useful when the specific shape of each curve
is important.

\bookmarksetup{startatroot}

\hypertarget{visualization-ii}{%
\chapter{Visualization II}\label{visualization-ii}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Use KDE for estimating density curve.
\item
  Using transformations to analyze the relationship between two
  variables.
\item
  Evaluating quality of a visualization based on visualization theory
  concepts.
\end{itemize}

\end{tcolorbox}

\hypertarget{kernel-density-functions}{%
\section{Kernel Density Functions}\label{kernel-density-functions}}

\hypertarget{kde-mechanics}{%
\subsection{KDE Mechanics}\label{kde-mechanics}}

In the last lecture, we learned that density curves are smooth,
continuous functions that represent a distribution of values. In this
section, we'll learn how to construct density curves using Kernel
Density Estimation (KDE).

\hypertarget{smoothing}{%
\subsubsection{Smoothing}\label{smoothing}}

Kernel Density Estimation involves a technique called \textbf{smoothing}
- a process applied to a distribution of values that allows us to
analyze the more general structure of the dataset.

Many of the visualizations we learned during the last lecture are
examples of smoothing. Histograms are smoothed versions of
one-dimensional rug plots, and hex plots are smoother alternatives to
two-dimensional scatter plots. They remove the detail from individual
observations so we can visualize the larger properties of our
distribution.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}

\NormalTok{titanic }\OperatorTok{=}\NormalTok{ sns.load\_dataset(}\StringTok{\textquotesingle{}titanic\textquotesingle{}}\NormalTok{)}
\NormalTok{sns.rugplot(titanic[}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{],height }\OperatorTok{=} \FloatTok{0.5}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-2-output-1.png}

}

\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.histplot(titanic[}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{])}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-3-output-1.png}

}

\end{figure}

\hypertarget{kernel-density-estimation}{%
\subsubsection{Kernel Density
Estimation}\label{kernel-density-estimation}}

\textbf{Kernel Density Estimation} is a smoothing technique that allows
us to estimate a density curve (also known as a probability density
function) from a set of observations. There are a few steps in this
process:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Place a kernel at each data point
\item
  Normalize kernels to have total area of 1 (across all kernels)
\item
  Sum kernels together
\end{enumerate}

Suppose we have 5 data points: \([2.2, 2.8, 3.7, 5.3, 5.7]\). We wish to
recreate the following Kernel Density Estimate:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OperatorTok{=}\NormalTok{ [}\FloatTok{2.2}\NormalTok{, }\FloatTok{2.8}\NormalTok{, }\FloatTok{3.7}\NormalTok{, }\FloatTok{5.3}\NormalTok{, }\FloatTok{5.7}\NormalTok{]}
\NormalTok{sns.kdeplot(data)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-4-output-1.png}

}

\end{figure}

Let's walk through each step to construct this density curve.

\hypertarget{step-1---place-a-kernel-at-each-data-point}{%
\paragraph{Step 1 - Place a Kernel at Each Data
Point}\label{step-1---place-a-kernel-at-each-data-point}}

To begin generating a density curve, we need to choose a \textbf{kernel}
and \textbf{bandwidth value (\(\alpha\))}. What are these exactly? A
\textbf{kernel} is a density curve itself, and the \textbf{bandwidth
(\(\alpha\))} is a measure of the kernel's width. Recall that a valid
density has an area of 1.

At each of our 5 points (depicted in the rug plot on the left), we've
placed a Gaussian kernel with a bandwidth parameter of \(\alpha = 1\). We'll
explore what these are in the next section.

\textbf{Rugplot of Data}

\textbf{Kernelized Data}

\hypertarget{step-2---normalize-kernels-to-have-total-area-of-1}{%
\paragraph{Step 2 - Normalize Kernels to Have Total Area of
1}\label{step-2---normalize-kernels-to-have-total-area-of-1}}

Notice how these 5 kernels are density curves - meaning they each have
an area of 1. In Step 3, we will be summing each of these kernels, and we
want the result to be a valid density that has an area of 1. Therefore,
it makes sense to normalize our current set of kernels by multiplying
each by \(\frac{1}{5}\).

\textbf{Kernelized Data}

\textbf{Normalized Kernels}

\hypertarget{step-3---sum-kernels-together}{%
\paragraph{Step 3 - Sum Kernels
Together}\label{step-3---sum-kernels-together}}

Our kernel density estimate (KDE) is the sum of the normalized kernels
along the x-axis. It is depicted below on the right.

\textbf{Normalized Kernels}

\textbf{Kernel Density Estimate}

\hypertarget{kernel-functions-and-bandwidth}{%
\subsection{Kernel Functions and
Bandwidth}\label{kernel-functions-and-bandwidth}}

\hypertarget{kernels}{%
\subsubsection{Kernels}\label{kernels}}

A \textbf{kernel} (for our purposes) is a valid density function. This
means it:

\begin{itemize}
\tightlist
\item
  Must be non-negative for all inputs.
\item
  Must integrate to 1.
\end{itemize}

A general ``KDE formula'' function is given below:
\[f_{\alpha}(x) = \frac{1}{n} \sum_{i=1}^{n} K_{\alpha}(x, x_i)\]

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \(K_{\alpha}(x, x_i)\) is the kernel centered on the observation
  \texttt{i}.

  \begin{itemize}
  \tightlist
  \item
    Each kernel individually has area 1.
  \item
    x represents any number on the number line. It is the input to our
    function.
  \end{itemize}
\item
  \(n\) is the number of observed data points that we have.

  \begin{itemize}
  \tightlist
  \item
    We multiply by \(\frac{1}{n}\) so that the total area of the KDE is
    still 1.
  \end{itemize}
\item
  Each \(x_i \in \{x_1, x_2, \dots, x_n\}\) represents an observed data
  point.

  \begin{itemize}
  \tightlist
  \item
    These are what we use to create our KDE by summing multiple shifted
    kernels centered at these points.
  \end{itemize}
\end{enumerate}

\(\alpha\) (alpha) is the bandwidth or smoothing parameter.

\hypertarget{gaussian-kernel}{%
\paragraph{Gaussian Kernel}\label{gaussian-kernel}}

The most common kernel is the \textbf{Gaussian kernel}. The Gaussian
kernel is equivalent to the Gaussian probability density function (the
Normal distribution), centered at the observed value \(x_i\) with a
standard deviation of \(\alpha\) (this is known as the
\textbf{bandwidth} parameter).

\[K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2\pi\alpha^{2}}}e^{-\frac{(x-x_i)^{2}}{2\alpha^{2}}}\]

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt }

\KeywordTok{def}\NormalTok{ gaussian\_kernel(alpha, x, z):}
    \ControlFlowTok{return} \FloatTok{1.0}\OperatorTok{/}\NormalTok{np.sqrt(}\FloatTok{2.} \OperatorTok{*}\NormalTok{ np.pi }\OperatorTok{*}\NormalTok{ alpha}\OperatorTok{**}\DecValTok{2}\NormalTok{) }\OperatorTok{*}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{(x }\OperatorTok{{-}}\NormalTok{ z) }\OperatorTok{**} \DecValTok{2} \OperatorTok{/}\NormalTok{ (}\FloatTok{2.0} \OperatorTok{*}\NormalTok{ alpha}\OperatorTok{**}\DecValTok{2}\NormalTok{))}

\NormalTok{xs }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{5}\NormalTok{, }\DecValTok{5}\NormalTok{, }\DecValTok{200}\NormalTok{)}
\NormalTok{alpha }\OperatorTok{=} \DecValTok{1}
\NormalTok{kde\_curve }\OperatorTok{=}\NormalTok{ [gaussian\_kernel(alpha, x, }\DecValTok{0}\NormalTok{) }\ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ xs]}
\NormalTok{plt.plot(xs, kde\_curve)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-5-output-1.png}

}

\caption{The Gaussian kernel centered at 0 with bandwidth \(\alpha\) =
1.}

\end{figure}

If you've taken a probability class, you'll recognize that the mean of
this Gaussian kernel is \(x_i\) and the standard deviation is
\(\alpha\). Increasing \(\alpha\) - equivalently, the bandwidth -
smooths the density curve. Curves produced with larger values of
\(\alpha\) are typically easier to understand; however, we begin to lose
important distributional information.

Here is how adjusting \(\alpha\) affects a distribution in some variable
from an arbitrary dataset.

\textbf{Gaussian Kernel, \(\alpha\) = 0.1}

\textbf{Gaussian Kernel, \(\alpha\) = 1}

\textbf{Gaussian Kernel, \(\alpha\) = 2}

\textbf{Gaussian Kernel, \(\alpha\) = 10}

\hypertarget{boxcar-kernel}{%
\paragraph{Boxcar Kernel}\label{boxcar-kernel}}

Another example of a kernel is the \textbf{Boxcar kernel}. The boxcar
kernel assigns a uniform density to points within a ``window'' of the
observation, and a density of 0 elsewhere. The equation below is a
Boxcar kernel with the center at \(x_i\) and the bandwidth of
\(\alpha\).

\[K_{\alpha}(x, x_i) = \begin{cases}  \frac{1}{\alpha}, & |x - x_i| \le \frac{\alpha}{2}\\  0, & \text{otherwise}  \end{cases}\]

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ boxcar\_kernel(alpha, x, z):}
    \ControlFlowTok{return}\NormalTok{ (((x}\OperatorTok{{-}}\NormalTok{z)}\OperatorTok{\textgreater{}={-}}\NormalTok{alpha}\OperatorTok{/}\DecValTok{2}\NormalTok{)}\OperatorTok{\&}\NormalTok{((x}\OperatorTok{{-}}\NormalTok{z)}\OperatorTok{\textless{}=}\NormalTok{alpha}\OperatorTok{/}\DecValTok{2}\NormalTok{))}\OperatorTok{/}\NormalTok{alpha}

\NormalTok{xs }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{5}\NormalTok{, }\DecValTok{5}\NormalTok{, }\DecValTok{200}\NormalTok{)}
\NormalTok{alpha}\OperatorTok{=}\DecValTok{1}
\NormalTok{kde\_curve }\OperatorTok{=}\NormalTok{ [boxcar\_kernel(alpha, x, }\DecValTok{0}\NormalTok{) }\ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ xs]}
\NormalTok{plt.plot(xs, kde\_curve)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-6-output-1.png}

}

\caption{The Boxcar kernel centered at 0 with bandwidth \(\alpha\) = 1.}

\end{figure}

The diagram on the right is how the density curve for our 5 point
dataset would have looked had we used the Boxcar kernel with bandwidth
\(\alpha\) = 1.

\hypertarget{relationships-between-quantitative-variables}{%
\subsection{Relationships Between Quantitative
Variables}\label{relationships-between-quantitative-variables}}

Up until now, we've discussed how to visualize single-variable
distributions. Going beyond this, we want to understand the relationship
between pairs of numerical variables.

\hypertarget{scatter-plots}{%
\subsubsection{Scatter Plots}\label{scatter-plots}}

\textbf{Scatter plots} are one of the most useful tools in representing
the relationship between two quantitative variables. They are
particularly important in gauging the strength, or correlation, of the
relationship between variables. Knowledge of these relationships can
then motivate decisions in our modeling process.

For example, let's plot a scatter plot comparing the
\texttt{Maternal\ Height} and \texttt{Birth\ Weight} columns, using both
\texttt{matplotlib} and \texttt{seaborn}.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\NormalTok{births }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/baby.csv"}\NormalTok{)}
\NormalTok{births.head(}\DecValTok{5}\NormalTok{)}

\CommentTok{\# Matplotlib Example}
\NormalTok{plt.scatter(births[}\StringTok{\textquotesingle{}Maternal Height\textquotesingle{}}\NormalTok{], births[}\StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{])}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Maternal Height\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-7-output-1.png}

}

\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Seaborn Example}
\NormalTok{sns.scatterplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Height\textquotesingle{}}\NormalTok{, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{,}
\NormalTok{                hue }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<AxesSubplot: xlabel='Maternal Height', ylabel='Birth Weight'>
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-8-output-2.png}

}

\end{figure}

This is an example where color is used to add a third dimension to our
plot. This is possible with the \texttt{hue} parameter in
\texttt{seaborn}, which adds a categorical column encoding to an
existing visualization. This way, we can look for relationships in
\texttt{Maternal\ Height} and \texttt{Birth\ Weight} in both maternal
smokers and non-smokers. If we wish to see the relationship's strength
more clearly, we can use \texttt{sns.lmplot}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.lmplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Height\textquotesingle{}}\NormalTok{, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{, }
\NormalTok{           hue }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{, ci }\OperatorTok{=} \VariableTok{False}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-9-output-1.png}

}

\end{figure}

We can make out a weak, positive relationship between the mother's
height and birth weight for both maternal smokers and non-smokers (the
baseline is slightly lower in maternal smokers).

\hypertarget{overplotting}{%
\subsection{Overplotting}\label{overplotting}}

As you may have noticed, the scatterplots of \texttt{Maternal\ Height}
vs.~\texttt{Birth\ Weight} have many densely plotted areas. Many of the
points are on top of one another! This makes it difficult to tell exactly
how many babies are plotted in each of the more densely populated regions
of the graph. This can arise when the tools used for measuring data have
low granularity, many different values are rounded to the same value, or
if the ranges of the two variables differ greatly in scale.

We can overcome this by introducing a small amount of uniform random
noise to our data. This is called \emph{jittering}. Let's see what
happens when we introduce noise to the \texttt{Maternal\ Height}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{births[}\StringTok{"Maternal Height (jittered)"}\NormalTok{] }\OperatorTok{=}\NormalTok{ births[}\StringTok{"Maternal Height"}\NormalTok{] }\OperatorTok{+}\NormalTok{ np.random.uniform(}\OperatorTok{{-}}\FloatTok{0.2}\NormalTok{, }\FloatTok{0.2}\NormalTok{, }\BuiltInTok{len}\NormalTok{(births))}
\NormalTok{sns.lmplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Height (jittered)\textquotesingle{}}\NormalTok{, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{, }
\NormalTok{           hue }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Smoker\textquotesingle{}}\NormalTok{, ci }\OperatorTok{=} \VariableTok{False}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-10-output-1.png}

}

\end{figure}

This plot more clearly shows that most of the data is clustered tightly
around the point (62.5,120) and gradually becomes more loose further
away from the center. It is much easier for us and others to see how the
data is distributed. In conclusion, \emph{jittering} helps us better
understand our own data (Goal 1) and communicate results to others (Goal
2).

\hypertarget{hex-plots-and-contour-plots}{%
\subsubsection{Hex Plots and Contour
Plots}\label{hex-plots-and-contour-plots}}

Unfortunately, our scatter plots above suffered from overplotting, which
made them hard to interpret. And with a large number of points,
jittering is unlikely to resolve the issue. Instead, we can look to hex
plots and contour plots.

\textbf{Hex Plots} can be thought of as a two dimensional histogram that
shows the joint distribution between two variables. This is particularly
useful when working with very dense data.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.jointplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Pregnancy Weight\textquotesingle{}}\NormalTok{, }
\NormalTok{              y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{, kind }\OperatorTok{=} \StringTok{\textquotesingle{}hex\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<seaborn.axisgrid.JointGrid at 0x7f8830294430>
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-11-output-2.png}

}

\end{figure}

The axes are evidently binned into hexagons, which makes the linear
relationship easier to decipher. Darker regions generally indicate a
higher density of points.

On the other hand, \textbf{contour plots} are two dimensional versions
of density curves with marginal distributions of each variable on the
axes. We've used very similar code here to generate our contour plots,
with the addition of the
\texttt{kind\ =\ \textquotesingle{}kde\textquotesingle{}} and
\texttt{fill\ =\ True} arguments.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.jointplot(data }\OperatorTok{=}\NormalTok{ births, x }\OperatorTok{=} \StringTok{\textquotesingle{}Maternal Height\textquotesingle{}}\NormalTok{, y }\OperatorTok{=} \StringTok{\textquotesingle{}Birth Weight\textquotesingle{}}\NormalTok{,}\OperatorTok{\textbackslash{}}
\NormalTok{              kind }\OperatorTok{=} \StringTok{\textquotesingle{}kde\textquotesingle{}}\NormalTok{, fill }\OperatorTok{=} \VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<seaborn.axisgrid.JointGrid at 0x7f88450e60a0>
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{visualization_2/visualization_2_files/figure-pdf/cell-12-output-2.png}

}

\end{figure}

\hypertarget{transformations}{%
\section{Transformations}\label{transformations}}

These last two lectures have covered visualizations in great depth. We
looked at various forms of visualizations, plotting libraries, and
high-level theory.

Much of this was done to uncover insights in data, which will prove
necessary for the modeling process. A strong graphical correlation
between two variables hints at an underlying relationship that warrants
further study. However, relying on visual relationships alone is
limiting - not all plots show association. The presence of outliers and
other statistical anomalies makes it hard to interpret data.

\textbf{Transformations} are the process of manipulating data to find
significant relationships between variables. These are often found by
applying mathematical functions to variables that ``transform'' their
range of possible values and highlight some previously hidden
associations between data.

\hypertarget{transforming-a-distribution}{%
\subsubsection{Transforming a
Distribution}\label{transforming-a-distribution}}

When a distribution has a large dynamic range, it can be useful to take
the logarithm of the data. For example, computing the logarithm of the
ticket prices on the Titanic reduces skewness and yields a distribution
that is more ``spread'' across the x-axis. While it makes individual
observations harder to interpret, the distribution is more favorable for
subsequent analysis.

\hypertarget{linearizing-a-relationship}{%
\subsubsection{Linearizing a
Relationship}\label{linearizing-a-relationship}}

Transformations are perhaps most useful to \textbf{linearize a
relationship} between variables. If we find a transformation to make a
scatter plot of two variables linear, we can ``backtrack'' to find the
exact relationship between the variables. Linear relationships are
particularly simple to interpret, and we'll be doing a lot of linear
modeling in Data 100 - starting next week!

Say we want to understand the relationship between healthcare and life
expectancy. Intuitively there should be a positive correlation, but upon
plotting values from a dataset, we find a non-linear relationship that
is somewhat hard to understand. However, applying a logarithmic
transformation to both variables - healthcare and life expectancy -
results in a scatter plot with a linear trend that we can interpret.

How can we find the relationship between the original variables? We know
that taking a log of both axes gave us a linear relationship, so we can
say (roughly) that

\[\log y= a\times\log x + b\]

Solving for \(y\) implies a \textbf{power} relationship in the original
plot.

\[y= e^{a\times\log x + b}\] \[y= Ce^{a\times\log x}\] \[y= Cx^{a}\]

How did we know that taking the logarithm of both variables would result
in a linear relationship? The \textbf{Tukey-Mosteller Bulge Diagram} is
helpful here. We can use the direction of the bulge in our original
data to find the appropriate transformations that will linearize the
relationship. These transformations are found on axes that are nearest
to the bulge. The bulge in our earlier example lay in Quadrant 2, so
the transformations \(\log x\), \(\sqrt x\), \(y^{2}\), or \(y^{3}\) are
possible contenders. It's important to note that this diagram is not
perfect, and some transformations will work better than others. In our
case, \(\log x\) and \(\log y\) (found in Quadrant 3) were the best.

\hypertarget{additional-remarks}{%
\subsubsection{Additional Remarks}\label{additional-remarks}}

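Visualization requires a lot of thought!

\begin{itemize}
\tightlist
\item
  There are many tools for visualizing distributions.

  \begin{itemize}
  \tightlist
  \item
    Distribution of a single variable:

    \begin{enumerate}
    \tightlist
    \item
      rug plot
    \item
      histogram
    \item
      density plot
    \item
      box plot
    \item
      violin plot
    \end{enumerate}
  \item
    Joint distribution of two quantitative variables:

    \begin{enumerate}
    \tightlist
    \item
      scatter plot
    \item
      hex plot
    \item
      contour plot
    \end{enumerate}
  \end{itemize}
\end{itemize}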

This class primarily uses \texttt{seaborn} and \texttt{matplotlib}, but
\texttt{Pandas} also has basic built-in plotting methods. Many other
visualization libraries exist, and \texttt{plotly} is one of them.

\begin{itemize}
\tightlist
\item
  \texttt{plotly} very easily creates interactive plots.
\item
  \texttt{plotly} will occasionally appear in lecture code, labs, and
  assignments!
\end{itemize}

Next, we'll go deeper into the theory behind visualization.

\hypertarget{visualization-theory}{%
\section{Visualization Theory}\label{visualization-theory}}

This section marks a pivot to the second major topic of this lecture -
visualization theory. We'll discuss the abstract nature of
visualizations and analyze how they convey information.

Remember, we had two goals for visualizing data. This section is
particularly important in:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Helping us understand the data and results
\item
  Communicating our results and conclusions with others
\end{enumerate}

\hypertarget{information-channels}{%
\subsection{Information Channels}\label{information-channels}}

Visualizations are able to convey information through various encodings.
In the remainder of this lecture, we'll look at the use of color, scale,
and depth, to name a few.

\hypertarget{encodings-in-rugplots}{%
\subsubsection{Encodings in Rugplots}\label{encodings-in-rugplots}}

One detail that we may have overlooked in our earlier discussion of
rugplots is the importance of encodings. Rugplots are effective visuals
because they utilize line thickness to encode frequency. Consider the
following diagram:

\hypertarget{multi-dimensional-encodings}{%
\subsubsection{Multi-Dimensional
Encodings}\label{multi-dimensional-encodings}}

Encodings are also useful for representing multi-dimensional data.
Notice how the following visual highlights four distinct ``dimensions''
of data:

\begin{itemize}
\tightlist
\item
  X-axis
\item
  Y-axis
\item
  Area
\item
  Color
\end{itemize}

The human visual perception system is only capable of visualizing data
in three dimensions, but as you've seen, we can encode many more
channels of information.

\hypertarget{harnessing-the-axes}{%
\subsection{Harnessing the Axes}\label{harnessing-the-axes}}

\hypertarget{consider-scale-of-the-data}{%
\subsubsection{Consider Scale of the
Data}\label{consider-scale-of-the-data}}

However, we should be careful to not misrepresent relationships in our
data by manipulating the scale or axes. The visualization below
improperly portrays two seemingly independent relationships on the same
plot. The authors have clearly changed the scale of the y-axis to
mislead their audience.

Notice how the downwards-facing line segment contains values in the
millions, while the upwards-trending segment only contains values near
three hundred thousand. These lines should not be intersecting.

When there is a large difference in the magnitude of the data, it's
advised to analyze percentages instead of counts. The following diagrams
correctly display the trends in cancer screening and abortion rates.

\hypertarget{reveal-the-data}{%
\subsubsection{Reveal the Data}\label{reveal-the-data}}

Great visualizations not only consider the scale of the data, but also
utilize the axes in a way that best conveys information. For example,
data scientists commonly set certain axes limits to highlight parts of
the visualization they are most interested in.

The visualization on the right captures the trend in coronavirus cases
during the month of March 2020. From only looking at the visualization
on the left, a viewer may incorrectly believe that coronavirus cases
began to skyrocket on March 4\textsuperscript{th}, 2020. However, the
second illustration tells a different story - cases rose closer to March
21\textsuperscript{st}, 2020.

\hypertarget{harnessing-color}{%
\subsection{Harnessing Color}\label{harnessing-color}}

Color is another important feature in visualizations that does more than
what meets the eye.

Last lecture, we used color to encode a categorical variable in our
scatter plot. In this section, we will discuss uses of color in novel
visualizations like colormaps and heatmaps.

5--8\% of the world is red-green color blind, so we have to be very
particular about our color scheme. We want to make these as accessible
as possible. Choosing a set of colors which work together is evidently a
challenging task!

\hypertarget{colormaps}{%
\subsubsection{Colormaps}\label{colormaps}}

Colormaps are mappings from pixel data to color values, and they're
often used to highlight distinct parts of an image. Let's investigate a
few properties of colormaps.

\textbf{Jet Colormap}

\textbf{Viridis Colormap}

The jet colormap is infamous for being misleading. While it seems more
vibrant than viridis, the aggressive colors poorly encode numerical
data. To understand why, let's analyze the following images.

The diagram on the left compares how a variety of colormaps represent
pixel data that transitions from a high to low intensity. These include
the jet colormap (row a) and grayscale (row b). Notice how the grayscale
images do the best job in smoothly transitioning between pixel data. The
jet colormap is the worst at this - the four images in row (a) look like
a conglomeration of individual colors.

The difference is also evident in the images labeled (a) and (b) on the
left side. The grayscale image is better at preserving finer detail in
the vertical line strokes. Additionally, grayscale is preferred in x-ray
scans for being more neutral. The intensity of dark red color in the jet
colormap is frightening and indicates something is wrong.

Why is the jet colormap so much worse? The answer lies in how its color
composition is perceived by the human eye.

\textbf{Jet Colormap Perception}

\textbf{Viridis Colormap Perception}

The jet colormap is largely misleading because it is not perceptually
uniform. \textbf{Perceptually uniform colormaps} have the property that
if the pixel data goes from 0.1 to 0.2, the perceptual change is the
same as when the data goes from 0.8 to 0.9.

Notice how the said uniformity is present within the linear trend
displayed in the viridis colormap. On the other hand, the jet colormap
is largely non-linear - this is precisely why it's considered a worse
colormap.

\hypertarget{harnessing-markings}{%
\subsection{Harnessing Markings}\label{harnessing-markings}}

In our earlier discussion of multi-dimensional encodings, we analyzed a
scatter plot with four pseudo-dimensions: the two axes, area, and color.
Were these appropriate to use? The following diagram analyzes how well
the human eye can distinguish between these ``markings''.

There are a few key takeaways from this diagram:

\begin{itemize}
\tightlist
\item
  Lengths are easy to discern. Don't use plots with jiggled baselines -
  keep everything axis-aligned.
\item
  Avoid pie charts! Angle judgements are inaccurate.
\item
  Areas and volumes are hard to distinguish (area charts, word clouds,
  etc)
\end{itemize}

\hypertarget{harnessing-conditioning}{%
\subsection{Harnessing Conditioning}\label{harnessing-conditioning}}

Conditioning is the process of comparing data that belong to separate
groups. We've seen this before in overlaid distributions, side-by-side
box-plots, and scatter plots with categorical encodings. Here, we'll
introduce terminology that formalizes these examples.

Consider an example where we want to analyze income earnings for males
and females with varying levels of education. There are multiple ways to
compare this data.

The barplot is an example of \textbf{juxtaposition}: placing multiple
plots side by side, with the same scale. The scatter plot is an example
of \textbf{superposition}: placing multiple density curves or scatter
plots on top of each other.

Which is better depends on the problem at hand. Here, superposition
makes the precise wage difference very clear from a quick glance. But
many sophisticated plots convey information that favors the use of
juxtaposition. Below is one example.

\hypertarget{harnessing-context}{%
\subsection{Harnessing Context}\label{harnessing-context}}

The last component to a great visualization is perhaps the most
critical---the use of context. Adding informative titles, axis labels, and
descriptive captions are all best practices that we've heard repeatedly
in Data 8.

A publication-ready plot (and every Data 100 plot) needs:

\begin{itemize}
\tightlist
\item
  Informative title (takeaway, not description)
\item
  Axis labels
\item
  Reference lines, markers, etc
\item
  Legends, if appropriate
\item
  Captions that describe data
\end{itemize}

Captions should be:

\begin{itemize}
\tightlist
\item
  Comprehensive and self-contained
\item
  Describe what has been graphed
\item
  Draw attention to important features
\item
  Describe conclusions drawn from graphs
\end{itemize}

\bookmarksetup{startatroot}

\hypertarget{sampling}{%
\chapter{Sampling}\label{sampling}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Understand how to appropriately collect data to help answer a
  question.
\end{itemize}

\end{tcolorbox}

In Data Science, understanding characteristics of a population starts
with having quality data to investigate. While it is often impossible to
collect all the data describing a population, we can overcome this by
properly sampling from the population. In this note, we will discuss
appropriate techniques for sampling from populations.

\begin{figure}

{\centering \includegraphics{sampling/images/data_life_cycle_sampling.png}

}

\caption{Lifecycle diagram}

\end{figure}

\hypertarget{censuses-and-surveys}{%
\section{Censuses and Surveys}\label{censuses-and-surveys}}

In general: a \textbf{census} is ``an official count or survey of a
population, typically recording various details of individuals.''

\begin{itemize}
\tightlist
\item
  Example: The U.S. Decennial Census was held in April 2020, and it
  counts \textbf{every person} living in all 50 states, DC, and US
  territories. (Not just citizens.) Participation is required by law (it
  is mandated by the U.S. Constitution). Important uses include the
  allocation of Federal funds, congressional representation, and drawing
  congressional and state legislative districts. The census is composed
  of a \textbf{survey} mailed to different housing addresses in the
  United States.
\item
  \textbf{Individuals} in a population are not always people. Other
  populations include: bacteria in your gut (sampled using DNA
  sequencing); trees of a certain species; small businesses receiving a
  microloan; or published results in an academic journal / field.
\end{itemize}

A \textbf{survey} is a set of questions. An example is workers sampling
individuals and households. What is asked, and how it is asked, can
affect how the respondent answers, or even whether the respondent
answers in the first place.

While censuses are great, it is often difficult and expensive to survey
everyone in a population. Thus, we usually survey a subset of the
population instead.

A \textbf{sample} is often used to make inferences about the population.
That being said, how the sample is drawn will affect the reliability of
such inferences. Two common sources of error in sampling are
\textbf{chance error}, where random samples can vary from what is
expected, in any direction; and \textbf{bias}, which is a systematic
error in one direction.

Because of how surveys and samples are drawn, it turns out that samples
are usually---but not always---a subset of the population:

\begin{itemize}
\tightlist
\item
  \textbf{Population}: The group that you want to learn something about.
\item
  \textbf{Sampling Frame}: The list from which the sample is drawn. For
  example, if sampling people, then the sampling frame is the set of all
  people that could possibly end up in your sample.
\item
  \textbf{Sample}: Who you actually end up sampling. The sample is
  therefore a subset of your \emph{sampling frame}.
\end{itemize}

While ideally these three sets would be exactly the same, in practice
they usually aren't. For example, there may be individuals in your
sampling frame (and hence, your sample) that are not in your population.
And generally, sample sizes are much smaller than population sizes.

\begin{figure}

{\centering \includegraphics{sampling/images/samplingframe.png}

}

\caption{Sampling\_Frames}

\end{figure}

\hypertarget{bias-a-case-study}{%
\section{Bias: A Case Study}\label{bias-a-case-study}}

The following case study is adapted from \emph{Statistics} by Freedman,
Pisani, and Purves, W.W. Norton NY, 1978.

In 1936, President Franklin D. Roosevelt (D) went up for re-election
against Alf Landon (R). As is usual, \textbf{polls} were conducted in
the months leading up to the election to try and predict the outcome.
The \emph{Literary Digest} was a magazine that had successfully
predicted the outcome of 5 general elections coming into 1936. In their
polling for the 1936 election, they sent out their survey to 10 million
individuals, who they found from phone books, lists of magazine
subscribers, and lists of country club members. Of the roughly 2.4
million people who filled out the survey, only 43\% reported they would
vote for Roosevelt; thus the \emph{Digest} predicted that Landon would
win.

On election day, Roosevelt won in a landslide, winning 61\% of the
popular vote of about 45 million voters. How could the \emph{Digest}
have been so wrong with their polling?

It turns out that the \emph{Literary Digest} sample was not
representative of the population. Their sampling frame inherently skewed
towards more affluent voters, who tended to vote Republican, and they
completely overlooked the lion's share of voters who were still
suffering through the Great Depression. Furthermore, they had a dismal
response rate (about 24\%); who knows how the other non-respondents
would have polled? The \emph{Digest} folded just 18 months after this
disaster.

At the same time, George Gallup, a rising statistician, also made
predictions about the 1936 elections. His estimate (56\% Roosevelt) was
much closer despite having a smaller sample size of ``only'' 50,000
(still more than necessary; more when we cover the Central Limit
Theorem). Gallup also predicted the \emph{Digest}'s prediction within
1\%, with a sample size of only 3000 people. He did so by anticipating
the \emph{Digest}'s affluent sampling frame and subsampled those
individuals. The \textbf{Gallup Poll} today is one of the leading polls
for election results.

So what's the moral of the story? Samples, while convenient, are subject
to chance error and \textbf{bias}. Election polling, in particular, can
involve many sources of bias. To name a few:

\begin{itemize}
\tightlist
\item
  \textbf{Selection bias} systematically excludes (or favors) particular
  groups.
\item
  \textbf{Response bias} occurs because people don't always respond
  truthfully. Survey designers pay special attention to the nature and
  wording of questions to avoid this type of bias.
\item
  \textbf{Non-response bias} occurs because people don't always respond
  to survey requests, which can skew responses. For example, the Gallup
  poll is conducted through landline phone calls, but many different
  populations in the U.S. do not pay for a landline, and still more do
  not always answer the phone. Surveyors address this bias by staying
  persistent and keeping surveys short.
\end{itemize}

\hypertarget{probability-samples}{%
\section{Probability Samples}\label{probability-samples}}

When sampling, it is essential to focus on the quality of the sample
rather than the quantity of the sample. A huge sample size does not fix
a bad sampling method. Our main goal is to gather a sample that is
representative of the population it came from. The most common way to
accomplish this is by randomly sampling from the population.

\begin{itemize}
\tightlist
\item
  A \textbf{convenience sample} is whatever you can get ahold of. Note
  that haphazard sampling is not necessarily random sampling; there are
  many potential sources of bias.
\item
  In a \textbf{probability sample}, we know the chance any given set of
  individuals will be in the sample.

  \begin{itemize}
  \tightlist
  \item
    Probability samples allow us to estimate the bias and chance error,
    which helps us quantify uncertainty (more in a future lecture).
  \item
    Note that this does not imply that all individuals in the population
    need have the same chance of being selected (see: stratified random
    samples).
  \item
    Further note that the real world is usually more complicated. For
    example, we do not generally know the probability that a given
    bacterium is in a microbiome sample, or whether people will answer
    when Gallup calls landlines. That being said, we try to model
    probability sampling where possible if the sampling or measurement
    process is not fully under our control.
  \end{itemize}
\end{itemize}

A few common random sampling schemes:

\begin{itemize}
\tightlist
\item
  A \textbf{random sample with replacement} is a sample drawn
  \textbf{uniformly} at random \textbf{with} replacement.

  \begin{itemize}
  \tightlist
  \item
    Random doesn't always mean ``uniformly at random,'' but in this
    specific context, it does.
  \item
    Some individuals in the population might get picked more than once.
  \end{itemize}
\end{itemize}

\begin{itemize}
\tightlist
\item
  A \textbf{simple random sample (SRS)} is a sample drawn uniformly at
  random without replacement.

  \begin{itemize}
  \tightlist
  \item
    Every individual (and subset of individuals) has the same chance of
    being selected.
  \item
    Every pair has the same chance as every other pair.
  \item
    Every triple has the same chance as every other triple.
  \item
    And so on.
  \end{itemize}
\item
  A \textbf{stratified random sample} is one where random sampling is
  performed on strata (specific groups), and the groups together compose
  a sample.
\end{itemize}

\hypertarget{example-stratified-random-sample}{%
\subsection{Example: Stratified random
sample}\label{example-stratified-random-sample}}

Suppose that we are trying to run a poll to predict the mayoral election
in Bearkeley City (an imaginary city that neighbors Berkeley). Suppose
we try a \textbf{stratified random sample} to select 100 voters as
follows:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  First, we take a simple random sample and obtain 50 voters that are
  above the median city income (``above-median-income''), i.e., in the
  upper 50-th percentile of income in the city.
\item
  We then take a simple random sample of the other 50 from voters that
  are below the median city income.
\end{enumerate}

This is a \textbf{probability sample}: For any group of 100 people, if
there are not exactly 50 ``above-median-income'' voters, then that group
has zero probability of being chosen. For any other group (which has
exactly 50 ``above-median-income'' voters), the chance of it being
chosen is \(1 / (\text{\# of such groups})\).

Note that even if we replace the group counts with 80/20 (80
``above-median-income'' voters, 20 others), then it is still a
probability sample, because we can compute the precise probability of
each group being chosen. However, the sampling scheme (and thus the
modeling of voter preferences) becomes biased towards voters with income
above the median.

\hypertarget{approximating-simple-random-sampling}{%
\section{Approximating Simple Random
Sampling}\label{approximating-simple-random-sampling}}

The following is a very common situation in data science:

\begin{itemize}
\tightlist
\item
  We have an enormous population.
\item
  We can only afford to sample a relatively small number of individuals.
\end{itemize}

If the population is huge compared to the sample, then random sampling
with and without replacement are pretty much the same.

\textbf{Example}: Suppose there are 10,000 people in a population.
Exactly 7,500 of them like Snack 1; the other 2,500 like Snack 2. What
is the probability that in a random sample of 20, all people like Snack
1?

\begin{itemize}
\tightlist
\item
  Method 1: SRS (Random Sample Without Replacement):
  \(\prod\limits_{k=0}^{19} \dfrac{7500 - k}{10000 - k} \approx 0.003151\)
\item
  Method 2: Random Sample with Replacement:
  \((0.75)^{20} \approx 0.003171\)
\end{itemize}

As seen here, when the population size is large, probabilities of
sampling with replacement are much easier to compute and lead to a
reasonable approximation.

\hypertarget{multinomial-probabilities}{%
\subsection{Multinomial Probabilities}\label{multinomial-probabilities}}

The approximation discussed above suggests the convenience of
\textbf{multinomial probabilities}, which arise from sampling a
categorical distribution at random \textbf{with replacement}.

Suppose that we have a bag of marbles with the following distribution:
60\% are blue, 30\% are green, and 10\% are red. If we then proceed to
draw 100 marbles from this bag, at random with replacement, then the
resulting 100-size sample is modeled as a multinomial distribution using
\texttt{np.random.multinomial}:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\NormalTok{np.random.multinomial(}\DecValTok{100}\NormalTok{, [}\FloatTok{0.60}\NormalTok{, }\FloatTok{0.30}\NormalTok{, }\FloatTok{0.10}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
array([56, 32, 12])
\end{verbatim}

This method allows us to generate, say, 10 samples of size 100 using the
\texttt{size} parameter:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{np.random.multinomial(}\DecValTok{100}\NormalTok{, [}\FloatTok{0.60}\NormalTok{, }\FloatTok{0.30}\NormalTok{, }\FloatTok{0.10}\NormalTok{], size}\OperatorTok{=}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
array([[58, 32, 10],
       [55, 31, 14],
       [59, 35,  6],
       [60, 29, 11],
       [58, 31, 11],
       [69, 22,  9],
       [61, 28, 11],
       [60, 34,  6],
       [55, 37,  8],
       [61, 25, 14]])
\end{verbatim}

\hypertarget{comparing-convenience-sample-and-srs}{%
\section{Comparing Convenience Sample and
SRS}\label{comparing-convenience-sample-and-srs}}

Suppose that we are trying to run a poll to predict the mayoral election
in Bearkeley City (an imaginary city that neighbors Berkeley). Suppose
we took a sample to predict the election outcome by polling all
retirees. Even if they answer truthfully, we have a \textbf{convenience
sample}. How biased would this sample be in predicting the results?
While we will not numerically quantify the bias, in this demo we'll
visually show that because of the voter population distribution, any
error in our prediction from a retiree sample cannot be simply due to
chance:

First, let's grab a data set that has every single voter in Bearkeley
(again, this is a fake dataset) and how they \textbf{actually} voted in
the election. For the purposes of this example, assume:

\begin{itemize}
\tightlist
\item
  ``high income'' indicates a voter is above the median household
  income, which is \$97,834 (actual Berkeley number).
\item
  There are only two mayoral candidates: one Democrat and one
  Republican.
\item
  Every registered voter votes in the election for the candidate under
  their registered party (Dem or Rep).
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\NormalTok{bearkeley }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{"data/bearkeley.csv"}\NormalTok{)}

\CommentTok{\# create a 1/0 int that indicates democratic vote}
\NormalTok{bearkeley[}\StringTok{\textquotesingle{}vote.dem\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (bearkeley[}\StringTok{\textquotesingle{}vote\textquotesingle{}}\NormalTok{] }\OperatorTok{==} \StringTok{\textquotesingle{}Dem\textquotesingle{}}\NormalTok{).astype(}\BuiltInTok{int}\NormalTok{)}
\NormalTok{bearkeley.head()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{lrllr}
\toprule
{} &  age &  high\_income & vote &  vote.dem \\
\midrule
0 &   35 &        False &  Dem &         1 \\
1 &   42 &         True &  Rep &         0 \\
2 &   55 &        False &  Dem &         1 \\
3 &   77 &         True &  Rep &         0 \\
4 &   31 &        False &  Dem &         1 \\
\bottomrule
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{bearkeley.shape}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
(1300000, 4)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{actual\_vote }\OperatorTok{=}\NormalTok{ np.mean(bearkeley[}\StringTok{"vote.dem"}\NormalTok{])}
\NormalTok{actual\_vote}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0.5302792307692308
\end{verbatim}

This is the \textbf{actual outcome} of the election. Based on this
result, the Democratic candidate would win. However, if we were to only
consider retiree voters (a retired person is anyone age 65 and up):

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{convenience\_sample }\OperatorTok{=}\NormalTok{ bearkeley[bearkeley[}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{] }\OperatorTok{\textgreater{}=} \DecValTok{65}\NormalTok{]}
\NormalTok{np.mean(convenience\_sample[}\StringTok{"vote.dem"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0.3744755089093924
\end{verbatim}

Based on this result, we would have predicted that the Republican
candidate would win! This error is not due to the sample being too small
to yield accurate predictions, because there are 359,396 retirees (about
27\% of the 1.3 million Bearkeley voters). Instead, there seems to be
something larger happening. Let's visualize the voter preferences of the
entire population to see how retirees trend:

Let us aggregate all voters by age and visualize the fraction of
Democratic voters, split by income.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{import}\NormalTok{ matplotlib.ticker }\ImportTok{as}\NormalTok{ ticker}

\NormalTok{sns.set\_theme(style}\OperatorTok{=}\StringTok{\textquotesingle{}darkgrid\textquotesingle{}}\NormalTok{, font\_scale }\OperatorTok{=} \FloatTok{1.5}\NormalTok{,}
\NormalTok{              rc}\OperatorTok{=}\NormalTok{\{}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{:(}\DecValTok{7}\NormalTok{,}\DecValTok{5}\NormalTok{)\})}

\CommentTok{\# aggregate all voters by age}
\NormalTok{votes\_by\_demo }\OperatorTok{=}\NormalTok{ bearkeley.groupby([}\StringTok{"age"}\NormalTok{,}\StringTok{"high\_income"}\NormalTok{]).agg(}\StringTok{"mean"}\NormalTok{).reset\_index()}

\NormalTok{fig }\OperatorTok{=}\NormalTok{ plt.figure()}\OperatorTok{;}
\NormalTok{red\_blue }\OperatorTok{=}\NormalTok{ [}\StringTok{"\#bf1518"}\NormalTok{, }\StringTok{"\#397eb7"}\NormalTok{]}
\ControlFlowTok{with}\NormalTok{ sns.color\_palette(sns.color\_palette(red\_blue)):}
\NormalTok{    ax }\OperatorTok{=}\NormalTok{ sns.pointplot(data}\OperatorTok{=}\NormalTok{votes\_by\_demo, x }\OperatorTok{=} \StringTok{"age"}\NormalTok{, y }\OperatorTok{=} \StringTok{"vote.dem"}\NormalTok{, hue }\OperatorTok{=} \StringTok{"high\_income"}\NormalTok{)}

\NormalTok{ax.set\_title(}\StringTok{"Voting preferences by demographics"}\NormalTok{)}
\NormalTok{fig.canvas.draw()}
\NormalTok{new\_ticks }\OperatorTok{=}\NormalTok{ [i.get\_text() }\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in}\NormalTok{ ax.get\_xticklabels()]}\OperatorTok{;}
\NormalTok{plt.xticks(}\BuiltInTok{range}\NormalTok{(}\DecValTok{0}\NormalTok{, }\BuiltInTok{len}\NormalTok{(new\_ticks), }\DecValTok{10}\NormalTok{), new\_ticks[::}\DecValTok{10}\NormalTok{])}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{sampling/sampling_files/figure-pdf/cell-8-output-1.png}

}

\end{figure}

From the plot above, we see that retirees in the imaginary city of
Bearkeley tend to vote less Democrat, which skewed our predictions from
our sample. We also note that high-income voters tend to vote less
Democrat (and more Republican).

Let's compare our biased convenience sample to a simple random sample.
Supposing we took an SRS the same size as our retiree sample, we see
that we get a result very close to the actual vote:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\# By default, replace = False}
\NormalTok{n }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(convenience\_sample)}
\NormalTok{random\_sample }\OperatorTok{=}\NormalTok{ bearkeley.sample(n, replace }\OperatorTok{=} \VariableTok{False}\NormalTok{)}

\NormalTok{np.mean(random\_sample[}\StringTok{"vote.dem"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0.5304149183630313
\end{verbatim}

This is very close to the actual vote!

It turns out that we could even get pretty close with a \textbf{much
smaller sample size}, say, 800 (we'll learn how to choose this number
when we introduce the Central Limit Theorem):

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n }\OperatorTok{=} \DecValTok{800}
\NormalTok{random\_sample }\OperatorTok{=}\NormalTok{ bearkeley.sample(n, replace }\OperatorTok{=} \VariableTok{False}\NormalTok{)}
\NormalTok{np.mean(random\_sample[}\StringTok{"vote.dem"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0.50875
\end{verbatim}

To visualize the chance error in an SRS, let's simulate 1000 samples of
the 800-size Simple Random Sample:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{poll\_result }\OperatorTok{=}\NormalTok{ []}
\NormalTok{nrep }\OperatorTok{=} \DecValTok{1000}   \CommentTok{\# number of simulations}
\NormalTok{n }\OperatorTok{=} \DecValTok{800}       \CommentTok{\# size of our sample}
\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{0}\NormalTok{,nrep):}
\NormalTok{    random\_sample }\OperatorTok{=}\NormalTok{ bearkeley.sample(n, replace }\OperatorTok{=} \VariableTok{False}\NormalTok{)}
\NormalTok{    poll\_result.append(np.mean(random\_sample[}\StringTok{"vote.dem"}\NormalTok{]))}
\NormalTok{sns.histplot(poll\_result, stat}\OperatorTok{=}\StringTok{\textquotesingle{}density\textquotesingle{}}\NormalTok{, kde}\OperatorTok{=}\VariableTok{True}\NormalTok{)}

\CommentTok{\# What fraction of these simulated samples would have predicted Democrat?}
\NormalTok{poll\_result }\OperatorTok{=}\NormalTok{ pd.Series(poll\_result)}
\NormalTok{np.}\BuiltInTok{sum}\NormalTok{(poll\_result }\OperatorTok{\textgreater{}=} \FloatTok{0.5}\NormalTok{)}\OperatorTok{/}\DecValTok{1000}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0.955
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{sampling/sampling_files/figure-pdf/cell-11-output-2.png}

}

\end{figure}

A few observations: First, the KDE looks roughly Gaussian. Second,
supposing that we predicted a Democratic winner if at least 50\% of our
sample voted Democrat, then only about 4.5\% of our simulated samples
would have predicted the election result incorrectly. This visualization further
justifies why our convenience sample had error that was not entirely
just due to chance. We'll revisit this notion later in the course.

\hypertarget{summary}{%
\section{Summary}\label{summary}}

Understanding the sampling process is what lets us go from describing
the data to understanding the world. Without knowing / assuming
something about how the data were collected, there is no connection
between the sample and the population. Ultimately, the dataset doesn't
tell us about the world behind the data.

\bookmarksetup{startatroot}

\hypertarget{introduction-to-modeling}{%
\chapter{Introduction to Modeling}\label{introduction-to-modeling}}

\begin{tcolorbox}[enhanced jigsaw, colback=white, colbacktitle=quarto-callout-note-color!10!white, titlerule=0mm, opacityback=0, breakable, bottomrule=.15mm, arc=.35mm, leftrule=.75mm, toprule=.15mm, colframe=quarto-callout-note-color-frame, bottomtitle=1mm, toptitle=1mm, opacitybacktitle=0.6, left=2mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Note}, coltitle=black, rightrule=.15mm]

\begin{itemize}
\tightlist
\item
  Understand what models are and how to carry out the four-step modeling
  process
\item
  Define the concept of loss and gain familiarity with L1 and L2 loss
\item
  Fit a model using minimization techniques
\end{itemize}

\end{tcolorbox}

Up until this point in the semester, we've focused on analyzing
datasets. We've looked into the early stages of the data science
lifecycle, focusing on the programming tools, visualization techniques,
and data cleaning methods needed for data analysis.

This lecture marks a shift in focus. We will move away from examining
datasets to actually \emph{using} our data to better understand the
world. Specifically, the next sequence of lectures will explore
predictive modeling: generating models to make some prediction about the
world around us. In this lecture, we'll introduce the conceptual
framework for setting up a modeling task. In the next few lectures,
we'll put this framework into practice by implementing several kinds of
models.

\hypertarget{what-is-a-model}{%
\section{What is a Model?}\label{what-is-a-model}}

A model is an \textbf{idealized representation} of a system. A system is
a set of principles or procedures according to which something
functions. We live in a world full of systems: the procedure of turning
on a light happens according to a specific set of rules dictating the
flow of electricity. The truth behind how any event occurs is usually
complex, and many times the specifics are unknown. The workings of the
world can be viewed as its own giant procedure. Models seek to simplify
the world and distill it into workable pieces.

Example: We model the fall of an object on Earth as subject to a
constant acceleration of \(9.81 \frac{m}{s^2}\) due to gravity.

\begin{itemize}
\tightlist
\item
  While this describes the behavior of our system, it is merely an
  approximation.
\item
  It doesn't account for the effects of air resistance, local variations
  in gravity, etc.
\item
  In practice, it's accurate enough to be useful!
\end{itemize}

\hypertarget{reasons-for-building-models}{%
\subsection{Reasons for building
models}\label{reasons-for-building-models}}

Often times, (1) we care about creating models that are simple and
interpretable, allowing us to understand what the relationships between
our variables are. Other times, (2) we care more about making extremely
accurate predictions, at the cost of having an uninterpretable model.
These are sometimes called black-box models, and are common in fields
like deep learning.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  To understand complex phenomena occurring in the world we live in.

  \begin{itemize}
  \tightlist
  \item
    What factors play a role in the growth of COVID-19?
  \item
    How do an object's velocity and acceleration impact how far it
    travels? (Physics: \(d = d_0 + vt + \frac{1}{2}at^2\))
  \end{itemize}
\item
  To make accurate predictions about unseen data.

  \begin{itemize}
  \tightlist
  \item
    Can we predict if an email is spam or not?
  \item
    Can we generate a one-sentence summary of this 10-page long article?
  \end{itemize}
\end{enumerate}

\hypertarget{common-types-of-models}{%
\subsection{Common Types of Models}\label{common-types-of-models}}

In general, models can be split into two categories:

Note: These specific models are not in the scope of Data 100 and exist
to serve as motivation.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Deterministic physical (mechanistic) models: Laws that govern how the
  world works.

  \begin{itemize}
  \tightlist
  \item
    \href{https://en.wikipedia.org/wiki/Kepler\%27s_laws_of_planetary_motion\#Third_law}{Kepler's
    Third Law of Planetary Motion (1619)}: The ratio of the square of an
    object's orbital period with the cube of the semi-major axis of its
    orbit is the same for all objects orbiting the same primary.

    \begin{itemize}
    \tightlist
    \item
      \(T^2 \propto R^3\)
    \end{itemize}
  \item
    \href{https://en.wikipedia.org/wiki/Newton\%27s_laws_of_motion}{Newton's
    Laws: motion and gravitation (1687)}: Newton's second law of motion
    models the relationship between the mass of an object and the force
    required to accelerate it.

    \begin{itemize}
    \tightlist
    \item
      \(F = ma\)
    \item
      \(F_g = G \frac{m_1 m_2}{r^2}\)
    \end{itemize}
  \end{itemize}
\item
  Probabilistic models: models that attempt to understand how random
  processes evolve. These are more general and can be used to describe
  many phenomena in the real world. These models commonly make
  simplifying assumptions about the nature of the world.

  \begin{itemize}
  \tightlist
  \item
    \href{https://en.wikipedia.org/wiki/Poisson_point_process}{Poisson
    Process models}: Used to model random events that can happen with
    some probability at any point in time and are strictly increasing in
    count, such as the arrival of customers at a store.
  \end{itemize}
\end{enumerate}

\hypertarget{simple-linear-regression}{%
\section{Simple Linear Regression}\label{simple-linear-regression}}

The \textbf{regression line} is the unique straight line that minimizes
the \textbf{mean squared error} of estimation among all straight lines.
As with any straight line, it can be defined by a slope and a
y-intercept:

\begin{itemize}
\tightlist
\item
  slope:
  \(r \cdot \frac{\text{Standard Deviation of y}}{\text{Standard Deviation of x}}\)
\item
  y-intercept:
  \(\text{average of y} - \text{slope}\cdot\text{average of x}\)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}
\CommentTok{\# Set random seed for consistency }
\NormalTok{np.random.seed(}\DecValTok{43}\NormalTok{)}
\NormalTok{plt.style.use(}\StringTok{\textquotesingle{}default\textquotesingle{}}\NormalTok{) }

\CommentTok{\#Generate random noise for plotting}
\NormalTok{x }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{100}\NormalTok{)}
\NormalTok{y }\OperatorTok{=}\NormalTok{ x }\OperatorTok{*} \FloatTok{0.5} \OperatorTok{{-}} \DecValTok{1} \OperatorTok{+}\NormalTok{ np.random.randn(}\DecValTok{100}\NormalTok{) }\OperatorTok{*} \FloatTok{0.3}

\CommentTok{\#plot regression line}
\NormalTok{sns.regplot(x}\OperatorTok{=}\NormalTok{x,y}\OperatorTok{=}\NormalTok{y)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{intro_to_modeling/intro_to_modeling_files/figure-pdf/cell-2-output-1.png}

}

\end{figure}

\hypertarget{definitions}{%
\subsection{Definitions}\label{definitions}}

For a random variable x:

\begin{itemize}
\tightlist
\item
  Mean: \(\bar{x}\)
\item
  Standard Deviation: \(\sigma_x\)
\item
  Predicted value: \(\hat{x}\)
\end{itemize}

\hypertarget{standard-units}{%
\subsubsection{Standard Units}\label{standard-units}}

A random variable is represented in standard units if the following are
true:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  0 in standard units is the mean (\(\bar{x}\)) in the original
  variable's units.
\item
  An increase of 1 standard unit is an increase of 1 standard
  deviation (\(\sigma_x\)) in the original variable's units.
\end{enumerate}

\hypertarget{correlation}{%
\subsubsection{Correlation}\label{correlation}}

The correlation (\(r\)) is the average of the product of \(x\) and
\(y\), both measured in \emph{standard units}. Correlation measures the
strength of a linear association between two variables.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \(r = \frac{1}{n} \sum_{i=1}^n \left(\frac{x_i - \bar{x}}{\sigma_x}\right)\left(\frac{y_i - \bar{y}}{\sigma_y}\right)\)
\item
  Correlations are between \(-1\) and \(1\): \(|r| \le 1\)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ plot\_and\_get\_corr(ax, x, y, title):}
\NormalTok{    ax.set\_xlim(}\OperatorTok{{-}}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{    ax.set\_ylim(}\OperatorTok{{-}}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{    ax.set\_xticks([])}
\NormalTok{    ax.set\_yticks([])}
\NormalTok{    ax.scatter(x, y, alpha }\OperatorTok{=} \FloatTok{0.73}\NormalTok{)}
\NormalTok{    r }\OperatorTok{=}\NormalTok{ np.corrcoef(x, y)[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{]}
\NormalTok{    ax.set\_title(title }\OperatorTok{+} \StringTok{" (corr: }\SpecialCharTok{\{\}}\StringTok{)"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(r.}\BuiltInTok{round}\NormalTok{(}\DecValTok{2}\NormalTok{)))}
    \ControlFlowTok{return}\NormalTok{ r}

\NormalTok{fig, axs }\OperatorTok{=}\NormalTok{ plt.subplots(}\DecValTok{2}\NormalTok{, }\DecValTok{2}\NormalTok{, figsize }\OperatorTok{=}\NormalTok{ (}\DecValTok{10}\NormalTok{, }\DecValTok{10}\NormalTok{))}

\CommentTok{\# Just noise}
\NormalTok{x1, y1 }\OperatorTok{=}\NormalTok{ np.random.randn(}\DecValTok{2}\NormalTok{, }\DecValTok{100}\NormalTok{)}
\NormalTok{corr1 }\OperatorTok{=}\NormalTok{ plot\_and\_get\_corr(axs[}\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{], x1, y1, title }\OperatorTok{=} \StringTok{"noise"}\NormalTok{)}

\CommentTok{\# Strong linear}
\NormalTok{x2 }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{100}\NormalTok{)}
\NormalTok{y2 }\OperatorTok{=}\NormalTok{ x2 }\OperatorTok{*} \FloatTok{0.5} \OperatorTok{{-}} \DecValTok{1} \OperatorTok{+}\NormalTok{ np.random.randn(}\DecValTok{100}\NormalTok{) }\OperatorTok{*} \FloatTok{0.3}
\NormalTok{corr2 }\OperatorTok{=}\NormalTok{ plot\_and\_get\_corr(axs[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{], x2, y2, title }\OperatorTok{=} \StringTok{"strong linear"}\NormalTok{)}

\CommentTok{\# Unequal spread}
\NormalTok{x3 }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{100}\NormalTok{)}
\NormalTok{y3 }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{ x3}\OperatorTok{/}\DecValTok{3} \OperatorTok{+}\NormalTok{ np.random.randn(}\DecValTok{100}\NormalTok{)}\OperatorTok{*}\NormalTok{(x3)}\OperatorTok{/}\FloatTok{2.5}
\NormalTok{corr3 }\OperatorTok{=}\NormalTok{ plot\_and\_get\_corr(axs[}\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{], x3, y3, title }\OperatorTok{=} \StringTok{"strong linear"}\NormalTok{)}
\NormalTok{extent }\OperatorTok{=}\NormalTok{ axs[}\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{].get\_window\_extent().transformed(fig.dpi\_scale\_trans.inverted())}

\CommentTok{\# Strong non{-}linear}
\NormalTok{x4 }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{100}\NormalTok{)}
\NormalTok{y4 }\OperatorTok{=} \DecValTok{2}\OperatorTok{*}\NormalTok{np.sin(x3 }\OperatorTok{{-}} \FloatTok{1.5}\NormalTok{) }\OperatorTok{+}\NormalTok{ np.random.randn(}\DecValTok{100}\NormalTok{) }\OperatorTok{*} \FloatTok{0.3}
\NormalTok{corr4 }\OperatorTok{=}\NormalTok{ plot\_and\_get\_corr(axs[}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{], x4, y4, title }\OperatorTok{=} \StringTok{"strong non{-}linear"}\NormalTok{)}

\NormalTok{plt.show()}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{intro_to_modeling/intro_to_modeling_files/figure-pdf/cell-3-output-1.png}

}

\end{figure}

\hypertarget{alternate-form}{%
\subsection{Alternate Form}\label{alternate-form}}

When the variables \(y\) and \(x\) are measured in \emph{standard
units}, the regression line for predicting \(y\) based on \(x\) has
slope \(r\) and passes through the origin.

\begin{itemize}
\tightlist
\item
  \(\hat{y}_{su} = r \cdot x_{su}\)
\item
  Both measured in standard units
\end{itemize}

In the original units, this becomes

\begin{itemize}
\tightlist
\item
  \(\frac{\hat{y} - \bar{y}}{\sigma_y} = r \cdot \frac{x - \bar{x}}{\sigma_x}\)
\end{itemize}

\hypertarget{derivation}{%
\subsection{Derivation}\label{derivation}}

Starting from the top, we have our claimed form of the regression line
and we want to show that it is equivalent to the optimal linear regression
line: \(\hat{y} = \hat{a} + \hat{b}x\)

Recall:

\begin{itemize}
\tightlist
\item
  \(\hat{b}\):
  \(r \cdot \frac{\text{Standard Deviation of y}}{\text{Standard Deviation of x}}\)
\item
  \(\hat{a}\):
  \(\text{average of y} - \text{slope}\cdot\text{average of x}\)
\end{itemize}

Proof:

\[\frac{\hat{y} - \bar{y}}{\sigma_y} = r \cdot \frac{x - \bar{x}}{\sigma_x}\]

Multiply by \(\sigma_y\) and add \(\bar{y}\) on both sides.

\[\hat{y} = \sigma_y \cdot r \cdot \frac{x - \bar{x}}{\sigma_x} + \bar{y}\]

Distribute coefficient \(\sigma_{y}\cdot r\) to the
\(\frac{x - \bar{x}}{\sigma_x}\) term

\[\hat{y} = (\frac{r\sigma_y}{\sigma_x} ) \cdot x + (\bar{y} - (\frac{r\sigma_y}{\sigma_x} ) \bar{x})\]

We now see that we have a line that matches our claim:

\begin{itemize}
\tightlist
\item
  slope:
  \(r\cdot\frac{\text{SD of y}}{\text{SD of x}} = r\cdot\frac{\sigma_y}{\sigma_x}\)
\item
  intercept: \(\bar{y} - \text{slope}\cdot \bar{x}\)
\end{itemize}

\hypertarget{the-modeling-process}{%
\section{The Modeling Process}\label{the-modeling-process}}

At a high level, a model is some way of representing a system. In Data
100, we'll treat a model as some mathematical rule we use to describe
the relationship between variables.

What variables are we modeling? Typically, we use a subset of the
variables in our sample of collected data to model another variable in
this data. To put this more formally, say we have the following dataset
\(\mathbb{D}\):

\[\mathbb{D} = \{(x_1, y_1), (x_2, y_2), \dots, (x_n, y_n)\}\]

Each pair of values \((x_i, y_i)\) represents a datapoint. In a modeling
setting, we call these \textbf{observations}. \(y_i\) is the dependent
variable we are trying to model, also called an \textbf{output} or
\textbf{response}. \(x_i\) is the independent variable inputted into the
model to make predictions, also known as a \textbf{feature}.

Our goal in modeling is to use the observed data \(\mathbb{D}\) to
predict the output variable \(y_i\). We denote each prediction as
\(\hat{y}_i\) (read: ``y hat sub i'').

How do we generate these predictions? Some examples of models we'll
encounter in the next few lectures are given below:

\begin{itemize}
\tightlist
\item
  \(\hat{y}_i = \theta\)
\item
  \(\hat{y}_i = \theta_0 + \theta_1 x_i\)
\end{itemize}

The examples above are known as \textbf{parametric models}. They relate
the collected data, \(x_i\), to the prediction we make, \(\hat{y}_i\). A
few parameters (\(\theta\), \(\theta_0\), \(\theta_1\)) are used to
describe the relationship between \(x_i\) and \(\hat{y}_i\).

Notice that we don't immediately know the values of these parameters.
While the features, \(x_i\), are taken from our observed data, we need
to decide what values to give \(\theta\), \(\theta_0\), and \(\theta_1\)
ourselves. This is the heart of parametric modeling: \emph{what
parameter values should we choose so our model makes the best possible
predictions?}

To choose our model parameters, we'll work through the \textbf{modeling
process}.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Choose a model: how should we represent the world?
\item
  Choose a loss function: how do we quantify prediction error?
\item
  Fit the model: how do we choose the best parameters of our model given
  our data?
\item
  Evaluate model performance: how do we evaluate whether this process
  gave rise to a good model?
\end{enumerate}

\hypertarget{choosing-a-model}{%
\section{Choosing a Model}\label{choosing-a-model}}

Our first step is choosing a model: defining the mathematical rule that
describes the relationship between the features, \(x_i\), and
predictions \(\hat{y}_i\).

In
\href{https://inferentialthinking.com/chapters/15/4/Least_Squares_Regression.html}{Data
8}, you learned about the \textbf{Simple Linear Regression (SLR) model}.
You learned that the model takes the form: \[\hat{y}_i = a + bx_i\]

In Data 100, we'll use slightly different notation: we will replace
\(a\) with \(\theta_0\) and \(b\) with \(\theta_1\). This will allow us
to use the same notation when we explore more complex models later on in
the course.

\[\hat{y}_i = \theta_0 + \theta_1 x_i\]

The parameters of the SLR model are \(\theta_0\), also called the
intercept term, and \(\theta_1\), also called the slope term. To create
an effective model, we want to choose values for \(\theta_0\) and
\(\theta_1\) that most accurately predict the output variable. The
``best'' fitting model parameters are given the special names
\(\hat{\theta}_0\) and \(\hat{\theta}_1\) -- they are the specific
parameter values that allow our model to generate the best possible
predictions.

In Data 8, you learned that the best SLR model parameters are:
\[\hat{\theta}_0 = \bar{y} - \hat{\theta}_1\bar{x} \qquad \qquad \hat{\theta}_1 = r \frac{\sigma_y}{\sigma_x}\]

A quick reminder on notation:

\begin{itemize}
\tightlist
\item
  \(\bar{y}\) and \(\bar{x}\) indicate the mean value of \(y\) and
  \(x\), respectively
\item
  \(\sigma_y\) and \(\sigma_x\) indicate the standard deviations of
  \(y\) and \(x\)
\item
  \(r\) is the
  \href{https://inferentialthinking.com/chapters/15/1/Correlation.html\#the-correlation-coefficient}{correlation
  coefficient}, defined as the average of the product of \(x\) and \(y\)
  measured in standard units:
  \(\frac{1}{n} \sum_{i=1}^n (\frac{x_i-\bar{x}}{\sigma_x})(\frac{y_i-\bar{y}}{\sigma_y})\)
\end{itemize}

In Data 100, we want to understand \emph{how} to derive these best model
coefficients. To do so, we'll introduce the concept of a loss function.

\hypertarget{choosing-a-loss-function}{%
\section{Choosing a Loss Function}\label{choosing-a-loss-function}}

We've talked about the idea of creating the ``best'' possible
predictions. This raises the question: how do we decide how ``good'' or
``bad'' our model's predictions are?

A \textbf{loss function} characterizes the cost, error, or fit resulting
from a particular choice of model or model parameters. This function,
\(L(y, \hat{y})\), quantifies how ``far off'' a single prediction by our
model is from a true, observed value in our collected data.

The choice of loss function for a particular model depends on the
modeling task at hand. Regardless of the specific function used, a loss
function should follow two basic principles:

\begin{itemize}
\tightlist
\item
  If the prediction \(\hat{y}_i\) is \emph{close} to the actual value
  \(y_i\), loss should be low
\item
  If the prediction \(\hat{y}_i\) is \emph{far} from the actual value
  \(y_i\), loss should be high
\end{itemize}

Two common choices of loss function are squared loss and absolute loss.

\textbf{Squared loss}, also known as \textbf{L2 loss}, computes loss as
the square of the difference between the observed \(y_i\) and predicted
\(\hat{y}_i\): \[L(y_i, \hat{y}_i) = (y_i - \hat{y}_i)^2\]

\textbf{Absolute loss}, also known as \textbf{L1 loss}, computes loss as
the absolute difference between the observed \(y_i\) and predicted
\(\hat{y}_i\): \[L(y_i, \hat{y}_i) = |y_i - \hat{y}_i|\]

L1 and L2 loss give us a tool for quantifying our model's performance on
a single datapoint. This is a good start, but ideally we want to
understand how our model performs across our \emph{entire} dataset. A
natural way to do this is to compute the average loss across all
datapoints in the dataset. This is known as the \textbf{cost function},
\(\hat{R}(\theta)\):
\[\hat{R}(\theta) = \frac{1}{n} \sum^n_{i=1} L(y_i, \hat{y}_i)\]

The cost function has many names in statistics literature. You may also
encounter the terms:

\begin{itemize}
\tightlist
\item
  Empirical risk (this is why we give the cost function the name \(R\))
\item
  Error function
\item
  Average loss
\end{itemize}

We can substitute our L1 and L2 loss into the cost function definition.
The \textbf{Mean Squared Error (MSE)} is the average squared loss across
a dataset: \[\text{MSE} = \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2\]

The \textbf{Mean Absolute Error (MAE)} is the average absolute loss
across a dataset:
\[\text{MAE}= \frac{1}{n} \sum_{i=1}^n |y_i - \hat{y}_i|\]

\hypertarget{fitting-the-model}{%
\section{Fitting the Model}\label{fitting-the-model}}

Now that we've established the concept of a loss function, we can return
to our original goal of choosing model parameters. Specifically, we want
to choose the best set of model parameters that will minimize the
model's cost on our dataset. This process is called fitting the model.

We know from calculus that a function is minimized when (1) its first
derivative is equal to zero and (2) its second derivative is positive.
We often call the function being minimized the \textbf{objective
function} (our objective is to find its minimum).

To find the optimal model parameter, we:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Take the derivative of the cost function with respect to that
  parameter
\item
  Set the derivative equal to 0
\item
  Solve for the parameter
\end{enumerate}

We repeat this process for each parameter present in the model. For now,
we'll disregard the second derivative condition.

To help us make sense of this process, let's put it into action by
deriving the optimal model parameters for simple linear regression using
the mean squared error as our cost function. Remember: although the
notation may look tricky, all we are doing is following the three steps
above!

Step 1: take the derivative of the cost function with respect to each
model parameter. We substitute the SLR model,
\(\hat{y}_i = \theta_0+\theta_1 x_i\), into the definition of MSE above
and differentiate with respect to \(\theta_0\) and \(\theta_1\).
\[\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 = \frac{1}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)^2\]

\[\frac{\partial}{\partial \theta_0} \text{MSE} = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)\]

\[\frac{\partial}{\partial \theta_1} \text{MSE} = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)x_i\]

Step 2: set the derivatives equal to 0. After simplifying terms, this
produces two \textbf{estimating equations}. The best set of model
parameters \((\theta_0, \theta_1)\) \emph{must} satisfy these two
optimality conditions.
\[0 = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i) \Longleftrightarrow \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i) = 0\]
\[0 = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)x_i \Longleftrightarrow \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)x_i = 0\]

Step 3: solve the estimating equations to compute estimates for
\(\hat{\theta}_0\) and \(\hat{\theta}_1\).

Taking the first equation gives the estimate of \(\hat{\theta}_0\):
\[\begin{aligned}
\frac{1}{n} \sum_{i=1}^n (y_i - \hat{\theta}_0 - \hat{\theta}_1 x_i) &= 0 \\
\left(\frac{1}{n} \sum_{i=1}^n y_i \right) - \hat{\theta}_0 - \hat{\theta}_1\left(\frac{1}{n} \sum_{i=1}^n x_i \right) &= 0 \\
\hat{\theta}_0 &= \bar{y} - \hat{\theta}_1 \bar{x}
\end{aligned}\]

With a bit more maneuvering, the second equation gives the estimate of
\(\hat{\theta}_1\). Start by multiplying the first estimating equation
by \(\bar{x}\), then subtracting the result from the second estimating
equation. \[\begin{aligned}
\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)x_i - \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)\bar{x} &= 0 \\
\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)(x_i - \bar{x}) &= 0
\end{aligned}\]

Next, plug in
\(\hat{y}_i = \hat{\theta}_0 + \hat{\theta}_1 x_i = \bar{y} + \hat{\theta}_1(x_i - \bar{x})\):
\[\begin{aligned}
\frac{1}{n} \sum_{i=1}^n (y_i - \bar{y} - \hat{\theta}_1(x_i - \bar{x}))(x_i - \bar{x}) &= 0 \\
\frac{1}{n} \sum_{i=1}^n (y_i - \bar{y})(x_i - \bar{x}) &= \hat{\theta}_1 \times \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2
\end{aligned}\]

By using the definition of correlation
\(\left(r = \frac{1}{n} \sum_{i=1}^n (\frac{x_i-\bar{x}}{\sigma_x})(\frac{y_i-\bar{y}}{\sigma_y}) \right)\)
and standard deviation
\(\left(\sigma_x = \sqrt{\frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2} \right)\),
we can conclude:
\[r \sigma_x \sigma_y = \hat{\theta}_1 \times \sigma_x^2\]
\[\hat{\theta}_1 = r \frac{\sigma_y}{\sigma_x}\]

Just as was given in Data 8!

Remember, this derivation found the optimal model parameters for SLR
when using the MSE cost function. If we had used a different model or
different loss function, we likely would have found different values for
the best model parameters. However, regardless of the model and loss
used, we can \emph{always} follow these three steps to fit the model.

\hypertarget{evaluating-performance}{%
\section{Evaluating Performance}\label{evaluating-performance}}

At this point, we've:

\begin{itemize}
\tightlist
\item
  Defined our model
\item
  Defined our loss function
\item
  Fit the model to identify the best model parameters
\end{itemize}

Now, what are some ways to determine if our model was a good fit to our
data? We will delve into this more in the next chapter, but there are
three main ways for evaluating a model.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Statistics:

  \begin{itemize}
  \tightlist
  \item
    Plot original data
  \item
    Compute column means
  \item
    Compute standard deviations
  \item
    If we want to fit a linear model, compute correlation (r)
  \end{itemize}
\item
  Performance metrics:

  \begin{itemize}
  \tightlist
  \item
    Root Mean Square Error (RMSE). It is the square root of MSE, which
    is the average loss that we've been minimizing to determine optimal
    model parameters.
  \item
    RMSE is in the same units as \(y\).
  \item
    A lower RMSE indicates more ``accurate'' predictions (lower
    ``average loss'' across data)
  \end{itemize}
\item
  Visualization:

  \begin{itemize}
  \tightlist
  \item
    Look at a residual plot of \(e_i = y_i - \hat{y}_i\) to visualize
    the difference between actual and predicted \(y\) values.
  \end{itemize}
\end{enumerate}


\end{document}