<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="author" content="Max Pagan" />
<meta name="date" content="2023-10-21" />
<title>Beers Case Study</title>
<script src="site_libs/header-attrs-2.25/header-attrs.js"></script>
<script src="site_libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/darkly.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>h1 {font-size: 34px;}
h1.title {font-size: 38px;}
h2 {font-size: 30px;}
h3 {font-size: 24px;}
h4 {font-size: 18px;}
h5 {font-size: 16px;}
h6 {font-size: 12px;}
code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
pre:not([class]) { background-color: white }</style>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<style type="text/css">code{white-space: pre;}</style>
<script type="text/javascript">
if (window.hljs) {
hljs.configure({languages: []});
hljs.initHighlightingOnLoad();
if (document.readyState && document.readyState === "complete") {
window.setTimeout(function() { hljs.initHighlighting(); }, 0);
}
}
</script>
<style type = "text/css">
.main-container {
max-width: 940px;
margin-left: auto;
margin-right: auto;
}
img {
max-width:100%;
}
.tabbed-pane {
padding-top: 12px;
}
.html-widget {
margin-bottom: 20px;
}
button.code-folding-btn:focus {
outline: none;
}
summary {
display: list-item;
}
details > summary > p:only-child {
display: inline;
}
pre code {
padding: 0;
}
</style>
<style type="text/css">
.dropdown-submenu {
position: relative;
}
.dropdown-submenu>.dropdown-menu {
top: 0;
left: 100%;
margin-top: -6px;
margin-left: -1px;
border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
display: block;
}
.dropdown-submenu>a:after {
display: block;
content: " ";
float: right;
width: 0;
height: 0;
border-color: transparent;
border-style: solid;
border-width: 5px 0 5px 5px;
border-left-color: #cccccc;
margin-top: 5px;
margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
border-left-color: #adb5bd;
}
.dropdown-submenu.pull-left {
float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
left: -100%;
margin-left: 10px;
border-radius: 6px 0 6px 6px;
}
</style>
<script type="text/javascript">
// manage active state of menu based on current page
$(document).ready(function () {
// active menu anchor
href = window.location.pathname
href = href.substr(href.lastIndexOf('/') + 1)
if (href === "")
href = "index.html";
var menuAnchor = $('a[href="' + href + '"]');
// mark the anchor link active (and if it's in a dropdown, also mark that active)
var dropdown = menuAnchor.closest('li.dropdown');
if (window.bootstrap) { // Bootstrap 4+
menuAnchor.addClass('active');
dropdown.find('> .dropdown-toggle').addClass('active');
} else { // Bootstrap 3
menuAnchor.parent().addClass('active');
dropdown.addClass('active');
}
// Navbar adjustments
var navHeight = $(".navbar").first().height() + 15;
var style = document.createElement('style');
var pt = "padding-top: " + navHeight + "px; ";
var mt = "margin-top: -" + navHeight + "px; ";
var css = "";
// offset scroll position for anchor links (for fixed navbar)
for (var i = 1; i <= 6; i++) {
css += ".section h" + i + "{ " + pt + mt + "}\n";
}
style.innerHTML = "body {" + pt + "padding-bottom: 40px; }\n" + css;
document.head.appendChild(style);
});
</script>
<!-- tabsets -->
<style type="text/css">
.tabset-dropdown > .nav-tabs {
display: inline-table;
max-height: 500px;
min-height: 44px;
overflow-y: auto;
border: 1px solid #ddd;
border-radius: 4px;
}
.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
content: "\e259";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
content: "\e258";
font-family: 'Glyphicons Halflings';
border: none;
}
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
border: none;
display: inline-block;
border-radius: 4px;
background-color: transparent;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
display: block;
float: none;
}
.tabset-dropdown > .nav-tabs > li {
display: none;
}
</style>
<!-- code folding -->
</head>
<body>
<div class="container-fluid main-container">
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-bs-toggle="collapse" data-target="#navbar" data-bs-target="#navbar">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand" href="index.html">Pages:</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li>
<a href="index.html">Welcome</a>
</li>
<li>
<a href="Beers.html">Beers</a>
</li>
<li>
<a href="AmesProject.html">Ames Advanced Regression</a>
</li>
</ul>
<ul class="nav navbar-nav navbar-right">
</ul>
</div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
<div id="header">
<h1 class="title toc-ignore">Beers Case Study</h1>
<h4 class="author">Max Pagan</h4>
<h4 class="date">2023-10-21</h4>
</div>
<div id="introduction" class="section level2">
<h2>Introduction</h2>
<p>In this class project, authors Christian Castro and Max Pagan
examine the provided data in order to offer insight about beers,
breweries, and the attributes that describe them. Our goals for this
project are to understand what the data contains, analyze its basic
statistics, and draw conclusions about demographics and trends, so that
we can recommend future strategies that help Budweiser gain a further
edge in the competitive beer marketplace. We will do so by answering the
nine questions below.</p>
<p>To download the dataset we used for this project (so that you can
use it in our RShiny app), click the link: <a href="Beers_data.csv">Download
CSV File</a></p>
</div>
<div id="questions" class="section level2">
<h2>Questions:</h2>
<div id="question-1---how-many-breweries-are-present-in-each-state"
class="section level3">
<h3>Question 1 - How many breweries are present in each state?</h3>
<pre class="r"><code>library(tidyverse)
library(ggplot2)
#bar charts with numbers: Breweries by state
breweries_per_state <- merged_data %>%
group_by(State) %>%
summarise(Brewery_Count = n_distinct(Brewery_id))
ggplot(breweries_per_state, aes(x = reorder(State, -Brewery_Count), y = Brewery_Count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = Brewery_Count), vjust = -0.5, size = 3, position = position_dodge(width = 0.9)) +
labs(title = "Number of Breweries by State",
x = "State",
y = "Number of Breweries") +
theme_minimal() +
theme(axis.text.x = element_text(hjust = 1, size = 5))</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-1-1.png" width="672" /></p>
<p>The state with the most breweries is Colorado, with 47. We can see
a sharp drop-off in the number of breweries after that, with most states
having fewer than ten breweries. DC, the Dakotas, and West Virginia each
have only one brewery.</p>
</div>
<div
id="question-2---merge-beer-data-with-the-breweries-data.-print-the-first-6-observations-and-the-last-six-observations-to-check-the-merged-file.-rmd-only-this-does-not-need-to-be-included-in-the-presentation-or-the-deck."
class="section level3">
<h3>Question 2 - Merge beer data with the breweries data. Print the
first 6 observations and the last six observations to check the merged
file. (RMD only, this does not need to be included in the presentation
or the deck.)</h3>
<pre class="r"><code># Assuming 'beers' and 'breweries' are the names of your dataframes
merged_data <- merge(beers, breweries, by.x = "Brewery_id", by.y = "Brew_ID", all.x = TRUE)
#renaming some columns in merged_data for readability
merged_data <- merged_data %>%
rename(
Beer_name = Name.x,
Brewery_name = Name.y
)
# Print the first 6 observations
head(merged_data, n = 6)</code></pre>
<pre><code>## Brewery_id Beer_name Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Brewery_name City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN</code></pre>
<pre class="r"><code># Print the last 6 observations
tail(merged_data, n = 6)</code></pre>
<pre><code>## Brewery_id Beer_name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK</code></pre>
</div>
<div
id="question-3---adress-the-missing-values-in-each-column.-revisited-with-chatgpt"
class="section level3">
<h3>Question 3 - Address the missing values in each column. (revisited
with ChatGPT)</h3>
<pre class="r"><code># Create 'is_missing_ABV' column
merged_data$is_missing_ABV <- ifelse(is.na(merged_data$ABV), 1, 0)
# Create 'is_missing_IBU' column
merged_data$is_missing_IBU <- ifelse(is.na(merged_data$IBU), 1, 0)
# Install and load the naniar package
#install.packages("naniar")
library(naniar)
#to visualize missing data we'll create a missingness heatmap:
gg_miss_upset(merged_data)</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-3-1.png" width="672" /></p>
<pre class="r"><code>#this plot shows how many values are missing in each column, and whether any missing values overlap between columns
#the plot shows 1005 missing IBU values; 62 of those rows are also missing ABV
#no rows are missing only ABV
#For missing IBU data:
#We'll visualize the missing data with a scatter plot and geom_miss_point
#This plot will show us the ABV and IBU values present in blue, and the missing values in red underneath the axes. Missing IBU values are plotted below the x-axis by their ABV values, and missing ABV values are plotted behind the y-axis.
ggplot(beers, aes(x = ABV, y = IBU)) +
geom_point() +
labs(title = "Scatter Plot of ABV vs. IBU",
x = "ABV (Alcohol by Volume)",
y = "IBU (International Bitterness Units)") +
theme_minimal() + geom_miss_point()</code></pre>
<pre><code>## Warning: Removed 1005 rows containing missing values (`geom_point()`).</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-3-2.png" width="672" /></p>
<pre class="r"><code>#Next, we must remove the missing ABV rows and Impute the median into the missing IBU values.
# Filter out rows with NA in ABV and remove the is_missing_ABV and is_missing_IBU columns
merged_data_adjusted <- merged_data %>%
filter(!is.na(ABV)) %>%
select(-is_missing_ABV) %>%
select(-is_missing_IBU)
# Calculate the median IBU value (excluding missing values)
median_IBU <- median(merged_data_adjusted$IBU, na.rm = TRUE)
# conducting the median imputation
merged_data_adjusted <- merged_data_adjusted %>%
mutate(IBU = ifelse(is.na(IBU), median_IBU, IBU))
# Check the structure of the new dataframe
str(merged_data_adjusted)</code></pre>
<pre><code>## 'data.frame': 2348 obs. of 10 variables:
## $ Brewery_id : int 1 1 1 1 1 1 2 2 2 2 ...
## $ Beer_name : chr "Get Together" "Maggie's Leap" "Wall's End" "Pumpion" ...
## $ Beer_ID : int 2692 2691 2690 2689 2688 2687 2686 2685 2684 2683 ...
## $ ABV : num 0.045 0.049 0.048 0.06 0.06 0.056 0.08 0.125 0.077 0.042 ...
## $ IBU : int 50 26 19 38 25 47 68 80 25 42 ...
## $ Style : chr "American IPA" "Milk / Sweet Stout" "English Brown Ale" "Pumpkin Ale" ...
## $ Ounces : num 16 16 16 16 16 16 16 16 16 16 ...
## $ Brewery_name: chr "NorthGate Brewing " "NorthGate Brewing " "NorthGate Brewing " "NorthGate Brewing " ...
## $ City : chr "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
## $ State : chr " MN" " MN" " MN" " MN" ...</code></pre>
<p>Summarizing the data, we can see that there are missing values in
the ABV and IBU columns. We must create plots/visualizations to help us
determine why these values are missing and how we can deal with
them. For this analysis, we’ll first present a missingness heatmap,
then a scatterplot with missing data, which will clarify how we should
handle the missing values.</p>
<p>In the first plot, the missingness heatmap shows that all missing
ABV values are also missing IBU values. Because ABV and IBU have a
largely linear relationship, as demonstrated later, we can’t know for
sure why an ABV value is missing without having its IBU value.
Therefore, we don’t have enough information about why any ABV is missing
from this data set. We must categorize missing ABV as NMAR (not missing
at random), adjust the scope of the study, and remove all rows where ABV
is missing.</p>
<p>The second plot, the scatter plot with missing values, demonstrates
that IBU values are missing across the whole spectrum of ABV values.
This tells us two things:</p>
<ol>
<li>IBU values are not missing because of their corresponding ABV value;
IBU is missing relatively evenly across the distribution of ABV
values.</li>
<li>IBU values are not missing because of their own value. Since there
is an approximately linear relationship between ABV and IBU, if IBU
values were missing more often because they were large or small, the
missing values would be disproportionately represented by large or small
ABVs.</li>
</ol>
<p>Hence, the IBUs are not missing because of their own value or the
value of another variable. They are missing completely at random, and we
can impute their values as such.</p>
<p>Because the data has outliers, we will impute (replace) all the
missing IBU values with the median IBU value, which keeps the analysis
complete while remaining robust to outliers.</p>
<p>After determining how to deal with the missing values, we adjusted
the data set and stored the new information in merged_data_adjusted. We
will use this data frame going forward.</p>
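<p>As a quick sanity check on the missing-completely-at-random
reasoning above (a brief sketch, assuming the <code>merged_data</code>
frame and its <code>is_missing_IBU</code> flag created earlier), one
could compare the ABV distribution between rows with and without IBU
values:</p>
<pre class="r"><code># Compare ABV summaries for rows with and without IBU values
merged_data %&gt;%
  group_by(is_missing_IBU) %&gt;%
  summarise(mean_ABV = mean(ABV, na.rm = TRUE),
            median_ABV = median(ABV, na.rm = TRUE),
            n = n())
# A two-sample t-test formally checks whether the ABV means differ
# between the two groups
t.test(ABV ~ is_missing_IBU, data = merged_data)</code></pre>
<p>Similar group means would further support treating the missing IBUs
as unrelated to ABV.</p>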
</div>
<div
id="question-4---compute-the-median-alcohol-content-and-international-bitterness-unit-for-each-state.-plot-a-bar-chart-to-compare."
class="section level3">
<h3>Question 4 - Compute the median alcohol content and international
bitterness unit for each state. Plot a bar chart to compare.</h3>
<pre class="r"><code>#IBU by state - creating a dataframe of states and their median IBUs
median_ibu_per_state <- merged_data_adjusted %>%
group_by(State) %>%
summarise(Median_IBU = median(IBU, na.rm = TRUE))
#plotting the median IBU values in a bar plot
ggplot(median_ibu_per_state, aes(x = reorder(State, -Median_IBU), y = Median_IBU)) +
geom_bar(stat = "identity", fill = "darkblue") +
geom_text(aes(label = Median_IBU), vjust = -0.5, size = 2, position = position_dodge(width = 0.9)) +
labs(title = "Median IBU by State",
x = "State",
y = "Median IBU") +
theme_minimal() +
theme(axis.text.x = element_text(hjust = 1, size = 5))</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-4-1.png" width="672" /></p>
<pre class="r"><code>#ABV by state - creating a dataframe of states and their median ABVs
median_abv_per_state <- merged_data_adjusted %>%
group_by(State) %>%
summarise(Median_ABV = median(ABV, na.rm = TRUE))
#plotting the median ABV values in a bar plot
ggplot(median_abv_per_state, aes(x = reorder(State, -Median_ABV), y = Median_ABV)) +
geom_bar(stat = "identity", fill = "darkblue") +
geom_text(aes(label = Median_ABV*100), vjust = -0.5, size = 2, position = position_dodge(width = 0.9)) +
labs(title = "Median ABV (%) by State",
x = "State",
y = "Median ABV (%)") +
theme_minimal() +
theme(axis.text.x = element_text(hjust = 1, size = 5))</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-4-2.png" width="672" /></p>
<p>The bar plots above reveal interesting information.</p>
<p>The highest Median IBU (after adjusting the missing values) belonged
to West Virginia, with a Median of approximately 58 IBUs. The lowest
median IBU belonged to Kansas, with a median IBU of 22.</p>
<p>The highest Median ABV belonged to DC and Kentucky, with a median of
approximately 6.2%. The lowest median ABV belonged to Utah, with
4.0%.</p>
</div>
<div
id="question-5---which-state-has-the-maximum-alcoholic-abv-beer-which-state-has-the-most-bitter-ibu-beer"
class="section level3">
<h3>Question 5 - Which state has the maximum alcoholic (ABV) beer? Which
state has the most bitter (IBU) beer?</h3>
<pre class="r"><code>#state with the most alcoholic beer:
max_abv_beer <- merged_data_adjusted[which.max(merged_data_adjusted$ABV), ]
#state with the most bitter beer:
Max_IBU_beer <- merged_data_adjusted[which.max(merged_data_adjusted$IBU), ]
#printing the results:
cat("Highest ABV - State:", max_abv_beer$State, "\n")</code></pre>
<pre><code>## Highest ABV - State: CO</code></pre>
<pre class="r"><code>cat("Highest ABV - Beer Name:", max_abv_beer$Beer_name, "\n")</code></pre>
<pre><code>## Highest ABV - Beer Name: Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale</code></pre>
<pre class="r"><code>cat("Highest ABV - Brewery:", max_abv_beer$Brewery_name, "\n")</code></pre>
<pre><code>## Highest ABV - Brewery: Upslope Brewing Company</code></pre>
<pre class="r"><code>cat("Highest ABV - ABV value:", max_abv_beer$ABV, "\n\n")</code></pre>
<pre><code>## Highest ABV - ABV value: 0.128</code></pre>
<pre class="r"><code>cat("Highest IBU - State:", Max_IBU_beer$State, "\n")</code></pre>
<pre><code>## Highest IBU - State: OR</code></pre>
<pre class="r"><code>cat("Highest IBU - Beer Name:", Max_IBU_beer$Beer_name, "\n")</code></pre>
<pre><code>## Highest IBU - Beer Name: Bitter Bitch Imperial IPA</code></pre>
<pre class="r"><code>cat("Highest IBU - Brewery:", Max_IBU_beer$Brewery_name, "\n")</code></pre>
<pre><code>## Highest IBU - Brewery: Astoria Brewing Company</code></pre>
<pre class="r"><code>cat("Highest IBU - IBU value:", Max_IBU_beer$IBU, "\n")</code></pre>
<pre><code>## Highest IBU - IBU value: 138</code></pre>
<p>From this data, we can gather:</p>
<p>Colorado has the beer with the highest ABV. Quadrupel Ale from
Upslope Brewing Company in Boulder, CO has an ABV of 12.8%. Oregon has
the beer with the highest IBU rating. Bitter Bitch Imperial IPA (fitting
name) from Astoria Brewing Company from Astoria, OR has an IBU of
138.</p>
</div>
<div
id="question-6---comment-on-the-summary-statistics-and-distribution-of-the-abv-variable."
class="section level3">
<h3>Question 6 - Comment on the summary statistics and distribution of
the ABV variable.</h3>
<pre class="r"><code># calculating and printing the ABV statistics
cat("Minimum ABV:", min(merged_data_adjusted$ABV), "\n")</code></pre>
<pre><code>## Minimum ABV: 0.001</code></pre>
<pre class="r"><code>cat("Maximum ABV:", max(merged_data_adjusted$ABV), "\n")</code></pre>
<pre><code>## Maximum ABV: 0.128</code></pre>
<pre class="r"><code>cat("Median ABV:", median(merged_data_adjusted$ABV), "\n")</code></pre>
<pre><code>## Median ABV: 0.056</code></pre>
<pre class="r"><code>cat("Mean ABV:", mean(merged_data_adjusted$ABV), "\n")</code></pre>
<pre><code>## Mean ABV: 0.05977342</code></pre>
<pre class="r"><code>#creating a histogram of ABV to see the distribution:
ggplot(merged_data_adjusted, aes(x = ABV)) +
geom_histogram(binwidth = 0.005, fill = "blue", color = "black") +
labs(title = "ABV Histogram",
x = "ABV (Alcohol by Volume)",
y = "Frequency") +
theme_minimal()</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-6-1.png" width="672" /></p>
<p>The mean beer in the data set has an alcohol content of 5.98%, and
the median has an alcohol content of 5.6%. Considering that the mean is
slightly larger than the median, together with the histogram, we can see
that there is a slight right skew to the data, meaning that more beers
fall towards the lower end of the ABV scale than the higher end.</p>
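<p>The visual impression of right skew can also be quantified with a
skewness statistic (a brief sketch using the <code>e1071</code> package,
which this report already loads for the KNN analysis; a positive value
indicates a longer right tail):</p>
<pre class="r"><code>library(e1071)
# Positive skewness is consistent with the right tail seen in the histogram
skewness(merged_data_adjusted$ABV)</code></pre>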
</div>
<div id="question-7" class="section level3">
<h3>Question 7</h3>
<p>Is there an apparent relationship between the bitterness of the beer
and its alcoholic content? Draw a scatter plot. Make your best judgment
of a relationship and EXPLAIN your answer.</p>
<pre class="r"><code>#to demonstrate if a relationship exists - we'll make a scatterplot
ggplot(merged_data_adjusted, aes(x = ABV, y = IBU)) + geom_point() +
labs(title = "Scatter Plot of ABV vs. IBU",
x = "ABV (Alcohol by Volume)",
y = "IBU (International Bitterness Units)") +
theme_minimal()</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-7-1.png" width="672" /></p>
<pre class="r"><code>#there is visual evidence of a linear relationship
#note the largely horizontal line of points at 35, this is from our imputed data</code></pre>
<p>Based on the plotting of Alcohol By Volume (ABV) and International
Bitterness Units (IBU) it appears that there is a linear relationship
between the two. Generally, as ABV increases, we can expect IBU to also
increase, and vice versa.</p>
<p>Perhaps the ingredients and fermentation time that increase the
alcohol content also produce more bitter flavors. There could be a
number of causes of this apparent relationship.</p>
</div>
<div id="question-8" class="section level3">
<h3>Question 8</h3>
<p>Budweiser would also like to investigate the difference with respect
to IBU and ABV between IPAs (India Pale Ales) and other types of Ale
(any beer with “Ale” in its name other than IPA). You decide to use KNN
classification to investigate this relationship. Provide statistical
evidence one way or the other. You can of course assume your audience is
comfortable with percentages … KNN is very easy to understand
conceptually. In addition, while you have decided to use KNN to
investigate this relationship (KNN is required) you may also feel free
to supplement your response to this question with any other methods or
techniques you have learned. Creativity and alternative solutions are
always encouraged (Supplement is done in chatGPT)</p>
<pre class="r"><code>#first, we will filter so that the data only includes IPAs and other Ales. This filtered dataset, filtered_data, will be used for this analysis.
filtered_data <- merged_data_adjusted %>% filter(grepl("Ale|IPA",Style))
# Create the 'category' column - 0 is IPA and 1 is other ale. This column will be the focus of the knn modeling.
filtered_data$category <- ifelse(grepl("IPA", filtered_data$Style), 0, 1)
#plotting the new filtered dataset of ABV vs IBU for ales
plot <- ggplot(filtered_data, aes(x = ABV, y = IBU, color = as.factor(category))) +
geom_point() +
labs(title = "ABV vs IBU for IPA and Other Ales",
x = "ABV (Alcohol by Volume)",
y = "IBU (International Bitterness Units)") +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
labels = c("0" = "IPA", "1" = "Other Ale")) +
theme_minimal() + labs(color = "Category")
plot</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-8-1.png" width="672" /></p>
<pre class="r"><code>library(class) #this library is used for knn evaluation
library(e1071)
library(caret) # for model evaluation</code></pre>
<pre><code>## Loading required package: lattice</code></pre>
<pre><code>##
## Attaching package: 'caret'</code></pre>
<pre><code>## The following object is masked from 'package:purrr':
##
## lift</code></pre>
<pre class="r"><code>library(tidyverse)
set.seed(4) #setting an initial seed before running through multiple seeds - for reproducibility
splitPerc = .70 #70/30 train/test split
#initializing the variables we would like to analyze
knn_accuracies <- numeric(1000)
knn_sensitivities <- numeric(1000)
knn_specificities <- numeric(1000)
# Loop through 1000 different seeds - finding an average accuracy
for (i in 1:1000) {
# Set a new seed for each iteration
seed <- sample.int(10000, 1)
set.seed(seed)
#create the train/test split for the knn model
trainIndices = sample(1:dim(filtered_data)[1],round(splitPerc * dim(filtered_data)[1]))
train = filtered_data[trainIndices,]
test = filtered_data[-trainIndices,]
#knn model, k = 3, confusion matrix 2
classifications = knn(train[,c(4,5)],test[,c(4,5)], train$category, prob = TRUE, k = 3)
table(classifications,test$category)
confusion_matrix2 <- confusionMatrix(table(classifications,test$category))
# Evaluating KNN model
knn_accuracies[i] <- confusion_matrix2$overall["Accuracy"]
knn_sensitivities[i] <- confusion_matrix2$byClass["Sensitivity"]
knn_specificities[i] <- confusion_matrix2$byClass["Specificity"]
}
avg_knn_accuracy <- mean(knn_accuracies) #accuracy of the model
avg_knn_sensitivity <- mean(knn_sensitivities) #likelihood of getting 'IPA' correct
avg_knn_specificity <- mean(knn_specificities) #likelihood of getting 'Other Ale' correct
#printing out the information in a digestible way
cat("\nK-Nearest Neighbors (KNN) Model Average Evaluation:\n")</code></pre>
<pre><code>##
## K-Nearest Neighbors (KNN) Model Average Evaluation:</code></pre>
<pre class="r"><code>cat(paste("Average Accuracy:", avg_knn_accuracy, "\n"))</code></pre>
<pre><code>## Average Accuracy: 0.814066964285714</code></pre>
<pre class="r"><code>cat(paste("Average Sensitivity:", avg_knn_sensitivity, "\n"))</code></pre>
<pre><code>## Average Sensitivity: 0.729762930090677</code></pre>
<pre class="r"><code>cat(paste("Average Specificity:", avg_knn_specificity, "\n"))</code></pre>
<pre><code>## Average Specificity: 0.865048364484173</code></pre>
<pre class="r"><code># Create a histogram for KNN accuracies
hist(knn_accuracies, main = "KNN Accuracies Histogram", xlab = "Accuracy", col = "blue")
# Add a vertical line at the average KNN accuracy
abline(v = avg_knn_accuracy, col = "red", lwd = 2)</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-8-2.png" width="672" /></p>
<p>After filtering the data to only include IPAs and other Ales, we
asked the question: could we determine whether an ale is an IPA based on
its ABV and IBU?</p>
<p>We trained a machine learning model to attempt to do just that.</p>
<p>As visually demonstrated, the IPAs mostly had higher ABV and IBU
values, while other ales largely lived in the lower ABV and IBU values.
So, we trained a k-nearest neighbor model, with k = 3, to predict if an
ale was an IPA or not.</p>
<p>Using 1000 different random seeds, we ran the model 1000 times and
captured the average performance statistics. The model’s mean accuracy,
as shown by the printed values and the histogram, was 81.4%. The mean
sensitivity, the rate at which it correctly identified a beer as an IPA,
was 72.98%. And the mean specificity, the rate at which it correctly
identified a beer as an ‘other ale’, was 86.5%.</p>
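<p>For reference, all three statistics fall directly out of the 2x2
confusion matrix that <code>confusionMatrix</code> builds on each
iteration. A minimal sketch, using made-up counts rather than our actual
results:</p>
<pre class="r"><code># Rows: predicted class, columns: actual class (counts are illustrative)
#                  Actual IPA   Actual Other Ale
# Predicted IPA        70              15
# Predicted Other      25              90
TP <- 70; FN <- 25   # true IPAs caught / missed
FP <- 15; TN <- 90   # Other Ales misflagged / caught
accuracy    <- (TP + TN) / (TP + TN + FP + FN)  # 0.80: share of all correct calls
sensitivity <- TP / (TP + FN)                   # share of true IPAs identified
specificity <- TN / (TN + FP)                   # share of true Other Ales identified
c(accuracy, sensitivity, specificity)</code></pre>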
<p>These insights could inform marketing decisions even when the model
misclassified a beer. If the model, which is usually correct, thought an
ale was an IPA when it wasn’t, that misclassification is itself an
interesting insight that could affect how we sell the beer. For example,
such an ale could be marketed with the following tagline:</p>
<p>“Hey IPA lovers, try this new ale instead, with just the right amount
of bitterness!”</p>
</div>
<div id="question-9---finding-additional-insights"
class="section level3">
<h3>Question 9 - Finding Additional Insights</h3>
<pre class="r"><code># Question 9 - additional insights
cor.test(merged_data$ABV, merged_data$IBU, method="pearson")</code></pre>
<pre><code>##
## Pearson's product-moment correlation
##
## data: merged_data$ABV and merged_data$IBU
## t = 33.863, df = 1403, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6407982 0.6984238
## sample estimates:
## cor
## 0.6706215</code></pre>
<pre class="r"><code>merged_data %>%
ggplot(aes(x=ABV, y=IBU)) +
geom_bin2d(bins=30) +
theme_minimal() +
labs(title="Density Plot of ABV vs. IBU", x="ABV (%)", y="IBU") +
scale_fill_viridis_c()</code></pre>
<pre><code>## Warning: Removed 1005 rows containing non-finite values (`stat_bin2d()`).</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-9-1.png" width="672" /></p>
<pre class="r"><code>mean_ABV <- mean(merged_data$ABV, na.rm=TRUE)
mean_IBU <- mean(merged_data$IBU, na.rm=TRUE)
mean_ABV</code></pre>
<pre><code>## [1] 0.05977342</code></pre>
<pre class="r"><code>mean_IBU</code></pre>
<pre><code>## [1] 42.71317</code></pre>
<p>In order to go above and beyond and find new insights, we decided to
do two things. First, we wanted to determine the strength of the
correlation between ABV and IBU, which we previously hypothesized to be
linearly related. After analyzing the linear relationship with
<code>cor.test</code>, we found a correlation coefficient of 0.6706
between the two. This strongly suggests a positive linear relationship;
if the coefficient were 0, there would be no linear relationship between
ABV and IBU.</p>
<p>Additionally, with a p-value below 2.2e-16, the likelihood of
observing a correlation this strong if no relationship existed is
incredibly small. We can conclude with extreme confidence that this
linear relationship exists.</p>
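<p>As a sanity check on what <code>cor.test</code> reports, the Pearson
coefficient is simply the sample covariance scaled by both standard
deviations. A small sketch with invented ABV/IBU pairs:</p>
<pre class="r"><code># Hypothetical (ABV, IBU) pairs, not drawn from the dataset
x <- c(0.045, 0.050, 0.055, 0.060, 0.070, 0.080)  # ABV
y <- c(20, 25, 35, 40, 60, 70)                    # IBU
# Pearson r = cov(x, y) / (sd(x) * sd(y)), written out by hand
r_manual <- sum((x - mean(x)) * (y - mean(y))) /
  ((length(x) - 1) * sd(x) * sd(y))
all.equal(r_manual, cor(x, y))  # the hand computation matches cor()</code></pre>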
<p>Next, we created a heatmap of ABV and IBU to determine which
‘neighborhoods’ of alcohol content and bitterness are popular, to
provide insights on which corners of the market Budweiser could enter.
We found two areas that are immensely popular. First, there is a high
concentration of beers with an ABV of around 5% and an IBU under 25.
Second, there is another high concentration of beers with an ABV
slightly under 7.5% and an IBU slightly under 75. We believe Budweiser
should focus on smaller subsidiaries that can outproduce and outperform
other craft beers at these popular intersections of IBU and ABV.</p>
</div>
<div id="chatgpt-use---questions-3-and-8-again" class="section level3">
<h3>ChatGPT Use - Questions 3 and 8 again</h3>
<pre class="r"><code>merged_data_adjusted2 <- merged_data
# Filter out rows with NA in ABV
merged_data_adjusted2 <- merged_data_adjusted2 %>%
filter(!is.na(ABV))
#below is code provided by ChatGPT - creating a linear model and imputing the missing IBU values onto the line of best fit
# Fit a linear regression model (ABV as predictor and IBU as the response)
lm_model <- lm(IBU ~ ABV, data = merged_data_adjusted2)
# Predict missing IBU values based on the linear regression model
missing_IBU_data <- merged_data_adjusted2[is.na(merged_data_adjusted2$IBU), ]
missing_IBU_data$Predicted_IBU <- predict(lm_model, newdata = missing_IBU_data)
# Impute missing IBU values with the predicted values
merged_data_imputed <- merge(merged_data_adjusted2, missing_IBU_data, all.x = TRUE)
merged_data_imputed$IBU <- ifelse(is.na(merged_data_imputed$IBU), merged_data_imputed$Predicted_IBU, merged_data_imputed$IBU)
#Above is the end of the code provided by ChatGPT - variable names were tweaked as necessary but otherwise the code worked perfectly
#recreating the filtered dataset for the knn model
filtered_data2 <- merged_data_imputed %>% filter(grepl("Ale|IPA",Style))
# Create the 'category' column - 0 is IPA and 1 is other ale
filtered_data2$category <- ifelse(grepl("IPA", filtered_data2$Style), 0, 1)
#plotting the new filtered dataset of ABV vs IBU for ales
plot <- ggplot(filtered_data2, aes(x = ABV, y = IBU, color = as.factor(category))) +
geom_point() +
labs(title = "ABV vs IBU for IPA and Other Ales",
x = "ABV (Alcohol by Volume)",
y = "IBU (International Bitterness Units)") +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
labels = c("0" = "IPA", "1" = "Other Ale")) +
theme_minimal() + labs(color = "Category")
plot</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-10-1.png" width="672" /></p>
<pre class="r"><code>set.seed(4)
splitPerc = .70
knn_accuracies <- numeric(1000)
knn_sensitivities <- numeric(1000)
knn_specificities <- numeric(1000)
# Loop through 1000 different seeds
for (i in 1:1000) {
# Set a new seed for each iteration
seed <- sample.int(10000, 1)
set.seed(seed)
# Reuse the earlier train/test split code for the KNN model
trainIndices = sample(1:dim(filtered_data2)[1],round(splitPerc * dim(filtered_data2)[1]))
train = filtered_data2[trainIndices,]
test = filtered_data2[-trainIndices,]
#knn model, k = 3, confusion matrix 2
classifications = knn(train[,c(4,5)],test[,c(4,5)], train$category, prob = TRUE, k = 3)
table(classifications,test$category)
confusion_matrix2 <- confusionMatrix(table(classifications,test$category))
# Evaluating KNN model
knn_accuracies[i] <- confusion_matrix2$overall["Accuracy"]
knn_sensitivities[i] <- confusion_matrix2$byClass["Sensitivity"]
knn_specificities[i] <- confusion_matrix2$byClass["Specificity"]
}
avg_knn_accuracy <- mean(knn_accuracies)
avg_knn_sensitivity <- mean(knn_sensitivities)
avg_knn_specificity <- mean(knn_specificities)
cat("\nK-Nearest Neighbors (KNN) Model Average Evaluation:\n")</code></pre>
<pre><code>##
## K-Nearest Neighbors (KNN) Model Average Evaluation:</code></pre>
<pre class="r"><code>cat(paste("Average Accuracy:", avg_knn_accuracy, "\n"))</code></pre>
<pre><code>## Average Accuracy: 0.811982142857143</code></pre>
<pre class="r"><code>cat(paste("Average Sensitivity:", avg_knn_sensitivity, "\n"))</code></pre>
<pre><code>## Average Sensitivity: 0.73242272243728</code></pre>
<pre class="r"><code>cat(paste("Average Specificity:", avg_knn_specificity, "\n"))</code></pre>
<pre><code>## Average Specificity: 0.859684484778119</code></pre>
<pre class="r"><code># Create a histogram for KNN accuracies
hist(knn_accuracies, main = "KNN Accuracies Histogram - Regression", xlab = "Accuracy", col = "blue")
# Add a vertical line at the average KNN accuracy
abline(v = avg_knn_accuracy, col = "red", lwd = 2)</code></pre>
<p><img src="Beers_files/figure-html/unnamed-chunk-10-2.png" width="672" /></p>
<p>Utilizing AI (specifically ChatGPT), as well as supplemental
statistics knowledge, we thought of another way to deal with missing
IBU values, and we reran the machine learning model using this
additional method. Since we knew about the linear relationship between
ABV and IBU, we decided to fit a best-fit line and impute the missing
IBU values onto that line. Because the best-fit line approximates the
linear relationship between IBU and ABV, we can estimate a missing IBU
value from a beer’s ABV. The scatterplot above demonstrates this new
way to impute the data: instead of a horizontal line of points at IBU =
35 like we had seen before, we now see a diagonal line of points right
along the best-fit line. The missing IBU values were placed on that
line based on ABV.</p>
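<p>Stripped to its essentials, the imputation is an <code>lm</code> fit
on the complete rows followed by <code>predict</code> on the rows
missing IBU. A self-contained sketch on toy data (the values are
invented):</p>
<pre class="r"><code># Toy data: two beers with missing IBU
df <- data.frame(ABV = c(0.045, 0.050, 0.065, 0.070, 0.055),
                 IBU = c(20, 25, 60, NA, NA))
# lm silently drops NA rows, so the line is fit on the complete pairs only
fit <- lm(IBU ~ ABV, data = df)
# Slot the fitted line's predictions into the missing entries
na_rows <- is.na(df$IBU)
df$IBU[na_rows] <- predict(fit, newdata = df[na_rows, ])
anyNA(df$IBU)  # FALSE: every beer now has an IBU on the best-fit line</code></pre>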
<p>We then re-ran the machine learning model on this imputed dataset in
the hope that it would be more accurate. Curiously, the model became
slightly less accurate in classifying IPAs and other ales. Even so, the
insights and visualizations provided by the second model were too
interesting not to include. They also speak to the power of ChatGPT and
other generative AIs: these are useful tools that greatly reduced the
amount of time we needed to answer our questions, we discovered, but
only if we knew the right questions to ask.</p>
</div>
</div>
<div id="conclusion" class="section level2">
<h2>Conclusion</h2>
<p>In conclusion, our analysis of this data has provided actionable
insights, identified new areas of interest for further study, and
suggested new ways Budweiser can compete in the beer marketplace and
gain an edge. We greatly appreciate the opportunity to work on this
incredible project, and we hope our insights are able to make an
impact! Thank you so much!</p>
</div>
</div>
<script>
// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
$('tr.odd').parent('tbody').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
bootstrapStylePandocTables();
});
</script>
<!-- tabsets -->
<script>
$(document).ready(function () {
window.buildTabsets("TOC");
});
$(document).ready(function () {
$('.tabset-dropdown > .nav-tabs > li').click(function () {
$(this).parent().toggleClass('nav-tabs-open');
});
});
</script>
<!-- code folding -->
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>