DAPC.html

<!DOCTYPE html>

<html>

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<title>Discriminant analysis of principal components (DAPC)</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/sandstone.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<script src="site_libs/accessible-code-block-0.0.1/empty-anchor.js"></script>
<link href="site_libs/anchor-sections-1.0/anchor-sections.css" rel="stylesheet" />
<script src="site_libs/anchor-sections-1.0/anchor-sections.js"></script>
<!-- Global Site Tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-107144798-3"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments)};
  gtag('js', new Date());

  gtag('config', 'UA-107144798-3');
</script>

<style type="text/css">
  code{white-space: pre-wrap;}
  span.smallcaps{font-variant: small-caps;}
  span.underline{text-decoration: underline;}
  div.column{display: inline-block; vertical-align: top; width: 50%;}
  div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
  ul.task-list{list-style: none;}
    </style>


<style type="text/css">code{white-space: pre;}</style>
<style type="text/css" data-origin="pandoc">
code.sourceCode > span { display: inline-block; line-height: 1.25; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {  background-color: #f8f8f8; }
@media screen {
code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ef2929; } /* Alert */
code span.an { color: #8f5902; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #c4a000; } /* Attribute */
code span.bn { color: #0000cf; } /* BaseN */
code span.cf { color: #204a87; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4e9a06; } /* Char */
code span.cn { color: #000000; } /* Constant */
code span.co { color: #8f5902; font-style: italic; } /* Comment */
code span.cv { color: #8f5902; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #8f5902; font-weight: bold; font-style: italic; } /* Documentation */
code span.dt { color: #204a87; } /* DataType */
code span.dv { color: #0000cf; } /* DecVal */
code span.er { color: #a40000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #0000cf; } /* Float */
code span.fu { color: #000000; } /* Function */
code span.im { } /* Import */
code span.in { color: #8f5902; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #204a87; font-weight: bold; } /* Keyword */
code span.op { color: #ce5c00; font-weight: bold; } /* Operator */
code span.ot { color: #8f5902; } /* Other */
code span.pp { color: #8f5902; font-style: italic; } /* Preprocessor */
code span.sc { color: #000000; } /* SpecialChar */
code span.ss { color: #4e9a06; } /* SpecialString */
code span.st { color: #4e9a06; } /* String */
code span.va { color: #000000; } /* Variable */
code span.vs { color: #4e9a06; } /* VerbatimString */
code span.wa { color: #8f5902; font-weight: bold; font-style: italic; } /* Warning */

</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    for (var j = 0; j < rules.length; j++) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") continue;
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') continue;
      // replace div.sourceCode by a pre.sourceCode rule
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>


<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>

<link rel="stylesheet" href="styles.css" type="text/css" />


<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
summary {
  display: list-item;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 61px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 66px;
  margin-top: -66px;
}
.section h2 {
  padding-top: 66px;
  margin-top: -66px;
}
.section h3 {
  padding-top: 66px;
  margin-top: -66px;
}
.section h4 {
  padding-top: 66px;
  margin-top: -66px;
}
.section h5 {
  padding-top: 66px;
  margin-top: -66px;
}
.section h6 {
  padding-top: 66px;
  margin-top: -66px;
}
.dropdown-submenu {
  position: relative;
}
.dropdown-submenu>.dropdown-menu {
  top: 0;
  left: 100%;
  margin-top: -6px;
  margin-left: -1px;
  border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
  display: block;
}
.dropdown-submenu>a:after {
  display: block;
  content: " ";
  float: right;
  width: 0;
  height: 0;
  border-color: transparent;
  border-style: solid;
  border-width: 5px 0 5px 5px;
  border-left-color: #cccccc;
  margin-top: 5px;
  margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
  border-left-color: #ffffff;
}
.dropdown-submenu.pull-left {
  float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
  left: -100%;
  margin-left: 10px;
  border-radius: 6px 0 6px 6px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>

<!-- tabsets -->

<style type="text/css">
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  background: white;
  border: 1px solid #ddd;
  border-radius: 4px;
}

.tabset-dropdown > .nav-tabs > li.active:before {
  content: "";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  content: "&#xe258;";
  border: none;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
  background-color: transparent;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<!-- code folding -->


</head>

<body>


<div class="container-fluid main-container">


<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Population genetics and genomics in R</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="TOC.html">Table of contents</a>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
    Part I
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="Introduction.html">Introduction</a>
    </li>
    <li>
      <a href="Getting_ready_to_use_R.html">Getting ready to use R</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
    Part II
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="Data_Preparation.html">Data preparation</a>
    </li>
    <li>
      <a href="First_Steps.html">First steps</a>
    </li>
    <li>
      <a href="Population_Strata.html">Population strata and clone correction</a>
    </li>
    <li>
      <a href="Locus_Stats.html">Locus-based statistics and missing data</a>
    </li>
    <li>
      <a href="Genotypic_EvenRichDiv.html">Genotypic evenness, richness, and diversity</a>
    </li>
    <li>
      <a href="Linkage_disequilibrium.html">Linkage disequilibrium</a>
    </li>
    <li>
      <a href="Pop_Structure.html">Population structure</a>
    </li>
    <li>
      <a href="Minimum_Spanning_Networks.html">Minimum Spanning Networks</a>
    </li>
    <li>
      <a href="AMOVA.html">AMOVA</a>
    </li>
    <li>
      <a href="DAPC.html">Discriminant analysis of principal components (DAPC)</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
    Part III
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="intro_vcf.html">Population genomics and HTS</a>
    </li>
    <li>
      <a href="reading_vcf.html">Reading VCF data</a>
    </li>
    <li>
      <a href="analysis_of_genome.html">Analysis of genomic data</a>
    </li>
    <li>
      <a href="gbs_analysis.html">Analysis of GBS data</a>
    </li>
    <li>
      <a href="clustering_plot.html">Clustering plot</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
    Workshops
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li class="dropdown-submenu">
      <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">ICPP</a>
      <ul class="dropdown-menu" role="menu">
        <li>
          <a href="workshop_icpp.html">Preparation</a>
        </li>
        <li>
          <a href="intro_vcf.html">Introduction</a>
        </li>
        <li>
          <a href="reading_vcf.html">VCF data</a>
        </li>
        <li>
          <a href="quality_control.html">Quality control</a>
        </li>
        <li>
          <a href="gbs_analysis.html">Analysis of GBS data</a>
        </li>
        <li>
          <a href="analysis_of_genome.html">Analysis of genome data</a>
        </li>
      </ul>
    </li>
    <li class="dropdown-submenu">
      <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">APS Southern Division</a>
      <ul class="dropdown-menu" role="menu">
        <li>
          <a href="workshop_southernAPS.html">Preparation</a>
        </li>
        <li>
          <a href="intro_vcf.html">Introduction</a>
        </li>
        <li>
          <a href="reading_vcf.html">VCF data</a>
        </li>
        <li>
          <a href="quality_control.html">Quality control</a>
        </li>
        <li>
          <a href="gbs_analysis.html">Analysis of GBS data</a>
        </li>
      </ul>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
    About
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="Authors.html">Authors</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
    Appendices
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="intro_to_R.html">Introduction to R</a>
    </li>
    <li>
      <a href="Data_sets.html">Data sets</a>
    </li>
    <li>
      <a href="funpendix.html">Function glossary</a>
    </li>
    <li>
      <a href="background_functions.html">Background_functions</a>
    </li>
    <li>
      <a href="https://github.com/grunwaldlab/Population_Genetics_in_R/">Source Code</a>
    </li>
  </ul>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">


<h1 class="title toc-ignore">Discriminant analysis of principal components (DAPC)</h1>
<h3 class="subtitle"><em>NJ Grünwald, ZN Kamvar, and SE Everhart</em></h3>

</div>


<div id="introduction" class="section level1">
<h1>Introduction</h1>
<p>Often we want to infer population structure by determining the number of clusters (groups) observed without prior knowledge. Several approaches can be used to infer groups such as for example K-means clustering, Bayesian clustering using STRUCTURE, and multivariate methods such as Discriminant Analysis of Principal Components (DAPC) <span class="citation">(Pritchard, Stephens &amp; Donnelly, 2000; Jombart, Devillard &amp; Balloux, 2010; Grünwald &amp; Goss, 2011)</span>. A STRUCTURE-like approach assumes that markers are not linked and that populations are panmictic <span class="citation">(Pritchard et al., 2000)</span>. To use model-free methods K-means clustering based on genetic distance or DAPC are more convenient approaches for populations that are clonal or partially clonal. Here we explore DAPC further.</p>
</div>
<div id="dapc-analysis-of-the-h3n2-influenza-strains" class="section level1">
<h1>DAPC analysis of the H3N2 influenza strains</h1>
<p>DAPC was pioneered by Jombart and colleagues <span class="citation">(Jombart et al., 2010)</span> and can be used to infer the number of clusters of genetically related individuals. In this multivariate statistical approach variance in the sample is partitioned into a between-group and within- group component, in an effort to maximize discrimination between groups. In DAPC, data is first transformed using a principal components analysis (PCA) and subsequently clusters are identified using discriminant analysis (DA). This tutorial is based on the <a href="http://adegenet.r-forge.r-project.org/files%20/tutorial-dapc.pdf">vignette</a> written by Thibaut Jombart. We encourage the user to explore this vignette further. The vignette can also be opened within R by executing <code>adegenetTutorial("dapc")</code>.</p>
<p>We will use the seasonal influenza dataset H3N2 data containing 1903 isolates genotyped for 125 SNPs located in the hemagglutinin segment. This dataset as well as the <code>dapc()</code> function is part of the <a href="http://adegenet.r-forge.r-project.org"><em>adegenet</em></a> package.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1"></a><span class="co"># DAPC requires the adegenet package. Let&#39;s load this package:</span></span>
<span id="cb1-2"><a href="#cb1-2"></a><span class="kw">library</span>(<span class="st">&quot;adegenet&quot;</span>)</span>
<span id="cb1-3"><a href="#cb1-3"></a><span class="kw">data</span>(H3N2) <span class="co"># load the H3N2 influenza data. Type ?H3N2 for more info.</span></span>
<span id="cb1-4"><a href="#cb1-4"></a><span class="kw">pop</span>(H3N2) &lt;-<span class="st"> </span>H3N2<span class="op">$</span>other<span class="op">$</span>epid</span>
<span id="cb1-5"><a href="#cb1-5"></a>dapc.H3N2 &lt;-<span class="st"> </span><span class="kw">dapc</span>(H3N2, <span class="dt">var.contrib =</span> <span class="ot">TRUE</span>, <span class="dt">scale =</span> <span class="ot">FALSE</span>, <span class="dt">n.pca =</span> <span class="dv">30</span>, <span class="dt">n.da =</span> <span class="kw">nPop</span>(H3N2) <span class="op">-</span><span class="st"> </span><span class="dv">1</span>)</span>
<span id="cb1-6"><a href="#cb1-6"></a><span class="kw">scatter</span>(dapc.H3N2, <span class="dt">cell =</span> <span class="dv">0</span>, <span class="dt">pch =</span> <span class="dv">18</span><span class="op">:</span><span class="dv">23</span>, <span class="dt">cstar =</span> <span class="dv">0</span>, <span class="dt">mstree =</span> <span class="ot">TRUE</span>, <span class="dt">lwd =</span> <span class="dv">2</span>, <span class="dt">lty =</span> <span class="dv">2</span>)</span></code></pre></div>
<p><img src="DAPC_files/figure-html/unnamed-chunk-2-1.png" width="960" /></p>
<p>The <code>dapc()</code> arguments we used refer to:</p>
<ul>
<li>the dataset H3N2</li>
<li><code>var.contrib</code> this is set to <code>TRUE</code>, meaning that we want to retain the variables contributing to the analysis in our output. We will use this later to see which loci are responsible for separating populations.</li>
<li><code>center</code> this is set to <code>FALSE</code>, indicating that we do not want the data to be rescaled so the mean = 0.</li>
<li><code>n.pca</code> is the number of axes retained in the Principal Component Analysis (PCA). By default, it is set to <code>NULL</code>.</li>
<li><code>n.da</code> is the number of axes retained in the Discriminant Analysis (DA). By default, it is set to <code>NULL</code>.</li>
</ul>
<blockquote>
<p>It is important to set <code>n.pca = NULL</code> when you analyze your data because the number of principal components retained has a large effect on the outcome of the data. See the section below for a statistical method called cross- validation as an aid for choosing <code>n.pca</code></p>
</blockquote>
<p>The <code>scatter()</code> function is part of the <em>ade4</em> package and plots results of a DAPC analysis.</p>
<p>As you can see, each year between 2001 to 2005 is a cluster of H3N2 strains separated by axis 1. In contrast, axis 2 separates the strains observed in the 2006 cluster from the clusters observed during 2001-5, indicating that the strains observed in 2006 are genetically distinct.</p>
<p>Next, let’s assess if there are alleles that most differentiate the 2006 cluster from those in other years.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1"></a><span class="kw">set.seed</span>(<span class="dv">4</span>)</span>
<span id="cb2-2"><a href="#cb2-2"></a>contrib &lt;-<span class="st"> </span><span class="kw">loadingplot</span>(dapc.H3N2<span class="op">$</span>var.contr, <span class="dt">axis =</span> <span class="dv">2</span>, <span class="dt">thres =</span> <span class="fl">0.07</span>, <span class="dt">lab.jitter =</span> <span class="dv">1</span>)</span></code></pre></div>
<p><img src="DAPC_files/figure-html/unnamed-chunk-3-1.png" width="960" /></p>
<p>It looks like SNPs at position 399 and 906 are involved. Let’s check this further by looking at allele frequencies by year:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1"></a>temp    &lt;-<span class="st"> </span><span class="kw">seploc</span>(H3N2)       <span class="co"># seploc {adegenet} creates a list of individual loci.</span></span>
<span id="cb3-2"><a href="#cb3-2"></a>snp906  &lt;-<span class="st"> </span><span class="kw">tab</span>(temp[[<span class="st">&quot;906&quot;</span>]]) <span class="co"># tab {adegenet} returns a matrix of genotypes</span></span>
<span id="cb3-3"><a href="#cb3-3"></a>snp399  &lt;-<span class="st"> </span><span class="kw">tab</span>(temp[[<span class="st">&quot;399&quot;</span>]])</span>
<span id="cb3-4"><a href="#cb3-4"></a></span>
<span id="cb3-5"><a href="#cb3-5"></a><span class="co"># The following two commands find the average allele frequencies per population</span></span>
<span id="cb3-6"><a href="#cb3-6"></a>(freq906 &lt;-<span class="st"> </span><span class="kw">apply</span>(snp906, <span class="dv">2</span>, <span class="cf">function</span>(e) <span class="kw">tapply</span>(e, <span class="kw">pop</span>(H3N2), mean, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)))</span></code></pre></div>
<pre><code>##            906.c     906.t
## 2001 0.000000000 1.0000000
## 2002 0.000000000 1.0000000
## 2003 0.000000000 1.0000000
## 2004 0.000000000 1.0000000
## 2005 0.002155172 0.9978448
## 2006 0.616071429 0.3839286</code></pre>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1"></a>(freq399 &lt;-<span class="st"> </span><span class="kw">apply</span>(snp399, <span class="dv">2</span>, <span class="cf">function</span>(e) <span class="kw">tapply</span>(e, <span class="kw">pop</span>(H3N2), mean, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)))</span></code></pre></div>
<pre><code>##            399.c     399.t
## 2001 0.000000000 1.0000000
## 2002 0.000000000 1.0000000
## 2003 0.000000000 1.0000000
## 2004 0.001848429 0.9981516
## 2005 0.002079002 0.9979210
## 2006 0.357142857 0.6428571</code></pre>
<p>Note that a new allele appeared in 2005 for SNP locus 906 and 2004 for locus 399 separating populations along axis 2.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1"></a><span class="co"># First, set the plotting parameters</span></span>
<span id="cb7-2"><a href="#cb7-2"></a><span class="co"># mfrow = number of columns, rows</span></span>
<span id="cb7-3"><a href="#cb7-3"></a><span class="co"># mar   = plot margin size</span></span>
<span id="cb7-4"><a href="#cb7-4"></a><span class="co"># las   = axis label style (3: always vertical)</span></span>
<span id="cb7-5"><a href="#cb7-5"></a><span class="kw">par</span>(<span class="dt">mfrow =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">mar =</span> <span class="kw">c</span>(<span class="dv">5</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">0</span>) <span class="op">+</span><span class="st"> </span><span class="fl">0.1</span>, <span class="dt">las =</span> <span class="dv">3</span>)</span>
<span id="cb7-6"><a href="#cb7-6"></a></span>
<span id="cb7-7"><a href="#cb7-7"></a><span class="kw">matplot</span>(freq906,  <span class="dt">pch =</span> <span class="kw">c</span>(<span class="st">&quot;c&quot;</span>, <span class="st">&quot;t&quot;</span>), <span class="dt">type =</span> <span class="st">&quot;b&quot;</span>,</span>
<span id="cb7-8"><a href="#cb7-8"></a>        <span class="dt">xlab =</span> <span class="st">&quot;year&quot;</span>, <span class="dt">ylab =</span> <span class="st">&quot;allele frequency&quot;</span>, <span class="dt">main =</span> <span class="st">&quot;SNP # 906&quot;</span>,</span>
<span id="cb7-9"><a href="#cb7-9"></a>        <span class="dt">xaxt =</span> <span class="st">&quot;n&quot;</span>, <span class="dt">cex =</span> <span class="fl">1.5</span>)</span>
<span id="cb7-10"><a href="#cb7-10"></a><span class="kw">axis</span>(<span class="dt">side =</span> <span class="dv">1</span>, <span class="dt">at =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">6</span>, <span class="dt">lab =</span> <span class="dv">2001</span><span class="op">:</span><span class="dv">2006</span>)</span>
<span id="cb7-11"><a href="#cb7-11"></a></span>
<span id="cb7-12"><a href="#cb7-12"></a><span class="kw">matplot</span>(freq399, <span class="dt">pch =</span> <span class="kw">c</span>(<span class="st">&quot;c&quot;</span>, <span class="st">&quot;t&quot;</span>), <span class="dt">type =</span> <span class="st">&quot;b&quot;</span>,</span>
<span id="cb7-13"><a href="#cb7-13"></a>        <span class="dt">xlab =</span> <span class="st">&quot;year&quot;</span>, <span class="dt">ylab =</span> <span class="st">&quot;allele frequency&quot;</span>, <span class="dt">main =</span> <span class="st">&quot;SNP #399&quot;</span>,</span>
<span id="cb7-14"><a href="#cb7-14"></a>        <span class="dt">xaxt =</span> <span class="st">&quot;n&quot;</span>, <span class="dt">cex =</span> <span class="fl">1.5</span>)</span>
<span id="cb7-15"><a href="#cb7-15"></a><span class="kw">axis</span>(<span class="dt">side =</span> <span class="dv">1</span>, <span class="dt">at =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">6</span>, <span class="dt">lab =</span> <span class="dv">2001</span><span class="op">:</span><span class="dv">2006</span>)</span></code></pre></div>
<p><img src="DAPC_files/figure-html/unnamed-chunk-5-1.png" width="960" /></p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1"></a><span class="co"># Now we reset the plotting parameters to default</span></span>
<span id="cb8-2"><a href="#cb8-2"></a><span class="kw">par</span>(<span class="dt">mfrow =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>), <span class="dt">mar =</span> <span class="kw">c</span>(<span class="dv">5</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">2</span>) <span class="op">+</span><span class="st"> </span><span class="fl">0.1</span>, <span class="dt">las =</span> <span class="dv">0</span>)</span></code></pre></div>
<p>This plot nicely illustrates the effect of mutation, followed by selection or drift in the seasonal H3N2 influenza virus.</p>
</div>
<div id="cross-validation-dapc-analysis-of-phytophthora-ramorum-from-forests-and-nurseries" class="section level1">
<h1>Cross validation: DAPC analysis of <em>Phytophthora ramorum</em> from forests and nurseries</h1>
<p>Above we showed a nice example of a story that shows how two loci can drastically influence an epidemic. Next we will spend some time establishing what the appropriate number of principal components (PC) is for the analysis. It is important to carefully choose the correct number of PCs so as to include most sources of variation explained by an appropriate number of PCs retained. One way of ensuring that you have selected the correct number of PCs is to do cross-validation. This is a procedure in which you leave out a certain percentage of your data, run DAPC, and then see if the data that was left out is correctly placed into the population.</p>
<p>Since the H3N2 data set is quite large, we will use <em>Phytophthora ramorum</em> data from nurseries in California and Oregon <span class="citation">(Goss et al., 2009)</span> and forests in Curry County, Oregon <span class="citation">(Kamvar et al., 2015)</span>. These data represent the Sudden Oak Death epidemic in Curry County, OR from 2001-2014 separated into different watershed regions. In <span class="citation">Kamvar et al. (2015)</span>, the “Hunter Creek (HunterCr)” population was shown to be the result of a second introduction, likely from nurseries. Part of the evidence to support this conclusion came from DAPC results. Here, we will recreate the process of cross validation and reporting.</p>
<p>If we run the function <code>xvalDapc()</code> with default parameters, it will run 30 replicates of cross-validation for a number of PCs less than the total number of alleles in the data. This is a good way to get an idea of where to focus more intense cross-validation runs:</p>
<blockquote>
<p>By default <code>xvalDapc()</code> needs two parameters: 1. The genotype matrix, 2. The population factors.</p>
</blockquote>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1"></a><span class="kw">library</span>(<span class="st">&quot;poppr&quot;</span>)</span>
<span id="cb9-2"><a href="#cb9-2"></a><span class="kw">data</span>(<span class="st">&quot;Pram&quot;</span>, <span class="dt">package =</span> <span class="st">&quot;poppr&quot;</span>)</span>
<span id="cb9-3"><a href="#cb9-3"></a>Pram</span></code></pre></div>
<pre><code>## 
## This is a genclone object
## -------------------------
## Genotype information:
## 
##     98 multilocus genotypes 
##    729 diploid individuals
##      5 codominant loci
## 
## Population information:
## 
##      3 strata - SOURCE, YEAR, STATE
##      9 populations defined - 
## Nursery_CA, Nursery_OR, JHallCr_OR, ..., Winchuck_OR, ChetcoMain_OR, PistolRSF_OR</code></pre>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1"></a><span class="kw">set.seed</span>(<span class="dv">999</span>)</span>
<span id="cb11-2"><a href="#cb11-2"></a>pramx &lt;-<span class="st"> </span><span class="kw">xvalDapc</span>(<span class="kw">tab</span>(Pram, <span class="dt">NA.method =</span> <span class="st">&quot;mean&quot;</span>), <span class="kw">pop</span>(Pram))</span></code></pre></div>
<p><img src="DAPC_files/figure-html/unnamed-chunk-6-1.png" width="960" /></p>
<p>You can see that we have a peak around 15 PC. From here, we can narrow the search by specifying the number of PC to try with <code>n.pca</code> and centering it around 15, and doing 1000 replicates each (Note, this will take a long time).</p>
<blockquote>
<p>Windows users: change parallel to “snow”.</p>
</blockquote>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1"></a><span class="kw">set.seed</span>(<span class="dv">999</span>)</span>
<span id="cb12-2"><a href="#cb12-2"></a><span class="kw">system.time</span>(pramx &lt;-<span class="st"> </span><span class="kw">xvalDapc</span>(<span class="kw">tab</span>(Pram, <span class="dt">NA.method =</span> <span class="st">&quot;mean&quot;</span>), <span class="kw">pop</span>(Pram),</span>
<span id="cb12-3"><a href="#cb12-3"></a>                             <span class="dt">n.pca =</span> <span class="dv">10</span><span class="op">:</span><span class="dv">20</span>, <span class="dt">n.rep =</span> <span class="dv">1000</span>,</span>
<span id="cb12-4"><a href="#cb12-4"></a>                             <span class="dt">parallel =</span> <span class="st">&quot;multicore&quot;</span>, <span class="dt">ncpus =</span> 4L))</span></code></pre></div>
<p><img src="DAPC_files/figure-html/Pramhide-1.png" width="960" /></p>
<pre><code>##    user  system elapsed 
## 176.473   9.777  47.494</code></pre>
<p>We can see that it’s basically a flat line all the way. If we take a look at the object, we see that 16 PCs give us the highest percent of correctly predicted subsamples with the lowest error.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1"></a><span class="kw">names</span>(pramx) <span class="co"># The first element are all the samples</span></span></code></pre></div>
<pre><code>## [1] &quot;Cross-Validation Results&quot;                          
## [2] &quot;Median and Confidence Interval for Random Chance&quot;  
## [3] &quot;Mean Successful Assignment by Number of PCs of PCA&quot;
## [4] &quot;Number of PCs Achieving Highest Mean Success&quot;      
## [5] &quot;Root Mean Squared Error by Number of PCs of PCA&quot;   
## [6] &quot;Number of PCs Achieving Lowest MSE&quot;                
## [7] &quot;DAPC&quot;</code></pre>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1"></a>pramx[<span class="op">-</span><span class="dv">1</span>]</span></code></pre></div>
<pre><code>## $`Median and Confidence Interval for Random Chance`
##       2.5%        50%      97.5% 
## 0.09179837 0.11054024 0.13281445 
## 
## $`Mean Successful Assignment by Number of PCs of PCA`
##        10        11        12        13        14        15        16        17 
## 0.6483115 0.6484877 0.6460399 0.6459813 0.6563492 0.6673671 0.6733597 0.6704714 
##        18        19        20 
## 0.6703640 0.6692078 0.6670846 
## 
## $`Number of PCs Achieving Highest Mean Success`
## [1] &quot;16&quot;
## 
## $`Root Mean Squared Error by Number of PCs of PCA`
##        10        11        12        13        14        15        16        17 
## 0.3534834 0.3531600 0.3557343 0.3558209 0.3456140 0.3344596 0.3285616 0.3315655 
##        18        19        20 
## 0.3315402 0.3324705 0.3349063 
## 
## $`Number of PCs Achieving Lowest MSE`
## [1] &quot;16&quot;
## 
## $DAPC
##  #################################################
##  # Discriminant Analysis of Principal Components #
##  #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2, 
##     n.da = ..3)
## 
## $n.pca: 16 first PCs of PCA used
## $n.da: 8 discriminant functions saved
## $var (proportion of conserved variance): 0.984
## 
## $eig (eigenvalues): 881.9 207.4 97.39 57.36 49.03 ...
## 
##   vector    length content                   
## 1 $eig      8      eigenvalues               
## 2 $grp      729    prior group assignment    
## 3 $prior    9      prior group probabilities 
## 4 $assign   729    posterior group assignment
## 5 $pca.cent 38     centring vector of PCA    
## 6 $pca.norm 38     scaling vector of PCA     
## 7 $pca.eig  33     eigenvalues of PCA        
## 
##   data.frame    nrow ncol content                                          
## 1 $tab          729  16   retained PCs of PCA                              
## 2 $means        9    16   group means                                      
## 3 $loadings     16   8    loadings of variables                            
## 4 $ind.coord    729  8    coordinates of individuals (principal components)
## 5 $grp.coord    9    8    coordinates of groups                            
## 6 $posterior    729  9    posterior membership probabilities               
## 7 $pca.loadings 38   16   PCA loadings of original variables               
## 8 $var.contr    38   8    contribution of original variables</code></pre>
<p>We also have a DAPC object that we can plot comparable to figure 4 in <span class="citation">Kamvar et al. (2015)</span>:</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1"></a><span class="kw">scatter</span>(pramx<span class="op">$</span>DAPC, <span class="dt">col =</span> <span class="kw">other</span>(Pram)<span class="op">$</span>comparePal, <span class="dt">cex =</span> <span class="dv">2</span>, <span class="dt">legend =</span> <span class="ot">TRUE</span>,</span>
<span id="cb18-2"><a href="#cb18-2"></a>        <span class="dt">clabel =</span> <span class="ot">FALSE</span>, <span class="dt">posi.leg =</span> <span class="st">&quot;bottomleft&quot;</span>, <span class="dt">scree.pca =</span> <span class="ot">TRUE</span>,</span>
<span id="cb18-3"><a href="#cb18-3"></a>        <span class="dt">posi.pca =</span> <span class="st">&quot;topleft&quot;</span>, <span class="dt">cleg =</span> <span class="fl">0.75</span>, <span class="dt">xax =</span> <span class="dv">1</span>, <span class="dt">yax =</span> <span class="dv">2</span>, <span class="dt">inset.solid =</span> <span class="dv">1</span>)</span></code></pre></div>
<p><img src="DAPC_files/figure-html/unnamed-chunk-8-1.png" width="960" /></p>
<p>We can see that this shows the clear separation of Hunter Creek from the rest of the epidemic, providing evidence that this population arose from a separate introduction.</p>
</div>
<div id="conclusions" class="section level1">
<h1>Conclusions</h1>
<p>DAPC is a wonderful tool for exploring structure of populations based on PCA and DA without making assumptions of panmixia. Thus, this technique provides a robust alternative to Bayesian clustering methods like STRUCTURE <span class="citation">(Pritchard et al., 2000)</span> that should not be used for clonal or partially clonal populations.</p>
<p>DAPC analysis is inherently interactive and cannot be scripted <em>a priori</em>. Please refer to the <a href="https://github.com/thibautjombart/adegenet/blob/master/tutorials/tutorial-dapc.pdf">vignette</a> written by Thibaut Jombart for a more interactive analysis.</p>
</div>
<div id="references" class="section level1 unnumbered">
<h1>References</h1>
<div id="refs" class="references">
<div id="ref-goss2009population">
<p>Goss EM., Larsen M., Chastagner GA., Givens DR., Grünwald NJ. 2009. Population genetic analysis infers migration pathways of phytophthora ramorum in us nurseries. <em>PLoS Pathog</em> 5:e1000583. Available at: <a href="http://dx.doi.org/10.1371/journal.ppat.1000583">http://dx.doi.org/10.1371/journal.ppat.1000583</a></p>
</div>
<div id="ref-grunwald2011evolution">
<p>Grünwald NJ., Goss EM. 2011. Evolution and population genetics of exotic and re-emerging pathogens: Novel tools and approaches. <em>Annual Review of Phytopathology</em> 49:249–267. Available at: <a href="http://www.annualreviews.org/doi/abs/10.1146/annurev-phyto-072910-095246?journalCode=phyto">http://www.annualreviews.org/doi/abs/10.1146/annurev-phyto-072910-095246?journalCode=phyto</a></p>
</div>
<div id="ref-jombart2010discriminant">
<p>Jombart T., Devillard S., Balloux F. 2010. Discriminant analysis of principal components: A new method for the analysis of genetically structured populations. <em>BMC genetics</em> 11:94. Available at: <a href="http://www.biomedcentral.com/1471-2156/11/94">http://www.biomedcentral.com/1471-2156/11/94</a></p>
</div>
<div id="ref-kamvar2015spatial">
<p>Kamvar Z., Larsen M., Kanaskie A., Hansen E., Grünwald N. 2015. Spatial and temporal analysis of populations of the sudden oak death pathogen in oregon forests. <em>Phytopathology</em> 105:982–989. Available at: <a href="http://dx.doi.org/10.1094/PHYTO-12-14-0350-FI">http://dx.doi.org/10.1094/PHYTO-12-14-0350-FI</a></p>
</div>
<div id="ref-pritchard2000inference">
<p>Pritchard JK., Stephens M., Donnelly P. 2000. Inference of population structure using multilocus genotype data. <em>Genetics</em> 155:945–959. Available at: <a href="http://www.genetics.org/content/155/2/945.abstract">http://www.genetics.org/content/155/2/945.abstract</a></p>
</div>
</div>
</div>


</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.odd').parent('tbody').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- tabsets -->

<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});

$(document).ready(function () {
  $('.tabset-dropdown > .nav-tabs > li').click(function () {
    $(this).parent().toggleClass('nav-tabs-open')
  });
});
</script>

<!-- code folding -->


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>