From 74b1742a56fa4004f380a2a2bd9684398387c932 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 Aug 2019 16:12:01 +0100 Subject: [PATCH] Documentation for dimension stripping. --- docs/examples.py | 53 +++++++++++- docs/stats.rst | 78 ++++++++++++++---- docs/tutorial.rst | 204 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 317 insertions(+), 18 deletions(-) diff --git a/docs/examples.py b/docs/examples.py index 5e84adeda4..0f696d4a57 100644 --- a/docs/examples.py +++ b/docs/examples.py @@ -175,7 +175,58 @@ def missing_data(): tree = ts.first() tree.draw("_static/missing_data1.svg") + +def stats(): + ts = msprime.simulate( + 10**4, Ne=10**4, recombination_rate=1e-8, mutation_rate=1e-8, length=10**7, + random_seed=42) + print("num_trees = ", ts.num_trees, ", num_sites = ", ts.num_sites, sep="") + + x = ts.diversity() + print("Average diversity per unit sequence length = {:.3G}".format(x)) + + windows = np.linspace(0, ts.sequence_length, num=5) + x = ts.diversity(windows=windows) + print(windows) + print(x) + + A = ts.samples()[:100] + x = ts.diversity(sample_sets=A) + print(x) + + B = ts.samples()[100:200] + C = ts.samples()[200:300] + x = ts.diversity(sample_sets=[A, B, C]) + print(x) + + x = ts.diversity(sample_sets=[A, B, C], windows=windows) + print("shape = ", x.shape) + print(x) + + A = ts.samples()[:100] + B = ts.samples()[:100] + x = ts.divergence([A, B]) + print(x) + + x = ts.divergence([A, B], windows=windows) + print(x) + + x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)]) + print(x) + + x = ts.divergence([A, B, C], indexes=(0, 1)) + print(x) + + x = ts.divergence([A, B, C], indexes=[(0, 1)]) + print(x) + + x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)], windows=windows) + print(x) + # moving_along_tree_sequence() # parsimony() # allele_frequency_spectra() -missing_data() +# missing_data() + +stats() + diff --git a/docs/stats.rst b/docs/stats.rst index ab2af619c0..410d2c8282 100644 --- a/docs/stats.rst +++ b/docs/stats.rst @@ -14,9 +14,8 @@ and of the underlying trees (whose definition does not depend on the genome sequ Furthermore, these statistics have a common interface to easily compute (a) averaged statistics in windows along the genome, and (b) summary statistics of many combinations of sets of samples simultaneously. -All methods return numpy arrays, -whose rows correspond to the windows along the genome, -and whose remaining dimensions are determined by the statistic. +All methods return numpy arrays whose dimensions are +determined by the parameters (see :ref:`sec_general_stats_output_dimensions`). .. warning:: :ref:`sec_data_model_missing_data` is not currently handled correctly by site statistics defined here, as we always @@ -70,12 +69,6 @@ e.g., the sites of the SNPs. Windowing ********* -By default, statistics - -``windows = None`` - This is the default, and equivalent to passing ``windows = [0.0, ts.sequence_length]``. - The output will still be a two-dimensional array, but with only one row. - Each statistic has an argument, ``windows``, which defines a collection of contiguous windows along the genome. If ``windows`` is a list of ``n+1`` increasing numbers between 0 and the ``sequence_length``, @@ -95,6 +88,14 @@ obtaining ``((b - a) * S[0] + (c - b) * S[1]) / (c - a)``. There are some shortcuts to other useful options: +``windows = None`` + This is the default and computes statistics in single window over the whole + sequence. As the first returned array contains only a single + value, we drop this dimension as described in the :ref:`output dimensions + ` section. **Note:** if you really do + want to have an array with a single value as the result, please use + ``windows = [0.0, ts.sequence_length]``. + ``windows = "trees"`` This says that you want statistics computed separately on the portion of the genome spanned by each tree, so is equivalent to passing ``windows = ts.breakpoints()``. @@ -191,7 +192,9 @@ Here are some additional special cases: If the statistic takes ``k`` inputs for ``k > 1``, and there are exactly ``k`` lists in ``sample_sets``, then this will compute just one statistic, and is equivalent to passing - ``indexes = [(0, 1, ..., k-1)]``. + ``indexes = (0, 1, ..., k-1)``. Note that this also drops the last + dimension of the output, as described in the :ref:`sec_general_stats_output_dimensions` + section. If there are not exactly ``k`` sample sets, this will throw an error. ``k=1`` does not allow ``indexes``: @@ -203,16 +206,16 @@ Here are some additional special cases: were that allowed.) -.. _sec_general_stats_output: +.. _sec_general_stats_output_format: -****** -Output -****** +************* +Output format +************* Each of the statistics methods returns a ``numpy`` ndarray. Suppose that the output is name ``out``. -In all cases, the number of rows of the output is equal to the number of windows, -so that ``out.shape[0]`` is equal to ``len(windows) - 1`` +If ``windows`` has been specified, the number of rows of the output is equal to the +number of windows, so that ``out.shape[0]`` is equal to ``len(windows) - 1`` and ``out[i]`` is an array of statistics describing the portion of the tree sequence from ``windows[i]`` to ``windows[i + 1]`` (including the left but not the right endpoint). @@ -229,6 +232,9 @@ from ``windows[i]`` to ``windows[i + 1]`` (including the left but not the right The final dimension of the arrays in other cases is specified by the method. +Note, however, that empty dimensions can optionally be dropped, +as described in the :ref:`sec_general_stats_output_dimensions` section. + A note about **default values** and **division by zero**: Under the hood, statistics computation fills in zeros everywhere, then updates these (since statistics are all **additive**, this makes sense). @@ -243,6 +249,46 @@ For ``branch`` statistics, any windows with **no branches** will similarly remai That said, you should **not** rely on the specific behavior of whether ``0`` or ``nan`` is returned for "empty" cases like these: it is subject to change. +.. _sec_general_stats_output_dimensions: + +***************** +Output dimensions +***************** + +In the general case, tskit outputs two dimensional (or three dimensional, in the case of node +stats) numpy arrays, as described in the :ref:`sec_general_stats_output_format` section. +The first dimension corresponds to the window along the genome +such that for some result array ``x``, ``x[j]`` contains information about the jth window. +The last dimension corresponds to the statistics being computed, so that ``x[j, k]`` is the +value of the kth statistic in the jth window (in the two dimensional case). This is +a powerful and general interface, but in many cases we will not use this full generality +and the extra dimensions in the numpy arrays are inconvenient. + +Tskit optionally removes empty dimensions from the output arrays following a few +simple rules. + +1. If ``windows`` is None we are computing over the single window covering the + full sequence. We therefore drop the first dimension of the array. + +2. In one-way stats, if the ``sample_sets`` argument is a 1D array we interpret + this as specifying a single sample set (and therefore a single statistic), and + drop the last dimension of the output array. If ``sample_sets`` is None + (the default), we use the sample set ``ts.samples()``, invoking + this rule (we therefore drop the last dimension by default). + +3. In k-way stats, if the ``indexes`` argument is a 1D array of length k + we intepret this as specifying a single statistic and drop the last + dimension of the array. If ``indexes`` is None (the default) and + there are k sample sets, we compute the statistic from these sample sets + and drop the last dimension. + +Rules 2 and 3 can be summarised by "the dimensions of the input determines +the dimensions of the output". Note that dropping these dimensions is +**optional**: it is always possible to keep the full dimensions of the +output arrays. + +Please see the :ref:`tutorial ` for examples of the +various output dimension options. ******************** Available statistics diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 835b751fd7..31eb4e0ab3 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -821,6 +821,208 @@ to remember that the algorithm is minimising the number of state transitions; this may not correspond always to what we might consider the most parsimonious explanation. +.. _sec_tutorial_stats: + +******************** +Computing statistics +******************** + +Tskit provides an extensive and flexible interface for computing population +genetic statistics, which is documented in detail in the :ref:`general statistics +` section. This tutorial aims to give a quick overview of +how the APIs work how to use them effectively. + +First, lets simulate a tree sequence to work with which has roughly human +parameters for 10 thousand samples and 10Mb chromosomes:: + + ts = msprime.simulate( + 10**4, Ne=10**4, recombination_rate=1e-8, mutation_rate=1e-8, length=10**7, + random_seed=42) + +We end up with 36K trees 39K segregating sites. We'd now like to compute some statistics on +this dataset. + +++++++++++++++++++ +One-way statistics +++++++++++++++++++ + +We refer to statistics that are defined with respect to a single set of +samples as "one-way". An example of such a statistic is diversity, which +is computed using the :meth:`.TreeSequence.diversity` method:: + + x = ts.diversity() + print("Average diversity per unit sequence length = {:.3G}".format(x)) + + [Output] + + Average diversity per unit sequence length = 0.000401 + +This tells the average diversity across the whole sequence and returns a single +number. We'll usually want to compute statistics in +:ref:`windows ` along the genome and we +use the ``windows`` argument to do this:: + + windows = np.linspace(0, ts.sequence_length, num=5) + x = ts.diversity(windows=windows) + print(windows) + print(x) + + [Output] + + [ 0. 2500000. 5000000. 7500000. 10000000.] + [0.00041602 0.00039112 0.00041554 0.00038329] + +The ``windows`` argument takes a numpy array specifying the breakpoints +along the genome. Here, we use numpy to create four equally spaced windows +of size 2.5 megabases (the windows array contains k + 1 elements to define +k windows). Because we have asked for values in windows, tskit now returns +a numpy array rather than a single value. (See +:ref:`sec_general_stats_output_dimensions` for a full description of how the output +dimensions of statistics are determined by the ``windows`` argument.) + +Suppose we wanted to compute diversity within a specific subset of samples. +We can do this using the ``sample_sets`` argument:: + + A = ts.samples()[:100] + x = ts.diversity(sample_sets=A) + print(x) + + [Output] + + 0.00040166573737371227 + +Here, we've computed the average diversity within the first hundred samples across +the whole genome. As we've not specified any windows, this is again a single value. + +We can also compute diversity in *multiple* sample sets at the same time by providing +a list of sample sets as an argument:: + + A = ts.samples()[:100] + B = ts.samples()[100:200] + C = ts.samples()[200:300] + x = ts.diversity(sample_sets=[A, B, C]) + print(x) + + [Output] + + [0.00040167 0.00040008 0.00040103] + +Because we've computed multiple statistics concurrently, tskit returns a numpy array +of these statistics. We have asked for diversity within three different sample sets, +and tskit therefore returns an array with three values. (In general, the +dimensions of the input determines the dimensions of the output: see +:ref:`sec_general_stats_output_dimensions` for a detailed description of the rules.) + +We can also compute multiple statistics in multiple windows:: + + x = ts.diversity(sample_sets=[A, B, C], windows=windows) + print("shape = ", x.shape) + print(x) + + [Output] + + shape = (4, 3) + [[0.0004139 0.00041567 0.00041774] + [0.00039148 0.00039152 0.00038997] + [0.00042019 0.00041039 0.00041475] + [0.0003811 0.00038274 0.00038166]] + +We have computed diversity within three different sample sets across four +genomic windows, and our output is therefore a 2D numpy array with four +rows and three columns: each row contains the diversity values within +A, B and C for a particular window. + +++++++++++++++++++++ +Multi-way statistics +++++++++++++++++++++ + +Many population genetic statistics compare multiple sets of samples to +each other. For example, the :meth:`TreeSequence.divergence` method computes +the divergence between two subsets of samples:: + + A = ts.samples()[:100] + B = ts.samples()[:100] + x = ts.divergence([A, B]) + print(x) + + [Output] + + 0.00039764908000000676 + +The divergence between two sets of samples A and B is a single number, +and we we again return a single floating point value as the result. We can also +compute this in windows along the genome, as before:: + + + x = ts.divergence([A, B], windows=windows) + print(x) + + [Output] + + [0.00040976 0.00038756 0.00041599 0.00037728] + + +Again, as we have defined four genomic windows along the sequence, the result is +numpy array with four values. + +A powerful feature of tskit's stats API is that we can compute the divergences +between multiple sets of samples simultaneously using the ``indexes`` argument:: + + + x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)]) + print(x) + + [Output] + + [0.00039765 0.00040181] + +Here, we've specified three sample sets A, B and C and we've computed the +divergences between A and B, and between A and C. The ``indexes`` argument is used +to specify which pairs of sets we are interested in. In this example +we've computed two different divergence values and the output is therefore +a numpy array of length 2. + +As before, we can combine computing multiple statistics in multiple windows +to return a 2D numpy array:: + + windows = np.linspace(0, ts.sequence_length, num=5) + x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)], windows=windows) + print(x) + + [Output] + + [[0.00040976 0.0004161 ] + [0.00038756 0.00039025] + [0.00041599 0.00041847] + [0.00037728 0.0003824 ]] + +Each row again corresponds to a window, which contains the average divergence +values between the chosen sets. + +If the ``indexes`` parameter is 1D array, we interpret this as specifying +a single statistic and remove the empty outer dimension:: + + x = ts.divergence([A, B, C], indexes=(0, 1)) + print(x) + + [Output] + + 0.00039764908000000676 + +It's important to note that we don't **have** to remove empty dimensions: tskit +will only do this if you explicitly ask it to. Here, for example, we can keep the +output as an array with one value if we wish:: + + x = ts.divergence([A, B, C], indexes=[(0, 1)]) + print(x) + + [Output] + + [0.00039765] + +Please see :ref:`sec_general_stats_sample_sets` for a +full description of the ``sample_sets`` and ``indexes`` arguments. + .. _sec_tutorial_afs: ************************ @@ -881,7 +1083,7 @@ giving:: This time, we've asked for the number of sites at each frequency in two equal windows. Now we can see that in the first half of the sequence we have three sites (compare with the site table above): one singleton, -one doubleton and one tripleton. +one doubleton and one tripleton. +++++++++++++ Joint spectra