From e4d3831b350d58a2f2c33f900bc1d6197b415d9a Mon Sep 17 00:00:00 2001 From: MoiColl Date: Wed, 7 Dec 2022 10:24:48 +0100 Subject: [PATCH] make depth_per_haplotype callable for the user --- notebook/simGL.ipynb | 416 ++++++++++++++++++++++++++++++++++++++++++- simGL/__init__.py | 2 +- 2 files changed, 408 insertions(+), 10 deletions(-) diff --git a/notebook/simGL.ipynb b/notebook/simGL.ipynb index 35b4457..867a93b 100644 --- a/notebook/simGL.ipynb +++ b/notebook/simGL.ipynb @@ -47,6 +47,9 @@ "name": "stderr", "output_type": "stream", "text": [ + "R[write to console]: RStudio Community is a great place to get help:\n", + "https://community.rstudio.com/c/tidyverse\n", + "\n", "R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──\n", "\n", "R[write to console]: ✔ tibble 3.1.7 ✔ dplyr 1.0.9\n", @@ -5153,12 +5156,12 @@ "source": [] }, { - "cell_type": "code", - "execution_count": null, - "id": "8bd53327-4ff2-44d0-b60c-f2419b54a0ff", + "cell_type": "markdown", + "id": "eadfc241-68eb-4582-b613-3ddf911e3428", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## 11. Error rate flexibility\n" + ] }, { "cell_type": "code", @@ -5586,7 +5589,7 @@ }, { "cell_type": "code", - "execution_count": 263, + "execution_count": 3, "id": "01ede551-2772-45a1-92ae-9f12f04eebe9", "metadata": {}, "outputs": [], @@ -5997,7 +6000,7 @@ }, { "cell_type": "code", - "execution_count": 252, + "execution_count": 4, "id": "1d4227e9-3158-4286-8394-fcb522312217", "metadata": {}, "outputs": [ @@ -6026,7 +6029,7 @@ " [ 2, 0, 0, 24]]])" ] }, - "execution_count": 252, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -6926,11 +6929,406 @@ "np.exp(-60/10)" ] }, + { + "cell_type": "markdown", + "id": "36ab6188-0865-4ed2-b0a2-1730debf83ab", + "metadata": {}, + "source": [ + "## 12. Check that GP can be obtained from the normalized GL" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "a17f675b-2961-4365-aa28-7a77f955e6ee", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DPh\n", + "[15 12 24 32]\n", + "DP\n", + "[[22 10 20 47]\n", + " [14 14 28 26]]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[[22, 0, 0, 0],\n", + " [ 9, 1, 0, 0],\n", + " [ 0, 20, 0, 0],\n", + " [41, 2, 2, 2]],\n", + "\n", + " [[ 0, 0, 1, 13],\n", + " [ 0, 0, 1, 13],\n", + " [ 0, 24, 3, 1],\n", + " [ 2, 0, 0, 24]]])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seed = 1234\n", + "gm = np.array([[0, 0, 1, 0], \n", + " [1, 1, 0, 1]])\n", + "ref = np.array([\"A\", \"C\"])\n", + "alt = np.array([\"C\", \"T\"])\n", + "e = np.array([0.05, 0.05, 0.05, 0.05])\n", + "mean_depth = np.array([15, 12, 24, 32])\n", + "ploidy = 2\n", + "arc = sim_allelereadcounts(gm, mean_depth, e, ploidy = 1, seed = seed, std_depth = None, ref = ref, alt = alt, read_length = None, depth_type = \"independent\")\n", + "arc" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f6118734-7382-41a4-beb5-517704397233", + "metadata": {}, + "outputs": [], + "source": [ + "def ploidy_sum(arr, ploidy):\n", + " s = arr.shape\n", + " return arr.reshape(-1).reshape(s[0], s[1]//ploidy, ploidy, s[2]).sum(axis = 2)\n", + "\n", + "def allelereadcounts_to_GL(arc, e, ploidy):\n", + " '''\n", + " Computes genotype likelihoods from allele read counts per site per individual. \n", + " \n", + " Parameters\n", + " ----------\n", + " arc : `numpy.ndarray`\n", + " Allele read counts per site per individual or haplotype. The dimentions of the array are \n", + " (sites, individuals or haplotypes, alleles). \n", + " \n", + " The second dimention will depend on the format of the `e` parameter. If the error parameter \n", + " is the same for every haplotype (`int` or `float`), the arc inputed can be per individual. \n", + " Instead, if the error parameter has a value for every haplotype (`np.array`), the arc must \n", + " be per haplotypic sample. This is because to compute GL it is needed to know the number of \n", + " reads per haplotype and their error rate. For example, to obtain the arc fir the former case \n", + " for diploid organisms one must call:\n", + " `sim_allelereadcounts(..., ploidy = 2, ...)` \n", + " but the latter, one must use:\n", + " `sim_allelereadcounts(..., ploidy = 1, ...)`. \n", + " \n", + " The third dimention of the array has size = 4, which corresponds to the four possible alleles: \n", + " 0 = \"A\", 1 = \"C\", 2 = \"G\" and 3 = \"T\".\n", + " \n", + " e : `int` or `float` or `numpy.ndarray`\n", + " Sequencing error probability per base pair per site. The values must be between 0 and 1. If a `int` or `float` \n", + " value is inputed, the function will use the same error probablity value for each haplotype and each site. \n", + " If a `numpy.ndarray` is inputed, there must be an error value per haplotype (i.e., the array must have size \n", + " (haplotypic samples, )) and the order must be the same as the second dimention of `arc`.\n", + "\n", + " ploidy : `int` \n", + " Number of haplotypic chromosomes per individual. \n", + "\n", + " Returns \n", + " -------\n", + "\n", + " GL : `numpy.ndarray`\n", + " Normalized genotype likelihoods per site per individual. The dimentions of the array are (sites, individuals, genotypes). \n", + " The third dimention of the array corresponds to the combinations with replacement of all 4 possible alleles \n", + " {\"A\", \"C\", \"G\", \"T\"} (i.e., for a diploid, there are 10 possible genotypes and the combination order is \"AA\", \"AC\",\n", + " \"AG\", \"AT\", \"CC\", \"CG\", ..., \"TT\"). \n", + "\n", + " References\n", + " ----------\n", + " 1) McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303.\n", + " 2) Thorfinn Sand Korneliussen, Anders Albrechtsen, Rasmus Nielsen. ANGSD: Analysis of Next Generation Sequencing Data. BMC Bioinform. 2014 Nov;15,356.\n", + " '''\n", + " assert check_arc(arc) and check_e(arc, e) and check_ploidy(ploidy)\n", + " \n", + " #1. Obtain an array which rows are possible genotypes depending (GT) on ploidy (ploidy) and each value is the encoded bp in that genotype (e.g., [\"AA\", \"AC\"] = [[0, 0], [0, 1]])\n", + " GTxploidy = get_GTxploidy(ploidy)\n", + " #2. Obtain an array which rows are the 4 bp, the columns are the GT and each value denotes the frequency of each allele\n", + " AFxGTxploidy = np.array([(GTxploidy == 0).sum(axis = 1), (GTxploidy == 1).sum(axis = 1), (GTxploidy == 2).sum(axis = 1), (GTxploidy == 3).sum(axis = 1)])/ploidy\n", + " \n", + " #3. We can compute the GL in two different ways: the first, which allows different error values per haplotype, is a generalized form of the second which only allows errors to be the same for all haplotypes and sites\n", + " # The reason why I keep both is because the former might be slower than the latter.\n", + " if isinstance(e, np.ndarray):\n", + " #I reformat the error array such that I can make matrix operations\n", + " er = np.repeat(e, AFxGTxploidy.size).reshape(e.shape + AFxGTxploidy.shape)\n", + " #Here it is computed the negative log of the multiplication of the error values and the \"AFxGTxploidy\" which results into an array that determines for every genotype the probabilities of observing a read\n", + " #taking into account the error probabilities\n", + " ERxAFxGTxploidy = -np.log(((AFxGTxploidy*(1-er)+(1-AFxGTxploidy)*(er/3))))\n", + " #This array is then reformated for later operations\n", + " ERxAFxGTxploidy = ERxAFxGTxploidy.reshape((1,) + ERxAFxGTxploidy.shape)\n", + " #The number of reads of each base pair are taken into account to compute the likelihood of observing all reads for a given genotype considering the error\n", + " RExerxAFxGTxploidy = np.multiply(ERxAFxGTxploidy, arc.reshape(arc.shape + (1,))).sum(axis = 2)\n", + " #The likelihoods for haplotypes of the same individual are finally added up together\n", + " GL = ploidy_sum(RExerxAFxGTxploidy, ploidy)\n", + " #The GL are normalized to the most likely genotype\n", + " return GL-GL.min(axis = 2).reshape(GL.shape[0], GL.shape[1], 1)\n", + " else:\n", + " #All the steps in the prevous if statement are done in a single line since the error is the same and simplifies the calculation\n", + " GL = np.multiply(-np.log(AFxGTxploidy*(1-e)+(1-AFxGTxploidy)*(e/3)), arc.reshape(arc.shape[0], arc.shape[1], arc.shape[2], 1)).sum(axis = 2)\n", + " #The GL are normalized to the most likely genotype\n", + " return GL-GL.min(axis = 2).reshape(GL.shape[0], GL.shape[1], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "84cf0c5a-39c4-413c-8cd5-42f91ad48c03", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[ 0. , 17.58112274, 20.94841857, 20.94841857,\n", + " 121.29153804, 121.96729347, 121.96729347, 125.3345893 ,\n", + " 125.3345893 , 125.3345893 ],\n", + " [ 46.37453531, 0. , 67.3459166 , 67.3459166 ,\n", + " 123.1925094 , 131.32453737, 131.32453737, 204.05353475,\n", + " 198.67045397, 204.05353475]],\n", + "\n", + " [[105.11933296, 105.11933296, 98.3847413 , 17.56964138,\n", + " 105.11933296, 98.3847413 , 17.56964138, 97.03323043,\n", + " 10.83504972, 0. ],\n", + " [156.91139313, 77.44780409, 148.16101652, 74.08050826,\n", + " 67.96426524, 74.08050826, 0. , 152.86834187,\n", + " 70.71321243, 63.92121397]]])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GL_norm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa5676d3-aece-4e18-beb6-68407b943267", + "metadata": {}, + "outputs": [], + "source": [ + ".transpose((1, 0, 2)).reshape(-1).reshape(GL.shape[1], GL.shape[0]*3)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "2a010180-f518-4cad-a14c-677b0bb2523b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[ 0. , 17.58112274, 121.29153804],\n", + " [ 46.37453531, 0. , 123.1925094 ]],\n", + "\n", + " [[105.11933296, 105.11933296, 105.11933296],\n", + " [156.91139313, 77.44780409, 67.96426524]]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GL_norm[:, :, [0, 1, 4]]" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "be55b850-e327-4ca8-a981-7db2449b8762", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[9.99999954e-01, 4.63068653e-08, 2.10743559e-53],\n", + " [3.62047222e-21, 1.00000000e+00, 1.57450108e-54]],\n", + "\n", + " [[2.50000000e-01, 5.00000000e-01, 2.50000000e-01],\n", + " [2.34794049e-39, 1.52165191e-04, 9.99847835e-01]]])" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#GL_norm = allelereadcounts_to_GL(arc, e, ploidy = 2)\n", + "np.exp(-GL_norm[:, :, [0, 1, 4]])*np.array([1/4, 1/2, 1/4])/np.sum(np.exp(-GL_norm[:, :, [0, 1, 4]])*np.array([1/4, 1/2, 1/4]), axis = 2).reshape(2, 2, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "466b7ce6-416f-41f9-91d2-bb81fe72190a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[9.99999954e-01, 4.63068653e-08, 2.10743559e-53],\n", + " [3.62047222e-21, 1.00000000e+00, 1.57450108e-54]],\n", + "\n", + " [[2.50000000e-01, 5.00000000e-01, 2.50000000e-01],\n", + " [2.34794049e-39, 1.52165191e-04, 9.99847835e-01]]])" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#GL_norm = allelereadcounts_to_GL(arc, e, ploidy = 2)\n", + "np.exp(-GL_norm[:, :, [0, 1, 4]])*np.array([1/4, 1/2, 1/4])/np.repeat(np.sum(np.exp(-GL_norm[:, :, [0, 1, 4]])*np.array([1/4, 1/2, 1/4]), axis = 2), 3).reshape(2, 2, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "330951c9-0de9-412d-acb6-814f37822b0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[2.50000012e-01, 2.50000012e-01, 2.50000012e-01],\n", + " [5.00000000e-01, 5.00000000e-01, 5.00000000e-01]],\n", + "\n", + " [[2.22460932e-46, 2.22460932e-46, 2.22460932e-46],\n", + " [7.61203431e-31, 7.61203431e-31, 7.61203431e-31]]])" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.repeat(np.sum(np.exp(-GL_norm[:, :, [0, 1, 4]])*np.array([1/4, 1/2, 1/4]), axis = 2), 3).reshape(2, 2, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "b960168b-7a4f-411d-abff-9634897ef403", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2, 2, 1)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GL_norm.shape[:2] + (1,)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "54e415a5-da7c-4a46-97a6-eefbf20abe24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([9.99999954e-01, 4.63068653e-08, 2.10743559e-53])" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.exp(-GL_norm[[0, 0, 0], [0, 0, 0], [0, 1, 4]])*np.array([1/4, 1/2, 1/4])/np.sum(np.exp(-GL_norm[[0, 0, 0], [0, 0, 0], [0, 1, 4]])*np.array([1/4, 1/2, 1/4]))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "9d858005-6c52-432b-b047-6448b4a8e3d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([9.99999953e-01, 4.63068652e-08, 7.98394228e-10])" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#GL_notnorm = allelereadcounts_to_GL(arc, e, ploidy = 2)\n", + "np.exp(-GL_notnorm[[0, 0, 0], [0, 0, 0], [0, 1, 2]])*np.array([1/4, 1/2, 1/4])/np.sum(np.exp(-GL_notnorm[[0, 0, 0], [0, 0, 0], [0, 1, 2]])*np.array([1/4, 1/2, 1/4]))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e448a917-2a8f-4751-b516-ee29b8cc8b37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.049787068367863944" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.exp(-3)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "6629c645-b39a-4c6a-8cb5-a8e44dcbfe17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "-np.log(0.049787068367863944)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ad2a60-492c-4117-82e7-388bf2439818", + "metadata": {}, "outputs": [], "source": [] } diff --git a/simGL/__init__.py b/simGL/__init__.py index f6c6669..a8471cc 100644 --- a/simGL/__init__.py +++ b/simGL/__init__.py @@ -1 +1 @@ -from .simGL import incorporate_monomorphic, sim_allelereadcounts, allelereadcounts_to_GL, GL_to_Mm, allelereadcounts_to_pileup \ No newline at end of file +from .simGL import incorporate_monomorphic, depth_per_haplotype, sim_allelereadcounts, allelereadcounts_to_GL, GL_to_Mm, allelereadcounts_to_pileup \ No newline at end of file