Merge pull request #8 from BarinthusBio/revert-7-HLAF-8-better_tests

Revert "HLAF-8 better tests"
BarinthusBio · Aug 28, 2024 · cb18736 · cb18736
2 parents bed6d10 + ed8f4f9
commit cb18736
Show file tree

Hide file tree

Showing 12 changed files with 152 additions and 263 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,5 +5,4 @@ writeup/
 envs/.venv
 build/
 dist/
-src/*.egg-info/
 .coverage
diff --git a/src/HLAfreq.egg-info/PKG-INFO b/src/HLAfreq.egg-info/PKG-INFO
@@ -0,0 +1,105 @@
+Metadata-Version: 2.1
+Name: HLAfreq
+Version: 0.0.2
+Summary: Download and combine HLA frequency data from multiple studies
+Home-page: https://github.com/Vaccitech/HLAfreq
+Author: David Wells
+Author-email: david.wells@vaccitech.co.uk
+Project-URL: Tracker, https://github.com/Vaccitech/HLAfreq/issues
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# HLAfreq
+
+`HLAfreq` allows you to download and combine HLA allele
+frequencies from multiple datasets, e.g. combine data from
+several studies within a country or combine countries.
+Useful for studying regional diversity in immune genes
+and, when paired with epitope prediction, estimating a population's
+ability to mount an immune response to specific epitopes.
+
+Automated download of allele frequency data from
+[allelefrequencies.net](http://www.allelefrequencies.net/).
+
+## Details
+Estimates are combined by modelling allele frequency as a 
+Dirichlet distribution which defines the probability of drawing each
+allele. When combining studies, their estimates are weighted by 2x sample size by
+default. Sample size is doubled as each person in the study
+contributes two alleles. Alternative weightings can be used
+for example population size when averaging across countries.
+
+When selecting a panel of HLA alleles to represent a population,
+allele frequency is not the only thing to consider. Depending on
+the purpose of the panel, you should include a range of loci and
+supertypes (groups of alleles sharing binding specificities).
+
+## Install
+```
+pip install HLAfreq
+```
+
+## Minimal example
+Download HLA data using `makeURL()` and `getAFdata()`.
+All arguments that can be specified in the webpage form are available,
+see `help(HLAfreq.makeURL)` for details (press `q` to exit).
+```
+import HLAfreq
+base_url = HLAfreq.makeURL("Uganda", locus="A")
+aftab = HLAfreq.getAFdata(base_url)
+```
+
+After downloading the data, it must be filtered so that all studies
+sum to allele frequency 1 (within tolerance). Then we must ensure
+that all studies report alleles at the same resolution.
+Finally, we can combine frequency estimates.
+```
+aftab = HLAfreq.only_complete(aftab)
+aftab = HLAfreq.decrease_resolution(aftab, 2)
+caf = HLAfreq.combineAF(aftab)
+```
+
+## Detailed examples
+For more detailed walkthroughs see [HLAfreq/examples](https://github.com/Vaccitech/HLAfreq/tree/main/examples).
+
+- [Single country](https://github.com/Vaccitech/HLAfreq/blob/main/examples/single_country.ipynb) download and combine
+- [Multi-country](https://github.com/Vaccitech/HLAfreq/blob/main/examples/multi_country.ipynb) download and combine, weight by population coverage
+- [Using priors](https://github.com/Vaccitech/HLAfreq/blob/main/examples/working_with_priors.ipynb)
+- [Credible intervals](https://github.com/Vaccitech/HLAfreq/blob/main/examples/credible_intervals.ipynb)
+
+## Docs
+For help on specific functions view the docstring, `help(function_name)`.
+Full API documentation at [HLAfreq/docs](https://github.com/Vaccitech/HLAfreq/blob/main/docs/HLAfreq.md),
+created with pdoc3 in pdf mode.
+
+<!-- ## Developer notes
+# Install in dev mode
+pip install -e HLAfreq
+
+Update version in setup.py
+
+Update documentation with `pdoc --pdf -o docs/ src/HLAfreq/ > docs/HLAfreq.md`.
+
+Run tests `pytest` 
+
+# Clear old build info
+rm -rf build dist src/*.egg-info 
+
+Build with `python -m build`.
+
+twine check dist/*
+
+# Upload to test pypi
+twine upload --repository testpypi dist/*
+
+# Install from test pypi
+python3 -m pip install --extra-index-url https://test.pypi.org/simple/ HLAfreq
+
+# Upload to pypi
+twine upload dist/*
+-->
+
+## Citation
+*In prep.*
diff --git a/src/HLAfreq.egg-info/SOURCES.txt b/src/HLAfreq.egg-info/SOURCES.txt
@@ -0,0 +1,18 @@
+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+setup.py
+src/HLAfreq/HLAfreq.py
+src/HLAfreq/HLAfreq_data.py
+src/HLAfreq/HLAfreq_pymc.py
+src/HLAfreq/__init__.py
+src/HLAfreq/examples.py
+src/HLAfreq.egg-info/PKG-INFO
+src/HLAfreq.egg-info/SOURCES.txt
+src/HLAfreq.egg-info/dependency_links.txt
+src/HLAfreq.egg-info/requires.txt
+src/HLAfreq.egg-info/top_level.txt
+src/HLAfreq/data/HLA1supertypes_Sidney2008.csv
+src/HLAfreq/data/countries.csv
+tests/test_single_country_caf.py
diff --git a/src/HLAfreq.egg-info/dependency_links.txt b/src/HLAfreq.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/HLAfreq.egg-info/requires.txt b/src/HLAfreq.egg-info/requires.txt
@@ -0,0 +1,8 @@
+bs4
+requests
+pandas
+numpy
+matplotlib
+scipy
+pymc>=3
+arviz
diff --git a/src/HLAfreq.egg-info/top_level.txt b/src/HLAfreq.egg-info/top_level.txt
@@ -0,0 +1 @@
+HLAfreq
diff --git a/src/HLAfreq/HLAfreq.py b/src/HLAfreq/HLAfreq.py
@@ -8,7 +8,6 @@
 global HLA frequencies.
 """
 
-from collections.abc import Iterable
 from bs4 import BeautifulSoup
 import requests
 import pandas as pd
@@ -19,33 +18,6 @@
 import matplotlib.colors as mcolors
 
 
-def simulate_population(alleles: Iterable[str], locus: str, population: str):
-    pop_size = np.random.randint(len(alleles), 50)
-    samples = np.random.choice(alleles, pop_size, replace=True)
-    counts = pd.Series(samples).value_counts()
-    counts.values / pop_size
-    pop = pd.DataFrame(
-        {
-            "allele": counts.index,
-            "loci": locus,
-            "population": population,
-            "allele_freq": counts.values / pop_size,
-            "sample_size": pop_size,
-        }
-    )
-    return pop
-
-
-def simulate_study(alleles, populations, locus):
-    study = []
-    for i in range(populations):
-        pop = simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}")
-        study.append(pop)
-
-    study = pd.concat(study)
-    return study
-
-
 def makeURL(
     country="",
     standard="s",
@@ -192,9 +164,7 @@ def Npages(bs):
     # Get the table with number of pages
     navtab = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
     if not navtab:
-        raise AssertionError(
-            "navtab does not evaluate to True. Check URL returns results in web browser."
-        )
+        raise AssertionError("navtab does not evaluate to True. Check URL returns results in web browser.")
     # Get cell with ' of ' in
     pagesOfN = [
         td.get_text(strip=True) for td in navtab.find_all("td") if " of " in td.text
@@ -254,9 +224,7 @@ def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
     try:
         bs = BeautifulSoup(requests.get(base_url, timeout=timeout).text, "html.parser")
     except requests.exceptions.ReadTimeout as e:
-        raise Exception(
-            "Requests timeout, try a larger `timeout` value for `getAFdata()`"
-        ) from e
+        raise Exception("Requests timeout, try a larger `timeout` value for `getAFdata()`") from e
     # How many pages of results
     N = Npages(bs)
     print("%s pages of results" % N)
@@ -269,9 +237,7 @@ def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
         try:
             bs = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
         except requests.exceptions.ReadTimeout as e:
-            raise Exception(
-                "Requests timeout, try a larger `timeout` value for `getAFdata()`"
-            ) from e
+            raise Exception("Requests timeout, try a larger `timeout` value for `getAFdata()`") from e
         tab = parseAF(bs)
         tabs.append(tab)
     print("Download complete")
@@ -401,13 +367,9 @@ def collapse_reduced_alleles(AFtab, datasetID="population"):
     ).reset_index()
     # Within a study each all identical alleles should have the same loci and sample size
     if not all(collapsed["#loci"] == 1):
-        raise AssertionError(
-            "Multiple loci found for a single allele in a single population"
-        )
+        raise AssertionError("Multiple loci found for a single allele in a single population")
     if not all(collapsed["#sample_sizes"] == 1):
-        raise AssertionError(
-            "Multiple sample_sizes found for a single allele in a single population"
-        )
+        raise AssertionError("Multiple sample_sizes found for a single allele in a single population")
     collapsed = collapsed[
         ["allele", "loci", "population", "allele_freq", "sample_size"]
     ]
@@ -439,9 +401,7 @@ def unmeasured_alleles(AFtab, datasetID="population"):
             # What was the sample size for this data?
             dataset_sample_size = datasetAF.sample_size.unique()
             if not (len(dataset_sample_size) == 1):
-                raise AssertionError(
-                    "dataset_sample_size must be 1, not %s" % len(dataset_sample_size)
-                )
+                raise AssertionError("dataset_sample_size must be 1, not %s" % len(dataset_sample_size))
             dataset_sample_size = dataset_sample_size[0]
             # Get all alleles for this locus (across datasets)
             ualleles = df[df.loci == locus].allele.unique()
@@ -529,14 +489,10 @@ def combineAF(
             raise AssertionError("The same allele appears multiple times in a dataset")
     if complete:
         if not incomplete_studies(df, datasetID=datasetID).empty:
-            raise AssertionError(
-                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
-            )
+            raise AssertionError("AFtab contains studies with AF that doesn't sum to 1. Checkincomplete_studies(AFtab)")
     if resolution:
         if not check_resolution(df):
-            raise AssertionError(
-                "AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)"
-            )
+            raise AssertionError("AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)")
     if format:
         df = formatAF(df, ignoreG)
     if add_unmeasured:
@@ -593,7 +549,7 @@ def single_loci(AFtab):
         AFtab (pd.DataFrame): Allele frequency data
     """
     if not len(AFtab.loci.unique()) == 1:
-        raise AssertionError("'AFtab' must contain only 1 loci")
+        raise AssertionError("'AFtab' must conatain only 1 loci")
 
 
 def alleles_unique_in_study(AFtab, datasetID="population"):
@@ -632,9 +588,7 @@ def id_duplicated_allele(grouped):
     """Reports the allele that has mupltiple sample sizes"""
     duplicated_population = grouped.population.apply(lambda x: any(x.duplicated()))
     if not all(~duplicated_population):
-        raise AssertionError(
-            f"duplicated population within allele {duplicated_population[duplicated_population].index.tolist()}"
-        )
+        raise AssertionError(f"duplicated population within allele {duplicated_population[duplicated_population].index.tolist()}")
 
 
 def population_coverage(p):

diff --git a/src/HLAfreq/HLAfreq_pymc.py b/src/HLAfreq/HLAfreq_pymc.py
@@ -29,14 +29,10 @@ def _make_c_array(
             raise AssertionError("The same allele appears multiple times in a dataset")
     if complete:
         if not HLAfreq.incomplete_studies(df, datasetID=datasetID).empty:
-            raise AssertionError(
-                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
-            )
+            raise AssertionError("AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)")
     if resolution:
         if not HLAfreq.check_resolution(df):
-            raise AssertionError(
-                "AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)"
-            )
+            raise AssertionError("AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)")
     if format:
         df = HLAfreq.formatAF(df, ignoreG)
     if add_unmeasured:
@@ -57,10 +53,8 @@ def _make_c_array(
     # The check is that the sum of allele i is the same
     for a, b in zip(np.apply_along_axis(sum, 0, c_array), df.groupby("allele").c.sum()):
         if not math.isclose(a, b):
-            raise AssertionError(
-                "Error making c_array sum of single allele"
-                "frequency differs between c_array and AFloc"
-            )
+            raise AssertionError("Error making c_array sum of single allele"
+                                 "frequency differs between c_array and AFloc")
     return c_array, allele_names
 
 

diff --git a/src/HLAfreq/examples.py b/src/HLAfreq/examples.py
@@ -7,3 +7,4 @@
 - [Using priors](https://BarinthusBio.github.io/HLAfreq/HLAfreq/examples/working_with_priors.html)
 - [Credible intervals](https://BarinthusBio.github.io/HLAfreq/HLAfreq/examples/credible_intervals.html)
 """
+
diff --git a/tests/test_HLAfreq_pymc.py b/tests/test_HLAfreq_pymc.py