popsim-consortium · grahamgower · Jul 17, 2020 · Jun 15, 2020
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -37,12 +37,12 @@ jobs:
             flake8 --max-line-length 89 stdpopsim setup.py tests docs/_ext
 
       - run:
-          name: Run Python tests 
+          name: Run Python tests
           command: |
-            nosetests -vs --with-coverage --cover-package stdpopsim \
+            nosetests -vs --with-coverage --cover-package stdpopsim maintenance \
               --cover-branches --cover-erase --cover-xml \
               --cover-inclusive tests
-            python3 -m codecov 
+            python3 -m codecov
             rm .coverage
 
       - run:
@@ -53,12 +53,12 @@ jobs:
             python setup.py check
             python -m twine check dist/*.tar.gz
             python -m venv venv
-            source venv/bin/activate 
+            source venv/bin/activate
             pip install --upgrade setuptools pip wheel
             python setup.py build_ext
             python setup.py egg_info
             python setup.py bdist_wheel
-            pip install dist/*.tar.gz 
+            pip install dist/*.tar.gz
             python3 -m stdpopsim -V
             # Run a small simulation to make sure it all works.
             stdpopsim HomSap -c chr22 -l 0.001 2 > /dev/null

diff --git a/maintenance/__init__.py b/maintenance/__init__.py
@@ -0,0 +1,4 @@
+"""
+Code used for internal maintenance tasks.
+"""
+from . ensembl import *
diff --git a/maintenance/ensembl.py b/maintenance/ensembl.py
@@ -0,0 +1,113 @@
+"""
+Utilites for working with the ensembl Rest API.
+"""
+import json
+import time
+import logging
+
+import urllib.parse
+import urllib.request
+
+
+class EnsemblRestClient:
+    """
+    A client for the Ensembl REST API. Based on the example code
+    given at
+    https://github.com/Ensembl/ensembl-rest/wiki/Example-Python-Client
+
+
+    The ``max_requests_per_second`` parameter is the number of requests per second.
+    """
+
+    def __init__(self, server="http://rest.ensembl.org", max_requests_per_second=15):
+        self.server = server
+        self.max_requests_per_second = max_requests_per_second
+        self.num_requests = 0
+        self.last_request_time = 0
+
+    def _make_request(self, endpoint, headers=None, params=None):
+        if headers is None:
+            headers = {}
+        else:
+            headers = dict(headers)
+        if "Content-type" not in headers:
+            headers["Content-type"] = "Application/json"
+
+        quoted_endpoint = urllib.parse.quote(endpoint)
+        url = urllib.parse.urljoin(self.server, quoted_endpoint)
+        parsed_url = urllib.parse.urlparse(url)
+        new_url = list(parsed_url)
+        if params is not None:
+            new_url[4] = urllib.parse.urlencode(params, doseq=True)
+        return urllib.request.Request(urllib.parse.urlunparse(new_url), headers=headers)
+
+    def _sleep_if_needed(self):
+        # check if we need to rate limit ourselves
+        if self.num_requests >= self.max_requests_per_second:
+            delta = time.time() - self.last_request_time
+            if delta < 1:
+                logging.debug("Rate limiting REST API")
+                time.sleep(1 - delta)
+            self.num_requests = 0
+        else:
+            self.num_requests += 1
+        self.last_request_time = time.time()
+
+    def get(self, endpoint, headers=None, params=None):
+        """
+        Runs a HTTP get request to the specified endpoint with the
+        specified headers and parameters, and returns a dictionary
+        consisting of the response.
+        """
+        self._sleep_if_needed()
+        request = self._make_request(endpoint, headers, params)
+        logging.debug("making request to %s", request.full_url)
+        response = urllib.request.urlopen(request)
+        content = response.read()
+        data = json.loads(content)
+        return data
+
+    def get_release(self):
+        """
+        Returns the current ensembl release number.
+        """
+        output = self.get(endpoint=f"/info/data")
+        releases = output["releases"]
+        # Docs say: may return more than one release
+        # (unfrequent non-standard Ensembl configuration).
+        assert len(releases) == 1
+        return releases[0]
+
+    def get_genome_data(self, ensembl_id):
+        """
+        Returns the genome data for the specified Ensembl species
+        ID (e.g., "homo_sapiens"). This is a subset of what
+        is returned by the Ensembl API, to restrict to the information
+        that we're interested in for stdpopsim.
+        """
+        output = self.get(
+            endpoint=f"/info/assembly/{ensembl_id}", params={"synonyms": "1"}
+        )
+        data = {
+            "assembly_accession": output["assembly_accession"],
+            "assembly_name": output["assembly_name"],
+        }
+        chromosomes = {}
+        for region in output["top_level_region"]:
+            if region["coord_system"] == "chromosome":
+                synonyms = []
+                for record in region.get("synonyms", []):
+                    # We're only interested in UCSC synonyms
+                    if record["dbname"] == "UCSC":
+                        synonyms.append(record["name"])
+                chromosomes[region["name"]] = {
+                    "length": region["length"],
+                    "synonyms": synonyms
+                }
+        # Make sure the chromosomes are sorted by name
+        data["chromosomes"] = {
+            name: chromosomes[name]
+            for name in output["karyotype"]
+            if name in chromosomes
+        }
+        return data
diff --git a/stdpopsim/catalog/AraTha.py → stdpopsim/catalog/AraTha/__init__.py b/stdpopsim/catalog/AraTha.py → stdpopsim/catalog/AraTha/__init__.py
@@ -5,39 +5,39 @@
 import numpy as np
 
 import stdpopsim
+from . import genome_data
 
 ###########################################################
 #
 # Genome definition
 #
 ###########################################################
 
-# Data for length information based on:
-# https://www.arabidopsis.org/portals/genAnnotation/
-#   gene_structural_annotation/agicomplete.jsp
-# Lengths from TAIR 10 although its unclear what reference the genetic map used
-#   -- follow up on this with Salome 2012 authors
+_mean_recombination_rate = 200 / 124000 / 2 / 1e6
+
+_recombination_rate_data = {
+    str(j): _mean_recombination_rate for j in range(1, 6)
+}
+_recombination_rate_data["Mt"] = 0
+_recombination_rate_data["Pt"] = 0  # JK Is this correct??
+
 
-_chromosome_data = """\
-chr1 30427671
-chr2 19698289
-chr3 23459830
-chr4 18585056
-chr5 26975502
-"""
 # mutation rate from Ossowski 2010 Science
 # recombination value from Huber et al 2014 MBE
 # rho=200/Mb, assume Ne=124,000, rho=2*Ne*r
 _chromosomes = []
-for line in _chromosome_data.splitlines():
-    name, length = line.split()[:2]
+for name, data in genome_data.data["chromosomes"].items():
     _chromosomes.append(stdpopsim.Chromosome(
-        id=name, length=int(length),
+        id=name,
+        length=data["length"],
+        synonyms=data["synonyms"],
         mutation_rate=7e-9,
-        recombination_rate=200 / 124000 / 2 / 1e6))
+        recombination_rate=_recombination_rate_data[name]))
 
 _genome = stdpopsim.Genome(
         chromosomes=_chromosomes,
+        assembly_name=genome_data.data["assembly_name"],
+        assembly_accession=genome_data.data["assembly_accession"],
         mutation_rate_citations=[
             stdpopsim.Citation(
                 author="Ossowski et al.",
@@ -99,7 +99,7 @@
     url=(
         "https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
         "AraTha/salome2012_maps.tar.gz"),
-    file_pattern="arab_{id}_map_loess.txt",
+    file_pattern="arab_chr{id}_map_loess.txt",
     citations=[stdpopsim.Citation(
         doi="https://doi.org/10.1038/hdy.2011.95",
         author="Salomé et al.",

diff --git a/stdpopsim/catalog/AraTha/genome_data.py b/stdpopsim/catalog/AraTha/genome_data.py
@@ -0,0 +1,14 @@
+# File autogenerated from Ensembl REST API. Do not edit.
+data = {
+    "assembly_accession": "GCA_000001735.1",
+    "assembly_name": "TAIR10",
+    "chromosomes": {
+        "1": {"length": 30427671, "synonyms": []},
+        "2": {"length": 19698289, "synonyms": []},
+        "3": {"length": 23459830, "synonyms": []},
+        "4": {"length": 18585056, "synonyms": []},
+        "5": {"length": 26975502, "synonyms": []},
+        "Mt": {"length": 366924, "synonyms": []},
+        "Pt": {"length": 154478, "synonyms": []},
+    },
+}
diff --git a/stdpopsim/catalog/CanFam.py → stdpopsim/catalog/CanFam/__init__.py b/stdpopsim/catalog/CanFam.py → stdpopsim/catalog/CanFam/__init__.py
@@ -1,49 +1,51 @@
 import stdpopsim
+from . import genome_data
 
-# Lengths are based on CanFam3.1, and recombination rates are the means
-# calculated from the Campbell2016 map. ChrX isn't included in that genetic
-# map, so the weighted genome-wide average is provided here instead.
-_chromosome_data = """\
-chr1	122678785	7.636001498077e-09
-chr2	85426708	8.798516284201015e-09
-chr3	91889043	8.000868549297668e-09
-chr4	88276631	8.052302321538563e-09
-chr5	88915250	9.344333839828404e-09
-chr6	77573801	8.192191165856811e-09
-chr7	80974532	7.293465322857858e-09
-chr8	74330416	8.291312822214086e-09
-chr9	61074082	9.287722798450121e-09
-chr10	69331447	9.107151930130527e-09
-chr11	74389097	7.639449804041734e-09
-chr12	72498081	7.761061128067527e-09
-chr13	63241923	8.413015225299971e-09
-chr14	60966679	9.028123091776737e-09
-chr15	64190966	7.856754982030383e-09
-chr16	59632846	8.614063697877811e-09
-chr17	64289059	9.718834385033715e-09
-chr18	55844845	1.029925446520415e-08
-chr19	53741614	1.0425051705950442e-08
-chr20	58134056	9.990971109010329e-09
-chr21	50858623	1.0339033506095636e-08
-chr22	61439934	8.61504863805126e-09
-chr23	52294480	9.1266350068389e-09
-chr24	47698779	1.1146043069685935e-08
-chr25	51628933	1.1543746646856006e-08
-chr26	38964690	1.2084571786110922e-08
-chr27	45876710	1.1260328390864541e-08
-chr28	41182112	1.2463587044656355e-08
-chr29	41845238	1.1013629677730708e-08
-chr30	40214260	1.1687642441683382e-08
-chr31	39895921	1.1397713284329192e-08
-chr32	38810281	1.1555927931648279e-08
-chr33	31377067	1.3339402745926785e-08
-chr34	42124431	1.0483812411227089e-08
-chr35	26524999	1.4299102611645524e-08
-chr36	30810995	1.187517782077471e-08
-chr37	30902991	1.3834580623461596e-08
-chr38	23914537	1.4363726512881696e-08
-chrX	123869142	9.506483722244087e-09
-"""
+# Recombination rates are the means calculated from the Campbell2016 map. ChrX
+# isn't included in that genetic map, so the weighted genome-wide average is
+# provided here instead.
+_recombination_rate_data = {
+    "1": 7.636001498077e-09,
+    "2": 8.798516284201015e-09,
+    "3": 8.000868549297668e-09,
+    "4": 8.052302321538563e-09,
+    "5": 9.344333839828404e-09,
+    "6": 8.192191165856811e-09,
+    "7": 7.293465322857858e-09,
+    "8": 8.291312822214086e-09,
+    "9": 9.287722798450121e-09,
+    "10": 9.107151930130527e-09,
+    "11": 7.639449804041734e-09,
+    "12": 7.761061128067527e-09,
+    "13": 8.413015225299971e-09,
+    "14": 9.028123091776737e-09,
+    "15": 7.856754982030383e-09,
+    "16": 8.614063697877811e-09,
+    "17": 9.718834385033715e-09,
+    "18": 1.029925446520415e-08,
+    "19": 1.0425051705950442e-08,
+    "20": 9.990971109010329e-09,
+    "21": 1.0339033506095636e-08,
+    "22": 8.61504863805126e-09,
+    "23": 9.1266350068389e-09,
+    "24": 1.1146043069685935e-08,
+    "25": 1.1543746646856006e-08,
+    "26": 1.2084571786110922e-08,
+    "27": 1.1260328390864541e-08,
+    "28": 1.2463587044656355e-08,
+    "29": 1.1013629677730708e-08,
+    "30": 1.1687642441683382e-08,
+    "31": 1.1397713284329192e-08,
+    "32": 1.1555927931648279e-08,
+    "33": 1.3339402745926785e-08,
+    "34": 1.0483812411227089e-08,
+    "35": 1.4299102611645524e-08,
+    "36": 1.187517782077471e-08,
+    "37": 1.3834580623461596e-08,
+    "38": 1.4363726512881696e-08,
+    "X": 9.506483722244087e-09,
+    "MT": 0,
+}
 
 _LindbladTohEtAl = stdpopsim.Citation(
         # Genome sequence, comparative analysis and haplotype structure of the
@@ -73,15 +75,17 @@
         doi="https://doi.org/10.1534/g3.116.034678")
 
 _chromosomes = []
-for line in _chromosome_data.splitlines():
-    name, length, mean_rr = line.split()
+for name, data in genome_data.data["chromosomes"].items():
     _chromosomes.append(stdpopsim.Chromosome(
-        id=name, length=int(length),
+        id=name, length=data["length"],
+        synonyms=data["synonyms"],
         mutation_rate=4e-9,  # based on non-CpG sites only
-        recombination_rate=float(mean_rr)))
+        recombination_rate=_recombination_rate_data[name]))
 
 _genome = stdpopsim.Genome(
     chromosomes=_chromosomes,
+    assembly_name=genome_data.data["assembly_name"],
+    assembly_accession=genome_data.data["assembly_accession"],
     mutation_rate_citations=[
         _SkoglundEtAl.because(stdpopsim.CiteReason.MUT_RATE),
         _FranzEtAl.because(stdpopsim.CiteReason.MUT_RATE),
@@ -126,7 +130,7 @@
         """,
     url="https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
         "CanFam/dog_genetic_maps.tar.gz",
-    file_pattern="{id}_average_canFam3.1.txt",
+    file_pattern="chr{id}_average_canFam3.1.txt",
     citations=[
             _CampbellEtAl.because(stdpopsim.CiteReason.GEN_MAP)
         ],