Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic infrastructure for pulling genome data from Ensembl #563

Merged
merged 1 commit into from
Jul 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ jobs:
flake8 --max-line-length 89 stdpopsim setup.py tests docs/_ext

- run:
name: Run Python tests
name: Run Python tests
command: |
nosetests -vs --with-coverage --cover-package stdpopsim \
nosetests -vs --with-coverage --cover-package stdpopsim maintenance \
--cover-branches --cover-erase --cover-xml \
--cover-inclusive tests
python3 -m codecov
python3 -m codecov
rm .coverage

- run:
Expand All @@ -53,12 +53,12 @@ jobs:
python setup.py check
python -m twine check dist/*.tar.gz
python -m venv venv
source venv/bin/activate
source venv/bin/activate
pip install --upgrade setuptools pip wheel
python setup.py build_ext
python setup.py egg_info
python setup.py bdist_wheel
pip install dist/*.tar.gz
pip install dist/*.tar.gz
python3 -m stdpopsim -V
# Run a small simulation to make sure it all works.
stdpopsim HomSap -c chr22 -l 0.001 2 > /dev/null
Expand Down
4 changes: 4 additions & 0 deletions maintenance/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
Code used for internal maintenance tasks.
"""
from . ensembl import *
113 changes: 113 additions & 0 deletions maintenance/ensembl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Utilites for working with the ensembl Rest API.
"""
import json
import time
import logging

import urllib.parse
import urllib.request


class EnsemblRestClient:
"""
A client for the Ensembl REST API. Based on the example code
given at
https://github.com/Ensembl/ensembl-rest/wiki/Example-Python-Client


The ``max_requests_per_second`` parameter is the number of requests per second.
"""

def __init__(self, server="http://rest.ensembl.org", max_requests_per_second=15):
self.server = server
self.max_requests_per_second = max_requests_per_second
self.num_requests = 0
self.last_request_time = 0

def _make_request(self, endpoint, headers=None, params=None):
if headers is None:
headers = {}
else:
headers = dict(headers)
if "Content-type" not in headers:
headers["Content-type"] = "Application/json"

quoted_endpoint = urllib.parse.quote(endpoint)
url = urllib.parse.urljoin(self.server, quoted_endpoint)
parsed_url = urllib.parse.urlparse(url)
new_url = list(parsed_url)
if params is not None:
new_url[4] = urllib.parse.urlencode(params, doseq=True)
return urllib.request.Request(urllib.parse.urlunparse(new_url), headers=headers)

def _sleep_if_needed(self):
# check if we need to rate limit ourselves
if self.num_requests >= self.max_requests_per_second:
delta = time.time() - self.last_request_time
if delta < 1:
logging.debug("Rate limiting REST API")
time.sleep(1 - delta)
self.num_requests = 0
else:
self.num_requests += 1
self.last_request_time = time.time()

def get(self, endpoint, headers=None, params=None):
"""
Runs a HTTP get request to the specified endpoint with the
specified headers and parameters, and returns a dictionary
consisting of the response.
"""
self._sleep_if_needed()
request = self._make_request(endpoint, headers, params)
logging.debug("making request to %s", request.full_url)
response = urllib.request.urlopen(request)
content = response.read()
data = json.loads(content)
return data

def get_release(self):
"""
Returns the current ensembl release number.
"""
output = self.get(endpoint=f"/info/data")
releases = output["releases"]
# Docs say: may return more than one release
# (unfrequent non-standard Ensembl configuration).
assert len(releases) == 1
return releases[0]

def get_genome_data(self, ensembl_id):
"""
Returns the genome data for the specified Ensembl species
ID (e.g., "homo_sapiens"). This is a subset of what
is returned by the Ensembl API, to restrict to the information
that we're interested in for stdpopsim.
"""
output = self.get(
endpoint=f"/info/assembly/{ensembl_id}", params={"synonyms": "1"}
)
data = {
"assembly_accession": output["assembly_accession"],
"assembly_name": output["assembly_name"],
}
chromosomes = {}
for region in output["top_level_region"]:
if region["coord_system"] == "chromosome":
synonyms = []
for record in region.get("synonyms", []):
# We're only interested in UCSC synonyms
if record["dbname"] == "UCSC":
synonyms.append(record["name"])
chromosomes[region["name"]] = {
"length": region["length"],
"synonyms": synonyms
}
# Make sure the chromosomes are sorted by name
data["chromosomes"] = {
name: chromosomes[name]
for name in output["karyotype"]
if name in chromosomes
}
return data
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,39 @@
import numpy as np

import stdpopsim
from . import genome_data

###########################################################
#
# Genome definition
#
###########################################################

# Data for length information based on:
# https://www.arabidopsis.org/portals/genAnnotation/
# gene_structural_annotation/agicomplete.jsp
# Lengths from TAIR 10 although its unclear what reference the genetic map used
# -- follow up on this with Salome 2012 authors
_mean_recombination_rate = 200 / 124000 / 2 / 1e6

_recombination_rate_data = {
str(j): _mean_recombination_rate for j in range(1, 6)
}
_recombination_rate_data["Mt"] = 0
_recombination_rate_data["Pt"] = 0 # JK Is this correct??


_chromosome_data = """\
chr1 30427671
chr2 19698289
chr3 23459830
chr4 18585056
chr5 26975502
"""
# mutation rate from Ossowski 2010 Science
# recombination value from Huber et al 2014 MBE
# rho=200/Mb, assume Ne=124,000, rho=2*Ne*r
_chromosomes = []
for line in _chromosome_data.splitlines():
name, length = line.split()[:2]
for name, data in genome_data.data["chromosomes"].items():
_chromosomes.append(stdpopsim.Chromosome(
id=name, length=int(length),
id=name,
length=data["length"],
synonyms=data["synonyms"],
mutation_rate=7e-9,
recombination_rate=200 / 124000 / 2 / 1e6))
recombination_rate=_recombination_rate_data[name]))

_genome = stdpopsim.Genome(
chromosomes=_chromosomes,
assembly_name=genome_data.data["assembly_name"],
assembly_accession=genome_data.data["assembly_accession"],
mutation_rate_citations=[
stdpopsim.Citation(
author="Ossowski et al.",
Expand Down Expand Up @@ -99,7 +99,7 @@
url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"AraTha/salome2012_maps.tar.gz"),
file_pattern="arab_{id}_map_loess.txt",
file_pattern="arab_chr{id}_map_loess.txt",
citations=[stdpopsim.Citation(
doi="https://doi.org/10.1038/hdy.2011.95",
author="Salomé et al.",
Expand Down
14 changes: 14 additions & 0 deletions stdpopsim/catalog/AraTha/genome_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# File autogenerated from Ensembl REST API. Do not edit.
data = {
"assembly_accession": "GCA_000001735.1",
"assembly_name": "TAIR10",
"chromosomes": {
"1": {"length": 30427671, "synonyms": []},
"2": {"length": 19698289, "synonyms": []},
"3": {"length": 23459830, "synonyms": []},
"4": {"length": 18585056, "synonyms": []},
"5": {"length": 26975502, "synonyms": []},
"Mt": {"length": 366924, "synonyms": []},
"Pt": {"length": 154478, "synonyms": []},
},
}
102 changes: 53 additions & 49 deletions stdpopsim/catalog/CanFam.py → stdpopsim/catalog/CanFam/__init__.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,51 @@
import stdpopsim
from . import genome_data

# Lengths are based on CanFam3.1, and recombination rates are the means
# calculated from the Campbell2016 map. ChrX isn't included in that genetic
# map, so the weighted genome-wide average is provided here instead.
_chromosome_data = """\
chr1 122678785 7.636001498077e-09
chr2 85426708 8.798516284201015e-09
chr3 91889043 8.000868549297668e-09
chr4 88276631 8.052302321538563e-09
chr5 88915250 9.344333839828404e-09
chr6 77573801 8.192191165856811e-09
chr7 80974532 7.293465322857858e-09
chr8 74330416 8.291312822214086e-09
chr9 61074082 9.287722798450121e-09
chr10 69331447 9.107151930130527e-09
chr11 74389097 7.639449804041734e-09
chr12 72498081 7.761061128067527e-09
chr13 63241923 8.413015225299971e-09
chr14 60966679 9.028123091776737e-09
chr15 64190966 7.856754982030383e-09
chr16 59632846 8.614063697877811e-09
chr17 64289059 9.718834385033715e-09
chr18 55844845 1.029925446520415e-08
chr19 53741614 1.0425051705950442e-08
chr20 58134056 9.990971109010329e-09
chr21 50858623 1.0339033506095636e-08
chr22 61439934 8.61504863805126e-09
chr23 52294480 9.1266350068389e-09
chr24 47698779 1.1146043069685935e-08
chr25 51628933 1.1543746646856006e-08
chr26 38964690 1.2084571786110922e-08
chr27 45876710 1.1260328390864541e-08
chr28 41182112 1.2463587044656355e-08
chr29 41845238 1.1013629677730708e-08
chr30 40214260 1.1687642441683382e-08
chr31 39895921 1.1397713284329192e-08
chr32 38810281 1.1555927931648279e-08
chr33 31377067 1.3339402745926785e-08
chr34 42124431 1.0483812411227089e-08
chr35 26524999 1.4299102611645524e-08
chr36 30810995 1.187517782077471e-08
chr37 30902991 1.3834580623461596e-08
chr38 23914537 1.4363726512881696e-08
chrX 123869142 9.506483722244087e-09
"""
# Recombination rates are the means calculated from the Campbell2016 map. ChrX
# isn't included in that genetic map, so the weighted genome-wide average is
# provided here instead.
_recombination_rate_data = {
"1": 7.636001498077e-09,
"2": 8.798516284201015e-09,
"3": 8.000868549297668e-09,
"4": 8.052302321538563e-09,
"5": 9.344333839828404e-09,
"6": 8.192191165856811e-09,
"7": 7.293465322857858e-09,
"8": 8.291312822214086e-09,
"9": 9.287722798450121e-09,
"10": 9.107151930130527e-09,
"11": 7.639449804041734e-09,
"12": 7.761061128067527e-09,
"13": 8.413015225299971e-09,
"14": 9.028123091776737e-09,
"15": 7.856754982030383e-09,
"16": 8.614063697877811e-09,
"17": 9.718834385033715e-09,
"18": 1.029925446520415e-08,
"19": 1.0425051705950442e-08,
"20": 9.990971109010329e-09,
"21": 1.0339033506095636e-08,
"22": 8.61504863805126e-09,
"23": 9.1266350068389e-09,
"24": 1.1146043069685935e-08,
"25": 1.1543746646856006e-08,
"26": 1.2084571786110922e-08,
"27": 1.1260328390864541e-08,
"28": 1.2463587044656355e-08,
"29": 1.1013629677730708e-08,
"30": 1.1687642441683382e-08,
"31": 1.1397713284329192e-08,
"32": 1.1555927931648279e-08,
"33": 1.3339402745926785e-08,
"34": 1.0483812411227089e-08,
"35": 1.4299102611645524e-08,
"36": 1.187517782077471e-08,
"37": 1.3834580623461596e-08,
"38": 1.4363726512881696e-08,
"X": 9.506483722244087e-09,
"MT": 0,
}

_LindbladTohEtAl = stdpopsim.Citation(
# Genome sequence, comparative analysis and haplotype structure of the
Expand Down Expand Up @@ -73,15 +75,17 @@
doi="https://doi.org/10.1534/g3.116.034678")

_chromosomes = []
for line in _chromosome_data.splitlines():
name, length, mean_rr = line.split()
for name, data in genome_data.data["chromosomes"].items():
_chromosomes.append(stdpopsim.Chromosome(
id=name, length=int(length),
id=name, length=data["length"],
synonyms=data["synonyms"],
mutation_rate=4e-9, # based on non-CpG sites only
recombination_rate=float(mean_rr)))
recombination_rate=_recombination_rate_data[name]))

_genome = stdpopsim.Genome(
chromosomes=_chromosomes,
assembly_name=genome_data.data["assembly_name"],
assembly_accession=genome_data.data["assembly_accession"],
mutation_rate_citations=[
_SkoglundEtAl.because(stdpopsim.CiteReason.MUT_RATE),
_FranzEtAl.because(stdpopsim.CiteReason.MUT_RATE),
Expand Down Expand Up @@ -126,7 +130,7 @@
""",
url="https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"CanFam/dog_genetic_maps.tar.gz",
file_pattern="{id}_average_canFam3.1.txt",
file_pattern="chr{id}_average_canFam3.1.txt",
citations=[
_CampbellEtAl.because(stdpopsim.CiteReason.GEN_MAP)
],
Expand Down
Loading