Skip to content

Commit

Permalink
Add support for downloading plant species with pyEnsembl
Browse files Browse the repository at this point in the history
This commit introduces the ability to download plant species data using pyEnsembl. We've added the 'is_plant' parameter to the Species class, and registered two new species: Arabidopsis thaliana and Oryza sativa (rice).

We've also added the ENSEMBL_PLANTS_FTP_SERVER URL, and the PLANTS_GTF_SUBDIR_TEMPLATE and PLANTS_FASTA_SUBDIR_TEMPLATE for creating the download links. The code checks if the species is a plant to determine which templates to use.
  • Loading branch information
pamonlan committed Mar 28, 2024
1 parent 2208f87 commit 8c32bc1
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 12 deletions.
3 changes: 3 additions & 0 deletions pyensembl/ensembl_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,14 @@ def __init__(
species=self.species.latin_name,
sequence_type="cdna",
server=server,
is_plant = self.species.is_plant,
),
make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="ncrna",
server=server,
is_plant = self.species.is_plant,
),
]

Expand All @@ -92,6 +94,7 @@ def __init__(
species=self.species.latin_name,
sequence_type="pep",
server=self.server,
is_plant = self.species.is_plant,
)
]

Expand Down
31 changes: 24 additions & 7 deletions pyensembl/ensembl_url_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,19 @@
from .ensembl_versions import check_release_number

ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk/"

# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
# GTF annotation files: /pub/release-78/gtf/homo_sapiens/
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/"
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/"

# List of plants
# Let's keep a tuple with the names of all the plant species that we added, to make the custom URL
lPlants = ("arabidopsis_thaliana","arabidopsis")

def normalize_release_properties(ensembl_release, species):
"""
Expand Down Expand Up @@ -63,12 +69,18 @@ def make_gtf_filename(ensembl_release, species):
}


def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE):
    """
    Build the full URL of the GTF annotation file for a release and species.

    Parameters
    ----------
    ensembl_release
        Release number; normalized by normalize_release_properties.
    species
        Species object, or (presumably) a species name -- verify against
        callers. Only an object carrying a true `is_plant` attribute selects
        the plant server and sub-directory.
    server
        Base URL of the FTP server. Replaced by ENSEMBL_PLANTS_FTP_SERVER
        when the species is a plant.
    gtf_subdir
        %-style template with "release" and "species" keys. Replaced by
        PLANTS_GTF_SUBDIR_TEMPLATE when the species is a plant.

    Returns
    -------
    A single string: server + sub-directory + filename.
    """
    # Read the flag before normalization rebinds `species`. A value without
    # an `is_plant` attribute (e.g. a plain string) is treated as a non-plant
    # instead of raising AttributeError.
    if getattr(species, "is_plant", False):
        server = ENSEMBL_PLANTS_FTP_SERVER
        gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE

    ensembl_release, species, _ = normalize_release_properties(ensembl_release, species)
    subdir = gtf_subdir % {"release": ensembl_release, "species": species}
    filename = make_gtf_filename(ensembl_release=ensembl_release, species=species)
    # The sub-directory templates start with "/", so drop any trailing "/"
    # from the server (the plants server has one) to avoid "//pub/...".
    return server.rstrip("/") + subdir + filename

Expand All @@ -93,11 +105,11 @@ def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"


def make_fasta_filename(ensembl_release, species, sequence_type):
def make_fasta_filename(ensembl_release, species, sequence_type, is_plant):
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
)
if ensembl_release <= 75:
if ensembl_release <= 75 and not is_plant:
if sequence_type == "ncrna":
return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % {
"Species": species.capitalize(),
Expand Down Expand Up @@ -125,7 +137,7 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
}


def make_fasta_url(ensembl_release, species, sequence_type, is_plant=False, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE):
    """Construct URL to FASTA file with cDNA transcript or protein sequences

    Parameters
    ----------
    ensembl_release
        Release number; normalized by normalize_release_properties.
    species
        Species identifier; normalized by normalize_release_properties.
    sequence_type
        Sequence type used for the sub-directory and filename
        (callers in this package pass "cdna", "ncrna" or "pep").
    is_plant
        When true, the plant server and plant sub-directory template are
        used instead of `server` and `fasta_subdir`. Defaults to False so
        callers that do not know about plants keep working.
    server
        Base URL of the FTP server.
    fasta_subdir
        %-style template with "release", "species" and "type" keys.

    Returns
    -------
    A single string: server + sub-directory + filename.
    """
    ensembl_release, species, reference_name = normalize_release_properties(
        ensembl_release, species
    )

    if is_plant:
        server = ENSEMBL_PLANTS_FTP_SERVER
        fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE

    subdir = fasta_subdir % {
        "release": ensembl_release,
        "species": species,
        "type": sequence_type,
    }
    filename = make_fasta_filename(
        ensembl_release=ensembl_release,
        species=species,
        sequence_type=sequence_type,
        is_plant=is_plant,
    )
    # The sub-directory templates start with "/", so drop any trailing "/"
    # from the server (the plants server has one) to avoid "//pub/...".
    return server.rstrip("/") + subdir + filename
2 changes: 1 addition & 1 deletion pyensembl/ensembl_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

MIN_ENSEMBL_RELEASE = 47
MAX_ENSEMBL_RELEASE = 111

MAX_PLANTS_ENSEMBL_RELEASE = 58

def check_release_number(release):
"""
Expand Down
26 changes: 23 additions & 3 deletions pyensembl/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from serializable import Serializable

from .ensembl_versions import MAX_ENSEMBL_RELEASE
from .ensembl_versions import MAX_ENSEMBL_RELEASE, MAX_PLANTS_ENSEMBL_RELEASE

# TODO: replace Serializable with data class

Expand All @@ -30,7 +30,7 @@ class Species(Serializable):
_reference_names_to_species = {}

@classmethod
def register(cls, latin_name, synonyms, reference_assemblies):
def register(cls, latin_name, synonyms, reference_assemblies, is_plant=False):
"""
Create a Species object from the given arguments and enter into
all the dicts used to look the species up by its fields.
Expand All @@ -39,6 +39,7 @@ def register(cls, latin_name, synonyms, reference_assemblies):
latin_name=latin_name,
synonyms=synonyms,
reference_assemblies=reference_assemblies,
is_plant=is_plant,
)
cls._latin_names_to_species[species.latin_name] = species
for synonym in synonyms:
Expand Down Expand Up @@ -80,7 +81,7 @@ def all_species_release_pairs(cls):
for release in range(release_range[0], release_range[1] + 1):
yield species_name, release

def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
def __init__(self, latin_name, synonyms=[], reference_assemblies={}, is_plant=False):
"""
Parameters
----------
Expand All @@ -96,6 +97,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
self.synonyms = synonyms
self.reference_assemblies = reference_assemblies
self._release_to_genome = {}
self.is_plant = is_plant
for genome_name, (start, end) in self.reference_assemblies.items():
for i in range(start, end + 1):
if i in self._release_to_genome:
Expand Down Expand Up @@ -350,3 +352,21 @@ def check_species_object(species_name_or_object):
"R64-1-1": (76, MAX_ENSEMBL_RELEASE),
},
)

arabidopsis_thaliana = Species.register(
latin_name="arabidopsis_thaliana",
synonyms=["arabidopsis"],
reference_assemblies={
"TAIR10": (40, MAX_PLANTS_ENSEMBL_RELEASE),
},
is_plant=True
)

rice = Species.register(
latin_name="oryza_sativa",
synonyms=["rice"],
reference_assemblies={
"IRGSP-1.0": (40, MAX_PLANTS_ENSEMBL_RELEASE),
},
is_plant=True
)
2 changes: 1 addition & 1 deletion pyensembl/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "2.3.11"
__version__ = "2.3.12"

def print_version():
    """Print the package version to standard output, prefixed with "v"."""
    version_label = f"v{__version__}"
    print(version_label)
Expand Down

0 comments on commit 8c32bc1

Please sign in to comment.