From 819cbb197113497f6af97999239ddec2553911ed Mon Sep 17 00:00:00 2001 From: Chang Y Date: Fri, 29 Jul 2022 23:45:47 -0500 Subject: [PATCH 01/35] Update ensembl_release_versions.py --- pyensembl/ensembl_release_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index 1b86d24..c19279e 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -11,7 +11,7 @@ # limitations under the License. MIN_ENSEMBL_RELEASE = 54 -MAX_ENSEMBL_RELEASE = 106 +MAX_ENSEMBL_RELEASE = 107 def check_release_number(release): """ From 14634449db472812041ad2a50480fbfea1ef3426 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Thu, 18 Aug 2022 16:31:06 -0500 Subject: [PATCH 02/35] fix naming --- pyensembl/fasta.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py index 0893a81..9c724db 100644 --- a/pyensembl/fasta.py +++ b/pyensembl/fasta.py @@ -51,9 +51,10 @@ def _parse_header_id(line): # .e.g. # "ENST00000448914.1" instead of "ENST00000448914" # So now we have to parse out the identifier - dot_index = identifier.find(b".") - if dot_index >= 0: - identifier = identifier[:dot_index] + if identifier.startswith(b"ENS"): + dot_index = identifier.find(b".") + if dot_index >= 0: + identifier = identifier[:dot_index] return identifier.decode("ascii") From 6829d89399954d68622995ec22849a3e2bdd3852 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Thu, 18 Aug 2022 17:11:42 -0500 Subject: [PATCH 03/35] quick update --- pyensembl/fasta.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py index 9c724db..0893a81 100644 --- a/pyensembl/fasta.py +++ b/pyensembl/fasta.py @@ -51,10 +51,9 @@ def _parse_header_id(line): # .e.g. # "ENST00000448914.1" instead of "ENST00000448914" # So now we have to parse out the identifier - if identifier.startswith(b"ENS"): - dot_index = identifier.find(b".") - if dot_index >= 0: - identifier = identifier[:dot_index] + dot_index = identifier.find(b".") + if dot_index >= 0: + identifier = identifier[:dot_index] return identifier.decode("ascii") From 2ee9eb2dd62f66b0469bd69e06523d3de8c9f863 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Thu, 18 Aug 2022 17:41:58 -0500 Subject: [PATCH 04/35] quick update --- pyensembl/fasta.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py index 0893a81..bf2d813 100644 --- a/pyensembl/fasta.py +++ b/pyensembl/fasta.py @@ -51,9 +51,13 @@ def _parse_header_id(line): # .e.g. # "ENST00000448914.1" instead of "ENST00000448914" # So now we have to parse out the identifier - dot_index = identifier.find(b".") - if dot_index >= 0: - identifier = identifier[:dot_index] + + # only split name of ENSEMBL naming. In other database, such as TAIR, + # the '.1' notation is the isoform not the version. 
+ if identifier.startswith(b"ENS"): + dot_index = identifier.find(b".") + if dot_index >= 0: + identifier = identifier[:dot_index] return identifier.decode("ascii") From 43f5e1cbe592ec1a229e89bca7b14b36cb5502d6 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Thu, 18 Aug 2022 17:55:49 -0500 Subject: [PATCH 05/35] quick update --- pyensembl/transcript.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py index 385f07a..19061b4 100644 --- a/pyensembl/transcript.py +++ b/pyensembl/transcript.py @@ -416,6 +416,9 @@ def sequence(self): Spliced cDNA sequence of transcript (includes 5" UTR, coding sequence, and 3" UTR) """ + transcript_id = self.transcript_id + if transcript_id.startswith("ENS"): + transcript_id = transcript_id.rsplit(".", 1)[0] return self.genome.transcript_sequences.get(self.transcript_id.rsplit(".", 1)[0]) @memoized_property From be260ffdc4b32c177144c246e06f2a83e4a41145 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Thu, 18 Aug 2022 17:59:14 -0500 Subject: [PATCH 06/35] quick update --- pyensembl/transcript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py index 19061b4..9d30c5c 100644 --- a/pyensembl/transcript.py +++ b/pyensembl/transcript.py @@ -419,7 +419,7 @@ def sequence(self): transcript_id = self.transcript_id if transcript_id.startswith("ENS"): transcript_id = transcript_id.rsplit(".", 1)[0] - return self.genome.transcript_sequences.get(self.transcript_id.rsplit(".", 1)[0]) + return self.genome.transcript_sequences.get(transcript_id) @memoized_property def first_start_codon_spliced_offset(self): From 86968b54c05e932258aa7a9fb2749a097194f151 Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Thu, 28 Dec 2023 16:13:42 -0600 Subject: [PATCH 07/35] add species --- pyensembl/species.py | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/pyensembl/species.py b/pyensembl/species.py index 588c4f7..bc05890 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -100,7 +100,8 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}): for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" % i + "Ensembl release %d already has an associated genome" + % i ) self._release_to_genome[i] = genome_name @@ -113,10 +114,13 @@ def which_reference(self, ensembl_release): return self._release_to_genome[ensembl_release] def __str__(self): - return "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % ( - self.latin_name, - self.synonyms, - self.reference_assemblies, + return ( + "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" + % ( + self.latin_name, + self.synonyms, + self.reference_assemblies, + ) ) def __eq__(self, other): @@ -304,6 +308,18 @@ def check_species_object(species_name_or_object): reference_assemblies={"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)}, ) +zebrafish = Species.register( + latin_name="danio_rerio", + synonyms=["zebrafish"], + reference_assemblies={ + "ZFISH7": (47, 53), + "Zv8": (54, 59), + "Zv9": (60, 79), + "GRCz10": (80, 91), + "GRCz11": (92, MAX_ENSEMBL_RELEASE), + }, +) + fly = Species.register( latin_name="drosophila_melanogaster", synonyms=["drosophila", "fruit fly", "fly"], @@ -316,10 +332,23 @@ def check_species_object(species_name_or_object): }, ) +nematode = Species.register( + latin_name="caenorhabditis_elegans", + synonyms=["nematode", "C_elegans"], + 
reference_assemblies={ + "WS180": (47, 49), + "WS190": (50, 54), + "WS200": (55, 57), + "WS210": (58, 78), + "WS220": (79, 66), + "WBcel235": (67, MAX_ENSEMBL_RELEASE), + }, +) + yeast = Species.register( latin_name="saccharomyces_cerevisiae", - synonyms=["yeast","budding_yeast"], + synonyms=["yeast", "budding_yeast"], reference_assemblies={ "R64-1-1": (76, MAX_ENSEMBL_RELEASE), }, -) \ No newline at end of file +) From 29c4cce5133ea24e80257fff085dca5e57476119 Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Fri, 29 Dec 2023 12:05:17 -0600 Subject: [PATCH 08/35] format and relase --- README.md | 33 ++--- docs/conf.py | 155 ++++++++++++----------- pyensembl/genome.py | 5 +- pyensembl/species.py | 22 ++-- pyensembl/transcript.py | 167 ++++++++++++------------- pyensembl/version.py | 2 +- test/common.py | 3 + test/data.py | 109 ++++++++-------- test/test_contigs.py | 1 + test/test_download_cache.py | 36 +++--- test/test_ensembl_gtf.py | 1 + test/test_ensembl_object_properties.py | 1 + test/test_exon_id.py | 150 +++++++++++++--------- test/test_exon_object.py | 19 +-- test/test_gene_ids.py | 44 ++++--- test/test_gene_names.py | 34 +++-- test/test_gene_objects.py | 30 +++-- test/test_id_length.py | 18 +-- test/test_locus.py | 5 + test/test_missing_genome_sources.py | 44 ++++--- test/test_mouse.py | 26 ++-- test/test_release_versions.py | 15 ++- test/test_search.py | 32 +++-- test/test_sequence_data.py | 35 +++--- test/test_serialization.py | 7 +- test/test_string_representation.py | 21 +++- test/test_timings.py | 26 +++- test/test_transcript_ids.py | 41 +++--- test/test_transcript_objects.py | 121 +++++++++++------- test/test_transcript_sequences.py | 1 + test/test_transcript_support_level.py | 9 +- test/test_ucsc_gtf.py | 72 +++++------ 32 files changed, 732 insertions(+), 553 deletions(-) diff --git a/README.md b/README.md index 624d036..d1445fa 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,9 @@ PyPI +# PyEnsembl -PyEnsembl -======= -PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files. +PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files. # Example Usage @@ -25,7 +24,7 @@ data = EnsemblRelease(77) gene_names = data.gene_names_at_locus(contig=6, position=29945884) # get all exons associated with HLA-A -exon_ids = data.exon_ids_of_gene_name('HLA-A') +exon_ids = data.exon_ids_of_gene_name("HLA-A") ``` # Installation @@ -52,6 +51,7 @@ Alternatively, you can create the `EnsemblRelease` object from inside a Python process and call `ensembl_object.download()` followed by `ensembl_object.index()`. ## Cache Location + By default, PyEnsembl uses the platform-specific `Cache` folder and caches the files into the `pyensembl` sub-directory. 
You can override this default by setting the environment key `PYENSEMBL_CACHE_DIR` @@ -66,11 +66,11 @@ or ```python import os -os.environ['PYENSEMBL_CACHE_DIR'] = '/custom/cache/dir' +os.environ["PYENSEMBL_CACHE_DIR"] = "/custom/cache/dir" # ... PyEnsembl API usage ``` -# Usage tips +# Usage tips ## List installed genomes @@ -80,6 +80,7 @@ pyensembl list ```python from pyensembl.shell import collect_all_installed_ensembl_releases + collect_all_installed_ensembl_releases() ``` @@ -87,10 +88,11 @@ collect_all_installed_ensembl_releases() ```python from pyensembl import EnsemblRelease + data = EnsemblRelease( release=100, - species=find_species_by_name('drosophila_melanogaster'), - ) + species=find_species_by_name("drosophila_melanogaster"), +) ``` ## Data structure @@ -98,13 +100,13 @@ data = EnsemblRelease( ### Gene object ```python -gene=data.gene_by_id(gene_id='FBgn0011747') +gene = data.gene_by_id(gene_id="FBgn0011747") ``` ### Transcript object ```python -transcript=gene.transcripts[0] +transcript = gene.transcripts[0] ``` ### Protein information @@ -125,11 +127,12 @@ For example: ```python from pyensembl import Genome + data = Genome( - reference_name='GRCh38', - annotation_name='my_genome_features', + reference_name="GRCh38", + annotation_name="my_genome_features", # annotation_version=None, - gtf_path_or_url='/My/local/gtf/path_to_my_genome_features.gtf', # Path or URL of GTF file + gtf_path_or_url="/My/local/gtf/path_to_my_genome_features.gtf", # Path or URL of GTF file # transcript_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing transcript sequences # protein_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing protein sequences # cache_directory_path=None, # Where to place downloaded and cached files for this genome @@ -142,8 +145,8 @@ gene_names = data.gene_names_at_locus(contig=6, position=29945884) # API The `EnsemblRelease` object has methods to let you access all possible -combinations of the annotation features *gene\_name*, *gene\_id*, -*transcript\_name*, *transcript\_id*, *exon\_id* as well as the location of +combinations of the annotation features _gene_name_, _gene_id_, +_transcript_name_, _transcript_id_, _exon_id_ as well as the location of these genomic elements (contig, start position, end position, strand). ## Genes diff --git a/docs/conf.py b/docs/conf.py index bbc0aaf..1c4034e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,47 +18,47 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', + "sphinx.ext.autodoc", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. 
-#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'pyensembl' -copyright = u'2016, Hammer Lab' -author = u'Hammer Lab' +project = "pyensembl" +copyright = "2016, Hammer Lab" +author = "Hammer Lab" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'0.8.10' +version = "0.8.10" # The full version, including alpha/beta/rc tags. -release = u'0.8.10' +release = "0.8.10" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -69,37 +69,37 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -109,156 +109,149 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'pyensembldoc' +htmlhelp_basename = "pyensembldoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. 
+ #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'pyensembl.tex', u'pyensembl Documentation', - u'Hammer Lab', 'manual'), + (master_doc, "pyensembl.tex", "pyensembl Documentation", "Hammer Lab", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'pyensembl', u'pyensembl Documentation', - [author], 1) -] +man_pages = [(master_doc, "pyensembl", "pyensembl Documentation", [author], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -267,19 +260,25 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'pyensembl', u'pyensembl Documentation', - author, 'pyensembl', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "pyensembl", + "pyensembl Documentation", + author, + "pyensembl", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/pyensembl/genome.py b/pyensembl/genome.py index 05b6efc..5345742 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -291,8 +291,9 @@ def db(self): # make sure GTF file exists locally # and populate self.gtf_path self._set_local_paths( - download_if_missing=True, ## if set at False the files are not downloaded in interactive python, works anyways via command line though - overwrite=False) + download_if_missing=True, ## if set at False the files are not downloaded in interactive python, works anyways via command line though + overwrite=False, + ) if self.gtf_path is None: raise ValueError("Property 'gtf_path' of %s cannot be None" % self) diff --git a/pyensembl/species.py b/pyensembl/species.py index bc05890..c19f359 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -100,8 +100,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}): for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" - % i + "Ensembl release %d already has an associated genome" % i ) self._release_to_genome[i] = genome_name @@ -114,13 +113,10 @@ def which_reference(self, ensembl_release): return self._release_to_genome[ensembl_release] def __str__(self): - return ( - "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" - % ( - self.latin_name, - self.synonyms, - self.reference_assemblies, - ) + return "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % ( + self.latin_name, + self.synonyms, + self.reference_assemblies, ) def __eq__(self, other): @@ -312,7 +308,7 @@ def check_species_object(species_name_or_object): latin_name="danio_rerio", synonyms=["zebrafish"], reference_assemblies={ - "ZFISH7": (47, 53), + # "ZFISH7": (47, 53), "Zv8": (54, 59), "Zv9": (60, 79), "GRCz10": (80, 91), @@ -336,8 +332,8 @@ def check_species_object(species_name_or_object): latin_name="caenorhabditis_elegans", synonyms=["nematode", "C_elegans"], reference_assemblies={ - "WS180": (47, 49), - "WS190": (50, 54), + # "WS180": (47, 49), + # "WS190": (50, 54), "WS200": (55, 57), "WS210": (58, 78), "WS220": (79, 66), @@ -349,6 +345,6 @@ def check_species_object(species_name_or_object): latin_name="saccharomyces_cerevisiae", synonyms=["yeast", "budding_yeast"], reference_assemblies={ - "R64-1-1": (76, MAX_ENSEMBL_RELEASE), + "R64-1-1": (75, MAX_ENSEMBL_RELEASE), }, ) diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py index 9d30c5c..012a152 100644 --- a/pyensembl/transcript.py +++ b/pyensembl/transcript.py @@ -24,18 +24,20 @@ class Transcript(LocusWithGenome): and not using the sequence, avoid the memory/performance overhead of fetching and storing sequences from a FASTA file. 
""" + def __init__( - self, - transcript_id, - transcript_name, - contig, - start, - end, - strand, - biotype, - gene_id, - genome, - support_level=None): + self, + transcript_id, + transcript_name, + contig, + start, + end, + strand, + biotype, + gene_id, + genome, + support_level=None, + ): LocusWithGenome.__init__( self, contig=contig, @@ -43,7 +45,8 @@ def __init__( end=end, strand=strand, biotype=biotype, - genome=genome) + genome=genome, + ) self.transcript_id = transcript_id self.transcript_name = transcript_name self.gene_id = gene_id @@ -71,16 +74,18 @@ def __str__(self): " biotype='%s'," " contig='%s'," " start=%d," - " end=%d, strand='%s', genome='%s')") % ( - self.transcript_id, - self.name, - self.gene_id, - self.biotype, - self.contig, - self.start, - self.end, - self.strand, - self.genome.reference_name) + " end=%d, strand='%s', genome='%s')" + ) % ( + self.transcript_id, + self.name, + self.gene_id, + self.biotype, + self.contig, + self.start, + self.end, + self.strand, + self.genome.reference_name, + ) def __len__(self): """ @@ -90,9 +95,10 @@ def __len__(self): def __eq__(self, other): return ( - other.__class__ is Transcript and - self.id == other.id and - self.genome == other.genome) + other.__class__ is Transcript + and self.id == other.id + and self.genome == other.genome + ) def __hash__(self): return hash(self.id) @@ -120,10 +126,8 @@ def exons(self): # in each transcript columns = ["exon_number", "exon_id"] exon_numbers_and_ids = self.db.query( - columns, - filter_column="transcript_id", - filter_value=self.id, - feature="exon") + columns, filter_column="transcript_id", filter_value=self.id, feature="exon" + ) # fill this list in its correct order (by exon_number) by using # the exon_number as a 1-based list offset @@ -133,15 +137,16 @@ def exons(self): exon = self.genome.exon_by_id(exon_id) if exon is None: raise ValueError( - "Missing exon %s for transcript %s" % ( - exon_number, self.id)) + "Missing exon %s for transcript %s" % (exon_number, self.id) + ) exon_number = int(exon_number) if exon_number < 1: raise ValueError("Invalid exon number: %s" % exon_number) elif exon_number > len(exons): raise ValueError( - "Invalid exon number: %s (max expected = %d)" % ( - exon_number, len(exons))) + "Invalid exon number: %s (max expected = %d)" + % (exon_number, len(exons)) + ) # exon_number is 1-based, convert to list index by subtracting 1 exon_idx = exon_number - 1 @@ -164,12 +169,13 @@ def _transcript_feature_position_ranges(self, feature, required=True): select_column_names=["start", "end"], filter_column="transcript_id", filter_value=self.id, - feature=feature) + feature=feature, + ) if required and len(results) == 0: raise ValueError( - "Transcript %s does not contain feature %s" % ( - self.id, feature)) + "Transcript %s does not contain feature %s" % (self.id, feature) + ) return results @memoize @@ -177,20 +183,20 @@ def _transcript_feature_positions(self, feature): """ Get unique positions for feature, raise an error if feature is absent. """ - ranges = self._transcript_feature_position_ranges( - feature, required=True) + ranges = self._transcript_feature_position_ranges(feature, required=True) results = [] # a feature (such as a stop codon), maybe be split over multiple # contiguous ranges. Collect all the nucleotide positions into a # single list. 
- for (start, end) in ranges: + for start, end in ranges: # since ranges are [inclusive, inclusive] and # Python ranges are [inclusive, exclusive) we have to increment # the end position for position in range(start, end + 1): if position in results: raise ValueError( - "Repeated position %d for %s" % (position, feature)) + "Repeated position %d for %s" % (position, feature) + ) results.append(position) return results @@ -207,10 +213,9 @@ def _codon_positions(self, feature): results = self._transcript_feature_positions(feature) if len(results) != 3: raise ValueError( - "Expected 3 positions for %s of %s but got %d" % ( - feature, - self.id, - len(results))) + "Expected 3 positions for %s of %s but got %d" + % (feature, self.id, len(results)) + ) return results @memoized_property @@ -219,7 +224,8 @@ def contains_start_codon(self): Does this transcript have an annotated start_codon entry? """ start_codons = self._transcript_feature_position_ranges( - "start_codon", required=False) + "start_codon", required=False + ) return len(start_codons) > 0 @memoized_property @@ -228,9 +234,10 @@ def contains_stop_codon(self): Does this transcript have an annotated stop_codon entry? """ stop_codons = self._transcript_feature_position_ranges( - "stop_codon", required=False) + "stop_codon", required=False + ) return len(stop_codons) > 0 - + @memoized_property def start_codon_complete(self): """ @@ -266,9 +273,10 @@ def exon_intervals(self): select_column_names=["exon_number", "start", "end"], filter_column="transcript_id", filter_value=self.id, - feature="exon") + feature="exon", + ) sorted_intervals = [None] * len(results) - for (exon_number, start, end) in results: + for exon_number, start, end in results: sorted_intervals[int(exon_number) - 1] = (start, end) return sorted_intervals @@ -281,15 +289,15 @@ def spliced_offset(self, position): """ if type(position) is not int: raise TypeError( - "Position argument must be an integer, got %s : %s" % ( - position, type(position))) + "Position argument must be an integer, got %s : %s" + % (position, type(position)) + ) if position < self.start or position > self.end: raise ValueError( - "Invalid position: %d (must be between %d and %d)" % ( - position, - self.start, - self.end)) + "Invalid position: %d (must be between %d and %d)" + % (position, self.start, self.end) + ) # offset from beginning of unspliced transcript (including introns) unspliced_offset = self.offset(position) @@ -306,7 +314,8 @@ def spliced_offset(self, position): # Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii... for exon in self.exons: exon_unspliced_start, exon_unspliced_end = self.offset_range( - exon.start, exon.end) + exon.start, exon.end + ) # If the relative position is not within this exon, keep a running # total of the total exonic length-so-far. # @@ -323,8 +332,8 @@ def spliced_offset(self, position): exon_length = len(exon) # exon_end_position - exon_start_position + 1 total_spliced_offset += exon_length raise ValueError( - "Couldn't find position %d on any exon of %s" % ( - position, self.id)) + "Couldn't find position %d on any exon of %s" % (position, self.id) + ) @memoized_property def start_codon_unspliced_offsets(self): @@ -332,11 +341,7 @@ def start_codon_unspliced_offsets(self): Offsets from start of unspliced pre-mRNA transcript of nucleotides in start codon. 
""" - return [ - self.offset(position) - for position - in self.start_codon_positions - ] + return [self.offset(position) for position in self.start_codon_positions] @memoized_property def stop_codon_unspliced_offsets(self): @@ -344,11 +349,7 @@ def stop_codon_unspliced_offsets(self): Offsets from start of unspliced pre-mRNA transcript of nucleotides in stop codon. """ - return [ - self.offset(position) - for position - in self.stop_codon_positions - ] + return [self.offset(position) for position in self.stop_codon_positions] def _contiguous_offsets(self, offsets): """ @@ -358,8 +359,7 @@ def _contiguous_offsets(self, offsets): offsets.sort() for i in range(len(offsets) - 1): if offsets[i] + 1 != offsets[i + 1]: - raise ValueError( - "Offsets not contiguous: %s" % (offsets,)) + raise ValueError("Offsets not contiguous: %s" % (offsets,)) return offsets @memoized_property @@ -369,9 +369,7 @@ def start_codon_spliced_offsets(self): of nucleotides in start codon. """ offsets = [ - self.spliced_offset(position) - for position - in self.start_codon_positions + self.spliced_offset(position) for position in self.start_codon_positions ] return self._contiguous_offsets(offsets) @@ -382,9 +380,7 @@ def stop_codon_spliced_offsets(self): of nucleotides in stop codon. """ offsets = [ - self.spliced_offset(position) - for position - in self.stop_codon_positions + self.spliced_offset(position) for position in self.stop_codon_positions ] return self._contiguous_offsets(offsets) @@ -403,11 +399,11 @@ def complete(self): a coding sequence whose length is divisible by 3 """ return ( - self.contains_start_codon and - self.start_codon_complete and - self.contains_stop_codon and - self.coding_sequence is not None and - len(self.coding_sequence) % 3 == 0 + self.contains_start_codon + and self.start_codon_complete + and self.contains_stop_codon + and self.coding_sequence is not None + and len(self.coding_sequence) % 3 == 0 ) @memoized_property @@ -459,7 +455,7 @@ def coding_sequence(self): # pylint: disable=invalid-slice-index # TODO(tavi) Figure out pylint is not happy with this slice - return self.sequence[start:end + 1] + return self.sequence[start : end + 1] @memoized_property def five_prime_utr_sequence(self): @@ -469,7 +465,7 @@ def five_prime_utr_sequence(self): """ # pylint: disable=invalid-slice-index # TODO(tavi) Figure out pylint is not happy with this slice - return self.sequence[:self.first_start_codon_spliced_offset] + return self.sequence[: self.first_start_codon_spliced_offset] @memoized_property def three_prime_utr_sequence(self): @@ -477,7 +473,7 @@ def three_prime_utr_sequence(self): cDNA sequence of 3' UTR (untranslated region at the end of the transcript) """ - return self.sequence[self.last_stop_codon_spliced_offset + 1:] + return self.sequence[self.last_stop_codon_spliced_offset + 1 :] @memoized_property def protein_id(self): @@ -487,7 +483,8 @@ def protein_id(self): filter_value=self.id, feature="CDS", distinct=True, - required=False) + required=False, + ) if result_tuple: return result_tuple[0] else: diff --git a/pyensembl/version.py b/pyensembl/version.py index 73b4b05..519574c 100644 --- a/pyensembl/version.py +++ b/pyensembl/version.py @@ -1 +1 @@ -__version__ = "2.2.9" +__version__ = "2.2.10" diff --git a/test/common.py b/test/common.py index 9b20c3b..094b6a2 100644 --- a/test/common.py +++ b/test/common.py @@ -14,6 +14,7 @@ contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"] + @nottest def test_ensembl_releases(*versions): """ @@ -33,7 +34,9 @@ def decorator(test_fn): def 
new_test_fn(): for ensembl in ensembl_releases: test_fn(ensembl) + return new_test_fn + return decorator diff --git a/test/data.py b/test/data.py index 0b41369..60cd08a 100644 --- a/test/data.py +++ b/test/data.py @@ -21,25 +21,29 @@ def data_path(name): CTNNBIP1_004_transcript_id = "ENST00000377256" # coding sequence for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_CDS = "".join([ - "ATG", - "AACCGCGAGGGAGCTCCCGGGAAGAGTCCGGAG", - "GAGATGTACATTCAGCAGAAGGTCCGAGTGCTGCTCATGCTGCGGAAGATGGGATCAAAC", - "CTGACAGCCAGCGAGGAGGAGTTCCTGCGCACCTATGCAGGGGTGGTCAACAGCCAGCTC", - "AGCCAGCTGCCTCCGCACTCCATCGACCAGG", - "GTGCAGAGGACGTGGTGATGGCGTTTTCCAGGTCGGAGACGGAAGACCGGAGGCAG", - "TAG" -]) +CTNNBIP1_004_CDS = "".join( + [ + "ATG", + "AACCGCGAGGGAGCTCCCGGGAAGAGTCCGGAG", + "GAGATGTACATTCAGCAGAAGGTCCGAGTGCTGCTCATGCTGCGGAAGATGGGATCAAAC", + "CTGACAGCCAGCGAGGAGGAGTTCCTGCGCACCTATGCAGGGGTGGTCAACAGCCAGCTC", + "AGCCAGCTGCCTCCGCACTCCATCGACCAGG", + "GTGCAGAGGACGTGGTGATGGCGTTTTCCAGGTCGGAGACGGAAGACCGGAGGCAG", + "TAG", + ] +) # 5' UTR for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_UTR5 = "".join([ - "TGTGGGTGCAGGTTTCCTGGGCTTGCCAGACACACAGGGCGGCACCTTCCTACTTCTGCC", - "CAGCCACAGCCCTCCCCTCACAGTTGAGCACCTGTTTGCCTGAAGTTAATTTCCAGAAGC", - "AGGAGTCCCCAGAGCCAGGCAGGGGG"]) +CTNNBIP1_004_UTR5 = "".join( + [ + "TGTGGGTGCAGGTTTCCTGGGCTTGCCAGACACACAGGGCGGCACCTTCCTACTTCTGCC", + "CAGCCACAGCCCTCCCCTCACAGTTGAGCACCTGTTTGCCTGAAGTTAATTTCCAGAAGC", + "AGGAGTCCCCAGAGCCAGGCAGGGGG", + ] +) # 3' UTR for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_UTR3 = \ - "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC" +CTNNBIP1_004_UTR3 = "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC" CTNNBIP1_004_locus = Locus("1", 9850659, 9878176, "-") @@ -47,20 +51,14 @@ def data_path(name): # http://useast.ensembl.org/Homo_sapiens/Transcript/Exons?g=ENSG00000178585; # r=1:9850659-9878176;redirect=no;t=ENST00000377256 CTTNNIP1_004_exon_ids = [ - 'ENSE00001473268', - 'ENSE00001643659', - 'ENSE00001600669', - 'ENSE00001267940', - 'ENSE00001473265', + "ENSE00001473268", + "ENSE00001643659", + "ENSE00001600669", + "ENSE00001267940", + "ENSE00001473265", ] -CTTNNIP1_004_exon_lengths = [ - 37, - 85, - 120, - 91, - 118 -] +CTTNNIP1_004_exon_lengths = [37, 85, 120, 91, 118] # @@ -72,26 +70,28 @@ def data_path(name): EGFR_001_transcript_id = "ENST00000275493" EGFR_001_ccds_id = "CCDS5514" EGFR_001_protein_id = "ENSP00000275493" -EGFR_001_protein_sequence = "".join([ - "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV", - "QRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNL", - "QEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKL", - "TKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVN", - "PEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLS", - "INATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAF", - "ENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKI", - "ISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHP", - "ECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTG", - "PGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPN", - "QALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVD", - "NPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAA", - "RNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTF", - "GSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLV", - 
"IQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACI", - "DRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPS" - "RDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGS" - "TAENAEYLRVAPQSSEFIGA" -]) +EGFR_001_protein_sequence = "".join( + [ + "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV", + "QRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNL", + "QEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKL", + "TKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVN", + "PEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLS", + "INATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAF", + "ENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKI", + "ISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHP", + "ECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTG", + "PGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPN", + "QALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVD", + "NPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAA", + "RNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTF", + "GSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLV", + "IQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACI", + "DRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPS" + "RDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGS" + "TAENAEYLRVAPQSSEFIGA", + ] +) # GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/ @@ -114,13 +114,17 @@ def data_path(name): # http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167 MOUSE_ENSMUSG00000017167_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf") + "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf" +) MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.fa") + "mouse.ensembl.81.partial.ENSMUSG00000017167.fa" +) MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa") + "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa" +) MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.pep") + "mouse.ensembl.81.partial.ENSMUSG00000017167.pep" +) custom_mouse_genome_grcm38_subset = Genome( @@ -128,7 +132,8 @@ def data_path(name): annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], - protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]) + protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], +) def setup_init_custom_mouse_genome(): diff --git a/test/test_contigs.py b/test/test_contigs.py index 1101061..b4eb702 100644 --- a/test/test_contigs.py +++ b/test/test_contigs.py @@ -2,6 +2,7 @@ grch38 = genome_for_reference_name("GRCh38") + def test_contig_names(): contig_names = set(grch38.contigs()) for chrom in list(range(1, 23)) + ["X", "Y", "MT"]: diff --git a/test/test_download_cache.py b/test/test_download_cache.py index 03c7da6..2bf5913 100644 --- a/test/test_download_cache.py +++ b/test/test_download_cache.py @@ -1,9 +1,5 @@ from nose.tools import assert_raises, ok_ -from pyensembl.download_cache import ( - DownloadCache, - MissingLocalFile, - 
MissingRemoteFile -) +from pyensembl.download_cache import DownloadCache, MissingLocalFile, MissingRemoteFile import os import tempfile @@ -13,21 +9,27 @@ download_cache = DownloadCache( reference_name="__test_reference", annotation_name="__test_annotation", - copy_local_files_to_cache=False) + copy_local_files_to_cache=False, +) + def test_download_cache_missing_local_file(): # clear the cache download_cache.delete_cache_directory() with assert_raises(MissingLocalFile): download_cache.download_or_copy_if_necessary( - path_or_url="test_file_doesn_not_exist.file") + path_or_url="test_file_doesn_not_exist.file" + ) + def test_download_cache_missing_remote_file(): # clear the cache download_cache.delete_cache_directory() with assert_raises(MissingRemoteFile): download_cache.download_or_copy_if_necessary( - path_or_url="ftp://NOTAURL.NOTAURL.NOTAURL") + path_or_url="ftp://NOTAURL.NOTAURL.NOTAURL" + ) + def test_download_cache_custom_location(): test_file = "refseq.ucsc.small.gtf" @@ -36,29 +38,27 @@ def test_download_cache_custom_location(): print("DIR: %s" % tmp_dir) assert tmp_dir is not None - os.environ['PYENSEMBL_CACHE_DIR'] = tmp_dir + os.environ["PYENSEMBL_CACHE_DIR"] = tmp_dir # We need another instance of DownloadCache # that copies files over to cache folder download_cache = DownloadCache( reference_name="test_reference", annotation_name="test_annotation", - copy_local_files_to_cache=True) + copy_local_files_to_cache=True, + ) # clean up download_cache.delete_cache_directory() download_cache.download_or_copy_if_necessary( - download_if_missing=True, - path_or_url=data_path(test_file)) + download_if_missing=True, path_or_url=data_path(test_file) + ) full_path = os.path.join( - tmp_dir, - "pyensembl", - "test_reference", - "test_annotation", - test_file) + tmp_dir, "pyensembl", "test_reference", "test_annotation", test_file + ) print("FULL PATH: %s" % full_path) assert len(full_path) > 0 ok_(os.path.exists(full_path)) - del os.environ['PYENSEMBL_CACHE_DIR'] + del os.environ["PYENSEMBL_CACHE_DIR"] diff --git a/test/test_ensembl_gtf.py b/test/test_ensembl_gtf.py index 040023b..c22cf74 100644 --- a/test/test_ensembl_gtf.py +++ b/test/test_ensembl_gtf.py @@ -4,6 +4,7 @@ from .common import test_ensembl_releases + @test_ensembl_releases() def gtf_path_endswith_gtf_gz(ensembl): path = ensembl.gtf.gtf_path diff --git a/test/test_ensembl_object_properties.py b/test/test_ensembl_object_properties.py index ff90dcf..b3c4582 100644 --- a/test/test_ensembl_object_properties.py +++ b/test/test_ensembl_object_properties.py @@ -8,6 +8,7 @@ from nose.tools import eq_ from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE + def test_human_reference_name(): eq_(EnsemblRelease(release=54).reference_name, "NCBI36") eq_(EnsemblRelease(release=74).reference_name, "GRCh37") diff --git a/test/test_exon_id.py b/test/test_exon_id.py index ceb145f..18590f8 100644 --- a/test/test_exon_id.py +++ b/test/test_exon_id.py @@ -10,71 +10,104 @@ # all exons associated with TP53 gene in Ensembl release 77 TP53_EXON_IDS_RELEASE_77 = [ - 'ENSE00002337729', 'ENSE00002419584', - 'ENSE00003625790', 'ENSE00003518480', - 'ENSE00003723991', 'ENSE00003712342', - 'ENSE00001657961', 'ENSE00003725258', - 'ENSE00003740946', 'ENSE00002204316', - 'ENSE00002064269', 'ENSE00003750554', - 'ENSE00003634848', 'ENSE00003492844', - 'ENSE00003735852', 'ENSE00003545950', - 'ENSE00003605891', 'ENSE00002051192', - 'ENSE00002084733', 'ENSE00003726882', - 'ENSE00001146308', 'ENSE00002667911', - 'ENSE00003752869', 'ENSE00003739898', - 
'ENSE00003753508', 'ENSE00002034209', - 'ENSE00002030826', 'ENSE00001596491', - 'ENSE00002037735', 'ENSE00003736616', - 'ENSE00002672443', 'ENSE00002226620', - 'ENSE00003715195', 'ENSE00003750794', - 'ENSE00003745267', 'ENSE00003746220', - 'ENSE00003656695', 'ENSE00003669712', - 'ENSE00002051873', 'ENSE00002048269', - 'ENSE00002670535', 'ENSE00002677565', - 'ENSE00003532881', 'ENSE00003520683', - 'ENSE00002076714', 'ENSE00002062958', - 'ENSE00002073243', 'ENSE00003670707', - 'ENSE00002065802', 'ENSE00002362269' + "ENSE00002337729", + "ENSE00002419584", + "ENSE00003625790", + "ENSE00003518480", + "ENSE00003723991", + "ENSE00003712342", + "ENSE00001657961", + "ENSE00003725258", + "ENSE00003740946", + "ENSE00002204316", + "ENSE00002064269", + "ENSE00003750554", + "ENSE00003634848", + "ENSE00003492844", + "ENSE00003735852", + "ENSE00003545950", + "ENSE00003605891", + "ENSE00002051192", + "ENSE00002084733", + "ENSE00003726882", + "ENSE00001146308", + "ENSE00002667911", + "ENSE00003752869", + "ENSE00003739898", + "ENSE00003753508", + "ENSE00002034209", + "ENSE00002030826", + "ENSE00001596491", + "ENSE00002037735", + "ENSE00003736616", + "ENSE00002672443", + "ENSE00002226620", + "ENSE00003715195", + "ENSE00003750794", + "ENSE00003745267", + "ENSE00003746220", + "ENSE00003656695", + "ENSE00003669712", + "ENSE00002051873", + "ENSE00002048269", + "ENSE00002670535", + "ENSE00002677565", + "ENSE00003532881", + "ENSE00003520683", + "ENSE00002076714", + "ENSE00002062958", + "ENSE00002073243", + "ENSE00003670707", + "ENSE00002065802", + "ENSE00002362269", ] + def test_exon_ids_of_gene_id(): """ test_exon_ids_of_gene_id: Ensure that gene_id ENSG00000141510 (name=TP53), has all the same exon IDs found on the Ensembl website. """ - exon_ids = ensembl.exon_ids_of_gene_id('ENSG00000141510') - assert len(exon_ids) == len(TP53_EXON_IDS_RELEASE_77), \ - "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( - len(TP53_EXON_IDS_RELEASE_77), - len(exon_ids), - len(set(exon_ids))) + exon_ids = ensembl.exon_ids_of_gene_id("ENSG00000141510") + assert len(exon_ids) == len( + TP53_EXON_IDS_RELEASE_77 + ), "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( + len(TP53_EXON_IDS_RELEASE_77), + len(exon_ids), + len(set(exon_ids)), + ) assert all(exon_id in TP53_EXON_IDS_RELEASE_77 for exon_id in exon_ids) + def test_exon_ids_of_gene_name(): """ test_exon_ids_of_gene_name: Ensure that TP53 has the same exon IDs found on the Ensembl website. 
""" exon_ids = ensembl.exon_ids_of_gene_name("TP53") - assert len(exon_ids) == len(TP53_EXON_IDS_RELEASE_77), \ - "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( - len(TP53_EXON_IDS_RELEASE_77), - len(exon_ids), - len(set(exon_ids))) + assert len(exon_ids) == len( + TP53_EXON_IDS_RELEASE_77 + ), "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( + len(TP53_EXON_IDS_RELEASE_77), + len(exon_ids), + len(set(exon_ids)), + ) assert all(exon_id in TP53_EXON_IDS_RELEASE_77 for exon_id in exon_ids) + # Exon IDs of transcript TP53-026 TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 = [ - 'ENSE00002064269', - 'ENSE00003723991', - 'ENSE00003712342', - 'ENSE00003725258', - 'ENSE00003740946', - 'ENSE00003750554', - 'ENSE00003634848', - 'ENSE00003492844' + "ENSE00002064269", + "ENSE00003723991", + "ENSE00003712342", + "ENSE00003725258", + "ENSE00003740946", + "ENSE00003750554", + "ENSE00003634848", + "ENSE00003492844", ] + def test_exon_ids_of_transcript_name(): """ test_exon_ids_of_transcript_name : Look up exon IDs of transcript TP53-026 @@ -82,13 +115,16 @@ def test_exon_ids_of_transcript_name(): for release 77 """ exon_ids = ensembl.exon_ids_of_transcript_name("TP53-026") - assert len(exon_ids) == len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), \ - "Expected %d exons, got %d" % ( - len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), - len(exon_ids)) + assert len(exon_ids) == len( + TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 + ), "Expected %d exons, got %d" % ( + len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), + len(exon_ids), + ) assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - for exon_id in exon_ids) + exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 for exon_id in exon_ids + ) + def exon_ids_of_transcript_id(): """ @@ -97,10 +133,12 @@ def exon_ids_of_transcript_id(): what we find on the Ensembl website. """ exon_ids = ensembl.exon_ids_of_transcript_id("ENST00000610623") - assert len(exon_ids) == len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), \ - "Expected %d exons, got %d" % ( - len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), - len(exon_ids)) + assert len(exon_ids) == len( + TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 + ), "Expected %d exons, got %d" % ( + len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), + len(exon_ids), + ) assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - for exon_id in exon_ids) + exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 for exon_id in exon_ids + ) diff --git a/test/test_exon_object.py b/test/test_exon_object.py index 4587284..9a77cde 100644 --- a/test/test_exon_object.py +++ b/test/test_exon_object.py @@ -9,14 +9,14 @@ ensembl = cached_release(77) + def test_exon_object_by_id(): """ test_exon_object_by_id : check properties of exon 4 of CTNNB1 when looked up by ID in Ensembl 77. """ exon = ensembl.exon_by_id("ENSE00003464041") - assert exon.gene_name == "CTNNB1", \ - "Unexpected gene name: %s" % exon.gene_name + assert exon.gene_name == "CTNNB1", "Unexpected gene name: %s" % exon.gene_name assert exon.contig == "3", exon.contig assert exon.strand == "+" assert exon.on_forward_strand @@ -25,14 +25,14 @@ def test_exon_object_by_id(): assert exon.end == 41224753, "Unexpected exon end: %s" % exon.end assert exon.length == len(exon) == 228 + def test_exon_object_by_id_on_negative_strand(): """ test_exon_object_by_id_on_negative_strand : check properties of exon 1 from CXCR3 when looked up by ID in Ensembl 77. 
""" exon = ensembl.exon_by_id("ENSE00001817013") - assert exon.gene_name == "CXCR3", \ - "Unexpected gene name: %s" % exon.gene_name + assert exon.gene_name == "CXCR3", "Unexpected gene name: %s" % exon.gene_name assert exon.contig == "X", exon.contig assert exon.strand == "-" assert exon.on_backward_strand @@ -57,6 +57,7 @@ def test_exon_object_at_locus(): assert exon.start <= 41224526, "Unexpected exon start: %s" % exon.start assert exon.end >= 41224526, "Unexpected exon end: %s" % exon.end + def test_exon_object_at_locus_on_negative_strand(): """ test_exon_object_at_locus : check properties of exon 1 of CXCR3 when looked @@ -72,6 +73,7 @@ def test_exon_object_at_locus_on_negative_strand(): assert exon.start <= 71618517, "Unexpected exon start: %s" % exon.start assert exon.end >= 71618517, "Unexpected exon end: %s" % exon.end + def test_exon_basic_properties_str(): exon = ensembl.exon_by_id("ENSE00001817013") assert isinstance(str(exon), str) @@ -81,11 +83,14 @@ def test_exon_basic_properties_str(): # change this test assert str(exon) == repr(exon), "%s != %s" % (str(exon), repr(exon)) + def test_exon_basic_properties_hash(): exon = ensembl.exon_by_id("ENSE00001817013") - assert isinstance(hash(exon), int), \ - "Hash function returns %s instead of int" % ( - type(hash(exon),)) + assert isinstance(hash(exon), int), "Hash function returns %s instead of int" % ( + type( + hash(exon), + ) + ) assert hash(exon) == hash(exon), "Hash function is non-deterministic!" other_exon = ensembl.exon_by_id("ENSE00003464041") assert exon != other_exon diff --git a/test/test_gene_ids.py b/test/test_gene_ids.py index b121b8b..3f1420e 100644 --- a/test/test_gene_ids.py +++ b/test/test_gene_ids.py @@ -13,6 +13,7 @@ ensembl77 = cached_release(77, "human") + def test_gene_ids_grch38_hla_a(): # chr6:29,945,884 is a position for HLA-A # Gene ID = ENSG00000206503 @@ -21,40 +22,55 @@ def test_gene_ids_grch38_hla_a(): # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884 ids = ensembl_grch38.gene_ids_at_locus(6, 29945884) expected = "ENSG00000206503" - assert ids == ["ENSG00000206503"], \ - "Expected HLA-A, gene ID = %s, got: %s" % (expected, ids) + assert ids == ["ENSG00000206503"], "Expected HLA-A, gene ID = %s, got: %s" % ( + expected, + ids, + ) + def test_gene_ids_of_gene_name_hla_grch38(): hla_a_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A") - assert 'ENSG00000206503' in hla_a_gene_ids, hla_a_gene_ids + assert "ENSG00000206503" in hla_a_gene_ids, hla_a_gene_ids hla_b_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-B") - assert 'ENSG00000234745' in hla_b_gene_ids, hla_b_gene_ids + assert "ENSG00000234745" in hla_b_gene_ids, hla_b_gene_ids hla_c_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-C") - assert 'ENSG00000204525' in hla_c_gene_ids, hla_c_gene_ids + assert "ENSG00000204525" in hla_c_gene_ids, hla_c_gene_ids + def test_gene_id_of_protein_id_release77(): gene_id = ensembl77.gene_id_of_protein_id("ENSP00000485677") - ok_('ENSG00000279634', gene_id) + ok_("ENSG00000279634", gene_id) + def test_gene_id_of_invalid_name(): with assert_raises(Exception): - ensembl_grch38.gene_ids_of_gene_name( - "A wonderous pony sees through your soul") + ensembl_grch38.gene_ids_of_gene_name("A wonderous pony sees through your soul") + @test_ensembl_releases() def test_gene_ids_on_contig(ensembl): gene_ids_chr17 = ensembl.gene_ids(contig=17) # gene ID of TP53 tp53 = "ENSG00000141510" - assert tp53 in gene_ids_chr17, \ - "Missing %s from %s on chr17, example IDs: %s (total = %d)" % ( - tp53, ensembl, 
gene_ids_chr17[:5], len(gene_ids_chr17)) + assert ( + tp53 in gene_ids_chr17 + ), "Missing %s from %s on chr17, example IDs: %s (total = %d)" % ( + tp53, + ensembl, + gene_ids_chr17[:5], + len(gene_ids_chr17), + ) # gene ID of SMAD4 gene_ids_chr18 = ensembl.gene_ids(contig=18) smad4 = "ENSG00000141646" - assert smad4 in gene_ids_chr18, \ - "Missing %s from %s on chr18, example result: %s (total = %d)" % ( - smad4, ensembl, gene_ids_chr18[:5], len(gene_ids_chr18)) + assert ( + smad4 in gene_ids_chr18 + ), "Missing %s from %s on chr18, example result: %s (total = %d)" % ( + smad4, + ensembl, + gene_ids_chr18[:5], + len(gene_ids_chr18), + ) diff --git a/test/test_gene_names.py b/test/test_gene_names.py index e6c839c..626537b 100644 --- a/test/test_gene_names.py +++ b/test/test_gene_names.py @@ -17,6 +17,7 @@ "HLA-A", ] + @test_ensembl_releases() def test_all_gene_names(ensembl): """ @@ -26,8 +27,11 @@ def test_all_gene_names(ensembl): gene_names = ensembl.gene_names() print(type(gene_names)) for gene_name in KNOWN_GENE_NAMES: - assert gene_name in gene_names, \ - "Missing gene name %s from %s" % (gene_name, ensembl) + assert gene_name in gene_names, "Missing gene name %s from %s" % ( + gene_name, + ensembl, + ) + def test_gene_names_at_locus_grch38_hla_a(): # chr6:29,945,884 is a position for HLA-A @@ -37,25 +41,31 @@ def test_gene_names_at_locus_grch38_hla_a(): names = grch38.gene_names_at_locus(6, 29945884) assert names == ["HLA-A"], "Expected gene name HLA-A, got: %s" % (names,) + @test_ensembl_releases() def test_gene_names_on_contig(ensembl): gene_names_chr17 = ensembl.gene_names(17) - assert "TP53" in gene_names_chr17, \ - "No TP53 in gene names on chr17 of %s, gene names: %s ... (%d)" % ( - ensembl, list(gene_names_chr17[:4]), len(gene_names_chr17)) + assert ( + "TP53" in gene_names_chr17 + ), "No TP53 in gene names on chr17 of %s, gene names: %s ... (%d)" % ( + ensembl, + list(gene_names_chr17[:4]), + len(gene_names_chr17), + ) gene_names_chr18 = ensembl.gene_names(18) - assert "SMAD4" in gene_names_chr18, \ - "No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)" % ( - ensembl, list(gene_names_chr18[:4]), len(gene_names_chr18)) + assert ( + "SMAD4" in gene_names_chr18 + ), "No SMAD4 in gene names on chr18 of %s, gene names: %s ... 
(%d)" % ( + ensembl, + list(gene_names_chr18[:4]), + len(gene_names_chr18), + ) def test_gene_name_of_HLA_gene_id(): gene_ids = grch38.gene_ids_of_gene_name("HLA-A") - gene_names = [ - grch38.gene_name_of_gene_id(gene_id) - for gene_id in gene_ids - ] + gene_names = [grch38.gene_name_of_gene_id(gene_id) for gene_id in gene_ids] unique_gene_names = list(set(gene_names)) assert len(unique_gene_names) == 1, (len(unique_gene_names), unique_gene_names) gene_name = unique_gene_names[0] diff --git a/test/test_gene_objects.py b/test/test_gene_objects.py index 2258f43..63fe006 100644 --- a/test/test_gene_objects.py +++ b/test/test_gene_objects.py @@ -3,27 +3,35 @@ from .common import test_ensembl_releases from .data import TP53_gene_id + @test_ensembl_releases() def test_TP53_gene_object_by_id(genome): # when we look up TP53 by its gene ID, we should get the # correct gene back gene = genome.gene_by_id(TP53_gene_id) - assert gene.name == "TP53", \ - "Incorrect gene name %s for gene ID %s in %s" % ( - gene.name, gene.id, genome) - assert gene.contig == "17", \ - "Incorrect gene contig %s for gene ID %s in %s" % ( - gene.contig, gene.id, genome) + assert gene.name == "TP53", "Incorrect gene name %s for gene ID %s in %s" % ( + gene.name, + gene.id, + genome, + ) + assert gene.contig == "17", "Incorrect gene contig %s for gene ID %s in %s" % ( + gene.contig, + gene.id, + genome, + ) + @test_ensembl_releases() def test_TP53_gene_object_by_name(genome): genes = genome.genes_by_name("TP53") # we should only have one TP53 gene (there aren't any copies) - assert len(genes) == 1, \ - "Expected only one gene with name TP53, got %s" % (genes,) + assert len(genes) == 1, "Expected only one gene with name TP53, got %s" % (genes,) # make sure it has the correct gene ID - assert genes[0].id == TP53_gene_id, \ - "Expected gene to have ID %s, got %s" % (TP53_gene_id, genes[0].id) + assert genes[0].id == TP53_gene_id, "Expected gene to have ID %s, got %s" % ( + TP53_gene_id, + genes[0].id, + ) + @test_ensembl_releases() def test_equal_genes(genome): @@ -34,6 +42,7 @@ def test_equal_genes(genome): assert hash(gene1) == hash(gene2) assert gene1 == gene2 + @test_ensembl_releases() def test_not_equal_genes(genome): gene1 = genome.genes_by_name("MUC1")[0] @@ -41,6 +50,7 @@ def test_not_equal_genes(genome): assert hash(gene1) != hash(gene2) assert gene1 != gene2 + @test_ensembl_releases() def test_BRCA1_protein_coding_biotype(genome): gene = genome.genes_by_name("BRCA1")[0] diff --git a/test/test_id_length.py b/test/test_id_length.py index 7371cd4..cc61869 100644 --- a/test/test_id_length.py +++ b/test/test_id_length.py @@ -2,6 +2,7 @@ from nose.tools import nottest + @nottest def check_id_length(method_name): for release in major_releases: @@ -9,16 +10,19 @@ def check_id_length(method_name): # only load chromosome Y to speed up tests idents = method(contig="Y") assert len(idents) > 0, "No values returned by %s" % method_name - assert all(len(ident) == 15 for ident in idents), \ - "Invalid IDs for %s: %s" % ( - method_name, - [ident for ident in idents if len(ident) != 15]) + assert all(len(ident) == 15 for ident in idents), "Invalid IDs for %s: %s" % ( + method_name, + [ident for ident in idents if len(ident) != 15], + ) + def test_gene_id_length(): - check_id_length('gene_ids') + check_id_length("gene_ids") + def test_transcript_id_length(): - check_id_length('transcript_ids') + check_id_length("transcript_ids") + def test_protein_id_length(): - check_id_length('protein_ids') + check_id_length("protein_ids") diff --git 
a/test/test_locus.py b/test/test_locus.py index a1af6fd..475a018 100644 --- a/test/test_locus.py +++ b/test/test_locus.py @@ -3,6 +3,7 @@ from nose.tools import assert_raises + def test_normalize_chromosome(): assert normalize_chromosome("X") == "X" assert normalize_chromosome("chrX") == "chrX" @@ -38,6 +39,7 @@ def test_normalize_chromosome(): with assert_raises(ValueError): normalize_chromosome(0) + def test_locus_overlaps(): locus = Locus("1", 10, 20, "+") assert locus.overlaps("1", 10, 20, "+") @@ -57,6 +59,7 @@ def test_locus_overlaps(): # wrong strand assert not locus.overlaps("1", 10, 20, "-") + def test_locus_contains(): locus = Locus("1", 10, 20, "+") assert locus.contains("1", 10, 20, "+") @@ -82,6 +85,7 @@ def test_locus_contains(): # wrong strand assert not locus.contains("1", 10, 20, "-") + def test_position_offset(): forward_locus = Locus("1", 10, 20, "+") assert forward_locus.offset(10) == 0 @@ -143,6 +147,7 @@ def test_range_offset(): with assert_raises(ValueError): negative_locus.offset_range(9, 10) + def test_locus_distance(): locus_chr1_10_20_pos = Locus("1", 10, 20, "+") locus_chr1_21_25_pos = Locus("1", 21, 25, "+") diff --git a/test/test_missing_genome_sources.py b/test/test_missing_genome_sources.py index 35a4f41..6069261 100644 --- a/test/test_missing_genome_sources.py +++ b/test/test_missing_genome_sources.py @@ -4,37 +4,39 @@ from .data import data_path MOUSE_ENSMUSG00000017167_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf") + "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf" +) MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.fa") + "mouse.ensembl.81.partial.ENSMUSG00000017167.fa" +) # MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path( # "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa") MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.pep") + "mouse.ensembl.81.partial.ENSMUSG00000017167.pep" +) + def no_gtf_(cm): - print("Testing for 'GTF' in %s : %s" % ( - type(cm.exception), - cm.exception)) + print("Testing for 'GTF' in %s : %s" % (type(cm.exception), cm.exception)) ok_("GTF" in str(cm.exception)) + def no_transcript_(cm): - print("Testing for 'transcript' in %s : %s" % ( - type(cm.exception), - cm.exception)) + print("Testing for 'transcript' in %s : %s" % (type(cm.exception), cm.exception)) ok_("transcript" in str(cm.exception)) + def no_protein_(cm): - print("Testing for 'protein' in %s : %s" % ( - type(cm.exception), - cm.exception)) + print("Testing for 'protein' in %s : %s" % (type(cm.exception), cm.exception)) ok_("protein" in str(cm.exception)) + def test_transcript_fasta_only(): genome = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", - transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH]) + transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], + ) genome.index() eq_(2, len(genome.transcript_sequences.fasta_dictionary)) @@ -59,11 +61,13 @@ def test_transcript_fasta_only(): genome.protein_sequence("test") no_protein_(cm) + def test_protein_fasta_only(): genome_only_proteins = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", - protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]) + protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], + ) genome_only_proteins.index() eq_(4, len(genome_only_proteins.protein_sequences.fasta_dictionary)) @@ -76,11 +80,13 
@@ def test_protein_fasta_only(): genome_only_proteins.transcript_sequence("DOES_NOT_EXIST") no_transcript_(cm) + def test_gtf_only(): genome_only_gtf = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", - gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH) + gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, + ) genome_only_gtf.index() eq_(1, len(genome_only_gtf.genes())) @@ -95,12 +101,14 @@ def test_gtf_only(): no_protein_(cm) + def test_gtf_transcript_only(): genome_gtf_with_cdna = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, - transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH]) + transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], + ) genome_gtf_with_cdna.index() eq_(1, len(genome_gtf_with_cdna.genes())) @@ -112,12 +120,14 @@ def test_gtf_transcript_only(): transcript.protein_sequence no_protein_(cm) + def test_gtf_protein_only(): genome_gtf_with_proteins = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, - protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]) + protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], + ) genome_gtf_with_proteins.index() eq_(1, len(genome_gtf_with_proteins.genes())) diff --git a/test/test_mouse.py b/test/test_mouse.py index 24a0b4a..5ec03b6 100644 --- a/test/test_mouse.py +++ b/test/test_mouse.py @@ -1,9 +1,7 @@ from nose.tools import eq_, with_setup -from .data import ( - custom_mouse_genome_grcm38_subset, - setup_init_custom_mouse_genome -) +from .data import custom_mouse_genome_grcm38_subset, setup_init_custom_mouse_genome + @with_setup(setup=setup_init_custom_mouse_genome) def test_mouse_ENSMUSG00000017167(): @@ -39,9 +37,17 @@ def test_mouse_ENSMUSG00000017167(): ] eq_(len(transcripts_coding_cntnap1), 1) transcript_cntnap1 = transcripts_coding_cntnap1[0] - eq_(transcript_cntnap1.sequence[:120], - ("GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" - "GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT")) - eq_(transcript_cntnap1.protein_sequence[:120], - ("MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS" - "GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH")) + eq_( + transcript_cntnap1.sequence[:120], + ( + "GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" + "GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT" + ), + ) + eq_( + transcript_cntnap1.protein_sequence[:120], + ( + "MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS" + "GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH" + ), + ) diff --git a/test/test_release_versions.py b/test/test_release_versions.py index 4eca06f..42761bd 100644 --- a/test/test_release_versions.py +++ b/test/test_release_versions.py @@ -7,29 +7,36 @@ def test_version_too_old_1(): EnsemblRelease(1) + @raises(Exception) def test_version_too_old_47(): EnsemblRelease(47) + @raises(Exception) def test_version_is_not_numeric(): EnsemblRelease("wuzzle") + @raises(Exception) def test_version_is_none(): EnsemblRelease(None) + def test_max_ensembl_release(): - assert isinstance(MAX_ENSEMBL_RELEASE, int), \ - "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % ( - type(MAX_ENSEMBL_RELEASE),) - assert 83 <= MAX_ENSEMBL_RELEASE < 1000, \ + assert isinstance( + MAX_ENSEMBL_RELEASE, int + ), "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % (type(MAX_ENSEMBL_RELEASE),) + 
assert 83 <= MAX_ENSEMBL_RELEASE < 1000, ( "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % MAX_ENSEMBL_RELEASE + ) + def test_int_version(): for version in range(54, MAX_ENSEMBL_RELEASE): EnsemblRelease(version) + def test_str_version(): for version in range(54, MAX_ENSEMBL_RELEASE): EnsemblRelease(str(version)) diff --git a/test/test_search.py b/test/test_search.py index b04688e..40930a4 100644 --- a/test/test_search.py +++ b/test/test_search.py @@ -3,6 +3,7 @@ from pyensembl import find_nearest_locus from .common import test_ensembl_releases + @test_ensembl_releases() def test_find_nearest_BRAF_exon(ensembl): braf = ensembl.genes_by_name("BRAF")[0] @@ -11,25 +12,23 @@ def test_find_nearest_BRAF_exon(ensembl): for exon in exons: # immediately before exon result_before = find_nearest_locus( - start=exon.start - 2, - end=exon.start - 1, - loci=exons) + start=exon.start - 2, end=exon.start - 1, loci=exons + ) eq_(result_before, (1, exon)) # overlapping with exon result_overlap = find_nearest_locus( - start=exon.start - 2, - end=exon.start + 1, - loci=exons) + start=exon.start - 2, end=exon.start + 1, loci=exons + ) eq_(result_overlap, (0, exon)) # immediately after exon result_after = find_nearest_locus( - start=exon.end + 1, - end=exon.end + 2, - loci=exons) + start=exon.end + 1, end=exon.end + 2, loci=exons + ) eq_(result_after, (1, exon)) + @test_ensembl_releases() def test_find_nearest_BRAF_transcript(ensembl): braf_transcript = ensembl.genes_by_name("BRAF")[0].transcripts[0] @@ -38,22 +37,19 @@ def test_find_nearest_BRAF_transcript(ensembl): for transcript in transcripts: # immediately before transcript result_before = find_nearest_locus( - start=transcript.start - 2, - end=transcript.start - 1, - loci=transcripts) + start=transcript.start - 2, end=transcript.start - 1, loci=transcripts + ) eq_(result_before, (1, transcript)) # overlapping with transcript result_overlap = find_nearest_locus( - start=transcript.start - 2, - end=transcript.start + 1, - loci=transcripts) + start=transcript.start - 2, end=transcript.start + 1, loci=transcripts + ) eq_(result_overlap, (0, transcript)) # immediately after transcript # may overlap with other transcripts result_after = find_nearest_locus( - start=transcript.end + 1, - end=transcript.end + 2, - loci=transcripts) + start=transcript.end + 1, end=transcript.end + 2, loci=transcripts + ) eq_(result_after, (1, transcript)) diff --git a/test/test_sequence_data.py b/test/test_sequence_data.py index 98fa7c4..1d8b7fd 100644 --- a/test/test_sequence_data.py +++ b/test/test_sequence_data.py @@ -13,16 +13,17 @@ FASTA_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.fa") + def test_sequence_type(): with TemporaryDirectory() as tmpdir: - seqs_dna = SequenceData( - [FASTA_PATH], - cache_directory_path=tmpdir) + seqs_dna = SequenceData([FASTA_PATH], cache_directory_path=tmpdir) seq = seqs_dna.get("ENSMUST00000138942") - assert seq is not None, \ - "Failed to find sequence for ENSMUST00000138942" - assert isinstance(seq, str), \ - "Wrong sequence type, expected %s but got %s" % (str, type(seq)) + assert seq is not None, "Failed to find sequence for ENSMUST00000138942" + assert isinstance(seq, str), "Wrong sequence type, expected %s but got %s" % ( + str, + type(seq), + ) + def test_missing_sequence(): with TemporaryDirectory() as tmpdir: @@ -30,24 +31,24 @@ def test_missing_sequence(): seq = seqs.get("NotInFasta") assert seq is None, "Should get None back for missing sequence" + def test_clear_cache(): with TemporaryDirectory() as tmpdir: seqs = 
SequenceData([FASTA_PATH], cache_directory_path=tmpdir) - assert not seqs._fasta_dictionary, \ - "Expected _fasta_dictionary to load lazily" + assert not seqs._fasta_dictionary, "Expected _fasta_dictionary to load lazily" seqs._load_or_create_fasta_dictionary_pickle() - assert len(seqs._fasta_dictionary) > 0, \ - "FASTA dictionary didn't get created" + assert len(seqs._fasta_dictionary) > 0, "FASTA dictionary didn't get created" seqs.clear_cache() - assert not seqs._fasta_dictionary, \ - "Expected FASTA dictionary to be empty after clear_cache()" + assert ( + not seqs._fasta_dictionary + ), "Expected FASTA dictionary to be empty after clear_cache()" for pickle_path in seqs.fasta_dictionary_pickle_paths: - assert not exists(pickle_path), \ - "Cached pickle file should have been deleted" + assert not exists( + pickle_path + ), "Cached pickle file should have been deleted" seqs._load_or_create_fasta_dictionary_pickle() for pickle_path in seqs.fasta_dictionary_pickle_paths: - assert exists(pickle_path), \ - "Cached pickle file should have been created" + assert exists(pickle_path), "Cached pickle file should have been created" diff --git a/test/test_serialization.py b/test/test_serialization.py index d90b6b7..40d2c9f 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -19,7 +19,7 @@ from .data import ( TP53_gene_id, custom_mouse_genome_grcm38_subset, - setup_init_custom_mouse_genome + setup_init_custom_mouse_genome, ) @@ -125,5 +125,6 @@ def test_species_to_pickle(): @test_ensembl_releases() def test_unique_memory_address_of_unpickled_genomes(ensembl_genome): unpickled = pickle.loads(pickle.dumps(ensembl_genome)) - assert ensembl_genome is unpickled, \ - "Expected same object for %s but got two different instances" % (unpickled,) + assert ( + ensembl_genome is unpickled + ), "Expected same object for %s but got two different instances" % (unpickled,) diff --git a/test/test_string_representation.py b/test/test_string_representation.py index e2d390c..80d6e48 100644 --- a/test/test_string_representation.py +++ b/test/test_string_representation.py @@ -11,16 +11,23 @@ def test_Locus_string_representation(): def test_Gene_string_representation(): gene = Gene( - gene_id="ENSG0001", gene_name="CAPITALISM", - biotype="protein_coding", contig="Y", start=1, end=5, strand="+", - genome=ensembl_grch37) + gene_id="ENSG0001", + gene_name="CAPITALISM", + biotype="protein_coding", + contig="Y", + start=1, + end=5, + strand="+", + genome=ensembl_grch37, + ) string_repr = str(gene) expected = ( "Gene(gene_id='ENSG0001'," " gene_name='CAPITALISM'," " biotype='protein_coding'," " contig='Y'," - " start=1, end=5, strand='+', genome='GRCh37')") + " start=1, end=5, strand='+', genome='GRCh37')" + ) eq_(string_repr, expected) @@ -34,7 +41,8 @@ def test_Transcript_string_representation(): start=1, end=5, strand="+", - genome=ensembl_grch37) + genome=ensembl_grch37, + ) expected = ( "Transcript(transcript_id='ENST0001'," @@ -57,7 +65,8 @@ def test_Exon_string_representation(): contig="Y", start=1, end=5, - strand="+") + strand="+", + ) expected = ( "Exon(exon_id='ENSE0001'," diff --git a/test/test_timings.py b/test/test_timings.py index a948886..b0fd8e1 100644 --- a/test/test_timings.py +++ b/test/test_timings.py @@ -5,17 +5,21 @@ ensembl = genome_for_reference_name("GRCh38") contigs = [str(i + 1) for i in range(22)] + ["X", "Y"] + def make_repeat_lookup_fn(lookup_fn, n_positions): """ Make a thunk which calls the lookup_fn at a number of loci for each human chromosome (excluding MT). 
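A note on the pattern above: because make_repeat_lookup_fn returns a zero-argument thunk, it can be handed directly to any timing utility. A minimal sketch using the standard library's timeit (the 5-position count is arbitrary, and the ensembl/contigs names are the module-level globals defined at the top of this test file):

    import timeit

    # zero-argument callable probing genes_at_locus at 5 positions per contig
    repeat_fn = make_repeat_lookup_fn(ensembl.genes_at_locus, n_positions=5)
    seconds = timeit.timeit(repeat_fn, number=1)
    print("genes_at_locus over %d loci: %0.3fs" % (5 * len(contigs), seconds))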
""" + def repeat_lookup_fn(): for contig in contigs: - for position in [10 ** 6 + i * 10 ** 6 for i in range(n_positions)]: + for position in [10**6 + i * 10**6 for i in range(n_positions)]: lookup_fn(contig, position) + return repeat_lookup_fn + def run_benchmark(lookup_fn, n_positions_per_contig=20, time_limit=60.0): """ Take a lookup functions (such as EnsemblRelease.genes_at_locus) and @@ -24,31 +28,38 @@ def run_benchmark(lookup_fn, n_positions_per_contig=20, time_limit=60.0): repeat_lookup_fn = make_repeat_lookup_fn(lookup_fn, n_positions_per_contig) n_loci = n_positions_per_contig * len(contigs) name = lookup_fn.__name__ - average_time = benchmark( - repeat_lookup_fn, - name="%s for %d loci" % (name, n_loci)) + average_time = benchmark(repeat_lookup_fn, name="%s for %d loci" % (name, n_loci)) print("-- %s : %0.4fs" % (name, average_time)) - assert average_time < time_limit, \ - "%s took too long for %s loci: %0.4fs" % (name, n_loci, average_time) + assert average_time < time_limit, "%s took too long for %s loci: %0.4fs" % ( + name, + n_loci, + average_time, + ) return average_time + def test_timing_genes_at_locus(): run_benchmark(ensembl.genes_at_locus) + def test_timing_transcripts_at_locus(): run_benchmark(ensembl.transcripts_at_locus) + def test_timing_exons_at_locus(): run_benchmark(ensembl.exons_at_locus) + def test_timing_transcript_sequences_at_locus(): def transcript_sequences_at_locus(contig, position): sequences = [] for transcript in ensembl.transcripts_at_locus(contig, position): sequences.append(transcript.sequence) return sequences + run_benchmark(transcript_sequences_at_locus) + def test_timing_transcript_coding_sequences_at_locus(): def transcript_coding_sequences_at_locus(contig, position): sequences = [] @@ -56,8 +67,10 @@ def transcript_coding_sequences_at_locus(contig, position): if transcript.sequence and transcript.complete: sequences.append(transcript.coding_sequence) return sequences + run_benchmark(transcript_coding_sequences_at_locus) + def run_all_benchmarks(): import types @@ -69,5 +82,6 @@ def run_all_benchmarks(): if isinstance(f, types.FunctionType): f() + if __name__ == "__main__": run_all_benchmarks() diff --git a/test/test_transcript_ids.py b/test/test_transcript_ids.py index 7868800..f1e910f 100644 --- a/test/test_transcript_ids.py +++ b/test/test_transcript_ids.py @@ -13,46 +13,51 @@ # subset of transcript IDs for HLA-A HLA_A_TRANSCRIPT_IDS = [ - 'ENST00000396634', - 'ENST00000376809', - 'ENST00000376806', - 'ENST00000376802', - 'ENST00000496081', - 'ENST00000495183', - 'ENST00000461903', - 'ENST00000479320', + "ENST00000396634", + "ENST00000376809", + "ENST00000376806", + "ENST00000376802", + "ENST00000496081", + "ENST00000495183", + "ENST00000461903", + "ENST00000479320", ] + def test_transcript_ids_ensembl_grch38_hla_a(): # chr6:29,945,884 is a position for HLA-A # based on: # http://useast.ensembl.org/Homo_sapiens/Gene/ # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884 - transcript_ids = grch38.transcript_ids_at_locus( - 6, 29941260, 29945884) + transcript_ids = grch38.transcript_ids_at_locus(6, 29941260, 29945884) for transcript_id in HLA_A_TRANSCRIPT_IDS: - assert transcript_id in transcript_ids, \ + assert transcript_id in transcript_ids, ( "Transcript %s of HLA-A not found overlapping locus" % transcript_id + ) + KNOWN_TRANSCRIPT_IDS = HLA_A_TRANSCRIPT_IDS + [ - 'ENST00000398417', # transcript ID of SMAD4-001 - 'ENST00000334701', # transcript ID of HSP90AA1-001 - 'ENST00000599837', # transcript ID of CTAG1A-002 + "ENST00000398417", 
# transcript ID of SMAD4-001 + "ENST00000334701", # transcript ID of HSP90AA1-001 + "ENST00000599837", # transcript ID of CTAG1A-002 ] + # TODO: add release 54 after transcript IDs for older GTFs are filled in # See https://github.com/hammerlab/pyensembl/issues/20 @test_ensembl_releases(75, grch38.release) def test_all_transcript_ids(ensembl): transcript_ids = set(ensembl.transcript_ids()) for transcript_id in KNOWN_TRANSCRIPT_IDS: - assert transcript_id in transcript_ids, \ - "Missing transcript ID %s from %s" % (transcript_id, ensembl) + assert transcript_id in transcript_ids, "Missing transcript ID %s from %s" % ( + transcript_id, + ensembl, + ) + def test_transcript_id_of_protein_id_CCR2(): # Looked up on Oct 9 2021: # CCR2-203 ENST00000445132.3 maps to ENSP00000399285.2 # Ensembl release 104, GRCh38.p13 - transcript_id = grch38.transcript_id_of_protein_id( - "ENSP00000399285") + transcript_id = grch38.transcript_id_of_protein_id("ENSP00000399285") eq_("ENST00000445132", transcript_id) diff --git a/test/test_transcript_objects.py b/test/test_transcript_objects.py index 79d08d5..b8d5d58 100644 --- a/test/test_transcript_objects.py +++ b/test/test_transcript_objects.py @@ -23,27 +23,35 @@ def test_transcript_start_codon(): test_transcript_start_codon : Check that fields Transcript (for transcript named CTNNBIP1-004) matches known values. """ - CTNNBIP1_004_transcript = ensembl77.transcript_by_id( - CTNNBIP1_004_transcript_id) + CTNNBIP1_004_transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) - assert Locus.__eq__(CTNNBIP1_004_transcript, CTNNBIP1_004_locus), \ - "Expected locus %s but got %s" % ( - CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript)) + assert Locus.__eq__( + CTNNBIP1_004_transcript, CTNNBIP1_004_locus + ), "Expected locus %s but got %s" % ( + CTNNBIP1_004_locus, + Locus.__str__(CTNNBIP1_004_transcript), + ) start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets - assert len(start_offsets) == 3, \ - "Wrong length for start codon: %d (%s)" % ( - len(start_offsets), start_offsets) + assert len(start_offsets) == 3, "Wrong length for start codon: %d (%s)" % ( + len(start_offsets), + start_offsets, + ) - assert all(isinstance(i, int) for i in start_offsets), \ - "Wrong type %s for beginning start codon offset" % ( - [type(i) for i in start_offsets],) + assert all( + isinstance(i, int) for i in start_offsets + ), "Wrong type %s for beginning start codon offset" % ( + [type(i) for i in start_offsets], + ) expected_start_codon_offset = len(CTNNBIP1_004_UTR5) start_codon_offset = min(start_offsets) - assert start_codon_offset == expected_start_codon_offset, \ - "Incorrect start codon offset, expected %d but got %d" % ( - expected_start_codon_offset, start_codon_offset) + assert ( + start_codon_offset == expected_start_codon_offset + ), "Incorrect start codon offset, expected %d but got %d" % ( + expected_start_codon_offset, + start_codon_offset, + ) def test_transcript_exons(): @@ -53,24 +61,37 @@ def test_transcript_exons(): """ transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) exons = transcript.exons - assert isinstance(exons, list), \ - "Expected list of Exon objects, got %s : %s" % (exons, type(exons)) + assert isinstance(exons, list), "Expected list of Exon objects, got %s : %s" % ( + exons, + type(exons), + ) # CTTNBIP1-004 has 5 exons - assert len(exons) == len(CTTNNIP1_004_exon_lengths), \ - "Expected %d exons but got %d" % ( - len(CTTNNIP1_004_exon_lengths), len(exons)) + assert len(exons) == len( + 
CTTNNIP1_004_exon_lengths + ), "Expected %d exons but got %d" % (len(CTTNNIP1_004_exon_lengths), len(exons)) for i, exon in enumerate(exons): expected_id = CTTNNIP1_004_exon_ids[i] - assert exon.id == expected_id, \ - "Expected exon #%d of %s to have ID %s but got %s" % ( - i + 1, transcript, expected_id, exon.id) + assert ( + exon.id == expected_id + ), "Expected exon #%d of %s to have ID %s but got %s" % ( + i + 1, + transcript, + expected_id, + exon.id, + ) expected_length = CTTNNIP1_004_exon_lengths[i] - assert len(exon) == expected_length, \ - "Expected exon #%d of %s (%s) to have length %d but got %d" % ( - i + 1, transcript, exon, expected_length, len(exon)) + assert ( + len(exon) == expected_length + ), "Expected exon #%d of %s (%s) to have length %d but got %d" % ( + i + 1, + transcript, + exon, + expected_length, + len(exon), + ) # not testing NCBI/Release 54 since I just discovered that ensembl54 @@ -106,39 +127,41 @@ def test_sequence_parts(genome): eq_( combined_sequence_length, len(transcript), - "Length 5' UTR(%dnt) + CDS(%dnt) + 3' UTR(%d) = %d, expected %d" % ( - len(utr5), - len(cds), - len(utr3), - combined_sequence_length, - len(transcript))) + "Length 5' UTR(%dnt) + CDS(%dnt) + 3' UTR(%d) = %d, expected %d" + % (len(utr5), len(cds), len(utr3), combined_sequence_length, len(transcript)), + ) eq_( combined_string, full_sequence, - "Expected FOXP3-001 sequence:\n%s\n\n5' UTR + CDS + 3' UTR:\n%s" % ( - full_sequence, - combined_string)) + "Expected FOXP3-001 sequence:\n%s\n\n5' UTR + CDS + 3' UTR:\n%s" + % (full_sequence, combined_string), + ) + def test_transcript_utr5_sequence_CTNNIP1_004(): transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) utr5 = transcript.five_prime_utr_sequence expected_utr5_length = len(CTNNBIP1_004_UTR5) - eq_(len(utr5), + eq_( + len(utr5), expected_utr5_length, - "Expected 5' UTR length %d, got %d" % ( - expected_utr5_length, len(utr5))) + "Expected 5' UTR length %d, got %d" % (expected_utr5_length, len(utr5)), + ) eq_(utr5, CTNNBIP1_004_UTR5) + def test_transcript_utr3_sequence_CTNNIP1_004(): transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) utr3 = transcript.three_prime_utr_sequence expected_utr3_length = len(CTNNBIP1_004_UTR3) - eq_(len(utr3), + eq_( + len(utr3), expected_utr3_length, - "Expected 3' UTR length %d, got %d" % ( - expected_utr3_length, len(utr3))) + "Expected 3' UTR length %d, got %d" % (expected_utr3_length, len(utr3)), + ) eq_(utr3, CTNNBIP1_004_UTR3) + def test_transcript_cds_CTNNIP1_004(): transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) cds = transcript.coding_sequence @@ -146,9 +169,11 @@ def test_transcript_cds_CTNNIP1_004(): eq_( len(cds), expected_cds_length, - "Expected CDS length %d, got %d" % (expected_cds_length, len(cds))) + "Expected CDS length %d, got %d" % (expected_cds_length, len(cds)), + ) eq_(cds, CTNNBIP1_004_CDS) + @test_ensembl_releases() def test_equal_transcripts(genome): t1 = genome.genes_by_name("TP53")[0].transcripts[0] @@ -157,29 +182,37 @@ def test_equal_transcripts(genome): eq_(t1, t2) eq_(hash(t1), hash(t2)) + @test_ensembl_releases() def test_not_equal_transcripts(genome): t1 = genome.genes_by_name("MUC1")[0].transcripts[0] t2 = genome.genes_by_name("BRCA1")[0].transcripts[0] assert_not_equal(t1, t2) + def test_protein_id(): transcript = ensembl77.transcripts_by_name("EGFR-001")[0] eq_(transcript.protein_id, "ENSP00000275493") + def test_protein_protein_sequence(): transcript = ensembl77.transcripts_by_name("EGFR-001")[0] 
eq_(transcript.protein_sequence, EGFR_001_protein_sequence) + def test_transcript_gene_should_match_parent_gene(): gene = ensembl77.gene_by_id(TP53_gene_id) for transcript in gene.transcripts: eq_(transcript.gene, gene) + @test_ensembl_releases() def test_BRCA1_201_has_protein_coding_biotype(genome): transcript = genome.transcripts_by_name("BRCA1-201")[0] - assert transcript.is_protein_coding, \ - "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" % ( - transcript, genome) + assert ( + transcript.is_protein_coding + ), "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" % ( + transcript, + genome, + ) eq_(transcript.biotype, "protein_coding") diff --git a/test/test_transcript_sequences.py b/test/test_transcript_sequences.py index 529e599..f654a3a 100644 --- a/test/test_transcript_sequences.py +++ b/test/test_transcript_sequences.py @@ -9,6 +9,7 @@ grch38 = genome_for_reference_name("GRCh38") + def test_transcript_sequence_ensembl_grch38(): # extremely short TRD gene seq = grch38.transcript_sequence("ENST00000448914") diff --git a/test/test_transcript_support_level.py b/test/test_transcript_support_level.py index 0d9cabd..6bf8a40 100644 --- a/test/test_transcript_support_level.py +++ b/test/test_transcript_support_level.py @@ -8,11 +8,12 @@ from pyensembl import cached_release + def test_transcript_support_level(): - """ The Transcript Support Level (TSL) is a method to highlight the well-supported and poorly-supported transcript - models for users, based on the type and quality of the alignments used to annotate the transcript. - In the Ensembl database, it can be assigned to a value 1 through 5, or reported as NA, or missing, or missing - completely in older releases. We translate it to an integer value, otherwise to None. + """The Transcript Support Level (TSL) is a method to highlight the well-supported and poorly-supported transcript + models for users, based on the type and quality of the alignments used to annotate the transcript. + In the Ensembl database, it can be assigned to a value 1 through 5, or reported as NA, or missing, or missing + completely in older releases. We translate it to an integer value, otherwise to None. 
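Since this docstring is the only place the TSL mapping is spelled out, a short self-contained sketch of the intended normalization may help. The helper name normalize_tsl is illustrative only (not part of the patch); its parsing mirrors the transcript_by_id change made to genome.py earlier in this series:

    def normalize_tsl(raw):
        # Ensembl GTFs report values such as "1", "NA", or
        # "1 (assigned to previous version 5)"; older releases omit the field
        if raw is None:
            return None
        first_token = raw.split(" ")[0]
        return int(first_token) if first_token.isnumeric() else None

    assert normalize_tsl("1") == 1
    assert normalize_tsl("5 (assigned to previous version 4)") == 5
    assert normalize_tsl("NA") is None
    assert normalize_tsl(None) is None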
""" ensembl93 = cached_release(93) transcript = ensembl93.transcripts_by_name("DDX11L1-202")[0] diff --git a/test/test_ucsc_gtf.py b/test/test_ucsc_gtf.py index 3e4a9e5..7cecde5 100644 --- a/test/test_ucsc_gtf.py +++ b/test/test_ucsc_gtf.py @@ -8,16 +8,14 @@ UCSC_GENCODE_PATH = data_path("gencode.ucsc.small.gtf") UCSC_REFSEQ_PATH = data_path("refseq.ucsc.small.gtf") + def test_ucsc_gencode_gtf(): with TemporaryDirectory() as tmpdir: - db = Database( - UCSC_GENCODE_PATH, - cache_directory_path=tmpdir) + db = Database(UCSC_GENCODE_PATH, cache_directory_path=tmpdir) df = db._load_gtf_as_dataframe() exons = df[df["feature"] == "exon"] # expect 12 exons from the dataframe - assert len(exons) == 12, \ - "Expected 12 exons, got %d: %s" % (len(exons), exons) + assert len(exons) == 12, "Expected 12 exons, got %d: %s" % (len(exons), exons) def test_ucsc_gencode_genome(): @@ -30,22 +28,22 @@ def test_ucsc_gencode_genome(): reference_name="GRCh38", annotation_name="ucsc_test", gtf_path_or_url=UCSC_GENCODE_PATH, - cache_directory_path=tmpdir) + cache_directory_path=tmpdir, + ) genome.index() genes = genome.genes() for gene in genes: - assert gene.id, \ - "Gene with missing ID in %s" % (genome.gtf.dataframe(),) - assert len(genes) == 7, \ - "Expected 7 genes, got %d: %s" % ( - len(genes), genes) + assert gene.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),) + assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes) transcripts = genome.transcripts() for transcript in transcripts: - assert transcript.id, \ - "Transcript with missing ID in %s" % (genome.gtf.dataframe(),) - assert len(transcripts) == 7, \ - "Expected 7 transcripts, got %d: %s" % ( - len(transcripts), transcripts) + assert transcript.id, "Transcript with missing ID in %s" % ( + genome.gtf.dataframe(), + ) + assert len(transcripts) == 7, "Expected 7 transcripts, got %d: %s" % ( + len(transcripts), + transcripts, + ) gene_uc001aak4 = genome.gene_by_id("uc001aak.4") eq_(gene_uc001aak4.id, "uc001aak.4") @@ -58,21 +56,18 @@ def test_ucsc_gencode_genome(): transcript_1_30564 = genome.transcripts_at_locus("chr1", 30564) eq_(transcript_1_30564[0].id, "uc057aty.1") + def test_ucsc_refseq_gtf(): """ Test GTF object with a small RefSeq GTF file downloaded from http://genome.ucsc.edu/cgi-bin/hgTables """ with TemporaryDirectory() as tmpdir: - db = Database( - UCSC_REFSEQ_PATH, - cache_directory_path=tmpdir) + db = Database(UCSC_REFSEQ_PATH, cache_directory_path=tmpdir) df = db._load_gtf_as_dataframe() exons = df[df["feature"] == "exon"] # expect 16 exons from the GTF - assert len(exons) == 16, \ - "Expected 16 exons, got %d: %s" % ( - len(exons), exons) + assert len(exons) == 16, "Expected 16 exons, got %d: %s" % (len(exons), exons) def test_ucsc_refseq_genome(): @@ -85,25 +80,30 @@ def test_ucsc_refseq_genome(): reference_name="GRCh38", annotation_name="ucsc_test", gtf_path_or_url=UCSC_REFSEQ_PATH, - cache_directory_path=tmpdir) + cache_directory_path=tmpdir, + ) genome.index() genes = genome.genes() for gene in genes: - assert gene.id, \ - "Gene with missing ID in %s" % (genome.db._load_gtf_as_dataframe(),) - assert len(genes) == 2, \ - "Expected 2 genes, got %d: %s" % ( - len(genes), genes) + assert gene.id, "Gene with missing ID in %s" % ( + genome.db._load_gtf_as_dataframe(), + ) + assert len(genes) == 2, "Expected 2 genes, got %d: %s" % (len(genes), genes) transcripts = genome.transcripts() for transcript in transcripts: - assert transcript.id, \ - "Transcript with missing ID in %s" % 
(genome.db._load_gtf_as_dataframe(),) - assert len(transcripts) == 2, \ - "Expected 2 transcripts, got %d: %s" % ( - len(transcripts), transcripts) + assert transcript.id, "Transcript with missing ID in %s" % ( + genome.db._load_gtf_as_dataframe(), + ) + assert len(transcripts) == 2, "Expected 2 transcripts, got %d: %s" % ( + len(transcripts), + transcripts, + ) genes_at_locus = genome.genes_at_locus("chr1", 67092176) - assert len(genes_at_locus) == 2, \ - "Expected 2 genes at locus chr1:67092176, got %d: %s" % ( - len(genes_at_locus), genes_at_locus) + assert ( + len(genes_at_locus) == 2 + ), "Expected 2 genes at locus chr1:67092176, got %d: %s" % ( + len(genes_at_locus), + genes_at_locus, + ) ids = set([gene.id for gene in genes_at_locus]) eq_(set(["NM_001276352", "NR_075077"]), ids) From 7c581a879c46a9a503876ee5e3f442766f04ae30 Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Fri, 29 Dec 2023 12:16:59 -0600 Subject: [PATCH 09/35] format and release --- pyensembl/species.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyensembl/species.py b/pyensembl/species.py index c19f359..7af4569 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -335,8 +335,8 @@ def check_species_object(species_name_or_object): # "WS180": (47, 49), # "WS190": (50, 54), "WS200": (55, 57), - "WS210": (58, 78), - "WS220": (79, 66), + "WS210": (58, 60), + "WS220": (61, 66), "WBcel235": (67, MAX_ENSEMBL_RELEASE), }, ) From 3819ea2e5544897488e205fe5a9f8b7a09ff34c6 Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Fri, 29 Dec 2023 12:56:01 -0600 Subject: [PATCH 10/35] add comand to list all available --- pyensembl/shell.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/pyensembl/shell.py b/pyensembl/shell.py index cd7ab3c..2cc56ee 100755 --- a/pyensembl/shell.py +++ b/pyensembl/shell.py @@ -40,14 +40,17 @@ import argparse import logging.config -import pkg_resources import os -from .ensembl_release import EnsemblRelease, MAX_ENSEMBL_RELEASE +import pkg_resources + +from .ensembl_release import MAX_ENSEMBL_RELEASE, EnsemblRelease from .genome import Genome from .species import Species -logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf")) +logging.config.fileConfig( + pkg_resources.resource_filename(__name__, "logging.conf") +) logger = logging.getLogger(__name__) @@ -94,7 +97,9 @@ ) path_group.add_argument( - "--annotation-name", default=None, help="Name of annotation source (e.g. refseq)" + "--annotation-name", + default=None, + help="Name of annotation source (e.g. 
refseq)", ) path_group.add_argument( @@ -140,6 +145,7 @@ "delete-all-files", "delete-index-files", "list", + "available", ), help=( '"install" will download and index any data that is not ' @@ -151,6 +157,20 @@ ) +def collect_all_available_ensembl_releases(): + for species_name in Species.all_registered_latin_names(): + species = Species._latin_names_to_species[species_name] + # print in tree format + print( + "* " + species_name + " (" + ",".join(species.synonyms) + ")" + ":" + ) + for ( + release_name, + release_range, + ) in species.reference_assemblies.items(): + print(" * " + release_name + ":", release_range) + + def collect_all_installed_ensembl_releases(): genomes = [] for species, release in Species.all_species_release_pairs(): @@ -182,11 +202,13 @@ def all_combinations_of_ensembl_genomes(args): # URL to be a directory with all the same filenames as # would be provided by Ensembl gtf_url = os.path.join( - args.custom_mirror, os.path.basename(ensembl_release.gtf_url) + args.custom_mirror, + os.path.basename(ensembl_release.gtf_url), ) transcript_fasta_urls = [ os.path.join( - args.custom_mirror, os.path.basename(transcript_fasta_url) + args.custom_mirror, + os.path.basename(transcript_fasta_url), ) for transcript_fasta_url in ensembl_release.transcript_fasta_urls ] @@ -244,7 +266,9 @@ def collect_selected_genomes(args): def run(): args = parser.parse_args() - if args.action == "list": + if args.action == "available": + collect_all_available_ensembl_releases() + elif args.action == "list": # TODO: how do we also identify which non-Ensembl genomes are # installed? genomes = collect_all_installed_ensembl_releases() From ed1de05908cc7d0b0c2834f0852142627a0ce3c2 Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Fri, 29 Dec 2023 13:01:55 -0600 Subject: [PATCH 11/35] add comand to list all available --- pyensembl/shell.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyensembl/shell.py b/pyensembl/shell.py index 2cc56ee..66f874c 100755 --- a/pyensembl/shell.py +++ b/pyensembl/shell.py @@ -162,7 +162,12 @@ def collect_all_available_ensembl_releases(): species = Species._latin_names_to_species[species_name] # print in tree format print( - "* " + species_name + " (" + ",".join(species.synonyms) + ")" + ":" + "* " + + species_name + + " (" + + ", ".join(species.synonyms) + + ")" + + ":" ) for ( release_name, From 1426100e47478308c805556f6000db17f955033d Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Fri, 29 Dec 2023 13:03:44 -0600 Subject: [PATCH 12/35] add comand to list all available --- pyensembl/species.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pyensembl/species.py b/pyensembl/species.py index 7af4569..c153f5e 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -100,7 +100,8 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}): for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" % i + "Ensembl release %d already has an associated genome" + % i ) self._release_to_genome[i] = genome_name @@ -113,10 +114,13 @@ def which_reference(self, ensembl_release): return self._release_to_genome[ensembl_release] def __str__(self): - return "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % ( - self.latin_name, - self.synonyms, - self.reference_assemblies, + return ( + "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" + % ( + self.latin_name, + self.synonyms, + self.reference_assemblies, + ) 
) def __eq__(self, other): @@ -188,9 +192,9 @@ def check_species_object(species_name_or_object): latin_name="homo_sapiens", synonyms=["human"], reference_assemblies={ - "GRCh38": (76, MAX_ENSEMBL_RELEASE), - "GRCh37": (55, 75), "NCBI36": (54, 54), + "GRCh37": (55, 75), + "GRCh38": (76, MAX_ENSEMBL_RELEASE), }, ) From e03a213ccedda997ecb868490e62367f779f8c9a Mon Sep 17 00:00:00 2001 From: Ye Chang Date: Tue, 2 Jan 2024 04:13:56 -0600 Subject: [PATCH 13/35] fix bug --- pyensembl/genome.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyensembl/genome.py b/pyensembl/genome.py index 5345742..e56dd2e 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -896,8 +896,10 @@ def transcript_by_id(self, transcript_id): extra_data = dict(zip(extra_field_names, result[5:])) transcript_name = extra_data.get("transcript_name") transcript_biotype = extra_data.get("transcript_biotype") - tsl = extra_data.get("transcript_support_level") - if not tsl or tsl == "NA": + tsl = extra_data.get("transcript_support_level", "NA") + if tsl: + tsl = tsl.split(" ")[0] + if not tsl or tsl == "NA" or not tsl.isnumeric(): tsl = None else: tsl = int(tsl) From c6915d1c33879973c7a7539037b9941cbb4950d1 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 19:27:22 -0600 Subject: [PATCH 14/35] support plants --- pyensembl/download_cache.py | 50 ++++++---- pyensembl/ensembl_release.py | 49 +++++++--- pyensembl/ensembl_release_versions.py | 2 + pyensembl/ensembl_url_templates.py | 136 ++++++++++++++++++-------- pyensembl/shell.py | 37 ++++--- pyensembl/species.py | 45 ++++++--- 6 files changed, 221 insertions(+), 98 deletions(-) diff --git a/pyensembl/download_cache.py b/pyensembl/download_cache.py index c33d6fe..48ebd00 100644 --- a/pyensembl/download_cache.py +++ b/pyensembl/download_cache.py @@ -11,14 +11,13 @@ # limitations under the License. +import logging from os import listdir, remove -from os.path import join, exists, split, abspath, isdir +from os.path import abspath, exists, isdir, join, split from shutil import copy2, rmtree -import logging import datacache - logger = logging.getLogger(__name__) CACHE_BASE_SUBDIR = "pyensembl" @@ -29,9 +28,11 @@ def cache_subdirectory( reference_name=None, annotation_name=None, annotation_version=None ): """ - Which cache subdirectory to use for a given annotation database - over a particular reference. All arguments can be omitted to just get - the base subdirectory for all pyensembl cached datasets. + Which cache subdirectory to use for a given annotation database over a + particular reference. + + All arguments can be omitted to just get the base subdirectory for + all pyensembl cached datasets. """ if reference_name is None: reference_name = "" @@ -135,7 +136,7 @@ def cache_directory_path(self): def _fields(self): """ - Fields used for hashing, string representation, equality comparison + Fields used for hashing, string representation, equality comparison. 
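The "available" subcommand added to shell.py a few hunks above simply walks the species registry. The same listing can be produced programmatically, which is convenient when checking the output format; this sketch is illustrative and relies only on attributes shown in this series:

    from pyensembl.species import Species

    def print_available_genomes():
        # mirrors the tree-style output of `pyensembl available`
        for latin_name in Species.all_registered_latin_names():
            species = Species._latin_names_to_species[latin_name]
            print("* %s (%s):" % (latin_name, ", ".join(species.synonyms)))
            for assembly, release_range in species.reference_assemblies.items():
                print("  * %s: %s" % (assembly, release_range))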
""" return ( ( @@ -150,7 +151,10 @@ def _fields(self): ) def __eq__(self, other): - return other.__class__ is DownloadCache and self._fields() == other._fields() + return ( + other.__class__ is DownloadCache + and self._fields() == other._fields() + ) def __hash__(self): return hash(self._fields()) @@ -202,7 +206,9 @@ def cached_path(self, path_or_url): # for stripping decompression extensions for both local # and remote files local_filename = datacache.build_local_filename( - download_url=path_or_url, filename=remote_filename, decompress=False + download_url=path_or_url, + filename=remote_filename, + decompress=False, ) else: local_filename = remote_filename @@ -210,10 +216,14 @@ def cached_path(self, path_or_url): # if we expect the download function to decompress this file then # we should use its name without the compression extension if self.decompress_on_download: - local_filename = self._remove_compression_suffix_if_present(local_filename) + local_filename = self._remove_compression_suffix_if_present( + local_filename + ) if len(local_filename) == 0: - raise ValueError("Can't determine local filename for %s" % (path_or_url,)) + raise ValueError( + "Can't determine local filename for %s" % (path_or_url,) + ) return join(self.cache_directory_path, local_filename) @@ -254,8 +264,8 @@ def download_or_copy_if_necessary( self, path_or_url, download_if_missing=False, overwrite=False ): """ - Download a remote file or copy - Get the local path to a possibly remote file. + Download a remote file or copy Get the local path to a possibly remote + file. Download if file is missing from the cache directory and `download_if_missing` is True. Download even if local file exists if @@ -295,7 +305,11 @@ def _raise_missing_file_error(self, missing_urls_dict): raise ValueError(error_message) def local_path_or_install_error( - self, field_name, path_or_url, download_if_missing=False, overwrite=False + self, + field_name, + path_or_url, + download_if_missing=False, + overwrite=False, ): try: return self.download_or_copy_if_necessary( @@ -308,13 +322,13 @@ def local_path_or_install_error( def delete_cached_files(self, prefixes=[], suffixes=[]): """ - Deletes any cached files matching the prefixes or suffixes given + Deletes any cached files matching the prefixes or suffixes given. """ if isdir(self.cache_directory_path): for filename in listdir(): - delete = any([filename.endswith(ext) for ext in suffixes]) or any( - [filename.startswith(pre) for pre in prefixes] - ) + delete = any( + [filename.endswith(ext) for ext in suffixes] + ) or any([filename.startswith(pre) for pre in prefixes]) if delete: path = join(self.cache_directory_path, filename) logger.info("Deleting %s", path) diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 8ad47ab..8af2584 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -11,22 +11,25 @@ # limitations under the License. """ -Contains the EnsemblRelease class, which extends the Genome class -to be specific to (a particular release of) Ensembl. +Contains the EnsemblRelease class, which extends the Genome class to be +specific to (a particular release of) Ensembl. 
""" from weakref import WeakValueDictionary +from .ensembl_release_versions import MAX_ENSEMBL_RELEASE, check_release_number +from .ensembl_url_templates import ( + ENSEMBL_FTP_SERVER, + make_fasta_url, + make_gtf_url, +) from .genome import Genome -from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE from .species import check_species_object, human -from .ensembl_url_templates import ENSEMBL_FTP_SERVER, make_gtf_url, make_fasta_url - class EnsemblRelease(Genome): """ - Bundles together the genomic annotation and sequence data associated with - a particular release of the Ensembl database. + Bundles together the genomic annotation and sequence data associated with a + particular release of the Ensembl database. """ @classmethod @@ -47,7 +50,11 @@ def normalize_init_values(cls, release, species, server): @classmethod def cached( - cls, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER + cls, + release=MAX_ENSEMBL_RELEASE, + species=human, + server=None, + # server=ENSEMBL_FTP_SERVER, ): """ Construct EnsemblRelease if it's never been made before, otherwise @@ -61,14 +68,21 @@ def cached( return genome def __init__( - self, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER + self, + release=MAX_ENSEMBL_RELEASE, + species=human, + server=None, + # ENSEMBL_FTP_SERVER, ): self.release, self.species, self.server = self.normalize_init_values( release=release, species=species, server=server ) self.gtf_url = make_gtf_url( - ensembl_release=self.release, species=self.species, server=self.server + ensembl_release=self.release, + species=self.species.latin_name, + server=self.server, + database=self.species.database, ) self.transcript_fasta_urls = [ @@ -77,12 +91,14 @@ def __init__( species=self.species.latin_name, sequence_type="cdna", server=server, + database=self.species.database, ), make_fasta_url( ensembl_release=self.release, species=self.species.latin_name, sequence_type="ncrna", server=server, + database=self.species.database, ), ] @@ -92,6 +108,7 @@ def __init__( species=self.species.latin_name, sequence_type="pep", server=self.server, + database=self.species.database, ) ] @@ -130,7 +147,11 @@ def __hash__(self): return hash((self.release, self.species)) def to_dict(self): - return {"release": self.release, "species": self.species, "server": self.server} + return { + "release": self.release, + "species": self.species, + "server": self.server, + } @classmethod def from_dict(cls, state_dict): @@ -144,7 +165,9 @@ def cached_release(release, species="human"): """ Create an EnsemblRelease instance only if it's hasn't already been made, otherwise returns the old instance. - Keeping this function for backwards compatibility but this functionality - has been moving into the cached method of EnsemblRelease. + + Keeping this function for backwards compatibility but this + functionality has been moving into the cached method of + EnsemblRelease. 
""" return EnsemblRelease.cached(release=release, species=species) diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index 79649bd..020c6e6 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -12,6 +12,8 @@ MIN_ENSEMBL_RELEASE = 54 MAX_ENSEMBL_RELEASE = 110 +MIN_ENSEMBLGENOME_RELEASE = 50 +MAX_ENSEMBLGENOME_RELEASE = 57 def check_release_number(release): diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index ded3570..298e517 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -11,19 +11,23 @@ # limitations under the License. """ -Templates for URLs and paths to specific relase, species, and file type -on the Ensembl ftp server. +Templates for URLs and paths to specific relase, species, and file type on the +Ensembl ftp server. For example, the human chromosomal DNA sequences for release 78 are in: https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/ +For plant, fungi and metazoa species, the url is as follow: + + https://ftp.ensemblgenomes.ebi.ac.uk/pub/release-57/plants/fasta/glycine_max/cdna/ """ -from .species import Species, find_species_by_name from .ensembl_release_versions import check_release_number +from .species import Species, find_species_by_name ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org" +ENSEMBLGENOME_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk" # Example directories # FASTA files: /pub/release-78/fasta/homo_sapiens/ @@ -31,6 +35,39 @@ FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/" GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/" +DATABASE_FASTA_SUBDIR_TEMPLATE = ( + "/pub/release-%(release)d/$(database)s/fasta/%(species)s/%(type)s/" +) +DATABASE_GTF_SUBDIR_TEMPLATE = ( + "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" +) + +# GTF annotation file example: Homo_sapiens.GTCh38.gtf.gz +GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz" + +# cDNA & protein FASTA file for releases before (and including) Ensembl 75 +# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz +OLD_FASTA_FILENAME_TEMPLATE = ( + "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz" +) + +# ncRNA FASTA file for releases before (and including) Ensembl 75 +# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz + +OLD_FASTA_FILENAME_TEMPLATE_NCRNA = ( + "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" +) + +# cDNA & protein FASTA file for releases after Ensembl 75 +# example: Homo_sapiens.GRCh37.cdna.all.fa.gz +NEW_FASTA_FILENAME_TEMPLATE = ( + "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" +) + +# ncRNA FASTA file for releases after Ensembl 75 +# example: Homo_sapiens.GRCh37.ncrna.fa.gz +NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz" + def normalize_release_properties(ensembl_release, species): """ @@ -44,14 +81,10 @@ def normalize_release_properties(ensembl_release, species): return ensembl_release, species.latin_name, reference_name -# GTF annotation file example: Homo_sapiens.GTCh38.gtf.gz -GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz" - - def make_gtf_filename(ensembl_release, species): """ Return GTF filename expect on Ensembl FTP server for a specific - species/release combination + species/release combination. 
""" ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species @@ -63,36 +96,36 @@ def make_gtf_filename(ensembl_release, species): } -def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER): +def make_gtf_url(ensembl_release, species, server=None, database=None): """ Returns a URL and a filename, which can be joined together. """ - ensembl_release, species, _ = normalize_release_properties(ensembl_release, species) - subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species} - filename = make_gtf_filename(ensembl_release=ensembl_release, species=species) + if server is None: + if database is None: + server = ENSEMBL_FTP_SERVER + else: + server = ENSEMBLGENOME_FTP_SERVER + ensembl_release, species, _ = normalize_release_properties( + ensembl_release, species + ) + if database is None: + subdir = GTF_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "species": species, + } + else: + print(ensembl_release, species, database) + subdir = DATABASE_GTF_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "database": database, + "species": species, + } + filename = make_gtf_filename( + ensembl_release=ensembl_release, species=species + ) return server + subdir + filename -# cDNA & protein FASTA file for releases before (and including) Ensembl 75 -# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz -OLD_FASTA_FILENAME_TEMPLATE = ( - "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz" -) - -# ncRNA FASTA file for releases before (and including) Ensembl 75 -# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz - -OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" - -# cDNA & protein FASTA file for releases after Ensembl 75 -# example: Homo_sapiens.GRCh37.cdna.all.fa.gz -NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" - -# ncRNA FASTA file for releases after Ensembl 75 -# example: Homo_sapiens.GRCh37.ncrna.fa.gz -NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz" - - def make_fasta_filename(ensembl_release, species, sequence_type): ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species @@ -125,23 +158,46 @@ def make_fasta_filename(ensembl_release, species, sequence_type): } -def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER): - """Construct URL to FASTA file with cDNA transcript or protein sequences +def make_fasta_url( + ensembl_release, + species, + sequence_type, + server=None, + database=None, +): + """ + Construct URL to FASTA file with cDNA transcript or protein sequences. 
Parameter examples: ensembl_release = 75 species = "Homo_sapiens" sequence_type = "cdna" (other option: "pep") """ + if server is None: + if database is None: + server = ENSEMBL_FTP_SERVER + else: + server = ENSEMBLGENOME_FTP_SERVER ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species ) - subdir = FASTA_SUBDIR_TEMPLATE % { - "release": ensembl_release, - "species": species, - "type": sequence_type, - } + if database is None: + subdir = FASTA_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "species": species, + "type": sequence_type, + } + else: + subdir = DATABASE_FASTA_SUBDIR_TEMPLATE % { + "release": ensembl_release, + "database": database, + "species": species, + "type": sequence_type, + } + filename = make_fasta_filename( - ensembl_release=ensembl_release, species=species, sequence_type=sequence_type + ensembl_release=ensembl_release, + species=species, + sequence_type=sequence_type, ) return server + subdir + filename diff --git a/pyensembl/shell.py b/pyensembl/shell.py index 66f874c..0a7a54f 100755 --- a/pyensembl/shell.py +++ b/pyensembl/shell.py @@ -30,6 +30,9 @@ To list all installed genomes: %(prog)s list +To list all available genomes: + %(prog)s available + To install a genome from source files: %(prog)s install \ --reference-name "GRCh38" \ @@ -46,11 +49,9 @@ from .ensembl_release import MAX_ENSEMBL_RELEASE, EnsemblRelease from .genome import Genome -from .species import Species +from .species import Species, normalize_species_name -logging.config.fileConfig( - pkg_resources.resource_filename(__name__, "logging.conf") -) +logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf")) logger = logging.getLogger(__name__) @@ -161,14 +162,7 @@ def collect_all_available_ensembl_releases(): for species_name in Species.all_registered_latin_names(): species = Species._latin_names_to_species[species_name] # print in tree format - print( - "* " - + species_name - + " (" - + ", ".join(species.synonyms) - + ")" - + ":" - ) + print("* " + species_name + " (" + ", ".join(species.synonyms) + ")" + ":") for ( release_name, release_range, @@ -189,11 +183,26 @@ def all_combinations_of_ensembl_genomes(args): """ Use all combinations of species and release versions specified by the commandline arguments to return a list of EnsemblRelease or Genome objects. - The results will typically be of type EnsemblRelease unless the + The results will typically be of type EnsemblRelease unless the. + --custom-mirror argument was given. 
""" species_list = args.species if args.species else ["human"] - release_list = args.release if args.release else [MAX_ENSEMBL_RELEASE] + + release_list = ( + args.release + if args.release + else [ + max( + i + for _, i in Species._latin_names_to_species[ + normalize_species_name(species_name) + ].reference_assemblies.values() + ) + for species_name in species_list + ] + ) + genomes = [] for species in species_list: # Otherwise, use Ensembl release information diff --git a/pyensembl/species.py b/pyensembl/species.py index c153f5e..4249234 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -12,7 +12,10 @@ from serializable import Serializable -from .ensembl_release_versions import MAX_ENSEMBL_RELEASE +from .ensembl_release_versions import ( + MAX_ENSEMBL_RELEASE, + MAX_ENSEMBLGENOME_RELEASE, +) # TODO: replace Serializable with data class @@ -30,15 +33,16 @@ class Species(Serializable): _reference_names_to_species = {} @classmethod - def register(cls, latin_name, synonyms, reference_assemblies): + def register(cls, latin_name, synonyms, reference_assemblies, database=None): """ - Create a Species object from the given arguments and enter into - all the dicts used to look the species up by its fields. + Create a Species object from the given arguments and enter into all the + dicts used to look the species up by its fields. """ species = Species( latin_name=latin_name, synonyms=synonyms, reference_assemblies=reference_assemblies, + database=database, ) cls._latin_names_to_species[species.latin_name] = species for synonym in synonyms: @@ -71,8 +75,8 @@ def all_registered_latin_names(cls): @classmethod def all_species_release_pairs(cls): """ - Generator which yields (species, release) pairs - for all possible combinations. + Generator which yields (species, release) pairs for all possible + combinations. 
""" for species_name in cls.all_registered_latin_names(): species = cls._latin_names_to_species[species_name] @@ -80,7 +84,7 @@ def all_species_release_pairs(cls): for release in range(release_range[0], release_range[1] + 1): yield species_name, release - def __init__(self, latin_name, synonyms=[], reference_assemblies={}): + def __init__(self, latin_name, synonyms=[], reference_assemblies={}, database=None): """ Parameters ---------- @@ -95,13 +99,13 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}): self.latin_name = latin_name.lower().replace(" ", "_") self.synonyms = synonyms self.reference_assemblies = reference_assemblies + self.database = database self._release_to_genome = {} for genome_name, (start, end) in self.reference_assemblies.items(): for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" - % i + "Ensembl release %d already has an associated genome" % i ) self._release_to_genome[i] = genome_name @@ -115,11 +119,12 @@ def which_reference(self, ensembl_release): def __str__(self): return ( - "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" + "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s, database=%s)" % ( self.latin_name, self.synonyms, self.reference_assemblies, + self.database, ) ) @@ -129,6 +134,7 @@ def __eq__(self, other): and self.latin_name == other.latin_name and self.synonyms == other.synonyms and self.reference_assemblies == other.reference_assemblies + and self.database == other.database ) def to_dict(self): @@ -144,15 +150,17 @@ def __hash__(self): self.latin_name, tuple(self.synonyms), frozenset(self.reference_assemblies.items()), + self.database, ) ) def normalize_species_name(name): """ - If species name was "Homo sapiens" then replace spaces with underscores - and return "homo_sapiens". Also replace common names like "human" with - "homo_sapiens". + If species name was "Homo sapiens" then replace spaces with underscores and + return "homo_sapiens". + + Also replace common names like "human" with "homo_sapiens". """ lower_name = name.lower().strip() @@ -176,6 +184,8 @@ def find_species_by_name(species_name): def check_species_object(species_name_or_object): """ Helper for validating user supplied species names or objects. 
+ + Return `Species` Object """ if isinstance(species_name_or_object, Species): return species_name_or_object @@ -352,3 +362,12 @@ def check_species_object(species_name_or_object): "R64-1-1": (75, MAX_ENSEMBL_RELEASE), }, ) + +rice = Species.register( + latin_name="oryza_sativa", + synonyms=["rice", "japanese_rice"], + reference_assemblies={ + "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + database="plants", +) From 9b426eb578edbd43cb08f1c529c52091a80ed9bf Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 19:28:34 -0600 Subject: [PATCH 15/35] bump version 2.3.0 --- pyensembl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/version.py b/pyensembl/version.py index 519574c..55e4709 100644 --- a/pyensembl/version.py +++ b/pyensembl/version.py @@ -1 +1 @@ -__version__ = "2.2.10" +__version__ = "2.3.0" From bb6adf8a82508bbf0159875a607f473dced0b4ab Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 19:31:07 -0600 Subject: [PATCH 16/35] bump version 2.3.0 --- pyensembl/ensembl_url_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index 298e517..a3783b2 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -36,7 +36,7 @@ GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/" DATABASE_FASTA_SUBDIR_TEMPLATE = ( - "/pub/release-%(release)d/$(database)s/fasta/%(species)s/%(type)s/" + "/pub/release-%(release)d/%(database)s/fasta/%(species)s/%(type)s/" ) DATABASE_GTF_SUBDIR_TEMPLATE = ( "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" From aceabe007f48b6c97f3bf6fe65d66ccb341a924f Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 19:49:24 -0600 Subject: [PATCH 17/35] fix bug in fasta name --- pyensembl/ensembl_url_templates.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index a3783b2..dc2b4da 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -114,7 +114,6 @@ def make_gtf_url(ensembl_release, species, server=None, database=None): "species": species, } else: - print(ensembl_release, species, database) subdir = DATABASE_GTF_SUBDIR_TEMPLATE % { "release": ensembl_release, "database": database, @@ -126,11 +125,16 @@ def make_gtf_url(ensembl_release, species, server=None, database=None): return server + subdir + filename -def make_fasta_filename(ensembl_release, species, sequence_type): +def make_fasta_filename(ensembl_release, species, database, sequence_type): ensembl_release, species, reference_name = normalize_release_properties( ensembl_release, species ) - if ensembl_release <= 75: + # for plant database, start from release 32 (inlcude 32) , the fasta file use the "old name" + # for releses before 31, the fasta file use the "new name" + # version 31 use both old and new name + if (ensembl_release <= 75 and database is None) or ( + ensembl_release <= 31 and database is not None + ): if sequence_type == "ncrna": return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % { "Species": species.capitalize(), @@ -198,6 +202,7 @@ def make_fasta_url( filename = make_fasta_filename( ensembl_release=ensembl_release, species=species, + database=database, sequence_type=sequence_type, ) return server + subdir + filename From e77c9b1712ac63fdee32c64ecdce3f674ab9d255 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 19:54:22 -0600 Subject: [PATCH 
18/35] support arabidopsis --- pyensembl/species.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pyensembl/species.py b/pyensembl/species.py index 4249234..0998e5e 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -33,7 +33,9 @@ class Species(Serializable): _reference_names_to_species = {} @classmethod - def register(cls, latin_name, synonyms, reference_assemblies, database=None): + def register( + cls, latin_name, synonyms, reference_assemblies, database=None + ): """ Create a Species object from the given arguments and enter into all the dicts used to look the species up by its fields. @@ -84,7 +86,9 @@ def all_species_release_pairs(cls): for release in range(release_range[0], release_range[1] + 1): yield species_name, release - def __init__(self, latin_name, synonyms=[], reference_assemblies={}, database=None): + def __init__( + self, latin_name, synonyms=[], reference_assemblies={}, database=None + ): """ Parameters ---------- @@ -105,7 +109,8 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}, database=No for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" % i + "Ensembl release %d already has an associated genome" + % i ) self._release_to_genome[i] = genome_name @@ -371,3 +376,13 @@ def check_species_object(species_name_or_object): }, database="plants", ) + + +cress = Species.register( + latin_name="arabidopsis_thaliana", + synonyms=["cress", "thale_cress"], + reference_assemblies={ + "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + database="plants", +) From 247f9cf7d18679ae31b5baf91285a7875b3bfb73 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 20:55:46 -0600 Subject: [PATCH 19/35] update check --- Makefile | 4 ++-- lint.sh | 15 --------------- pyensembl/shell.py | 0 requirements.txt | 2 +- {test => tests}/__init__.py | 0 {test => tests}/common.py | 0 {test => tests}/data.py | 0 {test => tests}/data/gencode.ucsc.small.gtf | 0 ...mouse.ensembl.81.partial.ENSMUSG00000017167.fa | 0 ...ouse.ensembl.81.partial.ENSMUSG00000017167.gtf | 0 ...ouse.ensembl.81.partial.ENSMUSG00000017167.pep | 0 ...ensembl.81.partial.ncrna.ENSMUSG00000017167.fa | 0 {test => tests}/data/refseq.ucsc.small.gtf | 0 {test => tests}/test_contigs.py | 0 {test => tests}/test_download_cache.py | 0 {test => tests}/test_ensembl_gtf.py | 0 {test => tests}/test_ensembl_object_properties.py | 0 {test => tests}/test_exon_id.py | 0 {test => tests}/test_exon_object.py | 0 {test => tests}/test_gene_ids.py | 0 {test => tests}/test_gene_names.py | 0 {test => tests}/test_gene_objects.py | 0 {test => tests}/test_id_length.py | 0 {test => tests}/test_locus.py | 0 {test => tests}/test_missing_genome_sources.py | 0 {test => tests}/test_mouse.py | 0 {test => tests}/test_release_versions.py | 0 {test => tests}/test_search.py | 0 {test => tests}/test_sequence_data.py | 0 {test => tests}/test_serialization.py | 0 {test => tests}/test_shell.py | 0 {test => tests}/test_string_representation.py | 0 {test => tests}/test_timings.py | 0 {test => tests}/test_transcript_ids.py | 0 {test => tests}/test_transcript_objects.py | 0 {test => tests}/test_transcript_sequences.py | 0 {test => tests}/test_transcript_support_level.py | 0 {test => tests}/test_ucsc_gtf.py | 0 38 files changed, 3 insertions(+), 18 deletions(-) delete mode 100755 lint.sh mode change 100755 => 100644 pyensembl/shell.py rename {test => tests}/__init__.py (100%) rename {test => tests}/common.py (100%) 
rename {test => tests}/data.py (100%) rename {test => tests}/data/gencode.ucsc.small.gtf (100%) rename {test => tests}/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa (100%) rename {test => tests}/data/mouse.ensembl.81.partial.ENSMUSG00000017167.gtf (100%) rename {test => tests}/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep (100%) rename {test => tests}/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa (100%) rename {test => tests}/data/refseq.ucsc.small.gtf (100%) rename {test => tests}/test_contigs.py (100%) rename {test => tests}/test_download_cache.py (100%) rename {test => tests}/test_ensembl_gtf.py (100%) rename {test => tests}/test_ensembl_object_properties.py (100%) rename {test => tests}/test_exon_id.py (100%) rename {test => tests}/test_exon_object.py (100%) rename {test => tests}/test_gene_ids.py (100%) rename {test => tests}/test_gene_names.py (100%) rename {test => tests}/test_gene_objects.py (100%) rename {test => tests}/test_id_length.py (100%) rename {test => tests}/test_locus.py (100%) rename {test => tests}/test_missing_genome_sources.py (100%) rename {test => tests}/test_mouse.py (100%) rename {test => tests}/test_release_versions.py (100%) rename {test => tests}/test_search.py (100%) rename {test => tests}/test_sequence_data.py (100%) rename {test => tests}/test_serialization.py (100%) rename {test => tests}/test_shell.py (100%) rename {test => tests}/test_string_representation.py (100%) rename {test => tests}/test_timings.py (100%) rename {test => tests}/test_transcript_ids.py (100%) rename {test => tests}/test_transcript_objects.py (100%) rename {test => tests}/test_transcript_sequences.py (100%) rename {test => tests}/test_transcript_support_level.py (100%) rename {test => tests}/test_ucsc_gtf.py (100%) diff --git a/Makefile b/Makefile index efc8962..d9844c6 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,8 @@ PYTHON3 ?= python3 all: check check: - ./lint.sh - cd test && pytest + find pyensembl -name '*.py' | xargs pylint --errors-only --disable=unsubscriptable-object,not-an-iterable,no-member && echo 'Passes pylint check' + pytest --cov=pyensembl/ --cov-report=term-missing tests #: Clean up temporary files clean: diff --git a/lint.sh b/lint.sh deleted file mode 100755 index 9bac0e0..0000000 --- a/lint.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -o errexit - - -# disabling several categories of errors due to false positives in pylint, -# see these issues: -# - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and -# - https://bitbucket.org/logilab/pylint/issues/58 - -find pyensembl -name '*.py' \ - | xargs pylint \ - --errors-only \ - --disable=print-statement,unsubscriptable-object,not-an-iterable,no-member - -echo 'Passes pylint check' diff --git a/pyensembl/shell.py b/pyensembl/shell.py old mode 100755 new mode 100644 diff --git a/requirements.txt b/requirements.txt index 88f4b5e..03c7e70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ typechecks>=0.0.2 datacache>=1.1.4 memoized-property>=1.0.2 tinytimer -gtfparse>=1.3.0,<2.0.0 +gtfparse>=2.1.0 serializable nose>=1.3.3 pylint>=1.4.4 diff --git a/test/__init__.py b/tests/__init__.py similarity index 100% rename from test/__init__.py rename to tests/__init__.py diff --git a/test/common.py b/tests/common.py similarity index 100% rename from test/common.py rename to tests/common.py diff --git a/test/data.py b/tests/data.py similarity index 100% rename from test/data.py rename to tests/data.py diff --git 
a/test/data/gencode.ucsc.small.gtf b/tests/data/gencode.ucsc.small.gtf similarity index 100% rename from test/data/gencode.ucsc.small.gtf rename to tests/data/gencode.ucsc.small.gtf diff --git a/test/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa b/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa similarity index 100% rename from test/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa rename to tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa diff --git a/test/data/mouse.ensembl.81.partial.ENSMUSG00000017167.gtf b/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.gtf similarity index 100% rename from test/data/mouse.ensembl.81.partial.ENSMUSG00000017167.gtf rename to tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.gtf diff --git a/test/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep b/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep similarity index 100% rename from test/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep rename to tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep diff --git a/test/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa b/tests/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa similarity index 100% rename from test/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa rename to tests/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa diff --git a/test/data/refseq.ucsc.small.gtf b/tests/data/refseq.ucsc.small.gtf similarity index 100% rename from test/data/refseq.ucsc.small.gtf rename to tests/data/refseq.ucsc.small.gtf diff --git a/test/test_contigs.py b/tests/test_contigs.py similarity index 100% rename from test/test_contigs.py rename to tests/test_contigs.py diff --git a/test/test_download_cache.py b/tests/test_download_cache.py similarity index 100% rename from test/test_download_cache.py rename to tests/test_download_cache.py diff --git a/test/test_ensembl_gtf.py b/tests/test_ensembl_gtf.py similarity index 100% rename from test/test_ensembl_gtf.py rename to tests/test_ensembl_gtf.py diff --git a/test/test_ensembl_object_properties.py b/tests/test_ensembl_object_properties.py similarity index 100% rename from test/test_ensembl_object_properties.py rename to tests/test_ensembl_object_properties.py diff --git a/test/test_exon_id.py b/tests/test_exon_id.py similarity index 100% rename from test/test_exon_id.py rename to tests/test_exon_id.py diff --git a/test/test_exon_object.py b/tests/test_exon_object.py similarity index 100% rename from test/test_exon_object.py rename to tests/test_exon_object.py diff --git a/test/test_gene_ids.py b/tests/test_gene_ids.py similarity index 100% rename from test/test_gene_ids.py rename to tests/test_gene_ids.py diff --git a/test/test_gene_names.py b/tests/test_gene_names.py similarity index 100% rename from test/test_gene_names.py rename to tests/test_gene_names.py diff --git a/test/test_gene_objects.py b/tests/test_gene_objects.py similarity index 100% rename from test/test_gene_objects.py rename to tests/test_gene_objects.py diff --git a/test/test_id_length.py b/tests/test_id_length.py similarity index 100% rename from test/test_id_length.py rename to tests/test_id_length.py diff --git a/test/test_locus.py b/tests/test_locus.py similarity index 100% rename from test/test_locus.py rename to tests/test_locus.py diff --git a/test/test_missing_genome_sources.py b/tests/test_missing_genome_sources.py similarity index 100% rename from test/test_missing_genome_sources.py rename to tests/test_missing_genome_sources.py diff --git 
a/test/test_mouse.py b/tests/test_mouse.py similarity index 100% rename from test/test_mouse.py rename to tests/test_mouse.py diff --git a/test/test_release_versions.py b/tests/test_release_versions.py similarity index 100% rename from test/test_release_versions.py rename to tests/test_release_versions.py diff --git a/test/test_search.py b/tests/test_search.py similarity index 100% rename from test/test_search.py rename to tests/test_search.py diff --git a/test/test_sequence_data.py b/tests/test_sequence_data.py similarity index 100% rename from test/test_sequence_data.py rename to tests/test_sequence_data.py diff --git a/test/test_serialization.py b/tests/test_serialization.py similarity index 100% rename from test/test_serialization.py rename to tests/test_serialization.py diff --git a/test/test_shell.py b/tests/test_shell.py similarity index 100% rename from test/test_shell.py rename to tests/test_shell.py diff --git a/test/test_string_representation.py b/tests/test_string_representation.py similarity index 100% rename from test/test_string_representation.py rename to tests/test_string_representation.py diff --git a/test/test_timings.py b/tests/test_timings.py similarity index 100% rename from test/test_timings.py rename to tests/test_timings.py diff --git a/test/test_transcript_ids.py b/tests/test_transcript_ids.py similarity index 100% rename from test/test_transcript_ids.py rename to tests/test_transcript_ids.py diff --git a/test/test_transcript_objects.py b/tests/test_transcript_objects.py similarity index 100% rename from test/test_transcript_objects.py rename to tests/test_transcript_objects.py diff --git a/test/test_transcript_sequences.py b/tests/test_transcript_sequences.py similarity index 100% rename from test/test_transcript_sequences.py rename to tests/test_transcript_sequences.py diff --git a/test/test_transcript_support_level.py b/tests/test_transcript_support_level.py similarity index 100% rename from test/test_transcript_support_level.py rename to tests/test_transcript_support_level.py diff --git a/test/test_ucsc_gtf.py b/tests/test_ucsc_gtf.py similarity index 100% rename from test/test_ucsc_gtf.py rename to tests/test_ucsc_gtf.py From 82b52bc9f3951c1eb232b76a18881f251e857f0a Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 20:57:48 -0600 Subject: [PATCH 20/35] format code --- pyensembl/download_cache.py | 19 ++++++------------- pyensembl/ensembl_url_templates.py | 20 +++++--------------- pyensembl/species.py | 11 +++-------- setup.py | 1 + 4 files changed, 15 insertions(+), 36 deletions(-) diff --git a/pyensembl/download_cache.py b/pyensembl/download_cache.py index 48ebd00..47a0766 100644 --- a/pyensembl/download_cache.py +++ b/pyensembl/download_cache.py @@ -151,10 +151,7 @@ def _fields(self): ) def __eq__(self, other): - return ( - other.__class__ is DownloadCache - and self._fields() == other._fields() - ) + return other.__class__ is DownloadCache and self._fields() == other._fields() def __hash__(self): return hash(self._fields()) @@ -216,14 +213,10 @@ def cached_path(self, path_or_url): # if we expect the download function to decompress this file then # we should use its name without the compression extension if self.decompress_on_download: - local_filename = self._remove_compression_suffix_if_present( - local_filename - ) + local_filename = self._remove_compression_suffix_if_present(local_filename) if len(local_filename) == 0: - raise ValueError( - "Can't determine local filename for %s" % (path_or_url,) - ) + raise ValueError("Can't determine local 
filename for %s" % (path_or_url,)) return join(self.cache_directory_path, local_filename) @@ -326,9 +319,9 @@ def delete_cached_files(self, prefixes=[], suffixes=[]): """ if isdir(self.cache_directory_path): for filename in listdir(): - delete = any( - [filename.endswith(ext) for ext in suffixes] - ) or any([filename.startswith(pre) for pre in prefixes]) + delete = any([filename.endswith(ext) for ext in suffixes]) or any( + [filename.startswith(pre) for pre in prefixes] + ) if delete: path = join(self.cache_directory_path, filename) logger.info("Deleting %s", path) diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index dc2b4da..4fcf774 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -38,9 +38,7 @@ DATABASE_FASTA_SUBDIR_TEMPLATE = ( "/pub/release-%(release)d/%(database)s/fasta/%(species)s/%(type)s/" ) -DATABASE_GTF_SUBDIR_TEMPLATE = ( - "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" -) +DATABASE_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" # GTF annotation file example: Homo_sapiens.GTCh38.gtf.gz GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz" @@ -54,15 +52,11 @@ # ncRNA FASTA file for releases before (and including) Ensembl 75 # example: Homo_sapiens.NCBI36.54.ncrna.fa.gz -OLD_FASTA_FILENAME_TEMPLATE_NCRNA = ( - "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" -) +OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" # cDNA & protein FASTA file for releases after Ensembl 75 # example: Homo_sapiens.GRCh37.cdna.all.fa.gz -NEW_FASTA_FILENAME_TEMPLATE = ( - "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" -) +NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" # ncRNA FASTA file for releases after Ensembl 75 # example: Homo_sapiens.GRCh37.ncrna.fa.gz @@ -105,9 +99,7 @@ def make_gtf_url(ensembl_release, species, server=None, database=None): server = ENSEMBL_FTP_SERVER else: server = ENSEMBLGENOME_FTP_SERVER - ensembl_release, species, _ = normalize_release_properties( - ensembl_release, species - ) + ensembl_release, species, _ = normalize_release_properties(ensembl_release, species) if database is None: subdir = GTF_SUBDIR_TEMPLATE % { "release": ensembl_release, @@ -119,9 +111,7 @@ def make_gtf_url(ensembl_release, species, server=None, database=None): "database": database, "species": species, } - filename = make_gtf_filename( - ensembl_release=ensembl_release, species=species - ) + filename = make_gtf_filename(ensembl_release=ensembl_release, species=species) return server + subdir + filename diff --git a/pyensembl/species.py b/pyensembl/species.py index 0998e5e..1be8180 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -33,9 +33,7 @@ class Species(Serializable): _reference_names_to_species = {} @classmethod - def register( - cls, latin_name, synonyms, reference_assemblies, database=None - ): + def register(cls, latin_name, synonyms, reference_assemblies, database=None): """ Create a Species object from the given arguments and enter into all the dicts used to look the species up by its fields. 
@@ -86,9 +84,7 @@ def all_species_release_pairs(cls): for release in range(release_range[0], release_range[1] + 1): yield species_name, release - def __init__( - self, latin_name, synonyms=[], reference_assemblies={}, database=None - ): + def __init__(self, latin_name, synonyms=[], reference_assemblies={}, database=None): """ Parameters ---------- @@ -109,8 +105,7 @@ def __init__( for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" - % i + "Ensembl release %d already has an associated genome" % i ) self._release_to_genome[i] = genome_name diff --git a/setup.py b/setup.py index 45dc0a4..65dee28 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ # limitations under the License. from __future__ import print_function + import os import re From 6b6d8db5a357aa6028a00d304c6c46d671b7dc75 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 21:33:36 -0600 Subject: [PATCH 21/35] update config --- pyensembl/config.py | 163 ++++++++++++++ pyensembl/ensembl_release.py | 11 +- pyensembl/ensembl_release_versions.py | 26 ++- pyensembl/shell.py | 16 +- pyensembl/species.py | 205 ++---------------- ...e.ensembl.81.partial.ENSMUSG00000017167.db | Bin 0 -> 249856 bytes ...bl.81.partial.ENSMUSG00000017167.fa.pickle | Bin 0 -> 3736 bytes ...l.81.partial.ENSMUSG00000017167.pep.pickle | Bin 0 -> 2850 bytes 8 files changed, 210 insertions(+), 211 deletions(-) create mode 100644 pyensembl/config.py create mode 100644 tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.db create mode 100644 tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa.pickle create mode 100644 tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep.pickle diff --git a/pyensembl/config.py b/pyensembl/config.py new file mode 100644 index 0000000..3dfd54a --- /dev/null +++ b/pyensembl/config.py @@ -0,0 +1,163 @@ +# TODO: save the config in YMAL file, or TOML file? 
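For orientation, the SPECIES_DATA list defined below is plain data; a loop added to species.py later in this patch is expected to feed each entry to Species.register and bind the first synonym as a module-level name, so existing imports keep working. A hedged sketch of that mapping, assuming the first entry below:

    # illustrative only: how one entry below maps onto the registry
    entry = SPECIES_DATA[0]  # the homo_sapiens dict
    assert entry["latin_name"] == "homo_sapiens"
    assert entry["reference_assemblies"]["GRCh38"] == (76, MAX_ENSEMBL_RELEASE)
    # species.py then does, roughly:
    #   globals()[entry["synonyms"][0]] = Species.register(**entry)
    # so `from pyensembl.species import human` continues to resolve.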
+ +MIN_ENSEMBL_RELEASE = 54 +MAX_ENSEMBL_RELEASE = 110 +MIN_ENSEMBLGENOME_RELEASE = 50 +MAX_ENSEMBLGENOME_RELEASE = 57 + + +SPECIES_DATA = [ + { + "latin_name": "homo_sapiens", + "synonyms": ["human"], + "reference_assemblies": { + "NCBI36": (54, 54), + "GRCh37": (55, 75), + "GRCh38": (76, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "mus_musculus", + "synonyms": ["mouse", "house mouse"], + "reference_assemblies": { + "NCBIM37": (54, 67), + "GRCm38": (68, 102), + "GRCm39": (103, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "canis_familiaris", + "synonyms": ["dog"], + "reference_assemblies": {"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "felis_catus", + "synonyms": ["cat"], + "reference_assemblies": { + "Felis_catus_6.2": (75, 90), + "Felis_catus_8.0": (91, 92), + "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "gallus_gallus", + "synonyms": ["chicken"], + "reference_assemblies": { + "Galgal4": (75, 85), + "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "rattus_norvegicus", + "synonyms": ["rat", "brown_rat", "lab_rat"], + "reference_assemblies": { + "Rnor_5.0": (75, 79), + "Rnor_6.0": (80, 104), + "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "macaca_fascicularis", + "synonyms": ["macaque", "Crab-eating_macaque"], + "reference_assemblies": { + "Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE) + }, + }, + { + "latin_name": "chlorocebus_sabaeus", + "synonyms": ["green_monkey", "african_green_monkey"], + "reference_assemblies": {"ChlSab1.1": (86, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "macaca_mulatta", + "synonyms": ["rhesus"], + "reference_assemblies": {"Mmul_10": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "oryctolagus_cuniculus", + "synonyms": ["rabbit"], + "reference_assemblies": {"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "meriones_unguiculatus", + "synonyms": ["gerbil"], + "reference_assemblies": {"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "mesocricetus_auratus", + "synonyms": ["syrian_hamster"], + "reference_assemblies": {"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "cricetulus_griseus_chok1gshd", + "synonyms": ["chinese_hamster"], + "reference_assemblies": {"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "heterocephalus_glaber_female", + "synonyms": ["naked_mole_rat"], + "reference_assemblies": { + "HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE) + }, + }, + { + "latin_name": "cavia_porcellus", + "synonyms": ["guinea_pig"], + "reference_assemblies": {"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "sus_scrofa", + "synonyms": ["pig"], + "reference_assemblies": {"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "danio_rerio", + "synonyms": ["zebrafish"], + "reference_assemblies": { + "Zv8": (54, 59), + "Zv9": (60, 79), + "GRCz10": (80, 91), + "GRCz11": (92, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "drosophila_melanogaster", + "synonyms": ["drosophila", "fruit fly", "fly"], + "reference_assemblies": { + "BDGP5": (75, 78), + "BDGP6": (79, 95), + "BDGP6.22": (96, 98), + "BDGP6.28": (99, 102), + "BDGP6.32": (103, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "caenorhabditis_elegans", + "synonyms": ["nematode", "C_elegans"], + "reference_assemblies": { + "WS200": (55, 57), + "WS210": (58, 60), + "WS220": (61, 66), + "WBcel235": (67, MAX_ENSEMBL_RELEASE), + }, + }, + { + "latin_name": "saccharomyces_cerevisiae", + 
"synonyms": ["yeast", "budding_yeast"], + "reference_assemblies": {"R64-1-1": (75, MAX_ENSEMBL_RELEASE)}, + }, + { + "latin_name": "oryza_sativa", + "synonyms": ["rice", "japanese_rice"], + "reference_assemblies": { + "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + }, + { + "latin_name": "arabidopsis_thaliana", + "synonyms": ["cress", "thale_cress"], + "reference_assemblies": { + "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + }, +] diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 8af2584..521e2d7 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -16,12 +16,9 @@ """ from weakref import WeakValueDictionary -from .ensembl_release_versions import MAX_ENSEMBL_RELEASE, check_release_number -from .ensembl_url_templates import ( - ENSEMBL_FTP_SERVER, - make_fasta_url, - make_gtf_url, -) +from .config import MAX_ENSEMBL_RELEASE # ENSEMBL_FTP_SERVER, +from .ensembl_release_versions import check_release_number +from .ensembl_url_templates import make_fasta_url, make_gtf_url from .genome import Genome from .species import check_species_object, human @@ -72,7 +69,7 @@ def __init__( release=MAX_ENSEMBL_RELEASE, species=human, server=None, - # ENSEMBL_FTP_SERVER, + # server=EMBL_FTP_SERVER,, ): self.release, self.species, self.server = self.normalize_init_values( release=release, species=species, server=server diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index 020c6e6..05d4a15 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -10,25 +10,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -MIN_ENSEMBL_RELEASE = 54 -MAX_ENSEMBL_RELEASE = 110 -MIN_ENSEMBLGENOME_RELEASE = 50 -MAX_ENSEMBLGENOME_RELEASE = 57 +from .config import ( + MAX_ENSEMBL_RELEASE, + MAX_ENSEMBLGENOME_RELEASE, + MIN_ENSEMBL_RELEASE, + MIN_ENSEMBLGENOME_RELEASE, +) -def check_release_number(release): +def check_release_number(release, database=None): """ - Check to make sure a release is in the valid range of - Ensembl releases. + Check to make sure a release is in the valid range of Ensembl releases. 
""" try: release = int(release) - except: + except ValueError: raise ValueError("Invalid Ensembl release: %s" % release) - - if release < MIN_ENSEMBL_RELEASE: + if database is None: + min_release = MIN_ENSEMBL_RELEASE + else: + min_release = MIN_ENSEMBLGENOME_RELEASE + if release < min_release: raise ValueError( "Invalid Ensembl releases %d, must be greater than %d" - % (release, MIN_ENSEMBL_RELEASE) + % (release, min_release) ) return release diff --git a/pyensembl/shell.py b/pyensembl/shell.py index 0a7a54f..546dfa9 100644 --- a/pyensembl/shell.py +++ b/pyensembl/shell.py @@ -47,11 +47,14 @@ import pkg_resources -from .ensembl_release import MAX_ENSEMBL_RELEASE, EnsemblRelease +from .config import MAX_ENSEMBL_RELEASE +from .ensembl_release import EnsemblRelease from .genome import Genome from .species import Species, normalize_species_name -logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf")) +logging.config.fileConfig( + pkg_resources.resource_filename(__name__, "logging.conf") +) logger = logging.getLogger(__name__) @@ -162,7 +165,14 @@ def collect_all_available_ensembl_releases(): for species_name in Species.all_registered_latin_names(): species = Species._latin_names_to_species[species_name] # print in tree format - print("* " + species_name + " (" + ", ".join(species.synonyms) + ")" + ":") + print( + "* " + + species_name + + " (" + + ", ".join(species.synonyms) + + ")" + + ":" + ) for ( release_name, release_range, diff --git a/pyensembl/species.py b/pyensembl/species.py index 1be8180..fe8f3b0 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -12,10 +12,7 @@ from serializable import Serializable -from .ensembl_release_versions import ( - MAX_ENSEMBL_RELEASE, - MAX_ENSEMBLGENOME_RELEASE, -) +from .config import SPECIES_DATA # TODO: replace Serializable with data class @@ -33,7 +30,9 @@ class Species(Serializable): _reference_names_to_species = {} @classmethod - def register(cls, latin_name, synonyms, reference_assemblies, database=None): + def register( + cls, latin_name, synonyms, reference_assemblies, database=None + ): """ Create a Species object from the given arguments and enter into all the dicts used to look the species up by its fields. 
@@ -84,7 +83,9 @@ def all_species_release_pairs(cls): for release in range(release_range[0], release_range[1] + 1): yield species_name, release - def __init__(self, latin_name, synonyms=[], reference_assemblies={}, database=None): + def __init__( + self, latin_name, synonyms=[], reference_assemblies={}, database=None + ): """ Parameters ---------- @@ -105,7 +106,8 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}, database=No for i in range(start, end + 1): if i in self._release_to_genome: raise ValueError( - "Ensembl release %d already has an associated genome" % i + "Ensembl release %d already has an associated genome" + % i ) self._release_to_genome[i] = genome_name @@ -198,186 +200,9 @@ def check_species_object(species_name_or_object): ) -human = Species.register( - latin_name="homo_sapiens", - synonyms=["human"], - reference_assemblies={ - "NCBI36": (54, 54), - "GRCh37": (55, 75), - "GRCh38": (76, MAX_ENSEMBL_RELEASE), - }, -) - -mouse = Species.register( - latin_name="mus_musculus", - synonyms=["mouse", "house mouse"], - reference_assemblies={ - "NCBIM37": (54, 67), - "GRCm38": (68, 102), - "GRCm39": (103, MAX_ENSEMBL_RELEASE), - }, -) - -dog = Species.register( - latin_name="canis_familiaris", - synonyms=["dog"], - reference_assemblies={"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)}, -) - -cat = Species.register( - latin_name="felis_catus", - synonyms=["cat"], - reference_assemblies={ - "Felis_catus_6.2": (75, 90), - "Felis_catus_8.0": (91, 92), - "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE), - }, -) - -chicken = Species.register( - latin_name="gallus_gallus", - synonyms=["chicken"], - reference_assemblies={ - "Galgal4": (75, 85), - "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE), - }, -) - -# Does the black rat (Rattus Rattus) get used for research too? 
-brown_rat = Species.register( - latin_name="rattus_norvegicus", - synonyms=["brown rat", "lab rat", "rat"], - reference_assemblies={ - "Rnor_5.0": (75, 79), - "Rnor_6.0": (80, 104), - "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE), - }, -) - -macaque = Species.register( - latin_name="macaca_fascicularis", - synonyms=["macaque", "Crab-eating macaque"], - reference_assemblies={ - "Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE), - }, -) - -green_monkey = Species.register( - latin_name="chlorocebus_sabaeus", - synonyms=["green_monkey", "african_green_monkey"], - reference_assemblies={ - "ChlSab1.1": (86, MAX_ENSEMBL_RELEASE), - }, -) - -rhesus = Species.register( - latin_name="macaca_mulatta", - synonyms=["rhesus"], - reference_assemblies={"Mmul_10": (75, MAX_ENSEMBL_RELEASE)}, -) - -rabbit = Species.register( - latin_name="oryctolagus_cuniculus", - synonyms=["rabbit"], - reference_assemblies={"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -gerbil = Species.register( - latin_name="meriones_unguiculatus", - synonyms=["gerbil"], - reference_assemblies={"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -syrian_hamster = Species.register( - latin_name="mesocricetus_auratus", - synonyms=["syrian_hamster"], - reference_assemblies={"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -chinese_hamster = Species.register( - latin_name="cricetulus_griseus_chok1gshd", - synonyms=["chinese_hamster"], - reference_assemblies={"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)}, -) - -naked_mole_rat = Species.register( - latin_name="heterocephalus_glaber_female", - synonyms=["naked_mole_rat"], - reference_assemblies={"HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -guinea_pig = Species.register( - latin_name="cavia_porcellus", - synonyms=["guinea_pig"], - reference_assemblies={"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)}, -) - -pig = Species.register( - latin_name="sus_scrofa", - synonyms=["pig"], - reference_assemblies={"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)}, -) - -zebrafish = Species.register( - latin_name="danio_rerio", - synonyms=["zebrafish"], - reference_assemblies={ - # "ZFISH7": (47, 53), - "Zv8": (54, 59), - "Zv9": (60, 79), - "GRCz10": (80, 91), - "GRCz11": (92, MAX_ENSEMBL_RELEASE), - }, -) - -fly = Species.register( - latin_name="drosophila_melanogaster", - synonyms=["drosophila", "fruit fly", "fly"], - reference_assemblies={ - "BDGP5": (75, 78), - "BDGP6": (79, 95), - "BDGP6.22": (96, 98), - "BDGP6.28": (99, 102), - "BDGP6.32": (103, MAX_ENSEMBL_RELEASE), - }, -) - -nematode = Species.register( - latin_name="caenorhabditis_elegans", - synonyms=["nematode", "C_elegans"], - reference_assemblies={ - # "WS180": (47, 49), - # "WS190": (50, 54), - "WS200": (55, 57), - "WS210": (58, 60), - "WS220": (61, 66), - "WBcel235": (67, MAX_ENSEMBL_RELEASE), - }, -) - -yeast = Species.register( - latin_name="saccharomyces_cerevisiae", - synonyms=["yeast", "budding_yeast"], - reference_assemblies={ - "R64-1-1": (75, MAX_ENSEMBL_RELEASE), - }, -) - -rice = Species.register( - latin_name="oryza_sativa", - synonyms=["rice", "japanese_rice"], - reference_assemblies={ - "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), - }, - database="plants", -) - - -cress = Species.register( - latin_name="arabidopsis_thaliana", - synonyms=["cress", "thale_cress"], - reference_assemblies={ - "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), - }, - database="plants", -) +for data in SPECIES_DATA: + globals()[data["synonyms"][0]] = Species.register( + latin_name=data["latin_name"], + synonyms=data["synonyms"], + 
reference_assemblies=data["reference_assemblies"], + ) diff --git a/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.db b/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.db new file mode 100644 index 0000000000000000000000000000000000000000..e2eba44f73986fcae19abbdd0794bdc1a8aaee76 GIT binary patch literal 249856 zcmeI5eRv$jo#*GZdqz@s>BXawttB2K8{9F$>KTn>U6#u_vLIfJjgkGtJaCzerLjGG z@<^7(<|Wy?)gJ$teQsIECJ*}%?l!Rdc%QxGF5L5E@Ak3|**pigFCi>}i!olX{lFdA zfL{RP%VXH*s=9mBzpm+#GJQYLl)qPQj{Y#ZKJv53-N8SPTp#QX9FPi}>Qs|!mj!27u{2h=w>*_E zO^;3$C(Ai2p4(X{6>_Eg?t;4~IdsSH+AYJW^&8g>Z%etloFlbqBiWl;Ia`!5(~?k= zo4L`9ztQQ!9$_*!UCvLHbA{5FYov2-<5f9jC!7Q~q^z=UNb$mr2k!4slgpL`_kW?B z-#$)i>Y82TnBB7WiyMYLwQ|$s_7$e4ixZ{Pmf>w%QX4n%zpWcKa7z<=r$!61O7e-i z%DZP_$vX=7PLyg{B4rlG?2KDCuHU?MICc9S>u+0o$DOHLhwr4G3BLR|dc3vS%VsTq zwihSL_f1y$IxXxv_S|X{XKLiLs#0;(&A6kGFYleIvNAe4Hl3?Z%@3ed1FP85-rd^^ zQ*tecPt$uRCnxyXHD1_P7?;yowXJbDt$o#gOioRd3&oO;hm}3P)8Ae#BwZ4JRhs)#Mj?f-}u_F6%NGofzY1hMmn}WnVwmjt-Cf3H>)e%fP9tKsZ63Ro6Hl(J zqZ>0ZrY1M>+ODcmTlY{{Z6h(=I*(~7XH~tW{l-j4SChALS9<1m#lxO0+!1~<%Or>! z#W7=2oBIejmjuC}&ttuE5y^HP-PqTnCWm;fJ8IP0N+;S)VpCg)O{=Z-wJP=)Gp)^P za)cWmsA0IOgCP#I%;P{+-YV?Az0+?ssmY$6;Qnr_cOv#ku-P$f_fKR;Y9%?8zAext z?x5Uu-||{&%N-!T?Q@M@1-%jf&fzwnGP=qh-*dqji7QDn5wv?B`(hMd3v*64Ae1K0 zSzet+oYX>f`;NO~N^&3({F-%1kqi}Aieo~I*DBXd<`(U0#)#_gveTH0qGPp+CLMw8 za-|0tSNON$MCDsC@tIsrqQMY*)LNxWXG_B(sU&J+c|AewFG(Ynhl$V%I6&OxGhJ0E#vheCHeV8 zt?%jL9Lt$}%SYGUEBIz{hsL)-_GEzn!v_RF00ck)1V8`;KmY_l00ck)1VG@@5x6VZ zsTo?grbXHerfCT+qG`<3v_fgRuzUM>ZdZO^zLXy-l}q_aQ&A7zWtx@S2mGO0v9xoQ z822l&mUaEZ8%J*2I&w>|^)Uy`{(&Z5uOK@a;Q#Oe0T2KI5C8!X009sH0T2KI5C8!X zxZ(*U!e+eI`2S3Row?#GhzfuJ2!H?xfB*=900@8p2!H?xfB*rZ z82?}0#}~B&0T2KI5C8!X009sH0T2KI5C8!|0ONn02Ot0fAOHd&00JNY0w4eaAOHd& zaP-2+S-Fb~2-&1$@)kbWq@_V5ipAOHd&00JNY z0w4eaAOHd&00JOzWf7Q3hB}$i)2*42HiH@3vZWD?F;mkrh0=6k_xADJuKd1yDetU; zCw>X<1npPk(^upv>d#M@=H%2wxlk=>*yI1B z0d|HR{giV<^+5mxKmY_l00ck)1V8`;KmY_l;7TRX8f}Y2f~G0vFo+omAtr>l5DwlY z<^2YqyRru0 z?EL?e{6_%zfB*=900@8p2!H?xfB*=900@8p2wW)y-1+~v^jaTt!0aC&^Z%)=vHut7 z2p$b+W9<2sQ!U?US=D^7c~^5&(+^@_Qh%V{r2M^dbM%MN^^u=N?hYOe{&}!Fa3C2lNM{8Twt zD3Qu`&TYIZr|g82;D(e{_6;dsxRF_;ByU+33|P)g?-?(a3%UH>@`TOXt$5CK!-)s( z?@*J=mIe2Jp`717PVl&C7CF9dS^LEe!yfo zg`aF5Z|wsnyKDKgy*N?6Z?dW?)55M}&#g9brba%iDiufFj5`YX^4_T`E2E=h)BIwQ z9pgt`s)1E(Y47gsg(*2P@o9SR3srIn4$G1FV zvRzGnp(i-gY@f2$Wx`*d!Vj{2&N|h$uODkihv&K>JG?o>@>cFj&-|`<*t3N@!p}aL1aYG{W=v{x9|7m=CK&X2tXHlF z*{-7-`&!iG5U+JdjapmjB=p)sY+9|OuT`lKn=rH9So^-%RCNL<*mZ* z*E>D%TTN=Rrzg0-+v;eDJrZnoOxs-z*^ydF4yA8na*6vIw+XkrmYQV;h}>>khrdP# zfsTcLeQ=vk8C_+M@3~-%#FeC(2-;tL?2A!+3C%g(fKZw|XL)rRaZ(G_?K|#{DanCE z@N3p3MKV-eDUJy-UaMR;nOn5086&E{%T8l1ijLJPnsfxV%atBrT;bn}6P0hp#Ak9f zi3UULQEQbhrFFfRb{IRN{2M>FW`E=Nw`N097GTS6&h7dTOS~PwGomE3iQql*fv_xA zsuQhcVm6gqky*(ohxzvdUWIPC?5ajjQI*=Y<=Zyy3aZIOBDg736%V(R@}e32A6?C4Il1bNJ%69QCz~*N2qk=M%LCiQ*i~nPMB}BlwuO zpdQN8|r8_O9~H$R9=4 zh5v8(;o$XwM+J$dZ~y9e%(Seg58tQD6O&@;h1q9^Rkz^W6n&oCcCPL`L#)avZr-`6 zQY(FL;qAQ9evZD_dAw0^$IY1W+|F}#+|>IN-PCiR6MfPUTdpG*G>RP2sX;YgpYpJUFa~WMevzE55upgLcR%8u*`%x}kjNX&imT%0>;wthxQW;%k?lF!y|hs(0LiWF(Qg$l)(yoS#Z;w~F!pZHnlJ2f+<~HbNk!`wrjPb>DR?=xm zU7No)w@uOzyLnP8EA8=2q)ScS#%p=K)#SKIoLUk=v?OkH$=qh$gjRbl*H_wY*w6pZ z2iQmK{I6~_kCH(E1V8`;KmY_l00ck)1V8`;KmY_T0f9(Z3x>jBEspX3C7=(JAOHd& z00JNY0w4eaAOHd&00JQJ&qF{O|DO`i|4;q%n1&S)009sH0T2KI5C8!X009sH0T2Lz zUo8PGyfiq(fBuus_GPoy^MCRC|E|FL0J~HFuX?jK)^f0=vw5QFZj$I?g}W2VV6h0=6k_xADJ zuKd1yDetU7C|*9`1cx_{+_rUOi@j7qZ^rB$RAS0`)2#e7ga7QmSlYSD8x_B6s#HXY z#k$gcX1`q!UERP|Znu{~7#z$DuF1@-NH);VXrL}XqpQ=I1^e0AKtIEe*X3vAk#u^& zekK~|XXrp(eug7ybHRQtYMh_y$N#+^Kh=L4NcS$-&yEKAdEvRb{QUSvb8x|awl~nv 
z50iEId0sKsEZEPs2KxCyM_qn?a7L`XvcPt!wSj(~9k0vJ_g0ww3-*&W(9hFH>+tcZXVlFmz&4VEx0pUt~AiiSD&uS&DZW(a9^}M+CVpte7!C=|M8Xu zcSXx14RrIBhw5_k$h8aZiI#^O=;q6t>T>gN>w-I?<)H?;dFYY4+&pxC!Tr$kU<2KJ z@h5e;`I5TOZb;1k@6wCn{r^YV&)7lsUG^AzkiP|R7yB~1nPu5>*1n*s7J1ixWdbu6!max*cA>vqNs@V5(*mu}B*?#uhY!552JK1e)Ex!|BuxnTwi|W7B-_@~Jpx-$T4Fo^{1V8`;KmY_l z00ck)1VG@@5oiu~R~EeMBH<(ncam^|gcp%;2MM>6a2pA?k}xA-orEl9yhnm9Ol{Epa@xOLBz)rFk*U)KF)&0lWumz(|NCVx5ZFUS04)n8Wp<*2_L z@t4E?awritv{--h-|5KTa|EWyI|5PU9e=3vlKb6V& zpUPzXPh~Rxr!pD;Q<;qasZ7TIR3_tpDwFX)mF@ZeGXZv%9p|(E|C&9;=l(y;X86qi zJlo9Y{hMqVpY6Q7sJ2m&Ag0w4eaAOHd&00JNY0wD0| zB*4NwmHGMNo$y^!JSoLHrFcS$FOuRNQoLP?w@L99%qlTZ}8s;?EiG0tf(~zfB*=900@8p2!H?xfB*=900>+pfk;>j zc8N_=Y&yjzAvTM|rbBGn#imVcTE&KmjV?Bt*tCdEv)D9=OD zY^?SFF#rD|j~lXs00@8p2!H?xfB*=900@AVnnauyE zGMWERWitPt%4GgOmC5{nDwFyDR3`KPsZ8eoQ<=>Fr!txUPh~RypUPzZKb6V+e=3vt z|5PUP|EWyo|5KUF|EDsU|4(Hy|DVca{y&xN`TtJ^*!%4NutV%``P2UIvH!?_0r2nG zUbd6J`~OyU6I;!H1JJ?W{C{3Qp&!wo<8K6f>QbNHunz(t00JNY0w4eaAOHd&00JOz zl@rjyOM^p0>qgSqzHGM36H9txot{|26I-eF*6N8dPfYj3G*7I>6KnRw znmn<%Cl>R>R8LIt#G;;9#1jj9VxgAs()cXFC_i0-?1DIHKFcm>GcAW(%;x<~$D0P@ z564c#)~b&xrMD8a5UY=9W}j<@-;2f zz5b4NHqy}x&(-0mAev1}j(&WjIq2_bqLGe%n5@&$zBQ7g=M{5}zoUy9>F5U?bvT;I z^k>sD(GSj;1OAS7G}6(t<8?S{T}=HF(f3xE{r--&H`3A5N9%O7PrjJWe8tTAJKEMr zM^Bxq*HQV@od&rWGx+qwJIS_3E3v1NV0BbM*C@ z!GVEZndqC(_;*Yznm?hRX`k;T>g(UC!_RbIrq`5D(KjCR@0V6IeL_F|zB{SMcGTf# ze?RZ)d!;k<*g5}hX+^w|e!lv29e#>#sZZ{fUc1M?S6UHkq@PE=Uaz0hcTv7>w|}R! zLT#jh4oS9xTnC`clOzzyZX@BQ}jd1SZU4BP)F6Vj&`g_-`v5v2zKCPLOb9@MIbZLCs z#fS53RDV`CF#f-o8x5i z&{d58XE6Ss_fo?6f4*l?HO4RG(G(rISobO9aW%&O)?>Qf48P@|s2Kla{694@_b3+Q ze{tue;p2btxN$&?|HFDTz`m^iXZ{~PAOHd&00JNY0w4eaAOHd&00LJtftl;NT&ig&7u5ph{+;dzb-6M;&tV{b(KDlnmCv?C9v z_Md%n-Qq?%8h*S^NBg8%ak0oFY2WE5;`jgEdVheOVMp1|*g^iD|Hs&a>^`=OeVN_N zvTQm3<$s8;{&!4&UjLE)J^kDI@9FpJMSYuo3(tlR2!H?xfB*=900@8p2!H?xfWTEv zpf%bSi3Ckk;bVP;kJh_gAsSHlsCbDhgdbP<=yS0vgbs)wWOvO9eA=|afB2kqg$vIq z>c=;By26J^MLn-1T;YQbMg8E+B3C#&uBh*==x~M8M-}zVSK3|S)G0+h{a~9by!)i0 zo+`Dv!rR9c^_^|Z6;57I)VB+|E4=lrqMq2Jxx$;hiu&d=Ew1qTw-oh_$C_Q?*bYTK zcCN`4UVU0oU%Myn3P--K@E--myuy*6hF#&6hs2KyR9OhQ!poZ!^>C}=3Wpw1)I;Z^ zuJH2{iu#foVdBWe`H|!E3B->A?D7A>06WJ{uvgg&e8s=NS6e(n2cZX)40 z3CBoSC1Hhxqa++5;V=n@n!??2hnqG2*ZwfTPO=x-U$H-Bzt8SrUuElAnsu`XAL;*s zzwQ6e^oRK~|51H|p4C62tJ)duu=W$}Kk;n%fB*=900@8p2!H?xfB*=900>+J0e9)X zTdY@__gcTy?)I0L_{)p^WwQPsEt{;F-itp7)4vi={H$@+g(ChPxEnXLauWwQPsmC5>lR3_{HQJJj&M`g19AC<}ae^e&x z|52H&|3_uA{vVZR$NwkT&-vPaf5QGf+sAV3OUz_TSXh6DKj;6Een9^N{XTuWeyiT6 zU!yD9Y3*f<|1aVKM^+F30T2KI5C8!X009sH0T6HqxI=g6C9c-H^U1i5%4A$e<*G8Q z_brlp2=x81DwBH% zR3`Tjs7&r5P?_9Apfb6KKxJ|dfy(3_0+q=<1S*qz2vjEb5U5P-8_I>up><`%Qut`1(;CA*Kte2%&D+}mn_!|KJLI1J-ef^L5-GJZG_vm?j zGcH*W009sH0T2KI5C8!X009sHfln;~7VfFc$>;03r^8Y_q=$PdbI-+o?{2eOit}gC zv^aCI6z`JaNh#hb#S>C|kreNc;_XtrO^UZlaVEueDXvNJ7Af8=#havfT#Cn}xGKdJ zDIS&L_V{1S|Ht$HPwgd(ih}?MfB*=900@8p2!H?xTsi{ov-@FdsYH8?`B0TSFvyos zR1;w{P8XfD7k!r!_YPQ#=#e=DR?)roil5|pv@5R}B(nuI8x*a*wpG~L9 zqkCIU_howf`^YSHSI(x%V`^76v*dw!MNSW9Oft8?maUiZuOScPiQG@7GY~mLp6C-f zO(rN1*(49lUAb?NJd7vu8Zw=M$o=GrK9T##Oavll$b1VTr^$2%BAaCLgDYnS$?Oaw z_mdeEM9wnn`9Hfqz&>K<*?a6=_7;1cy~Yl+m)P^{@A%sQf6RXs@E7bUzAnH4_5^#J zJ<7hp-x08%J;44g&xQ{OfB*=900@8p2!H?xfB*=900@A+$`*6xY5d19@e81uw*PfYW~T0F64Ppruki+f@*PfYd16i+Pb ziAm#sjUS018w>D%_<#TifB*=900@8p2!H?xfB*=900>+Jftkg@PG_VL`V{JwlC?<~p3UyEQmLHlf;Ix$u( z?Of%JPM0Sp#h!`M>}?aCt2^Ci_S@yj@Ba_7GyMI37g0fE1pyEM0T2KI5C8!X009sH z0T2KI5V))euxMK(5;sk8A3@v|5cB_o>|B8V!v_RF00ck)1V8`;KmY_l00ck)1VG?w zBG44>uB^cz=Kr@Vg#deyt>(Y$pVA}Rf6+F#eAM(vQ#k%md|B)}u_fwv)vWTrcp`j2 z00ck)1VEr30$&|al9^1fSk7-BFXYDZ<@{)VbXOs_yHL)Hn2&VR&>h2Tw+yGYto`DK 
z;gnB;)XIH@sp;ZGDYbs%7Gu0$P2RLDIHMLzV}*O&8E`o(o*NwaOb;@59n zH@q$7>fZFJO&iI+)JmrqV{cYX4smN8{?;nf@u532dO>loS+Ebqu{k^_jv3?ToX~d`N(B$}fw?U^>1CUaY)tj4$+f(;%)ho)GB54y z0_=$#z81v}W2R-bn%vBdX8es#7xsvBx#@C#s+=p7$YsGCS{SJ+oLH`r zuX626UhOF4%X_D)tT^qS>=-{3FHFgaiBHpeCnqQPsWe{LR~VPmS+%V} z)Y@0g(rqH;!YX@w%QHqkrzFipaKbw6Z6c!gsPq;>NqbK4>NMhn7OLAf+b{M=t<<{)3+TWTD>Ba@vvYT&Fis$0z4fn`Oe^ky%B(i+2mVwHN zHhXj(!MO8!C7DeG?~#wLwZFpXhB=G1+81xZLNdzNsmWo^rCTn$sudNkuc~FEDz$qg z{90T4`TxlPJ6X>cI-~^w5C8!X009sH0T2KI5C8!X009uVJPEk#L{|U)ALIYa^O(Ua z5C8!X009sH0T2KI5C8!X009uFmw-F|r|%QApZ}i?u(S1+hx8x-0w4eaAOHd&00JNY z0w4eaAOHfFEdf2;Q~B+`SZKf=|9=!Q{!!Zy50T2KI5C8!X009sH0T2KI5CDOT zAP^2~ag6^j0wA)200@8p2!H?xfB*=900@8p2!Oy9K)@aUzaL=lUjYuoMGyc15C8!X z009sH0T2KI5C8!X0D*-PXbCTk4-KswNoV`AS+V|KkX;DyfB1j^2!H?xfB*=900@8p U2!H?xfB* z_92U9p(eMr)14)S8zU=+XF7AhwKic%FQWe)u6eYzDRw~R!EB#)Igx3{=LD06<}@2k z7?l#?5U$WPW&FfytuxDU$ za_de>G{HiTQpmSbx>BK(*Ej#Zw{PFSd;8(_%TIrOeY^32-RJ%PpQ~3ZpPBTZ1U|S2 zYa(sr9d$L4+yKX?DLB#6YPkN_k9d3QESj7AWGNq+c1$2)q#K*5Vc-`3c%tHQ)`csp zz>*5C!s)_B#^xQuZ5%!TK+qEQ4zpe|%J25`yP zuXPG&dA5zalb9)s4+zhOqS*tTx20!=f`Cd-|61WtVO^+RR@+TqzvK&?JD>((%BWq0 zTx5y&>a?7KE9MzkKNbGZe@l@}Ce1DN7*pT$nF3C&5YdwYliL~jEnw)J5TmFu$i-@p zImp7#6iaP+@D58Mw`!W@$eClfBfvcg6Nm9l2BM1SC>;b+E`A(J$Jn}s z%McsY!SQQiJjQikRp2%aa3ek?w9+;%1$%C6^ZCdq7H#s2a`TDTzC z$|;zNK@zuwjRP4Yl%F_YG{p47TT!MR!JD7f#$LYwld+C8(Dzsy4b6)ODTOhN+J_6G z?KURhAYRpt*m)<%s41QQlps>57w0>9%A-eAdas_UDx@owNmM}){i5pX*{6EY)ehrZ zKe6MD0F9^slG4}niNejIK)kWdLmQ3lE!5YRP69*8g49nE2!#jKAi+-~q@UWIP!K5k zg=JYO%0xtyiblgi>sc>_dLEZUX(65&Lx{=f7eJflCCZUPY86lLGE=e&&5|XpSt#Ws z^SzYL53=JOXn+ax{*05zjfp>s>*{#PuU0Cq7ZR4L3J0msR=7@vY7bwvqst`6PwEk> z(dpA&B~8jDULS#HWB_xNqW5PTrzYyokAc?n1i>>SG-?1!qG_$|EaA!iOwek0x`~~< zXQJw4Zj4*Uz>7Um2Xv0P^RsO^#m~paxO=>}gLPhJ z&tAOTAC1%&^Exg&!(ht#>xGBN8kCPbl~OvKj#I5|NAd1>8J5p~Wh7SCBU$c=R?us( zA}Q`!1_|rs+|)tTeX3#^*Q@P^3W-qxkvg;dNf6sFmFtW)5<;%1Y>thBCmXK&a{wz8 zHKn1_R%THmQTI&APPVOj6lFi#>@YITQj&%mTlLv;gcYX8HPa2352`vHLMEg4q=$hs bZmS#zi!63m%BFG3wHjm(CEd*#Izz6fy4EW{PyE%ZX z(WtB+`S^a3|NQ3n*V|t|zJGgrqiNUQO+S5keE$67=U-lb`1r?nZ*SLSvZY!VNgiRG zZoc^(&XY)jRZiL_oXqg1RQI%N+S0fi-=@2B(W$qrUgR31?Y>haU!3Z+yWHXXTq1|d zj$*fKJ!^2A$!p~qvsjAQtw=kqV)4^=50!LJ-8?!29e2Rd;|7$c7A_@qJG-t8Sj*%t zQB$=#oM;plk>iKDWa_N807xd&cz$Sh)aI{|$A@Gfaqm%=BtR`Ds9 zN?WGTq+UBSNJ(jrDe3Gs>*m~PMn7}+*%MtUuH9^RuY~iZy0a7wZQ5sLR{SJZC|BK^ zx1_MT>-M{dKgg>!VVr?mkX&NKD`r?v%XyC>tER*Ucna6MQDC*SXtosHdMQDi2q&?l zB7Hl%9q7TD(+!V7>@BzZ4u>39*;?VRzRo?9 z-9%wY5-wNEwC+yT@|m_prYGj3$LigA!=J0wb2F4y?rZ0iS<12;MU%8Vb4g|Cy0@1B zZoNinqTwqdxcJ;Qjnb-BwhE31i^8(gyVd3pS$0*0oo6#C(CI)YOKKbyZnz+3&HLV4 zAMX(0*)H)|v2=!Q2&_gkFh+HjLU)tciTaGOkRaHtZM_4VUnaBnsLGLG@fR3E5i~sMUF)p?!ohLE>{nBQQGa=n06hdb{!AS~4Jj`nBau5OOg9^JW z*=Gq{2##<8(ZN3~r_Ew^Sm5j_wXZ0rsyMAtO$0@;q?nE*M|j5t2ja;(ZAM zaAeK4;c+|7CjpDMVojL`HdMGk)55{kpZ_Iao}btF!)L6!tSP#>5ZN|*ee-n@e|mp^ z|MdFykFAnGgeLj0*@M~kQ?Qa42@_u3DhCj)+d@66$)GUJv+K+BFuaE@t6xI z9dc-*b`e%87aa*u={dJ~tPfXMv%vYQYn@beNQVcm8>LHb16sPnA6@wZMQ9dDV|MZZ zYNgWgg(jYcT6R3pa(FACiBOSYRtUC2g-C{)2mX!H+l7Q286uR?2RK^^NQh-oxc2(v z*W~>4{`30}ukSwo^)1LLhz6v&feh&A89s~SBvlX0i-x^Z21I)NPq3AvZ94RRAK%Q}QrZi~Pb7Xw*igyfT?BqN!lX{oib}EAcDMe#k>moZ(bwE}OzD<|$b#`8E z>rAd_Pu-Lxmh@OhQ9Y!HF7>^5m2Kgd<&9cR2jv~&hi=?MOz??$xGK5@c15MZs$d&A zfstmm-X|qaC{aTh0A7N<2r3spQ{!H9HyF!~$+~@AQ%k}fLyu?fF?LV9*1m6A)CUp@ zVh1Xyx~jf4GMxjw+fnZBhyj)+yNPFiAKQM*ea*73W{$`!)-(g?h*}CtZG14_y9Qe ge*NjIZ2$1%&+ngpdHwn0x2O32PuYI`<=4Oc2he0q4*&oF literal 0 HcmV?d00001 From 3f78d05b26642d0966520815d9c2b02367610c96 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 21:34:55 -0600 Subject: [PATCH 22/35] quick update --- 
...se.ensembl.81.partial.ENSMUSG00000017167.db | Bin 249856 -> 0 bytes ...mbl.81.partial.ENSMUSG00000017167.fa.pickle | Bin 3736 -> 0 bytes ...bl.81.partial.ENSMUSG00000017167.pep.pickle | Bin 2850 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.db delete mode 100644 tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa.pickle delete mode 100644 tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep.pickle diff --git a/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.db b/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.db deleted file mode 100644 index e2eba44f73986fcae19abbdd0794bdc1a8aaee76..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 249856 zcmeI5eRv$jo#*GZdqz@s>BXawttB2K8{9F$>KTn>U6#u_vLIfJjgkGtJaCzerLjGG z@<^7(<|Wy?)gJ$teQsIECJ*}%?l!Rdc%QxGF5L5E@Ak3|**pigFCi>}i!olX{lFdA zfL{RP%VXH*s=9mBzpm+#GJQYLl)qPQj{Y#ZKJv53-N8SPTp#QX9FPi}>Qs|!mj!27u{2h=w>*_E zO^;3$C(Ai2p4(X{6>_Eg?t;4~IdsSH+AYJW^&8g>Z%etloFlbqBiWl;Ia`!5(~?k= zo4L`9ztQQ!9$_*!UCvLHbA{5FYov2-<5f9jC!7Q~q^z=UNb$mr2k!4slgpL`_kW?B z-#$)i>Y82TnBB7WiyMYLwQ|$s_7$e4ixZ{Pmf>w%QX4n%zpWcKa7z<=r$!61O7e-i z%DZP_$vX=7PLyg{B4rlG?2KDCuHU?MICc9S>u+0o$DOHLhwr4G3BLR|dc3vS%VsTq zwihSL_f1y$IxXxv_S|X{XKLiLs#0;(&A6kGFYleIvNAe4Hl3?Z%@3ed1FP85-rd^^ zQ*tecPt$uRCnxyXHD1_P7?;yowXJbDt$o#gOioRd3&oO;hm}3P)8Ae#BwZ4JRhs)#Mj?f-}u_F6%NGofzY1hMmn}WnVwmjt-Cf3H>)e%fP9tKsZ63Ro6Hl(J zqZ>0ZrY1M>+ODcmTlY{{Z6h(=I*(~7XH~tW{l-j4SChALS9<1m#lxO0+!1~<%Or>! z#W7=2oBIejmjuC}&ttuE5y^HP-PqTnCWm;fJ8IP0N+;S)VpCg)O{=Z-wJP=)Gp)^P za)cWmsA0IOgCP#I%;P{+-YV?Az0+?ssmY$6;Qnr_cOv#ku-P$f_fKR;Y9%?8zAext z?x5Uu-||{&%N-!T?Q@M@1-%jf&fzwnGP=qh-*dqji7QDn5wv?B`(hMd3v*64Ae1K0 zSzet+oYX>f`;NO~N^&3({F-%1kqi}Aieo~I*DBXd<`(U0#)#_gveTH0qGPp+CLMw8 za-|0tSNON$MCDsC@tIsrqQMY*)LNxWXG_B(sU&J+c|AewFG(Ynhl$V%I6&OxGhJ0E#vheCHeV8 zt?%jL9Lt$}%SYGUEBIz{hsL)-_GEzn!v_RF00ck)1V8`;KmY_l00ck)1VG@@5x6VZ zsTo?grbXHerfCT+qG`<3v_fgRuzUM>ZdZO^zLXy-l}q_aQ&A7zWtx@S2mGO0v9xoQ z822l&mUaEZ8%J*2I&w>|^)Uy`{(&Z5uOK@a;Q#Oe0T2KI5C8!X009sH0T2KI5C8!X zxZ(*U!e+eI`2S3Row?#GhzfuJ2!H?xfB*=900@8p2!H?xfB*rZ z82?}0#}~B&0T2KI5C8!X009sH0T2KI5C8!|0ONn02Ot0fAOHd&00JNY0w4eaAOHd& zaP-2+S-Fb~2-&1$@)kbWq@_V5ipAOHd&00JNY z0w4eaAOHd&00JOzWf7Q3hB}$i)2*42HiH@3vZWD?F;mkrh0=6k_xADJuKd1yDetU; zCw>X<1npPk(^upv>d#M@=H%2wxlk=>*yI1B z0d|HR{giV<^+5mxKmY_l00ck)1V8`;KmY_l;7TRX8f}Y2f~G0vFo+omAtr>l5DwlY z<^2YqyRru0 z?EL?e{6_%zfB*=900@8p2!H?xfB*=900@8p2wW)y-1+~v^jaTt!0aC&^Z%)=vHut7 z2p$b+W9<2sQ!U?US=D^7c~^5&(+^@_Qh%V{r2M^dbM%MN^^u=N?hYOe{&}!Fa3C2lNM{8Twt zD3Qu`&TYIZr|g82;D(e{_6;dsxRF_;ByU+33|P)g?-?(a3%UH>@`TOXt$5CK!-)s( z?@*J=mIe2Jp`717PVl&C7CF9dS^LEe!yfo zg`aF5Z|wsnyKDKgy*N?6Z?dW?)55M}&#g9brba%iDiufFj5`YX^4_T`E2E=h)BIwQ z9pgt`s)1E(Y47gsg(*2P@o9SR3srIn4$G1FV zvRzGnp(i-gY@f2$Wx`*d!Vj{2&N|h$uODkihv&K>JG?o>@>cFj&-|`<*t3N@!p}aL1aYG{W=v{x9|7m=CK&X2tXHlF z*{-7-`&!iG5U+JdjapmjB=p)sY+9|OuT`lKn=rH9So^-%RCNL<*mZ* z*E>D%TTN=Rrzg0-+v;eDJrZnoOxs-z*^ydF4yA8na*6vIw+XkrmYQV;h}>>khrdP# zfsTcLeQ=vk8C_+M@3~-%#FeC(2-;tL?2A!+3C%g(fKZw|XL)rRaZ(G_?K|#{DanCE z@N3p3MKV-eDUJy-UaMR;nOn5086&E{%T8l1ijLJPnsfxV%atBrT;bn}6P0hp#Ak9f zi3UULQEQbhrFFfRb{IRN{2M>FW`E=Nw`N097GTS6&h7dTOS~PwGomE3iQql*fv_xA zsuQhcVm6gqky*(ohxzvdUWIPC?5ajjQI*=Y<=Zyy3aZIOBDg736%V(R@}e32A6?C4Il1bNJ%69QCz~*N2qk=M%LCiQ*i~nPMB}BlwuO zpdQN8|r8_O9~H$R9=4 zh5v8(;o$XwM+J$dZ~y9e%(Seg58tQD6O&@;h1q9^Rkz^W6n&oCcCPL`L#)avZr-`6 zQY(FL;qAQ9evZD_dAw0^$IY1W+|F}#+|>IN-PCiR6MfPUTdpG*G>RP2sX;YgpYpJUFa~WMevzE55upgLcR%8u*`%x}kjNX&imT%0>;wthxQW;%k?lF!y|hs(0LiWF(Qg$l)(yoS#Z;w~F!pZHnlJ2f+<~HbNk!`wrjPb>DR?=xm 
zU7No)w@uOzyLnP8EA8=2q)ScS#%p=K)#SKIoLUk=v?OkH$=qh$gjRbl*H_wY*w6pZ z2iQmK{I6~_kCH(E1V8`;KmY_l00ck)1V8`;KmY_T0f9(Z3x>jBEspX3C7=(JAOHd& z00JNY0w4eaAOHd&00JQJ&qF{O|DO`i|4;q%n1&S)009sH0T2KI5C8!X009sH0T2Lz zUo8PGyfiq(fBuus_GPoy^MCRC|E|FL0J~HFuX?jK)^f0=vw5QFZj$I?g}W2VV6h0=6k_xADJ zuKd1yDetU7C|*9`1cx_{+_rUOi@j7qZ^rB$RAS0`)2#e7ga7QmSlYSD8x_B6s#HXY z#k$gcX1`q!UERP|Znu{~7#z$DuF1@-NH);VXrL}XqpQ=I1^e0AKtIEe*X3vAk#u^& zekK~|XXrp(eug7ybHRQtYMh_y$N#+^Kh=L4NcS$-&yEKAdEvRb{QUSvb8x|awl~nv z50iEId0sKsEZEPs2KxCyM_qn?a7L`XvcPt!wSj(~9k0vJ_g0ww3-*&W(9hFH>+tcZXVlFmz&4VEx0pUt~AiiSD&uS&DZW(a9^}M+CVpte7!C=|M8Xu zcSXx14RrIBhw5_k$h8aZiI#^O=;q6t>T>gN>w-I?<)H?;dFYY4+&pxC!Tr$kU<2KJ z@h5e;`I5TOZb;1k@6wCn{r^YV&)7lsUG^AzkiP|R7yB~1nPu5>*1n*s7J1ixWdbu6!max*cA>vqNs@V5(*mu}B*?#uhY!552JK1e)Ex!|BuxnTwi|W7B-_@~Jpx-$T4Fo^{1V8`;KmY_l z00ck)1VG@@5oiu~R~EeMBH<(ncam^|gcp%;2MM>6a2pA?k}xA-orEl9yhnm9Ol{Epa@xOLBz)rFk*U)KF)&0lWumz(|NCVx5ZFUS04)n8Wp<*2_L z@t4E?awritv{--h-|5KTa|EWyI|5PU9e=3vlKb6V& zpUPzXPh~Rxr!pD;Q<;qasZ7TIR3_tpDwFX)mF@ZeGXZv%9p|(E|C&9;=l(y;X86qi zJlo9Y{hMqVpY6Q7sJ2m&Ag0w4eaAOHd&00JNY0wD0| zB*4NwmHGMNo$y^!JSoLHrFcS$FOuRNQoLP?w@L99%qlTZ}8s;?EiG0tf(~zfB*=900@8p2!H?xfB*=900>+pfk;>j zc8N_=Y&yjzAvTM|rbBGn#imVcTE&KmjV?Bt*tCdEv)D9=OD zY^?SFF#rD|j~lXs00@8p2!H?xfB*=900@AVnnauyE zGMWERWitPt%4GgOmC5{nDwFyDR3`KPsZ8eoQ<=>Fr!txUPh~RypUPzZKb6V+e=3vt z|5PUP|EWyo|5KUF|EDsU|4(Hy|DVca{y&xN`TtJ^*!%4NutV%``P2UIvH!?_0r2nG zUbd6J`~OyU6I;!H1JJ?W{C{3Qp&!wo<8K6f>QbNHunz(t00JNY0w4eaAOHd&00JOz zl@rjyOM^p0>qgSqzHGM36H9txot{|26I-eF*6N8dPfYj3G*7I>6KnRw znmn<%Cl>R>R8LIt#G;;9#1jj9VxgAs()cXFC_i0-?1DIHKFcm>GcAW(%;x<~$D0P@ z564c#)~b&xrMD8a5UY=9W}j<@-;2f zz5b4NHqy}x&(-0mAev1}j(&WjIq2_bqLGe%n5@&$zBQ7g=M{5}zoUy9>F5U?bvT;I z^k>sD(GSj;1OAS7G}6(t<8?S{T}=HF(f3xE{r--&H`3A5N9%O7PrjJWe8tTAJKEMr zM^Bxq*HQV@od&rWGx+qwJIS_3E3v1NV0BbM*C@ z!GVEZndqC(_;*Yznm?hRX`k;T>g(UC!_RbIrq`5D(KjCR@0V6IeL_F|zB{SMcGTf# ze?RZ)d!;k<*g5}hX+^w|e!lv29e#>#sZZ{fUc1M?S6UHkq@PE=Uaz0hcTv7>w|}R! 
zLT#jh4oS9xTnC`clOzzyZX@BQ}jd1SZU4BP)F6Vj&`g_-`v5v2zKCPLOb9@MIbZLCs z#fS53RDV`CF#f-o8x5i z&{d58XE6Ss_fo?6f4*l?HO4RG(G(rISobO9aW%&O)?>Qf48P@|s2Kla{694@_b3+Q ze{tue;p2btxN$&?|HFDTz`m^iXZ{~PAOHd&00JNY0w4eaAOHd&00LJtftl;NT&ig&7u5ph{+;dzb-6M;&tV{b(KDlnmCv?C9v z_Md%n-Qq?%8h*S^NBg8%ak0oFY2WE5;`jgEdVheOVMp1|*g^iD|Hs&a>^`=OeVN_N zvTQm3<$s8;{&!4&UjLE)J^kDI@9FpJMSYuo3(tlR2!H?xfB*=900@8p2!H?xfWTEv zpf%bSi3Ckk;bVP;kJh_gAsSHlsCbDhgdbP<=yS0vgbs)wWOvO9eA=|afB2kqg$vIq z>c=;By26J^MLn-1T;YQbMg8E+B3C#&uBh*==x~M8M-}zVSK3|S)G0+h{a~9by!)i0 zo+`Dv!rR9c^_^|Z6;57I)VB+|E4=lrqMq2Jxx$;hiu&d=Ew1qTw-oh_$C_Q?*bYTK zcCN`4UVU0oU%Myn3P--K@E--myuy*6hF#&6hs2KyR9OhQ!poZ!^>C}=3Wpw1)I;Z^ zuJH2{iu#foVdBWe`H|!E3B->A?D7A>06WJ{uvgg&e8s=NS6e(n2cZX)40 z3CBoSC1Hhxqa++5;V=n@n!??2hnqG2*ZwfTPO=x-U$H-Bzt8SrUuElAnsu`XAL;*s zzwQ6e^oRK~|51H|p4C62tJ)duu=W$}Kk;n%fB*=900@8p2!H?xfB*=900>+J0e9)X zTdY@__gcTy?)I0L_{)p^WwQPsEt{;F-itp7)4vi={H$@+g(ChPxEnXLauWwQPsmC5>lR3_{HQJJj&M`g19AC<}ae^e&x z|52H&|3_uA{vVZR$NwkT&-vPaf5QGf+sAV3OUz_TSXh6DKj;6Een9^N{XTuWeyiT6 zU!yD9Y3*f<|1aVKM^+F30T2KI5C8!X009sH0T6HqxI=g6C9c-H^U1i5%4A$e<*G8Q z_brlp2=x81DwBH% zR3`Tjs7&r5P?_9Apfb6KKxJ|dfy(3_0+q=<1S*qz2vjEb5U5P-8_I>up><`%Qut`1(;CA*Kte2%&D+}mn_!|KJLI1J-ef^L5-GJZG_vm?j zGcH*W009sH0T2KI5C8!X009sHfln;~7VfFc$>;03r^8Y_q=$PdbI-+o?{2eOit}gC zv^aCI6z`JaNh#hb#S>C|kreNc;_XtrO^UZlaVEueDXvNJ7Af8=#havfT#Cn}xGKdJ zDIS&L_V{1S|Ht$HPwgd(ih}?MfB*=900@8p2!H?xTsi{ov-@FdsYH8?`B0TSFvyos zR1;w{P8XfD7k!r!_YPQ#=#e=DR?)roil5|pv@5R}B(nuI8x*a*wpG~L9 zqkCIU_howf`^YSHSI(x%V`^76v*dw!MNSW9Oft8?maUiZuOScPiQG@7GY~mLp6C-f zO(rN1*(49lUAb?NJd7vu8Zw=M$o=GrK9T##Oavll$b1VTr^$2%BAaCLgDYnS$?Oaw z_mdeEM9wnn`9Hfqz&>K<*?a6=_7;1cy~Yl+m)P^{@A%sQf6RXs@E7bUzAnH4_5^#J zJ<7hp-x08%J;44g&xQ{OfB*=900@8p2!H?xfB*=900@A+$`*6xY5d19@e81uw*PfYW~T0F64Ppruki+f@*PfYd16i+Pb ziAm#sjUS018w>D%_<#TifB*=900@8p2!H?xfB*=900>+Jftkg@PG_VL`V{JwlC?<~p3UyEQmLHlf;Ix$u( z?Of%JPM0Sp#h!`M>}?aCt2^Ci_S@yj@Ba_7GyMI37g0fE1pyEM0T2KI5C8!X009sH z0T2KI5V))euxMK(5;sk8A3@v|5cB_o>|B8V!v_RF00ck)1V8`;KmY_l00ck)1VG?w zBG44>uB^cz=Kr@Vg#deyt>(Y$pVA}Rf6+F#eAM(vQ#k%md|B)}u_fwv)vWTrcp`j2 z00ck)1VEr30$&|al9^1fSk7-BFXYDZ<@{)VbXOs_yHL)Hn2&VR&>h2Tw+yGYto`DK z;gnB;)XIH@sp;ZGDYbs%7Gu0$P2RLDIHMLzV}*O&8E`o(o*NwaOb;@59n zH@q$7>fZFJO&iI+)JmrqV{cYX4smN8{?;nf@u532dO>loS+Ebqu{k^_jv3?ToX~d`N(B$}fw?U^>1CUaY)tj4$+f(;%)ho)GB54y z0_=$#z81v}W2R-bn%vBdX8es#7xsvBx#@C#s+=p7$YsGCS{SJ+oLH`r zuX626UhOF4%X_D)tT^qS>=-{3FHFgaiBHpeCnqQPsWe{LR~VPmS+%V} z)Y@0g(rqH;!YX@w%QHqkrzFipaKbw6Z6c!gsPq;>NqbK4>NMhn7OLAf+b{M=t<<{)3+TWTD>Ba@vvYT&Fis$0z4fn`Oe^ky%B(i+2mVwHN zHhXj(!MO8!C7DeG?~#wLwZFpXhB=G1+81xZLNdzNsmWo^rCTn$sudNkuc~FEDz$qg z{90T4`TxlPJ6X>cI-~^w5C8!X009sH0T2KI5C8!X009uVJPEk#L{|U)ALIYa^O(Ua z5C8!X009sH0T2KI5C8!X009uFmw-F|r|%QApZ}i?u(S1+hx8x-0w4eaAOHd&00JNY z0w4eaAOHfFEdf2;Q~B+`SZKf=|9=!Q{!!Zy50T2KI5C8!X009sH0T2KI5CDOT zAP^2~ag6^j0wA)200@8p2!H?xfB*=900@8p2!Oy9K)@aUzaL=lUjYuoMGyc15C8!X z009sH0T2KI5C8!X0D*-PXbCTk4-KswNoV`AS+V|KkX;DyfB1j^2!H?xfB*=900@8p U2!H?xfB* z_92U9p(eMr)14)S8zU=+XF7AhwKic%FQWe)u6eYzDRw~R!EB#)Igx3{=LD06<}@2k z7?l#?5U$WPW&FfytuxDU$ za_de>G{HiTQpmSbx>BK(*Ej#Zw{PFSd;8(_%TIrOeY^32-RJ%PpQ~3ZpPBTZ1U|S2 zYa(sr9d$L4+yKX?DLB#6YPkN_k9d3QESj7AWGNq+c1$2)q#K*5Vc-`3c%tHQ)`csp zz>*5C!s)_B#^xQuZ5%!TK+qEQ4zpe|%J25`yP zuXPG&dA5zalb9)s4+zhOqS*tTx20!=f`Cd-|61WtVO^+RR@+TqzvK&?JD>((%BWq0 zTx5y&>a?7KE9MzkKNbGZe@l@}Ce1DN7*pT$nF3C&5YdwYliL~jEnw)J5TmFu$i-@p zImp7#6iaP+@D58Mw`!W@$eClfBfvcg6Nm9l2BM1SC>;b+E`A(J$Jn}s z%McsY!SQQiJjQikRp2%aa3ek?w9+;%1$%C6^ZCdq7H#s2a`TDTzC z$|;zNK@zuwjRP4Yl%F_YG{p47TT!MR!JD7f#$LYwld+C8(Dzsy4b6)ODTOhN+J_6G z?KURhAYRpt*m)<%s41QQlps>57w0>9%A-eAdas_UDx@owNmM}){i5pX*{6EY)ehrZ 
zKe6MD0F9^slG4}niNejIK)kWdLmQ3lE!5YRP69*8g49nE2!#jKAi+-~q@UWIP!K5k zg=JYO%0xtyiblgi>sc>_dLEZUX(65&Lx{=f7eJflCCZUPY86lLGE=e&&5|XpSt#Ws z^SzYL53=JOXn+ax{*05zjfp>s>*{#PuU0Cq7ZR4L3J0msR=7@vY7bwvqst`6PwEk> z(dpA&B~8jDULS#HWB_xNqW5PTrzYyokAc?n1i>>SG-?1!qG_$|EaA!iOwek0x`~~< zXQJw4Zj4*Uz>7Um2Xv0P^RsO^#m~paxO=>}gLPhJ z&tAOTAC1%&^Exg&!(ht#>xGBN8kCPbl~OvKj#I5|NAd1>8J5p~Wh7SCBU$c=R?us( zA}Q`!1_|rs+|)tTeX3#^*Q@P^3W-qxkvg;dNf6sFmFtW)5<;%1Y>thBCmXK&a{wz8 zHKn1_R%THmQTI&APPVOj6lFi#>@YITQj&%mTlLv;gcYX8HPa2352`vHLMEg4q=$hs bZmS#zi!63m%BFG3wHjm(CEd*#Izz6fy4EW{PyE%ZX z(WtB+`S^a3|NQ3n*V|t|zJGgrqiNUQO+S5keE$67=U-lb`1r?nZ*SLSvZY!VNgiRG zZoc^(&XY)jRZiL_oXqg1RQI%N+S0fi-=@2B(W$qrUgR31?Y>haU!3Z+yWHXXTq1|d zj$*fKJ!^2A$!p~qvsjAQtw=kqV)4^=50!LJ-8?!29e2Rd;|7$c7A_@qJG-t8Sj*%t zQB$=#oM;plk>iKDWa_N807xd&cz$Sh)aI{|$A@Gfaqm%=BtR`Ds9 zN?WGTq+UBSNJ(jrDe3Gs>*m~PMn7}+*%MtUuH9^RuY~iZy0a7wZQ5sLR{SJZC|BK^ zx1_MT>-M{dKgg>!VVr?mkX&NKD`r?v%XyC>tER*Ucna6MQDC*SXtosHdMQDi2q&?l zB7Hl%9q7TD(+!V7>@BzZ4u>39*;?VRzRo?9 z-9%wY5-wNEwC+yT@|m_prYGj3$LigA!=J0wb2F4y?rZ0iS<12;MU%8Vb4g|Cy0@1B zZoNinqTwqdxcJ;Qjnb-BwhE31i^8(gyVd3pS$0*0oo6#C(CI)YOKKbyZnz+3&HLV4 zAMX(0*)H)|v2=!Q2&_gkFh+HjLU)tciTaGOkRaHtZM_4VUnaBnsLGLG@fR3E5i~sMUF)p?!ohLE>{nBQQGa=n06hdb{!AS~4Jj`nBau5OOg9^JW z*=Gq{2##<8(ZN3~r_Ew^Sm5j_wXZ0rsyMAtO$0@;q?nE*M|j5t2ja;(ZAM zaAeK4;c+|7CjpDMVojL`HdMGk)55{kpZ_Iao}btF!)L6!tSP#>5ZN|*ee-n@e|mp^ z|MdFykFAnGgeLj0*@M~kQ?Qa42@_u3DhCj)+d@66$)GUJv+K+BFuaE@t6xI z9dc-*b`e%87aa*u={dJ~tPfXMv%vYQYn@beNQVcm8>LHb16sPnA6@wZMQ9dDV|MZZ zYNgWgg(jYcT6R3pa(FACiBOSYRtUC2g-C{)2mX!H+l7Q286uR?2RK^^NQh-oxc2(v z*W~>4{`30}ukSwo^)1LLhz6v&feh&A89s~SBvlX0i-x^Z21I)NPq3AvZ94RRAK%Q}QrZi~Pb7Xw*igyfT?BqN!lX{oib}EAcDMe#k>moZ(bwE}OzD<|$b#`8E z>rAd_Pu-Lxmh@OhQ9Y!HF7>^5m2Kgd<&9cR2jv~&hi=?MOz??$xGK5@c15MZs$d&A zfstmm-X|qaC{aTh0A7N<2r3spQ{!H9HyF!~$+~@AQ%k}fLyu?fF?LV9*1m6A)CUp@ zVh1Xyx~jf4GMxjw+fnZBhyj)+yNPFiAKQM*ea*73W{$`!)-(g?h*}CtZG14_y9Qe ge*NjIZ2$1%&+ngpdHwn0x2O32PuYI`<=4Oc2he0q4*&oF From 6d840b421d5bb6b266cb26acce19a7474a09404e Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 21:52:04 -0600 Subject: [PATCH 23/35] ensemblrelease suport --- pyensembl/config.py | 2 +- pyensembl/ensembl_release.py | 29 +++++++++++---------------- pyensembl/ensembl_release_versions.py | 2 ++ 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/pyensembl/config.py b/pyensembl/config.py index 3dfd54a..91ae5ed 100644 --- a/pyensembl/config.py +++ b/pyensembl/config.py @@ -157,7 +157,7 @@ "latin_name": "arabidopsis_thaliana", "synonyms": ["cress", "thale_cress"], "reference_assemblies": { - "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), + "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), }, }, ] diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 521e2d7..1ed0bcf 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -16,7 +16,6 @@ """ from weakref import WeakValueDictionary -from .config import MAX_ENSEMBL_RELEASE # ENSEMBL_FTP_SERVER, from .ensembl_release_versions import check_release_number from .ensembl_url_templates import make_fasta_url, make_gtf_url from .genome import Genome @@ -29,16 +28,6 @@ class EnsemblRelease(Genome): particular release of the Ensembl database. """ - @classmethod - def normalize_init_values(cls, release, species, server): - """ - Normalizes the arguments which uniquely specify an EnsemblRelease - genome. 
- """ - release = check_release_number(release) - species = check_species_object(species) - return (release, species, server) - # Using a WeakValueDictionary instead of an ordinary dict to prevent a # memory leak in cases where we test many different releases in sequence. # When all the references to a particular EnsemblRelease die then that @@ -48,8 +37,9 @@ def normalize_init_values(cls, release, species, server): @classmethod def cached( cls, - release=MAX_ENSEMBL_RELEASE, + release=None, species=human, + database=None, server=None, # server=ENSEMBL_FTP_SERVER, ): @@ -57,7 +47,10 @@ def cached( Construct EnsemblRelease if it's never been made before, otherwise return an old instance. """ - init_args_tuple = cls.normalize_init_values(release, species, server) + release = check_release_number(release, database) + species = check_species_object(species) + init_args_tuple = (release, species, database, server) + if init_args_tuple in cls._genome_cache: genome = cls._genome_cache[init_args_tuple] else: @@ -66,14 +59,16 @@ def cached( def __init__( self, - release=MAX_ENSEMBL_RELEASE, + release=None, species=human, + database=None, server=None, # server=EMBL_FTP_SERVER,, ): - self.release, self.species, self.server = self.normalize_init_values( - release=release, species=species, server=server - ) + self.release = check_release_number(release, database) + self.species = check_species_object(species) + self.database = database + self.server = server self.gtf_url = make_gtf_url( ensembl_release=self.release, diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index 05d4a15..9a5bc46 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -22,6 +22,8 @@ def check_release_number(release, database=None): """ Check to make sure a release is in the valid range of Ensembl releases. """ + if release is None: + return MAX_ENSEMBL_RELEASE if database is None else MAX_ENSEMBLGENOME_RELEASE try: release = int(release) except ValueError: From 9a0cf7fb3ab0362046b1237cc07353d56102cbb1 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 21:54:44 -0600 Subject: [PATCH 24/35] ensemblrelease suport --- pyensembl/ensembl_release.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 1ed0bcf..0055d6a 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -47,8 +47,8 @@ def cached( Construct EnsemblRelease if it's never been made before, otherwise return an old instance. 
""" - release = check_release_number(release, database) species = check_species_object(species) + release = check_release_number(release, species.database) init_args_tuple = (release, species, database, server) if init_args_tuple in cls._genome_cache: @@ -65,8 +65,8 @@ def __init__( server=None, # server=EMBL_FTP_SERVER,, ): - self.release = check_release_number(release, database) self.species = check_species_object(species) + self.release = check_release_number(release, species.database) self.database = database self.server = server From f5c537dd1168b1d06db20d972a508a9e3568b360 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 21:58:53 -0600 Subject: [PATCH 25/35] ensemblrelease suport --- pyensembl/ensembl_url_templates.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/pyensembl/ensembl_url_templates.py b/pyensembl/ensembl_url_templates.py index 4fcf774..e00968b 100644 --- a/pyensembl/ensembl_url_templates.py +++ b/pyensembl/ensembl_url_templates.py @@ -38,7 +38,9 @@ DATABASE_FASTA_SUBDIR_TEMPLATE = ( "/pub/release-%(release)d/%(database)s/fasta/%(species)s/%(type)s/" ) -DATABASE_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" +DATABASE_GTF_SUBDIR_TEMPLATE = ( + "/pub/release-%(release)d/%(database)s/gtf/%(species)s/" +) # GTF annotation file example: Homo_sapiens.GTCh38.gtf.gz GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz" @@ -52,11 +54,15 @@ # ncRNA FASTA file for releases before (and including) Ensembl 75 # example: Homo_sapiens.NCBI36.54.ncrna.fa.gz -OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" +OLD_FASTA_FILENAME_TEMPLATE_NCRNA = ( + "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz" +) # cDNA & protein FASTA file for releases after Ensembl 75 # example: Homo_sapiens.GRCh37.cdna.all.fa.gz -NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" +NEW_FASTA_FILENAME_TEMPLATE = ( + "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz" +) # ncRNA FASTA file for releases after Ensembl 75 # example: Homo_sapiens.GRCh37.ncrna.fa.gz @@ -68,9 +74,11 @@ def normalize_release_properties(ensembl_release, species): Make sure a given release is valid, normalize it to be an integer, normalize the species name, and get its associated reference. 
""" - ensembl_release = check_release_number(ensembl_release) if not isinstance(species, Species): species = find_species_by_name(species) + ensembl_release = check_release_number( + ensembl_release, database=species.database + ) reference_name = species.which_reference(ensembl_release) return ensembl_release, species.latin_name, reference_name @@ -99,7 +107,9 @@ def make_gtf_url(ensembl_release, species, server=None, database=None): server = ENSEMBL_FTP_SERVER else: server = ENSEMBLGENOME_FTP_SERVER - ensembl_release, species, _ = normalize_release_properties(ensembl_release, species) + ensembl_release, species, _ = normalize_release_properties( + ensembl_release, species + ) if database is None: subdir = GTF_SUBDIR_TEMPLATE % { "release": ensembl_release, @@ -111,7 +121,9 @@ def make_gtf_url(ensembl_release, species, server=None, database=None): "database": database, "species": species, } - filename = make_gtf_filename(ensembl_release=ensembl_release, species=species) + filename = make_gtf_filename( + ensembl_release=ensembl_release, species=species + ) return server + subdir + filename @@ -172,7 +184,7 @@ def make_fasta_url( server = ENSEMBL_FTP_SERVER else: server = ENSEMBLGENOME_FTP_SERVER - ensembl_release, species, reference_name = normalize_release_properties( + ensembl_release, species, _ = normalize_release_properties( ensembl_release, species ) if database is None: From 9d1ce7c376f295bdeafe4dec3b173bad81232851 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 22:01:29 -0600 Subject: [PATCH 26/35] ensemblrelease suport, fix bu --- pyensembl/ensembl_release.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 0055d6a..497b503 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -66,7 +66,7 @@ def __init__( # server=EMBL_FTP_SERVER,, ): self.species = check_species_object(species) - self.release = check_release_number(release, species.database) + self.release = check_release_number(release, self.species.database) self.database = database self.server = server From a7e8b5b389537e10d6767a8be3259dd20fb24585 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 22:33:49 -0600 Subject: [PATCH 27/35] ensemblrelease suport, fix bug --- pyensembl/__init__.py | 27 ++++----- pyensembl/common.py | 14 ++--- pyensembl/config.py | 4 +- pyensembl/database.py | 85 ++++++++++++++++----------- pyensembl/ensembl_release_versions.py | 8 +-- pyensembl/fasta.py | 3 +- pyensembl/genome.py | 2 +- pyensembl/normalization.py | 3 +- pyensembl/reference_name.py | 6 +- pyensembl/sequence_data.py | 10 ++-- pyensembl/species.py | 1 + 11 files changed, 86 insertions(+), 77 deletions(-) diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index eeb28fb..75e8360 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -10,27 +10,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from .config import MAX_ENSEMBL_RELEASE, MAX_ENSEMBLGENOME_RELEASE from .database import Database from .download_cache import DownloadCache from .ensembl_release import EnsemblRelease, cached_release -from .ensembl_release_versions import MAX_ENSEMBL_RELEASE from .exon import Exon -from .genome import Genome from .gene import Gene +from .genome import Genome from .locus import Locus -from .reference_name import ( - ensembl_grch36, - ensembl_grch37, - ensembl_grch38, - normalize_reference_name, - find_species_by_reference, - which_reference, - genome_for_reference_name, -) - +from .reference_name import ( # ensembl_grch36,; ensembl_grch37,; ensembl_grch38, + find_species_by_reference, genome_for_reference_name, + normalize_reference_name, which_reference) from .search import find_nearest_locus from .sequence_data import SequenceData -from .species import find_species_by_name, check_species_object, normalize_species_name +from .species import (check_species_object, find_species_by_name, + normalize_species_name) from .transcript import Transcript from .version import __version__ @@ -41,6 +35,7 @@ "EnsemblRelease", "cached_release", "MAX_ENSEMBL_RELEASE", + "MAX_ENSEMBLGENOME_RELEASE", "Gene", "Transcript", "Exon", @@ -56,7 +51,7 @@ "Genome", "Locus", "Exon", - "ensembl_grch36", - "ensembl_grch37", - "ensembl_grch38", + # "ensembl_grch36", + # "ensembl_grch37", + # "ensembl_grch38", ] diff --git a/pyensembl/common.py b/pyensembl/common.py index ccc5eb1..a9a3964 100644 --- a/pyensembl/common.py +++ b/pyensembl/common.py @@ -11,7 +11,6 @@ # limitations under the License. import pickle - from functools import wraps @@ -28,10 +27,11 @@ def load_pickle(filepath): def _memoize_cache_key(args, kwargs): - """Turn args tuple and kwargs dictionary into a hashable key. + """ + Turn args tuple and kwargs dictionary into a hashable key. - Expects that all arguments to a memoized function are either hashable - or can be uniquely identified from type(arg) and repr(arg). + Expects that all arguments to a memoized function are either + hashable or can be uniquely identified from type(arg) and repr(arg). """ cache_key_list = [] @@ -51,9 +51,9 @@ def _memoize_cache_key(args, kwargs): def memoize(fn): - """Simple reset-able memoization decorator for functions and methods, - assumes that all arguments to the function can be hashed and - compared. + """ + Simple reset-able memoization decorator for functions and methods, assumes + that all arguments to the function can be hashed and compared. """ cache = {} diff --git a/pyensembl/config.py b/pyensembl/config.py index 91ae5ed..cd58605 100644 --- a/pyensembl/config.py +++ b/pyensembl/config.py @@ -152,12 +152,14 @@ "reference_assemblies": { "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), }, + "database": "plants", }, { "latin_name": "arabidopsis_thaliana", - "synonyms": ["cress", "thale_cress"], + "synonyms": ["cress", "thale_cress", "hehe"], "reference_assemblies": { "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), }, + "database": "plants", }, ] diff --git a/pyensembl/database.py b/pyensembl/database.py index 4286908..562aa06 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -11,16 +11,16 @@ # limitations under the License. 
import logging -from os.path import split, join, exists, splitext import sqlite3 +from os.path import exists, join, split, splitext import datacache +from gtfparse import create_missing_features, read_gtf from typechecks import require_integer, require_string -from gtfparse import read_gtf, create_missing_features from .common import memoize -from .normalization import normalize_chromosome, normalize_strand from .locus import Locus +from .normalization import normalize_chromosome, normalize_strand # any time we update the database schema, increment this version number DATABASE_SCHEMA_VERSION = 3 @@ -31,9 +31,9 @@ class Database(object): """ - Wrapper around sqlite3 database so that the rest of the - library doesn't have to worry about constructing the .db file or - writing SQL queries directly. + Wrapper around sqlite3 database so that the rest of the library doesn't + have to worry about constructing the .db file or writing SQL queries + directly. """ def __init__( @@ -104,8 +104,8 @@ def local_db_path(self): def _all_possible_indices(self, column_names): """ - Create list of tuples containing all possible index groups - we might want to create over tables in this database. + Create list of tuples containing all possible index groups we might + want to create over tables in this database. If a set of genome annotations is missing some column we want to index on, we have to drop any indices which use that column. @@ -136,7 +136,8 @@ def _all_possible_indices(self, column_names): # other GTFs) if column_name not in column_set: logger.info( - "Skipping database index for {%s}", ", ".join(column_group) + "Skipping database index for {%s}", + ", ".join(column_group), ) skip = True if skip: @@ -149,7 +150,8 @@ def _all_possible_indices(self, column_names): PRIMARY_KEY_COLUMNS = {"gene": "gene_id", "transcript": "transcript_id"} def _get_primary_key(self, feature_name, feature_df): - """Name of primary key for a feature table (e.g. "gene" -> "gene_id") + """ + Name of primary key for a feature table (e.g. "gene" -> "gene_id") Since we're potentially going to run this code over unseen data, make sure that the primary is unique and never null. @@ -163,18 +165,21 @@ def _get_primary_key(self, feature_name, feature_df): if primary_key_values.isnull().any(): raise ValueError( "Column '%s' can't be primary key of table '%s'" - " because it contains nulls values" % (primary_key, feature_name) + " because it contains nulls values" + % (primary_key, feature_name) ) elif len(primary_key_values.unique()) < len(primary_key_values): raise ValueError( "Column '%s' can't be primary key of table '%s'" - " because it contains repeated values" % (primary_key, feature_name) + " because it contains repeated values" + % (primary_key, feature_name) ) else: return primary_key def _feature_indices(self, all_index_groups, primary_key, feature_df): - """Choose subset of index group tuples from `all_index_groups` which are + """ + Choose subset of index group tuples from `all_index_groups` which are applicable to a particular feature (not same as its primary key, have non-null values). """ @@ -194,9 +199,8 @@ def _feature_indices(self, all_index_groups, primary_key, feature_df): def create(self, overwrite=False): """ - Create the local database (including indexing) if it's not - already set up. If `overwrite` is True, always re-create - the database from scratch. + Create the local database (including indexing) if it's not already set + up. If `overwrite` is True, always re-create the database from scratch. 
Returns a connection to the database. """ @@ -204,7 +208,8 @@ def create(self, overwrite=False): datacache.ensure_dir(self.cache_directory_path) df = self._load_gtf_as_dataframe( - usecols=self.restrict_gtf_columns, features=self.restrict_gtf_features + usecols=self.restrict_gtf_columns, + features=self.restrict_gtf_features, ) all_index_groups = self._all_possible_indices(df.columns) @@ -261,7 +266,7 @@ def _get_connection(self): @property def connection(self): """ - Get a connection to the database or raise an exception + Get a connection to the database or raise an exception. """ connection = self._get_connection() if connection: @@ -275,6 +280,7 @@ def connection(self): def connect_or_create(self, overwrite=False): """ Return a connection to the database if it exists, otherwise create it. + Overwrite the existing database if `overwrite` is True. """ connection = self._get_connection() @@ -306,8 +312,8 @@ def column_values_at_locus( sorted=False, ): """ - Get the non-null values of a column from the database - at a particular range of loci + Get the non-null values of a column from the database at a particular + range of loci. """ # TODO: combine with the query method, since they overlap @@ -408,8 +414,8 @@ def distinct_column_values_at_locus( def run_sql_query(self, sql, required=False, query_params=[]): """ - Given an arbitrary SQL query, run it against the database - and return the results. + Given an arbitrary SQL query, run it against the database and return + the results. Parameters ---------- @@ -454,8 +460,8 @@ def query( required=False, ): """ - Construct a SQL query and run against the sqlite3 database, - filtered both by the feature type and a user-provided column/value. + Construct a SQL query and run against the sqlite3 database, filtered + both by the feature type and a user-provided column/value. """ sql = """ SELECT %s%s @@ -468,7 +474,9 @@ def query( filter_column, ) query_params = [filter_value] - return self.run_sql_query(sql, required=required, query_params=query_params) + return self.run_sql_query( + sql, required=required, query_params=query_params + ) def query_one( self, @@ -490,7 +498,9 @@ def query_one( if len(results) == 0: if required: - raise ValueError("%s not found: %s" % (filter_column, filter_value)) + raise ValueError( + "%s not found: %s" % (filter_column, filter_value) + ) else: return None elif len(results) > 1: @@ -505,8 +515,8 @@ def query_feature_values( self, column, feature, distinct=True, contig=None, strand=None ): """ - Run a SQL query against the sqlite3 database, filtered - only on the feature type. + Run a SQL query against the sqlite3 database, filtered only on the + feature type. """ query = """ SELECT %s%s @@ -541,7 +551,6 @@ def query_loci(self, filter_column, filter_value, feature): """ Query for loci satisfying a given filter and feature type. - Parameters ---------- filter_column : str @@ -571,8 +580,8 @@ def query_loci(self, filter_column, filter_value, feature): def query_locus(self, filter_column, filter_value, feature): """ - Query for unique locus, raises error if missing or more than - one locus in the database. + Query for unique locus, raises error if missing or more than one locus + in the database. Parameters ---------- @@ -588,7 +597,9 @@ def query_locus(self, filter_column, filter_value, feature): Returns single Locus object. 
""" loci = self.query_loci( - filter_column=filter_column, filter_value=filter_value, feature=feature + filter_column=filter_column, + filter_value=filter_value, + feature=feature, ) if len(loci) == 0: @@ -605,7 +616,7 @@ def query_locus(self, filter_column, filter_value, feature): def _load_gtf_as_dataframe(self, usecols=None, features=None): """ - Parse this genome source's GTF file and load it as a Pandas DataFrame + Parse this genome source's GTF file and load it as a Pandas DataFrame. """ logger.info("Reading GTF from %s", self.gtf_path) df = read_gtf( @@ -621,7 +632,9 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None): column_names = set(df.keys()) expect_gene_feature = features is None or "gene" in features - expect_transcript_feature = features is None or "transcript" in features + expect_transcript_feature = ( + features is None or "transcript" in features + ) observed_features = set(df["feature"]) # older Ensembl releases don't have "gene" or "transcript" @@ -635,7 +648,9 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None): dataframe=df, unique_keys={"gene": "gene_id"}, extra_columns={ - "gene": {"gene_name", "gene_biotype"}.intersection(column_names), + "gene": {"gene_name", "gene_biotype"}.intersection( + column_names + ), }, missing_value="", ) diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index 9a5bc46..d31612a 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -10,12 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config import ( - MAX_ENSEMBL_RELEASE, - MAX_ENSEMBLGENOME_RELEASE, - MIN_ENSEMBL_RELEASE, - MIN_ENSEMBLGENOME_RELEASE, -) +from .config import (MAX_ENSEMBL_RELEASE, MAX_ENSEMBLGENOME_RELEASE, + MIN_ENSEMBL_RELEASE, MIN_ENSEMBLGENOME_RELEASE) def check_release_number(release, database=None): diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py index e339a8a..3e8ba92 100644 --- a/pyensembl/fasta.py +++ b/pyensembl/fasta.py @@ -19,9 +19,8 @@ """ -from gzip import GzipFile import logging - +from gzip import GzipFile logger = logging.getLogger(__name__) diff --git a/pyensembl/genome.py b/pyensembl/genome.py index e56dd2e..355841b 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -21,8 +21,8 @@ from serializable import Serializable -from .download_cache import DownloadCache from .database import Database +from .download_cache import DownloadCache from .exon import Exon from .gene import Gene from .sequence_data import SequenceData diff --git a/pyensembl/normalization.py b/pyensembl/normalization.py index fb0cc33..81f65c5 100644 --- a/pyensembl/normalization.py +++ b/pyensembl/normalization.py @@ -11,7 +11,8 @@ # limitations under the License. from sys import intern -from typechecks import is_string, is_integer + +from typechecks import is_integer, is_string # Manually memoizing here, since our simple common.memoize function has # noticable overhead in this instance. 
diff --git a/pyensembl/reference_name.py b/pyensembl/reference_name.py index 1b7639d..dbb8d1f 100644 --- a/pyensembl/reference_name.py +++ b/pyensembl/reference_name.py @@ -70,6 +70,6 @@ def genome_for_reference_name(reference_name, allow_older_downloaded_release=Tru return EnsemblRelease.cached(release=max_ensembl_release, species=species) -ensembl_grch36 = genome_for_reference_name("ncbi36") -ensembl_grch37 = genome_for_reference_name("grch37") -ensembl_grch38 = genome_for_reference_name("grch38") +# ensembl_grch36 = genome_for_reference_name("ncbi36") +# ensembl_grch37 = genome_for_reference_name("grch37") +# ensembl_grch38 = genome_for_reference_name("grch38") diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py index 631c748..c2a6e0d 100644 --- a/pyensembl/sequence_data.py +++ b/pyensembl/sequence_data.py @@ -10,14 +10,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from os import remove -from os.path import exists, abspath, split, join import logging -from collections import Counter import pickle -from .common import load_pickle, dump_pickle -from .fasta import parse_fasta_dictionary +from collections import Counter +from os import remove +from os.path import abspath, exists, join, split +from .common import dump_pickle, load_pickle +from .fasta import parse_fasta_dictionary logger = logging.getLogger(__name__) diff --git a/pyensembl/species.py b/pyensembl/species.py index fe8f3b0..ca477e7 100644 --- a/pyensembl/species.py +++ b/pyensembl/species.py @@ -205,4 +205,5 @@ def check_species_object(species_name_or_object): latin_name=data["latin_name"], synonyms=data["synonyms"], reference_assemblies=data["reference_assemblies"], + database=data.get("database", None), ) From 9ead6571713ee972acb259558ebe2e3467b34c6e Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 23:20:41 -0600 Subject: [PATCH 28/35] update more species --- pyensembl/config.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pyensembl/config.py b/pyensembl/config.py index cd58605..faaa3a5 100644 --- a/pyensembl/config.py +++ b/pyensembl/config.py @@ -146,19 +146,35 @@ "synonyms": ["yeast", "budding_yeast"], "reference_assemblies": {"R64-1-1": (75, MAX_ENSEMBL_RELEASE)}, }, + { + "latin_name": "arabidopsis_thaliana", + "synonyms": ["cress", "thale_cress", "hehe"], + "reference_assemblies": { + "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + "database": "plants", + }, { "latin_name": "oryza_sativa", - "synonyms": ["rice", "japanese_rice"], + "synonyms": ["rice"], "reference_assemblies": { "IRGSP-1.0": (55, MAX_ENSEMBLGENOME_RELEASE), }, "database": "plants", }, { - "latin_name": "arabidopsis_thaliana", - "synonyms": ["cress", "thale_cress", "hehe"], + "latin_name": "zea_mays", + "synonyms": ["maize"], "reference_assemblies": { - "TAIR10": (55, MAX_ENSEMBLGENOME_RELEASE), + "Zm-B73-REFERENCE-NAM-5.0": (55, MAX_ENSEMBLGENOME_RELEASE), + }, + "database": "plants", + }, + { + "latin_name": "glycine_max", + "synonyms": ["soybean"], + "reference_assemblies": { + "Glycine_max_v2.1": (55, MAX_ENSEMBLGENOME_RELEASE), }, "database": "plants", }, From 41612a03a8a6f21f8ac6e239dde76bddf8ed42fb Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 23:43:37 -0600 Subject: [PATCH 29/35] format code --- docs/conf.py | 8 +- pyensembl/__init__.py | 14 +- pyensembl/download_cache.py | 19 ++- pyensembl/ensembl_release_versions.py | 14 +- pyensembl/exon.py | 4 +- pyensembl/fasta.py | 3 +- 
pyensembl/gene.py | 7 +- pyensembl/genome.py | 204 +++++++++++++++++--------- pyensembl/locus.py | 25 +++- pyensembl/reference_name.py | 12 +- pyensembl/sequence_data.py | 12 +- pyensembl/transcript.py | 33 +++-- tests/common.py | 4 +- tests/data.py | 8 +- tests/test_download_cache.py | 6 +- tests/test_exon_id.py | 6 +- tests/test_exon_object.py | 12 +- tests/test_gene_ids.py | 8 +- tests/test_gene_names.py | 5 +- tests/test_gene_objects.py | 16 +- tests/test_id_length.py | 4 +- tests/test_missing_genome_sources.py | 25 +++- tests/test_mouse.py | 5 +- tests/test_release_versions.py | 4 +- tests/test_search.py | 8 +- tests/test_sequence_data.py | 20 ++- tests/test_serialization.py | 8 +- tests/test_shell.py | 4 +- tests/test_timings.py | 12 +- tests/test_transcript_ids.py | 7 +- tests/test_transcript_objects.py | 38 +++-- tests/test_ucsc_gtf.py | 24 ++- 32 files changed, 412 insertions(+), 167 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 1c4034e..aefddaa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -220,7 +220,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "pyensembl.tex", "pyensembl Documentation", "Hammer Lab", "manual"), + ( + master_doc, + "pyensembl.tex", + "pyensembl Documentation", + "Hammer Lab", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index 75e8360..991af8c 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -19,12 +19,18 @@ from .genome import Genome from .locus import Locus from .reference_name import ( # ensembl_grch36,; ensembl_grch37,; ensembl_grch38, - find_species_by_reference, genome_for_reference_name, - normalize_reference_name, which_reference) + find_species_by_reference, + genome_for_reference_name, + normalize_reference_name, + which_reference, +) from .search import find_nearest_locus from .sequence_data import SequenceData -from .species import (check_species_object, find_species_by_name, - normalize_species_name) +from .species import ( + check_species_object, + find_species_by_name, + normalize_species_name, +) from .transcript import Transcript from .version import __version__ diff --git a/pyensembl/download_cache.py b/pyensembl/download_cache.py index 47a0766..48ebd00 100644 --- a/pyensembl/download_cache.py +++ b/pyensembl/download_cache.py @@ -151,7 +151,10 @@ def _fields(self): ) def __eq__(self, other): - return other.__class__ is DownloadCache and self._fields() == other._fields() + return ( + other.__class__ is DownloadCache + and self._fields() == other._fields() + ) def __hash__(self): return hash(self._fields()) @@ -213,10 +216,14 @@ def cached_path(self, path_or_url): # if we expect the download function to decompress this file then # we should use its name without the compression extension if self.decompress_on_download: - local_filename = self._remove_compression_suffix_if_present(local_filename) + local_filename = self._remove_compression_suffix_if_present( + local_filename + ) if len(local_filename) == 0: - raise ValueError("Can't determine local filename for %s" % (path_or_url,)) + raise ValueError( + "Can't determine local filename for %s" % (path_or_url,) + ) return join(self.cache_directory_path, local_filename) @@ -319,9 +326,9 @@ def delete_cached_files(self, prefixes=[], suffixes=[]): """ if isdir(self.cache_directory_path): for filename in listdir(): - delete = any([filename.endswith(ext) for ext in 
suffixes]) or any( - [filename.startswith(pre) for pre in prefixes] - ) + delete = any( + [filename.endswith(ext) for ext in suffixes] + ) or any([filename.startswith(pre) for pre in prefixes]) if delete: path = join(self.cache_directory_path, filename) logger.info("Deleting %s", path) diff --git a/pyensembl/ensembl_release_versions.py b/pyensembl/ensembl_release_versions.py index d31612a..246a380 100644 --- a/pyensembl/ensembl_release_versions.py +++ b/pyensembl/ensembl_release_versions.py @@ -10,8 +10,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config import (MAX_ENSEMBL_RELEASE, MAX_ENSEMBLGENOME_RELEASE, - MIN_ENSEMBL_RELEASE, MIN_ENSEMBLGENOME_RELEASE) +from .config import ( + MAX_ENSEMBL_RELEASE, + MAX_ENSEMBLGENOME_RELEASE, + MIN_ENSEMBL_RELEASE, + MIN_ENSEMBLGENOME_RELEASE, +) def check_release_number(release, database=None): @@ -19,7 +23,11 @@ def check_release_number(release, database=None): Check to make sure a release is in the valid range of Ensembl releases. """ if release is None: - return MAX_ENSEMBL_RELEASE if database is None else MAX_ENSEMBLGENOME_RELEASE + return ( + MAX_ENSEMBL_RELEASE + if database is None + else MAX_ENSEMBLGENOME_RELEASE + ) try: release = int(release) except ValueError: diff --git a/pyensembl/exon.py b/pyensembl/exon.py index a520290..a84b75f 100644 --- a/pyensembl/exon.py +++ b/pyensembl/exon.py @@ -15,7 +15,9 @@ class Exon(Locus): - def __init__(self, exon_id, contig, start, end, strand, gene_name, gene_id): + def __init__( + self, exon_id, contig, start, end, strand, gene_name, gene_id + ): Locus.__init__(self, contig, start, end, strand) self.exon_id = exon_id self.gene_name = gene_name diff --git a/pyensembl/fasta.py b/pyensembl/fasta.py index 3e8ba92..b55750b 100644 --- a/pyensembl/fasta.py +++ b/pyensembl/fasta.py @@ -32,7 +32,8 @@ def _parse_header_id(line): """ if type(line) is not bytes: raise TypeError( - "Expected header line to be of type %s but got %s" % (bytes, type(line)) + "Expected header line to be of type %s but got %s" + % (bytes, type(line)) ) if len(line) <= 1: diff --git a/pyensembl/gene.py b/pyensembl/gene.py index f26de48..b787c64 100644 --- a/pyensembl/gene.py +++ b/pyensembl/gene.py @@ -17,7 +17,9 @@ class Gene(LocusWithGenome): - def __init__(self, gene_id, gene_name, contig, start, end, strand, biotype, genome): + def __init__( + self, gene_id, gene_name, contig, start, end, strand, biotype, genome + ): LocusWithGenome.__init__( self, contig=contig, @@ -98,7 +100,8 @@ def transcripts(self): # its particular information, might be more efficient if we # just get all the columns here, but how do we keep that modular? return [ - self.genome.transcript_by_id(result[0]) for result in transcript_id_results + self.genome.transcript_by_id(result[0]) + for result in transcript_id_results ] @memoized_property diff --git a/pyensembl/genome.py b/pyensembl/genome.py index 355841b..a5e202d 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -11,8 +11,8 @@ # limitations under the License. """ -Contains the Genome class, with its millions of accessors and wrappers -around an arbitrary genomic database. +Contains the Genome class, with its millions of accessors and wrappers around +an arbitrary genomic database. """ @@ -31,8 +31,8 @@ class Genome(Serializable): """ - Bundles together the genomic annotation and sequence data associated with - a particular genomic database source (e.g. 
a single Ensembl release) and + Bundles together the genomic annotation and sequence data associated with a + particular genomic database source (e.g. a single Ensembl release) and provides a wide variety of helper methods for accessing this data. """ @@ -148,7 +148,7 @@ def to_dict(self): def _init_lazy_fields(self): """ - Member data that gets loaded or constructed on demand + Member data that gets loaded or constructed on demand. """ self.gtf_path = None self._protein_sequences = None @@ -163,11 +163,15 @@ def _init_lazy_fields(self): self._exons = {} def _get_cached_path( - self, field_name, path_or_url, download_if_missing=False, overwrite=False + self, + field_name, + path_or_url, + download_if_missing=False, + overwrite=False, ): """ - Get the local path for a possibly remote file, invoking either - a download or install error message if it's missing. + Get the local path for a possibly remote file, invoking either a + download or install error message if it's missing. """ if len(field_name) == 0: raise ValueError("Expected non-empty field name") @@ -188,7 +192,9 @@ def _get_gtf_path(self, download_if_missing=False, overwrite=False): overwrite=overwrite, ) - def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False): + def _get_transcript_fasta_paths( + self, download_if_missing=False, overwrite=False + ): if not self.requires_transcript_fasta: raise ValueError("No transcript FASTA source for %s" % self) return [ @@ -201,7 +207,9 @@ def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False for path in self._transcript_fasta_paths_or_urls ] - def _get_protein_fasta_paths(self, download_if_missing=False, overwrite=False): + def _get_protein_fasta_paths( + self, download_if_missing=False, overwrite=False + ): # get the path for peptide FASTA files containing # this genome's protein sequences if not self.requires_protein_fasta: @@ -233,7 +241,9 @@ def _set_local_paths(self, download_if_missing=True, overwrite=False): def required_local_files(self): paths = [] if self._gtf_path_or_url: - paths.append(self.download_cache.cached_path(self._gtf_path_or_url)) + paths.append( + self.download_cache.cached_path(self._gtf_path_or_url) + ) if self._transcript_fasta_paths_or_urls: paths.extend( [ @@ -273,8 +283,8 @@ def download(self, overwrite=False): def index(self, overwrite=False): """ Assuming that all necessary data for this Genome has been downloaded, - generate the GTF database and save efficient representation of - FASTA sequence files. + generate the GTF database and save efficient representation of FASTA + sequence files. 
""" if self.requires_gtf: self.db.connect_or_create(overwrite=overwrite) @@ -295,7 +305,9 @@ def db(self): overwrite=False, ) if self.gtf_path is None: - raise ValueError("Property 'gtf_path' of %s cannot be None" % self) + raise ValueError( + "Property 'gtf_path' of %s cannot be None" % self + ) # Database object turns the GTF dataframes into sqlite3 tables # and wraps them with methods like `query_one` @@ -348,7 +360,8 @@ def protein_sequences(self): self._set_local_paths(download_if_missing=False, overwrite=False) if self.protein_fasta_paths is None: raise ValueError( - "Property 'protein_fasta_paths' of %s cannot be None" % self + "Property 'protein_fasta_paths' of %s cannot be None" + % self ) self._protein_sequences = SequenceData( fasta_paths=self.protein_fasta_paths, @@ -360,13 +373,16 @@ def protein_sequences(self): def transcript_sequences(self): if self._transcript_sequences is None: if not self.requires_transcript_fasta: - raise ValueError("Missing transcript FASTA source for %s" % self) + raise ValueError( + "Missing transcript FASTA source for %s" % self + ) # make sure transcript FASTA file exists locally # and populate self.transcript_fasta_paths self._set_local_paths(download_if_missing=False, overwrite=False) if self.transcript_fasta_paths is None: raise ValueError( - "Property 'transcript_fasta_paths' of %s cannot be None" % (self,) + "Property 'transcript_fasta_paths' of %s cannot be None" + % (self,) ) self._transcript_sequences = SequenceData( fasta_paths=self.transcript_fasta_paths, @@ -376,8 +392,8 @@ def transcript_sequences(self): def install_string(self): """ - Add every missing file to the install string shown to the user - in an error message. + Add every missing file to the install string shown to the user in an + error message. """ args = [ "--reference-name", @@ -451,7 +467,7 @@ def __hash__(self): def clear_cache(self): """ - Clear any in-memory cached values + Clear any in-memory cached values. """ for maybe_fn in self.__dict__.values(): # clear cache associated with all memoization decorators, @@ -461,7 +477,7 @@ def clear_cache(self): def delete_index_files(self): """ - Delete all data aside from source GTF and FASTA files + Delete all data aside from source GTF and FASTA files. """ self.clear_cache() db_path = self.db.local_db_path() @@ -472,9 +488,8 @@ def _all_feature_values( self, column, feature, distinct=True, contig=None, strand=None ): """ - Cached lookup of all values for a particular feature property from - the database, caches repeated queries in memory and - stores them as a CSV. + Cached lookup of all values for a particular feature property from the + database, caches repeated queries in memory and stores them as a CSV. Parameters ---------- @@ -505,23 +520,31 @@ def _all_feature_values( ) def transcript_sequence(self, transcript_id): - """Return cDNA nucleotide sequence of transcript, or None if - transcript doesn't have cDNA sequence. + """ + Return cDNA nucleotide sequence of transcript, or None if transcript + doesn't have cDNA sequence. """ if self.transcript_sequences is None: - raise ValueError("No transcript FASTA supplied to this Genome: %s" % self) + raise ValueError( + "No transcript FASTA supplied to this Genome: %s" % self + ) return self.transcript_sequences.get(transcript_id) def protein_sequence(self, protein_id): - """Return cDNA nucleotide sequence of transcript, or None if - transcript doesn't have cDNA sequence. + """ + Return cDNA nucleotide sequence of transcript, or None if transcript + doesn't have cDNA sequence. 
""" if self.protein_sequences is None: - raise ValueError("No protein FASTA supplied to this Genome: %s" % self) + raise ValueError( + "No protein FASTA supplied to this Genome: %s" % self + ) return self.protein_sequences.get(protein_id) def genes_at_locus(self, contig, position, end=None, strand=None): - gene_ids = self.gene_ids_at_locus(contig, position, end=end, strand=strand) + gene_ids = self.gene_ids_at_locus( + contig, position, end=end, strand=strand + ) return [self.gene_by_id(gene_id) for gene_id in gene_ids] def transcripts_at_locus(self, contig, position, end=None, strand=None): @@ -529,11 +552,14 @@ def transcripts_at_locus(self, contig, position, end=None, strand=None): contig, position, end=end, strand=strand ) return [ - self.transcript_by_id(transcript_id) for transcript_id in transcript_ids + self.transcript_by_id(transcript_id) + for transcript_id in transcript_ids ] def exons_at_locus(self, contig, position, end=None, strand=None): - exon_ids = self.exon_ids_at_locus(contig, position, end=end, strand=strand) + exon_ids = self.exon_ids_at_locus( + contig, position, end=end, strand=strand + ) return [self.exon_by_id(exon_id) for exon_id in exon_ids] def gene_ids_at_locus(self, contig, position, end=None, strand=None): @@ -576,7 +602,9 @@ def transcript_ids_at_locus(self, contig, position, end=None, strand=None): strand=strand, ) - def transcript_names_at_locus(self, contig, position, end=None, strand=None): + def transcript_names_at_locus( + self, contig, position, end=None, strand=None + ): return self.db.distinct_column_values_at_locus( column="transcript_name", feature="transcript", @@ -606,7 +634,7 @@ def protein_ids_at_locus(self, contig, position, end=None, strand=None): def locus_of_gene_id(self, gene_id): """ - Given a gene ID returns Locus with: chromosome, start, stop, strand + Given a gene ID returns Locus with: chromosome, start, stop, strand. """ return self.db.query_locus( filter_column="gene_id", filter_value=gene_id, feature="gene" @@ -615,9 +643,9 @@ def locus_of_gene_id(self, gene_id): def loci_of_gene_names(self, gene_name): """ Given a gene name returns list of Locus objects with fields: - chromosome, start, stop, strand - You can get multiple results since a gene might have multiple copies - in the genome. + + chromosome, start, stop, strand You can get multiple results + since a gene might have multiple copies in the genome. """ return self.db.query_loci("gene_name", gene_name, "gene") @@ -630,7 +658,7 @@ def locus_of_transcript_id(self, transcript_id): def locus_of_exon_id(self, exon_id): """ - Given an exon ID returns Locus + Given an exon ID returns Locus. 
""" return self.db.query_locus("exon_id", exon_id, feature="exon") @@ -642,8 +670,8 @@ def locus_of_exon_id(self, exon_id): def contigs(self): """ - Returns all contig names for any gene in the genome - (field called "seqname" in Ensembl GTF files) + Returns all contig names for any gene in the genome (field called + "seqname" in Ensembl GTF files) """ return self.db.query_feature_values("seqname", "gene") @@ -704,7 +732,9 @@ def gene_by_id(self, gene_id): gene_name, gene_biotype = None, None if len(result) < 4 or len(result) > 6: - raise ValueError("Result is not the expected length: %d" % len(result)) + raise ValueError( + "Result is not the expected length: %d" % len(result) + ) contig, start, end, strand = result[:4] if len(result) == 5: if "gene_name" in field_names: @@ -738,8 +768,8 @@ def genes_by_name(self, gene_name): def gene_by_protein_id(self, protein_id): """ - Get the gene ID associated with the given protein ID, - return its Gene object + Get the gene ID associated with the given protein ID, return its Gene + object. """ gene_id = self.gene_id_of_protein_id(protein_id) return self.gene_by_id(gene_id) @@ -763,8 +793,8 @@ def _query_gene_name(self, property_name, property_value, feature_type): def gene_names(self, contig=None, strand=None): """ - Return all genes in the database, - optionally restrict to a chromosome and/or strand. + Return all genes in the database, optionally restrict to a chromosome + and/or strand. """ return self._all_feature_values( column="gene_name", feature="gene", contig=contig, strand=strand @@ -774,10 +804,14 @@ def gene_name_of_gene_id(self, gene_id): return self._query_gene_name("gene_id", gene_id, "gene") def gene_name_of_transcript_id(self, transcript_id): - return self._query_gene_name("transcript_id", transcript_id, "transcript") + return self._query_gene_name( + "transcript_id", transcript_id, "transcript" + ) def gene_name_of_transcript_name(self, transcript_name): - return self._query_gene_name("transcript_name", transcript_name, "transcript") + return self._query_gene_name( + "transcript_name", transcript_name, "transcript" + ) def gene_name_of_exon_id(self, exon_id): return self._query_gene_name("exon_id", exon_id, "exon") @@ -801,8 +835,8 @@ def _query_gene_ids(self, property_name, value, feature="gene"): def gene_ids(self, contig=None, strand=None): """ - What are all the gene IDs - (optionally restrict to a given chromosome/contig and/or strand) + What are all the gene IDs (optionally restrict to a given + chromosome/contig and/or strand) """ return self._all_feature_values( column="gene_id", feature="gene", contig=contig, strand=strand @@ -811,6 +845,7 @@ def gene_ids(self, contig=None, strand=None): def gene_ids_of_gene_name(self, gene_name): """ What are the gene IDs associated with a given gene name? + (due to copy events, there might be multiple genes per name) """ results = self._query_gene_ids("gene_name", gene_name) @@ -843,17 +878,21 @@ def gene_id_of_protein_id(self, protein_id): def transcripts(self, contig=None, strand=None): """ - Construct Transcript object for every transcript entry in - the database. Optionally restrict to a particular - chromosome using the `contig` argument. + Construct Transcript object for every transcript entry in the database. + + Optionally restrict to a particular chromosome using the + `contig` argument. 
""" transcript_ids = self.transcript_ids(contig=contig, strand=strand) return [ - self.transcript_by_id(transcript_id) for transcript_id in transcript_ids + self.transcript_by_id(transcript_id) + for transcript_id in transcript_ids ] def transcript_by_id(self, transcript_id): - """Construct Transcript object with given transcript ID""" + """ + Construct Transcript object with given transcript ID. + """ if transcript_id not in self._transcripts: optional_field_names = [ "transcript_name", @@ -886,8 +925,12 @@ def transcript_by_id(self, transcript_id): raise ValueError("Transcript not found: %s" % (transcript_id,)) transcript_name, transcript_biotype, tsl = None, None, None - if len(result) < 5 or len(result) > (5 + len(optional_field_names)): - raise ValueError("Result is not the expected length: %d" % len(result)) + if len(result) < 5 or len(result) > ( + 5 + len(optional_field_names) + ): + raise ValueError( + "Result is not the expected length: %d" % len(result) + ) contig, start, end, strand, gene_id = result[:5] if len(result) > 5: extra_field_names = [ @@ -920,9 +963,12 @@ def transcript_by_id(self, transcript_id): return self._transcripts[transcript_id] def transcripts_by_name(self, transcript_name): - transcript_ids = self.transcript_ids_of_transcript_name(transcript_name) + transcript_ids = self.transcript_ids_of_transcript_name( + transcript_name + ) return [ - self.transcript_by_id(transcript_id) for transcript_id in transcript_ids + self.transcript_by_id(transcript_id) + for transcript_id in transcript_ids ] def transcript_by_protein_id(self, protein_id): @@ -948,25 +994,31 @@ def _query_transcript_names(self, property_name, value): def transcript_names(self, contig=None, strand=None): """ - What are all the transcript names in the database - (optionally, restrict to a given chromosome and/or strand) + What are all the transcript names in the database (optionally, restrict + to a given chromosome and/or strand) """ return self._all_feature_values( - column="transcript_name", feature="transcript", contig=contig, strand=strand + column="transcript_name", + feature="transcript", + contig=contig, + strand=strand, ) def transcript_names_of_gene_name(self, gene_name): return self._query_transcript_names("gene_name", gene_name) def transcript_name_of_transcript_id(self, transcript_id): - transcript_names = self._query_transcript_names("transcript_id", transcript_id) + transcript_names = self._query_transcript_names( + "transcript_id", transcript_id + ) if len(transcript_names) == 0: raise ValueError( "No transcript names for transcript ID = %s" % transcript_id ) elif len(transcript_names) > 1: raise ValueError( - "Multiple transcript names for transcript ID = %s" % (transcript_id,) + "Multiple transcript names for transcript ID = %s" + % (transcript_id,) ) return transcript_names[0] @@ -976,7 +1028,9 @@ def transcript_name_of_transcript_id(self, transcript_id): # ################################################### - def _query_transcript_ids(self, property_name, value, feature="transcript"): + def _query_transcript_ids( + self, property_name, value, feature="transcript" + ): results = self.db.query( select_column_names=["transcript_id"], filter_column=property_name, @@ -989,7 +1043,10 @@ def _query_transcript_ids(self, property_name, value, feature="transcript"): def transcript_ids(self, contig=None, strand=None): return self._all_feature_values( - column="transcript_id", feature="transcript", contig=contig, strand=strand + column="transcript_id", + feature="transcript", + 
contig=contig, + strand=strand, ) def transcript_ids_of_gene_id(self, gene_id): @@ -1008,7 +1065,9 @@ def transcript_id_of_protein_id(self, protein_id): """ What is the transcript ID associated with a given protein ID? """ - results = self._query_transcript_ids("protein_id", protein_id, feature="CDS") + results = self._query_transcript_ids( + "protein_id", protein_id, feature="CDS" + ) if len(results) == 0: raise ValueError("Protein ID not found: %s" % protein_id) elif len(results) > 1: @@ -1029,15 +1088,16 @@ def transcript_id_of_protein_id(self, protein_id): def exons(self, contig=None, strand=None): """ - Create exon object for all exons in the database, optionally - restrict to a particular chromosome using the `contig` argument. + Create exon object for all exons in the database, optionally restrict + to a particular chromosome using the `contig` argument. """ # DataFrame with single column called "exon_id" exon_ids = self.exon_ids(contig=contig, strand=strand) return [self.exon_by_id(exon_id) for exon_id in exon_ids] def exon_by_id(self, exon_id): - """Construct an Exon object from its ID by looking up the exon"s + """ + Construct an Exon object from its ID by looking up the exon"s properties in the given Database. """ if exon_id not in self._exons: @@ -1112,8 +1172,8 @@ def exon_ids_of_transcript_id(self, transcript_id): def protein_ids(self, contig=None, strand=None): """ - What are all the protein IDs - (optionally restrict to a given chromosome and/or strand) + What are all the protein IDs (optionally restrict to a given chromosome + and/or strand) """ protein_ids = self._all_feature_values( column="protein_id", diff --git a/pyensembl/locus.py b/pyensembl/locus.py index b88b4a3..c087183 100644 --- a/pyensembl/locus.py +++ b/pyensembl/locus.py @@ -49,7 +49,8 @@ def __init__(self, contig, start, end, strand): if end < start: raise ValueError( - "Expected start <= end, got start = %d, end = %d" % (start, end) + "Expected start <= end, got start = %d, end = %d" + % (start, end) ) self.start = start self.end = end @@ -149,7 +150,9 @@ def offset_range(self, start, end): ) if start < self.start or end > self.end: - raise ValueError("Range (%d, %d) falls outside %s" % (start, end, self)) + raise ValueError( + "Range (%d, %d) falls outside %s" % (start, end, self) + ) if self.on_forward_strand: return (start - self.start, end - self.start) @@ -183,7 +186,9 @@ def can_overlap(self, contig, strand=None): """ Is this locus on the same contig and (optionally) on the same strand? 
""" - return self.on_contig(contig) and (strand is None or self.on_strand(strand)) + return self.on_contig(contig) and ( + strand is None or self.on_strand(strand) + ) def distance_to_interval(self, start, end): """ @@ -220,15 +225,23 @@ def overlaps(self, contig, start, end, strand=None): def overlaps_locus(self, other_locus): return self.overlaps( - other_locus.contig, other_locus.start, other_locus.end, other_locus.strand + other_locus.contig, + other_locus.start, + other_locus.end, + other_locus.strand, ) def contains(self, contig, start, end, strand=None): return ( - self.can_overlap(contig, strand) and start >= self.start and end <= self.end + self.can_overlap(contig, strand) + and start >= self.start + and end <= self.end ) def contains_locus(self, other_locus): return self.contains( - other_locus.contig, other_locus.start, other_locus.end, other_locus.strand + other_locus.contig, + other_locus.start, + other_locus.end, + other_locus.strand, ) diff --git a/pyensembl/reference_name.py b/pyensembl/reference_name.py index dbb8d1f..5731d80 100644 --- a/pyensembl/reference_name.py +++ b/pyensembl/reference_name.py @@ -29,7 +29,9 @@ def normalize_reference_name(name): def find_species_by_reference(reference_name): - return Species._reference_names_to_species[normalize_reference_name(reference_name)] + return Species._reference_names_to_species[ + normalize_reference_name(reference_name) + ] def which_reference(species_name, ensembl_release): @@ -42,7 +44,9 @@ def max_ensembl_release(reference_name): return max_release -def genome_for_reference_name(reference_name, allow_older_downloaded_release=True): +def genome_for_reference_name( + reference_name, allow_older_downloaded_release=True +): """ Given a genome reference name, such as "GRCh38", returns the corresponding Ensembl Release object. 
@@ -60,7 +64,9 @@ def genome_for_reference_name(reference_name, allow_older_downloaded_release=Tru ] if allow_older_downloaded_release: # go through candidate releases in descending order - for release in reversed(range(min_ensembl_release, max_ensembl_release + 1)): + for release in reversed( + range(min_ensembl_release, max_ensembl_release + 1) + ): # check if release has been locally downloaded candidate = EnsemblRelease.cached(release=release, species=species) if candidate.required_local_files_exist(): diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py index c2a6e0d..e18a9e8 100644 --- a/pyensembl/sequence_data.py +++ b/pyensembl/sequence_data.py @@ -32,10 +32,14 @@ def __init__(self, fasta_paths, cache_directory_path=None): fasta_paths = [fasta_paths] self.fasta_paths = [abspath(path) for path in fasta_paths] - self.fasta_directory_paths = [split(path)[0] for path in self.fasta_paths] + self.fasta_directory_paths = [ + split(path)[0] for path in self.fasta_paths + ] self.fasta_filenames = [split(path)[1] for path in self.fasta_paths] if cache_directory_path: - self.cache_directory_paths = [cache_directory_path] * len(self.fasta_paths) + self.cache_directory_paths = [cache_directory_path] * len( + self.fasta_paths + ) else: self.cache_directory_paths = self.fasta_directory_paths for path in self.fasta_paths: @@ -104,7 +108,9 @@ def _load_or_create_fasta_dictionary_pickle(self): try: fasta_dictionary_tmp = load_pickle(pickle_path) self._add_to_fasta_dictionary(fasta_dictionary_tmp) - logger.info("Loaded sequence dictionary from %s", pickle_path) + logger.info( + "Loaded sequence dictionary from %s", pickle_path + ) continue except (pickle.UnpicklingError, AttributeError): # catch either an UnpicklingError or an AttributeError diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py index 012a152..694e702 100644 --- a/pyensembl/transcript.py +++ b/pyensembl/transcript.py @@ -126,7 +126,10 @@ def exons(self): # in each transcript columns = ["exon_number", "exon_id"] exon_numbers_and_ids = self.db.query( - columns, filter_column="transcript_id", filter_value=self.id, feature="exon" + columns, + filter_column="transcript_id", + filter_value=self.id, + feature="exon", ) # fill this list in its correct order (by exon_number) by using @@ -137,7 +140,8 @@ def exons(self): exon = self.genome.exon_by_id(exon_id) if exon is None: raise ValueError( - "Missing exon %s for transcript %s" % (exon_number, self.id) + "Missing exon %s for transcript %s" + % (exon_number, self.id) ) exon_number = int(exon_number) if exon_number < 1: @@ -174,7 +178,8 @@ def _transcript_feature_position_ranges(self, feature, required=True): if required and len(results) == 0: raise ValueError( - "Transcript %s does not contain feature %s" % (self.id, feature) + "Transcript %s does not contain feature %s" + % (self.id, feature) ) return results @@ -183,7 +188,9 @@ def _transcript_feature_positions(self, feature): """ Get unique positions for feature, raise an error if feature is absent. """ - ranges = self._transcript_feature_position_ranges(feature, required=True) + ranges = self._transcript_feature_position_ranges( + feature, required=True + ) results = [] # a feature (such as a stop codon), maybe be split over multiple # contiguous ranges. 
Collect all the nucleotide positions into a @@ -329,7 +336,9 @@ def spliced_offset(self, position): exon_offset = unspliced_offset - exon_unspliced_start return total_spliced_offset + exon_offset else: - exon_length = len(exon) # exon_end_position - exon_start_position + 1 + exon_length = len( + exon + ) # exon_end_position - exon_start_position + 1 total_spliced_offset += exon_length raise ValueError( "Couldn't find position %d on any exon of %s" % (position, self.id) @@ -341,7 +350,9 @@ def start_codon_unspliced_offsets(self): Offsets from start of unspliced pre-mRNA transcript of nucleotides in start codon. """ - return [self.offset(position) for position in self.start_codon_positions] + return [ + self.offset(position) for position in self.start_codon_positions + ] @memoized_property def stop_codon_unspliced_offsets(self): @@ -349,7 +360,9 @@ def stop_codon_unspliced_offsets(self): Offsets from start of unspliced pre-mRNA transcript of nucleotides in stop codon. """ - return [self.offset(position) for position in self.stop_codon_positions] + return [ + self.offset(position) for position in self.stop_codon_positions + ] def _contiguous_offsets(self, offsets): """ @@ -369,7 +382,8 @@ def start_codon_spliced_offsets(self): of nucleotides in start codon. """ offsets = [ - self.spliced_offset(position) for position in self.start_codon_positions + self.spliced_offset(position) + for position in self.start_codon_positions ] return self._contiguous_offsets(offsets) @@ -380,7 +394,8 @@ def stop_codon_spliced_offsets(self): of nucleotides in stop codon. """ offsets = [ - self.spliced_offset(position) for position in self.stop_codon_positions + self.spliced_offset(position) + for position in self.stop_codon_positions ] return self._contiguous_offsets(offsets) diff --git a/tests/common.py b/tests/common.py index 094b6a2..5012dcc 100644 --- a/tests/common.py +++ b/tests/common.py @@ -26,7 +26,9 @@ def test_ensembl_releases(*versions): ensembl_releases = major_releases else: if any(version > MAX_ENSEMBL_RELEASE for version in versions): - raise ValueError("Invalid ensembl release numbers: %s" % (versions,)) + raise ValueError( + "Invalid ensembl release numbers: %s" % (versions,) + ) ensembl_releases = [cached_release(version) for version in versions] def decorator(test_fn): diff --git a/tests/data.py b/tests/data.py index 60cd08a..eea2ed7 100644 --- a/tests/data.py +++ b/tests/data.py @@ -43,7 +43,9 @@ def data_path(name): ) # 3' UTR for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_UTR3 = "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC" +CTNNBIP1_004_UTR3 = ( + "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC" +) CTNNBIP1_004_locus = Locus("1", 9850659, 9878176, "-") @@ -131,7 +133,9 @@ def data_path(name): reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, - transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], + transcript_fasta_paths_or_urls=[ + MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH + ], protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], ) diff --git a/tests/test_download_cache.py b/tests/test_download_cache.py index 2bf5913..3194a01 100644 --- a/tests/test_download_cache.py +++ b/tests/test_download_cache.py @@ -1,5 +1,9 @@ from nose.tools import assert_raises, ok_ -from pyensembl.download_cache import DownloadCache, MissingLocalFile, MissingRemoteFile +from pyensembl.download_cache import ( + DownloadCache, + 
MissingLocalFile, + MissingRemoteFile, +) import os import tempfile diff --git a/tests/test_exon_id.py b/tests/test_exon_id.py index 18590f8..8bd1b08 100644 --- a/tests/test_exon_id.py +++ b/tests/test_exon_id.py @@ -122,7 +122,8 @@ def test_exon_ids_of_transcript_name(): len(exon_ids), ) assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 for exon_id in exon_ids + exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 + for exon_id in exon_ids ) @@ -140,5 +141,6 @@ def exon_ids_of_transcript_id(): len(exon_ids), ) assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 for exon_id in exon_ids + exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 + for exon_id in exon_ids ) diff --git a/tests/test_exon_object.py b/tests/test_exon_object.py index 9a77cde..40d2724 100644 --- a/tests/test_exon_object.py +++ b/tests/test_exon_object.py @@ -16,7 +16,9 @@ def test_exon_object_by_id(): up by ID in Ensembl 77. """ exon = ensembl.exon_by_id("ENSE00003464041") - assert exon.gene_name == "CTNNB1", "Unexpected gene name: %s" % exon.gene_name + assert exon.gene_name == "CTNNB1", ( + "Unexpected gene name: %s" % exon.gene_name + ) assert exon.contig == "3", exon.contig assert exon.strand == "+" assert exon.on_forward_strand @@ -32,7 +34,9 @@ def test_exon_object_by_id_on_negative_strand(): from CXCR3 when looked up by ID in Ensembl 77. """ exon = ensembl.exon_by_id("ENSE00001817013") - assert exon.gene_name == "CXCR3", "Unexpected gene name: %s" % exon.gene_name + assert exon.gene_name == "CXCR3", ( + "Unexpected gene name: %s" % exon.gene_name + ) assert exon.contig == "X", exon.contig assert exon.strand == "-" assert exon.on_backward_strand @@ -86,7 +90,9 @@ def test_exon_basic_properties_str(): def test_exon_basic_properties_hash(): exon = ensembl.exon_by_id("ENSE00001817013") - assert isinstance(hash(exon), int), "Hash function returns %s instead of int" % ( + assert isinstance( + hash(exon), int + ), "Hash function returns %s instead of int" % ( type( hash(exon), ) diff --git a/tests/test_gene_ids.py b/tests/test_gene_ids.py index 3f1420e..8666c89 100644 --- a/tests/test_gene_ids.py +++ b/tests/test_gene_ids.py @@ -22,7 +22,9 @@ def test_gene_ids_grch38_hla_a(): # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884 ids = ensembl_grch38.gene_ids_at_locus(6, 29945884) expected = "ENSG00000206503" - assert ids == ["ENSG00000206503"], "Expected HLA-A, gene ID = %s, got: %s" % ( + assert ids == [ + "ENSG00000206503" + ], "Expected HLA-A, gene ID = %s, got: %s" % ( expected, ids, ) @@ -46,7 +48,9 @@ def test_gene_id_of_protein_id_release77(): def test_gene_id_of_invalid_name(): with assert_raises(Exception): - ensembl_grch38.gene_ids_of_gene_name("A wonderous pony sees through your soul") + ensembl_grch38.gene_ids_of_gene_name( + "A wonderous pony sees through your soul" + ) @test_ensembl_releases() diff --git a/tests/test_gene_names.py b/tests/test_gene_names.py index 626537b..ac2e892 100644 --- a/tests/test_gene_names.py +++ b/tests/test_gene_names.py @@ -67,6 +67,9 @@ def test_gene_name_of_HLA_gene_id(): gene_ids = grch38.gene_ids_of_gene_name("HLA-A") gene_names = [grch38.gene_name_of_gene_id(gene_id) for gene_id in gene_ids] unique_gene_names = list(set(gene_names)) - assert len(unique_gene_names) == 1, (len(unique_gene_names), unique_gene_names) + assert len(unique_gene_names) == 1, ( + len(unique_gene_names), + unique_gene_names, + ) gene_name = unique_gene_names[0] assert gene_name == "HLA-A", gene_name diff --git a/tests/test_gene_objects.py b/tests/test_gene_objects.py index 
63fe006..07f5f11 100644 --- a/tests/test_gene_objects.py +++ b/tests/test_gene_objects.py @@ -9,12 +9,16 @@ def test_TP53_gene_object_by_id(genome): # when we look up TP53 by its gene ID, we should get the # correct gene back gene = genome.gene_by_id(TP53_gene_id) - assert gene.name == "TP53", "Incorrect gene name %s for gene ID %s in %s" % ( + assert ( + gene.name == "TP53" + ), "Incorrect gene name %s for gene ID %s in %s" % ( gene.name, gene.id, genome, ) - assert gene.contig == "17", "Incorrect gene contig %s for gene ID %s in %s" % ( + assert ( + gene.contig == "17" + ), "Incorrect gene contig %s for gene ID %s in %s" % ( gene.contig, gene.id, genome, @@ -25,9 +29,13 @@ def test_TP53_gene_object_by_id(genome): def test_TP53_gene_object_by_name(genome): genes = genome.genes_by_name("TP53") # we should only have one TP53 gene (there aren't any copies) - assert len(genes) == 1, "Expected only one gene with name TP53, got %s" % (genes,) + assert len(genes) == 1, "Expected only one gene with name TP53, got %s" % ( + genes, + ) # make sure it has the correct gene ID - assert genes[0].id == TP53_gene_id, "Expected gene to have ID %s, got %s" % ( + assert ( + genes[0].id == TP53_gene_id + ), "Expected gene to have ID %s, got %s" % ( TP53_gene_id, genes[0].id, ) diff --git a/tests/test_id_length.py b/tests/test_id_length.py index cc61869..2d48877 100644 --- a/tests/test_id_length.py +++ b/tests/test_id_length.py @@ -10,7 +10,9 @@ def check_id_length(method_name): # only load chromosome Y to speed up tests idents = method(contig="Y") assert len(idents) > 0, "No values returned by %s" % method_name - assert all(len(ident) == 15 for ident in idents), "Invalid IDs for %s: %s" % ( + assert all( + len(ident) == 15 for ident in idents + ), "Invalid IDs for %s: %s" % ( method_name, [ident for ident in idents if len(ident) != 15], ) diff --git a/tests/test_missing_genome_sources.py b/tests/test_missing_genome_sources.py index 6069261..d03f856 100644 --- a/tests/test_missing_genome_sources.py +++ b/tests/test_missing_genome_sources.py @@ -22,12 +22,17 @@ def no_gtf_(cm): def no_transcript_(cm): - print("Testing for 'transcript' in %s : %s" % (type(cm.exception), cm.exception)) + print( + "Testing for 'transcript' in %s : %s" + % (type(cm.exception), cm.exception) + ) ok_("transcript" in str(cm.exception)) def no_protein_(cm): - print("Testing for 'protein' in %s : %s" % (type(cm.exception), cm.exception)) + print( + "Testing for 'protein' in %s : %s" % (type(cm.exception), cm.exception) + ) ok_("protein" in str(cm.exception)) @@ -35,7 +40,9 @@ def test_transcript_fasta_only(): genome = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", - transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], + transcript_fasta_paths_or_urls=[ + MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH + ], ) genome.index() @@ -66,7 +73,9 @@ def test_protein_fasta_only(): genome_only_proteins = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", - protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], + protein_fasta_paths_or_urls=[ + MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH + ], ) genome_only_proteins.index() @@ -107,7 +116,9 @@ def test_gtf_transcript_only(): reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, - transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], + transcript_fasta_paths_or_urls=[ + 
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH + ], ) genome_gtf_with_cdna.index() @@ -126,7 +137,9 @@ def test_gtf_protein_only(): reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, - protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], + protein_fasta_paths_or_urls=[ + MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH + ], ) genome_gtf_with_proteins.index() diff --git a/tests/test_mouse.py b/tests/test_mouse.py index 5ec03b6..cf45b54 100644 --- a/tests/test_mouse.py +++ b/tests/test_mouse.py @@ -1,6 +1,9 @@ from nose.tools import eq_, with_setup -from .data import custom_mouse_genome_grcm38_subset, setup_init_custom_mouse_genome +from .data import ( + custom_mouse_genome_grcm38_subset, + setup_init_custom_mouse_genome, +) @with_setup(setup=setup_init_custom_mouse_genome) diff --git a/tests/test_release_versions.py b/tests/test_release_versions.py index 42761bd..5fa1288 100644 --- a/tests/test_release_versions.py +++ b/tests/test_release_versions.py @@ -26,7 +26,9 @@ def test_version_is_none(): def test_max_ensembl_release(): assert isinstance( MAX_ENSEMBL_RELEASE, int - ), "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % (type(MAX_ENSEMBL_RELEASE),) + ), "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % ( + type(MAX_ENSEMBL_RELEASE), + ) assert 83 <= MAX_ENSEMBL_RELEASE < 1000, ( "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % MAX_ENSEMBL_RELEASE ) diff --git a/tests/test_search.py b/tests/test_search.py index 40930a4..6c90381 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -37,13 +37,17 @@ def test_find_nearest_BRAF_transcript(ensembl): for transcript in transcripts: # immediately before transcript result_before = find_nearest_locus( - start=transcript.start - 2, end=transcript.start - 1, loci=transcripts + start=transcript.start - 2, + end=transcript.start - 1, + loci=transcripts, ) eq_(result_before, (1, transcript)) # overlapping with transcript result_overlap = find_nearest_locus( - start=transcript.start - 2, end=transcript.start + 1, loci=transcripts + start=transcript.start - 2, + end=transcript.start + 1, + loci=transcripts, ) eq_(result_overlap, (0, transcript)) diff --git a/tests/test_sequence_data.py b/tests/test_sequence_data.py index 1d8b7fd..2675c14 100644 --- a/tests/test_sequence_data.py +++ b/tests/test_sequence_data.py @@ -18,8 +18,12 @@ def test_sequence_type(): with TemporaryDirectory() as tmpdir: seqs_dna = SequenceData([FASTA_PATH], cache_directory_path=tmpdir) seq = seqs_dna.get("ENSMUST00000138942") - assert seq is not None, "Failed to find sequence for ENSMUST00000138942" - assert isinstance(seq, str), "Wrong sequence type, expected %s but got %s" % ( + assert ( + seq is not None + ), "Failed to find sequence for ENSMUST00000138942" + assert isinstance( + seq, str + ), "Wrong sequence type, expected %s but got %s" % ( str, type(seq), ) @@ -35,10 +39,14 @@ def test_missing_sequence(): def test_clear_cache(): with TemporaryDirectory() as tmpdir: seqs = SequenceData([FASTA_PATH], cache_directory_path=tmpdir) - assert not seqs._fasta_dictionary, "Expected _fasta_dictionary to load lazily" + assert ( + not seqs._fasta_dictionary + ), "Expected _fasta_dictionary to load lazily" seqs._load_or_create_fasta_dictionary_pickle() - assert len(seqs._fasta_dictionary) > 0, "FASTA dictionary didn't get created" + assert ( + len(seqs._fasta_dictionary) > 0 + ), "FASTA dictionary didn't get created" seqs.clear_cache() assert ( @@ -51,4 +59,6 @@ def test_clear_cache(): 
seqs._load_or_create_fasta_dictionary_pickle() for pickle_path in seqs.fasta_dictionary_pickle_paths: - assert exists(pickle_path), "Cached pickle file should have been created" + assert exists( + pickle_path + ), "Cached pickle file should have been created" diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 40d2c9f..3ac0983 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -106,7 +106,9 @@ def test_custom_genome_to_json(): @with_setup(setup=setup_init_custom_mouse_genome) def test_custom_genome_to_dict(): - reconstructed = Genome.from_dict(custom_mouse_genome_grcm38_subset.to_dict()) + reconstructed = Genome.from_dict( + custom_mouse_genome_grcm38_subset.to_dict() + ) eq_(custom_mouse_genome_grcm38_subset, reconstructed) @@ -127,4 +129,6 @@ def test_unique_memory_address_of_unpickled_genomes(ensembl_genome): unpickled = pickle.loads(pickle.dumps(ensembl_genome)) assert ( ensembl_genome is unpickled - ), "Expected same object for %s but got two different instances" % (unpickled,) + ), "Expected same object for %s but got two different instances" % ( + unpickled, + ) diff --git a/tests/test_shell.py b/tests/test_shell.py index dcc3b77..390bb2d 100644 --- a/tests/test_shell.py +++ b/tests/test_shell.py @@ -4,7 +4,9 @@ def test_genome_selection_grch38(): - args = parser.parse_args(["install", "--release", "100", "--species", "human"]) + args = parser.parse_args( + ["install", "--release", "100", "--species", "human"] + ) genomes = all_combinations_of_ensembl_genomes(args) assert len(genomes) == 1 genome = genomes[0] diff --git a/tests/test_timings.py b/tests/test_timings.py index b0fd8e1..80d8474 100644 --- a/tests/test_timings.py +++ b/tests/test_timings.py @@ -14,7 +14,9 @@ def make_repeat_lookup_fn(lookup_fn, n_positions): def repeat_lookup_fn(): for contig in contigs: - for position in [10**6 + i * 10**6 for i in range(n_positions)]: + for position in [ + 10**6 + i * 10**6 for i in range(n_positions) + ]: lookup_fn(contig, position) return repeat_lookup_fn @@ -28,9 +30,13 @@ def run_benchmark(lookup_fn, n_positions_per_contig=20, time_limit=60.0): repeat_lookup_fn = make_repeat_lookup_fn(lookup_fn, n_positions_per_contig) n_loci = n_positions_per_contig * len(contigs) name = lookup_fn.__name__ - average_time = benchmark(repeat_lookup_fn, name="%s for %d loci" % (name, n_loci)) + average_time = benchmark( + repeat_lookup_fn, name="%s for %d loci" % (name, n_loci) + ) print("-- %s : %0.4fs" % (name, average_time)) - assert average_time < time_limit, "%s took too long for %s loci: %0.4fs" % ( + assert ( + average_time < time_limit + ), "%s took too long for %s loci: %0.4fs" % ( name, n_loci, average_time, diff --git a/tests/test_transcript_ids.py b/tests/test_transcript_ids.py index f1e910f..9d067c5 100644 --- a/tests/test_transcript_ids.py +++ b/tests/test_transcript_ids.py @@ -32,7 +32,8 @@ def test_transcript_ids_ensembl_grch38_hla_a(): transcript_ids = grch38.transcript_ids_at_locus(6, 29941260, 29945884) for transcript_id in HLA_A_TRANSCRIPT_IDS: assert transcript_id in transcript_ids, ( - "Transcript %s of HLA-A not found overlapping locus" % transcript_id + "Transcript %s of HLA-A not found overlapping locus" + % transcript_id ) @@ -49,7 +50,9 @@ def test_transcript_ids_ensembl_grch38_hla_a(): def test_all_transcript_ids(ensembl): transcript_ids = set(ensembl.transcript_ids()) for transcript_id in KNOWN_TRANSCRIPT_IDS: - assert transcript_id in transcript_ids, "Missing transcript ID %s from %s" % ( + assert ( + transcript_id in 
transcript_ids + ), "Missing transcript ID %s from %s" % ( transcript_id, ensembl, ) diff --git a/tests/test_transcript_objects.py b/tests/test_transcript_objects.py index b8d5d58..ea83140 100644 --- a/tests/test_transcript_objects.py +++ b/tests/test_transcript_objects.py @@ -23,7 +23,9 @@ def test_transcript_start_codon(): test_transcript_start_codon : Check that fields Transcript (for transcript named CTNNBIP1-004) matches known values. """ - CTNNBIP1_004_transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) + CTNNBIP1_004_transcript = ensembl77.transcript_by_id( + CTNNBIP1_004_transcript_id + ) assert Locus.__eq__( CTNNBIP1_004_transcript, CTNNBIP1_004_locus @@ -61,7 +63,9 @@ def test_transcript_exons(): """ transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) exons = transcript.exons - assert isinstance(exons, list), "Expected list of Exon objects, got %s : %s" % ( + assert isinstance( + exons, list + ), "Expected list of Exon objects, got %s : %s" % ( exons, type(exons), ) @@ -69,7 +73,10 @@ def test_transcript_exons(): # CTTNBIP1-004 has 5 exons assert len(exons) == len( CTTNNIP1_004_exon_lengths - ), "Expected %d exons but got %d" % (len(CTTNNIP1_004_exon_lengths), len(exons)) + ), "Expected %d exons but got %d" % ( + len(CTTNNIP1_004_exon_lengths), + len(exons), + ) for i, exon in enumerate(exons): expected_id = CTTNNIP1_004_exon_ids[i] @@ -128,7 +135,13 @@ def test_sequence_parts(genome): combined_sequence_length, len(transcript), "Length 5' UTR(%dnt) + CDS(%dnt) + 3' UTR(%d) = %d, expected %d" - % (len(utr5), len(cds), len(utr3), combined_sequence_length, len(transcript)), + % ( + len(utr5), + len(cds), + len(utr3), + combined_sequence_length, + len(transcript), + ), ) eq_( combined_string, @@ -145,7 +158,8 @@ def test_transcript_utr5_sequence_CTNNIP1_004(): eq_( len(utr5), expected_utr5_length, - "Expected 5' UTR length %d, got %d" % (expected_utr5_length, len(utr5)), + "Expected 5' UTR length %d, got %d" + % (expected_utr5_length, len(utr5)), ) eq_(utr5, CTNNBIP1_004_UTR5) @@ -157,7 +171,8 @@ def test_transcript_utr3_sequence_CTNNIP1_004(): eq_( len(utr3), expected_utr3_length, - "Expected 3' UTR length %d, got %d" % (expected_utr3_length, len(utr3)), + "Expected 3' UTR length %d, got %d" + % (expected_utr3_length, len(utr3)), ) eq_(utr3, CTNNBIP1_004_UTR3) @@ -209,10 +224,11 @@ def test_transcript_gene_should_match_parent_gene(): @test_ensembl_releases() def test_BRCA1_201_has_protein_coding_biotype(genome): transcript = genome.transcripts_by_name("BRCA1-201")[0] - assert ( - transcript.is_protein_coding - ), "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" % ( - transcript, - genome, + assert transcript.is_protein_coding, ( + "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" + % ( + transcript, + genome, + ) ) eq_(transcript.biotype, "protein_coding") diff --git a/tests/test_ucsc_gtf.py b/tests/test_ucsc_gtf.py index 7cecde5..777ad21 100644 --- a/tests/test_ucsc_gtf.py +++ b/tests/test_ucsc_gtf.py @@ -15,7 +15,10 @@ def test_ucsc_gencode_gtf(): df = db._load_gtf_as_dataframe() exons = df[df["feature"] == "exon"] # expect 12 exons from the dataframe - assert len(exons) == 12, "Expected 12 exons, got %d: %s" % (len(exons), exons) + assert len(exons) == 12, "Expected 12 exons, got %d: %s" % ( + len(exons), + exons, + ) def test_ucsc_gencode_genome(): @@ -33,8 +36,13 @@ def test_ucsc_gencode_genome(): genome.index() genes = genome.genes() for gene in genes: - assert gene.id, "Gene with missing ID in 
%s" % (genome.gtf.dataframe(),) - assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes) + assert gene.id, "Gene with missing ID in %s" % ( + genome.gtf.dataframe(), + ) + assert len(genes) == 7, "Expected 7 genes, got %d: %s" % ( + len(genes), + genes, + ) transcripts = genome.transcripts() for transcript in transcripts: assert transcript.id, "Transcript with missing ID in %s" % ( @@ -67,7 +75,10 @@ def test_ucsc_refseq_gtf(): df = db._load_gtf_as_dataframe() exons = df[df["feature"] == "exon"] # expect 16 exons from the GTF - assert len(exons) == 16, "Expected 16 exons, got %d: %s" % (len(exons), exons) + assert len(exons) == 16, "Expected 16 exons, got %d: %s" % ( + len(exons), + exons, + ) def test_ucsc_refseq_genome(): @@ -88,7 +99,10 @@ def test_ucsc_refseq_genome(): assert gene.id, "Gene with missing ID in %s" % ( genome.db._load_gtf_as_dataframe(), ) - assert len(genes) == 2, "Expected 2 genes, got %d: %s" % (len(genes), genes) + assert len(genes) == 2, "Expected 2 genes, got %d: %s" % ( + len(genes), + genes, + ) transcripts = genome.transcripts() for transcript in transcripts: assert transcript.id, "Transcript with missing ID in %s" % ( From e586f2842951c31e8f78b908d66d47670fd0817c Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Tue, 9 Jan 2024 23:54:25 -0600 Subject: [PATCH 30/35] fix gene name error --- pyensembl/database.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pyensembl/database.py b/pyensembl/database.py index 562aa06..f91e82b 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -226,7 +226,22 @@ def create(self, overwrite=False): primary_keys = {} for feature in feature_names: - df_subset = df[df.feature == feature] + # Some speices such as soybean, do not have a gene_name and transcript_name + if ( + feature == "gene_name" + and "gene_id" in feature_names + and (df.feature == "gene_name").sum() == 0 + ): + alias_feature = "gene_id" + if ( + feature == "transcript_name" + and "transcript_id" in feature_names + and (df.feature == "transcript_name").sum() == 0 + ): + alias_feature = "transcript_id" + + alias_feature = feature + df_subset = df[df.feature == alias_feature] if len(df_subset) == 0: continue dataframes[feature] = df_subset From b2d2f62d858c2c10d3eee1bc3439c9a217efc525 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Wed, 10 Jan 2024 00:38:19 -0600 Subject: [PATCH 31/35] fix gene name error for soybean and some other species --- pyensembl/database.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/pyensembl/database.py b/pyensembl/database.py index f91e82b..7f1e3f3 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -211,6 +211,16 @@ def create(self, overwrite=False): usecols=self.restrict_gtf_columns, features=self.restrict_gtf_features, ) + # Some species such as soybean, do not have a gene_name and transcript_name + # but do have gene_id and transcript_id, use the as alias of names + if "gene_id" in df.columns and "gene_name" not in df.columns: + df["gene_name"] = df["gene_id"] + if ( + "transcript_id" in df.columns + and "transcript_name" not in df.columns + ): + df["transcript_name"] = df["transcript_id"] + all_index_groups = self._all_possible_indices(df.columns) if self.restrict_gtf_features: @@ -227,21 +237,7 @@ def create(self, overwrite=False): for feature in feature_names: # Some speices such as soybean, do not have a gene_name and transcript_name - if ( - feature == "gene_name" - and "gene_id" in feature_names - and 
(df.feature == "gene_name").sum() == 0 - ): - alias_feature = "gene_id" - if ( - feature == "transcript_name" - and "transcript_id" in feature_names - and (df.feature == "transcript_name").sum() == 0 - ): - alias_feature = "transcript_id" - - alias_feature = feature - df_subset = df[df.feature == alias_feature] + df_subset = df[df.feature == feature] if len(df_subset) == 0: continue dataframes[feature] = df_subset From 65b5d6d9ebc7e454871d4142a88c6abb7f8701b0 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Wed, 10 Jan 2024 00:39:09 -0600 Subject: [PATCH 32/35] fix gene name error for maize --- pyensembl/database.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyensembl/database.py b/pyensembl/database.py index 7f1e3f3..b5fcd99 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -211,7 +211,7 @@ def create(self, overwrite=False): usecols=self.restrict_gtf_columns, features=self.restrict_gtf_features, ) - # Some species such as soybean, do not have a gene_name and transcript_name + # Some species such as maize, do not have a gene_name and transcript_name # but do have gene_id and transcript_id, use the as alias of names if "gene_id" in df.columns and "gene_name" not in df.columns: df["gene_name"] = df["gene_id"] @@ -236,7 +236,6 @@ def create(self, overwrite=False): primary_keys = {} for feature in feature_names: - # Some speices such as soybean, do not have a gene_name and transcript_name df_subset = df[df.feature == feature] if len(df_subset) == 0: continue From eec71157d91c02ba257c8bba755057c8ad06829d Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Wed, 10 Jan 2024 02:37:08 -0600 Subject: [PATCH 33/35] suport mRNA type --- pyensembl/locus_with_genome.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pyensembl/locus_with_genome.py b/pyensembl/locus_with_genome.py index 33dd38d..e93ef24 100644 --- a/pyensembl/locus_with_genome.py +++ b/pyensembl/locus_with_genome.py @@ -16,8 +16,8 @@ class LocusWithGenome(Locus): """ - Common base class for Gene and Transcript to avoid copying - their shared logic. + Common base class for Gene and Transcript to avoid copying their shared + logic. """ def __init__(self, contig, start, end, strand, biotype, genome): @@ -39,16 +39,17 @@ def to_dict(self): @property def is_protein_coding(self): """ - We're not counting immunoglobulin-like genes from the T-cell receptor or - or antibodies since they occur in fragments that must be recombined. - It might be worth consider counting non-sense mediated decay and - non-stop decay since variants in these could potentially make a - functional protein. To read more about the biotypes used in Ensembl: - http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html - http://www.gencodegenes.org/gencode_biotypes.html - - For now let's stick with the simple category of 'protein_coding', which - means that there is an open reading frame in this gene/transcript - whose successful transcription has been observed. + We're not counting immunoglobulin-like genes from the T-cell receptor + or or antibodies since they occur in fragments that must be recombined. + It might be worth consider counting non-sense mediated decay and non- + stop decay since variants in these could potentially make a functional + protein. To read more about the biotypes used in Ensembl: + http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html + http://www.gencodegenes.org/gencode_biotypes.html. 
+ + For now let's stick with the simple category of + 'protein_coding', which means that there is an open reading + frame in this gene/transcript whose successful transcription has + been observed. """ - return self.biotype == "protein_coding" + return self.biotype in ["protein_coding", "mRNA"] From 303ada4d7dab5402b3de001f4d1d421aef080b3c Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Wed, 10 Jan 2024 02:48:49 -0600 Subject: [PATCH 34/35] suport mRNA type --- pyensembl/locus_with_genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/locus_with_genome.py b/pyensembl/locus_with_genome.py index e93ef24..338a222 100644 --- a/pyensembl/locus_with_genome.py +++ b/pyensembl/locus_with_genome.py @@ -52,4 +52,4 @@ def is_protein_coding(self): frame in this gene/transcript whose successful transcription has been observed. """ - return self.biotype in ["protein_coding", "mRNA"] + return self.biotype in "protein_coding" From b4928358dd5a89dc276ba8ce60065b43814ae0a1 Mon Sep 17 00:00:00 2001 From: Chang Ye Date: Thu, 11 Jan 2024 14:47:01 -0600 Subject: [PATCH 35/35] fix conflict --- tests/common.py | 40 -------- tests/data.py | 113 ++++++++++------------ tests/test_contigs.py | 1 - tests/test_download_cache.py | 32 +++--- tests/test_ensembl_gtf.py | 4 - tests/test_ensembl_object_properties.py | 1 - tests/test_exon_id.py | 123 ------------------------ tests/test_exon_object.py | 21 +--- tests/test_gene_ids.py | 26 ----- tests/test_gene_names.py | 34 +------ tests/test_gene_objects.py | 42 -------- tests/test_id_length.py | 20 ++-- tests/test_locus.py | 5 - tests/test_missing_genome_sources.py | 51 ---------- tests/test_mouse.py | 10 -- tests/test_release_versions.py | 22 ----- tests/test_search.py | 25 ----- tests/test_sequence_data.py | 26 ----- tests/test_serialization.py | 17 ---- tests/test_shell.py | 4 +- tests/test_timings.py | 16 --- tests/test_transcript_ids.py | 13 --- tests/test_transcript_objects.py | 60 ------------ tests/test_ucsc_gtf.py | 37 +------ 24 files changed, 82 insertions(+), 661 deletions(-) diff --git a/tests/common.py b/tests/common.py index 17a86e8..ea1ecff 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,18 +1,9 @@ import functools -<<<<<<< HEAD -from pyensembl import ( - genome_for_reference_name, - cached_release, - MAX_ENSEMBL_RELEASE, -) -from nose.tools import nottest -======= from pyensembl import genome_for_reference_name, cached_release import pytest ->>>>>>> upstream/master grch37 = genome_for_reference_name("GRCh37") grch38 = genome_for_reference_name("GRCh38") @@ -22,33 +13,6 @@ contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"] -<<<<<<< HEAD -@nottest -def test_ensembl_releases(*versions): - """ - Run a unit test which takes an EnsemblRelease as an argument - for multiple releases (most recent for each reference genome) - """ - - if len(versions) == 0: - ensembl_releases = major_releases - else: - if any(version > MAX_ENSEMBL_RELEASE for version in versions): - raise ValueError( - "Invalid ensembl release numbers: %s" % (versions,) - ) - ensembl_releases = [cached_release(version) for version in versions] - - def decorator(test_fn): - @functools.wraps(test_fn) - def new_test_fn(): - for ensembl in ensembl_releases: - test_fn(ensembl) - - return new_test_fn - - return decorator -======= def run_multiple_genomes(*versions): if len(versions) == 1 and callable(versions[0]): return pytest.mark.parametrize("genome", major_releases)(versions[0]) @@ -57,7 +21,6 @@ def run_multiple_genomes(*versions): else: genomes = 
[cached_release(v) for v in versions] return lambda fn: pytest.mark.parametrize("genome", genomes)(fn) ->>>>>>> upstream/master # TemporaryDirectory only got added to Python in version 3.2 @@ -81,8 +44,6 @@ def __exit__(self, type, value, traceback): rmtree(self.name) # don't suppress exceptions return False -<<<<<<< HEAD -======= def eq_(x, y, msg=None): @@ -125,4 +86,3 @@ def lte_(x, y, msg=None): assert x <= y else: assert x <= y, msg ->>>>>>> upstream/master diff --git a/tests/data.py b/tests/data.py index eea2ed7..0b41369 100644 --- a/tests/data.py +++ b/tests/data.py @@ -21,31 +21,25 @@ def data_path(name): CTNNBIP1_004_transcript_id = "ENST00000377256" # coding sequence for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_CDS = "".join( - [ - "ATG", - "AACCGCGAGGGAGCTCCCGGGAAGAGTCCGGAG", - "GAGATGTACATTCAGCAGAAGGTCCGAGTGCTGCTCATGCTGCGGAAGATGGGATCAAAC", - "CTGACAGCCAGCGAGGAGGAGTTCCTGCGCACCTATGCAGGGGTGGTCAACAGCCAGCTC", - "AGCCAGCTGCCTCCGCACTCCATCGACCAGG", - "GTGCAGAGGACGTGGTGATGGCGTTTTCCAGGTCGGAGACGGAAGACCGGAGGCAG", - "TAG", - ] -) +CTNNBIP1_004_CDS = "".join([ + "ATG", + "AACCGCGAGGGAGCTCCCGGGAAGAGTCCGGAG", + "GAGATGTACATTCAGCAGAAGGTCCGAGTGCTGCTCATGCTGCGGAAGATGGGATCAAAC", + "CTGACAGCCAGCGAGGAGGAGTTCCTGCGCACCTATGCAGGGGTGGTCAACAGCCAGCTC", + "AGCCAGCTGCCTCCGCACTCCATCGACCAGG", + "GTGCAGAGGACGTGGTGATGGCGTTTTCCAGGTCGGAGACGGAAGACCGGAGGCAG", + "TAG" +]) # 5' UTR for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_UTR5 = "".join( - [ - "TGTGGGTGCAGGTTTCCTGGGCTTGCCAGACACACAGGGCGGCACCTTCCTACTTCTGCC", - "CAGCCACAGCCCTCCCCTCACAGTTGAGCACCTGTTTGCCTGAAGTTAATTTCCAGAAGC", - "AGGAGTCCCCAGAGCCAGGCAGGGGG", - ] -) +CTNNBIP1_004_UTR5 = "".join([ + "TGTGGGTGCAGGTTTCCTGGGCTTGCCAGACACACAGGGCGGCACCTTCCTACTTCTGCC", + "CAGCCACAGCCCTCCCCTCACAGTTGAGCACCTGTTTGCCTGAAGTTAATTTCCAGAAGC", + "AGGAGTCCCCAGAGCCAGGCAGGGGG"]) # 3' UTR for beta-catenin interacting protein (CTNNBIP1-004) -CTNNBIP1_004_UTR3 = ( +CTNNBIP1_004_UTR3 = \ "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC" -) CTNNBIP1_004_locus = Locus("1", 9850659, 9878176, "-") @@ -53,14 +47,20 @@ def data_path(name): # http://useast.ensembl.org/Homo_sapiens/Transcript/Exons?g=ENSG00000178585; # r=1:9850659-9878176;redirect=no;t=ENST00000377256 CTTNNIP1_004_exon_ids = [ - "ENSE00001473268", - "ENSE00001643659", - "ENSE00001600669", - "ENSE00001267940", - "ENSE00001473265", + 'ENSE00001473268', + 'ENSE00001643659', + 'ENSE00001600669', + 'ENSE00001267940', + 'ENSE00001473265', ] -CTTNNIP1_004_exon_lengths = [37, 85, 120, 91, 118] +CTTNNIP1_004_exon_lengths = [ + 37, + 85, + 120, + 91, + 118 +] # @@ -72,28 +72,26 @@ def data_path(name): EGFR_001_transcript_id = "ENST00000275493" EGFR_001_ccds_id = "CCDS5514" EGFR_001_protein_id = "ENSP00000275493" -EGFR_001_protein_sequence = "".join( - [ - "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV", - "QRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNL", - "QEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKL", - "TKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVN", - "PEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLS", - "INATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAF", - "ENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKI", - "ISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHP", - "ECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTG", - 
"PGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPN", - "QALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVD", - "NPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAA", - "RNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTF", - "GSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLV", - "IQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACI", - "DRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPS" - "RDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGS" - "TAENAEYLRVAPQSSEFIGA", - ] -) +EGFR_001_protein_sequence = "".join([ + "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV", + "QRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNL", + "QEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKL", + "TKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVN", + "PEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLS", + "INATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAF", + "ENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKI", + "ISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHP", + "ECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTG", + "PGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPN", + "QALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVD", + "NPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAA", + "RNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTF", + "GSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLV", + "IQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACI", + "DRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPS" + "RDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGS" + "TAENAEYLRVAPQSSEFIGA" +]) # GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/ @@ -116,28 +114,21 @@ def data_path(name): # http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167 MOUSE_ENSMUSG00000017167_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf" -) + "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf") MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.fa" -) + "mouse.ensembl.81.partial.ENSMUSG00000017167.fa") MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa" -) + "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa") MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path( - "mouse.ensembl.81.partial.ENSMUSG00000017167.pep" -) + "mouse.ensembl.81.partial.ENSMUSG00000017167.pep") custom_mouse_genome_grcm38_subset = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, - transcript_fasta_paths_or_urls=[ - MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH - ], - protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], -) + transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], + protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]) def setup_init_custom_mouse_genome(): diff --git a/tests/test_contigs.py b/tests/test_contigs.py index b4eb702..1101061 100644 --- a/tests/test_contigs.py +++ b/tests/test_contigs.py @@ -2,7 +2,6 @@ grch38 = 
genome_for_reference_name("GRCh38") - def test_contig_names(): contig_names = set(grch38.contigs()) for chrom in list(range(1, 23)) + ["X", "Y", "MT"]: diff --git a/tests/test_download_cache.py b/tests/test_download_cache.py index 3194a01..03c7da6 100644 --- a/tests/test_download_cache.py +++ b/tests/test_download_cache.py @@ -2,7 +2,7 @@ from pyensembl.download_cache import ( DownloadCache, MissingLocalFile, - MissingRemoteFile, + MissingRemoteFile ) import os @@ -13,27 +13,21 @@ download_cache = DownloadCache( reference_name="__test_reference", annotation_name="__test_annotation", - copy_local_files_to_cache=False, -) - + copy_local_files_to_cache=False) def test_download_cache_missing_local_file(): # clear the cache download_cache.delete_cache_directory() with assert_raises(MissingLocalFile): download_cache.download_or_copy_if_necessary( - path_or_url="test_file_doesn_not_exist.file" - ) - + path_or_url="test_file_doesn_not_exist.file") def test_download_cache_missing_remote_file(): # clear the cache download_cache.delete_cache_directory() with assert_raises(MissingRemoteFile): download_cache.download_or_copy_if_necessary( - path_or_url="ftp://NOTAURL.NOTAURL.NOTAURL" - ) - + path_or_url="ftp://NOTAURL.NOTAURL.NOTAURL") def test_download_cache_custom_location(): test_file = "refseq.ucsc.small.gtf" @@ -42,27 +36,29 @@ def test_download_cache_custom_location(): print("DIR: %s" % tmp_dir) assert tmp_dir is not None - os.environ["PYENSEMBL_CACHE_DIR"] = tmp_dir + os.environ['PYENSEMBL_CACHE_DIR'] = tmp_dir # We need another instance of DownloadCache # that copies files over to cache folder download_cache = DownloadCache( reference_name="test_reference", annotation_name="test_annotation", - copy_local_files_to_cache=True, - ) + copy_local_files_to_cache=True) # clean up download_cache.delete_cache_directory() download_cache.download_or_copy_if_necessary( - download_if_missing=True, path_or_url=data_path(test_file) - ) + download_if_missing=True, + path_or_url=data_path(test_file)) full_path = os.path.join( - tmp_dir, "pyensembl", "test_reference", "test_annotation", test_file - ) + tmp_dir, + "pyensembl", + "test_reference", + "test_annotation", + test_file) print("FULL PATH: %s" % full_path) assert len(full_path) > 0 ok_(os.path.exists(full_path)) - del os.environ["PYENSEMBL_CACHE_DIR"] + del os.environ['PYENSEMBL_CACHE_DIR'] diff --git a/tests/test_ensembl_gtf.py b/tests/test_ensembl_gtf.py index 6ee741b..14330e2 100644 --- a/tests/test_ensembl_gtf.py +++ b/tests/test_ensembl_gtf.py @@ -5,11 +5,7 @@ from .common import run_multiple_genomes -<<<<<<< HEAD -@test_ensembl_releases() -======= @run_multiple_genomes() ->>>>>>> upstream/master def gtf_path_endswith_gtf_gz(ensembl): path = ensembl.gtf.gtf_path assert exists(path) diff --git a/tests/test_ensembl_object_properties.py b/tests/test_ensembl_object_properties.py index b3c4582..ff90dcf 100644 --- a/tests/test_ensembl_object_properties.py +++ b/tests/test_ensembl_object_properties.py @@ -8,7 +8,6 @@ from nose.tools import eq_ from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE - def test_human_reference_name(): eq_(EnsemblRelease(release=54).reference_name, "NCBI36") eq_(EnsemblRelease(release=74).reference_name, "GRCh37") diff --git a/tests/test_exon_id.py b/tests/test_exon_id.py index 981cdab..ceb145f 100644 --- a/tests/test_exon_id.py +++ b/tests/test_exon_id.py @@ -10,61 +10,6 @@ # all exons associated with TP53 gene in Ensembl release 77 TP53_EXON_IDS_RELEASE_77 = [ -<<<<<<< HEAD - "ENSE00002337729", - "ENSE00002419584", - 
"ENSE00003625790", - "ENSE00003518480", - "ENSE00003723991", - "ENSE00003712342", - "ENSE00001657961", - "ENSE00003725258", - "ENSE00003740946", - "ENSE00002204316", - "ENSE00002064269", - "ENSE00003750554", - "ENSE00003634848", - "ENSE00003492844", - "ENSE00003735852", - "ENSE00003545950", - "ENSE00003605891", - "ENSE00002051192", - "ENSE00002084733", - "ENSE00003726882", - "ENSE00001146308", - "ENSE00002667911", - "ENSE00003752869", - "ENSE00003739898", - "ENSE00003753508", - "ENSE00002034209", - "ENSE00002030826", - "ENSE00001596491", - "ENSE00002037735", - "ENSE00003736616", - "ENSE00002672443", - "ENSE00002226620", - "ENSE00003715195", - "ENSE00003750794", - "ENSE00003745267", - "ENSE00003746220", - "ENSE00003656695", - "ENSE00003669712", - "ENSE00002051873", - "ENSE00002048269", - "ENSE00002670535", - "ENSE00002677565", - "ENSE00003532881", - "ENSE00003520683", - "ENSE00002076714", - "ENSE00002062958", - "ENSE00002073243", - "ENSE00003670707", - "ENSE00002065802", - "ENSE00002362269", -] - - -======= 'ENSE00002337729', 'ENSE00002419584', 'ENSE00003625790', 'ENSE00003518480', 'ENSE00003723991', 'ENSE00003712342', @@ -92,25 +37,11 @@ 'ENSE00002065802', 'ENSE00002362269' ] ->>>>>>> upstream/master def test_exon_ids_of_gene_id(): """ test_exon_ids_of_gene_id: Ensure that gene_id ENSG00000141510 (name=TP53), has all the same exon IDs found on the Ensembl website. """ -<<<<<<< HEAD - exon_ids = ensembl.exon_ids_of_gene_id("ENSG00000141510") - assert len(exon_ids) == len( - TP53_EXON_IDS_RELEASE_77 - ), "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( - len(TP53_EXON_IDS_RELEASE_77), - len(exon_ids), - len(set(exon_ids)), - ) - assert all(exon_id in TP53_EXON_IDS_RELEASE_77 for exon_id in exon_ids) - - -======= exon_ids = ensembl.exon_ids_of_gene_id('ENSG00000141510') assert len(exon_ids) == len(TP53_EXON_IDS_RELEASE_77), \ "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( @@ -119,38 +50,12 @@ def test_exon_ids_of_gene_id(): len(set(exon_ids))) assert all(exon_id in TP53_EXON_IDS_RELEASE_77 for exon_id in exon_ids) ->>>>>>> upstream/master def test_exon_ids_of_gene_name(): """ test_exon_ids_of_gene_name: Ensure that TP53 has the same exon IDs found on the Ensembl website. 
""" exon_ids = ensembl.exon_ids_of_gene_name("TP53") -<<<<<<< HEAD - assert len(exon_ids) == len( - TP53_EXON_IDS_RELEASE_77 - ), "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( - len(TP53_EXON_IDS_RELEASE_77), - len(exon_ids), - len(set(exon_ids)), - ) - assert all(exon_id in TP53_EXON_IDS_RELEASE_77 for exon_id in exon_ids) - - -# Exon IDs of transcript TP53-026 -TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 = [ - "ENSE00002064269", - "ENSE00003723991", - "ENSE00003712342", - "ENSE00003725258", - "ENSE00003740946", - "ENSE00003750554", - "ENSE00003634848", - "ENSE00003492844", -] - - -======= assert len(exon_ids) == len(TP53_EXON_IDS_RELEASE_77), \ "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % ( len(TP53_EXON_IDS_RELEASE_77), @@ -170,7 +75,6 @@ def test_exon_ids_of_gene_name(): 'ENSE00003492844' ] ->>>>>>> upstream/master def test_exon_ids_of_transcript_name(): """ test_exon_ids_of_transcript_name : Look up exon IDs of transcript TP53-026 @@ -178,19 +82,6 @@ def test_exon_ids_of_transcript_name(): for release 77 """ exon_ids = ensembl.exon_ids_of_transcript_name("TP53-026") -<<<<<<< HEAD - assert len(exon_ids) == len( - TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - ), "Expected %d exons, got %d" % ( - len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), - len(exon_ids), - ) - assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - for exon_id in exon_ids - ) - -======= assert len(exon_ids) == len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), \ "Expected %d exons, got %d" % ( len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), @@ -198,7 +89,6 @@ def test_exon_ids_of_transcript_name(): assert all( exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 for exon_id in exon_ids) ->>>>>>> upstream/master def exon_ids_of_transcript_id(): """ @@ -207,18 +97,6 @@ def exon_ids_of_transcript_id(): what we find on the Ensembl website. """ exon_ids = ensembl.exon_ids_of_transcript_id("ENST00000610623") -<<<<<<< HEAD - assert len(exon_ids) == len( - TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - ), "Expected %d exons, got %d" % ( - len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), - len(exon_ids), - ) - assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - for exon_id in exon_ids - ) -======= assert len(exon_ids) == len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), \ "Expected %d exons, got %d" % ( len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), @@ -226,4 +104,3 @@ def exon_ids_of_transcript_id(): assert all( exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 for exon_id in exon_ids) ->>>>>>> upstream/master diff --git a/tests/test_exon_object.py b/tests/test_exon_object.py index 40d2724..4587284 100644 --- a/tests/test_exon_object.py +++ b/tests/test_exon_object.py @@ -9,16 +9,14 @@ ensembl = cached_release(77) - def test_exon_object_by_id(): """ test_exon_object_by_id : check properties of exon 4 of CTNNB1 when looked up by ID in Ensembl 77. """ exon = ensembl.exon_by_id("ENSE00003464041") - assert exon.gene_name == "CTNNB1", ( + assert exon.gene_name == "CTNNB1", \ "Unexpected gene name: %s" % exon.gene_name - ) assert exon.contig == "3", exon.contig assert exon.strand == "+" assert exon.on_forward_strand @@ -27,16 +25,14 @@ def test_exon_object_by_id(): assert exon.end == 41224753, "Unexpected exon end: %s" % exon.end assert exon.length == len(exon) == 228 - def test_exon_object_by_id_on_negative_strand(): """ test_exon_object_by_id_on_negative_strand : check properties of exon 1 from CXCR3 when looked up by ID in Ensembl 77. 
""" exon = ensembl.exon_by_id("ENSE00001817013") - assert exon.gene_name == "CXCR3", ( + assert exon.gene_name == "CXCR3", \ "Unexpected gene name: %s" % exon.gene_name - ) assert exon.contig == "X", exon.contig assert exon.strand == "-" assert exon.on_backward_strand @@ -61,7 +57,6 @@ def test_exon_object_at_locus(): assert exon.start <= 41224526, "Unexpected exon start: %s" % exon.start assert exon.end >= 41224526, "Unexpected exon end: %s" % exon.end - def test_exon_object_at_locus_on_negative_strand(): """ test_exon_object_at_locus : check properties of exon 1 of CXCR3 when looked @@ -77,7 +72,6 @@ def test_exon_object_at_locus_on_negative_strand(): assert exon.start <= 71618517, "Unexpected exon start: %s" % exon.start assert exon.end >= 71618517, "Unexpected exon end: %s" % exon.end - def test_exon_basic_properties_str(): exon = ensembl.exon_by_id("ENSE00001817013") assert isinstance(str(exon), str) @@ -87,16 +81,11 @@ def test_exon_basic_properties_str(): # change this test assert str(exon) == repr(exon), "%s != %s" % (str(exon), repr(exon)) - def test_exon_basic_properties_hash(): exon = ensembl.exon_by_id("ENSE00001817013") - assert isinstance( - hash(exon), int - ), "Hash function returns %s instead of int" % ( - type( - hash(exon), - ) - ) + assert isinstance(hash(exon), int), \ + "Hash function returns %s instead of int" % ( + type(hash(exon),)) assert hash(exon) == hash(exon), "Hash function is non-deterministic!" other_exon = ensembl.exon_by_id("ENSE00003464041") assert exon != other_exon diff --git a/tests/test_gene_ids.py b/tests/test_gene_ids.py index 612b3cf..436eaef 100644 --- a/tests/test_gene_ids.py +++ b/tests/test_gene_ids.py @@ -22,13 +22,7 @@ def test_gene_ids_grch38_hla_a(): # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884 ids = ensembl_grch38.gene_ids_at_locus(6, 29945884) expected = "ENSG00000206503" -<<<<<<< HEAD - assert ids == [ - "ENSG00000206503" - ], "Expected HLA-A, gene ID = %s, got: %s" % ( -======= assert ids == ["ENSG00000206503"], "Expected HLA-A, gene ID = %s, got: %s" % ( ->>>>>>> upstream/master expected, ids, ) @@ -47,24 +41,12 @@ def test_gene_ids_of_gene_name_hla_grch38(): def test_gene_id_of_protein_id_release77(): gene_id = ensembl77.gene_id_of_protein_id("ENSP00000485677") -<<<<<<< HEAD - ok_("ENSG00000279634", gene_id) - - -def test_gene_id_of_invalid_name(): - with assert_raises(Exception): - ensembl_grch38.gene_ids_of_gene_name( - "A wonderous pony sees through your soul" - ) - -======= eq_("ENSG00000279634", gene_id) def test_gene_id_of_invalid_name(): with raises(Exception): ensembl_grch38.gene_ids_of_gene_name("A wonderous pony sees through your soul") ->>>>>>> upstream/master @run_multiple_genomes() @@ -76,11 +58,7 @@ def test_gene_ids_on_contig(genome): tp53 in gene_ids_chr17 ), "Missing %s from %s on chr17, example IDs: %s (total = %d)" % ( tp53, -<<<<<<< HEAD - ensembl, -======= genome, ->>>>>>> upstream/master gene_ids_chr17[:5], len(gene_ids_chr17), ) @@ -92,11 +70,7 @@ def test_gene_ids_on_contig(genome): smad4 in gene_ids_chr18 ), "Missing %s from %s on chr18, example result: %s (total = %d)" % ( smad4, -<<<<<<< HEAD - ensembl, -======= genome, ->>>>>>> upstream/master gene_ids_chr18[:5], len(gene_ids_chr18), ) diff --git a/tests/test_gene_names.py b/tests/test_gene_names.py index f343bfc..61f4480 100644 --- a/tests/test_gene_names.py +++ b/tests/test_gene_names.py @@ -18,13 +18,8 @@ ] -<<<<<<< HEAD -@test_ensembl_releases() -def test_all_gene_names(ensembl): -======= @run_multiple_genomes() def 
test_all_gene_names(genome): ->>>>>>> upstream/master """ test_all_gene_names : Make sure some known gene names such as SMAD4, TP53, ERBB2, &c @@ -34,11 +29,7 @@ def test_all_gene_names(genome): for gene_name in KNOWN_GENE_NAMES: assert gene_name in gene_names, "Missing gene name %s from %s" % ( gene_name, -<<<<<<< HEAD - ensembl, -======= genome, ->>>>>>> upstream/master ) @@ -50,25 +41,6 @@ def test_gene_names_at_locus_grch38_hla_a(): names = grch38.gene_names_at_locus(6, 29945884) assert names == ["HLA-A"], "Expected gene name HLA-A, got: %s" % (names,) -<<<<<<< HEAD - -@test_ensembl_releases() -def test_gene_names_on_contig(ensembl): - gene_names_chr17 = ensembl.gene_names(17) - assert ( - "TP53" in gene_names_chr17 - ), "No TP53 in gene names on chr17 of %s, gene names: %s ... (%d)" % ( - ensembl, - list(gene_names_chr17[:4]), - len(gene_names_chr17), - ) - - gene_names_chr18 = ensembl.gene_names(18) - assert ( - "SMAD4" in gene_names_chr18 - ), "No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)" % ( - ensembl, -======= @run_multiple_genomes() def test_gene_names_on_contig(genome): @@ -86,7 +58,6 @@ def test_gene_names_on_contig(genome): "SMAD4" in gene_names_chr18 ), "No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)" % ( genome, ->>>>>>> upstream/master list(gene_names_chr18[:4]), len(gene_names_chr18), ) @@ -96,9 +67,6 @@ def test_gene_name_of_HLA_gene_id(): gene_ids = grch38.gene_ids_of_gene_name("HLA-A") gene_names = [grch38.gene_name_of_gene_id(gene_id) for gene_id in gene_ids] unique_gene_names = list(set(gene_names)) - assert len(unique_gene_names) == 1, ( - len(unique_gene_names), - unique_gene_names, - ) + assert len(unique_gene_names) == 1, (len(unique_gene_names), unique_gene_names) gene_name = unique_gene_names[0] assert gene_name == "HLA-A", gene_name diff --git a/tests/test_gene_objects.py b/tests/test_gene_objects.py index 65a078f..e66f639 100644 --- a/tests/test_gene_objects.py +++ b/tests/test_gene_objects.py @@ -4,67 +4,33 @@ from .data import TP53_gene_id -<<<<<<< HEAD -@test_ensembl_releases() -======= @run_multiple_genomes() ->>>>>>> upstream/master def test_TP53_gene_object_by_id(genome): # when we look up TP53 by its gene ID, we should get the # correct gene back gene = genome.gene_by_id(TP53_gene_id) -<<<<<<< HEAD - assert ( - gene.name == "TP53" - ), "Incorrect gene name %s for gene ID %s in %s" % ( -======= assert gene.name == "TP53", "Incorrect gene name %s for gene ID %s in %s" % ( ->>>>>>> upstream/master gene.name, gene.id, genome, ) -<<<<<<< HEAD - assert ( - gene.contig == "17" - ), "Incorrect gene contig %s for gene ID %s in %s" % ( -======= assert gene.contig == "17", "Incorrect gene contig %s for gene ID %s in %s" % ( ->>>>>>> upstream/master gene.contig, gene.id, genome, ) -<<<<<<< HEAD - -======= ->>>>>>> upstream/master @run_multiple_genomes() def test_TP53_gene_object_by_name(genome): genes = genome.genes_by_name("TP53") # we should only have one TP53 gene (there aren't any copies) -<<<<<<< HEAD - assert len(genes) == 1, "Expected only one gene with name TP53, got %s" % ( - genes, - ) - # make sure it has the correct gene ID - assert ( - genes[0].id == TP53_gene_id - ), "Expected gene to have ID %s, got %s" % ( - TP53_gene_id, - genes[0].id, - ) - -======= assert len(genes) == 1, "Expected only one gene with name TP53, got %s" % (genes,) # make sure it has the correct gene ID assert genes[0].id == TP53_gene_id, "Expected gene to have ID %s, got %s" % ( TP53_gene_id, genes[0].id, ) ->>>>>>> upstream/master 
@run_multiple_genomes() @@ -77,11 +43,7 @@ def test_equal_genes(genome): assert gene1 == gene2 -<<<<<<< HEAD -@test_ensembl_releases() -======= @run_multiple_genomes() ->>>>>>> upstream/master def test_not_equal_genes(genome): gene1 = genome.genes_by_name("MUC1")[0] gene2 = genome.genes_by_name("BRCA1")[0] @@ -89,11 +51,7 @@ def test_not_equal_genes(genome): assert gene1 != gene2 -<<<<<<< HEAD -@test_ensembl_releases() -======= @run_multiple_genomes() ->>>>>>> upstream/master def test_BRCA1_protein_coding_biotype(genome): gene = genome.genes_by_name("BRCA1")[0] assert gene.is_protein_coding diff --git a/tests/test_id_length.py b/tests/test_id_length.py index 2d48877..7371cd4 100644 --- a/tests/test_id_length.py +++ b/tests/test_id_length.py @@ -2,7 +2,6 @@ from nose.tools import nottest - @nottest def check_id_length(method_name): for release in major_releases: @@ -10,21 +9,16 @@ def check_id_length(method_name): # only load chromosome Y to speed up tests idents = method(contig="Y") assert len(idents) > 0, "No values returned by %s" % method_name - assert all( - len(ident) == 15 for ident in idents - ), "Invalid IDs for %s: %s" % ( - method_name, - [ident for ident in idents if len(ident) != 15], - ) - + assert all(len(ident) == 15 for ident in idents), \ + "Invalid IDs for %s: %s" % ( + method_name, + [ident for ident in idents if len(ident) != 15]) def test_gene_id_length(): - check_id_length("gene_ids") - + check_id_length('gene_ids') def test_transcript_id_length(): - check_id_length("transcript_ids") - + check_id_length('transcript_ids') def test_protein_id_length(): - check_id_length("protein_ids") + check_id_length('protein_ids') diff --git a/tests/test_locus.py b/tests/test_locus.py index 475a018..a1af6fd 100644 --- a/tests/test_locus.py +++ b/tests/test_locus.py @@ -3,7 +3,6 @@ from nose.tools import assert_raises - def test_normalize_chromosome(): assert normalize_chromosome("X") == "X" assert normalize_chromosome("chrX") == "chrX" @@ -39,7 +38,6 @@ def test_normalize_chromosome(): with assert_raises(ValueError): normalize_chromosome(0) - def test_locus_overlaps(): locus = Locus("1", 10, 20, "+") assert locus.overlaps("1", 10, 20, "+") @@ -59,7 +57,6 @@ def test_locus_overlaps(): # wrong strand assert not locus.overlaps("1", 10, 20, "-") - def test_locus_contains(): locus = Locus("1", 10, 20, "+") assert locus.contains("1", 10, 20, "+") @@ -85,7 +82,6 @@ def test_locus_contains(): # wrong strand assert not locus.contains("1", 10, 20, "-") - def test_position_offset(): forward_locus = Locus("1", 10, 20, "+") assert forward_locus.offset(10) == 0 @@ -147,7 +143,6 @@ def test_range_offset(): with assert_raises(ValueError): negative_locus.offset_range(9, 10) - def test_locus_distance(): locus_chr1_10_20_pos = Locus("1", 10, 20, "+") locus_chr1_21_25_pos = Locus("1", 21, 25, "+") diff --git a/tests/test_missing_genome_sources.py b/tests/test_missing_genome_sources.py index f2936ad..5129c18 100644 --- a/tests/test_missing_genome_sources.py +++ b/tests/test_missing_genome_sources.py @@ -14,28 +14,6 @@ MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path( "mouse.ensembl.81.partial.ENSMUSG00000017167.pep" ) -<<<<<<< HEAD - - -def no_gtf_(cm): - print("Testing for 'GTF' in %s : %s" % (type(cm.exception), cm.exception)) - ok_("GTF" in str(cm.exception)) - - -def no_transcript_(cm): - print( - "Testing for 'transcript' in %s : %s" - % (type(cm.exception), cm.exception) - ) - ok_("transcript" in str(cm.exception)) - - -def no_protein_(cm): - print( - "Testing for 'protein' in %s : %s" % 
(type(cm.exception), cm.exception) - ) - ok_("protein" in str(cm.exception)) -======= def no_gtf_(e): @@ -52,20 +30,12 @@ def no_protein_(e): print("Testing for 'protein' in %s : %s" % (type(e), e)) assert "protein" in str(e) ->>>>>>> upstream/master - def test_transcript_fasta_only(): genome = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", -<<<<<<< HEAD - transcript_fasta_paths_or_urls=[ - MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH - ], -======= transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], ->>>>>>> upstream/master ) genome.index() @@ -92,18 +62,11 @@ def test_transcript_fasta_only(): no_protein_(e) - def test_protein_fasta_only(): genome_only_proteins = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", -<<<<<<< HEAD - protein_fasta_paths_or_urls=[ - MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH - ], -======= protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], ->>>>>>> upstream/master ) genome_only_proteins.index() @@ -118,7 +81,6 @@ def test_protein_fasta_only(): no_transcript_(e) - def test_gtf_only(): genome_only_gtf = Genome( reference_name="GRCm38", @@ -145,13 +107,7 @@ def test_gtf_transcript_only(): reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, -<<<<<<< HEAD - transcript_fasta_paths_or_urls=[ - MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH - ], -======= transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], ->>>>>>> upstream/master ) genome_gtf_with_cdna.index() @@ -165,19 +121,12 @@ def test_gtf_transcript_only(): no_protein_(e) - def test_gtf_protein_only(): genome_gtf_with_proteins = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, -<<<<<<< HEAD - protein_fasta_paths_or_urls=[ - MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH - ], -======= protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH], ->>>>>>> upstream/master ) genome_gtf_with_proteins.index() diff --git a/tests/test_mouse.py b/tests/test_mouse.py index 0ad6f50..fdbf8bd 100644 --- a/tests/test_mouse.py +++ b/tests/test_mouse.py @@ -1,17 +1,7 @@ from .common import eq_ from .data import custom_mouse_genome_grcm38_subset, setup_init_custom_mouse_genome -<<<<<<< HEAD -from .data import ( - custom_mouse_genome_grcm38_subset, - setup_init_custom_mouse_genome, -) - -@with_setup(setup=setup_init_custom_mouse_genome) -======= - ->>>>>>> upstream/master def test_mouse_ENSMUSG00000017167(): """ GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/ diff --git a/tests/test_release_versions.py b/tests/test_release_versions.py index 68972d9..0be325b 100644 --- a/tests/test_release_versions.py +++ b/tests/test_release_versions.py @@ -8,47 +8,25 @@ def test_version_too_old_1(): EnsemblRelease(1) -<<<<<<< HEAD - -@raises(Exception) -======= ->>>>>>> upstream/master def test_version_too_old_47(): with raises(Exception): EnsemblRelease(47) -<<<<<<< HEAD - -@raises(Exception) -======= ->>>>>>> upstream/master def test_version_is_not_numeric(): with raises(Exception): EnsemblRelease("wuzzle") -<<<<<<< HEAD - -@raises(Exception) -======= ->>>>>>> upstream/master def test_version_is_none(): with raises(Exception): EnsemblRelease(None) - def test_max_ensembl_release(): assert isinstance( MAX_ENSEMBL_RELEASE, int -<<<<<<< HEAD - ), "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % ( - type(MAX_ENSEMBL_RELEASE), - 
) -======= ), "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % (type(MAX_ENSEMBL_RELEASE),) ->>>>>>> upstream/master assert 83 <= MAX_ENSEMBL_RELEASE < 1000, ( "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % MAX_ENSEMBL_RELEASE ) diff --git a/tests/test_search.py b/tests/test_search.py index ed4f448..f4aa8e3 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -4,15 +4,9 @@ from .common import run_multiple_genomes -<<<<<<< HEAD -@test_ensembl_releases() -def test_find_nearest_BRAF_exon(ensembl): - braf = ensembl.genes_by_name("BRAF")[0] -======= @run_multiple_genomes() def test_find_nearest_BRAF_exon(genome): braf = genome.genes_by_name("BRAF")[0] ->>>>>>> upstream/master braf_transcripts = braf.transcripts exons = braf_transcripts[0].exons for exon in exons: @@ -35,40 +29,21 @@ def test_find_nearest_BRAF_exon(genome): eq_(result_after, (1, exon)) -<<<<<<< HEAD -@test_ensembl_releases() -def test_find_nearest_BRAF_transcript(ensembl): - braf_transcript = ensembl.genes_by_name("BRAF")[0].transcripts[0] - egfr_transcript = ensembl.genes_by_name("EGFR")[0].transcripts[0] -======= @run_multiple_genomes() def test_find_nearest_BRAF_transcript(genome): braf_transcript = genome.genes_by_name("BRAF")[0].transcripts[0] egfr_transcript = genome.genes_by_name("EGFR")[0].transcripts[0] ->>>>>>> upstream/master transcripts = [braf_transcript, egfr_transcript] for transcript in transcripts: # immediately before transcript result_before = find_nearest_locus( -<<<<<<< HEAD - start=transcript.start - 2, - end=transcript.start - 1, - loci=transcripts, -======= start=transcript.start - 2, end=transcript.start - 1, loci=transcripts ->>>>>>> upstream/master ) eq_(result_before, (1, transcript)) # overlapping with transcript result_overlap = find_nearest_locus( -<<<<<<< HEAD - start=transcript.start - 2, - end=transcript.start + 1, - loci=transcripts, -======= start=transcript.start - 2, end=transcript.start + 1, loci=transcripts ->>>>>>> upstream/master ) eq_(result_overlap, (0, transcript)) diff --git a/tests/test_sequence_data.py b/tests/test_sequence_data.py index 1d3cc02..1d8b7fd 100644 --- a/tests/test_sequence_data.py +++ b/tests/test_sequence_data.py @@ -18,17 +18,8 @@ def test_sequence_type(): with TemporaryDirectory() as tmpdir: seqs_dna = SequenceData([FASTA_PATH], cache_directory_path=tmpdir) seq = seqs_dna.get("ENSMUST00000138942") -<<<<<<< HEAD - assert ( - seq is not None - ), "Failed to find sequence for ENSMUST00000138942" - assert isinstance( - seq, str - ), "Wrong sequence type, expected %s but got %s" % ( -======= assert seq is not None, "Failed to find sequence for ENSMUST00000138942" assert isinstance(seq, str), "Wrong sequence type, expected %s but got %s" % ( ->>>>>>> upstream/master str, type(seq), ) @@ -44,21 +35,10 @@ def test_missing_sequence(): def test_clear_cache(): with TemporaryDirectory() as tmpdir: seqs = SequenceData([FASTA_PATH], cache_directory_path=tmpdir) -<<<<<<< HEAD - assert ( - not seqs._fasta_dictionary - ), "Expected _fasta_dictionary to load lazily" - - seqs._load_or_create_fasta_dictionary_pickle() - assert ( - len(seqs._fasta_dictionary) > 0 - ), "FASTA dictionary didn't get created" -======= assert not seqs._fasta_dictionary, "Expected _fasta_dictionary to load lazily" seqs._load_or_create_fasta_dictionary_pickle() assert len(seqs._fasta_dictionary) > 0, "FASTA dictionary didn't get created" ->>>>>>> upstream/master seqs.clear_cache() assert ( @@ -71,10 +51,4 @@ def test_clear_cache(): seqs._load_or_create_fasta_dictionary_pickle() for pickle_path in 
seqs.fasta_dictionary_pickle_paths: -<<<<<<< HEAD - assert exists( - pickle_path - ), "Cached pickle file should have been created" -======= assert exists(pickle_path), "Cached pickle file should have been created" ->>>>>>> upstream/master diff --git a/tests/test_serialization.py b/tests/test_serialization.py index c5ab820..738a39a 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -105,14 +105,8 @@ def test_custom_genome_to_json(): def test_custom_genome_to_dict(): -<<<<<<< HEAD - reconstructed = Genome.from_dict( - custom_mouse_genome_grcm38_subset.to_dict() - ) -======= setup_init_custom_mouse_genome() reconstructed = Genome.from_dict(custom_mouse_genome_grcm38_subset.to_dict()) ->>>>>>> upstream/master eq_(custom_mouse_genome_grcm38_subset, reconstructed) @@ -128,20 +122,9 @@ def test_species_to_pickle(): eq_(human, pickle.loads(pickle.dumps(human))) -<<<<<<< HEAD -@test_ensembl_releases() -def test_unique_memory_address_of_unpickled_genomes(ensembl_genome): - unpickled = pickle.loads(pickle.dumps(ensembl_genome)) - assert ( - ensembl_genome is unpickled - ), "Expected same object for %s but got two different instances" % ( - unpickled, - ) -======= @run_multiple_genomes() def test_unique_memory_address_of_unpickled_genomes(genome): unpickled = pickle.loads(pickle.dumps(genome)) assert ( genome is unpickled ), "Expected same object for %s but got two different instances" % (unpickled,) ->>>>>>> upstream/master diff --git a/tests/test_shell.py b/tests/test_shell.py index ee445ee..9c707f9 100644 --- a/tests/test_shell.py +++ b/tests/test_shell.py @@ -3,9 +3,7 @@ def test_genome_selection_grch38(): - args = parser.parse_args( - ["install", "--release", "100", "--species", "human"] - ) + args = parser.parse_args(["install", "--release", "100", "--species", "human"]) genomes = all_combinations_of_ensembl_genomes(args) assert len(genomes) == 1 genome = genomes[0] diff --git a/tests/test_timings.py b/tests/test_timings.py index 9505ea5..b0fd8e1 100644 --- a/tests/test_timings.py +++ b/tests/test_timings.py @@ -14,13 +14,7 @@ def make_repeat_lookup_fn(lookup_fn, n_positions): def repeat_lookup_fn(): for contig in contigs: -<<<<<<< HEAD - for position in [ - 10**6 + i * 10**6 for i in range(n_positions) - ]: -======= for position in [10**6 + i * 10**6 for i in range(n_positions)]: ->>>>>>> upstream/master lookup_fn(contig, position) return repeat_lookup_fn @@ -34,19 +28,9 @@ def run_benchmark(lookup_fn, n_positions_per_contig=20, time_limit=60.0): repeat_lookup_fn = make_repeat_lookup_fn(lookup_fn, n_positions_per_contig) n_loci = n_positions_per_contig * len(contigs) name = lookup_fn.__name__ -<<<<<<< HEAD - average_time = benchmark( - repeat_lookup_fn, name="%s for %d loci" % (name, n_loci) - ) - print("-- %s : %0.4fs" % (name, average_time)) - assert ( - average_time < time_limit - ), "%s took too long for %s loci: %0.4fs" % ( -======= average_time = benchmark(repeat_lookup_fn, name="%s for %d loci" % (name, n_loci)) print("-- %s : %0.4fs" % (name, average_time)) assert average_time < time_limit, "%s took too long for %s loci: %0.4fs" % ( ->>>>>>> upstream/master name, n_loci, average_time, diff --git a/tests/test_transcript_ids.py b/tests/test_transcript_ids.py index 29291c0..b806608 100644 --- a/tests/test_transcript_ids.py +++ b/tests/test_transcript_ids.py @@ -32,12 +32,7 @@ def test_transcript_ids_ensembl_grch38_hla_a(): transcript_ids = grch38.transcript_ids_at_locus(6, 29941260, 29945884) for transcript_id in HLA_A_TRANSCRIPT_IDS: assert transcript_id in 
transcript_ids, ( -<<<<<<< HEAD - "Transcript %s of HLA-A not found overlapping locus" - % transcript_id -======= "Transcript %s of HLA-A not found overlapping locus" % transcript_id ->>>>>>> upstream/master ) @@ -54,17 +49,9 @@ def test_transcript_ids_ensembl_grch38_hla_a(): def test_all_transcript_ids(genome): transcript_ids = set(genome.transcript_ids()) for transcript_id in KNOWN_TRANSCRIPT_IDS: -<<<<<<< HEAD - assert ( - transcript_id in transcript_ids - ), "Missing transcript ID %s from %s" % ( - transcript_id, - ensembl, -======= assert transcript_id in transcript_ids, "Missing transcript ID %s from %s" % ( transcript_id, genome, ->>>>>>> upstream/master ) diff --git a/tests/test_transcript_objects.py b/tests/test_transcript_objects.py index 2a38aaf..a243e13 100644 --- a/tests/test_transcript_objects.py +++ b/tests/test_transcript_objects.py @@ -22,13 +22,7 @@ def test_transcript_start_codon(): test_transcript_start_codon : Check that fields Transcript (for transcript named CTNNBIP1-004) matches known values. """ -<<<<<<< HEAD - CTNNBIP1_004_transcript = ensembl77.transcript_by_id( - CTNNBIP1_004_transcript_id - ) -======= CTNNBIP1_004_transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) ->>>>>>> upstream/master assert Locus.__eq__( CTNNBIP1_004_transcript, CTNNBIP1_004_locus @@ -66,13 +60,7 @@ def test_transcript_exons(): """ transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) exons = transcript.exons -<<<<<<< HEAD - assert isinstance( - exons, list - ), "Expected list of Exon objects, got %s : %s" % ( -======= assert isinstance(exons, list), "Expected list of Exon objects, got %s : %s" % ( ->>>>>>> upstream/master exons, type(exons), ) @@ -80,14 +68,7 @@ def test_transcript_exons(): # CTTNBIP1-004 has 5 exons assert len(exons) == len( CTTNNIP1_004_exon_lengths -<<<<<<< HEAD - ), "Expected %d exons but got %d" % ( - len(CTTNNIP1_004_exon_lengths), - len(exons), - ) -======= ), "Expected %d exons but got %d" % (len(CTTNNIP1_004_exon_lengths), len(exons)) ->>>>>>> upstream/master for i, exon in enumerate(exons): expected_id = CTTNNIP1_004_exon_ids[i] @@ -146,17 +127,7 @@ def test_sequence_parts(genome): combined_sequence_length, len(transcript), "Length 5' UTR(%dnt) + CDS(%dnt) + 3' UTR(%d) = %d, expected %d" -<<<<<<< HEAD - % ( - len(utr5), - len(cds), - len(utr3), - combined_sequence_length, - len(transcript), - ), -======= % (len(utr5), len(cds), len(utr3), combined_sequence_length, len(transcript)), ->>>>>>> upstream/master ) eq_( combined_string, @@ -173,12 +144,7 @@ def test_transcript_utr5_sequence_CTNNIP1_004(): eq_( len(utr5), expected_utr5_length, -<<<<<<< HEAD - "Expected 5' UTR length %d, got %d" - % (expected_utr5_length, len(utr5)), -======= "Expected 5' UTR length %d, got %d" % (expected_utr5_length, len(utr5)), ->>>>>>> upstream/master ) eq_(utr5, CTNNBIP1_004_UTR5) @@ -190,12 +156,7 @@ def test_transcript_utr3_sequence_CTNNIP1_004(): eq_( len(utr3), expected_utr3_length, -<<<<<<< HEAD - "Expected 3' UTR length %d, got %d" - % (expected_utr3_length, len(utr3)), -======= "Expected 3' UTR length %d, got %d" % (expected_utr3_length, len(utr3)), ->>>>>>> upstream/master ) eq_(utr3, CTNNBIP1_004_UTR3) @@ -212,11 +173,7 @@ def test_transcript_cds_CTNNIP1_004(): eq_(cds, CTNNBIP1_004_CDS) -<<<<<<< HEAD -@test_ensembl_releases() -======= @run_multiple_genomes() ->>>>>>> upstream/master def test_equal_transcripts(genome): t1 = genome.genes_by_name("TP53")[0].transcripts[0] # get an identical gene @@ -225,18 +182,13 @@ def 
test_equal_transcripts(genome): eq_(hash(t1), hash(t2)) -<<<<<<< HEAD -@test_ensembl_releases() -======= @run_multiple_genomes() ->>>>>>> upstream/master def test_not_equal_transcripts(genome): t1 = genome.genes_by_name("MUC1")[0].transcripts[0] t2 = genome.genes_by_name("BRCA1")[0].transcripts[0] neq_(t1, t2) - def test_protein_id(): transcript = ensembl77.transcripts_by_name("EGFR-001")[0] eq_(transcript.protein_id, "ENSP00000275493") @@ -253,17 +205,6 @@ def test_transcript_gene_should_match_parent_gene(): eq_(transcript.gene, gene) -<<<<<<< HEAD -@test_ensembl_releases() -def test_BRCA1_201_has_protein_coding_biotype(genome): - transcript = genome.transcripts_by_name("BRCA1-201")[0] - assert transcript.is_protein_coding, ( - "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" - % ( - transcript, - genome, - ) -======= @run_multiple_genomes() def test_BRCA1_201_has_protein_coding_biotype(genome): transcript = genome.transcripts_by_name("BRCA1-201")[0] @@ -272,6 +213,5 @@ def test_BRCA1_201_has_protein_coding_biotype(genome): ), "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" % ( transcript, genome, ->>>>>>> upstream/master ) eq_(transcript.biotype, "protein_coding") diff --git a/tests/test_ucsc_gtf.py b/tests/test_ucsc_gtf.py index 57015df..b40c3ff 100644 --- a/tests/test_ucsc_gtf.py +++ b/tests/test_ucsc_gtf.py @@ -13,14 +13,7 @@ def test_ucsc_gencode_gtf(): df = db._load_gtf_as_dataframe() exons = df[df["feature"] == "exon"] # expect 12 exons from the dataframe -<<<<<<< HEAD - assert len(exons) == 12, "Expected 12 exons, got %d: %s" % ( - len(exons), - exons, - ) -======= assert len(exons) == 12, "Expected 12 exons, got %d: %s" % (len(exons), exons) ->>>>>>> upstream/master def test_ucsc_gencode_genome(): @@ -38,23 +31,11 @@ def test_ucsc_gencode_genome(): genome.index() genes = genome.genes() for gene in genes: -<<<<<<< HEAD - assert gene.id, "Gene with missing ID in %s" % ( - genome.gtf.dataframe(), - ) - assert len(genes) == 7, "Expected 7 genes, got %d: %s" % ( - len(genes), - genes, - ) -======= - assert gene.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),) + assert gene.id, "Gene with missing ID in %s" % (genome,) assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes) ->>>>>>> upstream/master transcripts = genome.transcripts() for transcript in transcripts: - assert transcript.id, "Transcript with missing ID in %s" % ( - genome.gtf.dataframe(), - ) + assert transcript.id, "Transcript with missing ID in %s" % (genome,) assert len(transcripts) == 7, "Expected 7 transcripts, got %d: %s" % ( len(transcripts), transcripts, @@ -82,14 +63,7 @@ def test_ucsc_refseq_gtf(): df = db._load_gtf_as_dataframe() exons = df[df["feature"] == "exon"] # expect 16 exons from the GTF -<<<<<<< HEAD - assert len(exons) == 16, "Expected 16 exons, got %d: %s" % ( - len(exons), - exons, - ) -======= assert len(exons) == 16, "Expected 16 exons, got %d: %s" % (len(exons), exons) ->>>>>>> upstream/master def test_ucsc_refseq_genome(): @@ -110,14 +84,7 @@ def test_ucsc_refseq_genome(): assert gene.id, "Gene with missing ID in %s" % ( genome.db._load_gtf_as_dataframe(), ) -<<<<<<< HEAD - assert len(genes) == 2, "Expected 2 genes, got %d: %s" % ( - len(genes), - genes, - ) -======= assert len(genes) == 2, "Expected 2 genes, got %d: %s" % (len(genes), genes) ->>>>>>> upstream/master transcripts = genome.transcripts() for transcript in transcripts: assert transcript.id, "Transcript with missing ID in %s" % (