diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d77d059..d3716aa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,7 +4,7 @@ jobs: build-and-test: strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -30,7 +30,7 @@ jobs: conda config --system --set channel_priority strict mamba create -y -n gffutils-env \ python=${{ matrix.python-version }} \ - --file requirements.txt + bedtools conda activate gffutils-env python setup.py clean sdist @@ -43,9 +43,9 @@ jobs: run: | source "${HOME}/conda/etc/profile.d/conda.sh" source "${HOME}/conda/etc/profile.d/mamba.sh" - mamba install -y -n gffutils-env --file optional-requirements.txt pytest hypothesis conda activate gffutils-env + pip install pytest hypothesis biopython pybedtools pytest -v --doctest-modules gffutils conda deactivate diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index 87fc3f7..f1f7545 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -3,6 +3,17 @@ Change log ========== +v0.13 +----- + +- Document options for avoiding deadlocks when simultaneously reading/writing + to a db on disk (fixes `#227 + <https://github.com/daler/gffutils/issues/227>`__). +- Support later versions of BioPython (fixes `#228 + <https://github.com/daler/gffutils/issues/228>`__). 
+- Drop support for Python 3.7 and unused ``six`` dependency; support Python + 3.11 and 3.12 (fixes `#223 <https://github.com/daler/gffutils/issues/223>`__) + v0.12 ----- diff --git a/gffutils/attributes.py b/gffutils/attributes.py index 937a4c5..0b88caf 100644 --- a/gffutils/attributes.py +++ b/gffutils/attributes.py @@ -1,4 +1,3 @@ -import six import collections try: @@ -95,7 +94,7 @@ def __str__(self): return "\n".join(s) def update(self, *args, **kwargs): - for k, v in six.iteritems(dict(*args, **kwargs)): + for k, v in dict(*args, **kwargs).items(): self[k] = v diff --git a/gffutils/biopython_integration.py b/gffutils/biopython_integration.py index b49e541..63356cd 100644 --- a/gffutils/biopython_integration.py +++ b/gffutils/biopython_integration.py @@ -2,7 +2,6 @@ Module for integration with BioPython, specifically SeqRecords and SeqFeature objects. """ -import six try: from Bio.SeqFeature import SeqFeature, FeatureLocation @@ -15,7 +14,8 @@ _biopython_strand = { "+": 1, "-": -1, - ".": 0, + ".": None, + "?": 0, } _feature_strand = dict((v, k) for k, v in _biopython_strand.items()) @@ -33,7 +33,7 @@ def to_seqfeature(feature): If string, assume it is a GFF or GTF-format line; otherwise just use the provided feature directly. 
""" - if isinstance(feature, six.string_types): + if isinstance(feature, str): feature = feature_from_line(feature) qualifiers = { @@ -46,10 +46,11 @@ def to_seqfeature(feature): return SeqFeature( # Convert from GFF 1-based to standard Python 0-based indexing used by # BioPython - FeatureLocation(feature.start - 1, feature.stop), + FeatureLocation( + feature.start - 1, feature.stop, strand=_biopython_strand[feature.strand] + ), id=feature.id, type=feature.featuretype, - strand=_biopython_strand[feature.strand], qualifiers=qualifiers, ) @@ -66,12 +67,12 @@ def from_seqfeature(s, **kwargs): score = s.qualifiers.get("score", ".")[0] seqid = s.qualifiers.get("seqid", ".")[0] frame = s.qualifiers.get("frame", ".")[0] - strand = _feature_strand[s.strand] + strand = _feature_strand[s.location.strand] # BioPython parses 1-based GenBank positions into 0-based for use within # Python. We need to convert back to 1-based GFF format here. - start = s.location.start.position + 1 - stop = s.location.end.position + start = s.location.start + 1 + stop = s.location.end featuretype = s.type id = s.id attributes = dict(s.qualifiers) diff --git a/gffutils/convert.py b/gffutils/convert.py index 6e9a7ba..ab2e286 100644 --- a/gffutils/convert.py +++ b/gffutils/convert.py @@ -2,8 +2,6 @@ Conversion functions that operate on :class:`FeatureDB` classes. """ -import six - def to_bed12(f, db, child_type="exon", name_field="ID"): """ @@ -22,7 +20,7 @@ def to_bed12(f, db, child_type="exon", name_field="ID"): Attribute to be used in the "name" field of the BED12 entry. Usually "ID" for GFF; "transcript_id" for GTF. 
""" - if isinstance(f, six.string_types): + if isinstance(f, str): f = db[f] children = list(db.children(f, featuretype=child_type, order_by="start")) sizes = [len(i) for i in children] diff --git a/gffutils/create.py b/gffutils/create.py index fbd4972..e137c1a 100644 --- a/gffutils/create.py +++ b/gffutils/create.py @@ -5,7 +5,6 @@ import sys import os import sqlite3 -import six from textwrap import dedent from gffutils import constants from gffutils import version @@ -119,7 +118,7 @@ def __init__( os.unlink(dbfn) self.dbfn = dbfn self.id_spec = id_spec - if isinstance(dbfn, six.string_types): + if isinstance(dbfn, str): conn = sqlite3.connect(dbfn) else: conn = dbfn @@ -171,7 +170,7 @@ def _id_handler(self, f): """ # If id_spec is a string or callable, convert to iterable for later - if isinstance(self.id_spec, six.string_types): + if isinstance(self.id_spec, str): id_key = [self.id_spec] elif hasattr(self.id_spec, "__call__"): id_key = [self.id_spec] @@ -181,7 +180,7 @@ def _id_handler(self, f): elif isinstance(self.id_spec, dict): try: id_key = self.id_spec[f.featuretype] - if isinstance(id_key, six.string_types): + if isinstance(id_key, str): id_key = [id_key] # Otherwise, use default auto-increment. @@ -217,7 +216,8 @@ def _id_handler(self, f): "a single value is required for a primary key in the " "database. 
Consider using a custom id_spec to " "convert these multiple values into a single " - "value".format(k)) + "value".format(k) + ) except KeyError: pass try: @@ -684,7 +684,7 @@ def _update_relations(self): # c.execute('CREATE INDEX childindex ON relations (child)') # self.conn.commit() - if isinstance(self._keep_tempfiles, six.string_types): + if isinstance(self._keep_tempfiles, str): suffix = self._keep_tempfiles else: suffix = ".gffutils" @@ -883,7 +883,7 @@ def _update_relations(self): msg = "transcript" logger.info("Inferring %s extents " "and writing to tempfile" % msg) - if isinstance(self._keep_tempfiles, six.string_types): + if isinstance(self._keep_tempfiles, str): suffix = self._keep_tempfiles else: suffix = ".gffutils" diff --git a/gffutils/feature.py b/gffutils/feature.py index 8972efa..6db720f 100644 --- a/gffutils/feature.py +++ b/gffutils/feature.py @@ -1,5 +1,4 @@ from pyfaidx import Fasta -import six import simplejson as json from gffutils import constants from gffutils import helpers @@ -166,7 +165,7 @@ def __init__( # for testing. attributes = attributes or dict_class() - if isinstance(attributes, six.string_types): + if isinstance(attributes, str): try: attributes = helpers._unjsonify(attributes, isattributes=True) @@ -182,7 +181,7 @@ def __init__( # If string, then try un-JSONifying it into a list; if that doesn't # work then assume it's tab-delimited and convert to a list. 
extra = extra or [] - if isinstance(extra, six.string_types): + if isinstance(extra, str): try: extra = helpers._unjsonify(extra) except json.JSONDecodeError: @@ -254,10 +253,7 @@ def __setitem__(self, key, value): self.attributes[key] = value def __str__(self): - if six.PY3: - return self.__unicode__() - else: - return unicode(self).encode("utf-8") + return self.__unicode__() def __unicode__(self): @@ -387,7 +383,7 @@ def sequence(self, fasta, use_strand=True): ------- string """ - if isinstance(fasta, six.string_types): + if isinstance(fasta, str): fasta = Fasta(fasta, as_raw=False) # recall GTF/GFF is 1-based closed; pyfaidx uses Python slice notation diff --git a/gffutils/gffwriter.py b/gffutils/gffwriter.py index cc64c24..c302692 100644 --- a/gffutils/gffwriter.py +++ b/gffutils/gffwriter.py @@ -1,7 +1,6 @@ ## ## GFF Writer (writer): serializing gffutils records as GFF text files. ## -import six import tempfile import shutil from time import strftime, localtime @@ -41,7 +40,7 @@ def __init__(self, out, with_header=True, in_place=False): self.temp_file = None # Output stream to write to self.out_stream = None - if isinstance(out, six.string_types): + if isinstance(out, str): if self.in_place: # Use temporary file self.temp_file = tempfile.NamedTemporaryFile(delete=False) diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 0e2f430..234994d 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -4,7 +4,6 @@ import simplejson as json import time import tempfile -import six from gffutils import constants from gffutils import bins import gffutils @@ -202,7 +201,7 @@ def make_query( # e.g., "featuretype = 'exon'" # # or, "featuretype IN ('exon', 'CDS')" - if isinstance(featuretype, six.string_types): + if isinstance(featuretype, str): d["FEATURETYPE"] = "features.featuretype = ?" 
args.append(featuretype) else: @@ -218,7 +217,7 @@ def make_query( # `limit` is a string or a tuple of (chrom, start, stop) # # e.g., "seqid = 'chr2L' AND start > 1000 AND end < 5000" - if isinstance(limit, six.string_types): + if isinstance(limit, str): seqid, startstop = limit.split(":") start, end = startstop.split("-") else: @@ -257,7 +256,7 @@ def make_query( # Default is essentially random order. # # e.g. "ORDER BY seqid, start DESC" - if isinstance(order_by, six.string_types): + if isinstance(order_by, str): _order_by.append(order_by) else: @@ -387,7 +386,7 @@ def merge_attributes(attr1, attr2, numeric_sort=False): if not isinstance(v, list): new_d[k] = [v] - for k, v in six.iteritems(attr1): + for k, v in attr1.items(): if k in attr2: if not isinstance(v, list): v = [v] @@ -507,9 +506,9 @@ def is_gff_db(db_fname): def to_unicode(obj, encoding="utf-8"): - if isinstance(obj, six.string_types): - if not isinstance(obj, six.text_type): - obj = six.text_type(obj, encoding) + if isinstance(obj, str): + if not isinstance(obj, str): + obj = str(obj, encoding) return obj @@ -520,7 +519,6 @@ def canonical_transcripts(db, fasta_filename): """ import pyfaidx - fasta = pyfaidx.Fasta(fasta_filename, as_raw=False) for gene in db.features_of_type("gene"): @@ -536,7 +534,20 @@ def canonical_transcripts(db, fasta_filename): cds_len += exon_length total_len += exon_length - exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']])) + exon_list.append( + ( + cds_len, + total_len, + transcript, + exons + if cds_len == 0 + else [ + e + for e in exons + if e.featuretype in ["CDS", "five_prime_UTR", "three_prime_UTR"] + ], + ) + ) # If we have CDS, then use the longest coding transcript if max(i[0] for i in exon_list) > 0: @@ -549,7 +560,12 @@ def canonical_transcripts(db, fasta_filename): canonical_exons = best[-1] transcript = best[-2] - seqs = [i.sequence(fasta) for i in 
sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')] + seqs = [ + i.sequence(fasta) + for i in sorted( + canonical_exons, key=lambda x: x.start, reverse=transcript.strand != "+" + ) + ] yield transcript, "".join(seqs) diff --git a/gffutils/interface.py b/gffutils/interface.py index 1579e4e..9216cd3 100644 --- a/gffutils/interface.py +++ b/gffutils/interface.py @@ -1,6 +1,5 @@ import collections import os -import six import sqlite3 import shutil import warnings @@ -102,7 +101,7 @@ def __init__( keep_order=False, pragmas=constants.default_pragmas, sort_attribute_values=False, - text_factory=str + text_factory=str, ): """ Connect to a database created by :func:`gffutils.create_db`. @@ -694,7 +693,7 @@ def region( "If region is supplied, do not supply seqid, " "start, or end as separate kwargs" ) - if isinstance(region, six.string_types): + if isinstance(region, str): toks = region.split(":") if len(toks) == 1: seqid = toks[0] @@ -774,7 +773,7 @@ def region( # Add the featuretype clause if featuretype is not None: - if isinstance(featuretype, six.string_types): + if isinstance(featuretype, str): featuretype = [featuretype] feature_clause = " or ".join(["featuretype = ?" for _ in featuretype]) query += " AND (%s) " % feature_clause @@ -872,10 +871,21 @@ def _init_interfeature(f): Used to initialize a new interfeature that is ready to be updated in-place. """ - keys = ['id', 'seqid', 'source', 'featuretype', 'start', 'end', - 'score', 'strand', 'frame', 'attributes', 'bin'] + keys = [ + "id", + "seqid", + "source", + "featuretype", + "start", + "end", + "score", + "strand", + "frame", + "attributes", + "bin", + ] d = dict(zip(keys, f.astuple())) - d['source'] = 'gffutils_derived' + d["source"] = "gffutils_derived" return d def _prep_for_yield(d): @@ -886,12 +896,12 @@ def _prep_for_yield(d): If start is greater than stop (which happens when trying to get interfeatures for overlapping features), then return None. 
""" - d['start'] += 1 - d['end'] -= 1 - new_bin = bins.bins(d['start'], d['end'], one=True) - d['bin'] = new_bin + d["start"] += 1 + d["end"] -= 1 + new_bin = bins.bins(d["start"], d["end"], one=True) + d["bin"] = new_bin - if d['start'] > d['end']: + if d["start"] > d["end"]: return None new_feature = self._feature_returner(**d) @@ -899,12 +909,13 @@ def _prep_for_yield(d): # concat list of ID to create uniq IDs because feature with # multiple values for their ID are no longer permitted since v0.11 if "ID" in new_feature.attributes and len(new_feature.attributes["ID"]) > 1: - new_id = '-'.join(new_feature.attributes["ID"]) + new_id = "-".join(new_feature.attributes["ID"]) new_feature.attributes["ID"] = [new_id] return new_feature # If not provided, use a no-op function instead. if not attribute_func: + def attribute_func(a): return a @@ -933,23 +944,23 @@ def attribute_func(a): nfeatures += 1 # Adjust the interfeature dict in-place with coords... - interfeature['start'] = last_feature.stop - interfeature['end'] = f.start + interfeature["start"] = last_feature.stop + interfeature["end"] = f.start # ...featuretype if new_featuretype is None: - interfeature['featuretype'] = "inter_%s_%s" % ( + interfeature["featuretype"] = "inter_%s_%s" % ( last_feature.featuretype, f.featuretype, ) else: - interfeature['featuretype'] = new_featuretype + interfeature["featuretype"] = new_featuretype # ...strand if last_feature.strand != f.strand: - interfeature['strand'] = '.' + interfeature["strand"] = "." 
else: - interfeature['strand'] = f.strand + interfeature["strand"] = f.strand # and attributes if merge_attributes: @@ -964,7 +975,7 @@ def attribute_func(a): if update_attributes: new_attributes.update(update_attributes) - interfeature['attributes'] = new_attributes + interfeature["attributes"] = new_attributes # Ready to yield new_feature = _prep_for_yield(interfeature) @@ -994,7 +1005,7 @@ def delete(self, features, make_backup=True, **kwargs): FeatureDB object, with features deleted. """ if make_backup: - if isinstance(self.dbfn, six.string_types): + if isinstance(self.dbfn, str): shutil.copy2(self.dbfn, self.dbfn + ".bak") c = self.conn.cursor() @@ -1006,12 +1017,12 @@ def delete(self, features, make_backup=True, **kwargs): """ if isinstance(features, FeatureDB): features = features.all_features() - if isinstance(features, six.string_types): + if isinstance(features, str): features = [features] if isinstance(features, Feature): features = [features] for feature in features: - if isinstance(feature, six.string_types): + if isinstance(feature, str): _id = feature else: _id = feature.id @@ -1027,7 +1038,13 @@ def update(self, data, make_backup=True, **kwargs): WARNING: If you used any non-default kwargs for gffutils.create_db when creating the database in the first place (especially `disable_infer_transcripts` or `disable_infer_genes`) then you should - use those same arguments here. + use those same arguments here. The exception is the `force` argument + though -- in some cases including that can truncate the database. + + WARNING: If you are creating features from the database and writing + immediately back to the database, you could experience deadlocks. See + the help for `create_introns` for some different options for avoiding + this. 
The returned object is the same FeatureDB, but since it is pointing to the same database and that has been just updated, the new features can @@ -1060,7 +1077,7 @@ def update(self, data, make_backup=True, **kwargs): from gffutils import iterators if make_backup: - if isinstance(self.dbfn, six.string_types): + if isinstance(self.dbfn, str): shutil.copy2(self.dbfn, self.dbfn + ".bak") # get iterator-specific kwargs @@ -1139,9 +1156,9 @@ def child_func(parent, child): ------- FeatureDB object with new relations added. """ - if isinstance(parent, six.string_types): + if isinstance(parent, str): parent = self[parent] - if isinstance(child, six.string_types): + if isinstance(child, str): child = self[child] c = self.conn.cursor() @@ -1234,9 +1251,42 @@ def create_introns( ----- The returned generator can be passed directly to the :meth:`FeatureDB.update` method to permanently add them to the - database, e.g., :: + database. However, this needs to be done carefully to avoid deadlocks + from simultaneous reading/writing. + + When using `update()` you should also use the same keyword arguments + used to create the db in the first place (with the exception of `force`). + + Here are three options for getting the introns back into the database, + depending on the circumstances. + + **OPTION 1: Create list of introns.** + + Consume the `create_introns()` generator completely before writing to + the database. If you have sufficient memory, this is the easiest + option:: + + db.update(list(db.create_introns(**intron_kwargs)), **create_kwargs) + + **OPTION 2: Use `WAL <https://www.sqlite.org/wal.html>`__** + + The WAL pragma enables simultaneous read/write. WARNING: this does not + work if the database is on a networked filesystem, like those used on + many HPC clusters. 
+ + :: - db.update(db.create_introns()) + db.set_pragmas({"journal_mode": "WAL"}) + db.update(db.create_introns(**intron_kwargs), **create_kwargs) + + **OPTION 3: Write to intermediate file.** + + Use this if you are memory limited and using a networked filesystem:: + + with open('tmp.gtf', 'w') as fout: + for intron in db.create_introns(**intron_kwargs): + fout.write(str(intron) + "\n") + db.update(gffutils.DataIterator('tmp.gtf'), **create_kwargs) """ if (grandparent_featuretype and parent_featuretype) or ( @@ -1390,11 +1440,12 @@ def child_gen(): splice_site.start = splice_site.end - 1 # make ID uniq by adding suffix - splice_site.attributes["ID"] = [new_featuretype + "_" + splice_site.attributes["ID"][0]] + splice_site.attributes["ID"] = [ + new_featuretype + "_" + splice_site.attributes["ID"][0] + ] yield splice_site - def _old_merge(self, features, ignore_strand=False): """ DEPRECATED, only retained here for backwards compatibility. Please use @@ -1710,10 +1761,12 @@ def merge_all( return result_features def children_bp( - self, feature, child_featuretype="exon", merge=False, - merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, - mc.feature_type), - **kwargs + self, + feature, + child_featuretype="exon", + merge=False, + merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type), + **kwargs ): """ Total bp of all children of a featuretype. @@ -1752,9 +1805,14 @@ def children_bp( raise ValueError( "'ignore_strand' has been deprecated; please use " "merge_criteria to control how features should be merged. " - "E.g., leave out the 'mc.strand' criteria to ignore strand.") + "E.g., leave out the 'mc.strand' criteria to ignore strand." 
+ ) else: - raise TypeError("merge() got unexpected keyword arguments '{}'".format(kwargs.keys())) + raise TypeError( + "merge() got unexpected keyword arguments '{}'".format( + kwargs.keys() + ) + ) children = self.children( feature, featuretype=child_featuretype, order_by="start" @@ -1941,7 +1999,6 @@ def seqids(self): for (i,) in c: yield i - # Recycle the docs for _relation so they stay consistent between parents() # and children() children.__doc__ = children.__doc__.format(_relation_docstring=_relation.__doc__) diff --git a/gffutils/iterators.py b/gffutils/iterators.py index 75b85d2..ed0c838 100644 --- a/gffutils/iterators.py +++ b/gffutils/iterators.py @@ -16,13 +16,8 @@ from gffutils.interface import FeatureDB from gffutils import helpers from textwrap import dedent -import six -from six.moves.urllib.request import urlopen - -if six.PY3: - from urllib import parse as urlparse -else: - import urlparse +from urllib.request import urlopen +from urllib import parse as urlparse class Directive(object): @@ -133,7 +128,7 @@ def _custom_iter(self): valid_lines = 0 with self.open_function(self.data) as fh: for i, line in enumerate(fh): - if isinstance(line, six.binary_type): + if isinstance(line, bytes): line = line.decode("utf-8") line = line.rstrip("\n\r") self.current_item = line @@ -295,11 +290,11 @@ def DataIterator( force_dialect_check=force_dialect_check, **kwargs, ) - if isinstance(data, six.string_types): + if isinstance(data, str): if from_string: tmp = tempfile.NamedTemporaryFile(delete=False) data = dedent(data) - if isinstance(data, six.text_type): + if isinstance(data, str): data = data.encode("utf-8") tmp.write(data) tmp.close() diff --git a/gffutils/parser.py b/gffutils/parser.py index ad2853d..058423a 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -3,7 +3,7 @@ import re import copy import collections -from six.moves import urllib +import urllib from gffutils import constants from gffutils.exceptions import AttributeStringError diff --git 
a/gffutils/pybedtools_integration.py b/gffutils/pybedtools_integration.py index c5ad938..5c5c2b9 100644 --- a/gffutils/pybedtools_integration.py +++ b/gffutils/pybedtools_integration.py @@ -6,7 +6,6 @@ import pybedtools from pybedtools import featurefuncs from gffutils import helpers -import six def to_bedtool(iterator): @@ -210,7 +209,7 @@ def gen(): if merge_overlapping or as_bed6: - if isinstance(attrs, six.string_types): + if isinstance(attrs, str): attrs = [attrs] def to_bed(f): diff --git a/gffutils/scripts/gffutils-cli b/gffutils/scripts/gffutils-cli index f9dc83e..051b76d 100755 --- a/gffutils/scripts/gffutils-cli +++ b/gffutils/scripts/gffutils-cli @@ -1,7 +1,5 @@ #!/usr/bin/python -from __future__ import print_function - """ Command line interface for gffutils. diff --git a/gffutils/test/conftest.py b/gffutils/test/conftest.py index 47d768e..d4d1c0a 100644 --- a/gffutils/test/conftest.py +++ b/gffutils/test/conftest.py @@ -1 +1 @@ -collect_ignore=["data"] +collect_ignore = ["data"] diff --git a/gffutils/test/test_1.py b/gffutils/test/test_1.py index c603b8e..2b88cc0 100644 --- a/gffutils/test/test_1.py +++ b/gffutils/test/test_1.py @@ -9,11 +9,10 @@ import gffutils.iterators as iterators import sys import os -import six import shutil import threading import tempfile -from six.moves import SimpleHTTPServer +import http.server as SimpleHTTPServer if sys.version_info.major == 3: import socketserver as SocketServer @@ -395,7 +394,7 @@ def test_gffwriter(): ), "unsanitized.gff should not have a gffutils-style header." 
db_in = gffutils.create_db(fn, ":memory:", keep_order=True) # Fetch first record - rec = six.next(db_in.all_features()) + rec = next(db_in.all_features()) ## ## Write GFF file in-place test ## @@ -551,7 +550,7 @@ def test_nonascii(): # ...but fails using plain nosetests or when using regular Python # interpreter except UnicodeEncodeError: - print(six.text_type(i)) + print(str(i)) def test_feature_merge(): @@ -637,7 +636,7 @@ def test_feature_merge(): id_spec="gene_id", force_merge_fields=["start"], keep_order=True, - ) + ) # test that warnings are raised because of strand and frame with warnings.catch_warnings(record=True) as w: @@ -903,7 +902,7 @@ def gen(): db.update(gen(), merge_strategy="replace") assert len(list(db.all_features())) == 12 assert len(list(db.features_of_type("gene"))) == 1 - g = six.next(db.features_of_type("gene")) + g = next(db.features_of_type("gene")) assert g.start == 1, g.start assert g.stop == 100, g.stop @@ -924,7 +923,7 @@ def _transform(f): ) assert len(list(db.all_features())) == 12 assert len(list(db.features_of_type("gene"))) == 1 - g = six.next(db.features_of_type("gene")) + g = next(db.features_of_type("gene")) print(g) assert g.start == 1, g.start assert g.stop == 100, g.stop @@ -934,17 +933,19 @@ def _transform(f): [(i.start, i.stop) for i in db.features_of_type("exon")] ) + def clean_tempdir(): tempfile.tempdir = tempdir if os.path.exists(tempdir): shutil.rmtree(tempdir) os.makedirs(tempdir) + # specify a writeable temp dir for testing tempdir = "/tmp/gffutils-test" -def test_tempfiles(): +def test_tempfiles(): clean_tempdir() @@ -992,6 +993,7 @@ def test_tempfiles(): assert len(filelist) == 1, filelist assert filelist[0].endswith(".GFFtmp") + @pytest.mark.skip(reason="Unclear if still needed; currently failing") def test_parallel_db(): # DISABLING in v0.12 @@ -1114,7 +1116,7 @@ def test_deprecation_handler(): gffutils.example_filename("FBgn0031208.gtf"), ":memory:", infer_gene_extent=False, - ) + ) def test_nonsense_kwarg(): 
@@ -1123,7 +1125,7 @@ def test_nonsense_kwarg(): gffutils.example_filename("FBgn0031208.gtf"), ":memory:", asdf=True, - ) + ) def test_infer_gene_extent(): @@ -1242,7 +1244,8 @@ def test_create_splice_sites(): db = gffutils.create_db(fn, ":memory:") db = db.update(db.create_splice_sites()) observed = "\n".join(str(feature) for feature in db.all_features()) - expected = dedent("""\ + expected = dedent( + """\ chr1 ensGene gene 4763287 4775820 . - . Name=ENSMUSG00000033845;ID=ENSMUSG00000033845;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845 chr1 ensGene mRNA 4764517 4775779 . - . Name=ENSMUST00000045689;Parent=ENSMUSG00000033845;ID=ENSMUST00000045689;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845 chr1 ensGene CDS 4775654 4775758 . - 0 Name=ENSMUST00000045689.cds0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds0;gid=ENSMUSG00000033845 @@ -1260,13 +1263,12 @@ def test_create_splice_sites(): chr1 gffutils_derived three_prime_cis_splice_site 4772815 4772816 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845 chr1 gffutils_derived five_prime_cis_splice_site 4767604 4767605 . - . Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845 chr1 gffutils_derived five_prime_cis_splice_site 4772647 4772648 . - . Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845 - chr1 gffutils_derived five_prime_cis_splice_site 4775652 4775653 . - . 
Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845""") + chr1 gffutils_derived five_prime_cis_splice_site 4775652 4775653 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845""" + ) assert observed == expected - - if __name__ == "__main__": # this test case fails # test_attributes_modify() diff --git a/gffutils/test/test_biopython_integration.py b/gffutils/test/test_biopython_integration.py index c0d4f2a..58c5866 100644 --- a/gffutils/test/test_biopython_integration.py +++ b/gffutils/test/test_biopython_integration.py @@ -13,8 +13,8 @@ def test_roundtrip(): feature.keep_order = True dialect = feature.dialect s = bp.to_seqfeature(feature) - assert s.location.start.position == feature.start - 1 - assert s.location.end.position == feature.stop + assert s.location.start == feature.start - 1 + assert s.location.end == feature.stop assert s.id == feature.id f = bp.from_seqfeature(s, dialect=dialect, keep_order=True) assert feature == f diff --git a/gffutils/test/test_cli.py b/gffutils/test/test_cli.py new file mode 100644 index 0000000..bf49999 --- /dev/null +++ b/gffutils/test/test_cli.py @@ -0,0 +1,18 @@ +import subprocess as sp +import gffutils +from gffutils import example_filename, create, feature + + +def test_issue_224(): + fn = gffutils.example_filename("FBgn0031208.gtf") + sp.run(["gffutils-cli", "create", "--force", fn]) + p = sp.run( + ["gffutils-cli", "children", fn + ".db", "FBgn0031208"], + check=True, + capture_output=True, + universal_newlines=True, + ) + assert ( + p.stdout.splitlines()[0] + == 'chr2L\tgffutils_derived\tgene\t7529\t9484\t.\t+\t.\tgene_id "FBgn0031208";' + ) diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py index 74171db..79996ba 100644 --- 
a/gffutils/test/test_issues.py +++ b/gffutils/test/test_issues.py @@ -13,6 +13,7 @@ import pytest + def test_issue_79(): gtf = gffutils.example_filename("keep-order-test.gtf") db = gffutils.create_db( @@ -291,41 +292,41 @@ def test_issue_128(): # In #128, some lines had separators of "; " and some with ";". The first # one in the file would win. Now the detection pays more attention to lines # with more attributes to make it work properly - gff = gffutils.example_filename('gms2_example.gff3') + gff = gffutils.example_filename("gms2_example.gff3") db = gffutils.create_db(gff, ":memory:", force=True) expected = { - 'ID': ['1'], - 'Parent': ['gene_1'], - 'gene_type': ['native'], - 'partial': ['11'], - 'gc': ['33'], - 'length': ['363'], + "ID": ["1"], + "Parent": ["gene_1"], + "gene_type": ["native"], + "partial": ["11"], + "gc": ["33"], + "length": ["363"], } - assert dict(db['1'].attributes) == expected + assert dict(db["1"].attributes) == expected def test_issue_157(): # With the merge overhaul, children_bp incorrectly still used ignore_strand. - db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ":memory:") - gene = next(db.features_of_type('gene')) - children = list(db.children(gene, featuretype='exon')) + db = gffutils.create_db(gffutils.example_filename("FBgn0031208.gff"), ":memory:") + gene = next(db.features_of_type("gene")) + children = list(db.children(gene, featuretype="exon")) # Modify the last one to have a different strand so we can test the # ignore_strand argument. - children[-1].strand = '-' - db.update(children[-1:], merge_strategy='replace') + children[-1].strand = "-" + db.update(children[-1:], merge_strategy="replace") # and, since updating has been problematic in the past, double-check again # that the strand is changed in the db. 
- assert list(db.children(gene, featuretype='exon'))[-1].strand == '-' - cbp1 = db.children_bp(gene, child_featuretype='exon') + assert list(db.children(gene, featuretype="exon"))[-1].strand == "-" + cbp1 = db.children_bp(gene, child_featuretype="exon") # Previously this would give: # TypeError: merge() got an unexpected keyword argument 'ignore_strand' # - # Now changing to ValueError and suggesting a fix. + # Now changing to ValueError and suggesting a fix. with pytest.raises(ValueError): - db.children_bp(gene, child_featuretype='exon', merge=True, ignore_strand=True) + db.children_bp(gene, child_featuretype="exon", merge=True, ignore_strand=True) with pytest.raises(ValueError): db.children_bp(gene, ignore_strand=True, nonexistent=True) with pytest.raises(TypeError): @@ -333,12 +334,17 @@ def test_issue_157(): # The way to do it now is the following (we can omit the mc.feature_type # since we're preselecting for exons anyway): - db.children_bp(gene, child_featuretype='exon', merge=True, merge_criteria=(mc.overlap_end_inclusive)) + db.children_bp( + gene, + child_featuretype="exon", + merge=True, + merge_criteria=(mc.overlap_end_inclusive), + ) def test_issue_159(): - db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ":memory:") - fasta = gffutils.example_filename('dm6-chr2L.fa') + db = gffutils.create_db(gffutils.example_filename("FBgn0031208.gff"), ":memory:") + fasta = gffutils.example_filename("dm6-chr2L.fa") for transcript, seq in gffutils.helpers.canonical_transcripts(db, fasta): pass @@ -347,56 +353,59 @@ def test_issue_164(): # Something strange with the original annotation, but seems fine at least # after pasting in the offending genes from the GitHub comments. 
db = gffutils.create_db( - gffutils.example_filename('sharr.gtf'), - ':memory:', + gffutils.example_filename("sharr.gtf"), + ":memory:", disable_infer_transcripts=True, disable_infer_genes=True, - id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'}, - merge_strategy='create_unique', - keep_order=True) + id_spec={"gene": "gene_id", "transcript": "transcript_id"}, + merge_strategy="create_unique", + keep_order=True, + ) def test_issue_166(): # Added the new FeatureDB.seqids() method. - db = gffutils.create_db(gffutils.example_filename('nonascii'), ':memory:') + db = gffutils.create_db(gffutils.example_filename("nonascii"), ":memory:") seqs = list(db.seqids()) - assert seqs == ['2L', '2R', '3L', '3R', 'X'], seqs + assert seqs == ["2L", "2R", "3L", "3R", "X"], seqs def test_issue_167(): # Previously was causing sqlite3.InterfaceError - db = gffutils.create_db(gffutils.example_filename('issue167.gff'), ':memory:') + db = gffutils.create_db(gffutils.example_filename("issue167.gff"), ":memory:") def test_issue_174(): db = gffutils.create_db( - gffutils.example_filename('issue174.gtf'), - ':memory:', - merge_strategy='warning', + gffutils.example_filename("issue174.gtf"), + ":memory:", + merge_strategy="warning", ) introns = [f for f in db.create_introns()] - observed = [i.attributes['exon_number'] for i in introns] - assert observed[7] == ['8', '9'] - assert observed[8] == ['10', '9'] - assert observed[9] == ['10', '11'] + observed = [i.attributes["exon_number"] for i in introns] + assert observed[7] == ["8", "9"] + assert observed[8] == ["10", "9"] + assert observed[9] == ["10", "11"] # Now do the same thing, but with the new numeric_sort arg introns = [f for f in db.create_introns(numeric_sort=True)] - observed = [i.attributes['exon_number'] for i in introns] - assert observed[7] == ['8', '9'] + observed = [i.attributes["exon_number"] for i in introns] + assert observed[7] == ["8", "9"] # This should be fixed: - assert observed[8] == ['9', '10'] - assert 
observed[9] == ['10', '11'] + assert observed[8] == ["9", "10"] + assert observed[9] == ["10", "11"] + def test_issue_197(): # Previously this would fail with ValueError due to using the stop position # of the last item on the previous chrom as the start position. - db = gffutils.create_db(gffutils.example_filename('issue_197.gff'), ':memory:', merge_strategy='error') - genes = list(db.features_of_type('gene')) - igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') ) - + db = gffutils.create_db( + gffutils.example_filename("issue_197.gff"), ":memory:", merge_strategy="error" + ) + genes = list(db.features_of_type("gene")) + igss = list(db.interfeatures(genes, new_featuretype="intergenic_space")) # Prior to PR #219, multiple IDs could be created by interfeatures, which # in turn was patched here by providing the transform to db.update. With @@ -405,23 +414,24 @@ def test_issue_197(): # # However, keeping the test as-is to ensure backward-compatibility. def transform(f): - f['ID'] = [ '-'.join(f.attributes['ID']) ] + f["ID"] = ["-".join(f.attributes["ID"])] return f - db = db.update(igss, transform=transform, merge_strategy='error') + db = db.update(igss, transform=transform, merge_strategy="error") - obs = list(db.features_of_type('intergenic_space')) + obs = list(db.features_of_type("intergenic_space")) for i in obs: print(i) assert [str(i) for i in obs] == [ - 'tig00000492\tgffutils_derived\tintergenic_space\t47236\t47350\t.\t-\t.\tID=ctg492.gene0001-ctg492.gene0002;Name=gene0001,gene0002', - 'tig00000492\tgffutils_derived\tintergenic_space\t48257\t49999\t.\t-\t.\tID=ctg492.gene0002-gene0;Name=gene0002', - 'tig00000492\tgffutils_derived\tintergenic_space\t50050\t50054\t.\t-\t.\tID=gene3-gene4', - 'tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5', - 'tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6', + 
"tig00000492\tgffutils_derived\tintergenic_space\t47236\t47350\t.\t-\t.\tID=ctg492.gene0001-ctg492.gene0002;Name=gene0001,gene0002", + "tig00000492\tgffutils_derived\tintergenic_space\t48257\t49999\t.\t-\t.\tID=ctg492.gene0002-gene0;Name=gene0002", + "tig00000492\tgffutils_derived\tintergenic_space\t50050\t50054\t.\t-\t.\tID=gene3-gene4", + "tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5", + "tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6", ] + def test_issue_198(): line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; db_xref "HGNC:HGNC:38034"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' @@ -440,7 +450,7 @@ def test_issue_198(): # of repeated keys always wins. f = feature.feature_from_line(line) - assert f.attributes['description'] == ['WASP family homolog 7, pseudogene'] + assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] # If we remove one of the db_xref keys, then the parser sees the comma and # figures it's a multivalue key. @@ -449,59 +459,62 @@ def test_issue_198(): # Previous result, note leading space --------------------------->| | # assert f.attributes['description'] == ['WASP family homolog 7', ' pseudogene'] - assert f.attributes['description'] == ['WASP family homolog 7, pseudogene'] + assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] # But removing that space before "pseudogene" means it's interpreted as # a multivalue attribute line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . 
gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' f = feature.feature_from_line(line) - assert f.attributes['description'] == ['WASP family homolog 7', 'pseudogene'] + assert f.attributes["description"] == ["WASP family homolog 7", "pseudogene"] # Confirm behavior of corner cases like a trailing comma line = "chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1," f = feature.feature_from_line(line) - assert f.attributes['Parent'] == ['XM_001475631.1', ''] + assert f.attributes["Parent"] == ["XM_001475631.1", ""] def test_issue_207(): - def _check(txt, expected_keys, dialect_trailing_semicolon): - db = gffutils.create_db(txt.replace(' ', '\t'), ':memory:', from_string=True) + db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True) assert [list(f.attributes.keys()) for f in db.all_features()] == expected_keys - assert db.dialect['trailing semicolon'] == dialect_trailing_semicolon + assert db.dialect["trailing semicolon"] == dialect_trailing_semicolon # All lines have trailing semicolon _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1; chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1; - """), - expected_keys = [ - ['ID'], - ['ID', 'Parent'], - ['ID', 'Parent'], - ['ID', 'Parent'], + """ + ), + expected_keys=[ + ["ID"], + ["ID", "Parent"], + ["ID", "Parent"], + ["ID", "Parent"], ], - dialect_trailing_semicolon=True + dialect_trailing_semicolon=True, ) # First two lines have trailing semicolon. 
However, the heuristics of # dialect selection, which favor attributes with more values (assuming more # information), decides that this file does NOT have trailing semicolons. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1 chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1 - """), - expected_keys = [ - ['ID', ''], - ['ID', 'Parent', ''], - ['ID', 'Parent'], - ['ID', 'Parent'], + """ + ), + expected_keys=[ + ["ID", ""], + ["ID", "Parent", ""], + ["ID", "Parent"], + ["ID", "Parent"], ], dialect_trailing_semicolon=False, ) @@ -514,18 +527,15 @@ def _check(txt, expected_keys, dialect_trailing_semicolon): # as above to give higher weight, and to break the tie between with and # without trailing semicolon, falls back to first dialect observed. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1 chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1 - """), - expected_keys=[ - ['ID'], - ['ID'], - ['ID'], - ['ID'] - ], + """ + ), + expected_keys=[["ID"], ["ID"], ["ID"], ["ID"]], dialect_trailing_semicolon=True, ) @@ -533,39 +543,37 @@ def _check(txt, expected_keys, dialect_trailing_semicolon): # semicolon by giving one more line as evidence. Only difference is from # above is the last line. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1 chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1 chr1 Pfam protein_hmm_match 73372 73618 1 - . 
ID=g1904.t1.d1.1 - """), + """ + ), expected_keys=[ - ['ID', ''], - ['ID', ''], - ['ID'], - ['ID'], - ['ID'], + ["ID", ""], + ["ID", ""], + ["ID"], + ["ID"], + ["ID"], ], dialect_trailing_semicolon=False, ) - # Again seems inconsistent at first, but heuristics break ties by # preferring first dialect, which here is no trailing semicolon. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903 chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1 chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1; chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1; - """), - expected_keys=[ - ['ID'], - ['ID'], - ['ID', ''], - ['ID', ''] - ], + """ + ), + expected_keys=[["ID"], ["ID"], ["ID", ""], ["ID", ""]], dialect_trailing_semicolon=False, ) @@ -587,7 +595,6 @@ def test_issue_213(): it = gffutils.iterators.DataIterator(data, from_string=True) assert it.directives == ["gff-version 3"] - # Ensure they're parsed into the db from a string db = gffutils.create_db(data, dbfn=":memory:", from_string=True, verbose=False) assert db.directives == ["gff-version 3"], db.directives @@ -602,6 +609,6 @@ def test_issue_213(): # Ensure they're parsed into the db from a file, and going to a file (to # exactly replicate example in #213) - db = gffutils.create_db(tmp, dbfn='issue_213.db', force=True) + db = gffutils.create_db(tmp, dbfn="issue_213.db", force=True) assert db.directives == ["gff-version 3"], db.directives assert len(db.directives) == 1 diff --git a/gffutils/version.py b/gffutils/version.py index 72e040c..70fcf05 100644 --- a/gffutils/version.py +++ b/gffutils/version.py @@ -1 +1 @@ -version = "0.12" +version = "0.13" diff --git a/requirements.txt b/requirements.txt index 0f374d3..3defc96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ pyfaidx>=0.5.5.2 -six>=1.12.0 argh>=0.26.2 argcomplete>=1.9.4 simplejson