V0.13rc #231

Merged · 12 commits · Apr 13, 2024
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
@@ -4,7 +4,7 @@ jobs:
build-and-test:
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
@@ -30,7 +30,7 @@ jobs:
conda config --system --set channel_priority strict
mamba create -y -n gffutils-env \
python=${{ matrix.python-version }} \
--file requirements.txt
bedtools

conda activate gffutils-env
python setup.py clean sdist
@@ -43,9 +43,9 @@ jobs:
run: |
source "${HOME}/conda/etc/profile.d/conda.sh"
source "${HOME}/conda/etc/profile.d/mamba.sh"
mamba install -y -n gffutils-env --file optional-requirements.txt pytest hypothesis

conda activate gffutils-env
pip install pytest hypothesis biopython pybedtools
pytest -v --doctest-modules gffutils
conda deactivate

11 changes: 11 additions & 0 deletions doc/source/changelog.rst
@@ -3,6 +3,17 @@
Change log
==========

v0.13
-----

- Document options for avoiding deadlocks when simultaneously reading/writing
to a db on disk (fixes `#227
<https://github.com/daler/gffutils/issues/227>`__).
- Support later versions of BioPython (fixes `#228
<https://github.com/daler/gffutils/issues/228>`__).
- Drop support for Python 3.7 and unused ``six`` dependency; support Python
3.11 and 3.12 (fixes `#223 <https://github.com/daler/gffutils/issues/223>`__).

v0.12
-----

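Most of the Python changes in the files that follow are the mechanical removal of the ``six`` compatibility layer noted in the changelog. A minimal standalone sketch of the substitutions applied throughout (plain Python, not gffutils code):

    # six-era idiom                      Python 3 replacement used in this PR
    # isinstance(x, six.string_types) -> isinstance(x, str)
    # six.iteritems(d)                -> d.items()
    # six.text_type(x, "utf-8")       -> str(x, "utf-8")

    attrs = {"ID": ["gene1"], "Name": ["abc"]}
    for key, values in attrs.items():   # formerly six.iteritems(attrs)
        print(key, values)

    line = "chr1\tsrc\tgene\t1\t100\t.\t+\t.\tID=gene1"
    if isinstance(line, str):           # formerly isinstance(line, six.string_types)
        fields = line.split("\t")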
3 changes: 1 addition & 2 deletions gffutils/attributes.py
@@ -1,4 +1,3 @@
import six
import collections

try:
@@ -95,7 +94,7 @@ def __str__(self):
return "\n".join(s)

def update(self, *args, **kwargs):
for k, v in six.iteritems(dict(*args, **kwargs)):
for k, v in dict(*args, **kwargs).items():
self[k] = v


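The rewritten ``update`` above keeps the dict-style calling convention while routing every assignment through ``__setitem__``; a small illustration of the same pattern with a hypothetical stand-in class:

    class AttributesLike(dict):
        # Mirror gffutils.attributes: accept the same arguments as dict.update()
        # but assign via __setitem__ so subclass hooks still run.
        def update(self, *args, **kwargs):
            for k, v in dict(*args, **kwargs).items():
                self[k] = v

    a = AttributesLike()
    a.update({"ID": ["gene1"]}, Name=["abc"])
    print(a)   # {'ID': ['gene1'], 'Name': ['abc']}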
17 changes: 9 additions & 8 deletions gffutils/biopython_integration.py
@@ -2,7 +2,6 @@
Module for integration with BioPython, specifically SeqRecords and SeqFeature
objects.
"""
import six

try:
from Bio.SeqFeature import SeqFeature, FeatureLocation
@@ -15,7 +14,8 @@
_biopython_strand = {
"+": 1,
"-": -1,
".": 0,
".": None,
"?": 0,
}
_feature_strand = dict((v, k) for k, v in _biopython_strand.items())

@@ -33,7 +33,7 @@ def to_seqfeature(feature):
If string, assume it is a GFF or GTF-format line; otherwise just use
the provided feature directly.
"""
if isinstance(feature, six.string_types):
if isinstance(feature, str):
feature = feature_from_line(feature)

qualifiers = {
@@ -46,10 +46,11 @@ def to_seqfeature(feature):
return SeqFeature(
# Convert from GFF 1-based to standard Python 0-based indexing used by
# BioPython
FeatureLocation(feature.start - 1, feature.stop),
FeatureLocation(
feature.start - 1, feature.stop, strand=_biopython_strand[feature.strand]
),
id=feature.id,
type=feature.featuretype,
strand=_biopython_strand[feature.strand],
qualifiers=qualifiers,
)

@@ -66,12 +67,12 @@ def from_seqfeature(s, **kwargs):
score = s.qualifiers.get("score", ".")[0]
seqid = s.qualifiers.get("seqid", ".")[0]
frame = s.qualifiers.get("frame", ".")[0]
strand = _feature_strand[s.strand]
strand = _feature_strand[s.location.strand]

# BioPython parses 1-based GenBank positions into 0-based for use within
# Python. We need to convert back to 1-based GFF format here.
start = s.location.start.position + 1
stop = s.location.end.position
start = s.location.start + 1
stop = s.location.end
featuretype = s.type
id = s.id
attributes = dict(s.qualifiers)
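Beyond the ``six`` cleanup, the functional change here is that the strand now lives on the ``FeatureLocation`` rather than being passed to ``SeqFeature`` (later BioPython versions dropped it there), and GFF '.' now maps to ``None`` while '?' maps to ``0``. A hedged round-trip sketch, assuming a recent BioPython (>= 1.81) is installed:

    from Bio.SeqFeature import SeqFeature, FeatureLocation

    gff_to_bio = {"+": 1, "-": -1, ".": None, "?": 0}
    bio_to_gff = {v: k for k, v in gff_to_bio.items()}

    # GFF is 1-based inclusive; BioPython locations are 0-based half-open.
    start, stop, strand = 100, 200, "+"
    loc = FeatureLocation(start - 1, stop, strand=gff_to_bio[strand])
    feat = SeqFeature(loc, type="exon", id="exon1")

    # Converting back recovers the original GFF coordinates and strand.
    assert int(feat.location.start) + 1 == start
    assert int(feat.location.end) == stop
    assert bio_to_gff[feat.location.strand] == strand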
4 changes: 1 addition & 3 deletions gffutils/convert.py
@@ -2,8 +2,6 @@
Conversion functions that operate on :class:`FeatureDB` classes.
"""

import six


def to_bed12(f, db, child_type="exon", name_field="ID"):
"""
@@ -22,7 +20,7 @@ def to_bed12(f, db, child_type="exon", name_field="ID"):
Attribute to be used in the "name" field of the BED12 entry. Usually
"ID" for GFF; "transcript_id" for GTF.
"""
if isinstance(f, six.string_types):
if isinstance(f, str):
f = db[f]
children = list(db.children(f, featuretype=child_type, order_by="start"))
sizes = [len(i) for i in children]
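``to_bed12`` is otherwise unchanged: it accepts either a Feature or a plain ``str`` ID. A usage sketch in which the database path and transcript ID are hypothetical placeholders:

    import gffutils
    from gffutils.convert import to_bed12

    db = gffutils.FeatureDB("annotation.db")    # hypothetical database
    transcript = db["FBtr0300689"]              # hypothetical transcript ID
    print(to_bed12(transcript, db, child_type="exon", name_field="ID"))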
14 changes: 7 additions & 7 deletions gffutils/create.py
@@ -5,7 +5,6 @@
import sys
import os
import sqlite3
import six
from textwrap import dedent
from gffutils import constants
from gffutils import version
@@ -119,7 +118,7 @@ def __init__(
os.unlink(dbfn)
self.dbfn = dbfn
self.id_spec = id_spec
if isinstance(dbfn, six.string_types):
if isinstance(dbfn, str):
conn = sqlite3.connect(dbfn)
else:
conn = dbfn
@@ -171,7 +170,7 @@ def _id_handler(self, f):
"""

# If id_spec is a string or callable, convert to iterable for later
if isinstance(self.id_spec, six.string_types):
if isinstance(self.id_spec, str):
id_key = [self.id_spec]
elif hasattr(self.id_spec, "__call__"):
id_key = [self.id_spec]
@@ -181,7 +180,7 @@ def _id_handler(self, f):
elif isinstance(self.id_spec, dict):
try:
id_key = self.id_spec[f.featuretype]
if isinstance(id_key, six.string_types):
if isinstance(id_key, str):
id_key = [id_key]

# Otherwise, use default auto-increment.
@@ -217,7 +216,7 @@ def _id_handler(self, f):
"a single value is required for a primary key in the "
"database. Consider using a custom id_spec to "
"convert these multiple values into a single "
"value".format(k))
"value".format(k)
)
except KeyError:
pass
try:
@@ -684,7 +684,7 @@ def _update_relations(self):
# c.execute('CREATE INDEX childindex ON relations (child)')
# self.conn.commit()

if isinstance(self._keep_tempfiles, six.string_types):
if isinstance(self._keep_tempfiles, str):
suffix = self._keep_tempfiles
else:
suffix = ".gffutils"
@@ -883,7 +883,7 @@ def _update_relations(self):
msg = "transcript"
logger.info("Inferring %s extents " "and writing to tempfile" % msg)

if isinstance(self._keep_tempfiles, six.string_types):
if isinstance(self._keep_tempfiles, str):
suffix = self._keep_tempfiles
else:
suffix = ".gffutils"
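The ``_id_handler`` logic touched above accepts ``id_spec`` as a string, a callable, a list, or a per-featuretype dict whose values must resolve to a single primary-key value. A hedged usage sketch with hypothetical file names:

    import gffutils

    db = gffutils.create_db(
        "annotation.gtf",                # hypothetical input
        dbfn="annotation.db",            # hypothetical output
        force=True,
        keep_order=True,
        # One ID-bearing attribute per featuretype, matching the dict branch.
        id_spec={"gene": "gene_id", "transcript": "transcript_id"},
        merge_strategy="merge",
    )
    print(db.count_features_of_type("gene"))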
12 changes: 4 additions & 8 deletions gffutils/feature.py
@@ -1,5 +1,4 @@
from pyfaidx import Fasta
import six
import simplejson as json
from gffutils import constants
from gffutils import helpers
@@ -166,7 +165,7 @@ def __init__(
# for testing.
attributes = attributes or dict_class()

if isinstance(attributes, six.string_types):
if isinstance(attributes, str):
try:
attributes = helpers._unjsonify(attributes, isattributes=True)

@@ -182,7 +181,7 @@
# If string, then try un-JSONifying it into a list; if that doesn't
# work then assume it's tab-delimited and convert to a list.
extra = extra or []
if isinstance(extra, six.string_types):
if isinstance(extra, str):
try:
extra = helpers._unjsonify(extra)
except json.JSONDecodeError:
@@ -254,10 +253,7 @@ def __setitem__(self, key, value):
self.attributes[key] = value

def __str__(self):
if six.PY3:
return self.__unicode__()
else:
return unicode(self).encode("utf-8")
return self.__unicode__()

def __unicode__(self):

@@ -387,7 +383,7 @@ def sequence(self, fasta, use_strand=True):
-------
string
"""
if isinstance(fasta, six.string_types):
if isinstance(fasta, str):
fasta = Fasta(fasta, as_raw=False)

# recall GTF/GFF is 1-based closed; pyfaidx uses Python slice notation
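With the Python 2 branch gone, ``__str__`` always returns the unicode GFF line, and ``sequence`` still accepts either a path or an open ``pyfaidx.Fasta``. A short sketch; the GFF line is made up and the FASTA call is left commented since it needs a real file:

    from gffutils.feature import feature_from_line

    line = "chr1\texample\texon\t11\t20\t.\t-\t.\tID=exon1"
    exon = feature_from_line(line)

    print(str(exon))                      # plain unicode GFF text on Python 3
    # seq = exon.sequence("genome.fa")    # 1-based inclusive coords; reverse-
    #                                     # complemented for '-' when use_strand=True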
3 changes: 1 addition & 2 deletions gffutils/gffwriter.py
@@ -1,7 +1,6 @@
##
## GFF Writer (writer): serializing gffutils records as GFF text files.
##
import six
import tempfile
import shutil
from time import strftime, localtime
@@ -41,7 +40,7 @@ def __init__(self, out, with_header=True, in_place=False):
self.temp_file = None
# Output stream to write to
self.out_stream = None
if isinstance(out, six.string_types):
if isinstance(out, str):
if self.in_place:
# Use temporary file
self.temp_file = tempfile.NamedTemporaryFile(delete=False)
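For context, ``GFFWriter``'s ``in_place`` mode writes to a ``NamedTemporaryFile`` and then swaps it over the original; the same standard-library pattern in isolation (file name and transform are illustrative only):

    import shutil
    import tempfile

    def rewrite_in_place(path, transform):
        # Stream the transformed lines to a temp file, then replace the original.
        with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
            with open(path) as infile:
                for line in infile:
                    tmp.write(transform(line))
        shutil.move(tmp.name, path)

    # rewrite_in_place("annotations.gff3", lambda line: line)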
38 changes: 27 additions & 11 deletions gffutils/helpers.py
@@ -4,7 +4,6 @@
import simplejson as json
import time
import tempfile
import six
from gffutils import constants
from gffutils import bins
import gffutils
@@ -202,7 +201,7 @@ def make_query(
# e.g., "featuretype = 'exon'"
#
# or, "featuretype IN ('exon', 'CDS')"
if isinstance(featuretype, six.string_types):
if isinstance(featuretype, str):
d["FEATURETYPE"] = "features.featuretype = ?"
args.append(featuretype)
else:
@@ -218,7 +217,7 @@ def make_query(
# `limit` is a string or a tuple of (chrom, start, stop)
#
# e.g., "seqid = 'chr2L' AND start > 1000 AND end < 5000"
if isinstance(limit, six.string_types):
if isinstance(limit, str):
seqid, startstop = limit.split(":")
start, end = startstop.split("-")
else:
@@ -257,7 +256,7 @@ def make_query(
# Default is essentially random order.
#
# e.g. "ORDER BY seqid, start DESC"
if isinstance(order_by, six.string_types):
if isinstance(order_by, str):
_order_by.append(order_by)

else:
@@ -387,7 +386,7 @@ def merge_attributes(attr1, attr2, numeric_sort=False):
if not isinstance(v, list):
new_d[k] = [v]

for k, v in six.iteritems(attr1):
for k, v in attr1.items():
if k in attr2:
if not isinstance(v, list):
v = [v]
@@ -507,9 +506,9 @@ def is_gff_db(db_fname):


def to_unicode(obj, encoding="utf-8"):
if isinstance(obj, six.string_types):
if not isinstance(obj, six.text_type):
obj = six.text_type(obj, encoding)
if isinstance(obj, str):
if not isinstance(obj, str):
obj = str(obj, encoding)
return obj


@@ -520,7 +519,6 @@ def canonical_transcripts(db, fasta_filename):
"""
import pyfaidx


fasta = pyfaidx.Fasta(fasta_filename, as_raw=False)
for gene in db.features_of_type("gene"):

@@ -536,7 +534,20 @@ def canonical_transcripts(db, fasta_filename):
cds_len += exon_length
total_len += exon_length

exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']]))
exon_list.append(
(
cds_len,
total_len,
transcript,
exons
if cds_len == 0
else [
e
for e in exons
if e.featuretype in ["CDS", "five_prime_UTR", "three_prime_UTR"]
],
)
)

# If we have CDS, then use the longest coding transcript
if max(i[0] for i in exon_list) > 0:
@@ -549,7 +560,12 @@

canonical_exons = best[-1]
transcript = best[-2]
seqs = [i.sequence(fasta) for i in sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')]
seqs = [
i.sequence(fasta)
for i in sorted(
canonical_exons, key=lambda x: x.start, reverse=transcript.strand != "+"
)
]
yield transcript, "".join(seqs)


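``canonical_transcripts`` (reformatted above, behaviour unchanged) yields, per gene, the longest coding transcript, falling back to the longest transcript when no CDS is present, together with its spliced sequence. A hedged usage sketch with placeholder file names:

    import gffutils
    from gffutils.helpers import canonical_transcripts

    db = gffutils.FeatureDB("annotation.db")            # hypothetical database
    for transcript, spliced_seq in canonical_transcripts(db, "genome.fa"):
        print(transcript.id, len(spliced_seq))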