diff --git a/gffutils/biopython_integration.py b/gffutils/biopython_integration.py
index 7248253..63356cd 100644
--- a/gffutils/biopython_integration.py
+++ b/gffutils/biopython_integration.py
@@ -47,9 +47,8 @@ def to_seqfeature(feature):
         # Convert from GFF 1-based to standard Python 0-based indexing used by
         # BioPython
         FeatureLocation(
-            feature.start - 1,
-            feature.stop,
-            strand=_biopython_strand[feature.strand]),
+            feature.start - 1, feature.stop, strand=_biopython_strand[feature.strand]
+        ),
         id=feature.id,
         type=feature.featuretype,
         qualifiers=qualifiers,
diff --git a/gffutils/convert.py b/gffutils/convert.py
index 30517e1..ab2e286 100644
--- a/gffutils/convert.py
+++ b/gffutils/convert.py
@@ -3,7 +3,6 @@
 """
 
 
-
 def to_bed12(f, db, child_type="exon", name_field="ID"):
     """
     Given a top-level feature (e.g., transcript), construct a BED12 entry
diff --git a/gffutils/create.py b/gffutils/create.py
index d2b20c4..e137c1a 100644
--- a/gffutils/create.py
+++ b/gffutils/create.py
@@ -216,7 +216,8 @@ def _id_handler(self, f):
                         "a single value is required for a primary key in the "
                         "database. Consider using a custom id_spec to "
                         "convert these multiple values into a single "
-                        "value".format(k))
+                        "value".format(k)
+                    )
             except KeyError:
                 pass
             try:
diff --git a/gffutils/helpers.py b/gffutils/helpers.py
index 6de223b..234994d 100644
--- a/gffutils/helpers.py
+++ b/gffutils/helpers.py
@@ -519,7 +519,6 @@ def canonical_transcripts(db, fasta_filename):
 
     """
     import pyfaidx
-
     fasta = pyfaidx.Fasta(fasta_filename, as_raw=False)
 
     for gene in db.features_of_type("gene"):
@@ -535,7 +534,20 @@
                     cds_len += exon_length
                 total_len += exon_length
 
-            exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']]))
+            exon_list.append(
+                (
+                    cds_len,
+                    total_len,
+                    transcript,
+                    exons
+                    if cds_len == 0
+                    else [
+                        e
+                        for e in exons
+                        if e.featuretype in ["CDS", "five_prime_UTR", "three_prime_UTR"]
+                    ],
+                )
+            )
 
         # If we have CDS, then use the longest coding transcript
         if max(i[0] for i in exon_list) > 0:
@@ -548,7 +560,12 @@
         canonical_exons = best[-1]
         transcript = best[-2]
 
-        seqs = [i.sequence(fasta) for i in sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')]
+        seqs = [
+            i.sequence(fasta)
+            for i in sorted(
+                canonical_exons, key=lambda x: x.start, reverse=transcript.strand != "+"
+            )
+        ]
 
         yield transcript, "".join(seqs)
 
diff --git a/gffutils/interface.py b/gffutils/interface.py
index f35f43d..3b764ed 100644
--- a/gffutils/interface.py
+++ b/gffutils/interface.py
@@ -101,7 +101,7 @@ def __init__(
         keep_order=False,
         pragmas=constants.default_pragmas,
         sort_attribute_values=False,
-        text_factory=str
+        text_factory=str,
     ):
         """
         Connect to a database created by :func:`gffutils.create_db`.
@@ -871,10 +871,21 @@ def _init_interfeature(f):
             """
             Used to initialize a new interfeature that is ready to be updated in-place.
""" - keys = ['id', 'seqid', 'source', 'featuretype', 'start', 'end', - 'score', 'strand', 'frame', 'attributes', 'bin'] + keys = [ + "id", + "seqid", + "source", + "featuretype", + "start", + "end", + "score", + "strand", + "frame", + "attributes", + "bin", + ] d = dict(zip(keys, f.astuple())) - d['source'] = 'gffutils_derived' + d["source"] = "gffutils_derived" return d def _prep_for_yield(d): @@ -885,12 +896,12 @@ def _prep_for_yield(d): If start is greater than stop (which happens when trying to get interfeatures for overlapping features), then return None. """ - d['start'] += 1 - d['end'] -= 1 - new_bin = bins.bins(d['start'], d['end'], one=True) - d['bin'] = new_bin + d["start"] += 1 + d["end"] -= 1 + new_bin = bins.bins(d["start"], d["end"], one=True) + d["bin"] = new_bin - if d['start'] > d['end']: + if d["start"] > d["end"]: return None new_feature = self._feature_returner(**d) @@ -898,12 +909,13 @@ def _prep_for_yield(d): # concat list of ID to create uniq IDs because feature with # multiple values for their ID are no longer permitted since v0.11 if "ID" in new_feature.attributes and len(new_feature.attributes["ID"]) > 1: - new_id = '-'.join(new_feature.attributes["ID"]) + new_id = "-".join(new_feature.attributes["ID"]) new_feature.attributes["ID"] = [new_id] return new_feature # If not provided, use a no-op function instead. if not attribute_func: + def attribute_func(a): return a @@ -932,23 +944,23 @@ def attribute_func(a): nfeatures += 1 # Adjust the interfeature dict in-place with coords... - interfeature['start'] = last_feature.stop - interfeature['end'] = f.start + interfeature["start"] = last_feature.stop + interfeature["end"] = f.start # ...featuretype if new_featuretype is None: - interfeature['featuretype'] = "inter_%s_%s" % ( + interfeature["featuretype"] = "inter_%s_%s" % ( last_feature.featuretype, f.featuretype, ) else: - interfeature['featuretype'] = new_featuretype + interfeature["featuretype"] = new_featuretype # ...strand if last_feature.strand != f.strand: - interfeature['strand'] = '.' + interfeature["strand"] = "." else: - interfeature['strand'] = f.strand + interfeature["strand"] = f.strand # and attributes if merge_attributes: @@ -963,7 +975,7 @@ def attribute_func(a): if update_attributes: new_attributes.update(update_attributes) - interfeature['attributes'] = new_attributes + interfeature["attributes"] = new_attributes # Ready to yield new_feature = _prep_for_yield(interfeature) @@ -1389,11 +1401,12 @@ def child_gen(): splice_site.start = splice_site.end - 1 # make ID uniq by adding suffix - splice_site.attributes["ID"] = [new_featuretype + "_" + splice_site.attributes["ID"][0]] + splice_site.attributes["ID"] = [ + new_featuretype + "_" + splice_site.attributes["ID"][0] + ] yield splice_site - def _old_merge(self, features, ignore_strand=False): """ DEPRECATED, only retained here for backwards compatibility. Please use @@ -1709,10 +1722,12 @@ def merge_all( return result_features def children_bp( - self, feature, child_featuretype="exon", merge=False, - merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, - mc.feature_type), - **kwargs + self, + feature, + child_featuretype="exon", + merge=False, + merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type), + **kwargs ): """ Total bp of all children of a featuretype. @@ -1751,9 +1766,14 @@ def children_bp( raise ValueError( "'ignore_strand' has been deprecated; please use " "merge_criteria to control how features should be merged. 
" - "E.g., leave out the 'mc.strand' criteria to ignore strand.") + "E.g., leave out the 'mc.strand' criteria to ignore strand." + ) else: - raise TypeError("merge() got unexpected keyword arguments '{}'".format(kwargs.keys())) + raise TypeError( + "merge() got unexpected keyword arguments '{}'".format( + kwargs.keys() + ) + ) children = self.children( feature, featuretype=child_featuretype, order_by="start" @@ -1940,7 +1960,6 @@ def seqids(self): for (i,) in c: yield i - # Recycle the docs for _relation so they stay consistent between parents() # and children() children.__doc__ = children.__doc__.format(_relation_docstring=_relation.__doc__) diff --git a/gffutils/test/conftest.py b/gffutils/test/conftest.py index 47d768e..d4d1c0a 100644 --- a/gffutils/test/conftest.py +++ b/gffutils/test/conftest.py @@ -1 +1 @@ -collect_ignore=["data"] +collect_ignore = ["data"] diff --git a/gffutils/test/test_1.py b/gffutils/test/test_1.py index c2be6cf..2b88cc0 100644 --- a/gffutils/test/test_1.py +++ b/gffutils/test/test_1.py @@ -636,7 +636,7 @@ def test_feature_merge(): id_spec="gene_id", force_merge_fields=["start"], keep_order=True, - ) + ) # test that warnings are raised because of strand and frame with warnings.catch_warnings(record=True) as w: @@ -933,17 +933,19 @@ def _transform(f): [(i.start, i.stop) for i in db.features_of_type("exon")] ) + def clean_tempdir(): tempfile.tempdir = tempdir if os.path.exists(tempdir): shutil.rmtree(tempdir) os.makedirs(tempdir) + # specify a writeable temp dir for testing tempdir = "/tmp/gffutils-test" -def test_tempfiles(): +def test_tempfiles(): clean_tempdir() @@ -991,6 +993,7 @@ def test_tempfiles(): assert len(filelist) == 1, filelist assert filelist[0].endswith(".GFFtmp") + @pytest.mark.skip(reason="Unclear if still needed; currently failing") def test_parallel_db(): # DISABLING in v0.12 @@ -1113,7 +1116,7 @@ def test_deprecation_handler(): gffutils.example_filename("FBgn0031208.gtf"), ":memory:", infer_gene_extent=False, - ) + ) def test_nonsense_kwarg(): @@ -1122,7 +1125,7 @@ def test_nonsense_kwarg(): gffutils.example_filename("FBgn0031208.gtf"), ":memory:", asdf=True, - ) + ) def test_infer_gene_extent(): @@ -1241,7 +1244,8 @@ def test_create_splice_sites(): db = gffutils.create_db(fn, ":memory:") db = db.update(db.create_splice_sites()) observed = "\n".join(str(feature) for feature in db.all_features()) - expected = dedent("""\ + expected = dedent( + """\ chr1 ensGene gene 4763287 4775820 . - . Name=ENSMUSG00000033845;ID=ENSMUSG00000033845;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845 chr1 ensGene mRNA 4764517 4775779 . - . Name=ENSMUST00000045689;Parent=ENSMUSG00000033845;ID=ENSMUST00000045689;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845 chr1 ensGene CDS 4775654 4775758 . - 0 Name=ENSMUST00000045689.cds0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds0;gid=ENSMUSG00000033845 @@ -1259,13 +1263,12 @@ def test_create_splice_sites(): chr1 gffutils_derived three_prime_cis_splice_site 4772815 4772816 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845 chr1 gffutils_derived five_prime_cis_splice_site 4767604 4767605 . - . Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845 chr1 gffutils_derived five_prime_cis_splice_site 4772647 4772648 . - . 
-       chr1 gffutils_derived five_prime_cis_splice_site 4775652 4775653 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845""")
+       chr1 gffutils_derived five_prime_cis_splice_site 4775652 4775653 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845"""
+    )
 
     assert observed == expected
 
 
-
-
 if __name__ == "__main__":
     # this test case fails
     # test_attributes_modify()
diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py
index 74171db..79996ba 100644
--- a/gffutils/test/test_issues.py
+++ b/gffutils/test/test_issues.py
@@ -13,6 +13,7 @@
 
 import pytest
 
+
 def test_issue_79():
     gtf = gffutils.example_filename("keep-order-test.gtf")
     db = gffutils.create_db(
@@ -291,41 +292,41 @@ def test_issue_128():
     # In #128, some lines had separators of "; " and some with ";". The first
     # one in the file would win. Now the detection pays more attention to lines
     # with more attributes to make it work properly
-    gff = gffutils.example_filename('gms2_example.gff3')
+    gff = gffutils.example_filename("gms2_example.gff3")
     db = gffutils.create_db(gff, ":memory:", force=True)
     expected = {
-        'ID': ['1'],
-        'Parent': ['gene_1'],
-        'gene_type': ['native'],
-        'partial': ['11'],
-        'gc': ['33'],
-        'length': ['363'],
+        "ID": ["1"],
+        "Parent": ["gene_1"],
+        "gene_type": ["native"],
+        "partial": ["11"],
+        "gc": ["33"],
+        "length": ["363"],
     }
-    assert dict(db['1'].attributes) == expected
+    assert dict(db["1"].attributes) == expected
 
 
 def test_issue_157():
     # With the merge overhaul, children_bp incorrectly still used ignore_strand.
-    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ":memory:")
-    gene = next(db.features_of_type('gene'))
-    children = list(db.children(gene, featuretype='exon'))
+    db = gffutils.create_db(gffutils.example_filename("FBgn0031208.gff"), ":memory:")
+    gene = next(db.features_of_type("gene"))
+    children = list(db.children(gene, featuretype="exon"))
 
     # Modify the last one to have a different strand so we can test the
     # ignore_strand argument.
-    children[-1].strand = '-'
-    db.update(children[-1:], merge_strategy='replace')
+    children[-1].strand = "-"
+    db.update(children[-1:], merge_strategy="replace")
 
     # and, since updating has been problematic in the past, double-check again
     # that the strand is changed in the db.
-    assert list(db.children(gene, featuretype='exon'))[-1].strand == '-'
-    cbp1 = db.children_bp(gene, child_featuretype='exon')
+    assert list(db.children(gene, featuretype="exon"))[-1].strand == "-"
+    cbp1 = db.children_bp(gene, child_featuretype="exon")
 
     # Previously this would give:
     # TypeError: merge() got an unexpected keyword argument 'ignore_strand'
     #
-    # Now changing to ValueError and suggesting a fix. 
+    # Now changing to ValueError and suggesting a fix.
     with pytest.raises(ValueError):
-        db.children_bp(gene, child_featuretype='exon', merge=True, ignore_strand=True)
+        db.children_bp(gene, child_featuretype="exon", merge=True, ignore_strand=True)
     with pytest.raises(ValueError):
         db.children_bp(gene, ignore_strand=True, nonexistent=True)
     with pytest.raises(TypeError):
@@ -333,12 +334,17 @@ def test_issue_157():
 
     # The way to do it now is the following (we can omit the mc.feature_type
     # since we're preselecting for exons anyway):
-    db.children_bp(gene, child_featuretype='exon', merge=True, merge_criteria=(mc.overlap_end_inclusive))
+    db.children_bp(
+        gene,
+        child_featuretype="exon",
+        merge=True,
+        merge_criteria=(mc.overlap_end_inclusive),
+    )
 
 
 def test_issue_159():
-    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ":memory:")
-    fasta = gffutils.example_filename('dm6-chr2L.fa')
+    db = gffutils.create_db(gffutils.example_filename("FBgn0031208.gff"), ":memory:")
+    fasta = gffutils.example_filename("dm6-chr2L.fa")
     for transcript, seq in gffutils.helpers.canonical_transcripts(db, fasta):
         pass
 
@@ -347,56 +353,59 @@ def test_issue_164():
     # Something strange with the original annotation, but seems fine at least
     # after pasting in the offending genes from the GitHub comments.
     db = gffutils.create_db(
-        gffutils.example_filename('sharr.gtf'),
-        ':memory:',
-        disable_infer_transcripts=True,
-        disable_infer_genes=True,
-        id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
-        merge_strategy='create_unique',
-        keep_order=True)
+        gffutils.example_filename("sharr.gtf"),
+        ":memory:",
+        disable_infer_transcripts=True,
+        disable_infer_genes=True,
+        id_spec={"gene": "gene_id", "transcript": "transcript_id"},
+        merge_strategy="create_unique",
+        keep_order=True,
+    )
 
 
 def test_issue_166():
     # Added the new FeatureDB.seqids() method.
-    db = gffutils.create_db(gffutils.example_filename('nonascii'), ':memory:')
+    db = gffutils.create_db(gffutils.example_filename("nonascii"), ":memory:")
     seqs = list(db.seqids())
-    assert seqs == ['2L', '2R', '3L', '3R', 'X'], seqs
+    assert seqs == ["2L", "2R", "3L", "3R", "X"], seqs
 
 
 def test_issue_167():
     # Previously was causing sqlite3.InterfaceError
-    db = gffutils.create_db(gffutils.example_filename('issue167.gff'), ':memory:')
+    db = gffutils.create_db(gffutils.example_filename("issue167.gff"), ":memory:")
 
 
 def test_issue_174():
     db = gffutils.create_db(
-        gffutils.example_filename('issue174.gtf'),
-        ':memory:',
-        merge_strategy='warning',
+        gffutils.example_filename("issue174.gtf"),
+        ":memory:",
+        merge_strategy="warning",
     )
     introns = [f for f in db.create_introns()]
-    observed = [i.attributes['exon_number'] for i in introns]
-    assert observed[7] == ['8', '9']
-    assert observed[8] == ['10', '9']
-    assert observed[9] == ['10', '11']
+    observed = [i.attributes["exon_number"] for i in introns]
+    assert observed[7] == ["8", "9"]
+    assert observed[8] == ["10", "9"]
+    assert observed[9] == ["10", "11"]
 
     # Now do the same thing, but with the new numeric_sort arg
     introns = [f for f in db.create_introns(numeric_sort=True)]
-    observed = [i.attributes['exon_number'] for i in introns]
-    assert observed[7] == ['8', '9']
+    observed = [i.attributes["exon_number"] for i in introns]
+    assert observed[7] == ["8", "9"]
     # This should be fixed:
-    assert observed[8] == ['9', '10']
-    assert observed[9] == ['10', '11']
+    assert observed[8] == ["9", "10"]
+    assert observed[9] == ["10", "11"]
 
+
 def test_issue_197():
     # Previously this would fail with ValueError due to using the stop position
     # of the last item on the previous chrom as the start position.
-    db = gffutils.create_db(gffutils.example_filename('issue_197.gff'), ':memory:', merge_strategy='error')
-    genes = list(db.features_of_type('gene'))
-    igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
-
+    db = gffutils.create_db(
+        gffutils.example_filename("issue_197.gff"), ":memory:", merge_strategy="error"
+    )
+    genes = list(db.features_of_type("gene"))
+    igss = list(db.interfeatures(genes, new_featuretype="intergenic_space"))
 
     # Prior to PR #219, multiple IDs could be created by interfeatures, which
     # in turn was patched here by providing the transform to db.update. With
@@ -405,23 +414,24 @@
     #
     # However, keeping the test as-is to ensure backward-compatibility.
     def transform(f):
-        f['ID'] = [ '-'.join(f.attributes['ID']) ]
+        f["ID"] = ["-".join(f.attributes["ID"])]
         return f
 
-    db = db.update(igss, transform=transform, merge_strategy='error')
+    db = db.update(igss, transform=transform, merge_strategy="error")
 
-    obs = list(db.features_of_type('intergenic_space'))
+    obs = list(db.features_of_type("intergenic_space"))
     for i in obs:
         print(i)
 
     assert [str(i) for i in obs] == [
-        'tig00000492\tgffutils_derived\tintergenic_space\t47236\t47350\t.\t-\t.\tID=ctg492.gene0001-ctg492.gene0002;Name=gene0001,gene0002',
-        'tig00000492\tgffutils_derived\tintergenic_space\t48257\t49999\t.\t-\t.\tID=ctg492.gene0002-gene0;Name=gene0002',
-        'tig00000492\tgffutils_derived\tintergenic_space\t50050\t50054\t.\t-\t.\tID=gene3-gene4',
-        'tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5',
-        'tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6',
+        "tig00000492\tgffutils_derived\tintergenic_space\t47236\t47350\t.\t-\t.\tID=ctg492.gene0001-ctg492.gene0002;Name=gene0001,gene0002",
+        "tig00000492\tgffutils_derived\tintergenic_space\t48257\t49999\t.\t-\t.\tID=ctg492.gene0002-gene0;Name=gene0002",
+        "tig00000492\tgffutils_derived\tintergenic_space\t50050\t50054\t.\t-\t.\tID=gene3-gene4",
+        "tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5",
+        "tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6",
     ]
 
+
 def test_issue_198():
     line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; db_xref "HGNC:HGNC:38034"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
 
@@ -440,7 +450,7 @@ def test_issue_198():
     # of repeated keys always wins.
     f = feature.feature_from_line(line)
 
-    assert f.attributes['description'] == ['WASP family homolog 7, pseudogene']
+    assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"]
 
     # If we remove one of the db_xref keys, then the parser sees the comma and
     # figures it's a multivalue key.
@@ -449,59 +459,62 @@ def test_issue_198():
     # Previous result, note leading space --------------------------->|          |
     # assert f.attributes['description'] == ['WASP family homolog 7', ' pseudogene']
 
-    assert f.attributes['description'] == ['WASP family homolog 7, pseudogene']
+    assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"]
 
     # But removing that space before "pseudogene" means it's interpreted as
     # a multivalue attribute
     line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' f = feature.feature_from_line(line) - assert f.attributes['description'] == ['WASP family homolog 7', 'pseudogene'] + assert f.attributes["description"] == ["WASP family homolog 7", "pseudogene"] # Confirm behavior of corner cases like a trailing comma line = "chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1," f = feature.feature_from_line(line) - assert f.attributes['Parent'] == ['XM_001475631.1', ''] + assert f.attributes["Parent"] == ["XM_001475631.1", ""] def test_issue_207(): - def _check(txt, expected_keys, dialect_trailing_semicolon): - db = gffutils.create_db(txt.replace(' ', '\t'), ':memory:', from_string=True) + db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True) assert [list(f.attributes.keys()) for f in db.all_features()] == expected_keys - assert db.dialect['trailing semicolon'] == dialect_trailing_semicolon + assert db.dialect["trailing semicolon"] == dialect_trailing_semicolon # All lines have trailing semicolon _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1; chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1; - """), - expected_keys = [ - ['ID'], - ['ID', 'Parent'], - ['ID', 'Parent'], - ['ID', 'Parent'], + """ + ), + expected_keys=[ + ["ID"], + ["ID", "Parent"], + ["ID", "Parent"], + ["ID", "Parent"], ], - dialect_trailing_semicolon=True + dialect_trailing_semicolon=True, ) # First two lines have trailing semicolon. However, the heuristics of # dialect selection, which favor attributes with more values (assuming more # information), decides that this file does NOT have trailing semicolons. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1 chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1 - """), - expected_keys = [ - ['ID', ''], - ['ID', 'Parent', ''], - ['ID', 'Parent'], - ['ID', 'Parent'], + """ + ), + expected_keys=[ + ["ID", ""], + ["ID", "Parent", ""], + ["ID", "Parent"], + ["ID", "Parent"], ], dialect_trailing_semicolon=False, ) @@ -514,18 +527,15 @@ def _check(txt, expected_keys, dialect_trailing_semicolon): # as above to give higher weight, and to break the tie between with and # without trailing semicolon, falls back to first dialect observed. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1; chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1 chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1 - """), - expected_keys=[ - ['ID'], - ['ID'], - ['ID'], - ['ID'] - ], + """ + ), + expected_keys=[["ID"], ["ID"], ["ID"], ["ID"]], dialect_trailing_semicolon=True, ) @@ -533,39 +543,37 @@ def _check(txt, expected_keys, dialect_trailing_semicolon): # semicolon by giving one more line as evidence. Only difference is from # above is the last line. _check( - txt=dedent("""\ + txt=dedent( + """\ chr1 AUGUSTUS gene 68330 73621 1 - . 
        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;
        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1
        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1
        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1904.t1.d1.1
-        """),
+        """
+        ),
         expected_keys=[
-            ['ID', ''],
-            ['ID', ''],
-            ['ID'],
-            ['ID'],
-            ['ID'],
+            ["ID", ""],
+            ["ID", ""],
+            ["ID"],
+            ["ID"],
+            ["ID"],
         ],
         dialect_trailing_semicolon=False,
     )
 
-
     # Again seems inconsistent at first, but heuristics break ties by
     # preferring first dialect, which here is no trailing semicolon.
     _check(
-        txt=dedent("""\
+        txt=dedent(
+            """\
        chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903
        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1
        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;
        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;
-        """),
-        expected_keys=[
-            ['ID'],
-            ['ID'],
-            ['ID', ''],
-            ['ID', '']
-        ],
+        """
+        ),
+        expected_keys=[["ID"], ["ID"], ["ID", ""], ["ID", ""]],
         dialect_trailing_semicolon=False,
     )
 
@@ -587,7 +595,6 @@ def test_issue_213():
     it = gffutils.iterators.DataIterator(data, from_string=True)
     assert it.directives == ["gff-version 3"]
 
-
     # Ensure they're parsed into the db from a string
     db = gffutils.create_db(data, dbfn=":memory:", from_string=True, verbose=False)
     assert db.directives == ["gff-version 3"], db.directives
@@ -602,6 +609,6 @@ def test_issue_213():
 
     # Ensure they're parsed into the db from a file, and going to a file (to
     # exactly replicate example in #213)
-    db = gffutils.create_db(tmp, dbfn='issue_213.db', force=True)
+    db = gffutils.create_db(tmp, dbfn="issue_213.db", force=True)
     assert db.directives == ["gff-version 3"], db.directives
     assert len(db.directives) == 1