Replace Circos by pyCirclize (#344)

* replace Circos by pyCirclize * commit first draft * implement feature type figure * remove Circos test limitations * rearrange standard plot * add oriC/V/T feature labels * remove orphan imports * polish feature plot * fix plot function in json_io * implement COG plot type * refactor code * add ncRNA reg to legend * refactor plotting code * polish plot center text * add support for custom plot labels * drop python=3.8 support * discard support of DeepSig
oschwengers · Nov 12, 2024 · 6df39d6 · 6df39d6
1 parent 518253a
commit 6df39d6
Show file tree

Hide file tree

Showing 21 changed files with 385 additions and 483 deletions.
diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: 'ubuntu-latest'
     strategy:
       matrix:
-        python-version: ['3.8', '3.10']
+        python-version: ['3.9', '3.10']
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}

diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest']  # , 'macos-latest']
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.9', '3.10']
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}

diff --git a/Dockerfile b/Dockerfile
@@ -17,7 +17,6 @@ RUN apk update && apk add wget tar bash \
     && cp /root/.bashrc /opt/conda/bashrc
 
 COPY environment.yml /tmp/
-RUN echo -e '\n  - deepsig>=1.2.5' >> /tmp/environment.yml
 
 SHELL ["bash", "-l" ,"-c"]
 

diff --git a/README.md b/README.md
@@ -732,7 +732,7 @@ It accepts the results of a former annotation process in JSON format and allows
 ### Usage
 
 ```bash
-usage: bakta_plot [--config CONFIG] [--output OUTPUT] [--prefix PREFIX] [--sequences SEQUENCES] [--type {features,cog}] [--help] [--verbose] [--debug] [--tmp-dir TMP_DIR] [--version] <input>
+usage: bakta_plot [--config CONFIG] [--output OUTPUT] [--prefix PREFIX] [--sequences SEQUENCES] [--type {features,cog}] [--label LABEL] [--size {4,8,16}] [--dpi {150,300,600}] [--help] [--verbose] [--debug] [--tmp-dir TMP_DIR] [--version] <input>
 
 Rapid & standardized annotation of bacterial genomes, MAGs & plasmids
 
@@ -752,6 +752,9 @@ Plotting:
                         Sequences to plot: comma separated number or name (default = all, numbers one-based)
   --type {features,cog}
                         Plot type: feature/cog (default = features)
+  --label LABEL         Plot center label (for line breaks use '|')
+  --size {4,8,16}       Plot size in inches: 4/8/16 (default = 8)
+  --dpi {150,300,600}   Plot resolution as dots per inch: 150/300/600 (default = 300)
 
 General:
   --help, -h            Show this help message and exit
@@ -778,6 +781,14 @@ In the `cog` mode, all protein-coding genes (CDS) are colored due to assigned CO
 
 In addition, both plot types share two innermost GC content and GC skew rings. The first ring represents the GC content per sliding window over the entire sequence(s) in green (`#33a02c`) and red (`#e31a1c`) representing GC above and below average, respectively. The second ring represents the GC skew in orange (`#fdbf6f`) and blue (`#1f78b4`). The GC skew gives hints on a replicon's replication bubble and hence, on the completeness of the assembly. On a complete & circular bacterial chromosome, you normally see two inflection points: one at the origin of replication and one at its opposite region (see [Wikipedia](https://en.wikipedia.org/wiki/GC_skew)).
 
+Custom plot labels (text in the center) can be provided via `--label`:
+
+```bash
+bakta_plot --sequences 2 --dpi 300 --size 8 --prefix plot-cog-p2 --type cog --label="pO157|plasmid, 92.7 kbp" input.json
+```
+
+![Plot example of Bakta test genome.](/examples/plot-cog-p2.png)
+
 ## Auxiliary scripts
 
 Often, the usage of Bakta is a necessary upfront task followed by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks:

diff --git a/bakta/features/signal_peptides.py b/bakta/features/signal_peptides.py
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
@@ -1,7 +1,7 @@
 import logging
 import re
 
-from datetime import date, datetime
+from datetime import date
 from pathlib import Path
 from typing import Sequence, Tuple
 
@@ -18,9 +18,7 @@
 log = logging.getLogger('INSDC')
 
 
-def write_features(data: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
-    log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)
-
+def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
     sequence_list = []
     for seq in data['sequences']:
         sequence_features = [feat for feat in features if feat['sequence'] == seq['id']]
@@ -275,7 +273,13 @@ def write_features(data: dict, features: Sequence[dict], genbank_output_path: Pa
                 seq_feature_list.append(acc_feature)
         sequence_record.features = seq_feature_list
         sequence_list.append(sequence_record)
+    return sequence_list
+
+
+def write_features(data: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
+    log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)
 
+    sequence_list = build_biopython_sequence_list(data, features)
     with genbank_output_path.open('wt', encoding='utf-8') as fh:
         log.info('write GenBank: path=%s', genbank_output_path)
         SeqIO.write(sequence_list, fh, format='genbank')

diff --git a/bakta/json_io.py b/bakta/json_io.py
@@ -149,7 +149,7 @@ def main():
     tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path)
 
     print('\tcircular genome plot...')
-    plot.write(features, data['sequences'], cfg.output_path)
+    plot.write(data, features, cfg.output_path)
 
     hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
     print('\thypothetical TSV...')
@@ -181,7 +181,6 @@ def main():
         fh_out.write(f"CDSs: {len(cdss)}\n")
         fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n")
         fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
-        fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
         fh_out.write(f"sORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}\n")
         fh_out.write(f"gaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}\n")
         fh_out.write(f"oriCs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIC])}\n")

diff --git a/bakta/main.py b/bakta/main.py
@@ -25,7 +25,6 @@
 import bakta.features.crispr as crispr
 import bakta.features.orf as orf
 import bakta.features.cds as feat_cds
-import bakta.features.signal_peptides as sig_peptides
 import bakta.features.s_orf as s_orf
 import bakta.features.gaps as gaps
 import bakta.features.ori as ori
@@ -310,10 +309,6 @@ def main():
                 user_hmm_found = exp_aa_hmms.search(cdss, cfg.user_hmms)
                 print(f'\t\tuser HMM sequences: {len(user_hmm_found)}')
 
-            if(cfg.gram != bc.GRAM_UNKNOWN):
-                sig_peptides_found = sig_peptides.search(cdss, cds_aa_path)
-                print(f'\tsignal peptides: {len(sig_peptides_found)}')
-
             print('\tcombine annotations and mark hypotheticals...')
             log.debug('combine CDS annotations')
             for cds in cdss:
@@ -408,14 +403,6 @@ def main():
             anno.combine_annotation(feat)  # combine IPS and PSC annotations
         data['features'].extend(sorfs_filtered)
         print(f'\tfiltered sORFs: {len(sorfs_filtered)}')
-
-        if(cfg.gram != bc.GRAM_UNKNOWN  and  len(sorfs_filtered) > 0):
-            sorf_aa_path = cfg.tmp_path.joinpath('sorfs.faa')
-            with sorf_aa_path.open(mode='wt') as fh:
-                for sorf in sorfs_filtered:
-                    fh.write(f">{sorf['aa_hexdigest']}-{sorf['sequence']}-{sorf['start']}\n{sorf['aa']}\n")
-            sig_peptides_found = sig_peptides.search(sorfs_filtered, sorf_aa_path)
-            print(f"\tsignal peptides: {len(sig_peptides_found)}")
 
     ############################################################################
     # gap annotation
@@ -526,7 +513,6 @@ def main():
     print(f"\tCDSs: {len(cdss)}")
     print(f"\t\thypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
     print(f"\t\tpseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}")
-    print(f"\t\tsignal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
     print(f"\tsORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}")
     print(f"\tgaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}")
     print(f"\toriCs/oriVs: {len([feat for feat in features if (feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV)])}")
@@ -575,7 +561,7 @@ def main():
         print('\tskip generation of circular genome plot...')
     else:
         print('\tcircular genome plot...')
-        plot.write(features, sequences, cfg.output_path)
+        plot.write(data, features, cfg.output_path)
 
     if(cfg.skip_cds is False):
         hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
@@ -619,7 +605,6 @@ def main():
         fh_out.write(f"CDSs: {len(cdss)}\n")
         fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n")
         fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
-        fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
         fh_out.write(f"sORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}\n")
         fh_out.write(f"gaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}\n")
         fh_out.write(f"oriCs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIC])}\n")