Merge pull request #22 from arvestad/jupytersupport

Jupytersupport
arvestad · Mar 29, 2022 · f4037a8 · f4037a8
2 parents 8104163 + c93b98c
commit f4037a8
Show file tree

Hide file tree

Showing 11 changed files with 213 additions and 48 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,6 +1,11 @@
 # Changes since last version
 
 
+## v1.7.0
+
+* Added the option --alignment-index to support files with multiple MSAs.
+* Added two functions, view and glimpse, to enable use of alv in notebook environments.
+
 ## v1.6.1
 
 * Fixed accession abbreviation so that short accessions are left as they are.

diff --git a/README.md b/README.md
@@ -16,14 +16,27 @@ tested.
 
 ## Latest feature additions
 
-* Focus on variable columns with the options `--only-variable` and
-  `--only-variable-excluding-indels`, contributed by nikostr, that constrains
-  coloring to columns with variation and variation not counting indels.
-* The command `alv -g huge_msa.fa` displays cut-out of the MSA, guaranteed to fit
-  one terminal page without scrolling or MSA line breaking, that is supposed to
-  give you an idea of alignment quality and contents.
-* Write `alv -r 20 huge_msa.fa` to get a view of the MSA containing only 20 randomly
-  selected sequences.
+* If you have more than one alignment in your input file, then the first alignment is output unless you 
+  use the --alignment-index (-ai) option to choose another.
+* `alv` is now adapted for use in Python notebooks (tested on Jupyter) through two convenience functions
+  'view' and 'glimpse'.  Both functions take a BioPython alignment object and outputs a view of the
+  alignment.
+
+  Writing
+  ```
+  from Bio import AlignIO
+  msa = AlignIO.read('PF00005.fa', 'fasta')
+  import alv
+  alv.view(msa)
+  ```
+  in a Jupyter notebook cell and evaluating will yield a colored alignment in the `alv` style.
+
+  For large alignments, the glimpse function is convenient since a subset of the alignment, selected
+  as an easily detected conserved region, is shown.
+  ```
+  alv.glimpse(msa)
+  ```
+  Look for more usage information view `help(alv.view)` in a notebook cell. 
 
 ## Features
 
@@ -34,6 +47,14 @@ tested.
 * Guesses sequence type (DNA/RNA/AA/coding) by default. You can override with option `-t`.
 * Order sequence explicitly, alphabetically, or by sequence similarity.
 * Restrict coloring to where you don't have indels or where there is a lot of conservation.
+* Focus on variable columns with the options `--only-variable` and
+  `--only-variable-excluding-indels`, contributed by nikostr, that constrains
+  coloring to columns with variation and variation not counting indels.
+* The command `alv -g huge_msa.fa` displays cut-out of the MSA, guaranteed to fit
+  one terminal page without scrolling or MSA line breaking, that is supposed to
+  give you an idea of alignment quality and contents.
+* Write `alv -r 20 huge_msa.fa` to get a view of the MSA containing only 20 randomly
+  selected sequences.
 
 ## Install
 

diff --git a/THANKS.md b/THANKS.md
@@ -8,3 +8,5 @@
 * @SimonGreenhill added support for the Nexus format.
 * Roey Angel reported bug about `alv` not guessing the MSA type correctly when a DNA/RNA sequence is
   completely without "codons".
+* Michael Milton suggested the --alignment-index option.
+* Marina Herrera Sarrias suggested the notebook functionality.
diff --git a/alv/__init__.py b/alv/__init__.py
@@ -1,3 +1,54 @@
-# Empty for now. What would be needed to be initialised in this package?
+from .io import get_alv_objects
+from .alignmentterminal import AlignmentTerminal
 
+def view(msa, seqtype='guess', color_scheme='guess', genetic_code=1, width=80, dotted=False):
+    '''
+    Present a colorised version of a multiple sequence alignment (MSA).
+
+    Options:
+    * msa           An alignment as BioPython object (from AlignIO).
+    * seqtype       String describing the type of sequence. One of 'aa', 'dna',
+                    'rna', 'codon', or 'guess' (which almost always works). 
+    * color_scheme  String indicating what coloring scheme to use. Ignore unless
+                    you want to use 'taylor' or 'hydrophobicity'.
+    * genetic_code  An integer indicating which genetic code to use. In the 
+                    range 1 to 33 as of this writing, with 1 being the standard
+                    code. See NCBI's list at 
+                       https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
+    * width         Integer indicating the desired width of alignment view. 
+                    Default: 60 characters.
+    * dotted        If True, all characters in the first sequence of the MSA is
+                    written and remaining sequences are dotted everwhere except
+                    for where they differ from the first sequence.
+    '''
+    alignment, painter = get_alv_objects(msa, seqtype, color_scheme, genetic_code)
+    terminal = AlignmentTerminal(width)
+    terminal.output_alignment(alignment, painter, width - 12, dotted)
+
+
+def glimpse(msa, n_seq=20, seqtype='guess', color_scheme='guess', genetic_code=1, width=60, dotted=False):
+    '''
+    Works like "view", but extracts a conserved region and outputs a random sample of the sequences or all of them if it is a small alignment.
+
+    Options:
+    * msa           An alignment as BioPython object (from AlignIO).
+    * n_seq         The number of sequences to output.
+    * seqtype       String describing the type of sequence. One of 'aa', 'dna',
+                    'rna', 'codon', or 'guess' (which almost always works). 
+    * color_scheme  String indicating what coloring scheme to use. Ignore unless
+                    you want to use 'taylor' or 'hydrophobicity'.
+    * genetic_code  An integer indicating which genetic code to use. In the 
+                    range 1 to 33 as of this writing, with 1 being the standard
+                    code. See NCBI's list at 
+                       https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
+    * width         Integer indicating the desired width of alignment view. 
+                    Default: 60 characters.
+    * dotted        If True, all characters in the first sequence of the MSA is
+                    written and remaining sequences are dotted everwhere except
+                    for where they differ from the first sequence.
+    '''
+    alignment, painter = get_alv_objects(msa, seqtype, color_scheme, genetic_code)
+    terminal = AlignmentTerminal(width)
+    terminal.height = n_seq + 2
+    terminal.output_glimpse(alignment, painter, width, dotted)
 
diff --git a/alv/alignmentterminal.py b/alv/alignmentterminal.py
@@ -3,26 +3,19 @@
 import sys
 import shutil
 
+
+
 class AlignmentTerminal:
     '''
     This class encapsulates knowledge about the terminal and how to draw on it.
     '''
-    def __init__(self, args):
-        self.width, self.height = shutil.get_terminal_size() # First item in this tuple is the width, the other is the height.
-        self.random_sample_size = args.random_accessions
-        self.sorting = args.sorting
-        if args.sort_by_id:
-            self.sorting = 'by identity'
-            self.order = args.sort_by_id
-        elif args.sorting_order:
-            self.sorting = 'fixed'
-            self.order = args.sorting_order.split(',')
-            if len(self.order) == 0:
-                raise Exception('Bad order specification: no accessions in input')
-        if args.select_matching:
-            self.selection = args.select_matching
-        else:
-            self.selection = False
+    def __init__(self, terminal_width, random_sample_size=0, sorting=None, order=None, accession_pattern=False):
+        self.width = terminal_width
+        self.random_sample_size = random_sample_size
+        self.sorting = sorting
+        self.order = order
+        self.selection = accession_pattern
+
 
     def get_accession_list(self, al):
         '''
@@ -75,14 +68,14 @@ def _setup_left_margin(self, al, chosen_accessions):
         assert self.left_margin < self.width - 10
 
 
-    def output_alignment(self, al, painter, width, dotted=False):
+    def output_alignment(self, al, painter, chosen_width, dotted=False):
         '''
         Output alignment al to stdout in blocks of width at most w with colors from painter.
 
         Args:
           al -- the alignment object
           painter -- Painter object that decides colors for symbols
-          width -- How many columns to use for one alignment block
+          chosen_width -- How many columns to use for one alignment block
           dotted -- Flag. Output periods (.) if position identical to first sequence?
         '''
         chosen_accessions =  self.get_accession_list(al)
@@ -92,28 +85,31 @@ def output_alignment(self, al, painter, width, dotted=False):
 
         self._setup_left_margin(al, chosen_accessions)
 
-        columns_per_block = al.block_width(self.width, width)
+        columns_per_block = al.block_width(self.width, chosen_width)
         for block in al.blocks(columns_per_block):
-            self._output_block(al, painter, width, chosen_accessions, block, dotted)
+            self._output_block(al, painter, chosen_width, chosen_accessions, block, dotted)
 
 
-    def output_glimpse(self, al, painter, width, dotted=False):
+    def output_glimpse(self, al, painter, chosen_width, dotted=False):
         '''
         Output a single-screen glimpse of the alignment. An attempt at finding
         the most interesting (guessed to be the most conserved part of a random
         sample of sequences) of the alignment is done.
         '''
         chosen_accessions =  self.get_accession_list(al)
 
-        n_seqs_to_view = min(len(chosen_accessions), self.height - 2) # -2 to make room for tick line and next prompt
+        if self.height:
+            n_seqs_to_view = min(len(chosen_accessions), self.height - 2) # -2 to make room for tick line and next prompt on terminals
+        else:
+            n_seqs_to_view = min(len(chosen_accessions), 20)
         chosen_accessions = random.sample(chosen_accessions, n_seqs_to_view)
 
         self._setup_left_margin(al, chosen_accessions)
 
-        n_columns = al.block_width(self.width, width) # This many alignment columns
+        n_columns = al.block_width(self.width, chosen_width) # This many alignment columns
         conserved_block = al.get_conserved_block(n_columns)
 
-        self._output_block(al, painter, width, chosen_accessions, conserved_block, dotted)
+        self._output_block(al, painter, chosen_width, chosen_accessions, conserved_block, dotted)
 
 
 def calc_tick_indices(start, end, distance, min_distance):
@@ -167,3 +163,33 @@ def make_tick_string(left_margin, start, end, distance, min_distance):
         index_bar += index_block
         last_pos = pos
     return index_bar
+
+
+class AlignmentShellTerminal(AlignmentTerminal):
+    '''
+    This subclass gets the attributes from an argparse object and
+    checking the actual terminal size.
+    '''
+    def __init__(self, args):
+        self.random_sample_size = args.random_accessions
+
+        sorting = args.sorting
+        order = None
+        if args.sort_by_id:
+            sorting = 'by identity'
+            order = args.sort_by_id
+        elif args.sorting_order:
+            sorting = 'fixed'
+            order = args.sorting_order.split(',')
+            if len(self.order) == 0:
+                raise Exception('Bad order specification: no accessions in input')
+        if args.select_matching:
+            accession_pattern = args.select_matching
+        else:
+            accession_pattern = False
+
+        terminal_width, terminal_height = shutil.get_terminal_size()
+        super().__init__(terminal_width, args.random_accessions, sorting, order, accession_pattern)
+        self.height = terminal_height
+
+
diff --git a/alv/colorize.py b/alv/colorize.py
@@ -220,7 +220,7 @@ def _color_lookup(self, c):
 
 class CodonPainter(Painter):
     '''
-    Put paint of codons.
+    Put paint on codons.
     '''
     def __init__(self, aa_painter):
         super().__init__()

diff --git a/alv/io.py b/alv/io.py
@@ -13,6 +13,8 @@ def guess_format(filename):
     '''
     with open(filename) as h:
         first_line = h.readline()
+        if len(first_line) < 2:
+            raise ValueError('Empty file')
         if first_line[:15] == '# STOCKHOLM 1.0':
             return 'stockholm'
         elif first_line == 'CLUSTAL':
@@ -35,15 +37,28 @@ def guess_format(filename):
                 return 'phylip'
 
 
-def read_alignment(file, seqtype, input_format, color_scheme, genetic_code):
+def read_alignment(file, seqtype, input_format, color_scheme, genetic_code, alignment_no=0):
     '''
     Factory function. Read the alignment with BioPython's support, and
     return an appropriate alv alignment.
     '''
     if file == '-':
         file = sys.stdin        # Start reading from stdin if "magic filename"
-    alignment = AlignIO.read(file, input_format)
+    alignments = list(AlignIO.parse(file, input_format))
 
+    if alignments:
+        n_msas = len(alignments)
+        if alignment_no >= n_msas:
+            raise IOError(f'Alignment index too large, only {n_msas} alignment(s) in the file.')
+        return get_alv_objects(alignments[alignment_no], seqtype, color_scheme, genetic_code)
+    else:
+        raise ValueError('No alignment in input')
+
+def get_alv_objects(alignment, seqtype, color_scheme, genetic_code):
+    '''
+    Take the alignment object and return a suitable Alv alignment object, with respect
+    to sequence type, and colorization object ("painter").
+    '''
     if seqtype == 'guess':
         seqtype = guess_seq_type(alignment)
 
@@ -63,7 +78,7 @@ def read_alignment(file, seqtype, input_format, color_scheme, genetic_code):
         al.set_genetic_code(genetic_code)
         return CodonAlignment(alignment), CodonPainter(painter)
     else:
-        raise Exception('Unknown option')
+        raise IOError(f'Unknown sequence type: "{seqtype}"')
 
 
 

diff --git a/alv/version.py b/alv/version.py
@@ -1 +1 @@
-__version__ = '1.6.1'
+__version__ = '1.7.0'
diff --git a/bin/alv b/bin/alv
@@ -7,7 +7,7 @@ import sys
 from traceback import print_last
 
 from alv.version import __version__
-from alv.alignmentterminal import AlignmentTerminal
+from alv.alignmentterminal import AlignmentShellTerminal
 import alv.io as io
 from alv.exceptions import AlvPossibleFormatError, AlvEmptyAlignment
 
@@ -69,6 +69,8 @@ def setup_argument_parsing():
     ap = argparse.ArgumentParser()
     ap.add_argument('infile', nargs='?', help="The infile is the path to a file, or '-' if reading from stdin.")
     ap.add_argument('--version', action='version', version='%(prog)s ' + __version__)
+    ap.add_argument('-ai', '--alignment-index', type=int, default=0,
+                    help='If reading file with many alignments, choose which one to output with this zero-based index. Default: first alignment in file.')
     ap.add_argument('-f', '--format', choices=['guess', 'fasta', 'clustal', 'nexus', 'phylip', 'stockholm'], default='guess',
                     help="Specify what sequence type to assume. Be specific if the file is not recognized automatically. When reading from stdin, the format is always guessed to be FASTA. Default: %(default)s")
     ap.add_argument('-t', '--type', choices=['aa', 'dna', 'rna', 'codon', 'guess'], default='guess',
@@ -98,7 +100,7 @@ def setup_argument_parsing():
 
     # Options for changing sequence order
     ordering_args = ap.add_argument_group('Sequence selection and ordering')
-    ordering_args.add_argument('-r', '--random-accessions', type=int, metavar='N',
+    ordering_args.add_argument('-r', '--random-accessions', type=int, metavar='N', default=0,
                                help='Only view a random sample of the alignment sequences.')
     ordering_args.add_argument('-s', '--sorting', choices=['infile', 'alpha'], default='infile',
                                help="Sort the sequences as given in the infile or alphabetically (by accession). Default: %(default)s")
@@ -145,7 +147,7 @@ def input_and_option_adaption(args):
             format = 'fasta'    # Hard guess, because a bit complicated when reading from pipe (sys.stdin)
         else:
             format = args.format
-        alignment, painter = io.read_alignment(args.infile, args.type, format, args.color_scheme, args.code)
+        alignment, painter = io.read_alignment(args.infile, args.type, format, args.color_scheme, args.code, args.alignment_index)
         return alignment, painter
 
     except KeyboardInterrupt:
@@ -196,7 +198,14 @@ def main():
         ap.exit()
 
     # Read the data
-    alignment, painter = input_and_option_adaption(args)
+    try:
+        alignment, painter = input_and_option_adaption(args)
+    except IOError as e:
+        print(f'alv: {e}', file=sys.stderr)
+        ap.exit(7)
+    except ValueError as e:
+        print(f'alv: {e}', file=sys.stderr)
+        ap.exit(8)
 
     # In case we just want to know the basics:
     if args.just_info:
@@ -219,7 +228,7 @@ def main():
         args.width = alignment.al_width()
         args.keep_colors_when_redirecting = True
 
-    terminal = AlignmentTerminal(args)
+    terminal = AlignmentShellTerminal(args)
     painter.set_options(args)
     try:
         if args.glimpse:
@@ -237,14 +246,16 @@ def main():
         # This should not cause any specific error at all: correct behaviour is to end the program.
         sys.stderr.close()  # Without this line, python will sometimes give an error
         ap.exit(6)
-
+    except IOError as e:
+        print(f'alv: ', file=sys.stderr)
+        ap.exit(7)
     except ValueError as e:
         print('alv:', e, file=sys.stderr)
         ap.exit(3)
-    # except Exception as e:
-    #     print('Alv bug! Please report!', file=sys.stderr)
-    #     print(e, file=sys.stderr)
-    #     ap.exit(2)
+    except Exception as e:
+        print('Alv bug! Please report!', file=sys.stderr)
+        print(e, file=sys.stderr)
+        ap.exit(2)
 
 if __name__ == '__main__':
     main()