diff --git a/CHANGES.md b/CHANGES.md index 95b2533..37fe793 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,11 @@ # Changes since last version +## v1.7.0 + +* Added the option --alignment-index to support files with multiple MSAs. +* Added two functions, view and glimpse, to enable use of alv in notebook environments. + ## v1.6.1 * Fixed accession abbreviation so that short accessions are left as they are. diff --git a/README.md b/README.md index 8275332..26e0f5e 100644 --- a/README.md +++ b/README.md @@ -16,14 +16,27 @@ tested. ## Latest feature additions -* Focus on variable columns with the options `--only-variable` and - `--only-variable-excluding-indels`, contributed by nikostr, that constrains - coloring to columns with variation and variation not counting indels. -* The command `alv -g huge_msa.fa` displays cut-out of the MSA, guaranteed to fit - one terminal page without scrolling or MSA line breaking, that is supposed to - give you an idea of alignment quality and contents. -* Write `alv -r 20 huge_msa.fa` to get a view of the MSA containing only 20 randomly - selected sequences. +* If you have more than one alignment in your input file, then the first alignment is output unless you + use the --alignment-index (-ai) option to choose another. +* `alv` is now adapted for use in Python notebooks (tested on Jupyter) through two convenience functions + 'view' and 'glimpse'. Both functions take a BioPython alignment object and outputs a view of the + alignment. + + Writing + ``` + from Bio import AlignIO + msa = AlignIO.read('PF00005.fa', 'fasta') + import alv + alv.view(msa) + ``` + in a Jupyter notebook cell and evaluating will yield a colored alignment in the `alv` style. + + For large alignments, the glimpse function is convenient since a subset of the alignment, selected + as an easily detected conserved region, is shown. + ``` + alv.glimpse(msa) + ``` + Look for more usage information view `help(alv.view)` in a notebook cell. ## Features @@ -34,6 +47,14 @@ tested. * Guesses sequence type (DNA/RNA/AA/coding) by default. You can override with option `-t`. * Order sequence explicitly, alphabetically, or by sequence similarity. * Restrict coloring to where you don't have indels or where there is a lot of conservation. +* Focus on variable columns with the options `--only-variable` and + `--only-variable-excluding-indels`, contributed by nikostr, that constrains + coloring to columns with variation and variation not counting indels. +* The command `alv -g huge_msa.fa` displays cut-out of the MSA, guaranteed to fit + one terminal page without scrolling or MSA line breaking, that is supposed to + give you an idea of alignment quality and contents. +* Write `alv -r 20 huge_msa.fa` to get a view of the MSA containing only 20 randomly + selected sequences. ## Install diff --git a/THANKS.md b/THANKS.md index 1529a79..88a043c 100644 --- a/THANKS.md +++ b/THANKS.md @@ -8,3 +8,5 @@ * @SimonGreenhill added support for the Nexus format. * Roey Angel reported bug about `alv` not guessing the MSA type correctly when a DNA/RNA sequence is completely without "codons". +* Michael Milton suggested the --alignment-index option. +* Marina Herrera Sarrias suggested the notebook functionality. diff --git a/alv/__init__.py b/alv/__init__.py index 85ef775..960ce05 100644 --- a/alv/__init__.py +++ b/alv/__init__.py @@ -1,3 +1,54 @@ -# Empty for now. What would be needed to be initialised in this package? +from .io import get_alv_objects +from .alignmentterminal import AlignmentTerminal +def view(msa, seqtype='guess', color_scheme='guess', genetic_code=1, width=80, dotted=False): + ''' + Present a colorised version of a multiple sequence alignment (MSA). + + Options: + * msa An alignment as BioPython object (from AlignIO). + * seqtype String describing the type of sequence. One of 'aa', 'dna', + 'rna', 'codon', or 'guess' (which almost always works). + * color_scheme String indicating what coloring scheme to use. Ignore unless + you want to use 'taylor' or 'hydrophobicity'. + * genetic_code An integer indicating which genetic code to use. In the + range 1 to 33 as of this writing, with 1 being the standard + code. See NCBI's list at + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi + * width Integer indicating the desired width of alignment view. + Default: 60 characters. + * dotted If True, all characters in the first sequence of the MSA is + written and remaining sequences are dotted everwhere except + for where they differ from the first sequence. + ''' + alignment, painter = get_alv_objects(msa, seqtype, color_scheme, genetic_code) + terminal = AlignmentTerminal(width) + terminal.output_alignment(alignment, painter, width - 12, dotted) + + +def glimpse(msa, n_seq=20, seqtype='guess', color_scheme='guess', genetic_code=1, width=60, dotted=False): + ''' + Works like "view", but extracts a conserved region and outputs a random sample of the sequences or all of them if it is a small alignment. + + Options: + * msa An alignment as BioPython object (from AlignIO). + * n_seq The number of sequences to output. + * seqtype String describing the type of sequence. One of 'aa', 'dna', + 'rna', 'codon', or 'guess' (which almost always works). + * color_scheme String indicating what coloring scheme to use. Ignore unless + you want to use 'taylor' or 'hydrophobicity'. + * genetic_code An integer indicating which genetic code to use. In the + range 1 to 33 as of this writing, with 1 being the standard + code. See NCBI's list at + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi + * width Integer indicating the desired width of alignment view. + Default: 60 characters. + * dotted If True, all characters in the first sequence of the MSA is + written and remaining sequences are dotted everwhere except + for where they differ from the first sequence. + ''' + alignment, painter = get_alv_objects(msa, seqtype, color_scheme, genetic_code) + terminal = AlignmentTerminal(width) + terminal.height = n_seq + 2 + terminal.output_glimpse(alignment, painter, width, dotted) diff --git a/alv/alignmentterminal.py b/alv/alignmentterminal.py index 2aa5a7d..c8749f2 100644 --- a/alv/alignmentterminal.py +++ b/alv/alignmentterminal.py @@ -3,26 +3,19 @@ import sys import shutil + + class AlignmentTerminal: ''' This class encapsulates knowledge about the terminal and how to draw on it. ''' - def __init__(self, args): - self.width, self.height = shutil.get_terminal_size() # First item in this tuple is the width, the other is the height. - self.random_sample_size = args.random_accessions - self.sorting = args.sorting - if args.sort_by_id: - self.sorting = 'by identity' - self.order = args.sort_by_id - elif args.sorting_order: - self.sorting = 'fixed' - self.order = args.sorting_order.split(',') - if len(self.order) == 0: - raise Exception('Bad order specification: no accessions in input') - if args.select_matching: - self.selection = args.select_matching - else: - self.selection = False + def __init__(self, terminal_width, random_sample_size=0, sorting=None, order=None, accession_pattern=False): + self.width = terminal_width + self.random_sample_size = random_sample_size + self.sorting = sorting + self.order = order + self.selection = accession_pattern + def get_accession_list(self, al): ''' @@ -75,14 +68,14 @@ def _setup_left_margin(self, al, chosen_accessions): assert self.left_margin < self.width - 10 - def output_alignment(self, al, painter, width, dotted=False): + def output_alignment(self, al, painter, chosen_width, dotted=False): ''' Output alignment al to stdout in blocks of width at most w with colors from painter. Args: al -- the alignment object painter -- Painter object that decides colors for symbols - width -- How many columns to use for one alignment block + chosen_width -- How many columns to use for one alignment block dotted -- Flag. Output periods (.) if position identical to first sequence? ''' chosen_accessions = self.get_accession_list(al) @@ -92,12 +85,12 @@ def output_alignment(self, al, painter, width, dotted=False): self._setup_left_margin(al, chosen_accessions) - columns_per_block = al.block_width(self.width, width) + columns_per_block = al.block_width(self.width, chosen_width) for block in al.blocks(columns_per_block): - self._output_block(al, painter, width, chosen_accessions, block, dotted) + self._output_block(al, painter, chosen_width, chosen_accessions, block, dotted) - def output_glimpse(self, al, painter, width, dotted=False): + def output_glimpse(self, al, painter, chosen_width, dotted=False): ''' Output a single-screen glimpse of the alignment. An attempt at finding the most interesting (guessed to be the most conserved part of a random @@ -105,15 +98,18 @@ def output_glimpse(self, al, painter, width, dotted=False): ''' chosen_accessions = self.get_accession_list(al) - n_seqs_to_view = min(len(chosen_accessions), self.height - 2) # -2 to make room for tick line and next prompt + if self.height: + n_seqs_to_view = min(len(chosen_accessions), self.height - 2) # -2 to make room for tick line and next prompt on terminals + else: + n_seqs_to_view = min(len(chosen_accessions), 20) chosen_accessions = random.sample(chosen_accessions, n_seqs_to_view) self._setup_left_margin(al, chosen_accessions) - n_columns = al.block_width(self.width, width) # This many alignment columns + n_columns = al.block_width(self.width, chosen_width) # This many alignment columns conserved_block = al.get_conserved_block(n_columns) - self._output_block(al, painter, width, chosen_accessions, conserved_block, dotted) + self._output_block(al, painter, chosen_width, chosen_accessions, conserved_block, dotted) def calc_tick_indices(start, end, distance, min_distance): @@ -167,3 +163,33 @@ def make_tick_string(left_margin, start, end, distance, min_distance): index_bar += index_block last_pos = pos return index_bar + + +class AlignmentShellTerminal(AlignmentTerminal): + ''' + This subclass gets the attributes from an argparse object and + checking the actual terminal size. + ''' + def __init__(self, args): + self.random_sample_size = args.random_accessions + + sorting = args.sorting + order = None + if args.sort_by_id: + sorting = 'by identity' + order = args.sort_by_id + elif args.sorting_order: + sorting = 'fixed' + order = args.sorting_order.split(',') + if len(self.order) == 0: + raise Exception('Bad order specification: no accessions in input') + if args.select_matching: + accession_pattern = args.select_matching + else: + accession_pattern = False + + terminal_width, terminal_height = shutil.get_terminal_size() + super().__init__(terminal_width, args.random_accessions, sorting, order, accession_pattern) + self.height = terminal_height + + diff --git a/alv/colorize.py b/alv/colorize.py index 9a4437a..c7e3c17 100644 --- a/alv/colorize.py +++ b/alv/colorize.py @@ -220,7 +220,7 @@ def _color_lookup(self, c): class CodonPainter(Painter): ''' - Put paint of codons. + Put paint on codons. ''' def __init__(self, aa_painter): super().__init__() diff --git a/alv/io.py b/alv/io.py index 8b50ad7..251a345 100644 --- a/alv/io.py +++ b/alv/io.py @@ -13,6 +13,8 @@ def guess_format(filename): ''' with open(filename) as h: first_line = h.readline() + if len(first_line) < 2: + raise ValueError('Empty file') if first_line[:15] == '# STOCKHOLM 1.0': return 'stockholm' elif first_line == 'CLUSTAL': @@ -35,15 +37,28 @@ def guess_format(filename): return 'phylip' -def read_alignment(file, seqtype, input_format, color_scheme, genetic_code): +def read_alignment(file, seqtype, input_format, color_scheme, genetic_code, alignment_no=0): ''' Factory function. Read the alignment with BioPython's support, and return an appropriate alv alignment. ''' if file == '-': file = sys.stdin # Start reading from stdin if "magic filename" - alignment = AlignIO.read(file, input_format) + alignments = list(AlignIO.parse(file, input_format)) + if alignments: + n_msas = len(alignments) + if alignment_no >= n_msas: + raise IOError(f'Alignment index too large, only {n_msas} alignment(s) in the file.') + return get_alv_objects(alignments[alignment_no], seqtype, color_scheme, genetic_code) + else: + raise ValueError('No alignment in input') + +def get_alv_objects(alignment, seqtype, color_scheme, genetic_code): + ''' + Take the alignment object and return a suitable Alv alignment object, with respect + to sequence type, and colorization object ("painter"). + ''' if seqtype == 'guess': seqtype = guess_seq_type(alignment) @@ -63,7 +78,7 @@ def read_alignment(file, seqtype, input_format, color_scheme, genetic_code): al.set_genetic_code(genetic_code) return CodonAlignment(alignment), CodonPainter(painter) else: - raise Exception('Unknown option') + raise IOError(f'Unknown sequence type: "{seqtype}"') diff --git a/alv/version.py b/alv/version.py index bb64aa4..0e1a38d 100644 --- a/alv/version.py +++ b/alv/version.py @@ -1 +1 @@ -__version__ = '1.6.1' +__version__ = '1.7.0' diff --git a/bin/alv b/bin/alv index f16d057..153c19d 100755 --- a/bin/alv +++ b/bin/alv @@ -7,7 +7,7 @@ import sys from traceback import print_last from alv.version import __version__ -from alv.alignmentterminal import AlignmentTerminal +from alv.alignmentterminal import AlignmentShellTerminal import alv.io as io from alv.exceptions import AlvPossibleFormatError, AlvEmptyAlignment @@ -69,6 +69,8 @@ def setup_argument_parsing(): ap = argparse.ArgumentParser() ap.add_argument('infile', nargs='?', help="The infile is the path to a file, or '-' if reading from stdin.") ap.add_argument('--version', action='version', version='%(prog)s ' + __version__) + ap.add_argument('-ai', '--alignment-index', type=int, default=0, + help='If reading file with many alignments, choose which one to output with this zero-based index. Default: first alignment in file.') ap.add_argument('-f', '--format', choices=['guess', 'fasta', 'clustal', 'nexus', 'phylip', 'stockholm'], default='guess', help="Specify what sequence type to assume. Be specific if the file is not recognized automatically. When reading from stdin, the format is always guessed to be FASTA. Default: %(default)s") ap.add_argument('-t', '--type', choices=['aa', 'dna', 'rna', 'codon', 'guess'], default='guess', @@ -98,7 +100,7 @@ def setup_argument_parsing(): # Options for changing sequence order ordering_args = ap.add_argument_group('Sequence selection and ordering') - ordering_args.add_argument('-r', '--random-accessions', type=int, metavar='N', + ordering_args.add_argument('-r', '--random-accessions', type=int, metavar='N', default=0, help='Only view a random sample of the alignment sequences.') ordering_args.add_argument('-s', '--sorting', choices=['infile', 'alpha'], default='infile', help="Sort the sequences as given in the infile or alphabetically (by accession). Default: %(default)s") @@ -145,7 +147,7 @@ def input_and_option_adaption(args): format = 'fasta' # Hard guess, because a bit complicated when reading from pipe (sys.stdin) else: format = args.format - alignment, painter = io.read_alignment(args.infile, args.type, format, args.color_scheme, args.code) + alignment, painter = io.read_alignment(args.infile, args.type, format, args.color_scheme, args.code, args.alignment_index) return alignment, painter except KeyboardInterrupt: @@ -196,7 +198,14 @@ def main(): ap.exit() # Read the data - alignment, painter = input_and_option_adaption(args) + try: + alignment, painter = input_and_option_adaption(args) + except IOError as e: + print(f'alv: {e}', file=sys.stderr) + ap.exit(7) + except ValueError as e: + print(f'alv: {e}', file=sys.stderr) + ap.exit(8) # In case we just want to know the basics: if args.just_info: @@ -219,7 +228,7 @@ def main(): args.width = alignment.al_width() args.keep_colors_when_redirecting = True - terminal = AlignmentTerminal(args) + terminal = AlignmentShellTerminal(args) painter.set_options(args) try: if args.glimpse: @@ -237,14 +246,16 @@ def main(): # This should not cause any specific error at all: correct behaviour is to end the program. sys.stderr.close() # Without this line, python will sometimes give an error ap.exit(6) - + except IOError as e: + print(f'alv: ', file=sys.stderr) + ap.exit(7) except ValueError as e: print('alv:', e, file=sys.stderr) ap.exit(3) - # except Exception as e: - # print('Alv bug! Please report!', file=sys.stderr) - # print(e, file=sys.stderr) - # ap.exit(2) + except Exception as e: + print('Alv bug! Please report!', file=sys.stderr) + print(e, file=sys.stderr) + ap.exit(2) if __name__ == '__main__': main() diff --git a/tests/multi.sthlm b/tests/multi.sthlm new file mode 100644 index 0000000..0514a9c --- /dev/null +++ b/tests/multi.sthlm @@ -0,0 +1,26 @@ +# STOCKHOLM 1.0 +#=GS ARAG_ECOLI/23-172 AC P0AAF3.1 +#=GS YTFR_ECOLI/25-174 AC Q6BEX0.1 +#=GS THIQ_ECOLI/15-158 AC P31548.2 +#=GS MODC_ECOLI/14-157 AC P09833.4 +ARAG_ECOLI/23-172 LTDISFDCYAGQVHALMGENGAGKSTLLKILSGNYAP.....TTGSV.....................VIN.GQEMSF.......SDTTAALNAGVAIIYQE..LHLVPEMT.VAENIYLGQLP.....................................HKGGIVNRSLLNYE........AGLQLKHLGMDI..........................DPDTPL..............KYLSIGQWQ..MVEIAKALARNAKIIAFDEPTS +#=GR ARAG_ECOLI/23-172 pAS ..............................................................................................................................................................................................................................................................................*... +YTFR_ECOLI/25-174 LDNVDFSLRRGEIMALLGENGAGKSTLIKALTGVYHA.....DRGTI.....................WLE.GQAISP.......KNTAHAQQLGIGTVYQE..VNLLPNMS.VADNLFIGREP.....................................KRFGLLRRKEMEKR........ATELMASYGFSL..........................DVREPL..............NRFSVAMQQ..IVAICRAIDLSAKVLILDEPTA +#=GR YTFR_ECOLI/25-174 pAS ..............................................................................................................................................................................................................................................................................*... +THIQ_ECOLI/15-158 PMRFSLTVERGEQVAILGPSGAGKSTLLNLIAGFLTP.....ASGSL.....................TID.GVDHTT..........MPPSRRPVSMLFQE..NNLFSHLT.VAQNIGLGLN........................................PGLKLNAVQQGK........MHAIARQMGIDN..........................LMARLP..............GELSGGQRQ..RVALARCLVREQPILLLDEPFS +#=GR THIQ_ECOLI/15-158 pAS ..............................................................................................................................................................................................................................................................................*... +MODC_ECOLI/14-157 CLTINETLPANGITAIFGVSGAGKTSLINAISGLTRP.....QKGRI.....................VLN.GRVLNDAE....KGICLTPEKRRVGYVFQD..ARLFPHYK.VRGNLRYG..............................................MSKSMVDQ........FDKLVALLGIEP..........................LLDRLP..............GSLSGGEKQ..RVAIGRALLTAPELLLLDEPLA +#=GR MODC_ECOLI/14-157 pAS ..............................................................................................................................................................................................................................................................................*... +// +# STOCKHOLM 1.0 +#=GS ARTP_ECOLI/18-170 AC P0AAF6.1 +#=GS PHNL_ECOLI/24-178 AC P16679.1 +#=GS ABCA_AERSA/41-174 AC Q07698.1 +#=GS NODI_AZOC5/30-174 AC Q07756.1 +#=GS H7EQI2_PSEST/19-157 AC H7EQI2.1 +ARTP_ECOLI/18-170 LFDITLDCPQGETLVLLGPSGAGKSSLLRVLNLLEMP.....RSGTLNIA.GNHFDFTKT..PSDKAIRDLRRNVGMVFQQ..YNLWPHLT.VQQNLIEAPC.......................................RVLGLSKDQALAR........AEKLLERLRLKP..........................YSDRYP..............LHLSGGQQQ..RVAIARALMMEPQVLLFDEPTA +PHNL_ECOLI/24-178 LNRASLTVNAGECVVLHGHSGSGKSTLLRSLYANYLP.....DEGQIQIKHGDEWVDLVT.APARKVVEIRKTTVGWVSQF..LRVIPRIS.ALEVVMQPLL........................................DTGVPREACAAK........AARLLTRLNVPE.............................RLWHLAP..........STFSGGEQQ..RVNIARGFIVDYPILLLDEPTA +ABCA_AERSA/41-174 LRDIELTVYRGETIGIVGHNGAGKSTLLQLITGVMQP.....DCGQI.....................TRTGRVVGLLELG..SGFNPEFT.GRENIFFNGA........................................ILGMSQREMDDR........LERILSFAAIGD..........................FIDQPV..............KNYSSGMMV..RLAFSVIINTDPDVLIIDEALA +NODI_AZOC5/30-174 LRGLDMYVRRGERYGIMGTNGAGKSSLINIILGLTPP.....DRGTVTVF.GKDMRR.........QGHLARARIGVVPQD..DCLEQNMT.PYENVMLYGR........................................LCRMTASEARKR........ADQIFEKFSMMD..........................CANRPV..............RLLSGGMRR..LTMVARALVNDPYVIILDEATV +H7EQI2_PSEST/19-157 LHDLNLNLGEGEVLGLFGHNGAGKTTSMKLILGLLSP.....SEGQVKVL.GRAPND...........PQVRRQLGYLPEN..VTFYPQLS.GRETLR............................................HFARLKGAALTQ........VDELLEQVGLAH..........................AADRRV..............KTYSKGMRQ..RLGLAQALLGEPRLLLLDEPTV +// diff --git a/tests/t2_longaccessions.fa b/tests/t2_longaccessions.fa new file mode 100644 index 0000000..fb64735 --- /dev/null +++ b/tests/t2_longaccessions.fa @@ -0,0 +1,8 @@ +>shubbadooo_s1 +TCT!TAT--- +>hubbahubba2 +TCC-TACATT +>longerandlonger_s3 +TCA-TAAATT +>s4 +TCG-TAGATA