Merge pull request #3 from JSdoubleL/partition

Partition
JSdoubleL · Apr 21, 2024 · 6a232b2 · 6a232b2
2 parents 103492d + 6d86aba
commit 6a232b2
Show file tree

Hide file tree

Showing 7 changed files with 125 additions and 215 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+
diff --git a/README.md b/README.md
@@ -11,6 +11,7 @@ Decomposition Into Single-COpy gene trees ([DISCO](https://doi.org/10.1093/sysbi
 - v1.2 (June 8th, 2022): Tree rerooting fix (addresses issue with treeswift rooting with no edge lengths)
 - v1.3 (July 8th, 2022): Allows for more flexibility with leaf labels
 - v1.3.1 (October 31st, 2022): Fixes issue where extra newline character at the end of the file causes a crash 
+- v1.4.0 (April 21st, 2024): Added ability to make a partition file to CA-DISCO
 
 **Note:** At present, it is recommended to use the latest version if you want to run the DISCO algorithm as described in Willson et al. 2021, as using earlier versions can causes results inconsistent with the DISCO algorithm in rare cases. 
 
@@ -21,7 +22,7 @@ If you use DISCO, please cite:
 @article{willson2022disco,
   title={DISCO: Species tree inference using multicopy gene family tree decomposition},
   author={Willson, James and Roddur, Mrinmoy Saha and Liu, Baqiao and Zaharias, Paul and Warnow, Tandy},
-  journal={Systematic biology},
+  journal={Systematic Biology},
   volume={71},
   number={3},
   pages={610--629},
@@ -81,41 +82,56 @@ python3 disco.py -i <input_file> -o <ouput_file> -d <delimiter>
 
 #### Example
 
-```cmd
+```bash
 python3 disco.py -i example/gtrees-mult.trees
 ```
 
 ### ca_disco.py
 
-**Input**: File containing list of multi-copy trees in newick format and set of alignment files in phylip format corresponding to the gene families.
+**Input**: File containing list of multi-copy trees in newick format and set of alignment files corresponding to the gene families.
 
-**Output**: Concatenated alignment file in the phylip format
+**Output**: Concatenated alignment file
 
 ```
-python3 ca_disco.py -i <input_trees> -a <alignments_list> -t <taxa_list> -o <output> -d <delimiter> -m <n> 
+python3 ca_disco.py -i <input_trees> -a <aln_list> -o <output> -d <delimiter> -m <number> 
 ```
 
 `disco.py` must be present in the same directory as `ca_disco.py` in order for it to run. Also, unlike `disco.py`, it is necessary for the input newick trees given to `ca_disco.py` to have unique leaf labels where the taxon name comes first and is separated from the rest of the name by some delimiter. 
 
+The `-a` argument should be given the path to an "alignment list" file containing the path to each alignment file you want to concatinate separated by a new line. The order of the alignment files is important---for each tree in the input newick tree file, there should be a corresponding alignment on the same line in the respective alignment list file.  
+
 #### Arguments
 
 ##### Required
 
 ```
 -i, --input           Input newick tree file
 -a, --alignment       Text file containing paths to alignment files
--t, --taxonset        Text file containing taxa list
+-f, --format          Format of alignment file (either "fasta" or "phylip")
 -o, --output          Output concatenated alignment file
 ```
 
 ##### Optional
 ```
 -m, --filter          Minimum number of taxa required sequence group to be included
 -d, --delimiter       Delimiter separating species name from rest of leaf label
+-p, --partition       Create partition file
 ```
 
 #### Example
 
-```cmd
-python3 ca_disco.py -i example/g_100.trees -o example.phy -a example/seq_list.txt -t example/taxa_list.txt
+```bash 
+python3 ca_disco.py -i example/g_100.trees -o example.phy -a example/seq_list.csv -f phylip
+```
+
+#### Partition File
+
+`ca_disco.py` now has the option to create a partition file; this can be done with the optional `-p` argument. If this argument is specified, it is necessary to provide relavent information in the alignment list file. For example, see `example/seq_list.csv`. This would generate a partition file like so:
+```
+GTR+G, 0001=1-200
+GTR+G, 0002=201-700
+GTR+G, 0004=701-1300
+...
 ```
+**Note:** Gene 0003 is missing. This is not a mistake; this is because it does not have enough informative information to generate large enough DISCO subtrees and thus is not included in the alignment.
+
diff --git a/ca_disco.py b/ca_disco.py
@@ -1,111 +1,104 @@
-from disco import *
-import treeswift as ts
-from Bio import AlignIO
-from Bio.AlignIO import MultipleSeqAlignment
-from Bio.SeqRecord import SeqRecord
-from Bio.Seq import Seq
-import os
 import argparse
+import csv
+import os
 
-from argparse import ArgumentParser, ArgumentTypeError
-import re
-
-def fileList(path):
-    return [line.strip() for line in open(path).readlines()]
-
-def parseTaxonList(string):
-    if os.path.isfile(string): 
-        return [line.strip() for line in open(string).readlines()]
-    # https://stackoverflow.com/a/6512463
-    m = re.match(r'(\d+)(?:-(\d+))?$', string)
-    # ^ (or use .split('-'). anyway you like.)
-    if not m:
-        raise ArgumentTypeError("'" + string + "' is not a range of number. Expected forms like '0-5' or '2'.")
-    start = m.group(1)
-    end = m.group(2) or start
-    return list(range(int(start,10), int(end,10)+1))
+from Bio import AlignIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+import treeswift as ts
 
+from disco import *
 
-def retrieve_alignment(tre, alnpath, taxonset=range(0,101), delimiter='_'):
+def retrieve_alignment(tree, aln_path, format, taxa_set, label_to_species):
     """
     Parameters
     ----------------
-    tre : single-copy treeswift tree generated from James's code
-    alnpath : path to the phylip formatted alignment of the genes. The row labels should be a superset of the leafset of 'tre'
-    seqlen : sequence length parameter, only the first seqlen columns are taken from the MSA
-    taxonset: set, the taxon set of the entire dataset
+    tree: single-copy treeswift tree generated by DISCO.
+    aln_path: path to the PHYLIP formatted alignment of the genes. 
+        The row labels should be a superset of the leafset of 'tree'.
+    format: file format of the alignment (either "phylip" or "fasta").
+    taxa_set: set, the taxon set of the entire dataset
+    delimiter: delimiter used to get the taxa names from the labels
 
     Returns the MSA that corresponds to the input tree.
     """
-    aln = AlignIO.read(open(alnpath), "phylip") 
-    seqlen = len(aln[0].seq)
-    blank = "-" * seqlen
-    whitelist = set(tre.labels(True, False))
-    rest = set(taxonset)
-    #print(rest)
-    res = MultipleSeqAlignment([])
-    for r in aln[:,:seqlen]:
-        if r.id in whitelist:
-            rid = r.id.split(delimiter)[0]
-            rid_i = rid
-            res.append(SeqRecord(r.seq, id=rid))
-            rest.remove(rid_i)
-    for rst in rest:
-        res.append(SeqRecord(Seq(blank), id=str(rst)))
-    res.sort()
-    return res
-
-def format_phy(i, n):
-    return str(i).zfill(n) + ".phy"
+    aln = AlignIO.read(open(aln_path), format) 
+    seq_len = len(aln[0].seq)
+    blank = "-" * seq_len
+    whitelist, remaining = set(tree.labels(True, False)), set(taxa_set)
+    result = MultipleSeqAlignment([])
+
+    # you can't get sequences by name from aln objects in biopython 
+    # in a better way as far as I can tell
+    for record in aln[:,:seq_len]:
+        if record.id in whitelist:
+            taxon_name = label_to_species(record.id)
+            result.append(SeqRecord(record.seq, id=taxon_name))
+            remaining.remove(taxon_name)
 
-if __name__=="__main__":
+    for taxon_name in remaining:
+        result.append(SeqRecord(Seq(blank), id=str(taxon_name)))
+
+    result.sort()
+    return result
+
+def main(args):
+    input_alignments = [row for row in csv.reader(open(args.alignment, "r"))]
+    label_to_species = lambda x:x.split(args.delimiter)[0]
+    tree_list = ts.read_tree_newick(args.input)
+    assert not isinstance(tree_list, ts.Tree)
+    taxa_set = set(label for tree in tree_list 
+                   for label in map(label_to_species, tree.labels(True, False)))
+
+    # init aln with taxa labels
+    aln = MultipleSeqAlignment([])
+    for taxa in taxa_set:
+        aln.append(SeqRecord(Seq(''), id=taxa))
+    aln.sort()  
+
+    partitions = []
+    p_index = 1
 
-    parser = argparse.ArgumentParser(description = "generate concatenation files from gene-family trees using decomposition strategies")
+    for (aln_file, *_), tree in zip(input_alignments, tree_list):
+        tree.reroot(get_min_root(tree, label_to_species)[0])
+        tag(tree,label_to_species)
+        disco_trees = list(filter(lambda x:x.num_nodes(internal=False) >= args.filter, decompose(tree)))
+        for dtree in disco_trees:
+            aln += retrieve_alignment(dtree, aln_file, args.format,taxa_set, label_to_species)
+        if aln.get_alignment_length() + 1 != p_index:
+            partitions.append(f"{p_index:d}-{aln.get_alignment_length():d}") 
+            p_index = aln.get_alignment_length() + 1
+        else:
+            partitions.append("empty")
+
+    if args.partition:
+        with open(f"{args.output_prefix}-partitions.txt", "w") as f:
+            assert all(len(x) == 2 for x in input_alignments), "alignment list file format problem"
+            for partition, (aln_file, model) in zip(partitions, input_alignments):
+                if partition != "empty":
+                    gene_name = aln_file.split(os.sep)[-1].split('.')[0]
+                    f.write(f"{model}, {gene_name}={partition}\n")
+
+    AlignIO.write(aln, f"{args.output_prefix}-aln.{args.format[:3]}", args.format)
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser(description="generate concatenation files from gene-family trees using decomposition strategies")
 
-    # some of these arguments are directly copied from James's code
     parser.add_argument("-i", "--input", type=str,
-                        help="Input tree list file", required=True)
-    parser.add_argument("-o", "--output", type=str, required=True,
-                        help="Output tree list file")
-    parser.add_argument("-a", "--alignment", type=fileList, required=True,
-                        help="File containing paths to all alignment files in order the genes are found in the input newick file")
+                        help="input tree list file", required=True)
+    parser.add_argument("-o", "--output-prefix", type=str, required=True,
+                        help="output tree list file")
+    parser.add_argument("-a", "--alignment", required=True, type=str,
+                        help="alignment files list")
+    parser.add_argument('-f', '--format', choices=["phylip", "fasta"], required=True,
+                        help="alignment file format")
     parser.add_argument('-d', '--delimiter', type=str, default='_',
-                        help="Delimiter separating taxon label from the rest of the leaf label.")
+                        help="delimiter separating taxon label from the rest of the leaf label.")
     parser.add_argument('-m', '--filter', type=int, default=4,
-                        help="Exclude decomposed trees with less then X taxa")
-    #parser.add_argument("-k", "--ngenes", type=int, default = math.inf, help="maximum number of input gene trees to use")
-    #parser.add_argument("-d", "--decomp", type=str, default = "s", help = "decomposition method, either s or d for sampling (linear) or decomposition")
-    #parser.add_argument("-l", "--seqln", type=int, default = math.inf, help="maximum kept sequence length of alignments")
-    parser.add_argument("-t", "--taxonset", type=parseTaxonList, required=True, 
-                        help="taxon set (a range of numbers in the format of a-b) or file listing taxa (each label alone on one line)")
-    args = parser.parse_args()
-
-
-    # arguments: n (seqlength), alnpath, inpath, k (genetree limit), decompm: either "s" or "d"
-    #N = args.seqln # seqlength
-    ALNROOT = args.alignment # alignment directory, containing zero-padded files like 0001.phy ... 0030.phy that corresponds to the alignment of the genes
-    INPATH = args.input # input multi-copy gene family trees path
-    #K = args.ngenes # maximum number of multi-copy trees to use from INPATH
-    #DECOMPM = args.decomp # decomposition method, either "s" or "d" for sampling or decomposition
-    LEAFSET = args.taxonset
-    OUTPUT = args.output #or INPATH + f".{K}.{DECOMPM}.aln"
-    DELIMITER = args.delimiter
-    F = args.filter
-
-    num_genes = sum(1 for _ in open(INPATH))
-
-    with open(INPATH) as fh:
-        aln = None
-        for i, l in enumerate(fh, start=1):
-            t = ts.read_tree_newick(l)
-            t.reroot(get_min_root(t, lambda x:x.split(DELIMITER)[0])[0])
-            tag(t, lambda x:x.split(DELIMITER)[0])
-            out = list(filter(lambda x:x.num_nodes(internal=False) >= F, decompose(t)))
-            #phy = os.path.join(ALNROOT, format_phy(i, len(str(num_genes))))
-            phy = ALNROOT[i - 1]
-            for ot in out:
-                if aln == None:
-                    aln = retrieve_alignment(ot, phy, LEAFSET, delimiter=DELIMITER)
-                else:
-                    aln += retrieve_alignment(ot, phy, LEAFSET, delimiter=DELIMITER)
-        AlignIO.write(aln, OUTPUT, "phylip")
+                        help="exclude decomposed trees with less then X taxa")
+    parser.add_argument('-p', '--partition', action='store_true', 
+                        help="generate partition file")
+
+    main(parser.parse_args())
+
diff --git a/disco.py b/disco.py
@@ -325,7 +325,7 @@ def main(args):
 
 if __name__ == "__main__":
 
-    parser = argparse.ArgumentParser(description='====================== DISCO v1.3.1 ======================')
+    parser = argparse.ArgumentParser(description='====================== DISCO v1.4.0 ======================')
 
     parser.add_argument("-i", "--input", type=str,
                         help="Input tree list file", required=True)

diff --git a/example/seq_list.csv b/example/seq_list.csv
@@ -0,0 +1,10 @@
+example/seqs/0001.phy,GTR+G
+example/seqs/0002.phy,GTR+G
+example/seqs/0003.phy,GTR+G
+example/seqs/0004.phy,GTR+G
+example/seqs/0005.phy,GTR+G
+example/seqs/0006.phy,GTR+G
+example/seqs/0007.phy,GTR+G
+example/seqs/0008.phy,GTR+G
+example/seqs/0009.phy,GTR+G
+example/seqs/0010.phy,GTR+G
diff --git a/example/seq_list.txt b/example/seq_list.txt