-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from JSdoubleL/partition
Partition
- Loading branch information
Showing
7 changed files
with
125 additions
and
215 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
__pycache__/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,111 +1,104 @@ | ||
from disco import * | ||
import treeswift as ts | ||
from Bio import AlignIO | ||
from Bio.AlignIO import MultipleSeqAlignment | ||
from Bio.SeqRecord import SeqRecord | ||
from Bio.Seq import Seq | ||
import os | ||
import argparse | ||
import csv | ||
import os | ||
|
||
from argparse import ArgumentParser, ArgumentTypeError | ||
import re | ||
|
||
def fileList(path): | ||
return [line.strip() for line in open(path).readlines()] | ||
|
||
def parseTaxonList(string): | ||
if os.path.isfile(string): | ||
return [line.strip() for line in open(string).readlines()] | ||
# https://stackoverflow.com/a/6512463 | ||
m = re.match(r'(\d+)(?:-(\d+))?$', string) | ||
# ^ (or use .split('-'). anyway you like.) | ||
if not m: | ||
raise ArgumentTypeError("'" + string + "' is not a range of number. Expected forms like '0-5' or '2'.") | ||
start = m.group(1) | ||
end = m.group(2) or start | ||
return list(range(int(start,10), int(end,10)+1)) | ||
from Bio import AlignIO | ||
from Bio.Align import MultipleSeqAlignment | ||
from Bio.Seq import Seq | ||
from Bio.SeqRecord import SeqRecord | ||
import treeswift as ts | ||
|
||
from disco import * | ||
|
||
def retrieve_alignment(tre, alnpath, taxonset=range(0,101), delimiter='_'): | ||
def retrieve_alignment(tree, aln_path, format, taxa_set, label_to_species): | ||
""" | ||
Parameters | ||
---------------- | ||
tre : single-copy treeswift tree generated from James's code | ||
alnpath : path to the phylip formatted alignment of the genes. The row labels should be a superset of the leafset of 'tre' | ||
seqlen : sequence length parameter, only the first seqlen columns are taken from the MSA | ||
taxonset: set, the taxon set of the entire dataset | ||
tree: single-copy treeswift tree generated by DISCO. | ||
aln_path: path to the PHYLIP formatted alignment of the genes. | ||
The row labels should be a superset of the leafset of 'tree'. | ||
format: file format of the alignment (either "phylip" or "fasta"). | ||
taxa_set: set, the taxon set of the entire dataset | ||
delimiter: delimiter used to get the taxa names from the labels | ||
Returns the MSA that corresponds to the input tree. | ||
""" | ||
aln = AlignIO.read(open(alnpath), "phylip") | ||
seqlen = len(aln[0].seq) | ||
blank = "-" * seqlen | ||
whitelist = set(tre.labels(True, False)) | ||
rest = set(taxonset) | ||
#print(rest) | ||
res = MultipleSeqAlignment([]) | ||
for r in aln[:,:seqlen]: | ||
if r.id in whitelist: | ||
rid = r.id.split(delimiter)[0] | ||
rid_i = rid | ||
res.append(SeqRecord(r.seq, id=rid)) | ||
rest.remove(rid_i) | ||
for rst in rest: | ||
res.append(SeqRecord(Seq(blank), id=str(rst))) | ||
res.sort() | ||
return res | ||
|
||
def format_phy(i, n): | ||
return str(i).zfill(n) + ".phy" | ||
aln = AlignIO.read(open(aln_path), format) | ||
seq_len = len(aln[0].seq) | ||
blank = "-" * seq_len | ||
whitelist, remaining = set(tree.labels(True, False)), set(taxa_set) | ||
result = MultipleSeqAlignment([]) | ||
|
||
# you can't get sequences by name from aln objects in biopython | ||
# in a better way as far as I can tell | ||
for record in aln[:,:seq_len]: | ||
if record.id in whitelist: | ||
taxon_name = label_to_species(record.id) | ||
result.append(SeqRecord(record.seq, id=taxon_name)) | ||
remaining.remove(taxon_name) | ||
|
||
if __name__=="__main__": | ||
for taxon_name in remaining: | ||
result.append(SeqRecord(Seq(blank), id=str(taxon_name))) | ||
|
||
result.sort() | ||
return result | ||
|
||
def main(args): | ||
input_alignments = [row for row in csv.reader(open(args.alignment, "r"))] | ||
label_to_species = lambda x:x.split(args.delimiter)[0] | ||
tree_list = ts.read_tree_newick(args.input) | ||
assert not isinstance(tree_list, ts.Tree) | ||
taxa_set = set(label for tree in tree_list | ||
for label in map(label_to_species, tree.labels(True, False))) | ||
|
||
# init aln with taxa labels | ||
aln = MultipleSeqAlignment([]) | ||
for taxa in taxa_set: | ||
aln.append(SeqRecord(Seq(''), id=taxa)) | ||
aln.sort() | ||
|
||
partitions = [] | ||
p_index = 1 | ||
|
||
parser = argparse.ArgumentParser(description = "generate concatenation files from gene-family trees using decomposition strategies") | ||
for (aln_file, *_), tree in zip(input_alignments, tree_list): | ||
tree.reroot(get_min_root(tree, label_to_species)[0]) | ||
tag(tree,label_to_species) | ||
disco_trees = list(filter(lambda x:x.num_nodes(internal=False) >= args.filter, decompose(tree))) | ||
for dtree in disco_trees: | ||
aln += retrieve_alignment(dtree, aln_file, args.format,taxa_set, label_to_species) | ||
if aln.get_alignment_length() + 1 != p_index: | ||
partitions.append(f"{p_index:d}-{aln.get_alignment_length():d}") | ||
p_index = aln.get_alignment_length() + 1 | ||
else: | ||
partitions.append("empty") | ||
|
||
if args.partition: | ||
with open(f"{args.output_prefix}-partitions.txt", "w") as f: | ||
assert all(len(x) == 2 for x in input_alignments), "alignment list file format problem" | ||
for partition, (aln_file, model) in zip(partitions, input_alignments): | ||
if partition != "empty": | ||
gene_name = aln_file.split(os.sep)[-1].split('.')[0] | ||
f.write(f"{model}, {gene_name}={partition}\n") | ||
|
||
AlignIO.write(aln, f"{args.output_prefix}-aln.{args.format[:3]}", args.format) | ||
|
||
if __name__=="__main__": | ||
parser = argparse.ArgumentParser(description="generate concatenation files from gene-family trees using decomposition strategies") | ||
|
||
# some of these arguments are directly copied from James's code | ||
parser.add_argument("-i", "--input", type=str, | ||
help="Input tree list file", required=True) | ||
parser.add_argument("-o", "--output", type=str, required=True, | ||
help="Output tree list file") | ||
parser.add_argument("-a", "--alignment", type=fileList, required=True, | ||
help="File containing paths to all alignment files in order the genes are found in the input newick file") | ||
help="input tree list file", required=True) | ||
parser.add_argument("-o", "--output-prefix", type=str, required=True, | ||
help="output tree list file") | ||
parser.add_argument("-a", "--alignment", required=True, type=str, | ||
help="alignment files list") | ||
parser.add_argument('-f', '--format', choices=["phylip", "fasta"], required=True, | ||
help="alignment file format") | ||
parser.add_argument('-d', '--delimiter', type=str, default='_', | ||
help="Delimiter separating taxon label from the rest of the leaf label.") | ||
help="delimiter separating taxon label from the rest of the leaf label.") | ||
parser.add_argument('-m', '--filter', type=int, default=4, | ||
help="Exclude decomposed trees with less then X taxa") | ||
#parser.add_argument("-k", "--ngenes", type=int, default = math.inf, help="maximum number of input gene trees to use") | ||
#parser.add_argument("-d", "--decomp", type=str, default = "s", help = "decomposition method, either s or d for sampling (linear) or decomposition") | ||
#parser.add_argument("-l", "--seqln", type=int, default = math.inf, help="maximum kept sequence length of alignments") | ||
parser.add_argument("-t", "--taxonset", type=parseTaxonList, required=True, | ||
help="taxon set (a range of numbers in the format of a-b) or file listing taxa (each label alone on one line)") | ||
args = parser.parse_args() | ||
|
||
|
||
# arguments: n (seqlength), alnpath, inpath, k (genetree limit), decompm: either "s" or "d" | ||
#N = args.seqln # seqlength | ||
ALNROOT = args.alignment # alignment directory, containing zero-padded files like 0001.phy ... 0030.phy that corresponds to the alignment of the genes | ||
INPATH = args.input # input multi-copy gene family trees path | ||
#K = args.ngenes # maximum number of multi-copy trees to use from INPATH | ||
#DECOMPM = args.decomp # decomposition method, either "s" or "d" for sampling or decomposition | ||
LEAFSET = args.taxonset | ||
OUTPUT = args.output #or INPATH + f".{K}.{DECOMPM}.aln" | ||
DELIMITER = args.delimiter | ||
F = args.filter | ||
|
||
num_genes = sum(1 for _ in open(INPATH)) | ||
|
||
with open(INPATH) as fh: | ||
aln = None | ||
for i, l in enumerate(fh, start=1): | ||
t = ts.read_tree_newick(l) | ||
t.reroot(get_min_root(t, lambda x:x.split(DELIMITER)[0])[0]) | ||
tag(t, lambda x:x.split(DELIMITER)[0]) | ||
out = list(filter(lambda x:x.num_nodes(internal=False) >= F, decompose(t))) | ||
#phy = os.path.join(ALNROOT, format_phy(i, len(str(num_genes)))) | ||
phy = ALNROOT[i - 1] | ||
for ot in out: | ||
if aln == None: | ||
aln = retrieve_alignment(ot, phy, LEAFSET, delimiter=DELIMITER) | ||
else: | ||
aln += retrieve_alignment(ot, phy, LEAFSET, delimiter=DELIMITER) | ||
AlignIO.write(aln, OUTPUT, "phylip") | ||
help="exclude decomposed trees with less then X taxa") | ||
parser.add_argument('-p', '--partition', action='store_true', | ||
help="generate partition file") | ||
|
||
main(parser.parse_args()) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
example/seqs/0001.phy,GTR+G | ||
example/seqs/0002.phy,GTR+G | ||
example/seqs/0003.phy,GTR+G | ||
example/seqs/0004.phy,GTR+G | ||
example/seqs/0005.phy,GTR+G | ||
example/seqs/0006.phy,GTR+G | ||
example/seqs/0007.phy,GTR+G | ||
example/seqs/0008.phy,GTR+G | ||
example/seqs/0009.phy,GTR+G | ||
example/seqs/0010.phy,GTR+G |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.