Skip to content

Commit

Permalink
Merge pull request #53 from andrewjpage/master
Browse files Browse the repository at this point in the history
snp branch output and prefiltering aln
  • Loading branch information
andrewjpage committed Nov 2, 2012
2 parents 9d57e2f + ba837f1 commit a64ba0f
Show file tree
Hide file tree
Showing 14 changed files with 2,537 additions and 30 deletions.
55 changes: 48 additions & 7 deletions run_gubbins.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@
import os
import time
import re
import tempfile
from Bio import Phylo
import dendropy
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from cStringIO import StringIO
import shutil

Expand Down Expand Up @@ -237,6 +239,34 @@ def get_sequence_names_from_alignment(filename):
sequence_names.append(record.id)
handle.close()
return sequence_names


def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
input_handle = open(input_filename, "rU")
output_handle = open(output_filename, "w+")
alignments = AlignIO.parse(input_handle, "fasta")
output_alignments = []
for alignment in alignments:
for record in alignment:
number_of_gaps = 0
number_of_gaps += record.seq.count('n')
number_of_gaps += record.seq.count('N')
number_of_gaps += record.seq.count('-')
sequence_length = len(record.seq)

if sequence_length == 0:
if verbose > 0:
print "Excluded sequence " + record.id + " because there werent enough bases in it"
elif((number_of_gaps*100/sequence_length) <= filter_percentage):
output_alignments.append(record)
else:
if verbose > 0:
print "Excluded sequence " + record.id + " because it had " + str(number_of_gaps*100/sequence_length) +" percentage gaps while a maximum of "+ str(filter_percentage) +" is allowed"

AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
output_handle.close()
input_handle.close()
return

# reparsing a fasta file splits the lines which makes fastml work
def reconvert_fasta_file(input_filename, output_filename):
Expand Down Expand Up @@ -296,6 +326,7 @@ def is_exe(fpath):
parser.add_argument('--tree_builder', '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
parser.add_argument('--iterations', '-i', help='Maximum No. of iterations, default is 5', type=int, default = 5)
parser.add_argument('--min_snps', '-m', help='Min SNPs to identify a recombination block, default is 3', type=int, default = 3)
parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int, default = 25)
args = parser.parse_args()

# check that all the external executable dependancies are available
Expand All @@ -318,18 +349,28 @@ def is_exe(fpath):
if args.verbose > 0:
print current_time

# find all snp sites
if args.verbose > 0:
print GUBBINS_EXEC + args.alignment_filename
subprocess.check_call([GUBBINS_EXEC, args.alignment_filename])
if args.verbose > 0:
print int(time.time())
# get the base filename
(base_directory,base_filename) = os.path.split(args.alignment_filename)
(base_filename_without_ext,extension) = os.path.splitext(base_filename)
starting_base_filename = base_filename

# put a filtered copy into a temp directory and work from that
temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
filter_out_alignments_with_too_much_missing_data(args.alignment_filename, temp_working_dir+"/"+starting_base_filename, args.filter_percentage, args.verbose)
args.alignment_filename = temp_working_dir+"/"+starting_base_filename

# get the base filename
(base_directory,base_filename) = os.path.split(args.alignment_filename)
(base_filename_without_ext,extension) = os.path.splitext(base_filename)
starting_base_filename = base_filename

# find all snp sites
if args.verbose > 0:
print GUBBINS_EXEC +" "+ args.alignment_filename
subprocess.check_call([GUBBINS_EXEC, args.alignment_filename])
if args.verbose > 0:
print int(time.time())

reconvert_fasta_file(starting_base_filename+".gaps.snp_sites.aln",starting_base_filename+".start")

# Perform pairwise comparison if there are only 2 sequences
Expand All @@ -338,7 +379,6 @@ def is_exe(fpath):
pairwise_comparison(args.alignment_filename,starting_base_filename,GUBBINS_EXEC,args.alignment_filename)
sys.exit()


latest_file_name = "latest_tree."+base_filename_without_ext+"."+str(current_time)+".tre"
tree_file_names = []

Expand Down Expand Up @@ -438,4 +478,5 @@ def is_exe(fpath):

fasttree_regex_for_file_deletions = fasttree_regex_for_file_deletions(starting_base_filename, max_intermediate_iteration)
delete_files_based_on_list_of_regexes('.', fasttree_regex_for_file_deletions, args.verbose)
shutil.rmtree(temp_working_dir)

9 changes: 7 additions & 2 deletions src/Newickform.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp
strcpy(block_file_name, filename);
block_file_pointer = fopen(strcat(block_file_name,".tab"), "w");

// output tab file
FILE * branch_snps_file_pointer;
char branch_snps_file_name[MAX_FILENAME_SIZE];
strcpy(branch_snps_file_name, filename);
branch_snps_file_pointer = fopen(strcat(branch_snps_file_name,".branch_snps.tab"), "w");

// output gff file
FILE * gff_file_pointer;
char gff_file_name[MAX_FILENAME_SIZE];
Expand All @@ -84,8 +90,7 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp
char * root_sequence;

carry_unambiguous_gaps_up_tree(root);

root_sequence = generate_branch_sequences(root, vcf_file_pointer, snp_locations, number_of_snps, column_names, number_of_columns,root_sequence, length_of_original_genome, block_file_pointer,gff_file_pointer,min_snps);
root_sequence = generate_branch_sequences(root, vcf_file_pointer, snp_locations, number_of_snps, column_names, number_of_columns,root_sequence, length_of_original_genome, block_file_pointer,gff_file_pointer,min_snps,branch_snps_file_pointer);
int * parent_recombinations;
fill_in_recombinations_with_gaps(root, parent_recombinations, 0, 0,0,root->block_coordinates,length_of_original_genome,snp_locations);

Expand Down
20 changes: 18 additions & 2 deletions src/block_tab_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,28 @@
#include <string.h>
#include "block_tab_file.h"

void print_block_details(FILE * block_file_pointer, int start_coordinate, int end_coordinate, int number_of_snps, int current_node_id, int parent_node_id, char * taxon_names)
void print_block_details(FILE * block_file_pointer, int start_coordinate, int end_coordinate, int number_of_snps, char * current_node_id, char * parent_node_id, char * taxon_names)
{
fprintf(block_file_pointer, "FT misc_feature %d..%d\n", start_coordinate+1, end_coordinate+1);
fprintf(block_file_pointer, "FT /node=\"%d->%d\"\n",parent_node_id,current_node_id);
fprintf(block_file_pointer, "FT /node=\"%s->%s\"\n",parent_node_id,current_node_id);
fprintf(block_file_pointer, "FT /colour=2\n");
fprintf(block_file_pointer, "FT /taxa=\"%s\"\n",taxon_names);
fprintf(block_file_pointer, "FT /SNP_count=%d\n",number_of_snps);
fflush(block_file_pointer);
}


void print_branch_snp_details(FILE * branch_snps_file_pointer, char * current_node_id, char * parent_node_id, int * branches_snp_sites, int number_of_branch_snps, char * branch_snp_sequence, char * branch_snp_ancestor_sequence,char * taxon_names)
{
int i = 0;
for(i=0; i< number_of_branch_snps; i++)
{
fprintf(branch_snps_file_pointer, "FT variation %d\n", branches_snp_sites[i]+1);
fprintf(branch_snps_file_pointer, "FT /node=\"%s->%s\"\n",parent_node_id,current_node_id);
fprintf(branch_snps_file_pointer, "FT /colour=4\n");
fprintf(branch_snps_file_pointer, "FT /taxa=\"%s\"\n",taxon_names);
fprintf(branch_snps_file_pointer, "FT /parent_base=\"%c\"\n",branch_snp_sequence[i]);
fprintf(branch_snps_file_pointer, "FT /replace=\"%c\"\n",branch_snp_ancestor_sequence[i]);
fflush(branch_snps_file_pointer);
}
}
4 changes: 2 additions & 2 deletions src/block_tab_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@
#ifndef _BLOCK_TAB_FILE_H_
#define _BLOCK_TAB_FILE_H_

void print_block_details(FILE * block_file_pointer, int start_coordinate, int end_coordinate, int number_of_snps, int current_node_id, int parent_node_id, char * taxon_names);

void print_block_details(FILE * block_file_pointer, int start_coordinate, int end_coordinate, int number_of_snps, char * current_node_id, char * parent_node_id, char * taxon_names);
void print_branch_snp_details(FILE * branch_snps_file_pointer, char * current_node_id, char * parent_node_id, int * branches_snp_sites, int number_of_branch_snps, char * branch_snp_sequence, char * branch_snp_ancestor_sequence,char * taxon_names);
#endif
23 changes: 14 additions & 9 deletions src/branch_sequences.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat
int sequence_index;
sequence_index = find_sequence_index_from_sample_name(root->taxon);

set_number_of_recombinations_for_sample(root->taxon,num_current_recombinations);
set_number_of_snps_for_sample(root->taxon,(current_total_snps + root->number_of_snps));
set_number_of_blocks_for_sample(root->taxon,(num_blocks + root->number_of_blocks));
set_number_of_recombinations_for_sample(root->taxon,root->num_recombinations);
set_number_of_snps_for_sample(root->taxon,root->number_of_snps);
set_number_of_blocks_for_sample(root->taxon, root->number_of_blocks);

int ** merged_block_coordinates;
merged_block_coordinates = (int **) malloc(3*sizeof(int *));
Expand All @@ -102,7 +102,7 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat
char * child_sequence = (char *) malloc((length_of_original_genome +1)*sizeof(char));
get_sequence_for_sample_name(child_sequence, root->taxon);

set_number_of_bases_in_recombinations(root->taxon, calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, (num_blocks + root->number_of_blocks), child_sequence, snp_locations,current_total_snps));
set_number_of_bases_in_recombinations(root->taxon, calculate_number_of_bases_in_recombations_excluding_gaps(root->block_coordinates, root->number_of_blocks, child_sequence, snp_locations,current_total_snps));

for(i = 0; i < num_current_recombinations; i++)
{
Expand Down Expand Up @@ -219,7 +219,7 @@ void carry_unambiguous_gaps_up_tree(newick_node *root)
}
}

char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps)
char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps,FILE * branch_snps_file_pointer)
{
newick_child *child;
int child_counter = 0;
Expand Down Expand Up @@ -258,7 +258,7 @@ char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int *
while (child != NULL)
{
// recursion
child_sequences[child_counter] = generate_branch_sequences(child->node, vcf_file_pointer, snp_locations, number_of_snps, column_names, number_of_columns, child_sequences[child_counter],length_of_original_genome, block_file_pointer,gff_file_pointer,min_snps);
child_sequences[child_counter] = generate_branch_sequences(child->node, vcf_file_pointer, snp_locations, number_of_snps, column_names, number_of_columns, child_sequences[child_counter],length_of_original_genome, block_file_pointer,gff_file_pointer,min_snps,branch_snps_file_pointer);
child_nodes[child_counter] = child->node;
strcat(root->taxon_names, " ");
strcat(root->taxon_names, child_nodes[child_counter]->taxon_names);
Expand All @@ -283,10 +283,15 @@ char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int *
{
branches_snp_sites[current_branch] = (int *) malloc((number_of_snps +1)*sizeof(int));
char * branch_snp_sequence;
char * branch_snp_ancestor_sequence;
branch_snp_sequence = (char *) malloc((number_of_snps +1)*sizeof(char));
branch_snp_ancestor_sequence = (char *) malloc((number_of_snps +1)*sizeof(char));

branch_genome_size = calculate_size_of_genome_without_gaps(child_sequences[current_branch], 0,number_of_snps, length_of_original_genome);
number_of_branch_snps = calculate_number_of_snps_excluding_gaps(leaf_sequence, child_sequences[current_branch], number_of_snps, branches_snp_sites[current_branch], snp_locations,branch_snp_sequence);
number_of_branch_snps = calculate_number_of_snps_excluding_gaps(leaf_sequence, child_sequences[current_branch], number_of_snps, branches_snp_sites[current_branch], snp_locations,branch_snp_sequence,branch_snp_ancestor_sequence);

print_branch_snp_details(branch_snps_file_pointer, child_nodes[current_branch]->taxon,root->taxon, branches_snp_sites[current_branch], number_of_branch_snps, branch_snp_sequence, branch_snp_ancestor_sequence,root->taxon_names);

get_likelihood_for_windows(child_sequences[current_branch], number_of_snps, branches_snp_sites[current_branch], branch_genome_size, number_of_branch_snps,snp_locations, child_nodes[current_branch], block_file_pointer, root, branch_snp_sequence,gff_file_pointer,min_snps);
}

Expand Down Expand Up @@ -525,8 +530,8 @@ int flag_smallest_log_likelihood_recombinations(int ** candidate_blocks, int num
//current_node->recombinations = realloc(current_node->recombinations, number_of_recombinations*sizeof(int));
current_node->num_recombinations = number_of_recombinations;

print_block_details(block_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index], number_of_recombinations_in_window, current_node->current_node_id, root->current_node_id, current_node->taxon_names);
print_gff_line(gff_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index], number_of_recombinations_in_window, current_node->current_node_id, root->current_node_id, current_node->taxon_names);
print_block_details(block_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index], number_of_recombinations_in_window, current_node->taxon, root->taxon, current_node->taxon_names);
print_gff_line(gff_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index], number_of_recombinations_in_window, current_node->taxon, root->taxon, current_node->taxon_names);
current_node->number_of_blocks = current_node->number_of_blocks + 1;

current_node->block_coordinates[0] = realloc((int *)current_node->block_coordinates[0], ((int)current_node->number_of_blocks +1)*sizeof(int));
Expand Down
2 changes: 1 addition & 1 deletion src/branch_sequences.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#define _BRANCH_SEQUENCES_H_
#include "seqUtil.h"
#include "Newickform.h"
char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps);
char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps, FILE * branch_snps_file_pointer);
void identify_recombinations(int number_of_branch_snps, int * branches_snp_sites,int length_of_original_genome);
double calculate_snp_density(int * branches_snp_sites, int number_of_branch_snps, int index);
void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, int * snp_site_coords, int branch_genome_size, int number_of_branch_snps, int * snp_locations, newick_node * current_node, FILE * block_file_pointer, newick_node *root, char * branch_snp_sequence, FILE * gff_file_pointer,int min_snps);
Expand Down
4 changes: 3 additions & 1 deletion src/snp_searching.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ int calculate_size_of_genome_without_gaps(char * child_sequence, int start_index
return length_of_original_genome;
}

int calculate_number_of_snps_excluding_gaps(char * ancestor_sequence, char * child_sequence, int child_sequence_size, int * branch_snp_coords, int * snp_locations,char * branch_snp_sequence)
int calculate_number_of_snps_excluding_gaps(char * ancestor_sequence, char * child_sequence, int child_sequence_size, int * branch_snp_coords, int * snp_locations,char * branch_snp_sequence, char * branch_snp_ancestor_sequence)
{
int i ;
int number_of_branch_snp_sites = 0;
Expand All @@ -208,11 +208,13 @@ int calculate_number_of_snps_excluding_gaps(char * ancestor_sequence, char * chi
{
branch_snp_coords[number_of_branch_snp_sites] = snp_locations[i];
branch_snp_sequence[number_of_branch_snp_sites] = child_sequence[i];
branch_snp_ancestor_sequence[number_of_branch_snp_sites] = ancestor_sequence[i];
number_of_branch_snp_sites++;
}
}
branch_snp_coords = realloc(branch_snp_coords, (number_of_branch_snp_sites+1)*sizeof(int));
branch_snp_sequence[number_of_branch_snp_sites] = '\0';
branch_snp_ancestor_sequence[number_of_branch_snp_sites] = '\0';
return number_of_branch_snp_sites;
}

Expand Down
2 changes: 1 addition & 1 deletion src/snp_searching.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int get_window_end_coordinates_excluding_gaps(int window_start_coordinate, int w
int find_number_of_snps_in_block(int window_start_coordinate, int window_end_coordinate, int * snp_locations, char * child_sequence, int number_of_snps);
int calculate_block_size_without_gaps(char * child_sequence, int * snp_locations, int starting_coordinate, int ending_coordinate, int length_of_original_genome);
int calculate_size_of_genome_without_gaps(char * child_sequence, int start_index, int length_of_sequence, int length_of_original_genome);
int calculate_number_of_snps_excluding_gaps(char * ancestor_sequence, char * child_sequence, int child_sequence_size, int * branch_snp_coords, int * snp_locations, char * branch_snp_sequence);
int calculate_number_of_snps_excluding_gaps(char * ancestor_sequence, char * child_sequence, int child_sequence_size, int * branch_snp_coords, int * snp_locations, char * branch_snp_sequence, char * branch_snp_ancestor_sequence);
int flag_recombinations_in_window(int window_start_coordinate, int window_end_coordinate, int number_of_snps, int * branch_snp_sites, int * recombinations, int number_of_recombinations,int * snp_locations, int total_num_snps);
int find_matching_coordinate_index(int window_start_coordinate, int * snp_sites, int number_of_snps, int starting_index);
#endif
Loading

0 comments on commit a64ba0f

Please sign in to comment.