Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extend blocks over gaps #21

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 86 additions & 3 deletions src/branch_sequences.c
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, i
}

// block_coordinates will now contain merged blocks
number_of_blocks = merge_adjacent_blocks(block_coordinates, number_of_blocks);
number_of_blocks = merge_adjacent_blocks(block_coordinates, number_of_blocks,branch_snp_sequence,number_of_branch_snps,snp_site_coords);
int * candidate_blocks[3];
candidate_blocks[0] = (int *) malloc((number_of_blocks+1)*sizeof(int));
candidate_blocks[1] = (int *) malloc((number_of_blocks+1)*sizeof(int));
Expand Down Expand Up @@ -417,9 +417,9 @@ int get_smallest_log_likelihood(int ** candidate_blocks, int number_of_candidate


// merge blocks which are beside each other into large blocks and return the number of blocks
int merge_adjacent_blocks(int ** block_coordinates, int number_of_blocks)
int merge_adjacent_blocks(int ** block_coordinates, int number_of_blocks, char * branch_snp_sequence, int number_of_bases, int * snp_site_coords)
{
int i;
int i = 1;
int merged_block_coordinates[2][number_of_blocks];
int current_merged_block = 0;

Expand All @@ -428,6 +428,12 @@ int merge_adjacent_blocks(int ** block_coordinates, int number_of_blocks)
return number_of_blocks;
}

for(i=0; i < number_of_blocks; i++)
{
block_coordinates[0][i] = extend_end_of_block_left_over_gap( block_coordinates[0][i], branch_snp_sequence, number_of_bases, snp_site_coords);
block_coordinates[1][i] = extend_end_of_block_right_over_gap(block_coordinates[1][i], branch_snp_sequence, number_of_bases, snp_site_coords);
}

merged_block_coordinates[0][current_merged_block] = block_coordinates[0][current_merged_block];
merged_block_coordinates[1][current_merged_block] = block_coordinates[1][current_merged_block];

Expand Down Expand Up @@ -466,6 +472,83 @@ int merge_adjacent_blocks(int ** block_coordinates, int number_of_blocks)
}


int extend_end_of_block_left_over_gap(int block_coord, char * branch_snp_sequence, int number_of_bases, int * snp_site_coords)
{
int index = 0;
int last_snp_index = 0;
index = find_starting_index( block_coord, snp_site_coords, 0, number_of_bases);
last_snp_index = index;

if(index >= number_of_bases || snp_site_coords[index] != block_coord)
{
return block_coord;
}

while(index > 0 )
{
if((snp_site_coords[index] -1) == snp_site_coords[index-1] )
{
if(branch_snp_sequence[index-1] != '-')
{
last_snp_index = index-1;
}
}
else
{
break;
}
index--;
}

if(index ==0 || last_snp_index == 0 )
{
return block_coord;
}
else
{
return snp_site_coords[last_snp_index];
}
}

int extend_end_of_block_right_over_gap(int block_coord, char * branch_snp_sequence, int number_of_bases, int * snp_site_coords)
{
int index = 0;
int last_snp_index = 0;
index = find_starting_index( block_coord, snp_site_coords, 0, number_of_bases);
last_snp_index = index;

if(index+1 >= number_of_bases || snp_site_coords[index] != block_coord)
{
return block_coord;
}

while(index+1 < number_of_bases )
{
if((snp_site_coords[index] +1) == snp_site_coords[index+1] )
{
if(branch_snp_sequence[index+1] != '-')
{
last_snp_index = index+1;
}
}
else
{
break;
}
index++;
}

if(index ==0 || last_snp_index == 0 )
{
return block_coord;
}
else
{
return snp_site_coords[last_snp_index];
}
}


double snp_density(int length_of_sequence, int number_of_snps)
{
return number_of_snps*1.0/length_of_sequence;
Expand Down
5 changes: 3 additions & 2 deletions src/branch_sequences.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@ void fill_in_recombinations_with_reference_bases(newick_node *root, int * parent
int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array);
double snp_density(int length_of_sequence, int number_of_snps);
int calculate_cutoff(int branch_genome_size, int window_size, int num_branch_snps);
int merge_adjacent_blocks(int ** block_coordinates, int number_of_blocks);
int merge_adjacent_blocks(int ** block_coordinates, int number_of_blocks, char * branch_snp_sequence, int number_of_bases, int * snp_site_coords);
int get_smallest_log_likelihood(int ** candidate_blocks, int number_of_candidate_blocks);
int exclude_snp_sites_in_block(int window_start_coordinate, int window_end_coordinate, int * snp_site_coords, int number_of_branch_snps);
int flag_smallest_log_likelihood_recombinations(int ** candidate_blocks, int number_of_candidate_blocks, int number_of_branch_snps, int * snp_site_coords, int * recombinations, int number_of_recombinations,newick_node * current_node, FILE * block_file_pointer, newick_node *root,int * snp_locations, int total_num_snps, FILE * gff_file_pointer);

int extend_end_of_block_left_over_gap(int block_coord, char * branch_snp_sequence, int number_of_bases, int * snp_site_coords);
int extend_end_of_block_right_over_gap(int block_coord, char * branch_snp_sequence, int number_of_bases, int * snp_site_coords);

#define MIN_SNPS_FOR_IDENTIFYING_RECOMBINATIONS 3
#define DEFAULT_SNP_DENSITY 0.000001
Expand Down
173 changes: 169 additions & 4 deletions tests/check_branch_sequences.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ START_TEST (check_merge_adjacent_blocks_not_adjacent)
blocks_not_adjacent[0][1] = 1000;
blocks_not_adjacent[1][1] = 1200;

fail_unless(merge_adjacent_blocks(blocks_not_adjacent,2) == 2);
char *branch_sequence = "A";
int snp_site_coords[9] = {10};
int number_of_bases = 1;

fail_unless(merge_adjacent_blocks(blocks_not_adjacent,2, branch_sequence,number_of_bases,snp_site_coords) == 2);
fail_unless(blocks_not_adjacent[0][0] == 10 );
fail_unless(blocks_not_adjacent[1][0] == 20 );
fail_unless(blocks_not_adjacent[0][1] == 1000);
Expand All @@ -38,7 +42,11 @@ START_TEST (check_merge_adjacent_blocks_beside_each_other)
blocks_beside_each_other[0][1] = 20;
blocks_beside_each_other[1][1] = 30;

fail_unless(merge_adjacent_blocks(blocks_beside_each_other,2) == 1);
char *branch_sequence = "A";
int snp_site_coords[9] = {10};
int number_of_bases = 1;

fail_unless(merge_adjacent_blocks(blocks_beside_each_other,2, branch_sequence,number_of_bases,snp_site_coords) == 1);
fail_unless(blocks_beside_each_other[0][0] == 10 );
fail_unless(blocks_beside_each_other[1][0] == 30 );
fail_unless(blocks_beside_each_other[0][1] == 0);
Expand All @@ -56,7 +64,11 @@ START_TEST (check_merge_adjacent_blocks_near_each_other)
blocks_near_each_other[0][1] = 21;
blocks_near_each_other[1][1] = 30;

fail_unless(merge_adjacent_blocks(blocks_near_each_other,2) == 1);
char *branch_sequence = "A";
int snp_site_coords[9] = {10};
int number_of_bases = 1;

fail_unless(merge_adjacent_blocks(blocks_near_each_other,2, branch_sequence,number_of_bases,snp_site_coords) == 1);
fail_unless(blocks_near_each_other[0][0] == 10 );
fail_unless(blocks_near_each_other[1][0] == 30 );
fail_unless(blocks_near_each_other[0][1] == 0);
Expand All @@ -74,7 +86,11 @@ START_TEST (check_merge_adjacent_blocks_overlapping)
blocks_overlapping[0][1] = 19;
blocks_overlapping[1][1] = 30;

fail_unless(merge_adjacent_blocks(blocks_overlapping,2) == 1);
char *branch_sequence = "A";
int snp_site_coords[9] = {10};
int number_of_bases = 1;

fail_unless(merge_adjacent_blocks(blocks_overlapping,2, branch_sequence,number_of_bases,snp_site_coords) == 1);
fail_unless(blocks_overlapping[0][0] == 10 );
fail_unless(blocks_overlapping[1][0] == 30 );
fail_unless(blocks_overlapping[0][1] == 0);
Expand All @@ -83,6 +99,146 @@ START_TEST (check_merge_adjacent_blocks_overlapping)
}
END_TEST


START_TEST (check_merge_block_straddling_gap)
{
int * blocks_overlapping[2];
blocks_overlapping[0] = (int *) malloc((3)*sizeof(int));
blocks_overlapping[1] = (int *) malloc((3)*sizeof(int));
blocks_overlapping[0][0] = 10;
blocks_overlapping[1][0] = 40;
blocks_overlapping[0][1] = 44;
blocks_overlapping[1][1] = 70;

char *branch_sequence = "AAA---CCC";
int snp_site_coords[9] = {10,30,40,41,42,43,44,60,70};
int number_of_bases = 9;

fail_unless(merge_adjacent_blocks(blocks_overlapping,2, branch_sequence,number_of_bases,snp_site_coords) == 1);

fail_unless(blocks_overlapping[0][0] == 10);
fail_unless(blocks_overlapping[1][0] == 70);
fail_unless(blocks_overlapping[0][1] == 0);
fail_unless(blocks_overlapping[1][1] == 0);

}
END_TEST


START_TEST (check_extend_end_of_block_right_over_gap)
{
char *branch_sequence = "AA---CC";
int snp_site_coords[7] = {30,40,41,42,43,44,60};
int number_of_bases = 7;

// Dont extend if there is no gap
fail_unless(extend_end_of_block_right_over_gap(30,branch_sequence,number_of_bases,snp_site_coords ) == 30);
// Dont extend if you cant get the sequence
fail_unless(extend_end_of_block_right_over_gap(31,branch_sequence,number_of_bases,snp_site_coords ) == 31);
fail_unless(extend_end_of_block_right_over_gap(44,branch_sequence,number_of_bases,snp_site_coords ) == 44);
fail_unless(extend_end_of_block_right_over_gap(999,branch_sequence,number_of_bases,snp_site_coords ) == 999);

// Extend block coords
fail_unless(extend_end_of_block_right_over_gap(40,branch_sequence,number_of_bases,snp_site_coords ) == 44);
fail_unless(extend_end_of_block_right_over_gap(41,branch_sequence,number_of_bases,snp_site_coords ) == 44);
}
END_TEST

START_TEST (check_dont_extend_right_if_gap_non_contiguous)
{
char *branch_sequence = "AA---CC";
int snp_site_coords[7] = {30,40,41,42,43,50,60};
int number_of_bases = 7;

// dont extend block over gap if theres no contiguous snp at the end
fail_unless(extend_end_of_block_right_over_gap(40,branch_sequence,number_of_bases,snp_site_coords ) == 40);
fail_unless(extend_end_of_block_right_over_gap(43,branch_sequence,number_of_bases,snp_site_coords ) == 43);
}
END_TEST

START_TEST (check_extend_right_over_multiple_gaps)
{
char *branch_sequence = "AA-T-CC";
int snp_site_coords[7] = {30,40,41,42,43,44,60};
int number_of_bases = 7;

// dont extend block over gap if theres no contiguous snp at the end
fail_unless(extend_end_of_block_right_over_gap(40,branch_sequence,number_of_bases,snp_site_coords ) == 44);
fail_unless(extend_end_of_block_right_over_gap(41,branch_sequence,number_of_bases,snp_site_coords ) == 44);
}
END_TEST


START_TEST (check_extend_right_over_multiple_gaps_stopping_at_last_snp)
{
char *branch_sequence = "AA-T-CC";
int snp_site_coords[7] = {30,40,41,42,43,50,60};
int number_of_bases = 7;

// dont extend block over gap if theres no contiguous snp at the end
fail_unless(extend_end_of_block_right_over_gap(40,branch_sequence,number_of_bases,snp_site_coords ) == 42);
fail_unless(extend_end_of_block_right_over_gap(41,branch_sequence,number_of_bases,snp_site_coords ) == 42);
}
END_TEST



START_TEST (check_extend_end_of_block_left_over_gap)
{
char *branch_sequence = "AA---CC";
int snp_site_coords[7] = {30,40,41,42,43,44,60};
int number_of_bases = 7;

// Dont extend if there is no gap
fail_unless(extend_end_of_block_left_over_gap(60,branch_sequence,number_of_bases,snp_site_coords ) == 60);
// Dont extend if you cant get the sequence
fail_unless(extend_end_of_block_left_over_gap(59,branch_sequence,number_of_bases,snp_site_coords ) == 59);
fail_unless(extend_end_of_block_left_over_gap(40,branch_sequence,number_of_bases,snp_site_coords ) == 40);
fail_unless(extend_end_of_block_left_over_gap(999,branch_sequence,number_of_bases,snp_site_coords ) == 999);

// Extend block coords
fail_unless(extend_end_of_block_left_over_gap(44,branch_sequence,number_of_bases,snp_site_coords ) == 40);
fail_unless(extend_end_of_block_left_over_gap(41,branch_sequence,number_of_bases,snp_site_coords ) == 40);
}
END_TEST

START_TEST (check_dont_extend_left_if_gap_non_contiguous)
{
char *branch_sequence = "AA---CC";
int snp_site_coords[7] = {30,31,41,42,43,50,60};
int number_of_bases = 7;

// dont extend block over gap if theres no contiguous snp at the end
fail_unless(extend_end_of_block_left_over_gap(41,branch_sequence,number_of_bases,snp_site_coords ) == 41);
fail_unless(extend_end_of_block_left_over_gap(43,branch_sequence,number_of_bases,snp_site_coords ) == 43);
}
END_TEST

START_TEST (check_extend_left_over_multiple_gaps)
{
char *branch_sequence = "AA-T-CC";
int snp_site_coords[7] = {30,40,41,42,43,44,60};
int number_of_bases = 7;

fail_unless(extend_end_of_block_left_over_gap(44,branch_sequence,number_of_bases,snp_site_coords ) == 40);
fail_unless(extend_end_of_block_left_over_gap(41,branch_sequence,number_of_bases,snp_site_coords ) == 40);
}
END_TEST


START_TEST (check_extend_left_over_multiple_gaps_stopping_at_last_snp)
{
char *branch_sequence = "AA-T-CC";
int snp_site_coords[7] = {30,31,41,42,43,44,60};
int number_of_bases = 7;

fail_unless(extend_end_of_block_left_over_gap(44,branch_sequence,number_of_bases,snp_site_coords ) == 42);
fail_unless(extend_end_of_block_left_over_gap(43,branch_sequence,number_of_bases,snp_site_coords ) == 42);
}
END_TEST



START_TEST (check_exclude_snp_sites_in_block)
{
int number_of_branch_snps = 8;
Expand Down Expand Up @@ -114,6 +270,15 @@ Suite * check_branch_sequences_suite (void)
tcase_add_test (tc_branch_sequences, check_merge_adjacent_blocks_near_each_other);
tcase_add_test (tc_branch_sequences, check_merge_adjacent_blocks_overlapping);
tcase_add_test (tc_branch_sequences, check_exclude_snp_sites_in_block);
tcase_add_test (tc_branch_sequences, check_merge_block_straddling_gap);
tcase_add_test (tc_branch_sequences, check_extend_end_of_block_right_over_gap);
tcase_add_test (tc_branch_sequences, check_dont_extend_right_if_gap_non_contiguous);
tcase_add_test (tc_branch_sequences, check_extend_right_over_multiple_gaps);
tcase_add_test (tc_branch_sequences, check_extend_right_over_multiple_gaps_stopping_at_last_snp);
tcase_add_test (tc_branch_sequences, check_extend_end_of_block_left_over_gap);
tcase_add_test (tc_branch_sequences, check_dont_extend_left_if_gap_non_contiguous);
tcase_add_test (tc_branch_sequences, check_extend_left_over_multiple_gaps);
tcase_add_test (tc_branch_sequences, check_extend_left_over_multiple_gaps_stopping_at_last_snp);
suite_add_tcase (s, tc_branch_sequences);

return s;
Expand Down
2 changes: 1 addition & 1 deletion tests/data/one_recombination.expected.stats
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Sample name Num recombinations Num mutations r/m Genome Length
sequence_1 0 135 0.000000 135
sequence_2 95 129 0.736434 129
sequence_2 103 129 0.798450 129
sequence_3 0 129 0.000000 129
sequence_4 0 129 0.000000 129
sequence_5 0 129 0.000000 129
Expand Down
8 changes: 0 additions & 8 deletions tests/data/one_recombination.expected.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,4 @@
1 60 . A C . . AB . . . C . . . . . . .
1 61 . C A . . AB . . A A A A A A A A A
1 62 . C A . . AB . . A A A A A A A A A
1 160 . A C . . AB . . C . . . . . . . .
1 161 . A C . . AB . . C . . . . . . . .
1 162 . A C . . AB . . C . . . . . . . .
1 163 . A C . . AB . . C . . . . . . . .
1 164 . A C . . AB . . C . . . . . . . .
1 165 . A C . . AB . . C . . . . . . . .
1 166 . A C . . AB . . C . . . . . . . .
1 167 . A C . . AB . . C . . . . . . . .
1 169 . A C . . AB . . . . . . C . . . .