improvements to priorities
emmahodcroft committed Apr 13, 2020
1 parent 55fb877 commit 07b4c77
Showing 2 changed files with 15 additions and 10 deletions.
4 changes: 2 additions & 2 deletions Snakefile_Priorities
@@ -126,10 +126,10 @@ rule export_priorities:
colors = rules.colors.output.colors,
lat_longs = files.lat_longs,
description = files.description,
clades = "results/clades.json",
clades = "results/clades{region}.json",
recency = rules.recency.output
output:
auspice_json = "auspice/ncov-with-priorities.json"
auspice_json = "auspice/ncov{region}-with-priorities.json"
shell:
"""
augur export v2 \
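
To see what the new {region} wildcard buys, here is a minimal sketch of the naming scheme it produces. Snakemake fills the wildcard into input and output paths much like str.format would; the suffix values ("" for a global build, "_europe" for a regional one) are illustrative assumptions, not part of this diff.

# Sketch of how the {region} wildcard expands in the new paths.
# The suffixes here are hypothetical examples, not taken from the workflow.
for region in ["", "_europe"]:
    clades = "results/clades{region}.json".format(region=region)
    auspice_json = "auspice/ncov{region}-with-priorities.json".format(region=region)
    print(clades, auspice_json)
# results/clades.json auspice/ncov-with-priorities.json
# results/clades_europe.json auspice/ncov_europe-with-priorities.json
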
21 changes: 13 additions & 8 deletions scripts/priorities.py
@@ -122,6 +122,7 @@ def calculate_distance_matrix(sparse_matrix_A, sparse_matrix_B, consensus):
# load entire alignment and the alignment of focal sequences (upper case -- probably not necessary)
context_seqs_dict = calculate_snp_matrix(args.alignment)
focal_seqs_dict = calculate_snp_matrix(args.focal_alignment, consensus = context_seqs_dict['consensus'])
+alignment_length = len(context_seqs_dict['consensus'])
print("Done reading the alignments.")

# read in the focal alignment to count the number of N and gap characters
@@ -130,22 +131,26 @@ def calculate_distance_matrix(sparse_matrix_A, sparse_matrix_B, consensus):

#focal_seqs2 = {x.id:x.upper() for x in AlignIO.read(args.focal_alignment, format='fasta')}
#focal_seqs_array2 = np.array([list(str(focal_seqs[x].seq)) for x in focal_seqs])
-mask_count = np.sum(np.logical_or(focal_seqs_array=='N',focal_seqs_array=='-'), axis=1)
+mask_count_focal = np.sum(np.logical_or(focal_seqs_array=='N',focal_seqs_array=='-'), axis=1)
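
The renamed mask_count_focal counts ambiguous and gap sites per focal sequence. A toy example of the same NumPy idiom, with a made-up three-sequence alignment:

import numpy as np

# Rows are sequences, columns are alignment positions (toy data).
focal_seqs_array = np.array([list("ACGTN"),
                             list("AC--N"),
                             list("ACGTA")])
# Count N/- per sequence, exactly as in the line above.
mask_count_focal = np.sum(np.logical_or(focal_seqs_array == 'N',
                                        focal_seqs_array == '-'), axis=1)
print(mask_count_focal)  # [1 3 0]
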

# remove focal sequences from all sequences to provide the context data set
keep = [(i, name) for i, name in enumerate(context_seqs_dict['names']) if name not in set(focal_seqs_dict['names'])]
context_seqs_dict['snps'] = context_seqs_dict['snps'].tocsr()[[i for i, name in keep],:].tocsc()
context_seqs_dict['names'] = [name for i, name in keep]
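
The filtering step keeps (index, name) pairs so the sparse SNP matrix and the name list stay in sync. A self-contained sketch with dummy data (a 4x4 identity matrix stands in for a real SNP matrix, and the names are hypothetical):

import numpy as np
from scipy.sparse import csr_matrix

names = ["seqA", "seqB", "seqC", "seqD"]
focal_names = {"seqB"}              # hypothetical focal set
snps = csr_matrix(np.eye(4))        # toy stand-in for the SNP matrix

# Drop focal rows while keeping names and matrix rows aligned.
keep = [(i, name) for i, name in enumerate(names) if name not in focal_names]
snps = snps.tocsr()[[i for i, name in keep], :].tocsc()
names = [name for i, name in keep]
print(names)       # ['seqA', 'seqC', 'seqD']
print(snps.shape)  # (3, 4)
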

-# for each context sequence, calculate minimal distance to focal set
-d = calculate_distance_matrix(context_seqs_dict['snps'], focal_seqs_dict['snps'], consensus = context_seqs_dict['consensus'])
-closest_match = np.argmin(d+mask_count, axis=1)[:,0]
+tmp_context_seqs = {x.id: x.upper() for x in AlignIO.read(args.alignment, format='fasta')}
+context_seqs_array = np.array([list(str(tmp_context_seqs[x[1]].seq)) for x in keep])
+mask_count_context = {k[1]:m for k,m in zip(keep, np.sum(np.logical_or(context_seqs_array=='N',context_seqs_array=='-'), axis=1))}

+# for each context sequence, calculate minimal distance to focal set, weighted by the number of N/- to pick the best sequence
+d = np.array(calculate_distance_matrix(context_seqs_dict['snps'], focal_seqs_dict['snps'], consensus = context_seqs_dict['consensus']))
+closest_match = np.argmin(d+mask_count_focal/alignment_length, axis=1)
+#closest_match = np.argmin(d, axis=1)[:,0]
print("Done finding closest matches.")

minimal_distance_to_focal_set = {}
-for context_index, focal_index in enumerate(np.nditer(closest_match)):
-    minimal_distance_to_focal_set[context_seqs_dict['names'][context_index]] = (int(d[context_index, focal_index]), int(focal_index))
+for context_index, focal_index in enumerate(closest_match):
+    minimal_distance_to_focal_set[context_seqs_dict['names'][context_index]] = (d[context_index, focal_index], focal_seqs_dict["names"][focal_index])
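
The reworked loop stores the closest focal sequence's name rather than its index, and keeps the distance as a float instead of truncating it to int. A sketch with placeholder names and toy distances:

import numpy as np

d = np.array([[2.0003, 3.0],
              [4.0,    1.0001]])
closest_match = np.array([0, 1])
context_names = ["ctxA", "ctxB"]  # hypothetical sequence names
focal_names = ["focX", "focY"]

minimal_distance_to_focal_set = {}
for ci, fi in enumerate(closest_match):
    minimal_distance_to_focal_set[context_names[ci]] = (float(d[ci, fi]), focal_names[fi])
for name, (dist, focal) in minimal_distance_to_focal_set.items():
    print(f"{name} -> {focal} at distance {dist:.4f}")
# ctxA -> focX at distance 2.0003
# ctxB -> focY at distance 1.0001
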

# for each focal sequence with close matches (using the index), we list all close contexts
close_matches = defaultdict(list)
@@ -154,7 +159,7 @@ def calculate_distance_matrix(sparse_matrix_A, sparse_matrix_B, consensus):

for f in close_matches:
    shuffle(close_matches[f])
-    close_matches[f].sort(key=lambda x:minimal_distance_to_focal_set[x][0])
+    close_matches[f].sort(key=lambda x: minimal_distance_to_focal_set[x][0] + mask_count_context[x]/alignment_length)
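
shuffle followed by Python's stable sort randomises the order only among exact ties, while the added mask_count_context term demotes heavily masked context sequences. A minimal sketch, with the penalised distances precomputed into a plain dict (names and values invented):

from random import shuffle

# Penalised distance per context sequence: c1 and c3 tie,
# c2 has fewer masked sites and therefore the smallest key.
penalised = {"c1": 2.0003, "c2": 2.0, "c3": 2.0003}
neighbours = ["c1", "c2", "c3"]

shuffle(neighbours)                          # randomise first...
neighbours.sort(key=lambda x: penalised[x])  # ...stable sort keeps ties shuffled
print(neighbours[0])                         # always 'c2'; c1/c3 order is random
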



@@ -166,5 +171,5 @@ def calculate_distance_matrix(sparse_matrix_A, sparse_matrix_B, consensus):
# penalize if many sequences are close to the same focal one by using the index of the shuffled list of neighbours
# currently each position in this list reduces priority by 0.1, i.e. 10 other sequences == one mutation
position = close_matches[minimal_distance_to_focal_set[seqid][1]].index(seqid)
-priority = -minimal_distance_to_focal_set[seqid][0] - 0.003* int((1*(context_seqs_dict['snps'][i,:]==110)).sum(1)[0]) - 0.4*position
+priority = -minimal_distance_to_focal_set[seqid][0] - 0.1*position
fh.write(f"{seqid}\t{priority:1.2f}\n")
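
Putting it together: the new priority is the negative distance to the closest focal match minus 0.1 per better-ranked neighbour of the same focal sequence; the old 0.003-per-N term is dropped, since masked sites now feed into the tie-breaking above. A worked example with invented numbers:

# Hypothetical values for one context sequence.
dist = 2.0       # distance to its closest focal match, in mutations
position = 4     # rank among that focal sequence's neighbours (0-based)

priority = -dist - 0.1 * position
print(f"{priority:1.2f}")  # -2.40
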
