Skip to content
This repository has been archived by the owner on Jan 11, 2022. It is now read-only.

add e-value threshold to require internal alignments have e-value < 1 #309

Merged
merged 6 commits into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,9 @@ Changes to X or Y force recomputation of all results when a sample is rerun usin

When releasing a new version, please add a Git tag of the form `vX.Y.Z`.

- 4.10.0
- Add e-value threshold to require all internal alignments (short read and assembly-based) have e-value below threshold.

- 4.9.0
- Update NCBI index databases to those downloaded on 2020-04-20.

Expand Down
2 changes: 1 addition & 1 deletion idseq_dag/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
''' idseq_dag '''
__version__ = "4.9.0"
__version__ = "4.10.0"
28 changes: 17 additions & 11 deletions idseq_dag/steps/blast_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from idseq_dag.steps.run_assembly import PipelineStepRunAssembly
from idseq_dag.util.count import READ_COUNTING_MODE, ReadCountingMode, get_read_cluster_size, load_cdhit_cluster_sizes
from idseq_dag.util.lineage import DEFAULT_BLACKLIST_S3, DEFAULT_WHITELIST_S3
from idseq_dag.util.m8 import MIN_CONTIG_SIZE, NT_MIN_ALIGNMENT_LEN
from idseq_dag.util.m8 import MIN_CONTIG_SIZE, NT_MIN_ALIGNMENT_LEN, MAX_EVALUE_THRESHOLD


MIN_REF_FASTA_SIZE = 25
Expand Down Expand Up @@ -406,6 +406,7 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta,
blast_command = 'blastn'
min_alignment_length = NT_MIN_ALIGNMENT_LEN
min_pident = NT_MIN_PIDENT
max_evalue = MAX_EVALUE_THRESHOLD
command.execute(
command_patterns.SingleCommand(
cmd="makeblastdb",
Expand Down Expand Up @@ -447,12 +448,13 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta,
)
)
# further processing of getting the top m8 entry for each contig.
PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8, min_alignment_length, min_pident)
PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8, min_alignment_length, min_pident, max_evalue)

@staticmethod
def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8):
blast_type = 'prot'
blast_command = 'blastx'
max_evalue = MAX_EVALUE_THRESHOLD
command.execute(
command_patterns.SingleCommand(
cmd="makeblastdb",
Expand Down Expand Up @@ -486,21 +488,23 @@ def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta,
)
)
# further processing of getting the top m8 entry for each contig.
PipelineStepBlastContigs.get_top_m8_nr(blast_m8, blast_top_m8)
PipelineStepBlastContigs.get_top_m8_nr(blast_m8, blast_top_m8, max_evalue)

@staticmethod
def get_top_m8_nr(blast_output, blast_top_m8):
def get_top_m8_nr(blast_output, blast_top_m8, max_evalue):
''' Get top m8 file entry for each contig from blast_output and output to blast_top_m8 '''
m8.unparse_tsv(blast_top_m8, m8.BLAST_OUTPUT_SCHEMA,
PipelineStepBlastContigs.optimal_hit_for_each_query_nr(blast_output))
PipelineStepBlastContigs.optimal_hit_for_each_query_nr(blast_output, max_evalue))

@staticmethod
def optimal_hit_for_each_query_nr(blast_output):
def optimal_hit_for_each_query_nr(blast_output, max_evalue):
contigs_to_best_alignments = defaultdict(list)
accession_counts = defaultdict(lambda: 0)

# For each contig, get the alignments that have the best total score (may be multiple if there are ties).
for alignment in m8.parse_tsv(blast_output, m8.BLAST_OUTPUT_SCHEMA):
if alignment["evalue"] > max_evalue:
continue
query = alignment["qseqid"]
best_alignments = contigs_to_best_alignments[query]

Expand All @@ -526,7 +530,7 @@ def optimal_hit_for_each_query_nr(blast_output):
yield optimal_alignment

@staticmethod
def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident):
def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue):
# Filter and group results by query, yielding one result group at a time.
# A result group consists of all hits for a query, grouped by subject.
# Please see comment in get_top_m8_nt for more context.
Expand All @@ -540,6 +544,8 @@ def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_piden
continue
if hsp["pident"] < min_pident:
continue
if hsp["evalue"] > max_evalue:
continue
query, subject = hsp["qseqid"], hsp["sseqid"]
if query != current_query:
assert query not in previously_seen_queries, "blastn output appears out of order, please resort by (qseqid, sseqid, score)"
Expand All @@ -554,12 +560,12 @@ def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_piden

@staticmethod
# An iterator that, for contig, yields to optimal hit for the contig in the nt blast_output.
def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident):
def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue):
contigs_to_blast_candidates = {}
accession_counts = defaultdict(lambda: 0)

# For each contig, get the collection of blast candidates that have the best total score (may be multiple if there are ties).
for query_hits in PipelineStepBlastContigs.filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident):
for query_hits in PipelineStepBlastContigs.filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue):
best_hits = []
for _subject, hit_fragments in query_hits.items():
# For NT, we take a special approach where we try to find a subset of disjoint HSPs
Expand Down Expand Up @@ -591,7 +597,7 @@ def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident
yield optimal_hit.summary()

@staticmethod
def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident):
def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident, max_evalue):
'''
For each contig Q (query) and reference S (subject), extend the highest-scoring
fragment alignment of Q to S with other *non-overlapping* fragments as far as
Expand All @@ -613,4 +619,4 @@ def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident):

# Output the optimal hit for each query.
m8.unparse_tsv(blast_top_m8, m8.RERANKED_BLAST_OUTPUT_NT_SCHEMA,
PipelineStepBlastContigs.optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident))
PipelineStepBlastContigs.optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue))
12 changes: 12 additions & 0 deletions idseq_dag/util/m8.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
# Nevertheless, it may be useful to re-filter blastx results.
NT_MIN_ALIGNMENT_LEN = 36

# Alignments with e-values greater than 1 are low-quality alignments and associated with
# a high rate of false-positives. These should be filtered at all alignment steps.
MAX_EVALUE_THRESHOLD = 1

# blastn output format 6 as documented in
# http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# it's also the format of our GSNAP and RAPSEARCH2 output
Expand Down Expand Up @@ -192,6 +196,14 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter
if alignment_length < min_alignment_length:
continue

# *** E-value Filter ***
# Alignments with e-value > 1 are low-quality and associated with false-positives in
# all alignments steps (NT and NR). When the e-value is greater than 1, ignore the
# alignment
###
if e_value > MAX_EVALUE_THRESHOLD:
continue

if debug_caller and line_count % logging_interval == 0:
msg = "Scanned {} m8 lines from {} for {}, and going.".format(
line_count, m8_file, debug_caller)
Expand Down