From 8714d5f891e356916f5879e04d208e771465de07 Mon Sep 17 00:00:00 2001 From: Katrina Kalantar Date: Wed, 3 Jun 2020 15:39:12 -0700 Subject: [PATCH 1/6] add evalue_threshold parameter to m8 parsing function --- idseq_dag/util/m8.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py index 02b9223d0..1bba0d5d2 100644 --- a/idseq_dag/util/m8.py +++ b/idseq_dag/util/m8.py @@ -18,6 +18,10 @@ # Nevertheless, it may be useful to re-filter blastx results. NT_MIN_ALIGNMENT_LEN = 36 +# Alignments with e-values greater than 1 are low-quality alignments and associated with +# a high rate of false-positives. These should be filtered at all alignment steps. +EVALUE_THRESHOLD = 1 + # blastn output format 6 as documented in # http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 # it's also the format of our GSNAP and RAPSEARCH2 output @@ -192,6 +196,14 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter if alignment_length < min_alignment_length: continue + # *** E-value Filter *** + # Alignments with e-value > 1 are low-quality and associated with false-positives in + # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the + # alignment + ### + if e_value > EVALUE_THRESHOLD: + continue + if debug_caller and line_count % logging_interval == 0: msg = "Scanned {} m8 lines from {} for {}, and going.".format( line_count, m8_file, debug_caller) From f3cc029ddc87c2c850beb4d1e54392c40186a236 Mon Sep 17 00:00:00 2001 From: Katrina Kalantar Date: Fri, 5 Jun 2020 16:20:04 -0700 Subject: [PATCH 2/6] add evalue filters to blast results --- idseq_dag/steps/blast_contigs.py | 28 +++++++++++++++++----------- idseq_dag/util/m8.py | 4 ++-- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/idseq_dag/steps/blast_contigs.py b/idseq_dag/steps/blast_contigs.py index e086082d8..7d379e605 100644 --- a/idseq_dag/steps/blast_contigs.py +++ b/idseq_dag/steps/blast_contigs.py @@ -14,7 +14,7 @@ from idseq_dag.steps.run_assembly import PipelineStepRunAssembly from idseq_dag.util.count import READ_COUNTING_MODE, ReadCountingMode, get_read_cluster_size, load_cdhit_cluster_sizes from idseq_dag.util.lineage import DEFAULT_BLACKLIST_S3, DEFAULT_WHITELIST_S3 -from idseq_dag.util.m8 import MIN_CONTIG_SIZE, NT_MIN_ALIGNMENT_LEN +from idseq_dag.util.m8 import MIN_CONTIG_SIZE, NT_MIN_ALIGNMENT_LEN, MAX_EVALUE_THRESHOLD MIN_REF_FASTA_SIZE = 25 @@ -406,6 +406,7 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_command = 'blastn' min_alignment_length = NT_MIN_ALIGNMENT_LEN min_pident = NT_MIN_PIDENT + max_evalue = MAX_EVALUE_THRESHOLD command.execute( command_patterns.SingleCommand( cmd="makeblastdb", @@ -447,12 +448,13 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta, ) ) # further processing of getting the top m8 entry for each contig. - PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8, min_alignment_length, min_pident) + PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8, min_alignment_length, min_pident, max_evalue) @staticmethod def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8): blast_type = 'prot' blast_command = 'blastx' + max_evalue = MAX_EVALUE_THRESHOLD command.execute( command_patterns.SingleCommand( cmd="makeblastdb", @@ -486,21 +488,23 @@ def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta, ) ) # further processing of getting the top m8 entry for each contig. - PipelineStepBlastContigs.get_top_m8_nr(blast_m8, blast_top_m8) + PipelineStepBlastContigs.get_top_m8_nr(blast_m8, blast_top_m8, max_evalue) @staticmethod - def get_top_m8_nr(blast_output, blast_top_m8): + def get_top_m8_nr(blast_output, blast_top_m8, max_evalue): ''' Get top m8 file entry for each contig from blast_output and output to blast_top_m8 ''' m8.unparse_tsv(blast_top_m8, m8.BLAST_OUTPUT_SCHEMA, - PipelineStepBlastContigs.optimal_hit_for_each_query_nr(blast_output)) + PipelineStepBlastContigs.optimal_hit_for_each_query_nr(blast_output, max_evalue)) @staticmethod - def optimal_hit_for_each_query_nr(blast_output): + def optimal_hit_for_each_query_nr(blast_output, max_evalue): contigs_to_best_alignments = defaultdict(list) accession_counts = defaultdict(lambda: 0) # For each contig, get the alignments that have the best total score (may be multiple if there are ties). for alignment in m8.parse_tsv(blast_output, m8.BLAST_OUTPUT_SCHEMA): + if alignment["evalue"] > max_evalue: + continue query = alignment["qseqid"] best_alignments = contigs_to_best_alignments[query] @@ -526,7 +530,7 @@ def optimal_hit_for_each_query_nr(blast_output): yield optimal_alignment @staticmethod - def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident): + def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue): # Filter and group results by query, yielding one result group at a time. # A result group consists of all hits for a query, grouped by subject. # Please see comment in get_top_m8_nt for more context. @@ -540,6 +544,8 @@ def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_piden continue if hsp["pident"] < min_pident: continue + if hsp["evalue"] > max_evalue: + continue query, subject = hsp["qseqid"], hsp["sseqid"] if query != current_query: assert query not in previously_seen_queries, "blastn output appears out of order, please resort by (qseqid, sseqid, score)" @@ -554,12 +560,12 @@ def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_piden @staticmethod # An iterator that, for contig, yields to optimal hit for the contig in the nt blast_output. - def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident): + def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue): contigs_to_blast_candidates = {} accession_counts = defaultdict(lambda: 0) # For each contig, get the collection of blast candidates that have the best total score (may be multiple if there are ties). - for query_hits in PipelineStepBlastContigs.filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident): + for query_hits in PipelineStepBlastContigs.filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue): best_hits = [] for _subject, hit_fragments in query_hits.items(): # For NT, we take a special approach where we try to find a subset of disjoint HSPs @@ -591,7 +597,7 @@ def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident yield optimal_hit.summary() @staticmethod - def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident): + def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident, max_evalue): ''' For each contig Q (query) and reference S (subject), extend the highest-scoring fragment alignment of Q to S with other *non-overlapping* fragments as far as @@ -613,4 +619,4 @@ def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident): # Output the optimal hit for each query. m8.unparse_tsv(blast_top_m8, m8.RERANKED_BLAST_OUTPUT_NT_SCHEMA, - PipelineStepBlastContigs.optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident)) + PipelineStepBlastContigs.optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue)) diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py index 1bba0d5d2..c63923ec9 100644 --- a/idseq_dag/util/m8.py +++ b/idseq_dag/util/m8.py @@ -20,7 +20,7 @@ # Alignments with e-values greater than 1 are low-quality alignments and associated with # a high rate of false-positives. These should be filtered at all alignment steps. -EVALUE_THRESHOLD = 1 +MAX_EVALUE_THRESHOLD = 1 # blastn output format 6 as documented in # http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 @@ -71,7 +71,7 @@ MIN_CONTIG_SIZE = 4 -def parse_tsv(path, schema, expect_headers=False, raw_lines=False): +def parse_tsv(path, schema, expect_headers=False, raw_lines=False, min_alignment_length=0): '''Parse TSV file with given schema, yielding a dict per line. See BLAST_OUTPUT_SCHEMA, for example. When expect_headers=True, treat the first line as column headers.''' assert expect_headers == False, "Headers not yet implemented." schema_items = schema.items() From c9f1cae78c1be07c7417fc4de2bc96ac2d266ae4 Mon Sep 17 00:00:00 2001 From: Katrina Kalantar Date: Mon, 8 Jun 2020 10:40:16 -0700 Subject: [PATCH 3/6] fix max_evalue_threshold variable name --- idseq_dag/util/m8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py index c63923ec9..01d641dbd 100644 --- a/idseq_dag/util/m8.py +++ b/idseq_dag/util/m8.py @@ -201,7 +201,7 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the # alignment ### - if e_value > EVALUE_THRESHOLD: + if e_value > MAX_EVALUE_THRESHOLD: continue if debug_caller and line_count % logging_interval == 0: From 5e4f5c8213d049fc74d7a0b86779048fca3b1419 Mon Sep 17 00:00:00 2001 From: Katrina Kalantar Date: Wed, 10 Jun 2020 10:36:58 -0700 Subject: [PATCH 4/6] bump version and add changelog to README --- README.md | 3 +++ idseq_dag/__init__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ac624341..1feb75b89 100644 --- a/README.md +++ b/README.md @@ -226,6 +226,9 @@ Changes to X or Y force recomputation of all results when a sample is rerun usin When releasing a new version, please add a Git tag of the form `vX.Y.Z`. +- 4.10.0 + - Add e-value threshold to require all internal alignments (short read and assembly-based) have e-value below threshold. + - 4.9.0 - Update NCBI index databases to those downloaded on 2020-04-20. diff --git a/idseq_dag/__init__.py b/idseq_dag/__init__.py index 4ef234258..f50ea2097 100644 --- a/idseq_dag/__init__.py +++ b/idseq_dag/__init__.py @@ -1,2 +1,2 @@ ''' idseq_dag ''' -__version__ = "4.9.0" +__version__ = "4.10.0" From e7e58a61d076f614ce956e43d406198cca3b4040 Mon Sep 17 00:00:00 2001 From: Katrina Kalantar Date: Fri, 12 Jun 2020 09:54:40 -0700 Subject: [PATCH 5/6] remove min_alignment_length kwag where not used --- idseq_dag/util/m8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py index 01d641dbd..7c504afa9 100644 --- a/idseq_dag/util/m8.py +++ b/idseq_dag/util/m8.py @@ -71,7 +71,7 @@ MIN_CONTIG_SIZE = 4 -def parse_tsv(path, schema, expect_headers=False, raw_lines=False, min_alignment_length=0): +def parse_tsv(path, schema, expect_headers=False, raw_lines=False): '''Parse TSV file with given schema, yielding a dict per line. See BLAST_OUTPUT_SCHEMA, for example. When expect_headers=True, treat the first line as column headers.''' assert expect_headers == False, "Headers not yet implemented." schema_items = schema.items() From 0bd6d14b391e7521f90536672c1519cda4dbb213 Mon Sep 17 00:00:00 2001 From: Andrey Kislyuk Date: Fri, 12 Jun 2020 10:24:35 -0700 Subject: [PATCH 6/6] Fix lint error --- idseq_dag/util/m8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py index 7c504afa9..a6f9c9669 100644 --- a/idseq_dag/util/m8.py +++ b/idseq_dag/util/m8.py @@ -198,7 +198,7 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter # *** E-value Filter *** # Alignments with e-value > 1 are low-quality and associated with false-positives in - # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the + # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the # alignment ### if e_value > MAX_EVALUE_THRESHOLD: