From 8714d5f891e356916f5879e04d208e771465de07 Mon Sep 17 00:00:00 2001
From: Katrina Kalantar <katrina.kalantar@chanzuckerberg.com>
Date: Wed, 3 Jun 2020 15:39:12 -0700
Subject: [PATCH 1/6] add evalue_threshold parameter to m8 parsing function

---
 idseq_dag/util/m8.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py
index 02b9223d0..1bba0d5d2 100644
--- a/idseq_dag/util/m8.py
+++ b/idseq_dag/util/m8.py
@@ -18,6 +18,10 @@
 # Nevertheless, it may be useful to re-filter blastx results.
 NT_MIN_ALIGNMENT_LEN = 36
 
+# Alignments with e-values greater than 1 are low-quality alignments and associated with
+# a high rate of false-positives. These should be filtered at all alignment steps.
+EVALUE_THRESHOLD = 1
+
 # blastn output format 6 as documented in
 # http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
 # it's also the format of our GSNAP and RAPSEARCH2 output
@@ -192,6 +196,14 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter
             if alignment_length < min_alignment_length:
                 continue
 
+            # *** E-value Filter ***
+            # Alignments with e-value > 1 are low-quality and associated with false-positives in
+            # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the 
+            # alignment
+            ###
+            if e_value > EVALUE_THRESHOLD:
+                continue
+
             if debug_caller and line_count % logging_interval == 0:
                 msg = "Scanned {} m8 lines from {} for {}, and going.".format(
                     line_count, m8_file, debug_caller)

From f3cc029ddc87c2c850beb4d1e54392c40186a236 Mon Sep 17 00:00:00 2001
From: Katrina Kalantar <katrina.kalantar@chanzuckerberg.com>
Date: Fri, 5 Jun 2020 16:20:04 -0700
Subject: [PATCH 2/6] add evalue filters to blast results

---
 idseq_dag/steps/blast_contigs.py | 28 +++++++++++++++++-----------
 idseq_dag/util/m8.py             |  4 ++--
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/idseq_dag/steps/blast_contigs.py b/idseq_dag/steps/blast_contigs.py
index e086082d8..7d379e605 100644
--- a/idseq_dag/steps/blast_contigs.py
+++ b/idseq_dag/steps/blast_contigs.py
@@ -14,7 +14,7 @@
 from idseq_dag.steps.run_assembly import PipelineStepRunAssembly
 from idseq_dag.util.count import READ_COUNTING_MODE, ReadCountingMode, get_read_cluster_size, load_cdhit_cluster_sizes
 from idseq_dag.util.lineage import DEFAULT_BLACKLIST_S3, DEFAULT_WHITELIST_S3
-from idseq_dag.util.m8 import MIN_CONTIG_SIZE, NT_MIN_ALIGNMENT_LEN
+from idseq_dag.util.m8 import MIN_CONTIG_SIZE, NT_MIN_ALIGNMENT_LEN, MAX_EVALUE_THRESHOLD
 
 
 MIN_REF_FASTA_SIZE = 25
@@ -406,6 +406,7 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta,
         blast_command = 'blastn'
         min_alignment_length = NT_MIN_ALIGNMENT_LEN
         min_pident = NT_MIN_PIDENT
+        max_evalue = MAX_EVALUE_THRESHOLD
         command.execute(
             command_patterns.SingleCommand(
                 cmd="makeblastdb",
@@ -447,12 +448,13 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta,
             )
         )
         # further processing of getting the top m8 entry for each contig.
-        PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8, min_alignment_length, min_pident)
+        PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8, min_alignment_length, min_pident, max_evalue)
 
     @staticmethod
     def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8):
         blast_type = 'prot'
         blast_command = 'blastx'
+        max_evalue = MAX_EVALUE_THRESHOLD
         command.execute(
             command_patterns.SingleCommand(
                 cmd="makeblastdb",
@@ -486,21 +488,23 @@ def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta,
             )
         )
         # further processing of getting the top m8 entry for each contig.
-        PipelineStepBlastContigs.get_top_m8_nr(blast_m8, blast_top_m8)
+        PipelineStepBlastContigs.get_top_m8_nr(blast_m8, blast_top_m8, max_evalue)
 
     @staticmethod
-    def get_top_m8_nr(blast_output, blast_top_m8):
+    def get_top_m8_nr(blast_output, blast_top_m8, max_evalue):
         ''' Get top m8 file entry for each contig from blast_output and output to blast_top_m8 '''
         m8.unparse_tsv(blast_top_m8, m8.BLAST_OUTPUT_SCHEMA,
-                       PipelineStepBlastContigs.optimal_hit_for_each_query_nr(blast_output))
+                       PipelineStepBlastContigs.optimal_hit_for_each_query_nr(blast_output, max_evalue))
 
     @staticmethod
-    def optimal_hit_for_each_query_nr(blast_output):
+    def optimal_hit_for_each_query_nr(blast_output, max_evalue):
         contigs_to_best_alignments = defaultdict(list)
         accession_counts = defaultdict(lambda: 0)
 
         # For each contig, get the alignments that have the best total score (may be multiple if there are ties).
         for alignment in m8.parse_tsv(blast_output, m8.BLAST_OUTPUT_SCHEMA):
+            if alignment["evalue"] > max_evalue:
+                continue
             query = alignment["qseqid"]
             best_alignments = contigs_to_best_alignments[query]
 
@@ -526,7 +530,7 @@ def optimal_hit_for_each_query_nr(blast_output):
             yield optimal_alignment
 
     @staticmethod
-    def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident):
+    def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue):
         # Filter and group results by query, yielding one result group at a time.
         # A result group consists of all hits for a query, grouped by subject.
         # Please see comment in get_top_m8_nt for more context.
@@ -540,6 +544,8 @@ def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_piden
                 continue
             if hsp["pident"] < min_pident:
                 continue
+            if hsp["evalue"] > max_evalue:
+                continue
             query, subject = hsp["qseqid"], hsp["sseqid"]
             if query != current_query:
                 assert query not in previously_seen_queries, "blastn output appears out of order, please resort by (qseqid, sseqid, score)"
@@ -554,12 +560,12 @@ def filter_and_group_hits_by_query(blast_output, min_alignment_length, min_piden
 
     @staticmethod
     # An iterator that, for contig, yields to optimal hit for the contig in the nt blast_output.
-    def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident):
+    def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue):
         contigs_to_blast_candidates = {}
         accession_counts = defaultdict(lambda: 0)
 
         # For each contig, get the collection of blast candidates that have the best total score (may be multiple if there are ties).
-        for query_hits in PipelineStepBlastContigs.filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident):
+        for query_hits in PipelineStepBlastContigs.filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue):
             best_hits = []
             for _subject, hit_fragments in query_hits.items():
                 # For NT, we take a special approach where we try to find a subset of disjoint HSPs
@@ -591,7 +597,7 @@ def optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident
             yield optimal_hit.summary()
 
     @staticmethod
-    def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident):
+    def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident, max_evalue):
         '''
         For each contig Q (query) and reference S (subject), extend the highest-scoring
         fragment alignment of Q to S with other *non-overlapping* fragments as far as
@@ -613,4 +619,4 @@ def get_top_m8_nt(blast_output, blast_top_m8, min_alignment_length, min_pident):
 
         # Output the optimal hit for each query.
         m8.unparse_tsv(blast_top_m8, m8.RERANKED_BLAST_OUTPUT_NT_SCHEMA,
-                       PipelineStepBlastContigs.optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident))
+                       PipelineStepBlastContigs.optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue))
diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py
index 1bba0d5d2..c63923ec9 100644
--- a/idseq_dag/util/m8.py
+++ b/idseq_dag/util/m8.py
@@ -20,7 +20,7 @@
 
 # Alignments with e-values greater than 1 are low-quality alignments and associated with
 # a high rate of false-positives. These should be filtered at all alignment steps.
-EVALUE_THRESHOLD = 1
+MAX_EVALUE_THRESHOLD = 1
 
 # blastn output format 6 as documented in
 # http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
@@ -71,7 +71,7 @@
 MIN_CONTIG_SIZE = 4
 
 
-def parse_tsv(path, schema, expect_headers=False, raw_lines=False):
+def parse_tsv(path, schema, expect_headers=False, raw_lines=False, min_alignment_length=0):
     '''Parse TSV file with given schema, yielding a dict per line.  See BLAST_OUTPUT_SCHEMA, for example.  When expect_headers=True, treat the first line as column headers.'''
     assert expect_headers == False, "Headers not yet implemented."
     schema_items = schema.items()

From c9f1cae78c1be07c7417fc4de2bc96ac2d266ae4 Mon Sep 17 00:00:00 2001
From: Katrina Kalantar <katrina.kalantar@chanzuckerberg.com>
Date: Mon, 8 Jun 2020 10:40:16 -0700
Subject: [PATCH 3/6] fix max_evalue_threshold variable name

---
 idseq_dag/util/m8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py
index c63923ec9..01d641dbd 100644
--- a/idseq_dag/util/m8.py
+++ b/idseq_dag/util/m8.py
@@ -201,7 +201,7 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter
             # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the 
             # alignment
             ###
-            if e_value > EVALUE_THRESHOLD:
+            if e_value > MAX_EVALUE_THRESHOLD:
                 continue
 
             if debug_caller and line_count % logging_interval == 0:

From 5e4f5c8213d049fc74d7a0b86779048fca3b1419 Mon Sep 17 00:00:00 2001
From: Katrina Kalantar <katrina.kalantar@chanzuckerberg.com>
Date: Wed, 10 Jun 2020 10:36:58 -0700
Subject: [PATCH 4/6] bump version and add changelog to README

---
 README.md             | 3 +++
 idseq_dag/__init__.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5ac624341..1feb75b89 100644
--- a/README.md
+++ b/README.md
@@ -226,6 +226,9 @@ Changes to X or Y force recomputation of all results when a sample is rerun usin
 
 When releasing a new version, please add a Git tag of the form `vX.Y.Z`.
 
+- 4.10.0
+  - Add e-value threshold to require all internal alignments (short read and assembly-based) have e-value below threshold.
+
 - 4.9.0
   - Update NCBI index databases to those downloaded on 2020-04-20.
 
diff --git a/idseq_dag/__init__.py b/idseq_dag/__init__.py
index 4ef234258..f50ea2097 100644
--- a/idseq_dag/__init__.py
+++ b/idseq_dag/__init__.py
@@ -1,2 +1,2 @@
 ''' idseq_dag '''
-__version__ = "4.9.0"
+__version__ = "4.10.0"

From e7e58a61d076f614ce956e43d406198cca3b4040 Mon Sep 17 00:00:00 2001
From: Katrina Kalantar <katrina.kalantar@chanzuckerberg.com>
Date: Fri, 12 Jun 2020 09:54:40 -0700
Subject: [PATCH 5/6] remove min_alignment_length kwag where not used

---
 idseq_dag/util/m8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py
index 01d641dbd..7c504afa9 100644
--- a/idseq_dag/util/m8.py
+++ b/idseq_dag/util/m8.py
@@ -71,7 +71,7 @@
 MIN_CONTIG_SIZE = 4
 
 
-def parse_tsv(path, schema, expect_headers=False, raw_lines=False, min_alignment_length=0):
+def parse_tsv(path, schema, expect_headers=False, raw_lines=False):
     '''Parse TSV file with given schema, yielding a dict per line.  See BLAST_OUTPUT_SCHEMA, for example.  When expect_headers=True, treat the first line as column headers.'''
     assert expect_headers == False, "Headers not yet implemented."
     schema_items = schema.items()

From 0bd6d14b391e7521f90536672c1519cda4dbb213 Mon Sep 17 00:00:00 2001
From: Andrey Kislyuk <kislyuk@gmail.com>
Date: Fri, 12 Jun 2020 10:24:35 -0700
Subject: [PATCH 6/6] Fix lint error

---
 idseq_dag/util/m8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idseq_dag/util/m8.py b/idseq_dag/util/m8.py
index 7c504afa9..a6f9c9669 100644
--- a/idseq_dag/util/m8.py
+++ b/idseq_dag/util/m8.py
@@ -198,7 +198,7 @@ def iterate_m8(m8_file, min_alignment_length=0, debug_caller=None, logging_inter
 
             # *** E-value Filter ***
             # Alignments with e-value > 1 are low-quality and associated with false-positives in
-            # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the 
+            # all alignments steps (NT and NR). When the e-value is greater than 1, ignore the
             # alignment
             ###
             if e_value > MAX_EVALUE_THRESHOLD: