Report two sets of consensus seq's: coverage 100 and 1, as part of #442.

Exclude HIV references that don't extend all the way to both ends. Report three sets of coordinates in probe_finder, and process contigs.csv or conseq.csv. Start comparing de novo results with mapping results.
cfe-lab · Oct 16, 2019 · 267553c · 267553c
1 parent ee60448
commit 267553c
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 63 deletions.
diff --git a/micall/blast_db/make_blast_db.py b/micall/blast_db/make_blast_db.py
@@ -5,6 +5,25 @@
 
 DEFAULT_DATABASE = str(Path(__file__).parent / 'refs.fasta')
 DEFAULT_PROJECTS = str(Path(__file__).parent.parent / 'projects.json')
+COMPLETE_HIV_REFS = """\
+HIV1-A1-CD-AM000053-seed
+HIV1-B-FR-K03455-seed
+HIV1-C-IN-KC156210-seed
+HIV1-C-MW-KC156214-seed
+HIV1-CPZ-TZ-JN091691-seed
+HIV1-CPZ-US-AF103818-seed
+HIV1-P-FR-GU111555-seed
+HIV1-C-TZ-KC156220-seed
+HIV1-CPZ-CM-FR686511-seed
+HIV1-O-SN-AJ302646-seed
+HIV1-O-BE-L20587-seed
+HIV1-D-SN-AB485648-seed
+HIV1-F1-RO-AB485658-seed
+HIV1-G-GH-AB287004-seed
+HIV1-D-KR-DQ054367-seed
+HIV1-G-PT-FR846409-seed
+HIV1-B-US-KT284371-seed
+HIV1-A1-IN-KT152839-seed""".splitlines()
 
 
 def parse_args():
@@ -29,6 +48,9 @@ def make_blast_db(projects_json, refs_fasta):
         for name, region in projects['regions'].items():
             if region['seed_group'] is None:
                 continue
+            if (region['seed_group'] == 'HIV1-seed' and
+                    name not in COMPLETE_HIV_REFS):
+                continue
             refs_fasta.write('>' + name + '\n')
             for line in region['reference']:
                 refs_fasta.write(line)

diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py
@@ -30,7 +30,7 @@
 CONSEQ_MIXTURE_CUTOFFS = [0.01, 0.02, 0.05, 0.1, 0.2, 0.25]
 GAP_OPEN_COORD = 40
 GAP_EXTEND_COORD = 10
-CONSENSUS_MIN_COVERAGE = 100
+CONSENSUS_MIN_COVERAGES = (100, 1)
 MAX_CUTOFF = 'MAX'
 
 
@@ -75,6 +75,15 @@ def parse_args():
     return parser.parse_args()
 
 
+def trim_contig_name(contig_name):
+    prefix = contig_name.split('-')[0]
+    if not re.fullmatch(r'[\d_]+', prefix):
+        seed_name = contig_name
+    else:
+        _, seed_name = contig_name.split('-', 1)
+    return seed_name
+
+
 class SequenceReport(object):
     """ Hold the data for several reports related to a sample's genetic sequence.
 
@@ -98,6 +107,7 @@ def __init__(self,
             included as a mixture in the consensus.
         """
         self.consensus_min_coverage = 0
+        self.consensus_min_coverages = None
         self.callback_progress = 0
         self.callback_next = self.callback_chunk_size = self.callback_max = None
         self.insert_writer = insert_writer
@@ -154,11 +164,7 @@ def _count_reads(self, aligned_reads):
             if i == 0:
                 # these will be the same for all rows, so just assign from the first
                 self.detail_seed = row['refname']
-                prefix = self.detail_seed.split('-')[0]
-                if not re.fullmatch(r'[\d_]+', prefix):
-                    self.seed = self.detail_seed
-                else:
-                    _, self.seed = self.detail_seed.split('-', 1)
+                self.seed = trim_contig_name(self.detail_seed)
                 self.qcut = row['qcut']
             nuc_seq = row['seq']
             offset = int(row['offset'])
@@ -730,6 +736,7 @@ def _create_consensus_writer(conseq_file):
         return csv.DictWriter(conseq_file,
                               ['region',
                                'q-cutoff',
+                               'min_coverage',
                                'consensus-percent-cutoff',
                                'offset',
                                'sequence'],
@@ -760,6 +767,7 @@ def get_consensus_rows(self, seed_amino_entries):
             if offset is not None:
                 consensus = consensus.rstrip('-')
                 yield {'q-cutoff': self.qcut,
+                       'min_coverage': self.consensus_min_coverage,
                        'consensus-percent-cutoff':
                            format_cutoff(mixture_cutoff),
                        'offset': offset,
@@ -769,15 +777,19 @@ def write_consensus(self, conseq_writer=None):
         conseq_writer = conseq_writer or self.conseq_writer
         seed_amino_entries = [(seed_amino.consensus_nuc_index, seed_amino)
                               for seed_amino in self.seed_aminos[0]]
-        for row in self.get_consensus_rows(seed_amino_entries):
-            row['region'] = self.detail_seed
-            conseq_writer.writerow(row)
+        min_coverages = (self.consensus_min_coverages or
+                         (self.consensus_min_coverage,))
+        for self.consensus_min_coverage in min_coverages:
+            for row in self.get_consensus_rows(seed_amino_entries):
+                row['region'] = self.detail_seed
+                conseq_writer.writerow(row)
 
     def write_consensus_regions_header(self, conseq_region_file):
         self.conseq_region_writer = csv.DictWriter(conseq_region_file,
                                                    ['seed',
                                                     'region',
                                                     'q-cutoff',
+                                                    'min_coverage',
                                                     'consensus-percent-cutoff',
                                                     'offset',
                                                     'sequence'],
@@ -1319,7 +1331,7 @@ def aln2counts(aligned_csv,
                                 projects,
                                 CONSEQ_MIXTURE_CUTOFFS,
                                 landmarks_yaml=landmarks_yaml)
-        report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
+        report.consensus_min_coverages = CONSENSUS_MIN_COVERAGES
         report.write_amino_header(amino_csv)
         report.write_consensus_header(conseq_csv)
         report.write_consensus_regions_header(conseq_region_csv)

diff --git a/micall/tests/test_aln2counts.py b/micall/tests/test_aln2counts.py
@@ -275,9 +275,9 @@ def testConsensusFromSingleRead(self):
 R1-seed,15,0,9,0,AAATTT
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,0,AAATTT
-R1-seed,15,0.100,0,AAATTT
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,0,MAX,0,AAATTT
+R1-seed,15,0,0.100,0,AAATTT
 """
 
         self.report.write_consensus_header(self.report_file)
@@ -297,9 +297,9 @@ def testConsensusFromTwoReads(self):
 R1-seed,15,0,1,0,CCCGGG
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,0,AAATTT
-R1-seed,15,0.100,0,MMMKKK
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,0,MAX,0,AAATTT
+R1-seed,15,0,0.100,0,MMMKKK
 """
 
         self.report.write_consensus_header(self.report_file)
@@ -315,9 +315,9 @@ def testConsensusWithOffset(self):
 R1-seed,15,0,1,7,TTGGG
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,3,AAATTTGGG
-R1-seed,15,0.100,3,AAATTTGGG
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,0,MAX,3,AAATTTGGG
+R1-seed,15,0,0.100,3,AAATTTGGG
 """
 
         self.report.write_consensus_header(self.report_file)
@@ -333,9 +333,9 @@ def testConsensusFromPartialContig(self):
 1-R2-seed-partial,15,0,9,0,AAATTT
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-1-R2-seed-partial,15,MAX,0,AAATTT
-1-R2-seed-partial,15,0.100,0,AAATTT
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+1-R2-seed-partial,15,0,MAX,0,AAATTT
+1-R2-seed-partial,15,0,0.100,0,AAATTT
 """
 
         self.report.write_consensus_header(self.report_file)
@@ -351,9 +351,9 @@ def testConsensusLowQualitySections(self):
 R1-seed,15,0,1,7,TTNGG
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,6,TTT-GG
-R1-seed,15,0.100,6,TTT-GG
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,1,MAX,6,TTT-GG
+R1-seed,15,1,0.100,6,TTT-GG
 """
         self.report.consensus_min_coverage = 1
 
@@ -370,7 +370,7 @@ def testConsensusLowQuality(self):
 R1-seed,15,0,1,7,NNNNN
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
 """
         self.report.consensus_min_coverage = 1
 
@@ -388,9 +388,9 @@ def testConsensusLowCoverageInMiddle(self):
 R1-seed,15,0,1,6,GGG
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,0,AAAT--GGG
-R1-seed,15,0.100,0,AAAT--GGG
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,10,MAX,0,AAAT--GGG
+R1-seed,15,10,0.100,0,AAAT--GGG
 """
         self.report.consensus_min_coverage = 10
 
@@ -407,9 +407,9 @@ def testConsensusLowCoverageAtStart(self):
 R1-seed,15,0,1,4,TTGGG
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,4,TTGGG
-R1-seed,15,0.100,4,TTGGG
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,10,MAX,4,TTGGG
+R1-seed,15,10,0.100,4,TTGGG
 """
         self.report.consensus_min_coverage = 10
 
@@ -426,9 +426,9 @@ def testConsensusLowCoverageAtEnd(self):
 R1-seed,15,0,1,0,AAAT
 """)
         expected_text = """\
-region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,15,MAX,0,AAAT
-R1-seed,15,0.100,0,AAAT
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,10,MAX,0,AAAT
+R1-seed,15,10,0.100,0,AAAT
 """
         self.report.consensus_min_coverage = 10
 
@@ -438,6 +438,27 @@ def testConsensusLowCoverageAtEnd(self):
 
         self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
 
+    def testConsensusMultipleCoverageLevels(self):
+        # refname,qcut,rank,count,offset,seq
+        aligned_reads = self.prepareReads("""\
+R1-seed,15,0,9,0,AAATTTGGG
+R1-seed,15,0,1,0,AAAT
+""")
+        expected_text = """\
+region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,15,10,MAX,0,AAAT
+R1-seed,15,10,0.100,0,AAAT
+R1-seed,15,1,MAX,0,AAATTTGGG
+R1-seed,15,1,0.100,0,AAATTTGGG
+"""
+        self.report.consensus_min_coverages = (10, 1)
+
+        self.report.write_consensus_header(self.report_file)
+        self.report.read(aligned_reads)
+        self.report.write_consensus()
+
+        self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
+
     def testSingleReadAminoReport(self):
         """ In this sample, there is a single read with two codons.
         AAA -> K
@@ -2040,11 +2061,11 @@ def testMultipleCoordinateConsensusRegionsReport(self):
 """)
 
         expected_text = """\
-seed,region,q-cutoff,consensus-percent-cutoff,offset,sequence
-R1-seed,R1a,15,MAX,0,AAATTT
-R1-seed,R1a,15,0.100,0,MMMKKK
-R1-seed,R1b,15,MAX,3,AAATTT
-R1-seed,R1b,15,0.100,3,MMMKKK
+seed,region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
+R1-seed,R1a,15,0,MAX,0,AAATTT
+R1-seed,R1a,15,0,0.100,0,MMMKKK
+R1-seed,R1b,15,0,MAX,3,AAATTT
+R1-seed,R1b,15,0,0.100,3,MMMKKK
 """
 
         self.report.read(aligned_reads)

diff --git a/micall/utils/probe_finder.py b/micall/utils/probe_finder.py
@@ -38,10 +38,12 @@ def find_probes(contigs_csv, probes_csv):
     reader = DictReader(contigs_csv)
     columns = ['sample', 'contig']
     for target_name in TARGET_SEQUENCES:
-        for column_type in ['start',
-                            'size',
-                            'hxb2_start',
-                            'hxb2_size',
+        for column_type in ['in_contig_start',
+                            'in_contig_size',
+                            'in_hxb2_start',
+                            'in_hxb2_size',
+                            'merged_hxb2_start',
+                            'merged_hxb2_size',
                             'dist',
                             'score',
                             'seq']:
@@ -56,10 +58,13 @@ def find_probes(contigs_csv, probes_csv):
     for sample_name, sample_rows in groupby(reader, itemgetter('sample')):
         contig_num = 0
         for row in sample_rows:
-            seed_name = row['genotype']
+            seed_name = row.get('genotype') or row.get('ref') or row['region']
+            conseq_cutoff = row.get('consensus-percent-cutoff')
+            if conseq_cutoff and conseq_cutoff != 'MAX':
+                continue
             contig_num += 1
             contig_name = f'{contig_num}-{seed_name}'
-            contig_seq: str = row['contig']
+            contig_seq: str = row.get('contig') or row['sequence']
             aligned_hxb2, aligned_contig_to_hxb2, _ = align_it(
                 hxb2,
                 contig_seq,
@@ -95,24 +100,38 @@ def find_probes(contigs_csv, probes_csv):
                 start_pos = start + 1
                 end_pos = start + size
                 hxb2_pos = contig_pos = 0
-                hxb2_start = hxb2_size = None
+                merged_hxb2_start = merged_hxb2_size = None
                 for hxb2_nuc, contig_nuc in zip(aligned_hxb2,
                                                 aligned_contig_to_hxb2):
                     if hxb2_nuc != '-':
                         hxb2_pos += 1
                     if contig_nuc != '-':
                         contig_pos += 1
                         if contig_pos == start_pos:
-                            hxb2_start = hxb2_pos
+                            merged_hxb2_start = hxb2_pos
                         if contig_pos == end_pos:
-                            hxb2_size = hxb2_pos - hxb2_start + 1
+                            merged_hxb2_size = hxb2_pos - merged_hxb2_start + 1
                             break
 
+                aligned_ref, aligned_match, _ = align_it(
+                    hxb2,
+                    contig_match,
+                    gap_open_penalty,
+                    gap_extend_penalty,
+                    use_terminal_gap_penalty)
+                lstripped_match = aligned_match.lstrip('-')
+                in_hxb2_start = len(aligned_match) - len(lstripped_match)
+                tail_len = len(lstripped_match) - len(lstripped_match.rstrip('-'))
+                ref_match = aligned_ref[in_hxb2_start:-tail_len or None]
+                in_hxb2_size = len(ref_match.replace('-', ''))
+
                 prefix = target_name + '_'
-                new_row[prefix + 'start'] = start_pos
-                new_row[prefix + 'size'] = size
-                new_row[prefix + 'hxb2_start'] = hxb2_start
-                new_row[prefix + 'hxb2_size'] = hxb2_size
+                new_row[prefix + 'in_contig_start'] = start_pos
+                new_row[prefix + 'in_contig_size'] = size
+                new_row[prefix + 'in_hxb2_start'] = in_hxb2_start
+                new_row[prefix + 'in_hxb2_size'] = in_hxb2_size
+                new_row[prefix + 'merged_hxb2_start'] = merged_hxb2_start
+                new_row[prefix + 'merged_hxb2_size'] = merged_hxb2_size
                 new_row[prefix + 'dist'] = dist
                 new_row[prefix + 'score'] = score
                 new_row[prefix + 'seq'] = contig_match
@@ -135,5 +154,5 @@ def main():
     find_probes(args.contigs_csv, args.probes_csv)
 
 
-if __name__ == '__main__':
+if __name__ in ('__main__', '__live_coding__'):
     main()