Skip to content

Commit

Permalink
Report two sets of consensus seq's: coverage 100 and 1, as part of #442.
Browse files Browse the repository at this point in the history
Exclude HIV references that don't extend all the way to both ends.
Report three sets of coordinates in probe_finder, and process contigs.csv or conseq.csv.
Start comparing de novo results with mapping results.
  • Loading branch information
donkirkby committed Oct 16, 2019
1 parent ee60448 commit 267553c
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 63 deletions.
22 changes: 22 additions & 0 deletions micall/blast_db/make_blast_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,25 @@

DEFAULT_DATABASE = str(Path(__file__).parent / 'refs.fasta')
DEFAULT_PROJECTS = str(Path(__file__).parent.parent / 'projects.json')
COMPLETE_HIV_REFS = """\
HIV1-A1-CD-AM000053-seed
HIV1-B-FR-K03455-seed
HIV1-C-IN-KC156210-seed
HIV1-C-MW-KC156214-seed
HIV1-CPZ-TZ-JN091691-seed
HIV1-CPZ-US-AF103818-seed
HIV1-P-FR-GU111555-seed
HIV1-C-TZ-KC156220-seed
HIV1-CPZ-CM-FR686511-seed
HIV1-O-SN-AJ302646-seed
HIV1-O-BE-L20587-seed
HIV1-D-SN-AB485648-seed
HIV1-F1-RO-AB485658-seed
HIV1-G-GH-AB287004-seed
HIV1-D-KR-DQ054367-seed
HIV1-G-PT-FR846409-seed
HIV1-B-US-KT284371-seed
HIV1-A1-IN-KT152839-seed""".splitlines()


def parse_args():
Expand All @@ -29,6 +48,9 @@ def make_blast_db(projects_json, refs_fasta):
for name, region in projects['regions'].items():
if region['seed_group'] is None:
continue
if (region['seed_group'] == 'HIV1-seed' and
name not in COMPLETE_HIV_REFS):
continue
refs_fasta.write('>' + name + '\n')
for line in region['reference']:
refs_fasta.write(line)
Expand Down
32 changes: 22 additions & 10 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
CONSEQ_MIXTURE_CUTOFFS = [0.01, 0.02, 0.05, 0.1, 0.2, 0.25]
GAP_OPEN_COORD = 40
GAP_EXTEND_COORD = 10
CONSENSUS_MIN_COVERAGE = 100
CONSENSUS_MIN_COVERAGES = (100, 1)
MAX_CUTOFF = 'MAX'


Expand Down Expand Up @@ -75,6 +75,15 @@ def parse_args():
return parser.parse_args()


def trim_contig_name(contig_name):
prefix = contig_name.split('-')[0]
if not re.fullmatch(r'[\d_]+', prefix):
seed_name = contig_name
else:
_, seed_name = contig_name.split('-', 1)
return seed_name


class SequenceReport(object):
""" Hold the data for several reports related to a sample's genetic sequence.
Expand All @@ -98,6 +107,7 @@ def __init__(self,
included as a mixture in the consensus.
"""
self.consensus_min_coverage = 0
self.consensus_min_coverages = None
self.callback_progress = 0
self.callback_next = self.callback_chunk_size = self.callback_max = None
self.insert_writer = insert_writer
Expand Down Expand Up @@ -154,11 +164,7 @@ def _count_reads(self, aligned_reads):
if i == 0:
# these will be the same for all rows, so just assign from the first
self.detail_seed = row['refname']
prefix = self.detail_seed.split('-')[0]
if not re.fullmatch(r'[\d_]+', prefix):
self.seed = self.detail_seed
else:
_, self.seed = self.detail_seed.split('-', 1)
self.seed = trim_contig_name(self.detail_seed)
self.qcut = row['qcut']
nuc_seq = row['seq']
offset = int(row['offset'])
Expand Down Expand Up @@ -730,6 +736,7 @@ def _create_consensus_writer(conseq_file):
return csv.DictWriter(conseq_file,
['region',
'q-cutoff',
'min_coverage',
'consensus-percent-cutoff',
'offset',
'sequence'],
Expand Down Expand Up @@ -760,6 +767,7 @@ def get_consensus_rows(self, seed_amino_entries):
if offset is not None:
consensus = consensus.rstrip('-')
yield {'q-cutoff': self.qcut,
'min_coverage': self.consensus_min_coverage,
'consensus-percent-cutoff':
format_cutoff(mixture_cutoff),
'offset': offset,
Expand All @@ -769,15 +777,19 @@ def write_consensus(self, conseq_writer=None):
conseq_writer = conseq_writer or self.conseq_writer
seed_amino_entries = [(seed_amino.consensus_nuc_index, seed_amino)
for seed_amino in self.seed_aminos[0]]
for row in self.get_consensus_rows(seed_amino_entries):
row['region'] = self.detail_seed
conseq_writer.writerow(row)
min_coverages = (self.consensus_min_coverages or
(self.consensus_min_coverage,))
for self.consensus_min_coverage in min_coverages:
for row in self.get_consensus_rows(seed_amino_entries):
row['region'] = self.detail_seed
conseq_writer.writerow(row)

def write_consensus_regions_header(self, conseq_region_file):
self.conseq_region_writer = csv.DictWriter(conseq_region_file,
['seed',
'region',
'q-cutoff',
'min_coverage',
'consensus-percent-cutoff',
'offset',
'sequence'],
Expand Down Expand Up @@ -1319,7 +1331,7 @@ def aln2counts(aligned_csv,
projects,
CONSEQ_MIXTURE_CUTOFFS,
landmarks_yaml=landmarks_yaml)
report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
report.consensus_min_coverages = CONSENSUS_MIN_COVERAGES
report.write_amino_header(amino_csv)
report.write_consensus_header(conseq_csv)
report.write_consensus_regions_header(conseq_region_csv)
Expand Down
81 changes: 51 additions & 30 deletions micall/tests/test_aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,9 @@ def testConsensusFromSingleRead(self):
R1-seed,15,0,9,0,AAATTT
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,0,AAATTT
R1-seed,15,0.100,0,AAATTT
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,0,MAX,0,AAATTT
R1-seed,15,0,0.100,0,AAATTT
"""

self.report.write_consensus_header(self.report_file)
Expand All @@ -297,9 +297,9 @@ def testConsensusFromTwoReads(self):
R1-seed,15,0,1,0,CCCGGG
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,0,AAATTT
R1-seed,15,0.100,0,MMMKKK
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,0,MAX,0,AAATTT
R1-seed,15,0,0.100,0,MMMKKK
"""

self.report.write_consensus_header(self.report_file)
Expand All @@ -315,9 +315,9 @@ def testConsensusWithOffset(self):
R1-seed,15,0,1,7,TTGGG
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,3,AAATTTGGG
R1-seed,15,0.100,3,AAATTTGGG
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,0,MAX,3,AAATTTGGG
R1-seed,15,0,0.100,3,AAATTTGGG
"""

self.report.write_consensus_header(self.report_file)
Expand All @@ -333,9 +333,9 @@ def testConsensusFromPartialContig(self):
1-R2-seed-partial,15,0,9,0,AAATTT
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
1-R2-seed-partial,15,MAX,0,AAATTT
1-R2-seed-partial,15,0.100,0,AAATTT
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
1-R2-seed-partial,15,0,MAX,0,AAATTT
1-R2-seed-partial,15,0,0.100,0,AAATTT
"""

self.report.write_consensus_header(self.report_file)
Expand All @@ -351,9 +351,9 @@ def testConsensusLowQualitySections(self):
R1-seed,15,0,1,7,TTNGG
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,6,TTT-GG
R1-seed,15,0.100,6,TTT-GG
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,1,MAX,6,TTT-GG
R1-seed,15,1,0.100,6,TTT-GG
"""
self.report.consensus_min_coverage = 1

Expand All @@ -370,7 +370,7 @@ def testConsensusLowQuality(self):
R1-seed,15,0,1,7,NNNNN
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
"""
self.report.consensus_min_coverage = 1

Expand All @@ -388,9 +388,9 @@ def testConsensusLowCoverageInMiddle(self):
R1-seed,15,0,1,6,GGG
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,0,AAAT--GGG
R1-seed,15,0.100,0,AAAT--GGG
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,10,MAX,0,AAAT--GGG
R1-seed,15,10,0.100,0,AAAT--GGG
"""
self.report.consensus_min_coverage = 10

Expand All @@ -407,9 +407,9 @@ def testConsensusLowCoverageAtStart(self):
R1-seed,15,0,1,4,TTGGG
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,4,TTGGG
R1-seed,15,0.100,4,TTGGG
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,10,MAX,4,TTGGG
R1-seed,15,10,0.100,4,TTGGG
"""
self.report.consensus_min_coverage = 10

Expand All @@ -426,9 +426,9 @@ def testConsensusLowCoverageAtEnd(self):
R1-seed,15,0,1,0,AAAT
""")
expected_text = """\
region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,15,MAX,0,AAAT
R1-seed,15,0.100,0,AAAT
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,10,MAX,0,AAAT
R1-seed,15,10,0.100,0,AAAT
"""
self.report.consensus_min_coverage = 10

Expand All @@ -438,6 +438,27 @@ def testConsensusLowCoverageAtEnd(self):

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testConsensusMultipleCoverageLevels(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R1-seed,15,0,9,0,AAATTTGGG
R1-seed,15,0,1,0,AAAT
""")
expected_text = """\
region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,15,10,MAX,0,AAAT
R1-seed,15,10,0.100,0,AAAT
R1-seed,15,1,MAX,0,AAATTTGGG
R1-seed,15,1,0.100,0,AAATTTGGG
"""
self.report.consensus_min_coverages = (10, 1)

self.report.write_consensus_header(self.report_file)
self.report.read(aligned_reads)
self.report.write_consensus()

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testSingleReadAminoReport(self):
""" In this sample, there is a single read with two codons.
AAA -> K
Expand Down Expand Up @@ -2040,11 +2061,11 @@ def testMultipleCoordinateConsensusRegionsReport(self):
""")

expected_text = """\
seed,region,q-cutoff,consensus-percent-cutoff,offset,sequence
R1-seed,R1a,15,MAX,0,AAATTT
R1-seed,R1a,15,0.100,0,MMMKKK
R1-seed,R1b,15,MAX,3,AAATTT
R1-seed,R1b,15,0.100,3,MMMKKK
seed,region,q-cutoff,min_coverage,consensus-percent-cutoff,offset,sequence
R1-seed,R1a,15,0,MAX,0,AAATTT
R1-seed,R1a,15,0,0.100,0,MMMKKK
R1-seed,R1b,15,0,MAX,3,AAATTT
R1-seed,R1b,15,0,0.100,3,MMMKKK
"""

self.report.read(aligned_reads)
Expand Down
47 changes: 33 additions & 14 deletions micall/utils/probe_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ def find_probes(contigs_csv, probes_csv):
reader = DictReader(contigs_csv)
columns = ['sample', 'contig']
for target_name in TARGET_SEQUENCES:
for column_type in ['start',
'size',
'hxb2_start',
'hxb2_size',
for column_type in ['in_contig_start',
'in_contig_size',
'in_hxb2_start',
'in_hxb2_size',
'merged_hxb2_start',
'merged_hxb2_size',
'dist',
'score',
'seq']:
Expand All @@ -56,10 +58,13 @@ def find_probes(contigs_csv, probes_csv):
for sample_name, sample_rows in groupby(reader, itemgetter('sample')):
contig_num = 0
for row in sample_rows:
seed_name = row['genotype']
seed_name = row.get('genotype') or row.get('ref') or row['region']
conseq_cutoff = row.get('consensus-percent-cutoff')
if conseq_cutoff and conseq_cutoff != 'MAX':
continue
contig_num += 1
contig_name = f'{contig_num}-{seed_name}'
contig_seq: str = row['contig']
contig_seq: str = row.get('contig') or row['sequence']
aligned_hxb2, aligned_contig_to_hxb2, _ = align_it(
hxb2,
contig_seq,
Expand Down Expand Up @@ -95,24 +100,38 @@ def find_probes(contigs_csv, probes_csv):
start_pos = start + 1
end_pos = start + size
hxb2_pos = contig_pos = 0
hxb2_start = hxb2_size = None
merged_hxb2_start = merged_hxb2_size = None
for hxb2_nuc, contig_nuc in zip(aligned_hxb2,
aligned_contig_to_hxb2):
if hxb2_nuc != '-':
hxb2_pos += 1
if contig_nuc != '-':
contig_pos += 1
if contig_pos == start_pos:
hxb2_start = hxb2_pos
merged_hxb2_start = hxb2_pos
if contig_pos == end_pos:
hxb2_size = hxb2_pos - hxb2_start + 1
merged_hxb2_size = hxb2_pos - merged_hxb2_start + 1
break

aligned_ref, aligned_match, _ = align_it(
hxb2,
contig_match,
gap_open_penalty,
gap_extend_penalty,
use_terminal_gap_penalty)
lstripped_match = aligned_match.lstrip('-')
in_hxb2_start = len(aligned_match) - len(lstripped_match)
tail_len = len(lstripped_match) - len(lstripped_match.rstrip('-'))
ref_match = aligned_ref[in_hxb2_start:-tail_len or None]
in_hxb2_size = len(ref_match.replace('-', ''))

prefix = target_name + '_'
new_row[prefix + 'start'] = start_pos
new_row[prefix + 'size'] = size
new_row[prefix + 'hxb2_start'] = hxb2_start
new_row[prefix + 'hxb2_size'] = hxb2_size
new_row[prefix + 'in_contig_start'] = start_pos
new_row[prefix + 'in_contig_size'] = size
new_row[prefix + 'in_hxb2_start'] = in_hxb2_start
new_row[prefix + 'in_hxb2_size'] = in_hxb2_size
new_row[prefix + 'merged_hxb2_start'] = merged_hxb2_start
new_row[prefix + 'merged_hxb2_size'] = merged_hxb2_size
new_row[prefix + 'dist'] = dist
new_row[prefix + 'score'] = score
new_row[prefix + 'seq'] = contig_match
Expand All @@ -135,5 +154,5 @@ def main():
find_probes(args.contigs_csv, args.probes_csv)


if __name__ == '__main__':
if __name__ in ('__main__', '__live_coding__'):
main()
Loading

0 comments on commit 267553c

Please sign in to comment.