Skip to content

Commit

Permalink
Replace check that all mutation positions have minimum coverage for #412
Browse files Browse the repository at this point in the history
.

Fill in gaps in amino lists.
  • Loading branch information
donkirkby committed Feb 2, 2018
1 parent 736020d commit 90c7bac
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 6 deletions.
9 changes: 5 additions & 4 deletions micall/hivdb/hivdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def combine_aminos(amino_csv, midi_amino_csv, fail_writer):
fail_writer.writerow(dict(seed=seed,
region=region,
reason=ex.args[0]))
continue
rows = []
if region.endswith('-NS5b'):
region_midi_rows = midi_rows[seed]
rows = combine_midi_rows(rows, region_midi_rows)
Expand Down Expand Up @@ -200,6 +200,8 @@ def read_aminos(amino_rows, min_fraction, reported_regions=None, min_coverage=0)
counts = list(map(int, (row[f] for f in coverage_columns)))
coverage = int(row['coverage'])
pos = int(row['refseq.aa.pos'])
while pos > len(aminos) + 1:
aminos.append({})
if max_pos and pos <= max_pos:
total_coverage += coverage
if pos == max_pos:
Expand Down Expand Up @@ -333,12 +335,11 @@ def interpret(asi, amino_seq, region):
# TODO: Make this more general instead of only applying to missing MIDI.
is_missing_midi = region.endswith('-NS5b') and len(amino_seq) < len(ref_seq)

# TODO: Put back the check that all mutation positions have minimum coverage.
if any(amino_seq):
if is_missing_midi:
# At least some data found, assume wild type in gaps.
new_amino_seq = []
for wild_type, old_aminos in zip_longest(ref_seq, amino_seq):
if old_aminos:
if old_aminos is not None:
new_amino_seq.append(old_aminos)
else:
new_amino_seq.append({wild_type: 1.0})
Expand Down
51 changes: 49 additions & 2 deletions micall/tests/test_hivdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,37 @@ def test_combine_ignores_main_tail_without_midi(self):

self.check_combination(amino_csv, midi_amino_csv, expected_csv)

def test_combine_midi_only(self):
amino_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
HCV-1a,HCV1A-H77-NS5b,15,1,1,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
""")
amino_csv.name = 'main_amino.csv'
midi_amino_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
HCV-1a,HCV1A-H77-NS5b,15,1,558,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-1a,HCV1A-H77-NS5b,15,1,559,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-1a,HCV1A-H77-NS5b,15,4,560,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-1a,HCV1A-H77-NS5b,15,7,561,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
""")
midi_amino_csv.name = 'midi_amino.csv'
expected_csv = """\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
HCV-1a,HCV1A-H77-NS5b,15,1,558,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-1a,HCV1A-H77-NS5b,15,1,559,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-1a,HCV1A-H77-NS5b,15,4,560,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-1a,HCV1A-H77-NS5b,15,7,561,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
"""
expected_fail_csv = """\
seed,region,reason
HCV-1a,HCV1A-H77-NS5b,low average coverage
"""

self.check_combination(amino_csv, midi_amino_csv, expected_csv, expected_fail_csv)


class ReadAminosTest(TestCase):
def test_simple(self):
Expand Down Expand Up @@ -690,7 +721,7 @@ def test_good_average_coverage(self):
min_fraction = 0.2
min_coverage = 9
expected_aminos = [AminoList('HCV-Foo-NS5a',
[{'K': 1.0}, {'F': 1.0}],
[{}]*99 + [{'K': 1.0}, {'F': 1.0}],
'1A')]

aminos = list(read_aminos(amino_csv,
Expand Down Expand Up @@ -760,7 +791,7 @@ def test_resistant_even_with_missing_midi(self):
min_fraction = 0.2
min_coverage = 9
expected_aminos = [AminoList('HCV1A-H77-NS5b',
[{'G': 1.0}, {'F': 1.0}],
[{}]*394 + [{'G': 1.0}, {'F': 1.0}],
'1A')]

aminos = list(read_aminos(amino_csv,
Expand Down Expand Up @@ -840,6 +871,22 @@ def test_stop_codons(self):

self.assertEqual(expected_aminos, aminos)

def test_missing_position(self):
amino_csv = DictReader(StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
R1-seed,R1,15,4,2,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
R1-seed,R1,15,7,3,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
"""))
min_fraction = 0.2
expected_aminos = [AminoList('R1',
[{}, {'F': 1.0}, {'A': 1.0}],
None)]

aminos = list(read_aminos(amino_csv, min_fraction))

self.assertEqual(expected_aminos, aminos)


class FilterAminosTest(TestCase):
@classmethod
Expand Down

0 comments on commit 90c7bac

Please sign in to comment.