From 90c7bacbaba0ebc96f3fa490c56f69f8ff1d0240 Mon Sep 17 00:00:00 2001 From: don Date: Thu, 1 Feb 2018 16:02:41 -0800 Subject: [PATCH] Replace check that all mutation positions have minimum coverage for #412. Fill in gaps in amino lists. --- micall/hivdb/hivdb.py | 9 ++++--- micall/tests/test_hivdb.py | 51 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/micall/hivdb/hivdb.py b/micall/hivdb/hivdb.py index 5aef4b020..158fdf069 100644 --- a/micall/hivdb/hivdb.py +++ b/micall/hivdb/hivdb.py @@ -142,7 +142,7 @@ def combine_aminos(amino_csv, midi_amino_csv, fail_writer): fail_writer.writerow(dict(seed=seed, region=region, reason=ex.args[0])) - continue + rows = [] if region.endswith('-NS5b'): region_midi_rows = midi_rows[seed] rows = combine_midi_rows(rows, region_midi_rows) @@ -200,6 +200,8 @@ def read_aminos(amino_rows, min_fraction, reported_regions=None, min_coverage=0) counts = list(map(int, (row[f] for f in coverage_columns))) coverage = int(row['coverage']) pos = int(row['refseq.aa.pos']) + while pos > len(aminos) + 1: + aminos.append({}) if max_pos and pos <= max_pos: total_coverage += coverage if pos == max_pos: @@ -333,12 +335,11 @@ def interpret(asi, amino_seq, region): # TODO: Make this more general instead of only applying to missing MIDI. is_missing_midi = region.endswith('-NS5b') and len(amino_seq) < len(ref_seq) - # TODO: Put back the check that all mutation positions have minimum coverage. - if any(amino_seq): + if is_missing_midi: # At least some data found, assume wild type in gaps. new_amino_seq = [] for wild_type, old_aminos in zip_longest(ref_seq, amino_seq): - if old_aminos: + if old_aminos is not None: new_amino_seq.append(old_aminos) else: new_amino_seq.append({wild_type: 1.0}) diff --git a/micall/tests/test_hivdb.py b/micall/tests/test_hivdb.py index 2f57268c1..6fd8c4f36 100644 --- a/micall/tests/test_hivdb.py +++ b/micall/tests/test_hivdb.py @@ -510,6 +510,37 @@ def test_combine_ignores_main_tail_without_midi(self): self.check_combination(amino_csv, midi_amino_csv, expected_csv) + def test_combine_midi_only(self): + amino_csv = StringIO("""\ +seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\ +A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage +HCV-1a,HCV1A-H77-NS5b,15,1,1,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +""") + amino_csv.name = 'main_amino.csv' + midi_amino_csv = StringIO("""\ +seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\ +A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage +HCV-1a,HCV1A-H77-NS5b,15,1,558,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +HCV-1a,HCV1A-H77-NS5b,15,1,559,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +HCV-1a,HCV1A-H77-NS5b,15,4,560,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +HCV-1a,HCV1A-H77-NS5b,15,7,561,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +""") + midi_amino_csv.name = 'midi_amino.csv' + expected_csv = """\ +seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\ +A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage +HCV-1a,HCV1A-H77-NS5b,15,1,558,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +HCV-1a,HCV1A-H77-NS5b,15,1,559,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +HCV-1a,HCV1A-H77-NS5b,15,4,560,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +HCV-1a,HCV1A-H77-NS5b,15,7,561,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000 +""" + expected_fail_csv = """\ +seed,region,reason +HCV-1a,HCV1A-H77-NS5b,low average coverage +""" + + self.check_combination(amino_csv, midi_amino_csv, expected_csv, expected_fail_csv) + class ReadAminosTest(TestCase): def test_simple(self): @@ -690,7 +721,7 @@ def test_good_average_coverage(self): min_fraction = 0.2 min_coverage = 9 expected_aminos = [AminoList('HCV-Foo-NS5a', - [{'K': 1.0}, {'F': 1.0}], + [{}]*99 + [{'K': 1.0}, {'F': 1.0}], '1A')] aminos = list(read_aminos(amino_csv, @@ -760,7 +791,7 @@ def test_resistant_even_with_missing_midi(self): min_fraction = 0.2 min_coverage = 9 expected_aminos = [AminoList('HCV1A-H77-NS5b', - [{'G': 1.0}, {'F': 1.0}], + [{}]*394 + [{'G': 1.0}, {'F': 1.0}], '1A')] aminos = list(read_aminos(amino_csv, @@ -840,6 +871,22 @@ def test_stop_codons(self): self.assertEqual(expected_aminos, aminos) + def test_missing_position(self): + amino_csv = DictReader(StringIO("""\ +seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\ +A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage +R1-seed,R1,15,4,2,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9 +R1-seed,R1,15,7,3,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9 +""")) + min_fraction = 0.2 + expected_aminos = [AminoList('R1', + [{}, {'F': 1.0}, {'A': 1.0}], + None)] + + aminos = list(read_aminos(amino_csv, min_fraction)) + + self.assertEqual(expected_aminos, aminos) + class FilterAminosTest(TestCase): @classmethod