Display genotype on HCV reports for #412.

Also display Canadian drug names and updated HCV disclaimer.
cfe-lab · Jan 6, 2018 · f152e26 · f152e26
1 parent 062c5bc
commit f152e26
Show file tree

Hide file tree

Showing 8 changed files with 223 additions and 165 deletions.
diff --git a/micall/hivdb/genreport.py b/micall/hivdb/genreport.py
@@ -173,10 +173,11 @@ def get_reported_drug_classes(self):
 
 def read_mutations(drug_classes, csv_file):
     """Read in a mutations file from CSV.
+
     Returns a list of dictionaries.
     """
     err_string = "Error in mutations file '{}'".format(csv_file.name)
-    exp_set = frozenset("drug_class,mutation,prevalence".split(","))
+    exp_set = frozenset("drug_class,mutation,prevalence,genotype".split(","))
     data_lst = list(csv.DictReader(csv_file, restkey="dummy"))
     # make sure that all lines have exactly the required fields
     if sum([set(od.keys()) == exp_set for od in data_lst]) != len(data_lst):
@@ -206,7 +207,7 @@ def read_resistance(regions, csv_file):
     """
     err_string = "Error in resistance file '{}'".format(csv_file.name)
     exp_set = frozenset(
-        "region,drug_class,drug,drug_name,level,level_name,score".split(","))
+        "region,drug_class,drug,drug_name,level,level_name,score,genotype".split(","))
     data_lst = list(csv.DictReader(csv_file, restkey="dummy"))
     # make sure that all lines have exactly the required fields
     if sum([set(od.keys()) == exp_set for od in data_lst]) != len(data_lst):
@@ -215,6 +216,7 @@ def read_resistance(regions, csv_file):
         report_page = regions[od['region']]
         level = int(od['level'])
         drug_id = od['drug']
+        report_page.genotype = od['genotype']
         report_page.resistance_calls[drug_id] = (level, od["level_name"])
 
 

diff --git a/micall/hivdb/genreport.yaml b/micall/hivdb/genreport.yaml
@@ -6,31 +6,31 @@
     # NOTE: The order of the drugs in each drug_class will determine the order
     # in the report tables.
     INSTI:
-      - [DTG, dolutegravir]
-      - [EVG, elvitegravir]
-      - [RAL, raltegravir]
+      - [DTG, Dolutegravir]
+      - [EVG, Elvitegravir]
+      - [RAL, Raltegravir]
     PI:
-      - [ATV/r, atazanavir/r]
-      - [DRV/r, darunavir/r]
-      - [FPV/r, fosamprenavir/r]
-      - [IDV/r, indinavir/r]
-      - [LPV/r, lopinavir/r]
-      - [NFV, nelfinavir]
-      - [SQV/r, saquinavir/r]
-      - [TPV/r, tipranavir/r]
+      - [ATV/r, Atazanavir/r]
+      - [DRV/r, Darunavir/r]
+      - [FPV/r, Fosamprenavir/r]
+      - [IDV/r, Indinavir/r]
+      - [LPV/r, Lopinavir/r]
+      - [NFV, Nelfinavir]
+      - [SQV/r, Saquinavir/r]
+      - [TPV/r, Tipranavir/r]
     NRTI:
-      - [3TC, lamivudine]
-      - [ABC, abacavir]
-      - [AZT, zidovudine]
-      - [D4T, stavudine]
-      - [DDI, didanosine]
-      - [FTC, emtricitabine]
-      - [TDF, tenofovir]
+      - [3TC, Lamivudine]
+      - [ABC, Abacavir]
+      - [AZT, Zidovudine]
+      - [D4T, Stavudine]
+      - [DDI, Didanosine]
+      - [FTC, Emtricitabine]
+      - [TDF, Tenofovir]
     NNRTI:
-      - [EFV, efavirenz]
-      - [ETR, etravirine]
-      - [NVP, nevirapine]
-      - [RPV, rilpivirine]
+      - [EFV, Efavirenz]
+      - [ETR, Etravirine]
+      - [NVP, Nevirapine]
+      - [RPV, Rilpivirine]
   known_drug_classes:
     #the order in this list determines the order of the drug_class tables.
     - [NRTI, NRTI/NtRTI]
@@ -75,21 +75,22 @@
     # NOTE: The order of the drugs in each drug_class will determine the order
     # in the report tables.
     NS3:
-      - [BPV, Boceprevir]
-      - [GZR, Grazoprevir]
-      - [PTV, Paritaprevir]
-      - [SPV, Simeprevir]
-      - [TPV, Telaprevir]
+      - [BPV, Boceprevir (Victrelis™)]
+      - [GZR, Grazoprevir (a component of Zepatier™)]
+      - [PTV, Paritaprevir (a component of Technivie™ and Holkira Pak™)]
+      - [SPV, Simeprevir (Galexos™)]
+      - [TPV, Telaprevir (Incivek™)]
     NS5a:
-      - [DCV, Daclatasvir]
-      - [EBV, Elbasvir]
-      - [LDV, Ledipasvir]
-      - [OBV, Ombitasvir]
-      - [VEL, Velpatasvir]
+      - [DCV, Daclatasvir (Daklinza™)]
+      - [EBV, Elbasvir (a component of Zepatier™)]
+      - [LDV, Ledipasvir (a component of Harvoni™)]
+      - [OBV, Ombitasvir (a component of Technivie™ and Holkira Pak™)]
+      - [VEL, Velpatasvir (a component of Epclusa™)]
     NS5b:
-      - [DSV, Dasabuvir]
-      - [SOF-EPC, "Sofosbuvir (Epclusa)"]
-      - [SOF-HAR, "Sofosbuvir (Harvoni)"]
+      - [DSV, Dasabuvir (a component of Holkira Pak™)]
+      - [SOF-EPC, Sofosbuvir (a component of Epclusa™)]
+      - [SOF-HAR, Sofosbuvir (a component of Harvoni™)]
+
   known_drug_classes:
     #the order in this list determines the order of the drug_class tables.
     - [NS3, HCV NS3]
@@ -104,7 +105,18 @@
     3: ['Resistance Likely',           0xDD0000, 0xFFFFFF]
 
   disclaimer_text: >
-    TODO: HCV disclaimer text
+    Mutations in NS3, NS5A and NS5B were detected by deep sequencing of HCV.
+    The mutations considered and the interpretation algorithm can be found at
+    http://cfe-lab.github.io/MiCall . Mutations relative to genotype-specific
+    reference sequences detected above a prevalence of 5% of the total coverage
+    are reported here. This resistance scoring algorithm is still in
+    development and should be regarded as investigational; it is currently
+    defined as "Research Use Only". Patient management should not be based
+    solely on drug susceptibility results provided in this report. The clinical
+    response to the treatment regimen depends on many factors including patient
+    disease status, viral load, number and types of direct acting antivirals
+    and treatment duration.
 
   generated_by_text: >
-    Generated by MiCall {} on Illumina BaseSpace using ???, modified on ???.
+    Generated by MiCall {} on Illumina BaseSpace using cfe-hcv 1.5, modified on
+    7 Dec 2016.
diff --git a/micall/hivdb/hcv_rules.json b/micall/hivdb/hcv_rules.json
@@ -252,12 +252,6 @@
         "reference": "HCV1B-Con1-NS5b",
         "region": "NS5b",
         "genotype": "1B"
-      },
-      {
-        "rules": "SCORE FROM ( 142T => 4, 159F => 4, 237G => 4, 282T => 8, 314IFP => 4, 321A => 4, 355H => 4 )",
-        "reference": "HCV3-S52-NS5b",
-        "region": "NS5b",
-        "genotype": "3"
       }
     ],
     "name": "SOF-HAR",

diff --git a/micall/hivdb/hivdb.py b/micall/hivdb/hivdb.py
@@ -2,6 +2,7 @@
 import json
 import os
 from argparse import ArgumentParser, FileType
+from collections import namedtuple
 from csv import DictReader, DictWriter
 from itertools import groupby
 from operator import itemgetter
@@ -15,6 +16,8 @@
 HIV_RULES_PATH = os.path.join(os.path.dirname(__file__), 'HIVDB_8.3.xml')
 HCV_RULES_PATH = os.path.join(os.path.dirname(__file__), 'hcv_rules.json')
 
+AminoList = namedtuple('AminoList', 'region aminos seed')
+
 
 def parse_args():
     parser = ArgumentParser(
@@ -46,6 +49,17 @@ def get_reported_region(reference):
     return reference
 
 
+def get_genotype(seed):
+    if seed is None:
+        return None
+    parts = seed.split('-')
+    virus = parts[0]
+    if virus != 'HCV':
+        return None
+    full_genotype = parts[1]
+    return full_genotype[0]
+
+
 def find_good_regions(original_regions, coverage_scores_csv):
     good_regions = {}
     for row in DictReader(coverage_scores_csv):
@@ -69,16 +83,16 @@ def read_aminos(amino_csv, min_fraction, reported_regions=None):
     missing_regions = set()
     if reported_regions:
         missing_regions.update(reported_regions.keys())
-    for region, rows in groupby(DictReader(amino_csv),
-                                itemgetter('region')):
+    for (region, seed), rows in groupby(DictReader(amino_csv),
+                                itemgetter('region', 'seed')):
         if reported_regions is not None:
             missing_regions.discard(region)
             translated_region, is_reported = reported_regions.get(region,
                                                                   (None, None))
             if translated_region is None:
                 continue
             if not is_reported:
-                yield region, None
+                yield AminoList(region, None, None)
                 continue
         aminos = []
         for row in rows:
@@ -92,9 +106,9 @@ def read_aminos(amino_csv, min_fraction, reported_regions=None):
             if ins_count >= min_count:
                 pos_aminos['i'] = ins_count / coverage
             aminos.append(pos_aminos)
-        yield region, aminos
+        yield AminoList(region, aminos, seed)
     for region in missing_regions:
-        yield region, None
+        yield AminoList(region, None, None)
 
 
 def write_insufficient_data(resistance_writer, region, asi):
@@ -113,21 +127,42 @@ def write_insufficient_data(resistance_writer, region, asi):
 
 
 def write_resistance(aminos, resistance_csv, mutations_csv):
+    """ Calculate resistance scores and write them to files.
+
+    :param list[AminoList] aminos: region is the coordinate
+        reference name that this gene region was mapped to, and prevalence is a
+        float between 0.0 and 1.0
+    :param resistance_csv: open file to write resistance calls to, grouped by
+        genotype, region, drug_class
+    :param mutations_csv: open file to write mutations to, grouped by genotype,
+        drug_class
+    """
     resistance_writer = DictWriter(
         resistance_csv,
-        ['region', 'drug_class', 'drug', 'drug_name', 'level', 'level_name', 'score'],
+        ['region',
+         'drug_class',
+         'drug',
+         'drug_name',
+         'level',
+         'level_name',
+         'score',
+         'genotype'],
         lineterminator=os.linesep)
     resistance_writer.writeheader()
     mutations_writer = DictWriter(mutations_csv,
-                                  ['drug_class', 'mutation', 'prevalence'],
+                                  ['drug_class',
+                                   'mutation',
+                                   'prevalence',
+                                   'genotype'],
                                   lineterminator=os.linesep)
     mutations_writer.writeheader()
     algorithms = load_asi()
-    for region, amino_seq in aminos:
+    for region, amino_seq, seed in aminos:
         asi = algorithms.get(region)
         if asi is None:
             continue
         reported_region = get_reported_region(region)
+        genotype = get_genotype(seed)
         if amino_seq is None:
             write_insufficient_data(resistance_writer, region, asi)
             continue
@@ -139,7 +174,8 @@ def write_resistance(aminos, resistance_csv, mutations_csv):
                                             drug_name=drug_result.name,
                                             level_name=drug_result.level_name,
                                             level=drug_result.level,
-                                            score=drug_result.score))
+                                            score=drug_result.score,
+                                            genotype=genotype))
         for drug_class, class_mutations in result.mutations.items():
             for mutation in class_mutations:
                 amino = mutation[-1]
@@ -148,7 +184,8 @@ def write_resistance(aminos, resistance_csv, mutations_csv):
                 prevalence = pos_aminos[amino]
                 mutations_writer.writerow(dict(drug_class=drug_class,
                                                mutation=mutation,
-                                               prevalence=prevalence))
+                                               prevalence=prevalence,
+                                               genotype=genotype))
 
 
 def load_asi():

diff --git a/micall/hivdb/pdfreport.py b/micall/hivdb/pdfreport.py
@@ -96,7 +96,7 @@ def drug_class_tablst(row_offset, report_page, dc_name, level_coltab):
             level, level_name = resistance_dct[drug_id]
         else:
             level, level_name = 1, "NOT REPORTED"
-        t_data.append([drug_name.capitalize(), level_name])
+        t_data.append([drug_name, level_name])
         # determine colours for the level
         bg_col, fg_col = level_coltab[level]
         t_style.extend([('TEXTCOLOR', (1, tabline + drow_min), (1, tabline + drow_min), fg_col),
@@ -114,17 +114,7 @@ def drug_class_tablst(row_offset, report_page, dc_name, level_coltab):
     return t_data, t_style
 
 
-def drug_class_table(cfg_dct, dc_name, level_coltab, tabwidth):
-    """Generate a resistance report for a given drug class.
-    tabwidth: the total width allocated for the table.
-    """
-    # NOTE: this fudge factor ensures that the left, drug_name column, is not too wide.
-    t_data, t_style = drug_class_tablst(0, cfg_dct, dc_name, level_coltab)
-    colw = tabwidth * 0.36
-    return plat.Table(t_data, vAlign="TOP", style=t_style, colWidths=[colw, None])
-
-
-def top_table(sample_name, table_width):
+def top_table(sample_name, table_width, genotype):
     """Generate a (mostly empty) top table of three main columns.
     table_width: the overall width of the table.
     """
@@ -135,7 +125,10 @@ def top_table(sample_name, table_width):
     test_dl = [["Patient/Sample Details", "Test Details", "Physician Details"],
                ["", test_details_para("Sample ID: {}".format(samp_name)), ""],
                ["", test_details_para("Report Date: {}".format(nowstr)), ""],
-               ["", "", ""],
+               ["",
+                (genotype or "") and
+                test_details_para("Genotype: " + genotype),
+                ""],
                ["", "", ""]
                ]
     rn_min, rn_max = 1, len(test_dl) - 1
@@ -180,7 +173,7 @@ def write_report_one_column(report_pages, fname, sample_name=None):
         doc_els.append(plat.Paragraph(cfg_dct["report_title"], ti_style))
         doc_els.append(plat.Paragraph("For research use only", re_style))
         # -- top table
-        doc_els.append(top_table(sample_name, table_width))
+        doc_els.append(top_table(sample_name, table_width, report_page.genotype))
         # now drug classes tables, two per line
         known_dc_lst = cfg_dct["known_dclass_list"]
         tot_tab, tot_style = [], []
@@ -193,7 +186,7 @@ def write_report_one_column(report_pages, fname, sample_name=None):
         tot_style.extend([("VALIGN", (0, 0), (1, num_rows-1), "TOP"),
                           ("FONTSIZE", (0, 0), (1, num_rows-1), TAB_FONT_SIZE),
                           ("LEADING", (0, 0), (1, num_rows-1), TAB_FONT_SIZE)])
-        left_col_w = table_width * 0.36
+        left_col_w = table_width * 0.5
         right_col_w = table_width - left_col_w
         doc_els.append(plat.Table(tot_tab,
                                   vAlign="TOP",

diff --git a/micall/tests/test_genreport.py b/micall/tests/test_genreport.py
@@ -60,7 +60,7 @@ def test_repr(self):
 
         self.assertEqual(expected_repr, r)
 
-    def test(self):
+    def test_get_reported_drug_classes(self):
         page = ReportPage(dict(known_drug_classes=[('C1', 'Class 1'),
                                                    ('C2', 'Class 2')],
                                known_drugs={'C1': [('D1', 'Drug 1')],