Combine midi samples with main samples in micall_basespace for #412.

Add some microtest samples that will successfully combine main and midi.
cfe-lab · Feb 7, 2018 · c79cfe8 · c79cfe8
1 parent 187d9bc
commit c79cfe8
Show file tree

Hide file tree

Showing 10 changed files with 2,013 additions and 277 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,7 @@
 *.so
 *.fai
 *~
+/*.bsfs.json
 .cache
 .coverage
 /.idea

diff --git a/micall/resistance/resistance.py b/micall/resistance/resistance.py
@@ -11,6 +11,7 @@
 
 from micall.resistance.asi_algorithm import AsiAlgorithm, ResistanceLevels
 from micall.core.aln2counts import AMINO_ALPHABET
+from micall.utils.sample_sheet_parser import sample_sheet_parser
 
 MIN_FRACTION = 0.05  # prevalence of mutations to report
 MIN_COVERAGE = 100
@@ -20,6 +21,8 @@
 
 AminoList = namedtuple('AminoList', 'region aminos genotype')
 
+SampleGroup = namedtuple('SampleGroup', 'enum names')
+
 
 class LowCoverageError(Exception):
     pass
@@ -299,6 +302,8 @@ def write_resistance(aminos, resistance_csv, mutations_csv, algorithms=None):
             asi = algorithms.get(genotype)
             if asi is None:
                 continue
+            if region == 'INT':
+                region = 'IN'
             result = interpret(asi, amino_seq, region)
             region_results.append((region, amino_seq, result))
         if all(drug_result.level == ResistanceLevels.FAIL.level
@@ -400,6 +405,28 @@ def load_asi():
     return algorithms
 
 
+def find_groups(file_names, sample_sheet_path, included_projects=None):
+    with open(sample_sheet_path) as sample_sheet_file:
+        run_info = sample_sheet_parser(sample_sheet_file)
+
+    midi_files = {row['sample']: row['filename']
+                  for row in run_info['DataSplit']
+                  if row['project'] == 'MidHCV'}
+    wide_names = {row['filename']: row['sample']
+                  for row in run_info['DataSplit']
+                  if (row['project'] != 'MidHCV' and
+                      (included_projects is None or
+                       row['project'] in included_projects))}
+    for file_name in file_names:
+        trimmed_file_name = '_'.join(file_name.split('_')[:2])
+        sample_name = wide_names.get(trimmed_file_name)
+        if sample_name is None:
+            # Project was not included.
+            continue
+        midi_file = midi_files.get(sample_name + 'MIDI')
+        yield SampleGroup(sample_name, (file_name, midi_file))
+
+
 def report_resistance(amino_csv,
                       midi_amino_csv,
                       resistance_csv,

diff --git a/micall/tests/microtest/2130A-HCV_S15_L001_R1_001.fastq b/micall/tests/microtest/2130A-HCV_S15_L001_R1_001.fastq
diff --git a/micall/tests/microtest/2130A-HCV_S15_L001_R2_001.fastq b/micall/tests/microtest/2130A-HCV_S15_L001_R2_001.fastq
diff --git a/micall/tests/microtest/2130AMIDI-MidHCV_S16_L001_R1_001.fastq b/micall/tests/microtest/2130AMIDI-MidHCV_S16_L001_R1_001.fastq
diff --git a/micall/tests/microtest/2130AMIDI-MidHCV_S16_L001_R2_001.fastq b/micall/tests/microtest/2130AMIDI-MidHCV_S16_L001_R2_001.fastq
diff --git a/micall/tests/microtest/SampleSheet.csv b/micall/tests/microtest/SampleSheet.csv
@@ -28,3 +28,5 @@ CFE_MS1_23-Jan-2014_N511-N701_2090A_HCV_nopid,2090A_HCV,23-Jan-2014,N/A,TTTAAAAA
 CFE_MS1_23-Jan-2014_N512-N701_2100A_HCV_nopid;CFE_MS1_23-Jan-2014_N512-N701_1337B_V3LOOP_nopid,2100A_HCV;1337B_V3LOOP,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2100A_HCV:FALSE;1337B_V3LOOP:FALSE Comments:2100A_HCV:;1337B_V3LOOP: Disablecontamcheck:2100A_HCV:FALSE;1337B_V3LOOP:FALSE,
 CFE_MS1_23-Jan-2014_N501-N702_2110A_V3LOOP_nopid,2110A_V3LOOP,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2110A_V3LOOP:FALSE Comments:2110A_V3LOOP: Disablecontamcheck:2110A_V3LOOP:FALSE,
 CFE_MS1_23-Jan-2014_N502-N702_2120A_PR_nopid,2120A_PR,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2120A_PR:FALSE Comments:2120A_PR: Disablecontamcheck:2120A_PR:FALSE,
+CFE_MS1_23-Jan-2014_N503-N702_2130A_HCV_nopid,2130A_HCV,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2130A_HCV:FALSE Comments:2130A_HCV: Disablecontamcheck:2130A_HCV:FALSE,
+CFE_MS1_23-Jan-2014_N504-N702_2130AMIDI_MidHCV_nopid,2130AMIDI_MidHCV,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2130AMIDI_MidHCV:FALSE Comments:2130AMIDI_MidHCV: Disablecontamcheck:2130AMIDI_MidHCV:FALSE,
diff --git a/micall/tests/microtest/make_sample.py b/micall/tests/microtest/make_sample.py
@@ -0,0 +1,79 @@
+from gotoh import align_it_aa
+
+from micall.core.project_config import ProjectConfig
+from micall.utils.translation import translate, reverse_and_complement
+
+
+def main():
+    coord_name = 'HCV2-JFH-1-NS5b'
+    extract_num = '2130'
+    start_pos, end_pos = 396, 561
+    read_count = 100
+    ref_name = None  # 'HCV-2a'
+    ref_start, ref_end = 0, 0
+    is_reversed = True
+    projects = ProjectConfig.loadDefault()
+    if ref_name is None:
+        ref_name, ref_start, ref_end = find_coord_pos(projects,
+                                                      coord_name,
+                                                      start_pos,
+                                                      end_pos)
+    ref_nuc_seq = projects.getReference(ref_name)
+    ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
+    # ref_nuc_section[(316-231)*3] = 'A'
+    ref_nuc_section = ''.join(ref_nuc_section)
+    # print(ref_nuc_section[(316-231)*3:(317-231)*3])
+    if is_reversed:
+        ref_nuc_section = reverse_and_complement(ref_nuc_section)
+    phred_scores = 'A' * (ref_end-ref_start)
+    file_num = '2' if is_reversed else '1'
+    for cluster in range(read_count):
+        print('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1'.format(
+            extract_num,
+            cluster+1,
+            file_num))
+        print(ref_nuc_section)
+        print('+')
+        print(phred_scores)
+
+
+def find_coord_pos(projects, coord_name, start_pos, end_pos):
+    coord_seq = projects.getReference(coord_name)
+    gap_open = 40
+    gap_extend = 10
+    use_terminal_gap_penalty = 1
+    highest_score = 0
+    best_match = None
+    for ref_name in sorted(projects.getProjectSeeds('HCV')):
+        if not ref_name.startswith('HCV-2'):
+            continue
+        ref_nuc_seq = projects.getReference(ref_name)
+        for nuc_offset in range(3):
+            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
+            aligned_coord, aligned_ref, score = align_it_aa(
+                coord_seq,
+                ref_amino_seq,
+                gap_open,
+                gap_extend,
+                use_terminal_gap_penalty)
+            if score > highest_score:
+                highest_score = score
+                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
+    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
+    print(highest_score, ref_name, nuc_offset)
+    coord_pos = ref_pos = 0
+    ref_start = ref_end = None
+    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
+        if coord_amino != '-':
+            coord_pos += 1
+        if ref_amino != '-':
+            ref_pos += 1
+        if start_pos == coord_pos:
+            ref_start = ref_pos * 3 - nuc_offset - 3
+        if coord_pos == end_pos:
+            ref_end = ref_pos * 3 - nuc_offset
+    print(ref_start, ref_end)
+    return ref_name, ref_start, ref_end
+
+
+main()
diff --git a/micall/utils/genreport_rerun.py b/micall/utils/genreport_rerun.py
@@ -2,20 +2,16 @@
 import os
 import shutil
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
-from collections import namedtuple
 from csv import DictReader, DictWriter
 from glob import glob
 from itertools import groupby
 from operator import itemgetter
 
 from datetime import datetime
 
-from micall.hivdb.genreport import gen_report
-from micall.hivdb.hivdb import hivdb
+from micall.resistance.genreport import gen_report
+from micall.resistance.resistance import find_groups, report_resistance
 from micall.settings import NEEDS_PROCESSING, pipeline_version, DONE_PROCESSING
-from micall.utils.sample_sheet_parser import sample_sheet_parser
-
-SampleGroup = namedtuple('SampleGroup', 'enum names')
 
 
 def parse_args():
@@ -54,34 +50,6 @@ def parse_args():
     return args
 
 
-def find_groups(working_paths, source_path):
-    sample_sheet_path = os.path.join(source_path, '../../SampleSheet.csv')
-    with open(sample_sheet_path) as sample_sheet_file:
-        run_info = sample_sheet_parser(sample_sheet_file)
-
-    midi_files = {row['sample']: row['filename']
-                  for row in run_info['DataSplit']
-                  if row['project'] == 'MidHCV'}
-    wide_names = {row['filename']: row['sample']
-                  for row in run_info['DataSplit']
-                  if row['project'] == 'HCV'}
-    for path in working_paths:
-        wide_file = os.path.basename(path)
-        sample_name = wide_names.get(wide_file)
-        if sample_name is None:
-            # Not an HCV sample.
-            continue
-        if not os.path.exists(path):
-            # No results
-            continue
-        midi_file = midi_files.get(sample_name + 'MIDI')
-        if midi_file and not os.path.exists(os.path.join(os.path.dirname(path),
-                                                         midi_file)):
-            # No MIDI results
-            midi_file = None
-        yield SampleGroup(sample_name, (wide_file, midi_file))
-
-
 def rewrite_file(filename):
     backup_csv = filename + '.original.csv'
     os.rename(filename, backup_csv)
@@ -104,8 +72,12 @@ def genreport_rerun(source, working):
     os.makedirs(publish_path)
     print('##', run_name)
     working_paths = split_files(source, working)
-    sorted_working_paths = sorted(working_paths)
-    groups = list(find_groups(sorted_working_paths, source))
+    file_names = (os.path.basename(working_path) for working_path in working_paths)
+    sorted_file_names = sorted(file_names)
+    sample_sheet_path = os.path.join(source, '../../SampleSheet.csv')
+    groups = list(find_groups(sorted_file_names,
+                              sample_sheet_path,
+                              included_projects={'HCV'}))
     for group in groups:
         working_path = os.path.join(working, group.names[0])
         if group.names[1] is None:
@@ -116,17 +88,20 @@ def genreport_rerun(source, working):
             midi_path = os.path.join(
                 working,
                 group.names[1])
+            if not os.path.isdir(midi_path):
+                midi_name = 'failed MidHCV'
+                midi_path = working_path
         print(working_path, midi_name)
         with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
                 open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
                 open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
                 open(os.path.join(working_path, 'mutations.csv'), 'w') as mutations_csv, \
                 open(os.path.join(working_path, 'resistance_fail.csv'), 'w') as resistance_fail_csv:
-            hivdb(amino_csv,
-                  midi_amino_csv,
-                  resistance_csv,
-                  mutations_csv,
-                  resistance_fail_csv)
+            report_resistance(amino_csv,
+                              midi_amino_csv,
+                              resistance_csv,
+                              mutations_csv,
+                              resistance_fail_csv)
         sample_name = os.path.basename(working_path)
         with open(os.path.join(working_path, 'resistance.csv')) as resistance_csv, \
                 open(os.path.join(working_path, 'mutations.csv')) as mutations_csv, \
@@ -162,10 +137,6 @@ def split_files(source, working):
         for sample, rows in groupby(reader, itemgetter('sample')):
             working_path = os.path.join(working, sample)
             working_paths.add(working_path)
-            if __name__ == '__live_coding__':
-                if len(working_paths) > 20:
-                    break
-                continue
             os.makedirs(working_path, exist_ok=True)
             target_path = os.path.join(working_path, file_name)
             with open(target_path, 'w') as target_csv:
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,6 +14,7 @@ @@
     *.so
     *.fai
     *~
+    /*.bsfs.json
     .cache
     .coverage
     /.idea
@@ Expand Down @@