Skip to content

Commit

Permalink
Combine midi samples with main samples in micall_basespace for #412.
Browse files Browse the repository at this point in the history
Add some microtest samples that will successfully combine main and midi.
  • Loading branch information
donkirkby committed Feb 7, 2018
1 parent 187d9bc commit c79cfe8
Show file tree
Hide file tree
Showing 10 changed files with 2,013 additions and 277 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*.so
*.fai
*~
/*.bsfs.json
.cache
.coverage
/.idea
Expand Down
27 changes: 27 additions & 0 deletions micall/resistance/resistance.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from micall.resistance.asi_algorithm import AsiAlgorithm, ResistanceLevels
from micall.core.aln2counts import AMINO_ALPHABET
from micall.utils.sample_sheet_parser import sample_sheet_parser

MIN_FRACTION = 0.05 # prevalence of mutations to report
MIN_COVERAGE = 100
Expand All @@ -20,6 +21,8 @@

AminoList = namedtuple('AminoList', 'region aminos genotype')

SampleGroup = namedtuple('SampleGroup', 'enum names')


class LowCoverageError(Exception):
pass
Expand Down Expand Up @@ -299,6 +302,8 @@ def write_resistance(aminos, resistance_csv, mutations_csv, algorithms=None):
asi = algorithms.get(genotype)
if asi is None:
continue
if region == 'INT':
region = 'IN'
result = interpret(asi, amino_seq, region)
region_results.append((region, amino_seq, result))
if all(drug_result.level == ResistanceLevels.FAIL.level
Expand Down Expand Up @@ -400,6 +405,28 @@ def load_asi():
return algorithms


def find_groups(file_names, sample_sheet_path, included_projects=None):
with open(sample_sheet_path) as sample_sheet_file:
run_info = sample_sheet_parser(sample_sheet_file)

midi_files = {row['sample']: row['filename']
for row in run_info['DataSplit']
if row['project'] == 'MidHCV'}
wide_names = {row['filename']: row['sample']
for row in run_info['DataSplit']
if (row['project'] != 'MidHCV' and
(included_projects is None or
row['project'] in included_projects))}
for file_name in file_names:
trimmed_file_name = '_'.join(file_name.split('_')[:2])
sample_name = wide_names.get(trimmed_file_name)
if sample_name is None:
# Project was not included.
continue
midi_file = midi_files.get(sample_name + 'MIDI')
yield SampleGroup(sample_name, (file_name, midi_file))


def report_resistance(amino_csv,
midi_amino_csv,
resistance_csv,
Expand Down
400 changes: 400 additions & 0 deletions micall/tests/microtest/2130A-HCV_S15_L001_R1_001.fastq

Large diffs are not rendered by default.

400 changes: 400 additions & 0 deletions micall/tests/microtest/2130A-HCV_S15_L001_R2_001.fastq

Large diffs are not rendered by default.

400 changes: 400 additions & 0 deletions micall/tests/microtest/2130AMIDI-MidHCV_S16_L001_R1_001.fastq

Large diffs are not rendered by default.

400 changes: 400 additions & 0 deletions micall/tests/microtest/2130AMIDI-MidHCV_S16_L001_R2_001.fastq

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions micall/tests/microtest/SampleSheet.csv
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ CFE_MS1_23-Jan-2014_N511-N701_2090A_HCV_nopid,2090A_HCV,23-Jan-2014,N/A,TTTAAAAA
CFE_MS1_23-Jan-2014_N512-N701_2100A_HCV_nopid;CFE_MS1_23-Jan-2014_N512-N701_1337B_V3LOOP_nopid,2100A_HCV;1337B_V3LOOP,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2100A_HCV:FALSE;1337B_V3LOOP:FALSE Comments:2100A_HCV:;1337B_V3LOOP: Disablecontamcheck:2100A_HCV:FALSE;1337B_V3LOOP:FALSE,
CFE_MS1_23-Jan-2014_N501-N702_2110A_V3LOOP_nopid,2110A_V3LOOP,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2110A_V3LOOP:FALSE Comments:2110A_V3LOOP: Disablecontamcheck:2110A_V3LOOP:FALSE,
CFE_MS1_23-Jan-2014_N502-N702_2120A_PR_nopid,2120A_PR,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2120A_PR:FALSE Comments:2120A_PR: Disablecontamcheck:2120A_PR:FALSE,
CFE_MS1_23-Jan-2014_N503-N702_2130A_HCV_nopid,2130A_HCV,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2130A_HCV:FALSE Comments:2130A_HCV: Disablecontamcheck:2130A_HCV:FALSE,
CFE_MS1_23-Jan-2014_N504-N702_2130AMIDI_MidHCV_nopid,2130AMIDI_MidHCV,23-Jan-2014,N/A,TTTAAAAA,AAATTTTT,23-Jan-2014,Research:2130AMIDI_MidHCV:FALSE Comments:2130AMIDI_MidHCV: Disablecontamcheck:2130AMIDI_MidHCV:FALSE,
79 changes: 79 additions & 0 deletions micall/tests/microtest/make_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from gotoh import align_it_aa

from micall.core.project_config import ProjectConfig
from micall.utils.translation import translate, reverse_and_complement


def main():
coord_name = 'HCV2-JFH-1-NS5b'
extract_num = '2130'
start_pos, end_pos = 396, 561
read_count = 100
ref_name = None # 'HCV-2a'
ref_start, ref_end = 0, 0
is_reversed = True
projects = ProjectConfig.loadDefault()
if ref_name is None:
ref_name, ref_start, ref_end = find_coord_pos(projects,
coord_name,
start_pos,
end_pos)
ref_nuc_seq = projects.getReference(ref_name)
ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
# ref_nuc_section[(316-231)*3] = 'A'
ref_nuc_section = ''.join(ref_nuc_section)
# print(ref_nuc_section[(316-231)*3:(317-231)*3])
if is_reversed:
ref_nuc_section = reverse_and_complement(ref_nuc_section)
phred_scores = 'A' * (ref_end-ref_start)
file_num = '2' if is_reversed else '1'
for cluster in range(read_count):
print('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1'.format(
extract_num,
cluster+1,
file_num))
print(ref_nuc_section)
print('+')
print(phred_scores)


def find_coord_pos(projects, coord_name, start_pos, end_pos):
coord_seq = projects.getReference(coord_name)
gap_open = 40
gap_extend = 10
use_terminal_gap_penalty = 1
highest_score = 0
best_match = None
for ref_name in sorted(projects.getProjectSeeds('HCV')):
if not ref_name.startswith('HCV-2'):
continue
ref_nuc_seq = projects.getReference(ref_name)
for nuc_offset in range(3):
ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
aligned_coord, aligned_ref, score = align_it_aa(
coord_seq,
ref_amino_seq,
gap_open,
gap_extend,
use_terminal_gap_penalty)
if score > highest_score:
highest_score = score
best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
print(highest_score, ref_name, nuc_offset)
coord_pos = ref_pos = 0
ref_start = ref_end = None
for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
if coord_amino != '-':
coord_pos += 1
if ref_amino != '-':
ref_pos += 1
if start_pos == coord_pos:
ref_start = ref_pos * 3 - nuc_offset - 3
if coord_pos == end_pos:
ref_end = ref_pos * 3 - nuc_offset
print(ref_start, ref_end)
return ref_name, ref_start, ref_end


main()
61 changes: 16 additions & 45 deletions micall/utils/genreport_rerun.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,16 @@
import os
import shutil
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
from collections import namedtuple
from csv import DictReader, DictWriter
from glob import glob
from itertools import groupby
from operator import itemgetter

from datetime import datetime

from micall.hivdb.genreport import gen_report
from micall.hivdb.hivdb import hivdb
from micall.resistance.genreport import gen_report
from micall.resistance.resistance import find_groups, report_resistance
from micall.settings import NEEDS_PROCESSING, pipeline_version, DONE_PROCESSING
from micall.utils.sample_sheet_parser import sample_sheet_parser

SampleGroup = namedtuple('SampleGroup', 'enum names')


def parse_args():
Expand Down Expand Up @@ -54,34 +50,6 @@ def parse_args():
return args


def find_groups(working_paths, source_path):
sample_sheet_path = os.path.join(source_path, '../../SampleSheet.csv')
with open(sample_sheet_path) as sample_sheet_file:
run_info = sample_sheet_parser(sample_sheet_file)

midi_files = {row['sample']: row['filename']
for row in run_info['DataSplit']
if row['project'] == 'MidHCV'}
wide_names = {row['filename']: row['sample']
for row in run_info['DataSplit']
if row['project'] == 'HCV'}
for path in working_paths:
wide_file = os.path.basename(path)
sample_name = wide_names.get(wide_file)
if sample_name is None:
# Not an HCV sample.
continue
if not os.path.exists(path):
# No results
continue
midi_file = midi_files.get(sample_name + 'MIDI')
if midi_file and not os.path.exists(os.path.join(os.path.dirname(path),
midi_file)):
# No MIDI results
midi_file = None
yield SampleGroup(sample_name, (wide_file, midi_file))


def rewrite_file(filename):
backup_csv = filename + '.original.csv'
os.rename(filename, backup_csv)
Expand All @@ -104,8 +72,12 @@ def genreport_rerun(source, working):
os.makedirs(publish_path)
print('##', run_name)
working_paths = split_files(source, working)
sorted_working_paths = sorted(working_paths)
groups = list(find_groups(sorted_working_paths, source))
file_names = (os.path.basename(working_path) for working_path in working_paths)
sorted_file_names = sorted(file_names)
sample_sheet_path = os.path.join(source, '../../SampleSheet.csv')
groups = list(find_groups(sorted_file_names,
sample_sheet_path,
included_projects={'HCV'}))
for group in groups:
working_path = os.path.join(working, group.names[0])
if group.names[1] is None:
Expand All @@ -116,17 +88,20 @@ def genreport_rerun(source, working):
midi_path = os.path.join(
working,
group.names[1])
if not os.path.isdir(midi_path):
midi_name = 'failed MidHCV'
midi_path = working_path
print(working_path, midi_name)
with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
open(os.path.join(working_path, 'mutations.csv'), 'w') as mutations_csv, \
open(os.path.join(working_path, 'resistance_fail.csv'), 'w') as resistance_fail_csv:
hivdb(amino_csv,
midi_amino_csv,
resistance_csv,
mutations_csv,
resistance_fail_csv)
report_resistance(amino_csv,
midi_amino_csv,
resistance_csv,
mutations_csv,
resistance_fail_csv)
sample_name = os.path.basename(working_path)
with open(os.path.join(working_path, 'resistance.csv')) as resistance_csv, \
open(os.path.join(working_path, 'mutations.csv')) as mutations_csv, \
Expand Down Expand Up @@ -162,10 +137,6 @@ def split_files(source, working):
for sample, rows in groupby(reader, itemgetter('sample')):
working_path = os.path.join(working, sample)
working_paths.add(working_path)
if __name__ == '__live_coding__':
if len(working_paths) > 20:
break
continue
os.makedirs(working_path, exist_ok=True)
target_path = os.path.join(working_path, file_name)
with open(target_path, 'w') as target_csv:
Expand Down
Loading

0 comments on commit c79cfe8

Please sign in to comment.