Skip to content

Commit

Permalink
Pass project code to primer trimming step, for #625.
Browse files Browse the repository at this point in the history
Regenerate 2170 microtest, because of changes to primer trimming, rename project code in 2190, and 2200 to check that primer trimming works.
Split primers into two sets: HCV and SARSCOV2.
  • Loading branch information
donkirkby committed Dec 4, 2020
1 parent f4a43f4 commit ee36386
Show file tree
Hide file tree
Showing 31 changed files with 6,318 additions and 5,325 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
.pytest_cache
/.idea
/micall/tests/working/*
/micall/tests/microtest/micall-results
/micall/tests/microtest/micall-results*
/micall/tests/microtest/scratch
/venv_micall
/simgs/*.simg
Expand Down
4 changes: 2 additions & 2 deletions Singularity
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ From: centos:7

%labels
MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall
KIVE_INPUTS fastq1 fastq2 bad_cycles_csv
KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv
KIVE_OUTPUTS g2p_csv g2p_summary_csv remap_counts_csv \
remap_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \
failed_csv cascade_csv nuc_csv amino_csv coord_ins_csv conseq_csv \
Expand Down Expand Up @@ -188,7 +188,7 @@ From: centos:7
python /opt/micall/micall_kive.py --denovo "$@"

%applabels denovo
KIVE_INPUTS fastq1 fastq2 bad_cycles_csv
KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv
KIVE_OUTPUTS g2p_csv g2p_summary_csv remap_counts_csv \
remap_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \
failed_csv cascade_csv nuc_csv amino_csv coord_ins_csv conseq_csv \
Expand Down
61 changes: 39 additions & 22 deletions micall/core/trim_fastqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import csv
import logging
import shutil
from datetime import datetime
from gzip import GzipFile
import itertools
import math
Expand Down Expand Up @@ -50,6 +49,9 @@ def parse_args():
'-u',
action='store_true',
help='Set if the original FASTQ files are not compressed')
parser.add_argument('--project',
'-p',
help='Project code to choose primers')

return parser.parse_args()

Expand All @@ -59,7 +61,8 @@ def trim(original_fastq_filenames: typing.Sequence[str],
trimmed_fastq_filenames: typing.Sequence[str],
use_gzip: bool = True,
summary_file: typing.TextIO = None,
skip: typing.Tuple[str] = ()):
skip: typing.Tuple[str] = (),
project_code: str = None):
"""
:param original_fastq_filenames: sequence of two filenames, containing
Expand All @@ -72,6 +75,8 @@ def trim(original_fastq_filenames: typing.Sequence[str],
:param summary_file: an open CSV file to write to: write one row
with the average read quality for each file
:param skip: names of steps to skip
:param project_code: select primers to trim from files that start with this
name
"""
if summary_file is None:
summary_writer = None
Expand All @@ -88,7 +93,6 @@ def trim(original_fastq_filenames: typing.Sequence[str],
else:
with open(bad_cycles_filename, 'r') as bad_cycles:
bad_cycles = list(csv.DictReader(bad_cycles))
start_time = datetime.now()
for i, (src_name, dest_name) in enumerate(
zip(original_fastq_filenames, censored_filenames)):

Expand All @@ -100,15 +104,17 @@ def trim(original_fastq_filenames: typing.Sequence[str],
Path(censored_filenames[1]),
Path(trimmed_fastq_filenames[0]),
Path(trimmed_fastq_filenames[1]),
skip)
skip,
project_code)
purge_temp_files(censored_filenames)


def cut_all(censored_fastq1: Path,
censored_fastq2: Path,
trimmed_fastq1: Path,
trimmed_fastq2: Path,
skip: typing.Tuple[str] = ()):
skip: typing.Tuple[str] = (),
project_code: str = None):
dedapted_filenames = [Path(str(filename) + '.dedapted.fastq')
for filename in (censored_fastq1, censored_fastq2)]
ltrimmed_filenames = [Path(str(filename) + '.ltrimmed.fastq')
Expand All @@ -119,26 +125,37 @@ def cut_all(censored_fastq1: Path,
dedapted_filenames[0].symlink_to(censored_fastq1)
dedapted_filenames[1].symlink_to(censored_fastq2)
else:
start_time = datetime.now()
cut_adapters(censored_fastq1,
censored_fastq2,
dedapted_filenames[0],
dedapted_filenames[1])
if TrimSteps.primers in skip:

if project_code is None:
primers_root = None
else:
if 'HCV' in project_code:
primer_set = 'hcv'
else:
primer_set = project_code.lower()
primers_folder = Path(__file__).parent.parent / 'data'
primers_root = str(primers_folder / f'primers_{primer_set}_')
if not Path(primers_root + 'left.fasta').exists():
primers_root = None

if TrimSteps.primers in skip or primers_root is None:
shutil.copy(str(dedapted_filenames[0]), str(trimmed_fastq1))
shutil.copy(str(dedapted_filenames[1]), str(trimmed_fastq2))
else:
start_time = datetime.now()
cut_left_primers(dedapted_filenames[0],
dedapted_filenames[1],
ltrimmed_filenames[0],
ltrimmed_filenames[1])
right_start_time = datetime.now()
ltrimmed_filenames[1],
primers_root)
cut_right_primers(dedapted_filenames[0],
dedapted_filenames[1],
rtrimmed_filenames[0],
rtrimmed_filenames[1])
dimer_start_time = datetime.now()
rtrimmed_filenames[1],
primers_root)
combine_primer_trimming(dedapted_filenames[0],
dedapted_filenames[1],
ltrimmed_filenames[0],
Expand All @@ -165,8 +182,8 @@ def cut_adapters(original_fastq1: Path,
original_fastq2: Path,
trimmed_fastq1: Path,
trimmed_fastq2: Path):
script_path = os.path.dirname(__file__)
adapter_files = [os.path.join(script_path, 'adapters_read{}.fasta'.format(i))
script_path = Path(__file__).parent.parent / 'data'
adapter_files = [str(script_path / f'adapters_read{i}.fasta')
for i in (1, 2)]
run_cut_adapt(['--quiet',
'-a', 'file:' + adapter_files[0],
Expand All @@ -180,7 +197,8 @@ def cut_adapters(original_fastq1: Path,
def cut_left_primers(original_fastq1: Path,
original_fastq2: Path,
trimmed_fastq1: Path,
trimmed_fastq2: Path):
trimmed_fastq2: Path,
primers_root: str):
""" Trim left primers off both sets of reads.
The filtering options are a little tricky. --trimmed-only means that only
Expand All @@ -190,10 +208,9 @@ def cut_left_primers(original_fastq1: Path,
read had a primer trimmed off, then both reads will get written to the
ltrimmed.fastq file.
"""
script_path = Path(__file__).parent
run_cut_adapt(['--quiet',
'-g', f'file:{script_path/"primers_left.fasta"}',
'-A', f'file:{script_path/"primers_left_end.fasta"}',
'-g', f'file:{primers_root}left.fasta',
'-A', f'file:{primers_root}left_end.fasta',
'-o', str(trimmed_fastq1),
'-p', str(trimmed_fastq2),
'--overlap=8',
Expand All @@ -206,11 +223,11 @@ def cut_left_primers(original_fastq1: Path,
def cut_right_primers(original_fastq1: Path,
original_fastq2: Path,
trimmed_fastq1: Path,
trimmed_fastq2: Path):
script_path = Path(__file__).parent
trimmed_fastq2: Path,
primers_root: str):
run_cut_adapt(['--quiet',
'-a', f'file:{script_path/"primers_right_end.fasta"}',
'-G', f'file:{script_path/"primers_right.fasta"}',
'-a', f'file:{primers_root}right_end.fasta',
'-G', f'file:{primers_root}right.fasta',
'-o', str(trimmed_fastq1),
'-p', str(trimmed_fastq2),
'--overlap=8',
Expand Down
File renamed without changes.
File renamed without changes.
12 changes: 12 additions & 0 deletions micall/data/primers_hcv_left.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>HCV_1abGENF1bp
XGGGTCGCGAAAGGCCTTGTGGTACTGCC
>HCV_1abGENF2
XGTACTGCCTGATAGGGTGCTTGCGAGTGCC
>HCV_Pr1
XTGGGGTTCGCGTATGATACCCGCTGCTTTGA
>HCV_Pr2
XTGGGGTTTTCTTACGACACCAGGTGCTTTGA
>HCV_Pr4
XCCGTATGATACCCGCTGCTTTGACTCAAC
>HCV_Pr5
XTCCTACGACACCAGGTGCTTTGATTCAAC
12 changes: 12 additions & 0 deletions micall/data/primers_hcv_left_end.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>HCV_1abGENF1bp
GGCAGTACCACAAGGCCTTTCGCGACCCX
>HCV_1abGENF2
GGCACTCGCAAGCACCCTATCAGGCAGTACX
>HCV_Pr1
TCAAAGCAGCGGGTATCATACGCGAACCCCAX
>HCV_Pr2
TCAAAGCACCTGGTGTCGTAAGAAAACCCCAX
>HCV_Pr4
GTTGAGTCAAAGCAGCGGGTATCATACGGX
>HCV_Pr5
GTTGAATCAAAGCACCTGGTGTCGTAGGAX
12 changes: 12 additions & 0 deletions micall/data/primers_hcv_right.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>HCV_oligo_dA20
XAAAAAAAAAAAAAAAAAAAA
>HCV_Pr3
XGGCGGAATTCCTGGTCATAGCCTCCGTGAA
>HCV_TIM-Pr3
XCAGGAAACAGCTATGACGGCGGAATTCCTGGTCATAGCCTCCGTGAA
>HCV_Pr6
XAATTCCTGGTCATAGCCTCCGTGAAGACTC
>HCV_oligo_dA20-TIM
XCAGGAAACAGCTATGACAAAAAAAAAAAAAAAAAAAA
>HCV_TIM
XCAGGAAACAGCTATGAC
12 changes: 12 additions & 0 deletions micall/data/primers_hcv_right_end.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>HCV_oligo_dA20
TTTTTTTTTTTTTTTTTTTTX
>HCV_Pr3
TTCACGGAGGCTATGACCAGGAATTCCGCCX
>HCV_TIM-Pr3
TTCACGGAGGCTATGACCAGGAATTCCGCCGTCATAGCTGTTTCCTGX
>HCV_Pr6
GAGTCTTCACGGAGGCTATGACCAGGAATTX
>HCV_oligo_dA20-TIM
TTTTTTTTTTTTTTTTTTTTGTCATAGCTGTTTCCTGX
>HCV_TIM
GTCATAGCTGTTTCCTGX
Original file line number Diff line number Diff line change
@@ -1,15 +1,3 @@
>HCV_1abGENF1bp
XGGGTCGCGAAAGGCCTTGTGGTACTGCC
>HCV_1abGENF2
XGTACTGCCTGATAGGGTGCTTGCGAGTGCC
>HCV_Pr1
XTGGGGTTCGCGTATGATACCCGCTGCTTTGA
>HCV_Pr2
XTGGGGTTTTCTTACGACACCAGGTGCTTTGA
>HCV_Pr4
XCCGTATGATACCCGCTGCTTTGACTCAAC
>HCV_Pr5
XTCCTACGACACCAGGTGCTTTGATTCAAC
>nCoV-2019_1_LEFT
XACCAACCAACTTTCGATCTCTTGT
>nCoV-2019_2_LEFT
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,3 @@
>HCV_1abGENF1bp
GGCAGTACCACAAGGCCTTTCGCGACCCX
>HCV_1abGENF2
GGCACTCGCAAGCACCCTATCAGGCAGTACX
>HCV_Pr1
TCAAAGCAGCGGGTATCATACGCGAACCCCAX
>HCV_Pr2
TCAAAGCACCTGGTGTCGTAAGAAAACCCCAX
>HCV_Pr4
GTTGAGTCAAAGCAGCGGGTATCATACGGX
>HCV_Pr5
GTTGAATCAAAGCACCTGGTGTCGTAGGAX
>nCoV-2019_1_LEFT
ACAAGAGATCGAAAGTTGGTTGGTX
>nCoV-2019_2_LEFT
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,3 @@
>HCV_oligo_dA20
XAAAAAAAAAAAAAAAAAAAA
>HCV_Pr3
XGGCGGAATTCCTGGTCATAGCCTCCGTGAA
>HCV_TIM-Pr3
XCAGGAAACAGCTATGACGGCGGAATTCCTGGTCATAGCCTCCGTGAA
>HCV_Pr6
XAATTCCTGGTCATAGCCTCCGTGAAGACTC
>HCV_oligo_dA20-TIM
XCAGGAAACAGCTATGACAAAAAAAAAAAAAAAAAAAA
>HCV_TIM
XCAGGAAACAGCTATGAC
>nCoV-2019_1_RIGHT
XCATCTTTAAGATGTTGACGTGCCTC
>nCoV-2019_2_RIGHT
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,3 @@
>HCV_oligo_dA20
TTTTTTTTTTTTTTTTTTTTX
>HCV_Pr3
TTCACGGAGGCTATGACCAGGAATTCCGCCX
>HCV_TIM-Pr3
TTCACGGAGGCTATGACCAGGAATTCCGCCGTCATAGCTGTTTCCTGX
>HCV_Pr6
GAGTCTTCACGGAGGCTATGACCAGGAATTX
>HCV_oligo_dA20-TIM
TTTTTTTTTTTTTTTTTTTTGTCATAGCTGTTTCCTGX
>HCV_TIM
GTCATAGCTGTTTCCTGX
>nCoV-2019_1_RIGHT
GAGGCACGTCAACATCTTAAAGATGX
>nCoV-2019_2_RIGHT
Expand Down
15 changes: 14 additions & 1 deletion micall/drivers/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import os

import typing
from csv import DictReader
from pathlib import Path

from micall.core.aln2counts import aln2counts
from micall.core.amplicon_finder import write_merge_lengths_plot, merge_for_entropy
Expand Down Expand Up @@ -122,13 +124,16 @@ def process(self,
makedirs(scratch_path)
use_gzip = force_gzip or self.fastq1.endswith('.gz')

sample_info = self.load_sample_info()

with open(self.read_summary_csv, 'w') as read_summary:
trim((self.fastq1, self.fastq2),
self.bad_cycles_csv,
(self.trimmed1_fastq, self.trimmed2_fastq),
summary_file=read_summary,
use_gzip=use_gzip,
skip=self.skip)
skip=self.skip,
project_code=sample_info.get('project'))

if use_denovo:
logger.info('Running merge_for_entropy on %s.', self)
Expand Down Expand Up @@ -260,6 +265,14 @@ def process(self,
cascade_report.generate()
logger.info('Finished sample %s.', self)

def load_sample_info(self):
path = Path(self.sample_info_csv)
if not path.exists():
return {}
with path.open() as info_file:
reader = DictReader(info_file)
return next(reader)

def run_mapping(self, excluded_seeds):
logger.info('Running prelim_map on %s.', self)
scratch_path = self.get_scratch_path()
Expand Down
Loading

0 comments on commit ee36386

Please sign in to comment.