Skip to content

Commit

Permalink
Get savage working, as part of #442.
Browse files Browse the repository at this point in the history
Seems to use a lot of memory, and needs split count set carefully.
  • Loading branch information
donkirkby committed Nov 6, 2018
1 parent 2dad75c commit f6352b9
Show file tree
Hide file tree
Showing 7 changed files with 286 additions and 70 deletions.
50 changes: 32 additions & 18 deletions Singularity
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,36 @@ From: centos:7

echo ===== Installing Rust and merge-mates ===== >/dev/null
yum install -q -y rust cargo
git clone https://github.com/jeff-k/merge-mates.git
cd merge-mates
cargo build
cp target/debug/merge-mates /bin
cargo install --root / --git https://github.com/jeff-k/merge-mates.git

echo ===== Installing Savage ===== >/dev/null
yum install -q -y zlib-devel boost-timer boost-program-options boost-devel
cargo install --root / --git https://github.com/jbaaijens/rust-overlaps.git --tag v0.1.1

wget -q https://downloads.sourceforge.net/project/bio-bwa/bwa-0.7.17.tar.bz2
tar xjf bwa-0.7.17.tar.bz2 --no-same-owner
cd bwa-0.7.17
make
mv bwa /bin
cd ..
rm -rf merge-mates
rm -rf bwa-0.7.17 bwa-0.7.17.tar.bz2

wget -q https://github.com/pachterlab/kallisto/releases/download/v0.44.0/kallisto_linux-v0.44.0.tar.gz
tar xzf kallisto_linux-v0.44.0.tar.gz --no-same-owner
mv kallisto_linux-v0.44.0/kallisto /bin
rm -rf kallisto_linux-v0.44.0 kallisto_linux-v0.44.0.tar.gz

cd /opt
git clone https://bitbucket.org/jbaaijens/savage.git
cd savage
git checkout tags/v0.4.0
make
echo \#\!/usr/bin/env sh > /bin/savage
echo /opt/savage/savage.py \$@ >> /bin/savage
chmod +x /bin/savage
cd /

yum remove -q -y zlib-devel boost-devel

echo ===== Installing blast ===== >/dev/null
cd /root
Expand Down Expand Up @@ -85,7 +109,7 @@ From: centos:7
chmod +x kmc kmc_dump
cd /opt
wget -q https://sourceforge.net/projects/mummer/files/mummer/3.23/MUMmer3.23.tar.gz
tar --no-same-owner -xzf MUMmer3.23.tar.gz
tar -xzf MUMmer3.23.tar.gz --no-same-owner
cd MUMmer3.23
make --quiet install
rm -r docs src ../MUMmer3.23.tar.gz
Expand All @@ -95,15 +119,15 @@ From: centos:7
/bin
cd /opt
wget -q https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2
tar --no-same-owner --bzip2 -xf samtools-1.3.1.tar.bz2
tar -xf samtools-1.3.1.tar.bz2 --no-same-owner --bzip2
cd samtools-1.3.1
./configure --quiet --prefix=/
make --quiet
make --quiet install
cd /opt
rm -rf samtools-1.3.1*
wget -q http://downloads.sourceforge.net/project/smalt/smalt-0.7.6-bin.tar.gz
tar --no-same-owner -xzf smalt-0.7.6-bin.tar.gz
tar -xzf smalt-0.7.6-bin.tar.gz --no-same-owner
ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt

echo ===== Installing Python packages ===== >/dev/null
Expand All @@ -122,15 +146,5 @@ From: centos:7

rm -rf /var/cache/yum

## Savage assembler
#export PATH="/opt/miniconda/bin:$PATH"
#source /opt/miniconda/bin/activate
#conda config --add channels r
#conda config --add channels defaults
#conda config --add channels conda-forge
#conda config --add channels bioconda
#conda install savage
#ls /opt/miniconda/bin/

%environment
export PATH=/bin:/opt/bowtie2
98 changes: 52 additions & 46 deletions micall/core/denovo.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,36 @@
import argparse
import logging
import os
import sys
from csv import DictWriter, DictReader
from datetime import datetime, timedelta
from enum import Enum
from glob import glob
from io import StringIO
from shutil import rmtree
from subprocess import Popen
from subprocess import run, PIPE, CalledProcessError
from tempfile import mkdtemp

from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline

from micall.utils.externals import LineCounter
from micall.utils.iva_wrapper import assemble, SeedingError

PEAR = "/opt/bin/pear"
SAVAGE = "/opt/savage_wrapper.sh"
IVA = "iva"
IS_SAVAGE_ENABLED = False
DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__),
'..',
'blast_db',
'refs.fasta')
ASSEMBLY_TIMEOUT = timedelta(hours=4).total_seconds()
GENOME_WIDTH = 5000 # Wild guess at the genome width.
TARGET_DEPTH = 1000 # Savage recommends 500 to 1000.
logger = logging.getLogger(__name__)

# noinspection PyArgumentList
Assembler = Enum('Assembler', 'IVA SAVAGE')


def write_genotypes(contigs_fasta_path, contigs_csv, merged_contigs_csv=None):
writer = DictWriter(contigs_csv,
Expand Down Expand Up @@ -84,56 +89,57 @@ def genotype(fasta, db=DEFAULT_DATABASE):
return samples


def denovo(fastq1_path, fastq2_path, contigs, work_dir='.', merged_contigs_csv=None):
prefix = "sample"

old_tmp_dirs = glob(os.path.join(work_dir, 'iva_*'))
def denovo(fastq1_path,
fastq2_path,
contigs,
work_dir='.',
merged_contigs_csv=None,
assembler=Assembler.SAVAGE):
old_tmp_dirs = glob(os.path.join(work_dir, 'assembly_*'))
for old_tmp_dir in old_tmp_dirs:
rmtree(old_tmp_dir, ignore_errors=True)

tmp_dir = mkdtemp(dir=work_dir, prefix='iva_')
tmp_dir = mkdtemp(dir=work_dir, prefix='assembly_')
start_time = datetime.now()
start_dir = os.getcwd()
try:
assemble(os.path.join(tmp_dir, "iva"),
str(fastq1_path),
str(fastq2_path),
timeout_seconds=ASSEMBLY_TIMEOUT)
is_successful = True
except SeedingError:
logger.warning('De novo assembly failed.')
is_successful = False
contigs_fasta_path = os.path.join(tmp_dir, 'contigs.fasta')
if assembler == Assembler.IVA:
try:
assemble(os.path.join(tmp_dir, "iva"),
str(fastq1_path),
str(fastq2_path),
timeout_seconds=ASSEMBLY_TIMEOUT)
contigs_fasta_path = os.path.join(tmp_dir, 'iva', 'contigs.fasta')
except SeedingError:
logger.warning('De novo assembly failed.')
else:
counter = LineCounter()
read_count = counter.count(fastq1_path) / 4
expected_depth = read_count / GENOME_WIDTH
split = max(1, expected_depth // TARGET_DEPTH)
merged_reads_path = os.path.join(tmp_dir, 'merged.fastq')
run(['merge-mates',
'--out', merged_reads_path,
fastq1_path,
fastq2_path],
check=True)

try:
run(['savage',
'--split', str(split),
'-s', merged_reads_path,
'-t', '1',
'--merge_contigs', '0.01',
'--overlap_len_stage_c', '100'],
cwd=tmp_dir,
check=True,
stdout=PIPE)
contigs_fasta_path = os.path.join(tmp_dir, 'contigs_stage_c.fasta')
except CalledProcessError:
logger.warning('De novo assembly failed.')

os.chdir(start_dir)
duration = datetime.now() - start_time

if IS_SAVAGE_ENABLED:
pear_proc = Popen([PEAR, "-f", fastq1_path, "-r", fastq2_path, "-o", prefix],
cwd=work_dir)

if pear_proc.wait():
raise Exception

savage_proc = Popen([
SAVAGE, "--split", "1", "-s",
"{}/sample.assembled.fastq".format(work_dir), "-p1",
"{}/sample.unassembled.forward.fastq".format(work_dir),
"-p2",
"{}/sample.unassembled.reverse.fastq".format(work_dir),
"-t", "1",
"--merge_contigs", "0.01", "--overlap_len_stage_c",
"100",
], cwd=work_dir, shell=True)

savage_proc.wait(timeout=ASSEMBLY_TIMEOUT)
if not savage_proc.wait():
write_genotypes("{}/contigs_stage_c.fasta".format(work_dir), contigs)
else:
print("savage exits with error", file=sys.stderr)

if is_successful:
contigs_fasta_path = os.path.join(tmp_dir, 'iva', 'contigs.fasta')
else:
contigs_fasta_path = os.path.join(tmp_dir, 'contigs.fasta')
contig_count = write_genotypes(contigs_fasta_path, contigs, merged_contigs_csv)
logger.info('Assembled %d contigs in %s (%ds) on %s.',
contig_count,
Expand Down
28 changes: 25 additions & 3 deletions micall/tests/test_denovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pytest import fixture

from micall.core.denovo import write_genotypes, denovo, DEFAULT_DATABASE
from micall.core.denovo import write_genotypes, denovo, DEFAULT_DATABASE, Assembler
from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS


Expand Down Expand Up @@ -114,7 +114,7 @@ def test_merged_contig(tmpdir, hcv_db):
assert expected_contigs_csv == contigs_csv.getvalue()


def test_denovo(tmpdir, hcv_db):
def test_denovo_iva(tmpdir, hcv_db):
microtest_path = Path(__file__).parent / 'microtest'
contigs_csv = StringIO()
expected_contigs_csv = """\
Expand All @@ -130,6 +130,28 @@ def test_denovo(tmpdir, hcv_db):
denovo(microtest_path / '2140A-HCV_S17_L001_R1_001.fastq',
microtest_path / '2140A-HCV_S17_L001_R2_001.fastq',
contigs_csv,
tmpdir)
tmpdir,
assembler=Assembler.IVA)

assert expected_contigs_csv == contigs_csv.getvalue()


def test_denovo_savage(tmpdir, hcv_db):
microtest_path = Path(__file__).parent / 'microtest'
contigs_csv = StringIO()
expected_contigs_csv = """\
genotype,match,contig
HCV-1a,1.0,GCAAGCTCCCCCTGGCCGTGATGGGAAGCTCCTACGGATTCCAATAC\
TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAAGTCCAAGAAGACCCCGATGGGGTTCT\
CGTATGATACCCGCTGTTTTGACTCCACAGTCACTGAGAGCGACATCCGTACGGAGGAGGCAATTTACCAATGTTGTG\
ACCTGGACCCCCAAGCCCGCGTGGCCATCAAGTCCCTCACTGAGAGGCTTTATGTTGGGGGCCCTCTTACCAATTCAA\
GGGGGGAAAACTGCGGCTACCGCAGGTGCCGCGCGAGCGGCGTA
"""

denovo(microtest_path / '2140A-HCV_S17_L001_R1_001.fastq',
microtest_path / '2140A-HCV_S17_L001_R2_001.fastq',
contigs_csv,
tmpdir,
assembler=Assembler.SAVAGE)

assert expected_contigs_csv == contigs_csv.getvalue()
4 changes: 3 additions & 1 deletion micall/utils/contig_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def main():
empty_count += 1
else:
contigs_fasta_paths = list(
sample_dir.glob('iva_*/iva/contigs.fasta'))
sample_dir.glob('assembly_*/iva/contigs.fasta'))
contigs_fasta_paths += list(
sample_dir.glob('assembly_*/contigs_stage_c.fasta'))
if len(contigs_fasta_paths) != 1:
print(sample_dir, contigs_fasta_paths)
continue
Expand Down
Loading

0 comments on commit f6352b9

Please sign in to comment.