Skip to content

Commit

Permalink
Merge pull request #40 from martinghunt/spades_try_all_kmers
Browse files Browse the repository at this point in the history
Spades try all kmers
  • Loading branch information
aslett1 committed Oct 13, 2015
2 parents ec07974 + df63552 commit 200e8fa
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 30 deletions.
71 changes: 47 additions & 24 deletions circlator/assemble.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import tempfile
import sys
import shutil
import pyfastaq
Expand All @@ -13,13 +14,9 @@ def __init__(self,
threads=1,
spades_kmers=None,
verbose=False,
spades_use_first_success=False,
):
self.outdir = os.path.abspath(outdir)
try:
os.mkdir(self.outdir)
except:
raise Error('Error mkdir ' + self.outdir)

self.reads = os.path.abspath(reads)
if not os.path.exists(self.reads):
raise Error('Reads file not found:' + self.reads)
Expand All @@ -28,12 +25,14 @@ def __init__(self,
self.threads = threads
self.spades = external_progs.make_and_check_prog('spades', verbose=self.verbose)
self.spades_kmers = self._build_spades_kmers(spades_kmers)
self.spades_use_first_success = spades_use_first_success
self.samtools = external_progs.make_and_check_prog('samtools', verbose=self.verbose)
self.assembler = 'spades'


def _build_spades_kmers(self, kmers):
if kmers is None:
return [127,121,111,101,95,91,85,81,75,71]
return [127,117,107,97,87,77]
elif type(kmers) == str:
try:
kmer_list = [int(k) for k in kmers.split(',')]
Expand All @@ -60,30 +59,54 @@ def run_spades_once(self, kmer, outdir):
return common.syscall(cmd, verbose=self.verbose, allow_fail=True)


def run_spades(self):
'''Runs spades, starting with biggest kmer. Takes the first result where SPAdes returns success.
We do this because sometimes a large kmer makes SPAdes crash because the coverage is too low.
Even if you give SPAdes a list of kmers, if one of the kmers doesn't work, then it stops.
So need to run separately on each kmer'''
spades_errors = []
def run_spades(self, stop_at_first_success=False):
'''Runs spades on all kmers. Each a separate run because SPAdes dies if any kmer does
not work. Chooses the 'best' assembly to be the one with the biggest N50'''
n50 = {}
kmer_to_dir = {}

for k in self.spades_kmers:
ok, errs = self.run_spades_once(k, self.outdir)
tmpdir = tempfile.mkdtemp(prefix=self.outdir + '.tmp.spades.' + str(k) + '.', dir=os.getcwd())
kmer_to_dir[k] = tmpdir
ok, errs = self.run_spades_once(k, tmpdir)
if ok:
return

shutil.rmtree(self.outdir)
spades_errors.append(errs)

print('Error running SPAdes. Output from all runs follows...', file=sys.stderr)
print('\n\n______________________________________________\n\n'.join(spades_errors), file=sys.stderr)
print('\n\nError running SPAdes, most likely due to low read coverage.', file=sys.stderr)
print('Please see above for the output from SPAdes. Cannot continue', file=sys.stderr)
sys.exit(1)
contigs_fasta = os.path.join(tmpdir, 'contigs.fasta')
contigs_fai = contigs_fasta + '.fai'
common.syscall(self.samtools.exe() + ' faidx ' + contigs_fasta, verbose=self.verbose)
stats = pyfastaq.tasks.stats_from_fai(contigs_fai)
if stats['N50'] != 0:
n50[k] = stats['N50']

if stop_at_first_success:
break

if len(n50) > 0:
if self.verbose:
print('[assemble]\tkmer\tN50')
for k in sorted(n50):
print('[assemble]', k, n50[k], sep='\t')

best_k = None

for k in sorted(n50):
if best_k is None or n50[k] >= n50[best_k]:
best_k = k

assert best_k is not None

for k, directory in kmer_to_dir.items():
if k == best_k:
if self.verbose:
print('[assemble] using assembly with kmer', k)
os.rename(directory, self.outdir)
else:
shutil.rmtree(directory)
else:
raise Error('Error running SPAdes. Output directories are:\n ' + '\n '.join(kmer_to_dir.values()) + '\nThe reason why should be in the spades.log file in each directory.')


def run(self):
if self.assembler == 'spades':
self.run_spades()
self.run_spades(stop_at_first_success=self.spades_use_first_success)
else:
raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')
4 changes: 3 additions & 1 deletion circlator/tasks/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def run():
bam2reads_group.add_argument('--b2r_min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')

assemble_group = parser.add_argument_group('assemble options')
assemble_group.add_argument('--assemble_spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,121,111,101,95,91,85,81,75,71', metavar='k1,k2,k3,...')
parser.add_argument('--assemble_spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
parser.add_argument('--assemble_spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')

merge_group = parser.add_argument_group('merge options')
merge_group.add_argument('--merge_diagdiff', type=int, help='Nucmer diagdiff option [%(default)s]', metavar='INT', default=25)
Expand Down Expand Up @@ -143,6 +144,7 @@ def run():
assembly_dir,
threads=options.threads,
spades_kmers=options.assemble_spades_k,
spades_use_first_success=options.assemble_spades_use_first,
verbose=options.verbose
)
a.run()
Expand Down
4 changes: 3 additions & 1 deletion circlator/tasks/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def run():
usage = 'circlator assemble [options] <in.reads.fasta> <out_dir>')
parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
parser.add_argument('--verbose', action='store_true', help='Be verbose')
parser.add_argument('--spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,121,111,101,95,91,85,81,75,71', metavar='k1,k2,k3,...')
parser.add_argument('--spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
parser.add_argument('--spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
parser.add_argument('reads', help='Name of input reads FASTA file', metavar='in.reads.fasta')
parser.add_argument('out_dir', help='Output directory (must not already exist)')
options = parser.parse_args()
Expand All @@ -19,6 +20,7 @@ def run():
options.out_dir,
threads=options.threads,
spades_kmers=options.spades_k,
spades_use_first_success=options.spades_use_first,
verbose=options.verbose
)
a.run()
3 changes: 0 additions & 3 deletions circlator/tests/assemble_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,3 @@ def test_build_spades_kmers(self):
with self.assertRaises(assemble.Error):
self.assembler._build_spades_kmers('41,spam')


def tearDown(self):
shutil.rmtree(self.tmp_assemble_dir)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
tests_require=['nose >= 1.3'],
install_requires=[
'openpyxl',
'pyfastaq >= 3.6.1',
'pyfastaq >= 3.8.0',
'pysam >= 0.8.1',
'pymummer>=0.4.0',
'bio_assembly_refinement>=0.3.3',
Expand Down

0 comments on commit 200e8fa

Please sign in to comment.