Skip to content

Commit

Permalink
Merge pull request #1 from martinghunt/master
Browse files Browse the repository at this point in the history
Initial version
  • Loading branch information
martinghunt committed Apr 16, 2015
2 parents 4f404ef + 85c3608 commit 02f0b28
Show file tree
Hide file tree
Showing 68 changed files with 2,237 additions and 1 deletion.
47 changes: 46 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,47 @@
# circlator
circlator
=========

A tool to circularise bacterial genome assemblies




Installation
------------

circlator has the following dependencies, which need to be installed:
* [BWA] [BWA] version >= 0.7.12
* [samtools] [samtools] (version 0.1.9 or 1.2)
* [MUMmer] [mummer] version >= 3.23
* [SPAdes] [spades] version >= 3.5.0


Once the dependencies are installed, install circlator using pip3:

pip3 install circlator

Alternatively, you can download the latest release from this github repository,
or clone the repository. Then run the tests:

python3 setup.py test

If the tests all pass, install:

python3 setup.py install


Usage
-----

The input is an assembly in FASTA format and corrected long reads in FASTA format. To run:

circlator all assembly.fasta corrected_reads.fasta output_directory



[BWA]: http://bio-bwa.sourceforge.net/
[mummer]: http://mummer.sourceforge.net/
[samtools]: http://www.htslib.org/
[spades]: http://bioinf.spbau.ru/spades


13 changes: 13 additions & 0 deletions circlator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
__all__ = [
'assemble',
'common',
'bamfilter',
'external_progs',
'mapping',
'merge',
'program',
'tasks',
]

from circlator import *

51 changes: 51 additions & 0 deletions circlator/assemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import pyfastaq
from circlator import common, external_progs

class Error (Exception): pass

class Assembler:
def __init__(self,
reads,
outdir,
threads=1,
spades_kmer=127,
verbose=False,
):
self.outdir = os.path.abspath(outdir)
try:
os.mkdir(self.outdir)
except:
raise Error('Error mkdir ' + self.outdir)

self.reads = os.path.abspath(reads)
if not os.path.exists(self.reads):
raise Error('Reads file not found:' + self.reads)

self.verbose = verbose
self.threads = threads
self.spades = external_progs.make_and_check_prog('spades', verbose=self.verbose)
self.spades_kmer = spades_kmer
self.assembler = 'spades'


def run_spades(self):
cmd = ' '.join([
self.spades.exe(),
'-s', self.reads,
'-k', str(self.spades_kmer),
'--careful',
'--only-assembler',
'-t', str(self.threads),
'-o', self.outdir,
])

common.syscall(cmd, verbose=self.verbose)


def run(self):
if self.assembler == 'spades':
self.run_spades()
else:
raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')

119 changes: 119 additions & 0 deletions circlator/bamfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
import pysam
import pyfastaq
from circlator import common, mapping

class Error (Exception): pass


class BamFilter:
def __init__(
self,
bam,
outprefix,
min_length_short=5000,
min_length_long=100000,
min_read_length=250,
min_remove_end_distance=45000,
):
self.bam = os.path.abspath(bam)
if not os.path.exists(self.bam):
raise Error('File not found:' + self.bam)

self.min_read_length = min_read_length
self.min_length_short = min_length_short
self.min_length_long = min_length_long
self.min_remove_end_distance = min_remove_end_distance
assert self.min_length_short < self.min_length_long

self.reads_fa = os.path.abspath(outprefix + '.fasta')
self.log = os.path.abspath(outprefix + '.log')



def _get_ref_lengths(self):
'''Gets the length of each reference sequence from the header of the bam. Returns dict name => length'''
sam_reader = pysam.Samfile(self.bam, "rb")
return dict(zip(sam_reader.references, sam_reader.lengths))


def _all_reads_from_contig(self, contig, fout):
'''Gets all reads from contig called "contig" and writes to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(contig):
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)


def _get_all_unmapped_reads(self, fout):
'''Writes all unmapped reads to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(until_eof=True):
if read.is_unmapped:
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)


def _break_reads(self, contig, position, fout):
'''Get all reads from contig, but breaks them all at given position (0-based) in the reference. Writes to fout. Currently pproximate where it breaks (ignores indels in the alignment)'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(contig):
seqs = []
if read.pos < position < read.reference_end - 1:
split_point = position - read.pos
if split_point - 1 >= self.min_read_length:
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(0, split_point)
sequence.id += '.left'
seqs.append(sequence)
if read.query_length - split_point >= self.min_read_length:
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(split_point, read.query_length)
sequence.id += '.right'
seqs.append(sequence)
else:
seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True))

for seq in seqs:
if read.is_reverse:
seq.revcomp()
print(seq, file=fout)


def _exclude_region(self, contig, start, end, fout):
'''Writes reads not mapping to the given region of contig, start and end as per python convention'''
sam_reader = pysam.Samfile(self.bam, "rb")
exclude_interval = pyfastaq.intervals.Interval(start, end - 1)
for read in sam_reader.fetch(contig):
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
if not read_interval.intersects(exclude_interval):
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)


def _choose_region(self, contig, contig_length):
return pyfastaq.intervals.Interval(self.min_remove_end_distance, contig_length - self.min_remove_end_distance)


def run(self):
ref_lengths = self._get_ref_lengths()
assert len(ref_lengths) > 0
f_log = pyfastaq.utils.open_file_write(self.log)
f_fq = pyfastaq.utils.open_file_write(self.reads_fa)

for contig in sorted(ref_lengths):
if ref_lengths[contig] < self.min_length_short:
self._all_reads_from_contig(contig, f_fq)
print(contig, ref_lengths[contig], 'unchanged', sep='\t', file=f_log)
elif self.min_length_short <= ref_lengths[contig] < self.min_length_long:
break_pos = int(ref_lengths[contig] / 2)
print(contig, ref_lengths[contig], 'break at position ' + str(break_pos + 1), sep='\t', file=f_log)
self._break_reads(contig, break_pos, f_fq)
else:
region_to_remove = self._choose_region(contig, ref_lengths[contig])
if region_to_remove is not None:
print(contig, ref_lengths[contig], 'remove region ' + str(region_to_remove.start + 1) + '-' + str(region_to_remove.end + 1), sep='\t', file=f_log)
self._exclude_region(contig, region_to_remove.start, region_to_remove.end, f_fq)
else:
break_pos = int(ref_lengths[contig] / 2)
print(contig, ref_lengths[contig], 'could not find region to remove - break at position ' + str(break_pos + 1), sep='\t', file=f_log)
self._break_reads(contig, int(ref_lengths[contig] / 2), f_fq)

self._get_all_unmapped_reads(f_fq)
pyfastaq.utils.close(f_fq)
pyfastaq.utils.close(f_log)
39 changes: 39 additions & 0 deletions circlator/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import subprocess

version = '0.1.0'

def syscall(cmd, allow_fail=False, verbose=False):
if verbose:
print('syscall:', cmd, flush=True)
try:
subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as error:
errors = error.output.decode()
print('The following command failed with exit code', error.returncode, file=sys.stderr)
print(cmd, file=sys.stderr)
print('\nThe output was:\n', file=sys.stderr)
print(errors, file=sys.stderr, flush=True)

if allow_fail:
return False, errors
else:
sys.exit(1)

return True, None


def syscall_get_stdout(cmd):
try:
out = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE).communicate()[0].decode('utf-8').rstrip()
return out.split('\n')
except:
raise Error('Error in system call. I tried to run:\n' + str(cmd))


def decode(x):
try:
s = x.decode()
except:
return x
return s
53 changes: 53 additions & 0 deletions circlator/external_progs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re
from circlator import program

class Error (Exception): pass

prog_to_env_var = {
'spades': 'CIRCLATOR_SPADES',
}


prog_to_version_cmd = {
'bwa': ('', re.compile('^Version: ([0-9\.]+)')),
'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
'samtools': ('', re.compile('^Version: ([0-9\.]+)')),
'spades': ('', re.compile('^SPAdes genome assembler v.([0-9\.]+)')),
}


min_versions = {
'bwa': '0.7.12',
'nucmer': '3.1',
'samtools': '0.1.19',
'spades': '3.5.0',
}


prog_name_to_default = {
'bwa': 'bwa',
'nucmer': 'nucmer',
'spades': 'spades.py',
'samtools': 'samtools',
}

def make_and_check_prog(name, verbose=False):
p = program.Program(
prog_name_to_default[name],
prog_to_version_cmd[name][0],
prog_to_version_cmd[name][1],
environment_var=prog_to_env_var.get(name, None)
)

if not p.version_at_least(min_versions[name]):
raise Error('Version of ' + name + ' too low. I found ' + p.version() + ', but must be at least ' + min_versions[name])

if verbose:
print('Using', name, 'version', p.version())

return p


def check_all_progs(verbose=False):
for prog in prog_name_to_default:
make_and_check_prog(prog, verbose=verbose)
Loading

0 comments on commit 02f0b28

Please sign in to comment.