Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial version #1

Merged
merged 4 commits into from
Apr 16, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,47 @@
# circlator
circlator
=========

A tool to circularise bacterial genome assemblies




Installation
------------

circlator has the following dependencies, which need to be installed:
* [BWA] [BWA] version >= 0.7.12
* [samtools] [samtools] (version 0.1.9 or 1.2)
* [MUMmer] [mummer] version >= 3.23
* [SPAdes] [spades] version >= 3.5.0


Once the dependencies are installed, install circlator using pip3:

pip3 install circlator

Alternatively, you can download the latest release from this github repository,
or clone the repository. Then run the tests:

python3 setup.py test

If the tests all pass, install:

python3 setup.py install


Usage
-----

The input is an assembly in FASTA format and corrected long reads in FASTA format. To run:

circlator all assembly.fasta corrected_reads.fasta output_directory



[BWA]: http://bio-bwa.sourceforge.net/
[mummer]: http://mummer.sourceforge.net/
[samtools]: http://www.htslib.org/
[spades]: http://bioinf.spbau.ru/spades


13 changes: 13 additions & 0 deletions circlator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
__all__ = [
'assemble',
'common',
'bamfilter',
'external_progs',
'mapping',
'merge',
'program',
'tasks',
]

from circlator import *

51 changes: 51 additions & 0 deletions circlator/assemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import pyfastaq
from circlator import common, external_progs

class Error (Exception): pass

class Assembler:
def __init__(self,
reads,
outdir,
threads=1,
spades_kmer=127,
verbose=False,
):
self.outdir = os.path.abspath(outdir)
try:
os.mkdir(self.outdir)
except:
raise Error('Error mkdir ' + self.outdir)

self.reads = os.path.abspath(reads)
if not os.path.exists(self.reads):
raise Error('Reads file not found:' + self.reads)

self.verbose = verbose
self.threads = threads
self.spades = external_progs.make_and_check_prog('spades', verbose=self.verbose)
self.spades_kmer = spades_kmer
self.assembler = 'spades'


def run_spades(self):
cmd = ' '.join([
self.spades.exe(),
'-s', self.reads,
'-k', str(self.spades_kmer),
'--careful',
'--only-assembler',
'-t', str(self.threads),
'-o', self.outdir,
])

common.syscall(cmd, verbose=self.verbose)


def run(self):
if self.assembler == 'spades':
self.run_spades()
else:
raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')

119 changes: 119 additions & 0 deletions circlator/bamfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
import pysam
import pyfastaq
from circlator import common, mapping

class Error (Exception): pass


class BamFilter:
def __init__(
self,
bam,
outprefix,
min_length_short=5000,
min_length_long=100000,
min_read_length=250,
min_remove_end_distance=45000,
):
self.bam = os.path.abspath(bam)
if not os.path.exists(self.bam):
raise Error('File not found:' + self.bam)

self.min_read_length = min_read_length
self.min_length_short = min_length_short
self.min_length_long = min_length_long
self.min_remove_end_distance = min_remove_end_distance
assert self.min_length_short < self.min_length_long

self.reads_fa = os.path.abspath(outprefix + '.fasta')
self.log = os.path.abspath(outprefix + '.log')



def _get_ref_lengths(self):
'''Gets the length of each reference sequence from the header of the bam. Returns dict name => length'''
sam_reader = pysam.Samfile(self.bam, "rb")
return dict(zip(sam_reader.references, sam_reader.lengths))


def _all_reads_from_contig(self, contig, fout):
'''Gets all reads from contig called "contig" and writes to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(contig):
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)


def _get_all_unmapped_reads(self, fout):
'''Writes all unmapped reads to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(until_eof=True):
if read.is_unmapped:
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)


def _break_reads(self, contig, position, fout):
'''Get all reads from contig, but breaks them all at given position (0-based) in the reference. Writes to fout. Currently pproximate where it breaks (ignores indels in the alignment)'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(contig):
seqs = []
if read.pos < position < read.reference_end - 1:
split_point = position - read.pos
if split_point - 1 >= self.min_read_length:
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(0, split_point)
sequence.id += '.left'
seqs.append(sequence)
if read.query_length - split_point >= self.min_read_length:
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(split_point, read.query_length)
sequence.id += '.right'
seqs.append(sequence)
else:
seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True))

for seq in seqs:
if read.is_reverse:
seq.revcomp()
print(seq, file=fout)


def _exclude_region(self, contig, start, end, fout):
'''Writes reads not mapping to the given region of contig, start and end as per python convention'''
sam_reader = pysam.Samfile(self.bam, "rb")
exclude_interval = pyfastaq.intervals.Interval(start, end - 1)
for read in sam_reader.fetch(contig):
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
if not read_interval.intersects(exclude_interval):
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)


def _choose_region(self, contig, contig_length):
return pyfastaq.intervals.Interval(self.min_remove_end_distance, contig_length - self.min_remove_end_distance)


def run(self):
ref_lengths = self._get_ref_lengths()
assert len(ref_lengths) > 0
f_log = pyfastaq.utils.open_file_write(self.log)
f_fq = pyfastaq.utils.open_file_write(self.reads_fa)

for contig in sorted(ref_lengths):
if ref_lengths[contig] < self.min_length_short:
self._all_reads_from_contig(contig, f_fq)
print(contig, ref_lengths[contig], 'unchanged', sep='\t', file=f_log)
elif self.min_length_short <= ref_lengths[contig] < self.min_length_long:
break_pos = int(ref_lengths[contig] / 2)
print(contig, ref_lengths[contig], 'break at position ' + str(break_pos + 1), sep='\t', file=f_log)
self._break_reads(contig, break_pos, f_fq)
else:
region_to_remove = self._choose_region(contig, ref_lengths[contig])
if region_to_remove is not None:
print(contig, ref_lengths[contig], 'remove region ' + str(region_to_remove.start + 1) + '-' + str(region_to_remove.end + 1), sep='\t', file=f_log)
self._exclude_region(contig, region_to_remove.start, region_to_remove.end, f_fq)
else:
break_pos = int(ref_lengths[contig] / 2)
print(contig, ref_lengths[contig], 'could not find region to remove - break at position ' + str(break_pos + 1), sep='\t', file=f_log)
self._break_reads(contig, int(ref_lengths[contig] / 2), f_fq)

self._get_all_unmapped_reads(f_fq)
pyfastaq.utils.close(f_fq)
pyfastaq.utils.close(f_log)
39 changes: 39 additions & 0 deletions circlator/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import subprocess

version = '0.1.0'

def syscall(cmd, allow_fail=False, verbose=False):
if verbose:
print('syscall:', cmd, flush=True)
try:
subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as error:
errors = error.output.decode()
print('The following command failed with exit code', error.returncode, file=sys.stderr)
print(cmd, file=sys.stderr)
print('\nThe output was:\n', file=sys.stderr)
print(errors, file=sys.stderr, flush=True)

if allow_fail:
return False, errors
else:
sys.exit(1)

return True, None


def syscall_get_stdout(cmd):
try:
out = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE).communicate()[0].decode('utf-8').rstrip()
return out.split('\n')
except:
raise Error('Error in system call. I tried to run:\n' + str(cmd))


def decode(x):
try:
s = x.decode()
except:
return x
return s
53 changes: 53 additions & 0 deletions circlator/external_progs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re
from circlator import program

class Error (Exception): pass

prog_to_env_var = {
'spades': 'CIRCLATOR_SPADES',
}


prog_to_version_cmd = {
'bwa': ('', re.compile('^Version: ([0-9\.]+)')),
'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
'samtools': ('', re.compile('^Version: ([0-9\.]+)')),
'spades': ('', re.compile('^SPAdes genome assembler v.([0-9\.]+)')),
}


min_versions = {
'bwa': '0.7.12',
'nucmer': '3.1',
'samtools': '0.1.19',
'spades': '3.5.0',
}


prog_name_to_default = {
'bwa': 'bwa',
'nucmer': 'nucmer',
'spades': 'spades.py',
'samtools': 'samtools',
}

def make_and_check_prog(name, verbose=False):
p = program.Program(
prog_name_to_default[name],
prog_to_version_cmd[name][0],
prog_to_version_cmd[name][1],
environment_var=prog_to_env_var.get(name, None)
)

if not p.version_at_least(min_versions[name]):
raise Error('Version of ' + name + ' too low. I found ' + p.version() + ', but must be at least ' + min_versions[name])

if verbose:
print('Using', name, 'version', p.version())

return p


def check_all_progs(verbose=False):
for prog in prog_name_to_default:
make_and_check_prog(prog, verbose=verbose)
Loading