-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from martinghunt/master
Initial version
- Loading branch information
Showing
68 changed files
with
2,237 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,47 @@ | ||
# circlator | ||
circlator | ||
========= | ||
|
||
A tool to circularise bacterial genome assemblies | ||
|
||
|
||
|
||
|
||
Installation | ||
------------ | ||
|
||
circlator has the following dependencies, which need to be installed: | ||
* [BWA] [BWA] version >= 0.7.12 | ||
* [samtools] [samtools] (version 0.1.9 or 1.2) | ||
* [MUMmer] [mummer] version >= 3.23 | ||
* [SPAdes] [spades] version >= 3.5.0 | ||
|
||
|
||
Once the dependencies are installed, install circlator using pip3: | ||
|
||
pip3 install circlator | ||
|
||
Alternatively, you can download the latest release from this github repository, | ||
or clone the repository. Then run the tests: | ||
|
||
python3 setup.py test | ||
|
||
If the tests all pass, install: | ||
|
||
python3 setup.py install | ||
|
||
|
||
Usage | ||
----- | ||
|
||
The input is an assembly in FASTA format and corrected long reads in FASTA format. To run: | ||
|
||
circlator all assembly.fasta corrected_reads.fasta output_directory | ||
|
||
|
||
|
||
[BWA]: http://bio-bwa.sourceforge.net/ | ||
[mummer]: http://mummer.sourceforge.net/ | ||
[samtools]: http://www.htslib.org/ | ||
[spades]: http://bioinf.spbau.ru/spades | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
__all__ = [ | ||
'assemble', | ||
'common', | ||
'bamfilter', | ||
'external_progs', | ||
'mapping', | ||
'merge', | ||
'program', | ||
'tasks', | ||
] | ||
|
||
from circlator import * | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import os | ||
import pyfastaq | ||
from circlator import common, external_progs | ||
|
||
class Error (Exception): pass | ||
|
||
class Assembler: | ||
def __init__(self, | ||
reads, | ||
outdir, | ||
threads=1, | ||
spades_kmer=127, | ||
verbose=False, | ||
): | ||
self.outdir = os.path.abspath(outdir) | ||
try: | ||
os.mkdir(self.outdir) | ||
except: | ||
raise Error('Error mkdir ' + self.outdir) | ||
|
||
self.reads = os.path.abspath(reads) | ||
if not os.path.exists(self.reads): | ||
raise Error('Reads file not found:' + self.reads) | ||
|
||
self.verbose = verbose | ||
self.threads = threads | ||
self.spades = external_progs.make_and_check_prog('spades', verbose=self.verbose) | ||
self.spades_kmer = spades_kmer | ||
self.assembler = 'spades' | ||
|
||
|
||
def run_spades(self): | ||
cmd = ' '.join([ | ||
self.spades.exe(), | ||
'-s', self.reads, | ||
'-k', str(self.spades_kmer), | ||
'--careful', | ||
'--only-assembler', | ||
'-t', str(self.threads), | ||
'-o', self.outdir, | ||
]) | ||
|
||
common.syscall(cmd, verbose=self.verbose) | ||
|
||
|
||
def run(self): | ||
if self.assembler == 'spades': | ||
self.run_spades() | ||
else: | ||
raise Error('Unknown assembler: "' + self.assembler + '". cannot continue') | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import os | ||
import pysam | ||
import pyfastaq | ||
from circlator import common, mapping | ||
|
||
class Error (Exception): pass | ||
|
||
|
||
class BamFilter: | ||
def __init__( | ||
self, | ||
bam, | ||
outprefix, | ||
min_length_short=5000, | ||
min_length_long=100000, | ||
min_read_length=250, | ||
min_remove_end_distance=45000, | ||
): | ||
self.bam = os.path.abspath(bam) | ||
if not os.path.exists(self.bam): | ||
raise Error('File not found:' + self.bam) | ||
|
||
self.min_read_length = min_read_length | ||
self.min_length_short = min_length_short | ||
self.min_length_long = min_length_long | ||
self.min_remove_end_distance = min_remove_end_distance | ||
assert self.min_length_short < self.min_length_long | ||
|
||
self.reads_fa = os.path.abspath(outprefix + '.fasta') | ||
self.log = os.path.abspath(outprefix + '.log') | ||
|
||
|
||
|
||
def _get_ref_lengths(self): | ||
'''Gets the length of each reference sequence from the header of the bam. Returns dict name => length''' | ||
sam_reader = pysam.Samfile(self.bam, "rb") | ||
return dict(zip(sam_reader.references, sam_reader.lengths)) | ||
|
||
|
||
def _all_reads_from_contig(self, contig, fout): | ||
'''Gets all reads from contig called "contig" and writes to fout''' | ||
sam_reader = pysam.Samfile(self.bam, "rb") | ||
for read in sam_reader.fetch(contig): | ||
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout) | ||
|
||
|
||
def _get_all_unmapped_reads(self, fout): | ||
'''Writes all unmapped reads to fout''' | ||
sam_reader = pysam.Samfile(self.bam, "rb") | ||
for read in sam_reader.fetch(until_eof=True): | ||
if read.is_unmapped: | ||
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout) | ||
|
||
|
||
def _break_reads(self, contig, position, fout): | ||
'''Get all reads from contig, but breaks them all at given position (0-based) in the reference. Writes to fout. Currently pproximate where it breaks (ignores indels in the alignment)''' | ||
sam_reader = pysam.Samfile(self.bam, "rb") | ||
for read in sam_reader.fetch(contig): | ||
seqs = [] | ||
if read.pos < position < read.reference_end - 1: | ||
split_point = position - read.pos | ||
if split_point - 1 >= self.min_read_length: | ||
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(0, split_point) | ||
sequence.id += '.left' | ||
seqs.append(sequence) | ||
if read.query_length - split_point >= self.min_read_length: | ||
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(split_point, read.query_length) | ||
sequence.id += '.right' | ||
seqs.append(sequence) | ||
else: | ||
seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True)) | ||
|
||
for seq in seqs: | ||
if read.is_reverse: | ||
seq.revcomp() | ||
print(seq, file=fout) | ||
|
||
|
||
def _exclude_region(self, contig, start, end, fout): | ||
'''Writes reads not mapping to the given region of contig, start and end as per python convention''' | ||
sam_reader = pysam.Samfile(self.bam, "rb") | ||
exclude_interval = pyfastaq.intervals.Interval(start, end - 1) | ||
for read in sam_reader.fetch(contig): | ||
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1) | ||
if not read_interval.intersects(exclude_interval): | ||
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout) | ||
|
||
|
||
def _choose_region(self, contig, contig_length): | ||
return pyfastaq.intervals.Interval(self.min_remove_end_distance, contig_length - self.min_remove_end_distance) | ||
|
||
|
||
def run(self): | ||
ref_lengths = self._get_ref_lengths() | ||
assert len(ref_lengths) > 0 | ||
f_log = pyfastaq.utils.open_file_write(self.log) | ||
f_fq = pyfastaq.utils.open_file_write(self.reads_fa) | ||
|
||
for contig in sorted(ref_lengths): | ||
if ref_lengths[contig] < self.min_length_short: | ||
self._all_reads_from_contig(contig, f_fq) | ||
print(contig, ref_lengths[contig], 'unchanged', sep='\t', file=f_log) | ||
elif self.min_length_short <= ref_lengths[contig] < self.min_length_long: | ||
break_pos = int(ref_lengths[contig] / 2) | ||
print(contig, ref_lengths[contig], 'break at position ' + str(break_pos + 1), sep='\t', file=f_log) | ||
self._break_reads(contig, break_pos, f_fq) | ||
else: | ||
region_to_remove = self._choose_region(contig, ref_lengths[contig]) | ||
if region_to_remove is not None: | ||
print(contig, ref_lengths[contig], 'remove region ' + str(region_to_remove.start + 1) + '-' + str(region_to_remove.end + 1), sep='\t', file=f_log) | ||
self._exclude_region(contig, region_to_remove.start, region_to_remove.end, f_fq) | ||
else: | ||
break_pos = int(ref_lengths[contig] / 2) | ||
print(contig, ref_lengths[contig], 'could not find region to remove - break at position ' + str(break_pos + 1), sep='\t', file=f_log) | ||
self._break_reads(contig, int(ref_lengths[contig] / 2), f_fq) | ||
|
||
self._get_all_unmapped_reads(f_fq) | ||
pyfastaq.utils.close(f_fq) | ||
pyfastaq.utils.close(f_log) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import sys | ||
import subprocess | ||
|
||
version = '0.1.0' | ||
|
||
def syscall(cmd, allow_fail=False, verbose=False): | ||
if verbose: | ||
print('syscall:', cmd, flush=True) | ||
try: | ||
subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) | ||
except subprocess.CalledProcessError as error: | ||
errors = error.output.decode() | ||
print('The following command failed with exit code', error.returncode, file=sys.stderr) | ||
print(cmd, file=sys.stderr) | ||
print('\nThe output was:\n', file=sys.stderr) | ||
print(errors, file=sys.stderr, flush=True) | ||
|
||
if allow_fail: | ||
return False, errors | ||
else: | ||
sys.exit(1) | ||
|
||
return True, None | ||
|
||
|
||
def syscall_get_stdout(cmd): | ||
try: | ||
out = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE).communicate()[0].decode('utf-8').rstrip() | ||
return out.split('\n') | ||
except: | ||
raise Error('Error in system call. I tried to run:\n' + str(cmd)) | ||
|
||
|
||
def decode(x): | ||
try: | ||
s = x.decode() | ||
except: | ||
return x | ||
return s |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import re | ||
from circlator import program | ||
|
||
class Error (Exception): pass | ||
|
||
prog_to_env_var = { | ||
'spades': 'CIRCLATOR_SPADES', | ||
} | ||
|
||
|
||
prog_to_version_cmd = { | ||
'bwa': ('', re.compile('^Version: ([0-9\.]+)')), | ||
'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')), | ||
'samtools': ('', re.compile('^Version: ([0-9\.]+)')), | ||
'spades': ('', re.compile('^SPAdes genome assembler v.([0-9\.]+)')), | ||
} | ||
|
||
|
||
min_versions = { | ||
'bwa': '0.7.12', | ||
'nucmer': '3.1', | ||
'samtools': '0.1.19', | ||
'spades': '3.5.0', | ||
} | ||
|
||
|
||
prog_name_to_default = { | ||
'bwa': 'bwa', | ||
'nucmer': 'nucmer', | ||
'spades': 'spades.py', | ||
'samtools': 'samtools', | ||
} | ||
|
||
def make_and_check_prog(name, verbose=False): | ||
p = program.Program( | ||
prog_name_to_default[name], | ||
prog_to_version_cmd[name][0], | ||
prog_to_version_cmd[name][1], | ||
environment_var=prog_to_env_var.get(name, None) | ||
) | ||
|
||
if not p.version_at_least(min_versions[name]): | ||
raise Error('Version of ' + name + ' too low. I found ' + p.version() + ', but must be at least ' + min_versions[name]) | ||
|
||
if verbose: | ||
print('Using', name, 'version', p.version()) | ||
|
||
return p | ||
|
||
|
||
def check_all_progs(verbose=False): | ||
for prog in prog_name_to_default: | ||
make_and_check_prog(prog, verbose=verbose) |
Oops, something went wrong.