Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Splitting reads for small contigs when using Canu #84

Merged
merged 2 commits into from
Feb 23, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AUTHORS
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Martin Hunt (mh12@sanger.ac.uk)

This small modification to allow replacing Spades with Canu has been done by Nicola De Maio (nicola.de.maio.85@gmail.com)
This small modification to allow replacing Spades with Canu, and different handling of small contigs, has been done by Nicola De Maio (nicola.de.maio.85@gmail.com)
32 changes: 28 additions & 4 deletions circlator/bamfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(
discard_unmapped=False,
log_prefix='[bamfilter]',
verbose=False,
useCanu=False
):
self.bam = os.path.abspath(bam)
if not os.path.exists(self.bam):
Expand All @@ -32,6 +33,8 @@ def __init__(
self.discard_unmapped = discard_unmapped
self.min_read_length = min_read_length
self.verbose = verbose
self.useCanu = useCanu



def _get_ref_lengths(self):
Expand Down Expand Up @@ -158,10 +161,31 @@ def run(self):
continue

if ref_lengths[contig] <= self.length_cutoff:
self._all_reads_from_contig(contig, f_fa)
print(self.log_prefix, contig, ref_lengths[contig], 'all', sep='\t', file=f_log)
if self.verbose:
print('Getting all reads that map to contig', contig, flush=True)
#NEW version: even for small contigs, split them in two halves, and split the reads mapping through the middle point of the contig in two, so that each piece maps to only one half.
if self.useCanu:
end_bases_keep = int(0.5 * ref_lengths[contig])
start = end_bases_keep - 1
end = end_bases_keep #max(end_bases_keep - 1, ref_lengths[contig] - end_bases_keep)
self._get_region(contig, 0, start, f_fa, min_length=self.min_read_length)
self._get_region(contig, end, ref_lengths[contig], f_fa, min_length=self.min_read_length)
coords_string = '1-' + str(start + 1) + ';' + str(end + 1) + '-' + str(ref_lengths[contig])
print(
self.log_prefix,
contig,
ref_lengths[contig],
coords_string,
sep='\t',
file=f_log
)
if self.verbose:
print('Getting all reads that map to ends of contig ', contig, '. Coords: ', coords_string, sep='', flush=True)
else:
#OLD version, do not split reads so that SPAdes attempts to circulrize.
self._all_reads_from_contig(contig, f_fa)
print(self.log_prefix, contig, ref_lengths[contig], 'all', sep='\t', file=f_log)
if self.verbose:
print('Getting all reads that map to contig', contig, flush=True)

else:
end_bases_keep = int(0.5 * self.length_cutoff)
start = end_bases_keep - 1
Expand Down
1 change: 1 addition & 0 deletions circlator/tasks/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def run():
contigs_to_use=options.b2r_only_contigs,
discard_unmapped=options.b2r_discard_unmapped,
verbose=options.verbose,
useCanu=options.useCanu,
)
bam_filter.run()

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='circlator',
version='1.4.2',
version='1.4.3',
description='circlator: a tool to circularise genome assemblies',
packages = find_packages(),
package_data={'circlator': ['data/*']},
Expand Down