Skip to content

Commit

Permalink
allow vsearch merge-pairs to output unmerged reads (#95)
Browse files Browse the repository at this point in the history
  • Loading branch information
colinvwood authored Dec 12, 2023
1 parent 693d1c2 commit c8b3ace
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 95 deletions.
218 changes: 140 additions & 78 deletions q2_vsearch/_merge_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,90 +33,120 @@
}


def merge_pairs(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
truncqual: int = _mp_defaults['truncqual'],
minlen: int = _mp_defaults['minlen'],
maxns: int = _mp_defaults['maxns'],
allowmergestagger: bool = _mp_defaults['allowmergestagger'],
minovlen: int = _mp_defaults['minovlen'],
maxdiffs: int = _mp_defaults['maxdiffs'],
minmergelen: int = _mp_defaults['minmergelen'],
maxmergelen: int = _mp_defaults['maxmergelen'],
maxee: float = _mp_defaults['maxee'],
threads: int = _mp_defaults['threads'],
) -> SingleLanePerSampleSingleEndFastqDirFmt:
_, result = _merge_pairs_w_command_output(
def merge_pairs(
demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
truncqual: int = _mp_defaults['truncqual'],
minlen: int = _mp_defaults['minlen'],
maxns: int = _mp_defaults['maxns'],
allowmergestagger: bool = _mp_defaults['allowmergestagger'],
minovlen: int = _mp_defaults['minovlen'],
maxdiffs: int = _mp_defaults['maxdiffs'],
minmergelen: int = _mp_defaults['minmergelen'],
maxmergelen: int = _mp_defaults['maxmergelen'],
maxee: float = _mp_defaults['maxee'],
threads: int = _mp_defaults['threads'],
) -> (
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt
):
_, merged, unmerged = _merge_pairs_w_command_output(
demultiplexed_seqs, truncqual, minlen, maxns, allowmergestagger,
minovlen, maxdiffs, minmergelen, maxmergelen, maxee, threads)
return result
minovlen, maxdiffs, minmergelen, maxmergelen, maxee, threads
)

return merged, unmerged


def _merge_pairs_w_command_output(
demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
truncqual: int = _mp_defaults['truncqual'],
minlen: int = _mp_defaults['minlen'],
maxns: int = _mp_defaults['maxns'],
allowmergestagger: bool = _mp_defaults['allowmergestagger'],
minovlen: int = _mp_defaults['minovlen'],
maxdiffs: int = _mp_defaults['maxdiffs'],
minmergelen: int = _mp_defaults['minmergelen'],
maxmergelen: int = _mp_defaults['maxmergelen'],
maxee: float = _mp_defaults['maxee'],
threads: int = _mp_defaults['threads'],
) -> (List[str], SingleLanePerSampleSingleEndFastqDirFmt):
demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
truncqual: int = _mp_defaults['truncqual'],
minlen: int = _mp_defaults['minlen'],
maxns: int = _mp_defaults['maxns'],
allowmergestagger: bool = _mp_defaults['allowmergestagger'],
minovlen: int = _mp_defaults['minovlen'],
maxdiffs: int = _mp_defaults['maxdiffs'],
minmergelen: int = _mp_defaults['minmergelen'],
maxmergelen: int = _mp_defaults['maxmergelen'],
maxee: float = _mp_defaults['maxee'],
threads: int = _mp_defaults['threads'],
) -> (
List[str],
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt
):
# this function exists only to simplify unit testing

result = SingleLanePerSampleSingleEndFastqDirFmt()
# create formats
merged = SingleLanePerSampleSingleEndFastqDirFmt()
unmerged = SingleLanePerSamplePairedEndFastqDirFmt()

# create manifests
merged_manifest = FastqManifestFormat()
merged_manifest_fh = merged_manifest.open()
unmerged_manifest = FastqManifestFormat()
unmerged_manifest_fh = unmerged_manifest.open()

# write manifest headers
_write_manifest_header(merged_manifest_fh, add_warning=True)
_write_manifest_header(unmerged_manifest_fh)

# generate input reads iterable
manifest = pd.read_csv(
os.path.join(str(demultiplexed_seqs),
demultiplexed_seqs.manifest.pathspec),
header=0, comment='#')
manifest.filename = manifest.filename.apply(
lambda x: os.path.join(str(demultiplexed_seqs), x))
os.path.join(
str(demultiplexed_seqs), demultiplexed_seqs.manifest.pathspec
),
header=0,
comment='#'
)

phred_offset = yaml.load(open(
os.path.join(str(demultiplexed_seqs),
demultiplexed_seqs.metadata.pathspec)),
Loader=yaml.SafeLoader)['phred-offset']
manifest.filename = manifest.filename.apply(
lambda x: os.path.join(str(demultiplexed_seqs), x)
)

id_to_fps = manifest.pivot(index='sample-id', columns='direction',
values='filename')
phred_offset = yaml.load(
open(os.path.join(
str(demultiplexed_seqs), demultiplexed_seqs.metadata.pathspec
)),
Loader=yaml.SafeLoader
)['phred-offset']

output_manifest = FastqManifestFormat()
output_manifest_fh = output_manifest.open()
output_manifest_fh.write('sample-id,filename,direction\n')
output_manifest_fh.write('# direction is not meaningful in this file '
'as these\n')
output_manifest_fh.write('# data may be derived from forward, reverse, '
'or \n')
output_manifest_fh.write('# joined reads\n')
id_to_fps = manifest.pivot(
index='sample-id', columns='direction', values='filename'
)

for i, (sample_id, (fwd_fp, rev_fp)) in enumerate(id_to_fps.iterrows()):
# The barcode id, lane number and read number are not relevant
# here. We might ultimately want to use a dir format other than
# SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
# about this information. Similarly, the direction of the read
# isn't relevant here anymore.
path = result.sequences.path_maker(sample_id=sample_id,
barcode_id=i,
lane_number=1,
read_number=1)
uncompressed_path = str(path).strip('.gz')

cmd = ['vsearch',
'--fastq_mergepairs', fwd_fp,
'--reverse', rev_fp,
'--fastqout', uncompressed_path,
'--fastq_ascii', str(phred_offset),
'--fastq_minlen', str(minlen),
'--fastq_minovlen', str(minovlen),
'--fastq_maxdiffs', str(maxdiffs),
'--fastq_qmin', '0',
'--fastq_qminout', '0',
'--fastq_qmax', '41',
'--fastq_qmaxout', '41',
'--fasta_width', '0']
# The barcode id and lane number are not relevant for either format.
# We might ultimately want to use a dir format other than these which
# doesn't care about this information.
# The read number (direction) is only relevant for the unmerged reads.

gz_merged_path, fq_merged_path = _get_output_paths(
merged, sample_id, i, 1
)
gz_unmerged_fwd_path, fq_unmerged_fwd_path = _get_output_paths(
unmerged, sample_id, i, 1
)
gz_unmerged_rev_path, fq_unmerged_rev_path = _get_output_paths(
unmerged, sample_id, i, 2
)

cmd = [
'vsearch',
'--fastq_mergepairs', fwd_fp,
'--reverse', rev_fp,
'--fastqout', fq_merged_path,
'--fastqout_notmerged_fwd', fq_unmerged_fwd_path,
'--fastqout_notmerged_rev', fq_unmerged_rev_path,
'--fastq_ascii', str(phred_offset),
'--fastq_minlen', str(minlen),
'--fastq_minovlen', str(minovlen),
'--fastq_maxdiffs', str(maxdiffs),
'--fastq_qmin', '0',
'--fastq_qminout', '0',
'--fastq_qmax', '41',
'--fastq_qmaxout', '41',
'--fasta_width', '0'
]
if truncqual is not None:
cmd += ['--fastq_truncqual', str(truncqual)]
if maxns is not None:
Expand All @@ -130,16 +160,48 @@ def _merge_pairs_w_command_output(
cmd += ['--threads', str(threads)]
if allowmergestagger:
cmd.append('--fastq_allowmergestagger')

run_command(cmd)
run_command(['gzip', uncompressed_path])
output_manifest_fh.write(
'%s,%s,%s\n' % (sample_id, path.name, 'forward'))

output_manifest_fh.close()
result.manifest.write_data(output_manifest, FastqManifestFormat)
run_command([
'gzip', fq_merged_path, fq_unmerged_fwd_path, fq_unmerged_rev_path
])

merged_manifest_fh.write(
'%s,%s,%s\n' % (sample_id, gz_merged_path.name, 'forward')
)
unmerged_manifest_fh.write(
'%s,%s,%s\n' % (sample_id, gz_unmerged_fwd_path.name, 'forward')
)
unmerged_manifest_fh.write(
'%s,%s,%s\n' % (sample_id, gz_unmerged_rev_path.name, 'reverse')
)

merged_manifest_fh.close()
unmerged_manifest_fh.close()
merged.manifest.write_data(merged_manifest, FastqManifestFormat)
unmerged.manifest.write_data(unmerged_manifest, FastqManifestFormat)

metadata = YamlFormat()
metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
result.metadata.write_data(metadata, YamlFormat)
merged.metadata.write_data(metadata, YamlFormat)
unmerged.metadata.write_data(metadata, YamlFormat)

return cmd, merged, unmerged


def _get_output_paths(format_, sample_id, barcode_id, direction):
path = format_.sequences.path_maker(
sample_id=sample_id,
barcode_id=barcode_id,
lane_number=1,
read_number=direction
)
return path, str(path).strip('.gz')


return cmd, result
def _write_manifest_header(manifest_fh, add_warning=False):
manifest_fh.write('sample-id,filename,direction\n')
if add_warning:
manifest_fh.write('')
manifest_fh.write('# direction is not meaningful for joined reads\n')
8 changes: 5 additions & 3 deletions q2_vsearch/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@
'minlen': qiime2.plugin.Int % qiime2.plugin.Range(0, None),
'maxns': qiime2.plugin.Int % qiime2.plugin.Range(0, None),
'allowmergestagger': qiime2.plugin.Bool,
'minovlen': qiime2.plugin.Int % qiime2.plugin.Range(0, None),
'minovlen': qiime2.plugin.Int % qiime2.plugin.Range(5, None),
'maxdiffs': qiime2.plugin.Int % qiime2.plugin.Range(0, None),
'minmergelen': qiime2.plugin.Int % qiime2.plugin.Range(0, None),
'maxmergelen': qiime2.plugin.Int % qiime2.plugin.Range(0, None),
Expand All @@ -291,7 +291,8 @@
0, 8, inclusive_start=True, inclusive_end=True)
},
outputs=[
('merged_sequences', SampleData[JoinedSequencesWithQuality])
('merged_sequences', SampleData[JoinedSequencesWithQuality]),
('unmerged_sequences', SampleData[PairedEndSequencesWithQuality])
],
input_descriptions={
'demultiplexed_seqs': ('The demultiplexed paired-end sequences to '
Expand All @@ -317,7 +318,8 @@
'not scale much past 4 threads.')
},
output_descriptions={
'merged_sequences': ('The merged sequences.'),
'merged_sequences': 'The merged sequences.',
'unmerged_sequences': 'The unmerged paired-end reads.'
},
name='Merge paired-end reads.',
description=('Merge paired-end sequence reads using vsearch\'s '
Expand Down
Loading

0 comments on commit c8b3ace

Please sign in to comment.