Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Added functionality to attempt read orientation for single_end_barcodes,... #1987

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 79 additions & 20 deletions qiime/extract_barcodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,6 @@ def extract_barcodes(fastq1,
disable_header_match: if True, suppresses checks between fastq headers.
"""

# Turn off extra file creation for single read.
if input_type == "barcode_single_end" and attempt_read_orientation:
attempt_read_orientation = False
if attempt_read_orientation:
header, mapping_data, run_description, errors, warnings =\
process_id_map(map_fp)
Expand All @@ -81,8 +78,11 @@ def extract_barcodes(fastq1,
"barcodes_not_oriented.fastq.incomplete"), "w")
fastq1_out_not_oriented = open(join(output_dir,
"reads1_not_oriented.fastq.incomplete"), "w")
fastq2_out_not_oriented = open(join(output_dir,
"reads2_not_oriented.fastq.incomplete"), "w")
if input_type == "barcode_paired_end":
fastq2_out_not_oriented = open(join(output_dir,
"reads2_not_oriented.fastq.incomplete"), "w")
else:
fastq2_out_not_oriented = None
else:
forward_primers = None
reverse_primers = None
Expand All @@ -95,7 +95,7 @@ def extract_barcodes(fastq1,
output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
output_fastq2 = None
final_fastq1_name = join(output_dir, "reads.fastq")
elif input_type in ["barcode_paired_end"]:
elif input_type == "barcode_paired_end":
output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
final_fastq1_name = join(output_dir, "reads1.fastq")
Expand Down Expand Up @@ -126,7 +126,10 @@ def extract_barcodes(fastq1,

if input_type == "barcode_single_end":
process_barcode_single_end_data(read1_data, output_bc_fastq,
output_fastq1, bc1_len, rev_comp_bc1)
output_fastq1, bc1_len, rev_comp_bc1,
attempt_read_orientation, forward_primers,
reverse_primers, output_bc_not_oriented,
fastq1_out_not_oriented)

elif input_type == "barcode_paired_end":
process_barcode_paired_end_data(read1_data, read2_data,
Expand Down Expand Up @@ -195,33 +198,89 @@ def process_barcode_single_end_data(read1_data,
output_bc_fastq,
output_fastq1,
bc1_len=6,
rev_comp_bc1=False):
rev_comp_bc1=False,
attempt_read_orientation=False,
forward_primers=None,
reverse_primers=None,
output_bc_not_oriented=None,
fastq_out_not_oriented=None):
""" Processes, writes single-end barcode data, parsed sequence

read1_data: list of header, read, quality scores
output_bc_fastq: open output fastq filepath
output_fastq1: open output fastq reads filepath
bc1_len: length of barcode to remove from beginning of data
rev_comp_bc1: reverse complement barcode before writing.
attempt_read_orientation: If True, will attempt to orient the reads
according to the forward primers in the mapping file. If primer is
detected in current orientation, leave the read as is, but if reverse
complement is detected (or ReversePrimer is detected in the current
orientation) the read will either be written to the forward (read 1) or
reverse (read 2) reads for the case of paired files, or the read will be
reverse complemented in the case of stitched reads.
forward_primers: list of regular expression generators, forward primers
reverse_primers: list of regular expression generators, reverse primers
output_bc_not_oriented: Barcode output from reads that are not oriented
fastq_out_not_oriented: Open filepath to write reads where primers
can't be found when attempt_read_orientation is True.
"""

header_index = 0
sequence_index = 1
quality_index = 2

bc_read = read1_data[sequence_index][:bc1_len]
bc_qual = read1_data[quality_index][:bc1_len]
if rev_comp_bc1:
bc_read = str(DNA(bc_read).rc())
bc_qual = bc_qual[::-1]

bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual)
output_bc_fastq.write(bc_lines)
seq_lines = format_fastq_record(read1_data[header_index],

if not attempt_read_orientation:
bc_read = read1_data[sequence_index][:bc1_len]
bc_qual = read1_data[quality_index][:bc1_len]
if rev_comp_bc1:
bc_read = str(DNA(bc_read).rc())
bc_qual = bc_qual[::-1]

bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual)
output_bc_fastq.write(bc_lines)
seq_lines = format_fastq_record(read1_data[header_index],
read1_data[sequence_index][bc1_len:],
read1_data[quality_index][bc1_len:])
output_fastq1.write(seq_lines)

output_fastq1.write(seq_lines)
else:
read_seq = read1_data[sequence_index]
read_qual = read1_data[quality_index]

found_primer_match = False
# Break from orientation search as soon as a match is found
if attempt_read_orientation:
for curr_primer in forward_primers:
if curr_primer.search(read1_data[sequence_index]):
found_primer_match = True
break
if not found_primer_match:
for curr_primer in reverse_primers:
if curr_primer.search(read1_data[sequence_index]):
read_seq = str(DNA(read_seq).rc())
read_qual = read_qual[::-1]
found_primer_match = True
break

if not found_primer_match and attempt_read_orientation:
output_bc = output_bc_not_oriented
output_read = fastq_out_not_oriented
else:
output_bc = output_bc_fastq
output_read = output_fastq1

bc_read1 = read_seq[0:bc1_len]
bc_qual1 = read_qual[0:bc1_len]

if rev_comp_bc1:
bc_read1 = str(DNA(bc_read1).rc())
bc_qual1 = bc_qual1[::-1]

bc_lines = format_fastq_record(read1_data[header_index], bc_read1, bc_qual1)
output_bc.write(bc_lines)
seq_lines = format_fastq_record(read1_data[header_index],
read_seq[bc1_len:], read_qual[bc1_len:])
output_read.write(seq_lines)

return


Expand Down
32 changes: 32 additions & 0 deletions tests/test_extract_barcodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,38 @@ def test_process_barcode_single_end_data(self):
expected_reads = ['@HWI-ST830', 'TTTCCCCGGGG', '+', ')*+,-./0123', '']

self.assertEqual(actual_reads, expected_reads)

def test_process_barcode_single_end_data_with_orientation(self):
""" Handles fastq lines, parses barcodes, orients reads """

fastq_data = ["HWI-ST830", "AAAATTTTCCCCGGGG",
np.arange(3, 19, dtype=np.int8)]
reads_out = FakeOutFile()
bcs_out = FakeOutFile()
forward_primers = [compile(''.join([self.iupac[symbol] for
symbol in 'AYA']))]
reverse_primers = [compile(''.join([self.iupac[symbol] for
symbol in 'TCCCCG']))]
output_bc_not_oriented = FakeOutFile()
fastq1_out_not_oriented = FakeOutFile()

process_barcode_single_end_data(fastq_data, bcs_out, reads_out,
bc1_len=5, rev_comp_bc1=False,
attempt_read_orientation=True,
forward_primers=forward_primers,
reverse_primers=reverse_primers,
output_bc_not_oriented=output_bc_not_oriented,
fastq_out_not_oriented=fastq1_out_not_oriented)

actual_bcs = bcs_out.data.split('\n')
expected_bcs = ["@HWI-ST830", "CCCCG", "+", "3210/", ""]

self.assertEqual(actual_bcs, expected_bcs)

actual_reads = reads_out.data.split('\n')
expected_reads = ['@HWI-ST830', 'GGGAAAATTTT', '+', ".-,+*)('&%$", '']

self.assertEqual(actual_reads, expected_reads)

def test_process_barcode_paired_end_data(self):
""" Handles paired fastq lines, parses barcodes """
Expand Down