Skip to content

Commit

Permalink
[feat] add an option to use the sample barcode in the FASTQ header
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Dec 20, 2022
1 parent 67bac88 commit fc801ba
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 24 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,8 @@ then the `Sample_Barcode` field for each sample should be composed as follows:
|--writer-threads |No |5 | The number of threads to use to write compressed FASTQ data to disk. |
|--override-matcher |No |n/a | The algorithm to use for matching, either `CachedHammingDistance` or `PreCompute`. By default if barcodes are 12bp or shorter `PreCompute` is used which pre-computes all possible matches, or if barcodes are longer than 12bp `CachedHammingDistance` is used which calculates matches when needed then caches the results. |
|--most-unmatched-to-output|No |1000 | Report on the top N most frequently observed unmatched barcode sequences. |
|--skip-read-name-check|No |False | If this is true, then all the read names across FASTQs will not be enforced to be the same. This may be useful when the read names are known to be the same and performance matters. Regardless, the first read name in each FASTQ will always be checked.sequences. |

|--skip-read-name-check|No |False | If this is true, then all the read names across FASTQs will not be enforced to be the same. This may be useful when the read names are known to be the same and performance matters. Regardless, the first read name in each FASTQ will always be checked. |
|--sample-barcode-in-fastq-header|No |False | If this is true, then the sample barcode is expected to be in the FASTQ read header. For dual indexed data, the barcodes must be `+` (plus) delimited. Additionally, if true, then neither index FASTQ files nor sample barcode segments in the read structure may be specified. |

#### Performance Considerations

Expand Down
85 changes: 83 additions & 2 deletions src/lib/demux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use std::{borrow::Cow, iter::IntoIterator, vec::Vec};

use anyhow::{ensure, Context, Result};
use anyhow::{bail, ensure, Context, Result};
use bstr::ByteSlice;
use itertools::Itertools;
use log::debug;
Expand Down Expand Up @@ -38,6 +38,12 @@ impl ExtractedBarcode {
Self::default()
}

/// Create a new [`ExtractedBarcode`] from the delimited barcode, with `+` as the delimiter.
fn from_delimited(delimited: &[u8]) -> Self {
let concatenated = delimited.splitn(2, |c| *c == b'+').collect_vec().concat();
ExtractedBarcode { concatenated, delimited: delimited.to_vec() }
}

/// Add bases to the [`ExtractedBarcode`], this will add to both the concatenated and delimited forms.
fn add_bases(&mut self, bases: &[u8]) {
if self.non_empty() {
Expand Down Expand Up @@ -191,6 +197,11 @@ pub struct Demultiplexer<'a, M: Matcher> {
/// If this is true, then the read names across FASTQs will not be enforced to be the same. This may be useful when
/// the read names are known to be the same and performance matters.
skip_read_name_check: bool,
/// If this is true, then the sample barcode is expected to be in the FASTQ read header. For
/// dual indexed data, the barcodes must be `+` (plus) delimited. Additionally, if true, then
/// neither index FASTQ files nor sample barcode segments in the read structure may be
/// specified.
sample_barcode_in_fastq_header: bool,
}

impl<'a, M> Demultiplexer<'a, M>
Expand All @@ -208,6 +219,7 @@ where
matcher: M,
collect_unmatched: bool,
skip_read_name_check: bool,
sample_barcode_in_fastq_header: bool,
) -> Result<Self> {
// TODO: use display instead of debug formatting
ensure!(
Expand Down Expand Up @@ -244,6 +256,7 @@ where
sample_barcode_hop_checker,
collect_unmatched,
skip_read_name_check,
sample_barcode_in_fastq_header,
})
}

Expand Down Expand Up @@ -457,6 +470,27 @@ where
None => (),
}

// If desired, get the sample barcode bases from the FASTQ header (of the first read)
if self.sample_barcode_in_fastq_header {
// get the FASTQ header from the first read
let read = &zipped_reads[0];
let header =
FastqHeader::try_from(read.head()).with_context(|| {
format!(
"Unable to parse read header: {}",
String::from_utf8_lossy(read.head()),
)
})?;
// get the sample barcode from the FASTQ header
match header.sample_barcode() {
None => bail!(
"Could not read the sample barcode from the read header: {}",
String::from_utf8_lossy(read.head())
),
Some(barcode) => sample_barcode = ExtractedBarcode::from_delimited(barcode),
};
}

// Extract the barcodes from the read structure and build up the record to be output
for (idx, (read, structure)) in
zipped_reads.iter().zip(self.grouped_read_segments.iter()).enumerate()
Expand Down Expand Up @@ -716,6 +750,7 @@ mod test {
matcher,
false,
false,
self.opts.sample_barcode_in_fastq_header,
)
.unwrap(),
)
Expand All @@ -736,6 +771,7 @@ mod test {
matcher,
false,
false,
self.opts.sample_barcode_in_fastq_header,
)
.unwrap(),
)
Expand Down Expand Up @@ -832,7 +868,6 @@ mod test {
let demuxer = context.demux();
let result = demuxer.demultiplex(&context.per_fastq_record_set).unwrap();


assert!(result.per_sample_reads[1..].iter().all(demux::OutputPerSampleReads::is_empty), "Samples 2-4 and UNDETERMINED have no reads assigned");
assert_eq!(result.per_sample_reads[0].len(), 1, "Sample1 contains a single record.");
assert_eq!(result.per_sample_reads[0].per_fastq_reads.len(), 1, "Sample1 has only a single output fastq expected.");
Expand Down Expand Up @@ -1645,6 +1680,7 @@ mod test {
matcher,
false,
false,
false,
)
.unwrap();
let per_fastq_record_set =
Expand Down Expand Up @@ -1705,4 +1741,49 @@ mod test {
let result = demuxer.demultiplex(&context.per_fastq_record_set);
assert!(result.is_err());
}

#[rstest]
#[rustfmt::skip]
fn test_demux_fragment_reads_sample_barcode_from_fastq_header_single_index() {
let read_structures = vec![ReadStructure::from_str("100T").unwrap()];
let fastq_record = Fq { name: "frag", bases: &[b'A'; 100], set_bc_to: Some(SAMPLE_BARCODE_1), ..Fq::default() }.to_owned_record();
let mut context = DemuxTestContext::demux_structures(vec![fastq_record], read_structures, MatcherKind::CachedHammingDistance);
context.opts.sample_barcode_in_fastq_header = true;
let demuxer = context.demux();
let result = demuxer.demultiplex(&context.per_fastq_record_set).unwrap();

assert!(result.per_sample_reads[1..].iter().all(demux::OutputPerSampleReads::is_empty), "Samples 2-4 and UNDETERMINED have no reads assigned");
assert_eq!(result.per_sample_reads[0].len(), 1, "Sample1 contains a single record.");
assert_eq!(result.per_sample_reads[0].per_fastq_reads.len(), 1, "Sample1 has only a single output fastq expected.");
assert_eq!(result.per_sample_reads[0].per_fastq_reads[0][0].seq, &[b'A'; 100], "Sample1 seq has had the barcode removed");
assert_eq!(result.per_sample_reads[0].per_fastq_reads[0][0].qual, &[b'!'; 100], "Sample1 quals has had the barcode removed");

let header = FastqHeader::try_from(result.per_sample_reads[0].per_fastq_reads[0][0].head.as_slice()).unwrap();

assert!(header.umi().is_none(), "Fastq header has no UMI set");
assert_eq!(header.sample_barcode(), Some(SAMPLE_BARCODE_1),"Set barcode matches Sample1");
}

#[rstest]
#[rustfmt::skip]
fn test_demux_fragment_reads_sample_barcode_from_fastq_header_dual_index() {
let read_structures = vec![ReadStructure::from_str("100T").unwrap()];
let (index1, index2) = SAMPLE_BARCODE_1.split_at(10);
let delimited_sample_barcode = [index1, &[b'+'], index2].concat();
let fastq_record = Fq { name: "frag", bases: &[b'A'; 100], set_bc_to: Some(&delimited_sample_barcode), ..Fq::default() }.to_owned_record();
let mut context = DemuxTestContext::demux_structures(vec![fastq_record], read_structures, MatcherKind::CachedHammingDistance);
context.opts.sample_barcode_in_fastq_header = true;
let demuxer = context.demux();
let result = demuxer.demultiplex(&context.per_fastq_record_set).unwrap();

assert!(result.per_sample_reads[1..].iter().all(demux::OutputPerSampleReads::is_empty), "Samples 2-4 and UNDETERMINED have no reads assigned");
assert_eq!(result.per_sample_reads[0].len(), 1, "Sample1 contains a single record.");
assert_eq!(result.per_sample_reads[0].per_fastq_reads.len(), 1, "Sample1 has only a single output fastq expected.");
assert_eq!(result.per_sample_reads[0].per_fastq_reads[0][0].seq, &[b'A'; 100], "Sample1 seq has had the barcode removed");
assert_eq!(result.per_sample_reads[0].per_fastq_reads[0][0].qual, &[b'!'; 100], "Sample1 quals has had the barcode removed");

let header = FastqHeader::try_from(result.per_sample_reads[0].per_fastq_reads[0][0].head.as_slice()).unwrap();
assert!(header.umi().is_none(), "Fastq header has no UMI set");
assert_eq!(header.sample_barcode(), Some(delimited_sample_barcode.as_slice()),"Set barcode matches Sample1");
}
}
Loading

0 comments on commit fc801ba

Please sign in to comment.