Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GenotypeAllele encoding and documentation for pushing to Record #285

Merged
merged 6 commits into from
Dec 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 58 additions & 6 deletions src/bcf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
//! Note that BCF corresponds to the in-memory representation of BCF/VCF records in Htslib
//! itself. Thus, it comes without a runtime penalty for parsing, in contrast to reading VCF
//! files.
//! # Example
//!
//! # Example (reading)
//!
//! - Obtaining 0-based locus index of the VCF record.
//! - Obtaining alleles of the VCF record.
//! - calculate alt-allele dosage in a mutli-sample VCF / BCF
Expand Down Expand Up @@ -54,6 +56,52 @@
//! }
//! }
//! ```
//!
//! # Example (writing)
//!
//! - Setting up a VCF writer from scratch (including a simple header)
//! - Creating a VCF record and writing it to the VCF file
//!
//! ```
//! use rust_htslib::bcf::{Format, Writer};
//! use rust_htslib::bcf::header::Header;
//! use rust_htslib::bcf::record::GenotypeAllele;
//!
//! // Create minimal VCF header with a single contig and a single sample
//! let mut header = Header::new();
//! let header_contig_line = r#"##contig=<ID=1,length=10>"#;
//! header.push_record(header_contig_line.as_bytes());
//! let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#;
//! header.push_record(header_gt_line.as_bytes());
//! header.push_sample("test_sample".as_bytes());
//!
//! // Write uncompressed VCF to stdout with above header and get an empty record
//! let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
//! let mut record = vcf.empty_record();
//!
//! // Set chrom and pos to 1 and 7, respectively - note the 0-based positions
//! let rid = vcf.header().name2rid(b"1").unwrap();
//! record.set_rid(Some(rid));
//! record.set_pos(6);
//!
//! // Set record genotype to 0|1 - note first allele is always unphased
//! let alleles = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
//! record.push_genotypes(alleles).unwrap();
//!
//! // Write record
//! vcf.write(&record).unwrap()
//! ```
//!
//! This will print the following VCF to stdout:
//!
//! ```lang-none
//! ##fileformat=VCFv4.2
//! ##FILTER=<ID=PASS,Description="All filters passed">
//! ##contig=<ID=1,length=10>
//! ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
//! #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_sample
//! 1 7 . . . 0 . . GT 0|1
//! ```

use std::ffi;
use std::path::Path;
Expand Down Expand Up @@ -830,7 +878,7 @@ mod tests {
// the artificial "not observed" allele is present in each record.
assert_eq!(record.alleles().iter().last().unwrap(), b"<X>");

let mut fmt = record.format(b"PL");
let fmt = record.format(b"PL");
let pl = fmt.integer().expect("Error reading format.");
assert_eq!(pl.len(), 1);
if i == 59 {
Expand Down Expand Up @@ -923,7 +971,7 @@ mod tests {
let mut buffer = Buffer::new();
for (i, rec) in vcf.records().enumerate() {
println!("record {}", i);
let mut record = rec.expect("Error reading record.");
let record = rec.expect("Error reading record.");
assert_eq!(
record
.info_shared_buffer(b"S1", &mut buffer)
Expand Down Expand Up @@ -966,7 +1014,7 @@ mod tests {
let f1 = [false, true];
let mut buffer = Buffer::new();
for (i, rec) in vcf.records().enumerate() {
let mut record = rec.expect("Error reading record.");
let record = rec.expect("Error reading record.");
assert_eq!(
record
.info_shared_buffer(b"F1", &mut buffer)
Expand Down Expand Up @@ -996,7 +1044,7 @@ mod tests {
let mut vcf = Reader::from_path(&"test/test_string.vcf").expect("Error opening file.");
let expected = ["./1", "1|1", "0/1", "0|1", "1|.", "1/1"];
for (rec, exp_gt) in vcf.records().zip(expected.iter()) {
let mut rec = rec.expect("Error reading record.");
let rec = rec.expect("Error reading record.");
let genotypes = rec.genotypes().expect("Error reading genotypes");
assert_eq!(&format!("{}", genotypes.get(0)), exp_gt);
}
Expand Down Expand Up @@ -1399,6 +1447,8 @@ mod tests {
let converted: i32 = allele.into();
let expected = 4;
assert_eq!(converted, expected);
let reverse_conversion = GenotypeAllele::from(expected);
assert_eq!(allele, reverse_conversion);
}

#[test]
Expand All @@ -1407,6 +1457,8 @@ mod tests {
let converted: i32 = allele.into();
let expected = 1;
assert_eq!(converted, expected);
let reverse_conversion = GenotypeAllele::from(expected);
assert_eq!(allele, reverse_conversion);
}

#[test]
Expand All @@ -1416,7 +1468,7 @@ mod tests {
let _header = bcf.header();
// FORMAT fields of first record of the vcf should look like:
// GT:FS1:FN1 ./1:LongString1:1 1/1:ss1:2
let mut first_record = bcf.records().next().unwrap().expect("Fail to read record");
let first_record = bcf.records().next().unwrap().expect("Fail to read record");
let sample_count = usize::try_from(first_record.sample_count()).unwrap();
assert_eq!(sample_count, 2);
let mut n_ref = vec![0; sample_count];
Expand Down
99 changes: 91 additions & 8 deletions src/bcf/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ impl Record {

/// Get the reference id of the record.
///
/// To look up the contig name, use `bcf::header::HeaderView::rid2name`.
/// To look up the contig name,
/// use [`HeaderView::rid2name`](../header/struct.HeaderView.html#method.rid2name).
///
/// # Returns
///
Expand All @@ -197,7 +198,32 @@ impl Record {
}
}

// Update the internal reference ID number.
/// Update the reference id of the record.
///
/// To look up reference id for a contig name,
/// use [`HeaderView::name2rid`](../header/struct.HeaderView.html#method.name2rid).
///
/// # Example
///
/// Example assumes we have a Record `record` from a VCF with a header containing region
/// named `1`. See [module documentation](../index.html#example-writing) for how to set
/// up VCF, header, and record.
///
/// ```
/// # use rust_htslib::bcf::{Format, Writer};
/// # use rust_htslib::bcf::header::Header;
/// # let mut header = Header::new();
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
/// # header.push_record(header_contig_line.as_bytes());
/// # header.push_sample("test_sample".as_bytes());
/// # let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
/// # let mut record = vcf.empty_record();
/// let rid = record.header().name2rid(b"1").ok();
/// record.set_rid(rid);
/// assert_eq!(record.rid(), rid);
/// let name = record.header().rid2name(record.rid().unwrap()).ok();
/// assert_eq!(Some("1".as_bytes()), name);
/// ```
pub fn set_rid(&mut self, rid: Option<u32>) {
match rid {
Some(rid) => self.inner_mut().rid = rid as i32,
Expand Down Expand Up @@ -424,6 +450,29 @@ impl Record {
/// # Errors
///
/// Returns error if GT tag is not present in header.
///
/// # Example
///
/// Example assumes we have a Record `record` from a VCF with a `GT` `FORMAT` tag.
/// See [module documentation](../index.html#example-writing) for how to set up
/// VCF, header, and record.
///
/// ```
/// # use rust_htslib::bcf::{Format, Writer};
/// # use rust_htslib::bcf::header::Header;
/// # use rust_htslib::bcf::record::GenotypeAllele;
/// # let mut header = Header::new();
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
/// # header.push_record(header_contig_line.as_bytes());
/// # let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#;
/// # header.push_record(header_gt_line.as_bytes());
/// # header.push_sample("test_sample".as_bytes());
/// # let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
/// # let mut record = vcf.empty_record();
/// let alleles = &[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)];
/// record.push_genotypes(alleles);
/// assert_eq!("1/1", &format!("{}", record.genotypes().unwrap().get(0)));
/// ```
pub fn push_genotypes(&mut self, genotypes: &[GenotypeAllele]) -> Result<()> {
let encoded: Vec<i32> = genotypes.iter().map(|gt| i32::from(*gt)).collect();
self.push_format_integer(b"GT", &encoded)
Expand Down Expand Up @@ -494,6 +543,28 @@ impl Record {
/// # Errors
///
/// Returns error if tag is not present in header.
///
/// # Example
///
/// Example assumes we have a Record `record` from a VCF with an `AF` `FORMAT` tag.
/// See [module documentation](../index.html#example-writing) for how to set up
/// VCF, header, and record.
///
/// ```
/// # use rust_htslib::bcf::{Format, Writer};
/// # use rust_htslib::bcf::header::Header;
/// # use rust_htslib::bcf::record::GenotypeAllele;
/// # let mut header = Header::new();
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
/// # header.push_record(header_contig_line.as_bytes());
/// # let header_af_line = r#"##FORMAT=<ID=AF,Number=1,Type=Float,Description="Frequency">"#;
/// # header.push_record(header_af_line.as_bytes());
/// # header.push_sample("test_sample".as_bytes());
/// # let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
/// # let mut record = vcf.empty_record();
/// record.push_format_float(b"AF", &[0.5]);
/// assert_eq!(0.5, record.format(b"AF").float().unwrap()[0][0]);
/// ```
pub fn push_format_float(&mut self, tag: &[u8], data: &[f32]) -> Result<()> {
self.push_format(tag, data, htslib::BCF_HT_REAL)
}
Expand Down Expand Up @@ -747,6 +818,10 @@ pub enum GenotypeAllele {

impl GenotypeAllele {
/// Decode given integer according to BCF standard.
#[deprecated(
since = "0.36.0",
note = "Please use the conversion trait From<i32> for GenotypeAllele instead."
)]
pub fn from_encoded(encoded: i32) -> Self {
match (encoded, encoded & 1) {
(0, 0) => GenotypeAllele::UnphasedMissing,
Expand Down Expand Up @@ -783,7 +858,19 @@ impl From<GenotypeAllele> for i32 {
GenotypeAllele::Unphased(a) => (a, 0),
GenotypeAllele::Phased(a) => (a, 1),
};
allele + 1 << 1 | phased
(allele + 1) << 1 | phased
}
}

impl From<i32> for GenotypeAllele {
fn from(encoded: i32) -> GenotypeAllele {
match (encoded, encoded & 1) {
(0, 0) => GenotypeAllele::UnphasedMissing,
(1, 1) => GenotypeAllele::PhasedMissing,
(e, 1) => GenotypeAllele::Phased((e >> 1) - 1),
(e, 0) => GenotypeAllele::Unphased((e >> 1) - 1),
_ => panic!("unexpected phasing type"),
}
}
}

Expand Down Expand Up @@ -825,11 +912,7 @@ impl<'a, B: Borrow<Buffer> + 'a> Genotypes<'a, B> {
/// this method will return `[Unphased(1), Phased(1)]`.
pub fn get(&self, i: usize) -> Genotype {
let igt = self.encoded[i];
Genotype(
igt.iter()
.map(|&e| GenotypeAllele::from_encoded(e))
.collect(),
)
Genotype(igt.iter().map(|&e| GenotypeAllele::from(e)).collect())
}
}

Expand Down